{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7839559871158865, "eval_steps": 500, "global_step": 50016, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005015713289289101, "grad_norm": 2.737861394882202, "learning_rate": 1.875e-05, "loss": 36.6669, "step": 32, "throughput": 4350.062995504972 }, { "epoch": 0.0010031426578578201, "grad_norm": 2.384377956390381, "learning_rate": 3.75e-05, "loss": 27.7836, "step": 64, "throughput": 6868.906038827848 }, { "epoch": 0.0015047139867867302, "grad_norm": 2.172982931137085, "learning_rate": 5.625e-05, "loss": 23.6446, "step": 96, "throughput": 8676.71146512722 }, { "epoch": 0.0020062853157156403, "grad_norm": 1.5305997133255005, "learning_rate": 7.5e-05, "loss": 21.199, "step": 128, "throughput": 9992.827972037452 }, { "epoch": 0.0025078566446445506, "grad_norm": 1.4041763544082642, "learning_rate": 9.374999999999999e-05, "loss": 19.6528, "step": 160, "throughput": 10982.420092196491 }, { "epoch": 0.0030094279735734604, "grad_norm": 1.2152384519577026, "learning_rate": 0.0001125, "loss": 18.5342, "step": 192, "throughput": 11769.216948894604 }, { "epoch": 0.0035109993025023707, "grad_norm": 0.9721791744232178, "learning_rate": 0.00013125, "loss": 17.5499, "step": 224, "throughput": 12404.199759486375 }, { "epoch": 0.0040125706314312806, "grad_norm": 0.8485270142555237, "learning_rate": 0.00015, "loss": 16.7178, "step": 256, "throughput": 12926.906808106636 }, { "epoch": 0.004514141960360191, "grad_norm": 0.879156231880188, "learning_rate": 0.00016874999999999998, "loss": 15.9278, "step": 288, "throughput": 13364.992559591625 }, { "epoch": 0.005015713289289101, "grad_norm": 0.7019696235656738, "learning_rate": 0.00018749999999999998, "loss": 15.2188, "step": 320, "throughput": 13738.019263643286 }, { "epoch": 0.005517284618218011, "grad_norm": 0.5537405014038086, "learning_rate": 0.00020624999999999997, "loss": 14.6855, "step": 352, "throughput": 13864.274977038647 }, { "epoch": 0.006018855947146921, "grad_norm": 0.5422670841217041, "learning_rate": 0.000225, "loss": 14.2397, "step": 384, "throughput": 14152.243480340532 }, { "epoch": 0.006520427276075831, "grad_norm": 0.5699282288551331, "learning_rate": 0.00024375, "loss": 13.9064, "step": 416, "throughput": 14404.992949736901 }, { "epoch": 0.007021998605004741, "grad_norm": 0.4603062868118286, "learning_rate": 0.0002625, "loss": 13.6128, "step": 448, "throughput": 14623.56986825748 }, { "epoch": 0.007523569933933652, "grad_norm": 0.4180799424648285, "learning_rate": 0.00028125, "loss": 13.3493, "step": 480, "throughput": 14824.190947537965 }, { "epoch": 0.008025141262862561, "grad_norm": 0.4132392108440399, "learning_rate": 0.0003, "loss": 13.1644, "step": 512, "throughput": 15004.16560878327 }, { "epoch": 0.008526712591791472, "grad_norm": 0.3716597557067871, "learning_rate": 0.00029999972162979993, "loss": 12.9831, "step": 544, "throughput": 15166.039125342944 }, { "epoch": 0.009028283920720382, "grad_norm": 0.31194815039634705, "learning_rate": 0.00029999888652034774, "loss": 12.7736, "step": 576, "throughput": 15312.97548988371 }, { "epoch": 0.009529855249649291, "grad_norm": 0.27463942766189575, "learning_rate": 0.00029999749467508744, "loss": 12.6429, "step": 608, "throughput": 15447.179519135016 }, { "epoch": 0.010031426578578202, "grad_norm": 0.35916781425476074, "learning_rate": 0.0002999955460997589, "loss": 12.4571, "step": 640, "throughput": 15515.372038285244 }, { "epoch": 0.010532997907507112, "grad_norm": 0.2248268574476242, "learning_rate": 0.0002999930408023982, "loss": 12.3474, "step": 672, "throughput": 15565.854800125042 }, { "epoch": 0.011034569236436023, "grad_norm": 0.2440202385187149, "learning_rate": 0.00029998997879333714, "loss": 12.2072, "step": 704, "throughput": 15673.428153679059 }, { "epoch": 0.011536140565364932, "grad_norm": 0.22323162853717804, "learning_rate": 0.0002999863600852034, "loss": 12.0949, "step": 736, "throughput": 15772.819065073185 }, { "epoch": 0.012037711894293842, "grad_norm": 0.21207238733768463, "learning_rate": 0.0002999821846929206, "loss": 12.0205, "step": 768, "throughput": 15861.48978900848 }, { "epoch": 0.012539283223222753, "grad_norm": 0.21534381806850433, "learning_rate": 0.000299977452633708, "loss": 11.9246, "step": 800, "throughput": 15947.834714894536 }, { "epoch": 0.013040854552151662, "grad_norm": 0.1891532838344574, "learning_rate": 0.00029997216392708075, "loss": 11.8366, "step": 832, "throughput": 16028.38745709301 }, { "epoch": 0.013542425881080573, "grad_norm": 0.26148706674575806, "learning_rate": 0.00029996631859484943, "loss": 11.7597, "step": 864, "throughput": 16103.565898562283 }, { "epoch": 0.014043997210009483, "grad_norm": 0.19636370241641998, "learning_rate": 0.00029995991666112014, "loss": 11.6889, "step": 896, "throughput": 16174.401229490417 }, { "epoch": 0.014545568538938392, "grad_norm": 0.3416622281074524, "learning_rate": 0.0002999529581522946, "loss": 11.622, "step": 928, "throughput": 16233.68864359473 }, { "epoch": 0.015047139867867303, "grad_norm": 0.18073995411396027, "learning_rate": 0.0002999454430970696, "loss": 11.5889, "step": 960, "throughput": 16210.364274523525 }, { "epoch": 0.015548711196796213, "grad_norm": 0.1570938527584076, "learning_rate": 0.0002999373715264373, "loss": 11.5215, "step": 992, "throughput": 16271.492628215825 }, { "epoch": 0.016050282525725122, "grad_norm": 0.20865876972675323, "learning_rate": 0.0002999287434736849, "loss": 11.4671, "step": 1024, "throughput": 16329.189967431246 }, { "epoch": 0.016551853854654033, "grad_norm": 0.15938478708267212, "learning_rate": 0.0002999195589743945, "loss": 11.4095, "step": 1056, "throughput": 16380.195823306925 }, { "epoch": 0.017053425183582945, "grad_norm": 0.16895660758018494, "learning_rate": 0.000299909818066443, "loss": 11.3584, "step": 1088, "throughput": 16431.633737801927 }, { "epoch": 0.017554996512511852, "grad_norm": 0.21328414976596832, "learning_rate": 0.00029989952079000195, "loss": 11.3362, "step": 1120, "throughput": 16480.600441523216 }, { "epoch": 0.018056567841440763, "grad_norm": 0.17861202359199524, "learning_rate": 0.0002998886671875373, "loss": 11.2748, "step": 1152, "throughput": 16527.308305360624 }, { "epoch": 0.018558139170369675, "grad_norm": 0.1585921049118042, "learning_rate": 0.0002998772573038094, "loss": 11.2309, "step": 1184, "throughput": 16571.695879414754 }, { "epoch": 0.019059710499298582, "grad_norm": 0.18928000330924988, "learning_rate": 0.0002998652911858726, "loss": 11.1846, "step": 1216, "throughput": 16613.799461460218 }, { "epoch": 0.019561281828227493, "grad_norm": 0.15984398126602173, "learning_rate": 0.00029985276888307524, "loss": 11.1471, "step": 1248, "throughput": 16609.892953842762 }, { "epoch": 0.020062853157156404, "grad_norm": 0.21660014986991882, "learning_rate": 0.00029983969044705927, "loss": 11.1291, "step": 1280, "throughput": 16622.300930347366 }, { "epoch": 0.020564424486085316, "grad_norm": 0.14777784049510956, "learning_rate": 0.0002998260559317603, "loss": 11.0892, "step": 1312, "throughput": 16660.521579844877 }, { "epoch": 0.021065995815014223, "grad_norm": 0.14789365231990814, "learning_rate": 0.00029981186539340703, "loss": 11.0426, "step": 1344, "throughput": 16697.111978019177 }, { "epoch": 0.021567567143943134, "grad_norm": 0.14859367907047272, "learning_rate": 0.0002997971188905213, "loss": 11.0245, "step": 1376, "throughput": 16729.518871853692 }, { "epoch": 0.022069138472872046, "grad_norm": 0.15714970231056213, "learning_rate": 0.0002997818164839178, "loss": 10.9909, "step": 1408, "throughput": 16762.959924093906 }, { "epoch": 0.022570709801800953, "grad_norm": 0.15366333723068237, "learning_rate": 0.00029976595823670354, "loss": 10.9599, "step": 1440, "throughput": 16795.25507278253 }, { "epoch": 0.023072281130729864, "grad_norm": 0.12857754528522491, "learning_rate": 0.0002997495442142781, "loss": 10.9525, "step": 1472, "throughput": 16826.31322241198 }, { "epoch": 0.023573852459658776, "grad_norm": 0.14877088367938995, "learning_rate": 0.000299732574484333, "loss": 10.9111, "step": 1504, "throughput": 16856.042827265162 }, { "epoch": 0.024075423788587683, "grad_norm": 0.13609924912452698, "learning_rate": 0.0002997150491168514, "loss": 10.8869, "step": 1536, "throughput": 16860.23517737338 }, { "epoch": 0.024576995117516594, "grad_norm": 0.13336826860904694, "learning_rate": 0.0002996969681841079, "loss": 10.8515, "step": 1568, "throughput": 16849.334342204478 }, { "epoch": 0.025078566446445506, "grad_norm": 0.1425756961107254, "learning_rate": 0.0002996783317606684, "loss": 10.8343, "step": 1600, "throughput": 16876.970940638224 }, { "epoch": 0.025580137775374417, "grad_norm": 0.12339065968990326, "learning_rate": 0.0002996591399233895, "loss": 10.8043, "step": 1632, "throughput": 16903.458674886315 }, { "epoch": 0.026081709104303324, "grad_norm": 0.1654420793056488, "learning_rate": 0.00029963939275141855, "loss": 10.7661, "step": 1664, "throughput": 16926.983914307886 }, { "epoch": 0.026583280433232236, "grad_norm": 0.1435118317604065, "learning_rate": 0.00029961909032619275, "loss": 10.7588, "step": 1696, "throughput": 16951.641196906367 }, { "epoch": 0.027084851762161147, "grad_norm": 0.13442976772785187, "learning_rate": 0.00029959823273143947, "loss": 10.7364, "step": 1728, "throughput": 16975.48019723633 }, { "epoch": 0.027586423091090054, "grad_norm": 0.13002510368824005, "learning_rate": 0.0002995768200531755, "loss": 10.7439, "step": 1760, "throughput": 16998.619261377895 }, { "epoch": 0.028087994420018966, "grad_norm": 0.13215093314647675, "learning_rate": 0.00029955485237970675, "loss": 10.7027, "step": 1792, "throughput": 17020.925154476437 }, { "epoch": 0.028589565748947877, "grad_norm": 0.11973442882299423, "learning_rate": 0.00029953232980162793, "loss": 10.6846, "step": 1824, "throughput": 17042.518070188875 }, { "epoch": 0.029091137077876784, "grad_norm": 0.1303834766149521, "learning_rate": 0.0002995092524118223, "loss": 10.6449, "step": 1856, "throughput": 17016.161749213952 }, { "epoch": 0.029592708406805696, "grad_norm": 0.12708888947963715, "learning_rate": 0.00029948562030546107, "loss": 10.6539, "step": 1888, "throughput": 17035.112974831827 }, { "epoch": 0.030094279735734607, "grad_norm": 0.12121839076280594, "learning_rate": 0.00029946143358000306, "loss": 10.6242, "step": 1920, "throughput": 17055.38715248678 }, { "epoch": 0.030595851064663518, "grad_norm": 0.11469029635190964, "learning_rate": 0.0002994366923351945, "loss": 10.6108, "step": 1952, "throughput": 17075.084240308945 }, { "epoch": 0.031097422393592426, "grad_norm": 0.12286818772554398, "learning_rate": 0.00029941139667306817, "loss": 10.5874, "step": 1984, "throughput": 17092.458306452812 }, { "epoch": 0.03159899372252133, "grad_norm": 0.1339081972837448, "learning_rate": 0.00029938554669794364, "loss": 10.5631, "step": 2016, "throughput": 17111.09462915212 }, { "epoch": 0.032100565051450244, "grad_norm": 0.11653994768857956, "learning_rate": 0.00029935914251642625, "loss": 10.5505, "step": 2048, "throughput": 17129.118733264822 }, { "epoch": 0.032602136380379156, "grad_norm": 0.11728750914335251, "learning_rate": 0.0002993321842374069, "loss": 10.5368, "step": 2080, "throughput": 17125.597353447316 }, { "epoch": 0.03310370770930807, "grad_norm": 0.14113257825374603, "learning_rate": 0.00029930467197206156, "loss": 10.507, "step": 2112, "throughput": 17142.736351591233 }, { "epoch": 0.03360527903823698, "grad_norm": 0.1468958556652069, "learning_rate": 0.000299276605833851, "loss": 10.4866, "step": 2144, "throughput": 17138.43663722723 }, { "epoch": 0.03410685036716589, "grad_norm": 0.11644583195447922, "learning_rate": 0.00029924798593851994, "loss": 10.4893, "step": 2176, "throughput": 17133.26269190115 }, { "epoch": 0.0346084216960948, "grad_norm": 0.13621202111244202, "learning_rate": 0.00029921881240409703, "loss": 10.4784, "step": 2208, "throughput": 17149.698906784317 }, { "epoch": 0.035109993025023704, "grad_norm": 0.13037540018558502, "learning_rate": 0.00029918908535089394, "loss": 10.4713, "step": 2240, "throughput": 17165.729847041897 }, { "epoch": 0.035611564353952616, "grad_norm": 0.10882680118083954, "learning_rate": 0.00029915880490150515, "loss": 10.4456, "step": 2272, "throughput": 17179.659850846612 }, { "epoch": 0.03611313568288153, "grad_norm": 0.1198781281709671, "learning_rate": 0.0002991279711808072, "loss": 10.4441, "step": 2304, "throughput": 17194.730227287117 }, { "epoch": 0.03661470701181044, "grad_norm": 0.123746857047081, "learning_rate": 0.0002990965843159587, "loss": 10.4084, "step": 2336, "throughput": 17209.49343914183 }, { "epoch": 0.03711627834073935, "grad_norm": 0.11193118989467621, "learning_rate": 0.000299064644436399, "loss": 10.4024, "step": 2368, "throughput": 17223.8604387402 }, { "epoch": 0.03761784966966826, "grad_norm": 0.12986549735069275, "learning_rate": 0.0002990321516738482, "loss": 10.3658, "step": 2400, "throughput": 17234.548263588465 }, { "epoch": 0.038119420998597164, "grad_norm": 0.1199018731713295, "learning_rate": 0.00029899910616230674, "loss": 10.3805, "step": 2432, "throughput": 17245.132224434677 }, { "epoch": 0.038620992327526076, "grad_norm": 0.11385921388864517, "learning_rate": 0.0002989655080380543, "loss": 10.3796, "step": 2464, "throughput": 17225.296922944664 }, { "epoch": 0.03912256365645499, "grad_norm": 0.11526582390069962, "learning_rate": 0.0002989313574396496, "loss": 10.3479, "step": 2496, "throughput": 17236.683727834174 }, { "epoch": 0.0396241349853839, "grad_norm": 0.12292210012674332, "learning_rate": 0.00029889665450792983, "loss": 10.3344, "step": 2528, "throughput": 17249.837851508317 }, { "epoch": 0.04012570631431281, "grad_norm": 0.11053690314292908, "learning_rate": 0.0002988613993860101, "loss": 10.327, "step": 2560, "throughput": 17262.691844135206 }, { "epoch": 0.04062727764324172, "grad_norm": 0.12507599592208862, "learning_rate": 0.0002988255922192825, "loss": 10.318, "step": 2592, "throughput": 17273.835852105618 }, { "epoch": 0.04112884897217063, "grad_norm": 0.14252890646457672, "learning_rate": 0.000298789233155416, "loss": 10.305, "step": 2624, "throughput": 17286.01479157816 }, { "epoch": 0.041630420301099536, "grad_norm": 0.12239658832550049, "learning_rate": 0.0002987523223443554, "loss": 10.2977, "step": 2656, "throughput": 17298.06003124777 }, { "epoch": 0.04213199163002845, "grad_norm": 0.11500866711139679, "learning_rate": 0.000298714859938321, "loss": 10.2824, "step": 2688, "throughput": 17306.5386805577 }, { "epoch": 0.04263356295895736, "grad_norm": 0.13773761689662933, "learning_rate": 0.0002986768460918079, "loss": 10.269, "step": 2720, "throughput": 17318.022733030513 }, { "epoch": 0.04313513428788627, "grad_norm": 0.11034831404685974, "learning_rate": 0.0002986382809615853, "loss": 10.2613, "step": 2752, "throughput": 17310.182294333277 }, { "epoch": 0.04363670561681518, "grad_norm": 0.12043263763189316, "learning_rate": 0.00029859916470669596, "loss": 10.2641, "step": 2784, "throughput": 17305.36764489267 }, { "epoch": 0.04413827694574409, "grad_norm": 0.12668395042419434, "learning_rate": 0.0002985594974884554, "loss": 10.2143, "step": 2816, "throughput": 17316.46325404571 }, { "epoch": 0.044639848274673, "grad_norm": 0.12551531195640564, "learning_rate": 0.00029851927947045136, "loss": 10.2288, "step": 2848, "throughput": 17327.313985046396 }, { "epoch": 0.04514141960360191, "grad_norm": 0.1118423193693161, "learning_rate": 0.000298478510818543, "loss": 10.2037, "step": 2880, "throughput": 17336.701168332987 }, { "epoch": 0.04564299093253082, "grad_norm": 0.11913536489009857, "learning_rate": 0.0002984371917008604, "loss": 10.1908, "step": 2912, "throughput": 17347.173504797236 }, { "epoch": 0.04614456226145973, "grad_norm": 0.11208291351795197, "learning_rate": 0.0002983953222878037, "loss": 10.2091, "step": 2944, "throughput": 17357.365218601015 }, { "epoch": 0.04664613359038864, "grad_norm": 0.13201284408569336, "learning_rate": 0.0002983529027520426, "loss": 10.1697, "step": 2976, "throughput": 17367.380788820446 }, { "epoch": 0.04714770491931755, "grad_norm": 0.10959289968013763, "learning_rate": 0.0002983099332685153, "loss": 10.1807, "step": 3008, "throughput": 17374.438712058905 }, { "epoch": 0.04764927624824646, "grad_norm": 0.12129059433937073, "learning_rate": 0.000298266414014428, "loss": 10.1717, "step": 3040, "throughput": 17374.21131079343 }, { "epoch": 0.04815084757717537, "grad_norm": 0.11781725287437439, "learning_rate": 0.0002982223451692544, "loss": 10.1645, "step": 3072, "throughput": 17362.983274354043 }, { "epoch": 0.04865241890610428, "grad_norm": 0.11948370188474655, "learning_rate": 0.0002981777269147344, "loss": 10.1535, "step": 3104, "throughput": 17372.49721677286 }, { "epoch": 0.04915399023503319, "grad_norm": 0.11986897140741348, "learning_rate": 0.0002981325594348739, "loss": 10.1651, "step": 3136, "throughput": 17381.819105870687 }, { "epoch": 0.0496555615639621, "grad_norm": 0.1096222773194313, "learning_rate": 0.00029808684291594373, "loss": 10.1223, "step": 3168, "throughput": 17391.02283950976 }, { "epoch": 0.05015713289289101, "grad_norm": 0.11586567759513855, "learning_rate": 0.0002980405775464789, "loss": 10.1181, "step": 3200, "throughput": 17398.88477420086 }, { "epoch": 0.05065870422181992, "grad_norm": 0.11325585842132568, "learning_rate": 0.00029799376351727797, "loss": 10.1027, "step": 3232, "throughput": 17407.772020075172 }, { "epoch": 0.051160275550748834, "grad_norm": 0.11115550249814987, "learning_rate": 0.00029794640102140206, "loss": 10.0934, "step": 3264, "throughput": 17416.48282303885 }, { "epoch": 0.05166184687967774, "grad_norm": 0.1690579205751419, "learning_rate": 0.00029789849025417433, "loss": 10.0908, "step": 3296, "throughput": 17422.67145275401 }, { "epoch": 0.05216341820860665, "grad_norm": 0.1091977134346962, "learning_rate": 0.0002978500314131789, "loss": 10.1244, "step": 3328, "throughput": 17431.123336344346 }, { "epoch": 0.05266498953753556, "grad_norm": 0.12021326273679733, "learning_rate": 0.00029780102469826014, "loss": 10.0776, "step": 3360, "throughput": 17418.5921345674 }, { "epoch": 0.05316656086646447, "grad_norm": 0.11450007557868958, "learning_rate": 0.00029775147031152195, "loss": 10.0661, "step": 3392, "throughput": 17419.855544322516 }, { "epoch": 0.05366813219539338, "grad_norm": 0.11600250005722046, "learning_rate": 0.0002977013684573267, "loss": 10.068, "step": 3424, "throughput": 17428.08526860908 }, { "epoch": 0.054169703524322294, "grad_norm": 0.1143857091665268, "learning_rate": 0.0002976507193422946, "loss": 10.0618, "step": 3456, "throughput": 17436.174986582257 }, { "epoch": 0.0546712748532512, "grad_norm": 0.11679524928331375, "learning_rate": 0.00029759952317530284, "loss": 10.0712, "step": 3488, "throughput": 17443.11412749548 }, { "epoch": 0.05517284618218011, "grad_norm": 0.10890108346939087, "learning_rate": 0.0002975477801674845, "loss": 10.0198, "step": 3520, "throughput": 17449.86598475358 }, { "epoch": 0.05567441751110902, "grad_norm": 0.12132527679204941, "learning_rate": 0.00029749549053222784, "loss": 10.0404, "step": 3552, "throughput": 17457.54716352314 }, { "epoch": 0.05617598884003793, "grad_norm": 0.10533833503723145, "learning_rate": 0.0002974426544851755, "loss": 10.0162, "step": 3584, "throughput": 17461.716367238747 }, { "epoch": 0.05667756016896684, "grad_norm": 0.1043761745095253, "learning_rate": 0.00029738927224422354, "loss": 10.019, "step": 3616, "throughput": 17469.163920337327 }, { "epoch": 0.057179131497895753, "grad_norm": 0.12880338728427887, "learning_rate": 0.0002973353440295205, "loss": 9.9935, "step": 3648, "throughput": 17467.133660286712 }, { "epoch": 0.057680702826824665, "grad_norm": 0.11385416239500046, "learning_rate": 0.0002972808700634664, "loss": 9.9976, "step": 3680, "throughput": 17457.92126652086 }, { "epoch": 0.05818227415575357, "grad_norm": 0.10410229116678238, "learning_rate": 0.0002972258505707121, "loss": 9.9902, "step": 3712, "throughput": 17465.242572055144 }, { "epoch": 0.05868384548468248, "grad_norm": 0.11616440117359161, "learning_rate": 0.00029717028577815817, "loss": 9.978, "step": 3744, "throughput": 17472.43441514092 }, { "epoch": 0.05918541681361139, "grad_norm": 0.10885748267173767, "learning_rate": 0.0002971141759149539, "loss": 9.9977, "step": 3776, "throughput": 17479.47152805395 }, { "epoch": 0.0596869881425403, "grad_norm": 0.09921612590551376, "learning_rate": 0.00029705752121249665, "loss": 9.9735, "step": 3808, "throughput": 17485.442570293268 }, { "epoch": 0.060188559471469213, "grad_norm": 0.10691357403993607, "learning_rate": 0.0002970003219044305, "loss": 9.96, "step": 3840, "throughput": 17491.197596983668 }, { "epoch": 0.060690130800398125, "grad_norm": 0.11703182011842728, "learning_rate": 0.0002969425782266455, "loss": 9.9753, "step": 3872, "throughput": 17496.026084267538 }, { "epoch": 0.061191702129327036, "grad_norm": 0.11055783927440643, "learning_rate": 0.0002968842904172769, "loss": 9.9648, "step": 3904, "throughput": 17500.58649689582 }, { "epoch": 0.06169327345825594, "grad_norm": 0.10314410924911499, "learning_rate": 0.00029682545871670375, "loss": 9.9586, "step": 3936, "throughput": 17504.020001952344 }, { "epoch": 0.06219484478718485, "grad_norm": 0.10210820287466049, "learning_rate": 0.0002967660833675481, "loss": 9.9413, "step": 3968, "throughput": 17493.79839046395 }, { "epoch": 0.06269641611611376, "grad_norm": 0.11651694774627686, "learning_rate": 0.0002967061646146741, "loss": 9.9297, "step": 4000, "throughput": 17497.19311474326 }, { "epoch": 0.06319798744504267, "grad_norm": 0.11037289351224899, "learning_rate": 0.00029664570270518685, "loss": 9.9138, "step": 4032, "throughput": 17503.607958066386 }, { "epoch": 0.06369955877397158, "grad_norm": 0.1176404356956482, "learning_rate": 0.00029658469788843147, "loss": 9.9193, "step": 4064, "throughput": 17509.946216930257 }, { "epoch": 0.06420113010290049, "grad_norm": 0.10412602126598358, "learning_rate": 0.00029652315041599203, "loss": 9.908, "step": 4096, "throughput": 17515.229704243564 }, { "epoch": 0.0647027014318294, "grad_norm": 0.10687946528196335, "learning_rate": 0.00029646106054169046, "loss": 9.9116, "step": 4128, "throughput": 17510.8929672248 }, { "epoch": 0.06520427276075831, "grad_norm": 0.1051303818821907, "learning_rate": 0.00029639842852158553, "loss": 9.9078, "step": 4160, "throughput": 17516.986313956397 }, { "epoch": 0.06570584408968723, "grad_norm": 0.09516575187444687, "learning_rate": 0.00029633525461397194, "loss": 9.9009, "step": 4192, "throughput": 17518.336116573944 }, { "epoch": 0.06620741541861613, "grad_norm": 0.10216325521469116, "learning_rate": 0.00029627153907937903, "loss": 9.9047, "step": 4224, "throughput": 17524.402411826053 }, { "epoch": 0.06670898674754504, "grad_norm": 0.09496073424816132, "learning_rate": 0.0002962072821805699, "loss": 9.8684, "step": 4256, "throughput": 17516.880157408814 }, { "epoch": 0.06721055807647396, "grad_norm": 0.09801316261291504, "learning_rate": 0.0002961424841825402, "loss": 9.8765, "step": 4288, "throughput": 17514.939707148635 }, { "epoch": 0.06771212940540286, "grad_norm": 0.10971728712320328, "learning_rate": 0.00029607714535251703, "loss": 9.8709, "step": 4320, "throughput": 17520.7930113957 }, { "epoch": 0.06821370073433178, "grad_norm": 0.10133639723062515, "learning_rate": 0.00029601126595995794, "loss": 9.8745, "step": 4352, "throughput": 17526.298259170235 }, { "epoch": 0.06871527206326068, "grad_norm": 0.11306998878717422, "learning_rate": 0.0002959448462765497, "loss": 9.8557, "step": 4384, "throughput": 17532.004624718895 }, { "epoch": 0.0692168433921896, "grad_norm": 0.09759490936994553, "learning_rate": 0.0002958778865762072, "loss": 9.8712, "step": 4416, "throughput": 17537.640257356656 }, { "epoch": 0.0697184147211185, "grad_norm": 0.10575896501541138, "learning_rate": 0.0002958103871350727, "loss": 9.8513, "step": 4448, "throughput": 17542.25457088177 }, { "epoch": 0.07021998605004741, "grad_norm": 0.10681546479463577, "learning_rate": 0.0002957423482315139, "loss": 9.8529, "step": 4480, "throughput": 17545.144703490852 }, { "epoch": 0.07072155737897633, "grad_norm": 0.1152559444308281, "learning_rate": 0.0002956737701461235, "loss": 9.8385, "step": 4512, "throughput": 17548.74305341976 }, { "epoch": 0.07122312870790523, "grad_norm": 0.10631895065307617, "learning_rate": 0.00029560465316171773, "loss": 9.8269, "step": 4544, "throughput": 17550.01474642025 }, { "epoch": 0.07172470003683415, "grad_norm": 0.10484053939580917, "learning_rate": 0.0002955349975633352, "loss": 9.8415, "step": 4576, "throughput": 17542.430141402656 }, { "epoch": 0.07222627136576305, "grad_norm": 0.12255118787288666, "learning_rate": 0.00029546480363823577, "loss": 9.837, "step": 4608, "throughput": 17545.271449204676 }, { "epoch": 0.07272784269469197, "grad_norm": 0.09655202925205231, "learning_rate": 0.0002953940716758995, "loss": 9.8122, "step": 4640, "throughput": 17550.573230292368 }, { "epoch": 0.07322941402362088, "grad_norm": 0.0936635285615921, "learning_rate": 0.0002953228019680252, "loss": 9.8208, "step": 4672, "throughput": 17555.763552451823 }, { "epoch": 0.07373098535254978, "grad_norm": 0.10081731528043747, "learning_rate": 0.0002952509948085293, "loss": 9.7989, "step": 4704, "throughput": 17560.945999053136 }, { "epoch": 0.0742325566814787, "grad_norm": 0.11177249252796173, "learning_rate": 0.00029517865049354477, "loss": 9.8218, "step": 4736, "throughput": 17564.308658957896 }, { "epoch": 0.0747341280104076, "grad_norm": 0.10516203194856644, "learning_rate": 0.0002951057693214197, "loss": 9.7971, "step": 4768, "throughput": 17568.519454363795 }, { "epoch": 0.07523569933933652, "grad_norm": 0.09306059777736664, "learning_rate": 0.0002950323515927164, "loss": 9.7782, "step": 4800, "throughput": 17570.196634560925 }, { "epoch": 0.07573727066826542, "grad_norm": 0.10436815023422241, "learning_rate": 0.0002949583976102097, "loss": 9.7929, "step": 4832, "throughput": 17574.36304626455 }, { "epoch": 0.07623884199719433, "grad_norm": 0.10673332214355469, "learning_rate": 0.00029488390767888606, "loss": 9.7824, "step": 4864, "throughput": 17568.283545826413 }, { "epoch": 0.07674041332612325, "grad_norm": 0.10187188535928726, "learning_rate": 0.0002948088821059422, "loss": 9.7773, "step": 4896, "throughput": 17567.223444270698 }, { "epoch": 0.07724198465505215, "grad_norm": 0.09946262836456299, "learning_rate": 0.0002947333212007838, "loss": 9.7803, "step": 4928, "throughput": 17571.22188826861 }, { "epoch": 0.07774355598398107, "grad_norm": 0.11062668263912201, "learning_rate": 0.0002946572252750242, "loss": 9.7851, "step": 4960, "throughput": 17575.99999033726 }, { "epoch": 0.07824512731290997, "grad_norm": 0.1022370308637619, "learning_rate": 0.0002945805946424834, "loss": 9.7647, "step": 4992, "throughput": 17580.71820594211 }, { "epoch": 0.07874669864183889, "grad_norm": 0.10585004836320877, "learning_rate": 0.0002945034296191861, "loss": 9.7739, "step": 5024, "throughput": 17585.3697487813 }, { "epoch": 0.0792482699707678, "grad_norm": 0.10841728746891022, "learning_rate": 0.00029442573052336127, "loss": 9.7694, "step": 5056, "throughput": 17588.24348090029 }, { "epoch": 0.0797498412996967, "grad_norm": 0.10807620733976364, "learning_rate": 0.0002943474976754401, "loss": 9.726, "step": 5088, "throughput": 17590.41015953132 }, { "epoch": 0.08025141262862562, "grad_norm": 0.10132047533988953, "learning_rate": 0.0002942687313980552, "loss": 9.7531, "step": 5120, "throughput": 17594.225674820857 }, { "epoch": 0.08075298395755452, "grad_norm": 0.10171724855899811, "learning_rate": 0.0002941894320160389, "loss": 9.7544, "step": 5152, "throughput": 17592.732609094946 }, { "epoch": 0.08125455528648344, "grad_norm": 0.1025429293513298, "learning_rate": 0.00029410959985642205, "loss": 9.7367, "step": 5184, "throughput": 17588.42501942629 }, { "epoch": 0.08175612661541234, "grad_norm": 0.10521358251571655, "learning_rate": 0.0002940292352484327, "loss": 9.7222, "step": 5216, "throughput": 17591.457901596536 }, { "epoch": 0.08225769794434126, "grad_norm": 0.10784083604812622, "learning_rate": 0.0002939483385234948, "loss": 9.7218, "step": 5248, "throughput": 17595.868519367257 }, { "epoch": 0.08275926927327017, "grad_norm": 0.10504278540611267, "learning_rate": 0.0002938669100152266, "loss": 9.7445, "step": 5280, "throughput": 17600.25831414381 }, { "epoch": 0.08326084060219907, "grad_norm": 0.09491508454084396, "learning_rate": 0.00029378495005943954, "loss": 9.7135, "step": 5312, "throughput": 17603.869297250032 }, { "epoch": 0.08376241193112799, "grad_norm": 0.11275453120470047, "learning_rate": 0.00029370245899413677, "loss": 9.7141, "step": 5344, "throughput": 17606.548134823293 }, { "epoch": 0.0842639832600569, "grad_norm": 0.10523293912410736, "learning_rate": 0.0002936194371595116, "loss": 9.7171, "step": 5376, "throughput": 17609.9973105787 }, { "epoch": 0.08476555458898581, "grad_norm": 0.10381247848272324, "learning_rate": 0.00029353588489794636, "loss": 9.707, "step": 5408, "throughput": 17611.932878916054 }, { "epoch": 0.08526712591791472, "grad_norm": 0.08856762945652008, "learning_rate": 0.0002934518025540109, "loss": 9.7049, "step": 5440, "throughput": 17614.62864714934 }, { "epoch": 0.08576869724684363, "grad_norm": 0.09817332029342651, "learning_rate": 0.00029336719047446096, "loss": 9.7237, "step": 5472, "throughput": 17606.804804142805 }, { "epoch": 0.08627026857577254, "grad_norm": 0.10031472146511078, "learning_rate": 0.000293282049008237, "loss": 9.695, "step": 5504, "throughput": 17608.69165022491 }, { "epoch": 0.08677183990470144, "grad_norm": 0.1021127700805664, "learning_rate": 0.00029319637850646273, "loss": 9.6985, "step": 5536, "throughput": 17612.11712008491 }, { "epoch": 0.08727341123363036, "grad_norm": 0.10871855914592743, "learning_rate": 0.0002931101793224435, "loss": 9.7016, "step": 5568, "throughput": 17616.15297625078 }, { "epoch": 0.08777498256255926, "grad_norm": 0.09045758843421936, "learning_rate": 0.0002930234518116651, "loss": 9.7002, "step": 5600, "throughput": 17619.38128133957 }, { "epoch": 0.08827655389148818, "grad_norm": 0.091500423848629, "learning_rate": 0.000292936196331792, "loss": 9.6671, "step": 5632, "throughput": 17622.575227982983 }, { "epoch": 0.08877812522041709, "grad_norm": 0.1074434295296669, "learning_rate": 0.000292848413242666, "loss": 9.6926, "step": 5664, "throughput": 17626.52213908722 }, { "epoch": 0.089279696549346, "grad_norm": 0.09811190515756607, "learning_rate": 0.0002927601029063049, "loss": 9.6708, "step": 5696, "throughput": 17628.319110769124 }, { "epoch": 0.08978126787827491, "grad_norm": 0.1028069257736206, "learning_rate": 0.0002926712656869007, "loss": 9.6649, "step": 5728, "throughput": 17632.157170322804 }, { "epoch": 0.09028283920720381, "grad_norm": 0.09495889395475388, "learning_rate": 0.0002925819019508184, "loss": 9.6708, "step": 5760, "throughput": 17627.799875140157 }, { "epoch": 0.09078441053613273, "grad_norm": 0.08912132680416107, "learning_rate": 0.0002924920120665943, "loss": 9.6776, "step": 5792, "throughput": 17626.529906001404 }, { "epoch": 0.09128598186506164, "grad_norm": 0.10304176807403564, "learning_rate": 0.00029240159640493463, "loss": 9.6722, "step": 5824, "throughput": 17629.08179239365 }, { "epoch": 0.09178755319399055, "grad_norm": 0.09453807771205902, "learning_rate": 0.00029231065533871374, "loss": 9.661, "step": 5856, "throughput": 17632.87142952827 }, { "epoch": 0.09228912452291946, "grad_norm": 0.09665724635124207, "learning_rate": 0.0002922191892429729, "loss": 9.6408, "step": 5888, "throughput": 17635.962107195053 }, { "epoch": 0.09279069585184836, "grad_norm": 0.10424266010522842, "learning_rate": 0.0002921271984949185, "loss": 9.6516, "step": 5920, "throughput": 17638.31696854064 }, { "epoch": 0.09329226718077728, "grad_norm": 0.1087077185511589, "learning_rate": 0.0002920346834739208, "loss": 9.6378, "step": 5952, "throughput": 17641.284236489937 }, { "epoch": 0.09379383850970618, "grad_norm": 0.09325496107339859, "learning_rate": 0.0002919416445615119, "loss": 9.641, "step": 5984, "throughput": 17643.60922814501 }, { "epoch": 0.0942954098386351, "grad_norm": 0.08783965557813644, "learning_rate": 0.0002918480821413846, "loss": 9.6218, "step": 6016, "throughput": 17645.8784832565 }, { "epoch": 0.094796981167564, "grad_norm": 0.09440628439188004, "learning_rate": 0.0002917539965993906, "loss": 9.624, "step": 6048, "throughput": 17646.755073945893 }, { "epoch": 0.09529855249649292, "grad_norm": 0.09520844370126724, "learning_rate": 0.00029165938832353885, "loss": 9.6299, "step": 6080, "throughput": 17641.485981629125 }, { "epoch": 0.09580012382542183, "grad_norm": 0.09581635892391205, "learning_rate": 0.00029156425770399434, "loss": 9.6167, "step": 6112, "throughput": 17643.078829361555 }, { "epoch": 0.09630169515435073, "grad_norm": 0.10582980513572693, "learning_rate": 0.0002914686051330759, "loss": 9.6188, "step": 6144, "throughput": 17645.96076286583 }, { "epoch": 0.09680326648327965, "grad_norm": 0.10588561743497849, "learning_rate": 0.00029137243100525506, "loss": 9.6429, "step": 6176, "throughput": 17642.181888559946 }, { "epoch": 0.09730483781220856, "grad_norm": 0.1067621037364006, "learning_rate": 0.00029127573571715416, "loss": 9.613, "step": 6208, "throughput": 17645.000134548154 }, { "epoch": 0.09780640914113747, "grad_norm": 0.09926958382129669, "learning_rate": 0.00029117851966754495, "loss": 9.6089, "step": 6240, "throughput": 17647.194567637944 }, { "epoch": 0.09830798047006638, "grad_norm": 0.09142431616783142, "learning_rate": 0.00029108078325734666, "loss": 9.6167, "step": 6272, "throughput": 17650.62803497812 }, { "epoch": 0.0988095517989953, "grad_norm": 0.10240490734577179, "learning_rate": 0.0002909825268896245, "loss": 9.6053, "step": 6304, "throughput": 17651.4597674561 }, { "epoch": 0.0993111231279242, "grad_norm": 0.09140589088201523, "learning_rate": 0.000290883750969588, "loss": 9.5989, "step": 6336, "throughput": 17654.249897680278 }, { "epoch": 0.0998126944568531, "grad_norm": 0.1017196998000145, "learning_rate": 0.00029078445590458946, "loss": 9.5878, "step": 6368, "throughput": 17649.12258178832 }, { "epoch": 0.10031426578578202, "grad_norm": 0.0923992320895195, "learning_rate": 0.0002906846421041219, "loss": 9.625, "step": 6400, "throughput": 17648.517260039112 }, { "epoch": 0.10081583711471093, "grad_norm": 0.09159571677446365, "learning_rate": 0.00029058430997981784, "loss": 9.571, "step": 6432, "throughput": 17651.87687688665 }, { "epoch": 0.10131740844363984, "grad_norm": 0.09523748606443405, "learning_rate": 0.0002904834599454472, "loss": 9.576, "step": 6464, "throughput": 17655.226269402574 }, { "epoch": 0.10181897977256875, "grad_norm": 0.09053566306829453, "learning_rate": 0.00029038209241691575, "loss": 9.6027, "step": 6496, "throughput": 17657.85699659635 }, { "epoch": 0.10232055110149767, "grad_norm": 0.10799683630466461, "learning_rate": 0.0002902802078122636, "loss": 9.558, "step": 6528, "throughput": 17659.877539796056 }, { "epoch": 0.10282212243042657, "grad_norm": 0.09402478486299515, "learning_rate": 0.00029017780655166315, "loss": 9.5786, "step": 6560, "throughput": 17662.439138601498 }, { "epoch": 0.10332369375935548, "grad_norm": 0.10318920761346817, "learning_rate": 0.0002900748890574175, "loss": 9.5855, "step": 6592, "throughput": 17664.441358775166 }, { "epoch": 0.1038252650882844, "grad_norm": 0.09061913937330246, "learning_rate": 0.0002899714557539586, "loss": 9.5826, "step": 6624, "throughput": 17666.35326495104 }, { "epoch": 0.1043268364172133, "grad_norm": 0.11016388982534409, "learning_rate": 0.00028986750706784574, "loss": 9.5847, "step": 6656, "throughput": 17663.995249747324 }, { "epoch": 0.10482840774614222, "grad_norm": 0.09011092782020569, "learning_rate": 0.0002897630434277637, "loss": 9.5616, "step": 6688, "throughput": 17661.589803465566 }, { "epoch": 0.10532997907507112, "grad_norm": 0.09123651683330536, "learning_rate": 0.0002896580652645207, "loss": 9.555, "step": 6720, "throughput": 17662.85464981392 }, { "epoch": 0.10583155040400004, "grad_norm": 0.08848977088928223, "learning_rate": 0.00028955257301104714, "loss": 9.5467, "step": 6752, "throughput": 17665.994535257018 }, { "epoch": 0.10633312173292894, "grad_norm": 0.09953072667121887, "learning_rate": 0.00028944656710239337, "loss": 9.538, "step": 6784, "throughput": 17669.094778991184 }, { "epoch": 0.10683469306185785, "grad_norm": 0.08808404952287674, "learning_rate": 0.00028934004797572795, "loss": 9.5748, "step": 6816, "throughput": 17669.885900216705 }, { "epoch": 0.10733626439078676, "grad_norm": 0.09777748584747314, "learning_rate": 0.00028923301607033616, "loss": 9.5246, "step": 6848, "throughput": 17671.560145856194 }, { "epoch": 0.10783783571971567, "grad_norm": 0.10776592046022415, "learning_rate": 0.0002891254718276178, "loss": 9.5752, "step": 6880, "throughput": 17673.816226508443 }, { "epoch": 0.10833940704864459, "grad_norm": 0.08375327289104462, "learning_rate": 0.00028901741569108586, "loss": 9.5443, "step": 6912, "throughput": 17674.9136385758 }, { "epoch": 0.10884097837757349, "grad_norm": 0.0973740741610527, "learning_rate": 0.00028890884810636394, "loss": 9.5495, "step": 6944, "throughput": 17675.986629077877 }, { "epoch": 0.1093425497065024, "grad_norm": 0.08603812754154205, "learning_rate": 0.00028879976952118523, "loss": 9.5444, "step": 6976, "throughput": 17671.644097643108 }, { "epoch": 0.10984412103543131, "grad_norm": 0.08715157955884933, "learning_rate": 0.0002886901803853901, "loss": 9.5646, "step": 7008, "throughput": 17671.419285394746 }, { "epoch": 0.11034569236436022, "grad_norm": 0.08621218055486679, "learning_rate": 0.00028858008115092445, "loss": 9.5315, "step": 7040, "throughput": 17674.205815983045 }, { "epoch": 0.11084726369328914, "grad_norm": 0.10020724684000015, "learning_rate": 0.0002884694722718378, "loss": 9.5324, "step": 7072, "throughput": 17676.921071529414 }, { "epoch": 0.11134883502221804, "grad_norm": 0.09769771993160248, "learning_rate": 0.00028835835420428163, "loss": 9.5231, "step": 7104, "throughput": 17678.41386225938 }, { "epoch": 0.11185040635114696, "grad_norm": 0.08455855399370193, "learning_rate": 0.000288246727406507, "loss": 9.5243, "step": 7136, "throughput": 17680.610998948523 }, { "epoch": 0.11235197768007586, "grad_norm": 0.08644996583461761, "learning_rate": 0.00028813459233886335, "loss": 9.514, "step": 7168, "throughput": 17682.161967136737 }, { "epoch": 0.11285354900900477, "grad_norm": 0.10697755962610245, "learning_rate": 0.00028802194946379585, "loss": 9.4924, "step": 7200, "throughput": 17682.65781775484 }, { "epoch": 0.11335512033793368, "grad_norm": 0.09193835407495499, "learning_rate": 0.0002879087992458442, "loss": 9.509, "step": 7232, "throughput": 17685.310101258376 }, { "epoch": 0.11385669166686259, "grad_norm": 0.09078823775053024, "learning_rate": 0.00028779514215164015, "loss": 9.4963, "step": 7264, "throughput": 17680.549920329966 }, { "epoch": 0.11435826299579151, "grad_norm": 0.09351503849029541, "learning_rate": 0.0002876809786499059, "loss": 9.5108, "step": 7296, "throughput": 17680.890002033695 }, { "epoch": 0.11485983432472041, "grad_norm": 0.08877700567245483, "learning_rate": 0.0002875663092114521, "loss": 9.5187, "step": 7328, "throughput": 17682.398823294523 }, { "epoch": 0.11536140565364933, "grad_norm": 0.0977354496717453, "learning_rate": 0.0002874511343091758, "loss": 9.5053, "step": 7360, "throughput": 17685.051078982924 }, { "epoch": 0.11586297698257823, "grad_norm": 0.09951066970825195, "learning_rate": 0.00028733545441805874, "loss": 9.5097, "step": 7392, "throughput": 17687.66218146485 }, { "epoch": 0.11636454831150714, "grad_norm": 0.08795120567083359, "learning_rate": 0.00028721927001516503, "loss": 9.518, "step": 7424, "throughput": 17688.555650311624 }, { "epoch": 0.11686611964043606, "grad_norm": 0.0988912284374237, "learning_rate": 0.00028710258157963955, "loss": 9.5067, "step": 7456, "throughput": 17690.55321163933 }, { "epoch": 0.11736769096936496, "grad_norm": 0.09112031012773514, "learning_rate": 0.00028698538959270577, "loss": 9.5045, "step": 7488, "throughput": 17691.52051917983 }, { "epoch": 0.11786926229829388, "grad_norm": 0.1025838628411293, "learning_rate": 0.00028686769453766366, "loss": 9.5023, "step": 7520, "throughput": 17692.930602701097 }, { "epoch": 0.11837083362722278, "grad_norm": 0.09629788249731064, "learning_rate": 0.00028674949689988814, "loss": 9.4773, "step": 7552, "throughput": 17693.81868079842 }, { "epoch": 0.1188724049561517, "grad_norm": 0.09582464396953583, "learning_rate": 0.00028663079716682654, "loss": 9.4727, "step": 7584, "throughput": 17689.672053808215 }, { "epoch": 0.1193739762850806, "grad_norm": 0.08803921192884445, "learning_rate": 0.00028651159582799695, "loss": 9.4801, "step": 7616, "throughput": 17691.050960385746 }, { "epoch": 0.11987554761400951, "grad_norm": 0.08849391341209412, "learning_rate": 0.000286391893374986, "loss": 9.4867, "step": 7648, "throughput": 17693.53900562614 }, { "epoch": 0.12037711894293843, "grad_norm": 0.087612085044384, "learning_rate": 0.0002862716903014469, "loss": 9.4833, "step": 7680, "throughput": 17696.03728518093 }, { "epoch": 0.12087869027186733, "grad_norm": 0.09365695714950562, "learning_rate": 0.0002861509871030977, "loss": 9.4686, "step": 7712, "throughput": 17697.358179243107 }, { "epoch": 0.12138026160079625, "grad_norm": 0.08572287112474442, "learning_rate": 0.0002860297842777185, "loss": 9.451, "step": 7744, "throughput": 17698.270939143044 }, { "epoch": 0.12188183292972515, "grad_norm": 0.0876513198018074, "learning_rate": 0.00028590808232515025, "loss": 9.4644, "step": 7776, "throughput": 17700.200087927802 }, { "epoch": 0.12238340425865407, "grad_norm": 0.0888097882270813, "learning_rate": 0.00028578588174729214, "loss": 9.4579, "step": 7808, "throughput": 17700.51634556709 }, { "epoch": 0.12288497558758298, "grad_norm": 0.0886104628443718, "learning_rate": 0.0002856631830480997, "loss": 9.4607, "step": 7840, "throughput": 17701.386413357985 }, { "epoch": 0.12338654691651188, "grad_norm": 0.09121379256248474, "learning_rate": 0.0002855399867335827, "loss": 9.4552, "step": 7872, "throughput": 17697.91403017303 }, { "epoch": 0.1238881182454408, "grad_norm": 0.1071939542889595, "learning_rate": 0.0002854162933118032, "loss": 9.4628, "step": 7904, "throughput": 17698.688838735907 }, { "epoch": 0.1243896895743697, "grad_norm": 0.10231801867485046, "learning_rate": 0.0002852921032928732, "loss": 9.4493, "step": 7936, "throughput": 17700.060224950434 }, { "epoch": 0.12489126090329862, "grad_norm": 0.08605187386274338, "learning_rate": 0.0002851674171889526, "loss": 9.4498, "step": 7968, "throughput": 17702.402029974543 }, { "epoch": 0.12539283223222752, "grad_norm": 0.08886358886957169, "learning_rate": 0.0002850422355142474, "loss": 9.4466, "step": 8000, "throughput": 17704.7022624541 }, { "epoch": 0.12589440356115644, "grad_norm": 0.08829627186059952, "learning_rate": 0.00028491655878500716, "loss": 9.4512, "step": 8032, "throughput": 17704.931370729046 }, { "epoch": 0.12639597489008533, "grad_norm": 0.09064590930938721, "learning_rate": 0.0002847903875195231, "loss": 9.4378, "step": 8064, "throughput": 17706.702573880393 }, { "epoch": 0.12689754621901425, "grad_norm": 0.08949542790651321, "learning_rate": 0.00028466372223812575, "loss": 9.4081, "step": 8096, "throughput": 17707.012920911453 }, { "epoch": 0.12739911754794317, "grad_norm": 0.0910944789648056, "learning_rate": 0.0002845365634631833, "loss": 9.4723, "step": 8128, "throughput": 17708.223008042412 }, { "epoch": 0.1279006888768721, "grad_norm": 0.10030682384967804, "learning_rate": 0.0002844089117190988, "loss": 9.4345, "step": 8160, "throughput": 17707.217328548944 }, { "epoch": 0.12840226020580098, "grad_norm": 0.08720734715461731, "learning_rate": 0.0002842807675323085, "loss": 9.4463, "step": 8192, "throughput": 17704.633840665756 }, { "epoch": 0.1289038315347299, "grad_norm": 0.09187284857034683, "learning_rate": 0.00028415213143127935, "loss": 9.4482, "step": 8224, "throughput": 17701.090390522568 }, { "epoch": 0.1294054028636588, "grad_norm": 0.09016137570142746, "learning_rate": 0.00028402300394650697, "loss": 9.4399, "step": 8256, "throughput": 17703.362725097726 }, { "epoch": 0.1299069741925877, "grad_norm": 0.1010221391916275, "learning_rate": 0.0002838933856105136, "loss": 9.4222, "step": 8288, "throughput": 17705.6091619322 }, { "epoch": 0.13040854552151662, "grad_norm": 0.08562653511762619, "learning_rate": 0.0002837632769578455, "loss": 9.4371, "step": 8320, "throughput": 17706.843163637994 }, { "epoch": 0.13091011685044554, "grad_norm": 0.08788871765136719, "learning_rate": 0.00028363267852507133, "loss": 9.4206, "step": 8352, "throughput": 17707.560741404828 }, { "epoch": 0.13141168817937446, "grad_norm": 0.08620458841323853, "learning_rate": 0.0002835015908507793, "loss": 9.4404, "step": 8384, "throughput": 17709.317740773935 }, { "epoch": 0.13191325950830335, "grad_norm": 0.09066160023212433, "learning_rate": 0.0002833700144755753, "loss": 9.414, "step": 8416, "throughput": 17709.55050982401 }, { "epoch": 0.13241483083723227, "grad_norm": 0.0891546979546547, "learning_rate": 0.0002832379499420808, "loss": 9.4323, "step": 8448, "throughput": 17709.75111213038 }, { "epoch": 0.13291640216616118, "grad_norm": 0.09241317957639694, "learning_rate": 0.0002831053977949303, "loss": 9.4121, "step": 8480, "throughput": 17707.765444295415 }, { "epoch": 0.13341797349509008, "grad_norm": 0.08092867583036423, "learning_rate": 0.00028297235858076923, "loss": 9.42, "step": 8512, "throughput": 17707.93522306093 }, { "epoch": 0.133919544824019, "grad_norm": 0.08822212368249893, "learning_rate": 0.0002828388328482517, "loss": 9.4055, "step": 8544, "throughput": 17709.673360506327 }, { "epoch": 0.1344211161529479, "grad_norm": 0.09241366386413574, "learning_rate": 0.0002827048211480383, "loss": 9.4052, "step": 8576, "throughput": 17711.83138890004 }, { "epoch": 0.13492268748187683, "grad_norm": 0.08627436310052872, "learning_rate": 0.00028257032403279354, "loss": 9.4124, "step": 8608, "throughput": 17713.428728467305 }, { "epoch": 0.13542425881080572, "grad_norm": 0.09174727648496628, "learning_rate": 0.00028243534205718405, "loss": 9.4067, "step": 8640, "throughput": 17713.749902611682 }, { "epoch": 0.13592583013973464, "grad_norm": 0.08742675185203552, "learning_rate": 0.00028229987577787585, "loss": 9.4009, "step": 8672, "throughput": 17714.910490991344 }, { "epoch": 0.13642740146866356, "grad_norm": 0.0941416397690773, "learning_rate": 0.00028216392575353225, "loss": 9.381, "step": 8704, "throughput": 17715.589063631214 }, { "epoch": 0.13692897279759245, "grad_norm": 0.09356024861335754, "learning_rate": 0.00028202749254481165, "loss": 9.3877, "step": 8736, "throughput": 17716.64766294759 }, { "epoch": 0.13743054412652136, "grad_norm": 0.0842645913362503, "learning_rate": 0.0002818905767143649, "loss": 9.3993, "step": 8768, "throughput": 17714.34060053566 }, { "epoch": 0.13793211545545028, "grad_norm": 0.09103891253471375, "learning_rate": 0.0002817531788268333, "loss": 9.396, "step": 8800, "throughput": 17713.960152419957 }, { "epoch": 0.1384336867843792, "grad_norm": 0.09892462193965912, "learning_rate": 0.0002816152994488462, "loss": 9.3945, "step": 8832, "throughput": 17715.097766552306 }, { "epoch": 0.1389352581133081, "grad_norm": 0.08912087231874466, "learning_rate": 0.0002814769391490185, "loss": 9.4177, "step": 8864, "throughput": 17717.138805374783 }, { "epoch": 0.139436829442237, "grad_norm": 0.08828168362379074, "learning_rate": 0.0002813380984979486, "loss": 9.3854, "step": 8896, "throughput": 17719.203861962458 }, { "epoch": 0.13993840077116593, "grad_norm": 0.08851828426122665, "learning_rate": 0.00028119877806821557, "loss": 9.3854, "step": 8928, "throughput": 17720.284647074564 }, { "epoch": 0.14043997210009482, "grad_norm": 0.09857714176177979, "learning_rate": 0.00028105897843437746, "loss": 9.3973, "step": 8960, "throughput": 17720.4507104652 }, { "epoch": 0.14094154342902374, "grad_norm": 0.08587909489870071, "learning_rate": 0.0002809187001729683, "loss": 9.4025, "step": 8992, "throughput": 17720.728277247388 }, { "epoch": 0.14144311475795265, "grad_norm": 0.09501481056213379, "learning_rate": 0.00028077794386249604, "loss": 9.3793, "step": 9024, "throughput": 17721.806096925364 }, { "epoch": 0.14194468608688157, "grad_norm": 0.08143424242734909, "learning_rate": 0.0002806367100834401, "loss": 9.374, "step": 9056, "throughput": 17721.533381466965 }, { "epoch": 0.14244625741581046, "grad_norm": 0.09524306654930115, "learning_rate": 0.00028049499941824906, "loss": 9.3819, "step": 9088, "throughput": 17718.38976010663 }, { "epoch": 0.14294782874473938, "grad_norm": 0.08763439953327179, "learning_rate": 0.0002803528124513382, "loss": 9.3596, "step": 9120, "throughput": 17719.890196370536 }, { "epoch": 0.1434494000736683, "grad_norm": 0.08398011326789856, "learning_rate": 0.00028021014976908676, "loss": 9.386, "step": 9152, "throughput": 17721.433472284 }, { "epoch": 0.1439509714025972, "grad_norm": 0.08198576420545578, "learning_rate": 0.0002800670119598363, "loss": 9.3502, "step": 9184, "throughput": 17723.374378071185 }, { "epoch": 0.1444525427315261, "grad_norm": 0.08458232134580612, "learning_rate": 0.0002799233996138874, "loss": 9.3845, "step": 9216, "throughput": 17724.399011975012 }, { "epoch": 0.14495411406045502, "grad_norm": 0.08208563178777695, "learning_rate": 0.00027977931332349786, "loss": 9.3633, "step": 9248, "throughput": 17724.60412540917 }, { "epoch": 0.14545568538938394, "grad_norm": 0.0929563045501709, "learning_rate": 0.00027963475368288006, "loss": 9.3822, "step": 9280, "throughput": 17726.127441229473 }, { "epoch": 0.14595725671831283, "grad_norm": 0.08427176624536514, "learning_rate": 0.00027948972128819823, "loss": 9.3594, "step": 9312, "throughput": 17725.839948487617 }, { "epoch": 0.14645882804724175, "grad_norm": 0.08081962168216705, "learning_rate": 0.0002793442167375665, "loss": 9.354, "step": 9344, "throughput": 17726.891969935805 }, { "epoch": 0.14696039937617067, "grad_norm": 0.08913140743970871, "learning_rate": 0.0002791982406310461, "loss": 9.365, "step": 9376, "throughput": 17723.89602326518 }, { "epoch": 0.14746197070509956, "grad_norm": 0.08514046669006348, "learning_rate": 0.0002790517935706428, "loss": 9.3629, "step": 9408, "throughput": 17724.411805093445 }, { "epoch": 0.14796354203402848, "grad_norm": 0.09198344498872757, "learning_rate": 0.00027890487616030475, "loss": 9.3655, "step": 9440, "throughput": 17725.455035647767 }, { "epoch": 0.1484651133629574, "grad_norm": 0.08742505311965942, "learning_rate": 0.0002787574890059199, "loss": 9.3334, "step": 9472, "throughput": 17727.366510384505 }, { "epoch": 0.1489666846918863, "grad_norm": 0.08325749635696411, "learning_rate": 0.0002786096327153131, "loss": 9.379, "step": 9504, "throughput": 17729.253232463434 }, { "epoch": 0.1494682560208152, "grad_norm": 0.09257443994283676, "learning_rate": 0.00027846130789824437, "loss": 9.3373, "step": 9536, "throughput": 17729.405691552933 }, { "epoch": 0.14996982734974412, "grad_norm": 0.0873776450753212, "learning_rate": 0.00027831251516640553, "loss": 9.3589, "step": 9568, "throughput": 17730.377847042964 }, { "epoch": 0.15047139867867304, "grad_norm": 0.08066736161708832, "learning_rate": 0.00027816325513341835, "loss": 9.3495, "step": 9600, "throughput": 17730.087161016774 }, { "epoch": 0.15097297000760193, "grad_norm": 0.08983741700649261, "learning_rate": 0.0002780135284148315, "loss": 9.3558, "step": 9632, "throughput": 17731.48408846505 }, { "epoch": 0.15147454133653085, "grad_norm": 0.08716747164726257, "learning_rate": 0.00027786333562811855, "loss": 9.3411, "step": 9664, "throughput": 17729.663061758318 }, { "epoch": 0.15197611266545977, "grad_norm": 0.09069626778364182, "learning_rate": 0.00027771267739267494, "loss": 9.3363, "step": 9696, "throughput": 17728.220288648616 }, { "epoch": 0.15247768399438866, "grad_norm": 0.0872938483953476, "learning_rate": 0.0002775615543298157, "loss": 9.3263, "step": 9728, "throughput": 17729.20781459165 }, { "epoch": 0.15297925532331758, "grad_norm": 0.08429323136806488, "learning_rate": 0.0002774099670627728, "loss": 9.344, "step": 9760, "throughput": 17731.034461616182 }, { "epoch": 0.1534808266522465, "grad_norm": 0.08956008404493332, "learning_rate": 0.00027725791621669257, "loss": 9.3532, "step": 9792, "throughput": 17732.87619972755 }, { "epoch": 0.1539823979811754, "grad_norm": 0.08699870854616165, "learning_rate": 0.0002771054024186331, "loss": 9.3539, "step": 9824, "throughput": 17733.819344468047 }, { "epoch": 0.1544839693101043, "grad_norm": 0.08837584406137466, "learning_rate": 0.0002769524262975618, "loss": 9.3225, "step": 9856, "throughput": 17733.988379634797 }, { "epoch": 0.15498554063903322, "grad_norm": 0.08523304760456085, "learning_rate": 0.0002767989884843527, "loss": 9.3234, "step": 9888, "throughput": 17734.564621286165 }, { "epoch": 0.15548711196796214, "grad_norm": 0.0892610102891922, "learning_rate": 0.0002766450896117837, "loss": 9.3348, "step": 9920, "throughput": 17735.113290039546 }, { "epoch": 0.15598868329689103, "grad_norm": 0.08581508696079254, "learning_rate": 0.0002764907303145342, "loss": 9.3469, "step": 9952, "throughput": 17735.232375741027 }, { "epoch": 0.15649025462581995, "grad_norm": 0.08464840799570084, "learning_rate": 0.00027633591122918244, "loss": 9.3182, "step": 9984, "throughput": 17733.241903899874 }, { "epoch": 0.15699182595474886, "grad_norm": 0.08995112776756287, "learning_rate": 0.0002761806329942028, "loss": 9.3363, "step": 10016, "throughput": 17733.762479490895 }, { "epoch": 0.15749339728367778, "grad_norm": 0.08774518221616745, "learning_rate": 0.0002760248962499632, "loss": 9.3135, "step": 10048, "throughput": 17734.737383423486 }, { "epoch": 0.15799496861260667, "grad_norm": 0.09354528039693832, "learning_rate": 0.0002758687016387223, "loss": 9.3394, "step": 10080, "throughput": 17736.51638569282 }, { "epoch": 0.1584965399415356, "grad_norm": 0.08741112053394318, "learning_rate": 0.0002757120498046273, "loss": 9.3364, "step": 10112, "throughput": 17738.275112121057 }, { "epoch": 0.1589981112704645, "grad_norm": 0.09841817617416382, "learning_rate": 0.00027555494139371077, "loss": 9.3244, "step": 10144, "throughput": 17738.358233489358 }, { "epoch": 0.1594996825993934, "grad_norm": 0.09093420952558517, "learning_rate": 0.0002753973770538882, "loss": 9.2967, "step": 10176, "throughput": 17738.822258680775 }, { "epoch": 0.16000125392832232, "grad_norm": 0.09394460171461105, "learning_rate": 0.00027523935743495553, "loss": 9.2852, "step": 10208, "throughput": 17738.97295402453 }, { "epoch": 0.16050282525725124, "grad_norm": 0.08894991129636765, "learning_rate": 0.00027508088318858604, "loss": 9.324, "step": 10240, "throughput": 17739.884708109348 }, { "epoch": 0.16100439658618015, "grad_norm": 0.07993625104427338, "learning_rate": 0.000274921954968328, "loss": 9.3121, "step": 10272, "throughput": 17734.377506200843 }, { "epoch": 0.16150596791510904, "grad_norm": 0.08242437988519669, "learning_rate": 0.0002747625734296019, "loss": 9.3121, "step": 10304, "throughput": 17733.655948355277 }, { "epoch": 0.16200753924403796, "grad_norm": 0.08217553049325943, "learning_rate": 0.00027460273922969757, "loss": 9.338, "step": 10336, "throughput": 17734.54701699429 }, { "epoch": 0.16250911057296688, "grad_norm": 0.08616854250431061, "learning_rate": 0.0002744424530277719, "loss": 9.307, "step": 10368, "throughput": 17736.269304977854 }, { "epoch": 0.16301068190189577, "grad_norm": 0.08809614181518555, "learning_rate": 0.0002742817154848455, "loss": 9.2938, "step": 10400, "throughput": 17737.939030764315 }, { "epoch": 0.1635122532308247, "grad_norm": 0.08409152179956436, "learning_rate": 0.00027412052726380053, "loss": 9.3157, "step": 10432, "throughput": 17738.799611961324 }, { "epoch": 0.1640138245597536, "grad_norm": 0.07925765216350555, "learning_rate": 0.00027395888902937777, "loss": 9.2993, "step": 10464, "throughput": 17739.300958390246 }, { "epoch": 0.16451539588868253, "grad_norm": 0.09361526370048523, "learning_rate": 0.0002737968014481737, "loss": 9.2984, "step": 10496, "throughput": 17739.45763139594 }, { "epoch": 0.16501696721761142, "grad_norm": 0.07672687619924545, "learning_rate": 0.000273634265188638, "loss": 9.2926, "step": 10528, "throughput": 17740.348861311355 }, { "epoch": 0.16551853854654033, "grad_norm": 0.07868822664022446, "learning_rate": 0.0002734712809210706, "loss": 9.2988, "step": 10560, "throughput": 17740.410090470832 }, { "epoch": 0.16602010987546925, "grad_norm": 0.08165296912193298, "learning_rate": 0.00027330784931761925, "loss": 9.2755, "step": 10592, "throughput": 17738.860810980197 }, { "epoch": 0.16652168120439814, "grad_norm": 0.07798882573843002, "learning_rate": 0.0002731439710522763, "loss": 9.2713, "step": 10624, "throughput": 17739.33997914266 }, { "epoch": 0.16702325253332706, "grad_norm": 0.08346995711326599, "learning_rate": 0.00027297964680087617, "loss": 9.2922, "step": 10656, "throughput": 17740.22205949218 }, { "epoch": 0.16752482386225598, "grad_norm": 0.08133242279291153, "learning_rate": 0.0002728148772410926, "loss": 9.3008, "step": 10688, "throughput": 17741.884007279037 }, { "epoch": 0.1680263951911849, "grad_norm": 0.08343921601772308, "learning_rate": 0.0002726496630524358, "loss": 9.316, "step": 10720, "throughput": 17743.119356231484 }, { "epoch": 0.1685279665201138, "grad_norm": 0.08336193114519119, "learning_rate": 0.00027248400491624946, "loss": 9.262, "step": 10752, "throughput": 17743.58509175239 }, { "epoch": 0.1690295378490427, "grad_norm": 0.08541908860206604, "learning_rate": 0.00027231790351570827, "loss": 9.2852, "step": 10784, "throughput": 17744.05931240815 }, { "epoch": 0.16953110917797162, "grad_norm": 0.08818753808736801, "learning_rate": 0.00027215135953581485, "loss": 9.2897, "step": 10816, "throughput": 17743.408459750997 }, { "epoch": 0.1700326805069005, "grad_norm": 0.0927952229976654, "learning_rate": 0.00027198437366339717, "loss": 9.2722, "step": 10848, "throughput": 17744.664686468273 }, { "epoch": 0.17053425183582943, "grad_norm": 0.07848216593265533, "learning_rate": 0.00027181694658710544, "loss": 9.2647, "step": 10880, "throughput": 17742.74908535243 }, { "epoch": 0.17103582316475835, "grad_norm": 0.08359099179506302, "learning_rate": 0.00027164907899740936, "loss": 9.2732, "step": 10912, "throughput": 17742.44229337719 }, { "epoch": 0.17153739449368727, "grad_norm": 0.08627219498157501, "learning_rate": 0.0002714807715865954, "loss": 9.2909, "step": 10944, "throughput": 17743.22525761864 }, { "epoch": 0.17203896582261616, "grad_norm": 0.08329717814922333, "learning_rate": 0.0002713120250487638, "loss": 9.2677, "step": 10976, "throughput": 17744.796276143617 }, { "epoch": 0.17254053715154508, "grad_norm": 0.0816192626953125, "learning_rate": 0.0002711428400798258, "loss": 9.2592, "step": 11008, "throughput": 17746.35511626247 }, { "epoch": 0.173042108480474, "grad_norm": 0.08471696078777313, "learning_rate": 0.00027097321737750075, "loss": 9.2676, "step": 11040, "throughput": 17747.514092651345 }, { "epoch": 0.17354367980940288, "grad_norm": 0.08643633127212524, "learning_rate": 0.00027080315764131316, "loss": 9.2539, "step": 11072, "throughput": 17747.237051226777 }, { "epoch": 0.1740452511383318, "grad_norm": 0.08549510687589645, "learning_rate": 0.0002706326615725898, "loss": 9.266, "step": 11104, "throughput": 17746.947223038034 }, { "epoch": 0.17454682246726072, "grad_norm": 0.09334033727645874, "learning_rate": 0.0002704617298744571, "loss": 9.2492, "step": 11136, "throughput": 17748.528601371454 }, { "epoch": 0.17504839379618964, "grad_norm": 0.08416300266981125, "learning_rate": 0.00027029036325183775, "loss": 9.2546, "step": 11168, "throughput": 17748.02754972764 }, { "epoch": 0.17554996512511853, "grad_norm": 0.07855620980262756, "learning_rate": 0.0002701185624114483, "loss": 9.2792, "step": 11200, "throughput": 17746.755300402074 }, { "epoch": 0.17605153645404745, "grad_norm": 0.08049018681049347, "learning_rate": 0.0002699463280617959, "loss": 9.2799, "step": 11232, "throughput": 17747.988327536557 }, { "epoch": 0.17655310778297637, "grad_norm": 0.09454195201396942, "learning_rate": 0.00026977366091317554, "loss": 9.2456, "step": 11264, "throughput": 17748.456910978788 }, { "epoch": 0.17705467911190526, "grad_norm": 0.07567203044891357, "learning_rate": 0.00026960056167766704, "loss": 9.2549, "step": 11296, "throughput": 17750.01303683647 }, { "epoch": 0.17755625044083417, "grad_norm": 0.08688521385192871, "learning_rate": 0.0002694270310691321, "loss": 9.2444, "step": 11328, "throughput": 17751.15787430835 }, { "epoch": 0.1780578217697631, "grad_norm": 0.09056143462657928, "learning_rate": 0.0002692530698032116, "loss": 9.2426, "step": 11360, "throughput": 17751.207510816206 }, { "epoch": 0.178559393098692, "grad_norm": 0.08120916038751602, "learning_rate": 0.00026907867859732223, "loss": 9.2452, "step": 11392, "throughput": 17751.31664040642 }, { "epoch": 0.1790609644276209, "grad_norm": 0.07514616847038269, "learning_rate": 0.0002689038581706538, "loss": 9.2491, "step": 11424, "throughput": 17751.360375468852 }, { "epoch": 0.17956253575654982, "grad_norm": 0.08787547796964645, "learning_rate": 0.0002687286092441664, "loss": 9.227, "step": 11456, "throughput": 17751.43941832824 }, { "epoch": 0.18006410708547874, "grad_norm": 0.08200137317180634, "learning_rate": 0.00026855293254058693, "loss": 9.2413, "step": 11488, "throughput": 17750.73067845772 }, { "epoch": 0.18056567841440763, "grad_norm": 0.09474249184131622, "learning_rate": 0.0002683768287844068, "loss": 9.2374, "step": 11520, "throughput": 17750.830930191118 }, { "epoch": 0.18106724974333654, "grad_norm": 0.08020896464586258, "learning_rate": 0.0002682002987018783, "loss": 9.258, "step": 11552, "throughput": 17751.605636322933 }, { "epoch": 0.18156882107226546, "grad_norm": 0.08732938766479492, "learning_rate": 0.00026802334302101214, "loss": 9.2477, "step": 11584, "throughput": 17752.74366557252 }, { "epoch": 0.18207039240119435, "grad_norm": 0.08201635628938675, "learning_rate": 0.000267845962471574, "loss": 9.2412, "step": 11616, "throughput": 17754.248561484445 }, { "epoch": 0.18257196373012327, "grad_norm": 0.08246797323226929, "learning_rate": 0.0002676681577850818, "loss": 9.2311, "step": 11648, "throughput": 17754.32764483168 }, { "epoch": 0.1830735350590522, "grad_norm": 0.08599203079938889, "learning_rate": 0.0002674899296948026, "loss": 9.2294, "step": 11680, "throughput": 17754.719613925416 }, { "epoch": 0.1835751063879811, "grad_norm": 0.08059228956699371, "learning_rate": 0.00026731127893574955, "loss": 9.2423, "step": 11712, "throughput": 17754.07333906634 }, { "epoch": 0.18407667771691, "grad_norm": 0.08479016274213791, "learning_rate": 0.00026713220624467894, "loss": 9.2528, "step": 11744, "throughput": 17755.54729374782 }, { "epoch": 0.18457824904583892, "grad_norm": 0.08023982495069504, "learning_rate": 0.00026695271236008703, "loss": 9.2301, "step": 11776, "throughput": 17754.15601091206 }, { "epoch": 0.18507982037476783, "grad_norm": 0.0974150151014328, "learning_rate": 0.00026677279802220726, "loss": 9.2395, "step": 11808, "throughput": 17753.80019657531 }, { "epoch": 0.18558139170369672, "grad_norm": 0.09254009276628494, "learning_rate": 0.00026659246397300673, "loss": 9.229, "step": 11840, "throughput": 17754.905195390864 }, { "epoch": 0.18608296303262564, "grad_norm": 0.0819011852145195, "learning_rate": 0.00026641171095618366, "loss": 9.219, "step": 11872, "throughput": 17755.662459115352 }, { "epoch": 0.18658453436155456, "grad_norm": 0.07982365041971207, "learning_rate": 0.0002662305397171641, "loss": 9.2316, "step": 11904, "throughput": 17757.072989102187 }, { "epoch": 0.18708610569048348, "grad_norm": 0.0868970975279808, "learning_rate": 0.0002660489510030986, "loss": 9.2386, "step": 11936, "throughput": 17758.149529276874 }, { "epoch": 0.18758767701941237, "grad_norm": 0.08935050666332245, "learning_rate": 0.00026586694556285975, "loss": 9.2364, "step": 11968, "throughput": 17758.176495549968 }, { "epoch": 0.1880892483483413, "grad_norm": 0.08183032274246216, "learning_rate": 0.0002656845241470384, "loss": 9.2169, "step": 12000, "throughput": 17757.1726961844 }, { "epoch": 0.1885908196772702, "grad_norm": 0.09355923533439636, "learning_rate": 0.0002655016875079411, "loss": 9.2237, "step": 12032, "throughput": 17758.286084059786 }, { "epoch": 0.1890923910061991, "grad_norm": 0.08387905359268188, "learning_rate": 0.00026531843639958656, "loss": 9.2133, "step": 12064, "throughput": 17758.027452995793 }, { "epoch": 0.189593962335128, "grad_norm": 0.08702760189771652, "learning_rate": 0.00026513477157770303, "loss": 9.2201, "step": 12096, "throughput": 17757.35661152563 }, { "epoch": 0.19009553366405693, "grad_norm": 0.0785302221775055, "learning_rate": 0.0002649506937997248, "loss": 9.2206, "step": 12128, "throughput": 17757.42597187838 }, { "epoch": 0.19059710499298585, "grad_norm": 0.08639495074748993, "learning_rate": 0.00026476620382478896, "loss": 9.2288, "step": 12160, "throughput": 17758.528128623464 }, { "epoch": 0.19109867632191474, "grad_norm": 0.08326099067926407, "learning_rate": 0.0002645813024137329, "loss": 9.231, "step": 12192, "throughput": 17759.527708457354 }, { "epoch": 0.19160024765084366, "grad_norm": 0.09123198688030243, "learning_rate": 0.00026439599032909055, "loss": 9.2303, "step": 12224, "throughput": 17760.933629831925 }, { "epoch": 0.19210181897977258, "grad_norm": 0.08471406996250153, "learning_rate": 0.0002642102683350894, "loss": 9.2284, "step": 12256, "throughput": 17760.95910159503 }, { "epoch": 0.19260339030870147, "grad_norm": 0.09966867417097092, "learning_rate": 0.00026402413719764774, "loss": 9.2111, "step": 12288, "throughput": 17761.297990507992 }, { "epoch": 0.19310496163763038, "grad_norm": 0.08580002933740616, "learning_rate": 0.0002638375976843707, "loss": 9.2135, "step": 12320, "throughput": 17756.89295395398 }, { "epoch": 0.1936065329665593, "grad_norm": 0.08772596716880798, "learning_rate": 0.0002636506505645478, "loss": 9.199, "step": 12352, "throughput": 17757.2471773537 }, { "epoch": 0.19410810429548822, "grad_norm": 0.0860821008682251, "learning_rate": 0.00026346329660914964, "loss": 9.2145, "step": 12384, "throughput": 17756.55196902713 }, { "epoch": 0.1946096756244171, "grad_norm": 0.08097507804632187, "learning_rate": 0.00026327553659082444, "loss": 9.2198, "step": 12416, "throughput": 17756.248802078728 }, { "epoch": 0.19511124695334603, "grad_norm": 0.08351437002420425, "learning_rate": 0.00026308737128389513, "loss": 9.1913, "step": 12448, "throughput": 17757.62806257604 }, { "epoch": 0.19561281828227495, "grad_norm": 0.08402379602193832, "learning_rate": 0.0002628988014643558, "loss": 9.2191, "step": 12480, "throughput": 17758.34908027362 }, { "epoch": 0.19611438961120384, "grad_norm": 0.08989959210157394, "learning_rate": 0.00026270982790986916, "loss": 9.218, "step": 12512, "throughput": 17759.728538211333 }, { "epoch": 0.19661596094013276, "grad_norm": 0.09197131544351578, "learning_rate": 0.00026252045139976254, "loss": 9.199, "step": 12544, "throughput": 17760.454666729 }, { "epoch": 0.19711753226906167, "grad_norm": 0.08772056549787521, "learning_rate": 0.00026233067271502536, "loss": 9.1786, "step": 12576, "throughput": 17760.168491821252 }, { "epoch": 0.1976191035979906, "grad_norm": 0.07788512855768204, "learning_rate": 0.0002621404926383054, "loss": 9.2083, "step": 12608, "throughput": 17759.559491562166 }, { "epoch": 0.19812067492691948, "grad_norm": 0.08134134113788605, "learning_rate": 0.0002619499119539059, "loss": 9.1859, "step": 12640, "throughput": 17760.580955267505 }, { "epoch": 0.1986222462558484, "grad_norm": 0.08216018974781036, "learning_rate": 0.0002617589314477821, "loss": 9.1733, "step": 12672, "throughput": 17760.070863293106 }, { "epoch": 0.19912381758477732, "grad_norm": 0.08295486867427826, "learning_rate": 0.0002615675519075383, "loss": 9.1955, "step": 12704, "throughput": 17758.912070185866 }, { "epoch": 0.1996253889137062, "grad_norm": 0.08604070544242859, "learning_rate": 0.00026137577412242415, "loss": 9.1792, "step": 12736, "throughput": 17759.930171949552 }, { "epoch": 0.20012696024263513, "grad_norm": 0.0887996256351471, "learning_rate": 0.00026118359888333193, "loss": 9.1603, "step": 12768, "throughput": 17760.97519962203 }, { "epoch": 0.20062853157156404, "grad_norm": 0.0748458057641983, "learning_rate": 0.00026099102698279276, "loss": 9.1849, "step": 12800, "throughput": 17761.979821963785 }, { "epoch": 0.20113010290049296, "grad_norm": 0.08445336669683456, "learning_rate": 0.0002607980592149739, "loss": 9.1846, "step": 12832, "throughput": 17762.663567510866 }, { "epoch": 0.20163167422942185, "grad_norm": 0.07447773963212967, "learning_rate": 0.00026060469637567484, "loss": 9.1943, "step": 12864, "throughput": 17763.02923012481 }, { "epoch": 0.20213324555835077, "grad_norm": 0.08066857606172562, "learning_rate": 0.0002604109392623246, "loss": 9.2074, "step": 12896, "throughput": 17762.771752956647 }, { "epoch": 0.2026348168872797, "grad_norm": 0.07313714921474457, "learning_rate": 0.00026021678867397803, "loss": 9.188, "step": 12928, "throughput": 17762.7475778211 }, { "epoch": 0.20313638821620858, "grad_norm": 0.07853822410106659, "learning_rate": 0.00026002224541131274, "loss": 9.1701, "step": 12960, "throughput": 17762.784594845412 }, { "epoch": 0.2036379595451375, "grad_norm": 0.08614935725927353, "learning_rate": 0.00025982731027662575, "loss": 9.1813, "step": 12992, "throughput": 17762.106150200903 }, { "epoch": 0.20413953087406642, "grad_norm": 0.07838352769613266, "learning_rate": 0.00025963198407383015, "loss": 9.1915, "step": 13024, "throughput": 17762.128170875214 }, { "epoch": 0.20464110220299533, "grad_norm": 0.07841797918081284, "learning_rate": 0.0002594362676084517, "loss": 9.1748, "step": 13056, "throughput": 17763.430365428718 }, { "epoch": 0.20514267353192422, "grad_norm": 0.07841688394546509, "learning_rate": 0.0002592401616876258, "loss": 9.1785, "step": 13088, "throughput": 17764.113988290395 }, { "epoch": 0.20564424486085314, "grad_norm": 0.08778318762779236, "learning_rate": 0.00025904366712009374, "loss": 9.1896, "step": 13120, "throughput": 17765.416229046143 }, { "epoch": 0.20614581618978206, "grad_norm": 0.07848533242940903, "learning_rate": 0.00025884678471619976, "loss": 9.1745, "step": 13152, "throughput": 17765.756664442815 }, { "epoch": 0.20664738751871095, "grad_norm": 0.082659512758255, "learning_rate": 0.0002586495152878874, "loss": 9.1685, "step": 13184, "throughput": 17765.91336427311 }, { "epoch": 0.20714895884763987, "grad_norm": 0.08469443023204803, "learning_rate": 0.0002584518596486965, "loss": 9.1722, "step": 13216, "throughput": 17765.018102965587 }, { "epoch": 0.2076505301765688, "grad_norm": 0.07865361869335175, "learning_rate": 0.00025825381861375936, "loss": 9.1817, "step": 13248, "throughput": 17766.303462840322 }, { "epoch": 0.2081521015054977, "grad_norm": 0.0782107338309288, "learning_rate": 0.00025805539299979794, "loss": 9.1819, "step": 13280, "throughput": 17765.005851088183 }, { "epoch": 0.2086536728344266, "grad_norm": 0.07893738895654678, "learning_rate": 0.0002578565836251199, "loss": 9.1728, "step": 13312, "throughput": 17764.704666216006 }, { "epoch": 0.2091552441633555, "grad_norm": 0.08694314956665039, "learning_rate": 0.0002576573913096158, "loss": 9.1774, "step": 13344, "throughput": 17765.994528548148 }, { "epoch": 0.20965681549228443, "grad_norm": 0.07928625494241714, "learning_rate": 0.00025745781687475534, "loss": 9.1696, "step": 13376, "throughput": 17766.961987929495 }, { "epoch": 0.21015838682121332, "grad_norm": 0.0769500657916069, "learning_rate": 0.000257257861143584, "loss": 9.1649, "step": 13408, "throughput": 17768.194245190854 }, { "epoch": 0.21065995815014224, "grad_norm": 0.08769873529672623, "learning_rate": 0.00025705752494071995, "loss": 9.1646, "step": 13440, "throughput": 17768.8347317131 }, { "epoch": 0.21116152947907116, "grad_norm": 0.07801324874162674, "learning_rate": 0.0002568568090923501, "loss": 9.1581, "step": 13472, "throughput": 17768.84446426064 }, { "epoch": 0.21166310080800008, "grad_norm": 0.07609668374061584, "learning_rate": 0.0002566557144262273, "loss": 9.1687, "step": 13504, "throughput": 17768.408898357007 }, { "epoch": 0.21216467213692897, "grad_norm": 0.09769364446401596, "learning_rate": 0.00025645424177166663, "loss": 9.1753, "step": 13536, "throughput": 17769.075062014672 }, { "epoch": 0.21266624346585788, "grad_norm": 0.08589643239974976, "learning_rate": 0.0002562523919595418, "loss": 9.163, "step": 13568, "throughput": 17769.10919087997 }, { "epoch": 0.2131678147947868, "grad_norm": 0.07752927392721176, "learning_rate": 0.0002560501658222821, "loss": 9.1467, "step": 13600, "throughput": 17768.465756585483 }, { "epoch": 0.2136693861237157, "grad_norm": 0.0796670913696289, "learning_rate": 0.0002558475641938686, "loss": 9.1517, "step": 13632, "throughput": 17768.478881713232 }, { "epoch": 0.2141709574526446, "grad_norm": 0.08221141248941422, "learning_rate": 0.00025564458790983114, "loss": 9.1704, "step": 13664, "throughput": 17769.42073647477 }, { "epoch": 0.21467252878157353, "grad_norm": 0.08998142182826996, "learning_rate": 0.0002554412378072445, "loss": 9.1552, "step": 13696, "throughput": 17770.666764438174 }, { "epoch": 0.21517410011050242, "grad_norm": 0.07725099474191666, "learning_rate": 0.0002552375147247251, "loss": 9.1454, "step": 13728, "throughput": 17771.883556423592 }, { "epoch": 0.21567567143943134, "grad_norm": 0.07587715983390808, "learning_rate": 0.0002550334195024275, "loss": 9.1491, "step": 13760, "throughput": 17771.352719333623 }, { "epoch": 0.21617724276836026, "grad_norm": 0.07606406509876251, "learning_rate": 0.00025482895298204096, "loss": 9.1445, "step": 13792, "throughput": 17771.476023673233 }, { "epoch": 0.21667881409728917, "grad_norm": 0.08447536081075668, "learning_rate": 0.0002546241160067861, "loss": 9.1462, "step": 13824, "throughput": 17771.204977664245 }, { "epoch": 0.21718038542621806, "grad_norm": 0.08394841849803925, "learning_rate": 0.00025441890942141124, "loss": 9.1635, "step": 13856, "throughput": 17771.853567391616 }, { "epoch": 0.21768195675514698, "grad_norm": 0.07255728542804718, "learning_rate": 0.00025421333407218884, "loss": 9.159, "step": 13888, "throughput": 17771.198852016052 }, { "epoch": 0.2181835280840759, "grad_norm": 0.0749092549085617, "learning_rate": 0.0002540073908069124, "loss": 9.1475, "step": 13920, "throughput": 17770.585107716557 }, { "epoch": 0.2186850994130048, "grad_norm": 0.0870911180973053, "learning_rate": 0.0002538010804748924, "loss": 9.1077, "step": 13952, "throughput": 17771.804267479227 }, { "epoch": 0.2191866707419337, "grad_norm": 0.08277792483568192, "learning_rate": 0.0002535944039269533, "loss": 9.1564, "step": 13984, "throughput": 17772.738838380068 }, { "epoch": 0.21968824207086263, "grad_norm": 0.07313404232263565, "learning_rate": 0.0002533873620154299, "loss": 9.1437, "step": 14016, "throughput": 17773.947692743845 }, { "epoch": 0.22018981339979155, "grad_norm": 0.09033776819705963, "learning_rate": 0.0002531799555941635, "loss": 9.1409, "step": 14048, "throughput": 17774.267928277182 }, { "epoch": 0.22069138472872044, "grad_norm": 0.07937667518854141, "learning_rate": 0.00025297218551849885, "loss": 9.1185, "step": 14080, "throughput": 17774.06221653404 }, { "epoch": 0.22119295605764935, "grad_norm": 0.08419618755578995, "learning_rate": 0.00025276405264528044, "loss": 9.1448, "step": 14112, "throughput": 17773.220136563268 }, { "epoch": 0.22169452738657827, "grad_norm": 0.07552050799131393, "learning_rate": 0.00025255555783284877, "loss": 9.1397, "step": 14144, "throughput": 17774.375940588343 }, { "epoch": 0.22219609871550716, "grad_norm": 0.07970742881298065, "learning_rate": 0.0002523467019410371, "loss": 9.1421, "step": 14176, "throughput": 17774.247620828053 }, { "epoch": 0.22269767004443608, "grad_norm": 0.08620994538068771, "learning_rate": 0.00025213748583116776, "loss": 9.1541, "step": 14208, "throughput": 17773.484717298426 }, { "epoch": 0.223199241373365, "grad_norm": 0.07768417149782181, "learning_rate": 0.0002519279103660486, "loss": 9.1098, "step": 14240, "throughput": 17773.73404081062 }, { "epoch": 0.22370081270229392, "grad_norm": 0.0926952138543129, "learning_rate": 0.0002517179764099694, "loss": 9.1067, "step": 14272, "throughput": 17774.638398139723 }, { "epoch": 0.2242023840312228, "grad_norm": 0.08634334802627563, "learning_rate": 0.00025150768482869846, "loss": 9.1326, "step": 14304, "throughput": 17775.821062374456 }, { "epoch": 0.22470395536015172, "grad_norm": 0.07841115444898605, "learning_rate": 0.0002512970364894789, "loss": 9.1293, "step": 14336, "throughput": 17776.7217812672 }, { "epoch": 0.22520552668908064, "grad_norm": 0.08082298189401627, "learning_rate": 0.00025108603226102515, "loss": 9.135, "step": 14368, "throughput": 17773.678824021947 }, { "epoch": 0.22570709801800953, "grad_norm": 0.07413540780544281, "learning_rate": 0.0002508746730135191, "loss": 9.1279, "step": 14400, "throughput": 17773.24651322649 }, { "epoch": 0.22620866934693845, "grad_norm": 0.08838255703449249, "learning_rate": 0.00025066295961860704, "loss": 9.1154, "step": 14432, "throughput": 17773.554209760005 }, { "epoch": 0.22671024067586737, "grad_norm": 0.08930821716785431, "learning_rate": 0.0002504508929493957, "loss": 9.1236, "step": 14464, "throughput": 17773.91288923869 }, { "epoch": 0.2272118120047963, "grad_norm": 0.0821206346154213, "learning_rate": 0.00025023847388044846, "loss": 9.1102, "step": 14496, "throughput": 17773.57951423693 }, { "epoch": 0.22771338333372518, "grad_norm": 0.08669077605009079, "learning_rate": 0.0002500257032877823, "loss": 9.1255, "step": 14528, "throughput": 17772.976340971953 }, { "epoch": 0.2282149546626541, "grad_norm": 0.08576954901218414, "learning_rate": 0.0002498125820488639, "loss": 9.1264, "step": 14560, "throughput": 17774.128793335887 }, { "epoch": 0.22871652599158301, "grad_norm": 0.08853522688150406, "learning_rate": 0.00024959911104260565, "loss": 9.1215, "step": 14592, "throughput": 17775.021770221105 }, { "epoch": 0.2292180973205119, "grad_norm": 0.08846239745616913, "learning_rate": 0.00024938529114936273, "loss": 9.133, "step": 14624, "throughput": 17776.182280312998 }, { "epoch": 0.22971966864944082, "grad_norm": 0.08885422348976135, "learning_rate": 0.000249171123250929, "loss": 9.127, "step": 14656, "throughput": 17776.489694815533 }, { "epoch": 0.23022123997836974, "grad_norm": 0.082110695540905, "learning_rate": 0.00024895660823053353, "loss": 9.1152, "step": 14688, "throughput": 17776.58724845879 }, { "epoch": 0.23072281130729866, "grad_norm": 0.07240453362464905, "learning_rate": 0.00024874174697283685, "loss": 9.1367, "step": 14720, "throughput": 17775.82717047677 }, { "epoch": 0.23122438263622755, "grad_norm": 0.07088429480791092, "learning_rate": 0.0002485265403639275, "loss": 9.1151, "step": 14752, "throughput": 17776.735107879464 }, { "epoch": 0.23172595396515647, "grad_norm": 0.07287611812353134, "learning_rate": 0.0002483109892913181, "loss": 9.1342, "step": 14784, "throughput": 17776.719121417333 }, { "epoch": 0.23222752529408539, "grad_norm": 0.08337391912937164, "learning_rate": 0.0002480950946439419, "loss": 9.1155, "step": 14816, "throughput": 17775.983219174625 }, { "epoch": 0.23272909662301428, "grad_norm": 0.08093888312578201, "learning_rate": 0.0002478788573121491, "loss": 9.0951, "step": 14848, "throughput": 17776.824699384837 }, { "epoch": 0.2332306679519432, "grad_norm": 0.0842365100979805, "learning_rate": 0.0002476622781877031, "loss": 9.0956, "step": 14880, "throughput": 17777.474284685857 }, { "epoch": 0.2337322392808721, "grad_norm": 0.07578255981206894, "learning_rate": 0.0002474453581637769, "loss": 9.112, "step": 14912, "throughput": 17778.586747839425 }, { "epoch": 0.23423381060980103, "grad_norm": 0.07239358872175217, "learning_rate": 0.00024722809813494933, "loss": 9.1123, "step": 14944, "throughput": 17779.466436439874 }, { "epoch": 0.23473538193872992, "grad_norm": 0.0819094255566597, "learning_rate": 0.00024701049899720123, "loss": 9.1053, "step": 14976, "throughput": 17779.444766337834 }, { "epoch": 0.23523695326765884, "grad_norm": 0.08021069318056107, "learning_rate": 0.0002467925616479122, "loss": 9.1304, "step": 15008, "throughput": 17779.002898580602 }, { "epoch": 0.23573852459658776, "grad_norm": 0.08905062824487686, "learning_rate": 0.0002465742869858566, "loss": 9.1156, "step": 15040, "throughput": 17779.548275387704 }, { "epoch": 0.23624009592551665, "grad_norm": 0.07807237654924393, "learning_rate": 0.0002463556759111996, "loss": 9.0916, "step": 15072, "throughput": 17779.240741012156 }, { "epoch": 0.23674166725444556, "grad_norm": 0.08749555796384811, "learning_rate": 0.00024613672932549403, "loss": 9.1059, "step": 15104, "throughput": 17778.825360570343 }, { "epoch": 0.23724323858337448, "grad_norm": 0.07883931696414948, "learning_rate": 0.00024591744813167625, "loss": 9.0943, "step": 15136, "throughput": 17779.06169946807 }, { "epoch": 0.2377448099123034, "grad_norm": 0.07457397878170013, "learning_rate": 0.00024569783323406255, "loss": 9.0915, "step": 15168, "throughput": 17779.64674349068 }, { "epoch": 0.2382463812412323, "grad_norm": 0.07606396824121475, "learning_rate": 0.00024547788553834536, "loss": 9.1019, "step": 15200, "throughput": 17780.496551560656 }, { "epoch": 0.2387479525701612, "grad_norm": 0.08174512535333633, "learning_rate": 0.00024525760595158977, "loss": 9.1145, "step": 15232, "throughput": 17781.56913269253 }, { "epoch": 0.23924952389909013, "grad_norm": 0.07189542055130005, "learning_rate": 0.0002450369953822293, "loss": 9.102, "step": 15264, "throughput": 17781.8745032226 }, { "epoch": 0.23975109522801902, "grad_norm": 0.07562036067247391, "learning_rate": 0.0002448160547400627, "loss": 9.0934, "step": 15296, "throughput": 17781.990741428832 }, { "epoch": 0.24025266655694794, "grad_norm": 0.07461810111999512, "learning_rate": 0.00024459478493624973, "loss": 9.0946, "step": 15328, "throughput": 17781.57031575691 }, { "epoch": 0.24075423788587685, "grad_norm": 0.07911355048418045, "learning_rate": 0.0002443731868833078, "loss": 9.097, "step": 15360, "throughput": 17781.693753365977 }, { "epoch": 0.24125580921480577, "grad_norm": 0.07777303457260132, "learning_rate": 0.0002441512614951079, "loss": 9.0662, "step": 15392, "throughput": 17781.314575011525 }, { "epoch": 0.24175738054373466, "grad_norm": 0.07520275563001633, "learning_rate": 0.00024392900968687103, "loss": 9.1051, "step": 15424, "throughput": 17781.036177664922 }, { "epoch": 0.24225895187266358, "grad_norm": 0.07553788274526596, "learning_rate": 0.00024370643237516426, "loss": 9.0889, "step": 15456, "throughput": 17782.035424535097 }, { "epoch": 0.2427605232015925, "grad_norm": 0.07838892936706543, "learning_rate": 0.00024348353047789708, "loss": 9.109, "step": 15488, "throughput": 17782.28704776579 }, { "epoch": 0.2432620945305214, "grad_norm": 0.0778832957148552, "learning_rate": 0.0002432603049143176, "loss": 9.1041, "step": 15520, "throughput": 17783.362660835668 }, { "epoch": 0.2437636658594503, "grad_norm": 0.07655086368322372, "learning_rate": 0.0002430367566050087, "loss": 9.0952, "step": 15552, "throughput": 17784.176518185708 }, { "epoch": 0.24426523718837923, "grad_norm": 0.08006106317043304, "learning_rate": 0.00024281288647188425, "loss": 9.0803, "step": 15584, "throughput": 17784.009025098036 }, { "epoch": 0.24476680851730814, "grad_norm": 0.07509180903434753, "learning_rate": 0.00024258869543818535, "loss": 9.0623, "step": 15616, "throughput": 17783.405734593525 }, { "epoch": 0.24526837984623703, "grad_norm": 0.07064583152532578, "learning_rate": 0.00024236418442847652, "loss": 9.1093, "step": 15648, "throughput": 17784.47659002254 }, { "epoch": 0.24576995117516595, "grad_norm": 0.0784299373626709, "learning_rate": 0.0002421393543686418, "loss": 9.0985, "step": 15680, "throughput": 17784.183071227122 }, { "epoch": 0.24627152250409487, "grad_norm": 0.07890692353248596, "learning_rate": 0.00024191420618588103, "loss": 9.0849, "step": 15712, "throughput": 17784.028872968156 }, { "epoch": 0.24677309383302376, "grad_norm": 0.07363320887088776, "learning_rate": 0.000241688740808706, "loss": 9.0832, "step": 15744, "throughput": 17784.2727540409 }, { "epoch": 0.24727466516195268, "grad_norm": 0.0745813325047493, "learning_rate": 0.0002414629591669366, "loss": 9.084, "step": 15776, "throughput": 17784.80146127646 }, { "epoch": 0.2477762364908816, "grad_norm": 0.07824543118476868, "learning_rate": 0.0002412368621916969, "loss": 9.0837, "step": 15808, "throughput": 17785.59480057255 }, { "epoch": 0.2482778078198105, "grad_norm": 0.08016502857208252, "learning_rate": 0.0002410104508154116, "loss": 9.0857, "step": 15840, "throughput": 17786.62932005166 }, { "epoch": 0.2487793791487394, "grad_norm": 0.08209695667028427, "learning_rate": 0.00024078372597180183, "loss": 9.0813, "step": 15872, "throughput": 17786.653180076366 }, { "epoch": 0.24928095047766832, "grad_norm": 0.0860067754983902, "learning_rate": 0.00024055668859588157, "loss": 9.062, "step": 15904, "throughput": 17786.153065215716 }, { "epoch": 0.24978252180659724, "grad_norm": 0.07782717794179916, "learning_rate": 0.0002403293396239536, "loss": 9.0747, "step": 15936, "throughput": 17786.550824802533 }, { "epoch": 0.25028409313552613, "grad_norm": 0.08756222575902939, "learning_rate": 0.00024010167999360575, "loss": 9.0941, "step": 15968, "throughput": 17786.37941455417 }, { "epoch": 0.25078566446445505, "grad_norm": 0.07098475843667984, "learning_rate": 0.00023987371064370698, "loss": 9.0685, "step": 16000, "throughput": 17786.590935662654 }, { "epoch": 0.25128723579338397, "grad_norm": 0.07569174468517303, "learning_rate": 0.00023964543251440363, "loss": 9.0477, "step": 16032, "throughput": 17786.305093450927 }, { "epoch": 0.2517888071223129, "grad_norm": 0.08683817833662033, "learning_rate": 0.00023941684654711534, "loss": 9.0784, "step": 16064, "throughput": 17787.327044746777 }, { "epoch": 0.2522903784512418, "grad_norm": 0.07383278012275696, "learning_rate": 0.0002391879536845313, "loss": 9.075, "step": 16096, "throughput": 17787.623320426683 }, { "epoch": 0.25279194978017067, "grad_norm": 0.08801043033599854, "learning_rate": 0.0002389587548706064, "loss": 9.0607, "step": 16128, "throughput": 17788.640494622367 }, { "epoch": 0.2532935211090996, "grad_norm": 0.07545439898967743, "learning_rate": 0.0002387292510505572, "loss": 9.0544, "step": 16160, "throughput": 17789.424450237166 }, { "epoch": 0.2537950924380285, "grad_norm": 0.08476603776216507, "learning_rate": 0.00023849944317085812, "loss": 9.0844, "step": 16192, "throughput": 17788.941098313495 }, { "epoch": 0.2542966637669574, "grad_norm": 0.08219964802265167, "learning_rate": 0.0002382693321792376, "loss": 9.0726, "step": 16224, "throughput": 17788.35583765858 }, { "epoch": 0.25479823509588634, "grad_norm": 0.07952090352773666, "learning_rate": 0.00023803891902467406, "loss": 9.0836, "step": 16256, "throughput": 17789.353238515407 }, { "epoch": 0.25529980642481526, "grad_norm": 0.07744229584932327, "learning_rate": 0.0002378082046573919, "loss": 9.0545, "step": 16288, "throughput": 17788.79759968533 }, { "epoch": 0.2558013777537442, "grad_norm": 0.07623753696680069, "learning_rate": 0.00023757719002885793, "loss": 9.0517, "step": 16320, "throughput": 17788.39778661232 }, { "epoch": 0.25630294908267304, "grad_norm": 0.07981929183006287, "learning_rate": 0.00023734587609177725, "loss": 9.0688, "step": 16352, "throughput": 17789.159511362217 }, { "epoch": 0.25680452041160196, "grad_norm": 0.08757334202528, "learning_rate": 0.000237114263800089, "loss": 9.0724, "step": 16384, "throughput": 17789.704377586713 }, { "epoch": 0.2573060917405309, "grad_norm": 0.07343178987503052, "learning_rate": 0.0002368823541089632, "loss": 9.0719, "step": 16416, "throughput": 17788.00287521869 }, { "epoch": 0.2578076630694598, "grad_norm": 0.07238256186246872, "learning_rate": 0.00023665014797479602, "loss": 9.0621, "step": 16448, "throughput": 17789.03773857688 }, { "epoch": 0.2583092343983887, "grad_norm": 0.07762811332941055, "learning_rate": 0.00023641764635520617, "loss": 9.0527, "step": 16480, "throughput": 17788.906456268425 }, { "epoch": 0.2588108057273176, "grad_norm": 0.0722971260547638, "learning_rate": 0.0002361848502090311, "loss": 9.0536, "step": 16512, "throughput": 17788.41421709581 }, { "epoch": 0.25931237705624655, "grad_norm": 0.07550311833620071, "learning_rate": 0.0002359517604963228, "loss": 9.0705, "step": 16544, "throughput": 17788.812063779653 }, { "epoch": 0.2598139483851754, "grad_norm": 0.08055978268384933, "learning_rate": 0.0002357183781783439, "loss": 9.0531, "step": 16576, "throughput": 17788.431551853282 }, { "epoch": 0.2603155197141043, "grad_norm": 0.08193892985582352, "learning_rate": 0.0002354847042175638, "loss": 9.0396, "step": 16608, "throughput": 17788.409081313366 }, { "epoch": 0.26081709104303324, "grad_norm": 0.07315579056739807, "learning_rate": 0.0002352507395776546, "loss": 9.0484, "step": 16640, "throughput": 17788.41089261413 }, { "epoch": 0.26131866237196216, "grad_norm": 0.07734058052301407, "learning_rate": 0.00023501648522348715, "loss": 9.0634, "step": 16672, "throughput": 17789.22038944418 }, { "epoch": 0.2618202337008911, "grad_norm": 0.07491450756788254, "learning_rate": 0.0002347819421211271, "loss": 9.0661, "step": 16704, "throughput": 17789.977336385524 }, { "epoch": 0.26232180502982, "grad_norm": 0.07738371938467026, "learning_rate": 0.00023454711123783092, "loss": 9.0493, "step": 16736, "throughput": 17790.974012187293 }, { "epoch": 0.2628233763587489, "grad_norm": 0.07885603606700897, "learning_rate": 0.00023431199354204192, "loss": 9.0511, "step": 16768, "throughput": 17791.490265431854 }, { "epoch": 0.2633249476876778, "grad_norm": 0.07946806401014328, "learning_rate": 0.00023407659000338607, "loss": 9.0613, "step": 16800, "throughput": 17791.105061305985 }, { "epoch": 0.2638265190166067, "grad_norm": 0.07436434924602509, "learning_rate": 0.00023384090159266833, "loss": 9.056, "step": 16832, "throughput": 17790.75878448174 }, { "epoch": 0.2643280903455356, "grad_norm": 0.07219117879867554, "learning_rate": 0.00023360492928186838, "loss": 9.0472, "step": 16864, "throughput": 17791.095584204028 }, { "epoch": 0.26482966167446453, "grad_norm": 0.0752549022436142, "learning_rate": 0.00023336867404413674, "loss": 9.0551, "step": 16896, "throughput": 17790.32914453402 }, { "epoch": 0.26533123300339345, "grad_norm": 0.07986687868833542, "learning_rate": 0.0002331321368537907, "loss": 9.0342, "step": 16928, "throughput": 17790.30320047984 }, { "epoch": 0.26583280433232237, "grad_norm": 0.07936165481805801, "learning_rate": 0.0002328953186863103, "loss": 9.0429, "step": 16960, "throughput": 17791.25445204999 }, { "epoch": 0.2663343756612513, "grad_norm": 0.07726788520812988, "learning_rate": 0.00023265822051833442, "loss": 9.0546, "step": 16992, "throughput": 17792.094337143873 }, { "epoch": 0.26683594699018015, "grad_norm": 0.07570113986730576, "learning_rate": 0.00023242084332765662, "loss": 9.0216, "step": 17024, "throughput": 17792.85045569564 }, { "epoch": 0.26733751831910907, "grad_norm": 0.07200782001018524, "learning_rate": 0.0002321831880932211, "loss": 9.0211, "step": 17056, "throughput": 17793.602824151738 }, { "epoch": 0.267839089648038, "grad_norm": 0.0791451558470726, "learning_rate": 0.00023194525579511876, "loss": 9.0484, "step": 17088, "throughput": 17793.50755081012 }, { "epoch": 0.2683406609769669, "grad_norm": 0.07487615942955017, "learning_rate": 0.00023170704741458308, "loss": 9.0559, "step": 17120, "throughput": 17793.055755669182 }, { "epoch": 0.2688422323058958, "grad_norm": 0.0783098042011261, "learning_rate": 0.00023146856393398615, "loss": 9.0623, "step": 17152, "throughput": 17793.38732326507 }, { "epoch": 0.26934380363482474, "grad_norm": 0.07308696955442429, "learning_rate": 0.0002312298063368346, "loss": 9.05, "step": 17184, "throughput": 17792.883408825306 }, { "epoch": 0.26984537496375366, "grad_norm": 0.080522820353508, "learning_rate": 0.00023099077560776536, "loss": 9.0426, "step": 17216, "throughput": 17792.804267937678 }, { "epoch": 0.2703469462926825, "grad_norm": 0.07421231269836426, "learning_rate": 0.00023075147273254195, "loss": 9.0254, "step": 17248, "throughput": 17793.020427664535 }, { "epoch": 0.27084851762161144, "grad_norm": 0.0802403911948204, "learning_rate": 0.0002305118986980501, "loss": 9.0529, "step": 17280, "throughput": 17793.81386712117 }, { "epoch": 0.27135008895054036, "grad_norm": 0.07557597011327744, "learning_rate": 0.00023027205449229388, "loss": 9.0459, "step": 17312, "throughput": 17794.53885751692 }, { "epoch": 0.2718516602794693, "grad_norm": 0.07545724511146545, "learning_rate": 0.00023003194110439145, "loss": 9.0391, "step": 17344, "throughput": 17795.47098513452 }, { "epoch": 0.2723532316083982, "grad_norm": 0.07856345921754837, "learning_rate": 0.00022979155952457118, "loss": 9.0403, "step": 17376, "throughput": 17795.81009923486 }, { "epoch": 0.2728548029373271, "grad_norm": 0.08439384400844574, "learning_rate": 0.00022955091074416733, "loss": 9.0333, "step": 17408, "throughput": 17795.18094316347 }, { "epoch": 0.27335637426625603, "grad_norm": 0.06949391961097717, "learning_rate": 0.0002293099957556163, "loss": 9.0498, "step": 17440, "throughput": 17795.422632099973 }, { "epoch": 0.2738579455951849, "grad_norm": 0.07778200507164001, "learning_rate": 0.00022906881555245212, "loss": 8.9999, "step": 17472, "throughput": 17795.314290608025 }, { "epoch": 0.2743595169241138, "grad_norm": 0.07648744434118271, "learning_rate": 0.0002288273711293028, "loss": 9.0106, "step": 17504, "throughput": 17795.054320179566 }, { "epoch": 0.27486108825304273, "grad_norm": 0.07675327360630035, "learning_rate": 0.00022858566348188568, "loss": 9.0532, "step": 17536, "throughput": 17795.260115755576 }, { "epoch": 0.27536265958197165, "grad_norm": 0.07408589869737625, "learning_rate": 0.00022834369360700394, "loss": 9.0316, "step": 17568, "throughput": 17796.207604447347 }, { "epoch": 0.27586423091090057, "grad_norm": 0.08209360390901566, "learning_rate": 0.00022810146250254196, "loss": 9.0422, "step": 17600, "throughput": 17797.01285422199 }, { "epoch": 0.2763658022398295, "grad_norm": 0.08367981016635895, "learning_rate": 0.00022785897116746166, "loss": 9.0116, "step": 17632, "throughput": 17797.561393204396 }, { "epoch": 0.2768673735687584, "grad_norm": 0.07375655323266983, "learning_rate": 0.00022761622060179793, "loss": 9.0219, "step": 17664, "throughput": 17798.27372285632 }, { "epoch": 0.27736894489768726, "grad_norm": 0.08418888598680496, "learning_rate": 0.00022737321180665488, "loss": 9.0296, "step": 17696, "throughput": 17798.215734831665 }, { "epoch": 0.2778705162266162, "grad_norm": 0.07620551437139511, "learning_rate": 0.00022712994578420143, "loss": 9.0558, "step": 17728, "throughput": 17797.602086991534 }, { "epoch": 0.2783720875555451, "grad_norm": 0.07998618483543396, "learning_rate": 0.00022688642353766746, "loss": 9.0022, "step": 17760, "throughput": 17797.891019187224 }, { "epoch": 0.278873658884474, "grad_norm": 0.08344753086566925, "learning_rate": 0.00022664264607133937, "loss": 9.0276, "step": 17792, "throughput": 17797.38464950342 }, { "epoch": 0.27937523021340294, "grad_norm": 0.08326306939125061, "learning_rate": 0.00022639861439055617, "loss": 9.0268, "step": 17824, "throughput": 17797.49075609581 }, { "epoch": 0.27987680154233185, "grad_norm": 0.08046148717403412, "learning_rate": 0.00022615432950170528, "loss": 9.0105, "step": 17856, "throughput": 17797.925671450626 }, { "epoch": 0.2803783728712608, "grad_norm": 0.07936503738164902, "learning_rate": 0.00022590979241221825, "loss": 9.0122, "step": 17888, "throughput": 17798.723556141715 }, { "epoch": 0.28087994420018964, "grad_norm": 0.0750616043806076, "learning_rate": 0.00022566500413056677, "loss": 8.9999, "step": 17920, "throughput": 17799.426296168647 }, { "epoch": 0.28138151552911855, "grad_norm": 0.08700020611286163, "learning_rate": 0.00022541996566625841, "loss": 9.027, "step": 17952, "throughput": 17800.17336464227 }, { "epoch": 0.28188308685804747, "grad_norm": 0.0861596092581749, "learning_rate": 0.00022517467802983266, "loss": 9.0228, "step": 17984, "throughput": 17800.27227871042 }, { "epoch": 0.2823846581869764, "grad_norm": 0.18219846487045288, "learning_rate": 0.0002249291422328563, "loss": 9.0184, "step": 18016, "throughput": 17799.87955037822 }, { "epoch": 0.2828862295159053, "grad_norm": 0.07688847929239273, "learning_rate": 0.00022468335928791977, "loss": 9.0133, "step": 18048, "throughput": 17799.79042344303 }, { "epoch": 0.2833878008448342, "grad_norm": 0.07489870488643646, "learning_rate": 0.00022443733020863262, "loss": 9.002, "step": 18080, "throughput": 17799.522150841196 }, { "epoch": 0.28388937217376314, "grad_norm": 0.07628989219665527, "learning_rate": 0.00022419105600961955, "loss": 9.0073, "step": 18112, "throughput": 17799.63748606668 }, { "epoch": 0.284390943502692, "grad_norm": 0.08326411247253418, "learning_rate": 0.00022394453770651607, "loss": 9.017, "step": 18144, "throughput": 17799.836091760237 }, { "epoch": 0.2848925148316209, "grad_norm": 0.07968247681856155, "learning_rate": 0.00022369777631596436, "loss": 8.9849, "step": 18176, "throughput": 17800.613434453368 }, { "epoch": 0.28539408616054984, "grad_norm": 0.07687773555517197, "learning_rate": 0.00022345077285560914, "loss": 9.0194, "step": 18208, "throughput": 17801.295220398395 }, { "epoch": 0.28589565748947876, "grad_norm": 0.07676286995410919, "learning_rate": 0.00022320352834409343, "loss": 9.0141, "step": 18240, "throughput": 17802.02864196113 }, { "epoch": 0.2863972288184077, "grad_norm": 0.08311989158391953, "learning_rate": 0.0002229560438010543, "loss": 8.9896, "step": 18272, "throughput": 17802.725834806886 }, { "epoch": 0.2868988001473366, "grad_norm": 0.08136092871427536, "learning_rate": 0.00022270832024711882, "loss": 9.0212, "step": 18304, "throughput": 17802.193303962875 }, { "epoch": 0.2874003714762655, "grad_norm": 0.06818056106567383, "learning_rate": 0.00022246035870389952, "loss": 8.9862, "step": 18336, "throughput": 17801.6183171822 }, { "epoch": 0.2879019428051944, "grad_norm": 0.0861053392291069, "learning_rate": 0.00022221216019399067, "loss": 9.0066, "step": 18368, "throughput": 17801.87988758641 }, { "epoch": 0.2884035141341233, "grad_norm": 0.07756201177835464, "learning_rate": 0.00022196372574096357, "loss": 8.9965, "step": 18400, "throughput": 17801.849397082373 }, { "epoch": 0.2889050854630522, "grad_norm": 0.08043848723173141, "learning_rate": 0.00022171505636936272, "loss": 9.0147, "step": 18432, "throughput": 17801.73744307053 }, { "epoch": 0.28940665679198113, "grad_norm": 0.07838977873325348, "learning_rate": 0.00022146615310470125, "loss": 9.0194, "step": 18464, "throughput": 17800.34864497776 }, { "epoch": 0.28990822812091005, "grad_norm": 0.07929672300815582, "learning_rate": 0.0002212170169734571, "loss": 9.0049, "step": 18496, "throughput": 17801.112254143158 }, { "epoch": 0.29040979944983897, "grad_norm": 0.08226138353347778, "learning_rate": 0.0002209676490030683, "loss": 8.9909, "step": 18528, "throughput": 17801.7744118373 }, { "epoch": 0.2909113707787679, "grad_norm": 0.0832921713590622, "learning_rate": 0.0002207180502219291, "loss": 9.0125, "step": 18560, "throughput": 17802.297882049992 }, { "epoch": 0.29141294210769675, "grad_norm": 0.07955853641033173, "learning_rate": 0.00022046822165938565, "loss": 8.9916, "step": 18592, "throughput": 17802.42861817214 }, { "epoch": 0.29191451343662567, "grad_norm": 0.07068558037281036, "learning_rate": 0.00022021816434573168, "loss": 8.9989, "step": 18624, "throughput": 17801.93784729312 }, { "epoch": 0.2924160847655546, "grad_norm": 0.07164032757282257, "learning_rate": 0.0002199678793122043, "loss": 9.0068, "step": 18656, "throughput": 17801.901509302465 }, { "epoch": 0.2929176560944835, "grad_norm": 0.07843668013811111, "learning_rate": 0.0002197173675909797, "loss": 8.9828, "step": 18688, "throughput": 17801.66895807248 }, { "epoch": 0.2934192274234124, "grad_norm": 0.07328581064939499, "learning_rate": 0.00021946663021516895, "loss": 9.0062, "step": 18720, "throughput": 17801.961052826777 }, { "epoch": 0.29392079875234134, "grad_norm": 0.08058661222457886, "learning_rate": 0.0002192156682188138, "loss": 9.017, "step": 18752, "throughput": 17802.15378270491 }, { "epoch": 0.29442237008127026, "grad_norm": 0.07562687247991562, "learning_rate": 0.00021896448263688224, "loss": 8.9795, "step": 18784, "throughput": 17802.902654737656 }, { "epoch": 0.2949239414101991, "grad_norm": 0.07606592029333115, "learning_rate": 0.00021871307450526428, "loss": 8.9991, "step": 18816, "throughput": 17803.556862594272 }, { "epoch": 0.29542551273912804, "grad_norm": 0.0791717916727066, "learning_rate": 0.00021846144486076794, "loss": 8.9762, "step": 18848, "throughput": 17804.259838788174 }, { "epoch": 0.29592708406805696, "grad_norm": 0.07694504410028458, "learning_rate": 0.00021820959474111448, "loss": 8.9946, "step": 18880, "throughput": 17804.901312659757 }, { "epoch": 0.2964286553969859, "grad_norm": 0.07806772738695145, "learning_rate": 0.00021795752518493462, "loss": 8.9957, "step": 18912, "throughput": 17804.04277097383 }, { "epoch": 0.2969302267259148, "grad_norm": 0.07575007528066635, "learning_rate": 0.0002177052372317639, "loss": 8.9875, "step": 18944, "throughput": 17803.77740896515 }, { "epoch": 0.2974317980548437, "grad_norm": 0.06974538415670395, "learning_rate": 0.00021745273192203871, "loss": 8.979, "step": 18976, "throughput": 17803.878554230883 }, { "epoch": 0.2979333693837726, "grad_norm": 0.07206161320209503, "learning_rate": 0.00021720001029709152, "loss": 8.9952, "step": 19008, "throughput": 17803.84715675061 }, { "epoch": 0.2984349407127015, "grad_norm": 0.07531093060970306, "learning_rate": 0.00021694707339914722, "loss": 8.9861, "step": 19040, "throughput": 17804.01661455889 }, { "epoch": 0.2989365120416304, "grad_norm": 0.07219666242599487, "learning_rate": 0.00021669392227131816, "loss": 8.9951, "step": 19072, "throughput": 17804.87759250451 }, { "epoch": 0.2994380833705593, "grad_norm": 0.07833381742238998, "learning_rate": 0.0002164405579576005, "loss": 9.0, "step": 19104, "throughput": 17805.427632362112 }, { "epoch": 0.29993965469948825, "grad_norm": 0.07842086255550385, "learning_rate": 0.0002161869815028694, "loss": 8.9848, "step": 19136, "throughput": 17805.883723570336 }, { "epoch": 0.30044122602841716, "grad_norm": 0.0783037543296814, "learning_rate": 0.00021593319395287483, "loss": 8.9859, "step": 19168, "throughput": 17806.746396470768 }, { "epoch": 0.3009427973573461, "grad_norm": 0.07206156104803085, "learning_rate": 0.0002156791963542374, "loss": 8.9808, "step": 19200, "throughput": 17806.462975098348 }, { "epoch": 0.30144436868627494, "grad_norm": 0.07156243175268173, "learning_rate": 0.00021542498975444404, "loss": 8.9698, "step": 19232, "throughput": 17806.145865857052 }, { "epoch": 0.30194594001520386, "grad_norm": 0.07291711866855621, "learning_rate": 0.0002151705752018435, "loss": 8.9754, "step": 19264, "throughput": 17806.223016558117 }, { "epoch": 0.3024475113441328, "grad_norm": 0.07339274883270264, "learning_rate": 0.0002149159537456421, "loss": 8.9839, "step": 19296, "throughput": 17805.957976066922 }, { "epoch": 0.3029490826730617, "grad_norm": 0.07498586177825928, "learning_rate": 0.00021466112643589948, "loss": 8.9323, "step": 19328, "throughput": 17806.27011152514 }, { "epoch": 0.3034506540019906, "grad_norm": 0.07056141644716263, "learning_rate": 0.00021440609432352427, "loss": 8.9969, "step": 19360, "throughput": 17806.671457033535 }, { "epoch": 0.30395222533091953, "grad_norm": 0.07289967685937881, "learning_rate": 0.00021415085846026961, "loss": 8.991, "step": 19392, "throughput": 17807.179791054605 }, { "epoch": 0.30445379665984845, "grad_norm": 0.07861388474702835, "learning_rate": 0.00021389541989872904, "loss": 8.9598, "step": 19424, "throughput": 17807.824663972242 }, { "epoch": 0.3049553679887773, "grad_norm": 0.08246038854122162, "learning_rate": 0.00021363977969233186, "loss": 8.9862, "step": 19456, "throughput": 17808.475116730137 }, { "epoch": 0.30545693931770623, "grad_norm": 0.07363611459732056, "learning_rate": 0.000213383938895339, "loss": 8.9798, "step": 19488, "throughput": 17808.86721984606 }, { "epoch": 0.30595851064663515, "grad_norm": 0.07611941546201706, "learning_rate": 0.00021312789856283885, "loss": 8.9645, "step": 19520, "throughput": 17807.974704074484 }, { "epoch": 0.30646008197556407, "grad_norm": 0.06836125254631042, "learning_rate": 0.0002128716597507423, "loss": 8.9942, "step": 19552, "throughput": 17807.922191509897 }, { "epoch": 0.306961653304493, "grad_norm": 0.07298135757446289, "learning_rate": 0.00021261522351577906, "loss": 8.9902, "step": 19584, "throughput": 17807.67964169082 }, { "epoch": 0.3074632246334219, "grad_norm": 0.06927632540464401, "learning_rate": 0.00021235859091549294, "loss": 8.9864, "step": 19616, "throughput": 17807.969765938007 }, { "epoch": 0.3079647959623508, "grad_norm": 0.07625886052846909, "learning_rate": 0.0002121017630082375, "loss": 8.9775, "step": 19648, "throughput": 17808.127902746375 }, { "epoch": 0.3084663672912797, "grad_norm": 0.07400345057249069, "learning_rate": 0.0002118447408531718, "loss": 8.9469, "step": 19680, "throughput": 17808.94989418138 }, { "epoch": 0.3089679386202086, "grad_norm": 0.07475277781486511, "learning_rate": 0.00021158752551025603, "loss": 8.9637, "step": 19712, "throughput": 17809.472429741916 }, { "epoch": 0.3094695099491375, "grad_norm": 0.07137515395879745, "learning_rate": 0.0002113301180402469, "loss": 8.9916, "step": 19744, "throughput": 17809.935530919967 }, { "epoch": 0.30997108127806644, "grad_norm": 0.09154373407363892, "learning_rate": 0.0002110725195046937, "loss": 8.9795, "step": 19776, "throughput": 17810.551977206345 }, { "epoch": 0.31047265260699536, "grad_norm": 0.07624326646327972, "learning_rate": 0.00021081473096593348, "loss": 8.9766, "step": 19808, "throughput": 17810.2789124244 }, { "epoch": 0.3109742239359243, "grad_norm": 0.07699087262153625, "learning_rate": 0.000210556753487087, "loss": 8.968, "step": 19840, "throughput": 17809.766829334505 }, { "epoch": 0.3114757952648532, "grad_norm": 0.0898560956120491, "learning_rate": 0.00021029858813205408, "loss": 8.9447, "step": 19872, "throughput": 17809.71709658218 }, { "epoch": 0.31197736659378206, "grad_norm": 0.06971347332000732, "learning_rate": 0.00021004023596550946, "loss": 8.9711, "step": 19904, "throughput": 17809.805281913414 }, { "epoch": 0.312478937922711, "grad_norm": 0.08007395267486572, "learning_rate": 0.00020978169805289823, "loss": 8.9766, "step": 19936, "throughput": 17809.85262690025 }, { "epoch": 0.3129805092516399, "grad_norm": 0.07413521409034729, "learning_rate": 0.0002095229754604315, "loss": 8.9644, "step": 19968, "throughput": 17810.456333872407 }, { "epoch": 0.3134820805805688, "grad_norm": 0.0724678784608841, "learning_rate": 0.00020926406925508202, "loss": 8.9704, "step": 20000, "throughput": 17810.95911060627 }, { "epoch": 0.31398365190949773, "grad_norm": 0.07824688404798508, "learning_rate": 0.00020900498050457973, "loss": 8.9808, "step": 20032, "throughput": 17811.40221341032 }, { "epoch": 0.31448522323842665, "grad_norm": 0.07604194432497025, "learning_rate": 0.0002087457102774074, "loss": 8.9667, "step": 20064, "throughput": 17812.197594582045 }, { "epoch": 0.31498679456735557, "grad_norm": 0.07452570647001266, "learning_rate": 0.00020848625964279622, "loss": 8.9568, "step": 20096, "throughput": 17812.116222839028 }, { "epoch": 0.31548836589628443, "grad_norm": 0.07305281609296799, "learning_rate": 0.0002082266296707214, "loss": 8.9635, "step": 20128, "throughput": 17811.703231534724 }, { "epoch": 0.31598993722521335, "grad_norm": 0.07430999726057053, "learning_rate": 0.0002079668214318977, "loss": 8.9571, "step": 20160, "throughput": 17811.648548123165 }, { "epoch": 0.31649150855414226, "grad_norm": 0.07261121273040771, "learning_rate": 0.00020770683599777507, "loss": 8.9543, "step": 20192, "throughput": 17811.40851628757 }, { "epoch": 0.3169930798830712, "grad_norm": 0.07133432477712631, "learning_rate": 0.0002074466744405342, "loss": 8.9533, "step": 20224, "throughput": 17811.67342260343 }, { "epoch": 0.3174946512120001, "grad_norm": 0.07755530625581741, "learning_rate": 0.00020718633783308214, "loss": 8.9451, "step": 20256, "throughput": 17811.81537866219 }, { "epoch": 0.317996222540929, "grad_norm": 0.07869453728199005, "learning_rate": 0.00020692582724904778, "loss": 8.9563, "step": 20288, "throughput": 17812.308371027375 }, { "epoch": 0.31849779386985794, "grad_norm": 0.07447408884763718, "learning_rate": 0.00020666514376277762, "loss": 8.9567, "step": 20320, "throughput": 17813.096559118872 }, { "epoch": 0.3189993651987868, "grad_norm": 0.07978935539722443, "learning_rate": 0.00020640428844933108, "loss": 8.9608, "step": 20352, "throughput": 17813.55181688389 }, { "epoch": 0.3195009365277157, "grad_norm": 0.07883100211620331, "learning_rate": 0.00020614326238447623, "loss": 8.9578, "step": 20384, "throughput": 17814.156716354308 }, { "epoch": 0.32000250785664464, "grad_norm": 0.07453079521656036, "learning_rate": 0.0002058820666446854, "loss": 8.9556, "step": 20416, "throughput": 17813.428897268856 }, { "epoch": 0.32050407918557355, "grad_norm": 0.07906678318977356, "learning_rate": 0.00020562070230713058, "loss": 8.9644, "step": 20448, "throughput": 17813.228445550754 }, { "epoch": 0.32100565051450247, "grad_norm": 0.08273608237504959, "learning_rate": 0.00020535917044967899, "loss": 8.946, "step": 20480, "throughput": 17812.901296804946 }, { "epoch": 0.3215072218434314, "grad_norm": 0.07959649711847305, "learning_rate": 0.00020509747215088887, "loss": 8.9523, "step": 20512, "throughput": 17811.100061077435 }, { "epoch": 0.3220087931723603, "grad_norm": 0.07556446641683578, "learning_rate": 0.00020483560849000475, "loss": 8.9512, "step": 20544, "throughput": 17811.46613679104 }, { "epoch": 0.32251036450128917, "grad_norm": 0.0763557106256485, "learning_rate": 0.00020457358054695317, "loss": 8.9434, "step": 20576, "throughput": 17812.04490257042 }, { "epoch": 0.3230119358302181, "grad_norm": 0.07173081487417221, "learning_rate": 0.00020431138940233808, "loss": 8.9412, "step": 20608, "throughput": 17812.52434196172 }, { "epoch": 0.323513507159147, "grad_norm": 0.0748804360628128, "learning_rate": 0.00020404903613743664, "loss": 8.9418, "step": 20640, "throughput": 17812.95504034032 }, { "epoch": 0.3240150784880759, "grad_norm": 0.07947655022144318, "learning_rate": 0.0002037865218341944, "loss": 8.951, "step": 20672, "throughput": 17813.552276554165 }, { "epoch": 0.32451664981700484, "grad_norm": 0.07350742071866989, "learning_rate": 0.00020352384757522113, "loss": 8.9345, "step": 20704, "throughput": 17813.469451543082 }, { "epoch": 0.32501822114593376, "grad_norm": 0.07518761605024338, "learning_rate": 0.00020326101444378633, "loss": 8.9597, "step": 20736, "throughput": 17813.428817463646 }, { "epoch": 0.3255197924748627, "grad_norm": 0.07858512550592422, "learning_rate": 0.0002029980235238145, "loss": 8.9569, "step": 20768, "throughput": 17812.8848706178 }, { "epoch": 0.32602136380379154, "grad_norm": 0.07931273430585861, "learning_rate": 0.0002027348758998811, "loss": 8.9508, "step": 20800, "throughput": 17812.926669883913 }, { "epoch": 0.32652293513272046, "grad_norm": 0.0770784467458725, "learning_rate": 0.0002024715726572076, "loss": 8.9619, "step": 20832, "throughput": 17813.494705234196 }, { "epoch": 0.3270245064616494, "grad_norm": 0.0742030069231987, "learning_rate": 0.0002022081148816574, "loss": 8.936, "step": 20864, "throughput": 17813.64573655942 }, { "epoch": 0.3275260777905783, "grad_norm": 0.07879988849163055, "learning_rate": 0.0002019445036597312, "loss": 8.9358, "step": 20896, "throughput": 17814.11509360599 }, { "epoch": 0.3280276491195072, "grad_norm": 0.07346165180206299, "learning_rate": 0.00020168074007856232, "loss": 8.9437, "step": 20928, "throughput": 17814.758227227496 }, { "epoch": 0.32852922044843613, "grad_norm": 0.06962357461452484, "learning_rate": 0.00020141682522591272, "loss": 8.9292, "step": 20960, "throughput": 17815.185618338328 }, { "epoch": 0.32903079177736505, "grad_norm": 0.07394640147686005, "learning_rate": 0.0002011527601901679, "loss": 8.948, "step": 20992, "throughput": 17815.820375428073 }, { "epoch": 0.3295323631062939, "grad_norm": 0.07294956594705582, "learning_rate": 0.00020088854606033292, "loss": 8.954, "step": 21024, "throughput": 17815.48353320163 }, { "epoch": 0.33003393443522283, "grad_norm": 0.0731930062174797, "learning_rate": 0.00020062418392602767, "loss": 8.9372, "step": 21056, "throughput": 17815.312534164874 }, { "epoch": 0.33053550576415175, "grad_norm": 0.07782046496868134, "learning_rate": 0.00020035967487748226, "loss": 8.9486, "step": 21088, "throughput": 17814.861235838223 }, { "epoch": 0.33103707709308067, "grad_norm": 0.07795765995979309, "learning_rate": 0.00020009502000553286, "loss": 8.9382, "step": 21120, "throughput": 17815.23092475671 }, { "epoch": 0.3315386484220096, "grad_norm": 0.07017985731363297, "learning_rate": 0.00019983022040161692, "loss": 8.9257, "step": 21152, "throughput": 17815.382657071077 }, { "epoch": 0.3320402197509385, "grad_norm": 0.07129698246717453, "learning_rate": 0.00019956527715776887, "loss": 8.9459, "step": 21184, "throughput": 17816.146885634007 }, { "epoch": 0.3325417910798674, "grad_norm": 0.07461407780647278, "learning_rate": 0.0001993001913666153, "loss": 8.9444, "step": 21216, "throughput": 17816.08373663044 }, { "epoch": 0.3330433624087963, "grad_norm": 0.08224450051784515, "learning_rate": 0.00019903496412137093, "loss": 8.9408, "step": 21248, "throughput": 17816.49273855996 }, { "epoch": 0.3335449337377252, "grad_norm": 0.07270831614732742, "learning_rate": 0.00019876959651583362, "loss": 8.9426, "step": 21280, "throughput": 17817.24483864838 }, { "epoch": 0.3340465050666541, "grad_norm": 0.07324141263961792, "learning_rate": 0.00019850408964438023, "loss": 8.9502, "step": 21312, "throughput": 17817.18968453246 }, { "epoch": 0.33454807639558304, "grad_norm": 0.0847543254494667, "learning_rate": 0.00019823844460196177, "loss": 8.9443, "step": 21344, "throughput": 17816.919887196673 }, { "epoch": 0.33504964772451196, "grad_norm": 0.07064050436019897, "learning_rate": 0.00019797266248409932, "loss": 8.9184, "step": 21376, "throughput": 17816.675415472295 }, { "epoch": 0.3355512190534409, "grad_norm": 0.0750681459903717, "learning_rate": 0.000197706744386879, "loss": 8.9338, "step": 21408, "throughput": 17816.857082441475 }, { "epoch": 0.3360527903823698, "grad_norm": 0.07319337129592896, "learning_rate": 0.00019744069140694795, "loss": 8.9523, "step": 21440, "throughput": 17817.227002143307 }, { "epoch": 0.33655436171129866, "grad_norm": 0.0720682218670845, "learning_rate": 0.00019717450464150935, "loss": 8.9081, "step": 21472, "throughput": 17817.55143137617 }, { "epoch": 0.3370559330402276, "grad_norm": 0.07472842186689377, "learning_rate": 0.00019690818518831827, "loss": 8.9401, "step": 21504, "throughput": 17817.80429730904 }, { "epoch": 0.3375575043691565, "grad_norm": 0.07875826954841614, "learning_rate": 0.0001966417341456769, "loss": 8.9229, "step": 21536, "throughput": 17818.05151405433 }, { "epoch": 0.3380590756980854, "grad_norm": 0.074205681681633, "learning_rate": 0.0001963751526124301, "loss": 8.9331, "step": 21568, "throughput": 17818.59926377971 }, { "epoch": 0.3385606470270143, "grad_norm": 0.07358483225107193, "learning_rate": 0.00019610844168796096, "loss": 8.9475, "step": 21600, "throughput": 17819.0439804744 }, { "epoch": 0.33906221835594325, "grad_norm": 0.07578427344560623, "learning_rate": 0.0001958416024721861, "loss": 8.9204, "step": 21632, "throughput": 17818.595125911124 }, { "epoch": 0.33956378968487216, "grad_norm": 0.0762067437171936, "learning_rate": 0.00019557463606555118, "loss": 8.9177, "step": 21664, "throughput": 17818.425541998775 }, { "epoch": 0.340065361013801, "grad_norm": 0.07589062303304672, "learning_rate": 0.0001953075435690266, "loss": 8.9473, "step": 21696, "throughput": 17818.298150910818 }, { "epoch": 0.34056693234272994, "grad_norm": 0.07703171670436859, "learning_rate": 0.0001950403260841024, "loss": 8.9025, "step": 21728, "throughput": 17818.641348897618 }, { "epoch": 0.34106850367165886, "grad_norm": 0.07283373922109604, "learning_rate": 0.0001947729847127845, "loss": 8.9463, "step": 21760, "throughput": 17818.782797105105 }, { "epoch": 0.3415700750005878, "grad_norm": 0.07369955629110336, "learning_rate": 0.00019450552055758934, "loss": 8.9105, "step": 21792, "throughput": 17819.51243392188 }, { "epoch": 0.3420716463295167, "grad_norm": 0.0815635547041893, "learning_rate": 0.00019423793472153996, "loss": 8.9103, "step": 21824, "throughput": 17819.43615014987 }, { "epoch": 0.3425732176584456, "grad_norm": 0.07449345290660858, "learning_rate": 0.0001939702283081611, "loss": 8.9171, "step": 21856, "throughput": 17819.837183645075 }, { "epoch": 0.34307478898737453, "grad_norm": 0.07299656420946121, "learning_rate": 0.00019370240242147488, "loss": 8.9307, "step": 21888, "throughput": 17820.560184313057 }, { "epoch": 0.3435763603163034, "grad_norm": 0.07765787839889526, "learning_rate": 0.000193434458165996, "loss": 8.9367, "step": 21920, "throughput": 17819.982451899494 }, { "epoch": 0.3440779316452323, "grad_norm": 0.07395743578672409, "learning_rate": 0.00019316639664672733, "loss": 8.9071, "step": 21952, "throughput": 17820.094119130765 }, { "epoch": 0.34457950297416123, "grad_norm": 0.07570222020149231, "learning_rate": 0.00019289821896915544, "loss": 8.9106, "step": 21984, "throughput": 17819.68260988534 }, { "epoch": 0.34508107430309015, "grad_norm": 0.07392715662717819, "learning_rate": 0.00019262992623924585, "loss": 8.9192, "step": 22016, "throughput": 17820.021151294266 }, { "epoch": 0.34558264563201907, "grad_norm": 0.07445602118968964, "learning_rate": 0.00019236151956343852, "loss": 8.8954, "step": 22048, "throughput": 17820.156965333827 }, { "epoch": 0.346084216960948, "grad_norm": 0.07157503068447113, "learning_rate": 0.00019209300004864341, "loss": 8.9212, "step": 22080, "throughput": 17820.683172522855 }, { "epoch": 0.3465857882898769, "grad_norm": 0.08508722484111786, "learning_rate": 0.00019182436880223585, "loss": 8.9188, "step": 22112, "throughput": 17820.930225618704 }, { "epoch": 0.34708735961880577, "grad_norm": 0.08076149970293045, "learning_rate": 0.00019155562693205178, "loss": 8.9127, "step": 22144, "throughput": 17821.16633543189 }, { "epoch": 0.3475889309477347, "grad_norm": 0.07283724844455719, "learning_rate": 0.00019128677554638355, "loss": 8.8887, "step": 22176, "throughput": 17821.709052753864 }, { "epoch": 0.3480905022766636, "grad_norm": 0.07913101464509964, "learning_rate": 0.0001910178157539751, "loss": 8.9094, "step": 22208, "throughput": 17821.848150108508 }, { "epoch": 0.3485920736055925, "grad_norm": 0.07167515158653259, "learning_rate": 0.00019074874866401733, "loss": 8.9367, "step": 22240, "throughput": 17821.58256896794 }, { "epoch": 0.34909364493452144, "grad_norm": 0.07268569618463516, "learning_rate": 0.00019047957538614375, "loss": 8.9112, "step": 22272, "throughput": 17821.33667359995 }, { "epoch": 0.34959521626345036, "grad_norm": 0.07488091289997101, "learning_rate": 0.00019021029703042576, "loss": 8.9092, "step": 22304, "throughput": 17821.224577891684 }, { "epoch": 0.3500967875923793, "grad_norm": 0.07319794595241547, "learning_rate": 0.0001899409147073681, "loss": 8.9162, "step": 22336, "throughput": 17821.760640674343 }, { "epoch": 0.35059835892130814, "grad_norm": 0.07335125654935837, "learning_rate": 0.0001896714295279043, "loss": 8.9058, "step": 22368, "throughput": 17821.86835973504 }, { "epoch": 0.35109993025023706, "grad_norm": 0.08088002353906631, "learning_rate": 0.00018940184260339194, "loss": 8.9161, "step": 22400, "throughput": 17822.288636338348 }, { "epoch": 0.351601501579166, "grad_norm": 0.0779116079211235, "learning_rate": 0.00018913215504560838, "loss": 8.9302, "step": 22432, "throughput": 17822.461043873136 }, { "epoch": 0.3521030729080949, "grad_norm": 0.07497800886631012, "learning_rate": 0.0001888623679667459, "loss": 8.9022, "step": 22464, "throughput": 17822.847998012865 }, { "epoch": 0.3526046442370238, "grad_norm": 0.06929473578929901, "learning_rate": 0.00018859248247940722, "loss": 8.911, "step": 22496, "throughput": 17823.555725739072 }, { "epoch": 0.35310621556595273, "grad_norm": 0.07567939162254333, "learning_rate": 0.0001883224996966008, "loss": 8.9249, "step": 22528, "throughput": 17823.010673609624 }, { "epoch": 0.35360778689488165, "grad_norm": 0.08010558784008026, "learning_rate": 0.00018805242073173653, "loss": 8.9136, "step": 22560, "throughput": 17821.25216698874 }, { "epoch": 0.3541093582238105, "grad_norm": 0.0750737190246582, "learning_rate": 0.00018778224669862087, "loss": 8.9159, "step": 22592, "throughput": 17820.76067321445 }, { "epoch": 0.35461092955273943, "grad_norm": 0.07348627597093582, "learning_rate": 0.0001875119787114523, "loss": 8.8968, "step": 22624, "throughput": 17821.09990511972 }, { "epoch": 0.35511250088166835, "grad_norm": 0.06908921897411346, "learning_rate": 0.00018724161788481676, "loss": 8.9083, "step": 22656, "throughput": 17821.23808504927 }, { "epoch": 0.35561407221059727, "grad_norm": 0.07111164182424545, "learning_rate": 0.00018697116533368316, "loss": 8.8927, "step": 22688, "throughput": 17821.93493660733 }, { "epoch": 0.3561156435395262, "grad_norm": 0.07290966063737869, "learning_rate": 0.00018670062217339867, "loss": 8.9259, "step": 22720, "throughput": 17821.967956572396 }, { "epoch": 0.3566172148684551, "grad_norm": 0.07786906510591507, "learning_rate": 0.0001864299895196839, "loss": 8.9351, "step": 22752, "throughput": 17822.36912099535 }, { "epoch": 0.357118786197384, "grad_norm": 0.08353696018457413, "learning_rate": 0.00018615926848862893, "loss": 8.9286, "step": 22784, "throughput": 17822.891229334833 }, { "epoch": 0.3576203575263129, "grad_norm": 0.07277407497167587, "learning_rate": 0.00018588846019668793, "loss": 8.8903, "step": 22816, "throughput": 17822.741576530676 }, { "epoch": 0.3581219288552418, "grad_norm": 0.07241684943437576, "learning_rate": 0.00018561756576067524, "loss": 8.8931, "step": 22848, "throughput": 17822.475657843017 }, { "epoch": 0.3586235001841707, "grad_norm": 0.07565472275018692, "learning_rate": 0.0001853465862977602, "loss": 8.8852, "step": 22880, "throughput": 17822.20926050296 }, { "epoch": 0.35912507151309964, "grad_norm": 0.07836824655532837, "learning_rate": 0.00018507552292546295, "loss": 8.8783, "step": 22912, "throughput": 17822.34557228031 }, { "epoch": 0.35962664284202855, "grad_norm": 0.07098279893398285, "learning_rate": 0.00018480437676164968, "loss": 8.8742, "step": 22944, "throughput": 17822.67608358776 }, { "epoch": 0.3601282141709575, "grad_norm": 0.07533206045627594, "learning_rate": 0.00018453314892452795, "loss": 8.9083, "step": 22976, "throughput": 17822.99284955358 }, { "epoch": 0.36062978549988634, "grad_norm": 0.07069644331932068, "learning_rate": 0.00018426184053264215, "loss": 8.9058, "step": 23008, "throughput": 17823.223058884796 }, { "epoch": 0.36113135682881525, "grad_norm": 0.07935863733291626, "learning_rate": 0.0001839904527048689, "loss": 8.9095, "step": 23040, "throughput": 17823.443737012367 }, { "epoch": 0.36163292815774417, "grad_norm": 0.07350348681211472, "learning_rate": 0.0001837189865604124, "loss": 8.8807, "step": 23072, "throughput": 17823.957248393664 }, { "epoch": 0.3621344994866731, "grad_norm": 0.07933742552995682, "learning_rate": 0.00018344744321879987, "loss": 8.9032, "step": 23104, "throughput": 17824.52610640502 }, { "epoch": 0.362636070815602, "grad_norm": 0.07533203065395355, "learning_rate": 0.0001831758237998768, "loss": 8.8809, "step": 23136, "throughput": 17824.12672581524 }, { "epoch": 0.3631376421445309, "grad_norm": 0.07438133656978607, "learning_rate": 0.00018290412942380252, "loss": 8.8638, "step": 23168, "throughput": 17823.96099488507 }, { "epoch": 0.36363921347345984, "grad_norm": 0.08473438024520874, "learning_rate": 0.00018263236121104543, "loss": 8.8881, "step": 23200, "throughput": 17823.641345838707 }, { "epoch": 0.3641407848023887, "grad_norm": 0.07578590512275696, "learning_rate": 0.00018236052028237847, "loss": 8.8879, "step": 23232, "throughput": 17823.96706337484 }, { "epoch": 0.3646423561313176, "grad_norm": 0.07169584929943085, "learning_rate": 0.0001820886077588744, "loss": 8.89, "step": 23264, "throughput": 17824.111644749042 }, { "epoch": 0.36514392746024654, "grad_norm": 0.07907114177942276, "learning_rate": 0.00018181662476190127, "loss": 8.9048, "step": 23296, "throughput": 17824.790422560367 }, { "epoch": 0.36564549878917546, "grad_norm": 0.07309950143098831, "learning_rate": 0.00018154457241311773, "loss": 8.8882, "step": 23328, "throughput": 17824.842153808437 }, { "epoch": 0.3661470701181044, "grad_norm": 0.08225353062152863, "learning_rate": 0.00018127245183446858, "loss": 8.8804, "step": 23360, "throughput": 17825.248228130444 }, { "epoch": 0.3666486414470333, "grad_norm": 0.07341925799846649, "learning_rate": 0.00018100026414817987, "loss": 8.899, "step": 23392, "throughput": 17825.926457473488 }, { "epoch": 0.3671502127759622, "grad_norm": 0.07902863621711731, "learning_rate": 0.00018072801047675432, "loss": 8.8896, "step": 23424, "throughput": 17825.396406786957 }, { "epoch": 0.3676517841048911, "grad_norm": 0.07868482172489166, "learning_rate": 0.00018045569194296697, "loss": 8.8864, "step": 23456, "throughput": 17825.498447999318 }, { "epoch": 0.36815335543382, "grad_norm": 0.07887911796569824, "learning_rate": 0.00018018330966986022, "loss": 8.8781, "step": 23488, "throughput": 17825.08292114403 }, { "epoch": 0.3686549267627489, "grad_norm": 0.07060116529464722, "learning_rate": 0.00017991086478073943, "loss": 8.9005, "step": 23520, "throughput": 17825.578620517335 }, { "epoch": 0.36915649809167783, "grad_norm": 0.07441214472055435, "learning_rate": 0.0001796383583991681, "loss": 8.8852, "step": 23552, "throughput": 17825.713155723053 }, { "epoch": 0.36965806942060675, "grad_norm": 0.07870756089687347, "learning_rate": 0.00017936579164896333, "loss": 8.8856, "step": 23584, "throughput": 17826.056147290332 }, { "epoch": 0.37015964074953567, "grad_norm": 0.07043929398059845, "learning_rate": 0.0001790931656541912, "loss": 8.8971, "step": 23616, "throughput": 17826.278076249124 }, { "epoch": 0.3706612120784646, "grad_norm": 0.07437200844287872, "learning_rate": 0.00017882048153916214, "loss": 8.9109, "step": 23648, "throughput": 17826.49189349421 }, { "epoch": 0.37116278340739345, "grad_norm": 0.07189071178436279, "learning_rate": 0.00017854774042842626, "loss": 8.901, "step": 23680, "throughput": 17827.162394924104 }, { "epoch": 0.37166435473632237, "grad_norm": 0.07663335651159286, "learning_rate": 0.00017827494344676873, "loss": 8.8979, "step": 23712, "throughput": 17827.442078451364 }, { "epoch": 0.3721659260652513, "grad_norm": 0.07194948941469193, "learning_rate": 0.000178002091719205, "loss": 8.8743, "step": 23744, "throughput": 17827.293276727123 }, { "epoch": 0.3726674973941802, "grad_norm": 0.07944053411483765, "learning_rate": 0.00017772918637097657, "loss": 8.8836, "step": 23776, "throughput": 17826.95173690492 }, { "epoch": 0.3731690687231091, "grad_norm": 0.09098808467388153, "learning_rate": 0.00017745622852754575, "loss": 8.8857, "step": 23808, "throughput": 17826.989096990383 }, { "epoch": 0.37367064005203804, "grad_norm": 0.07665561884641647, "learning_rate": 0.00017718321931459163, "loss": 8.8787, "step": 23840, "throughput": 17827.46315318843 }, { "epoch": 0.37417221138096696, "grad_norm": 0.08578639477491379, "learning_rate": 0.00017691015985800488, "loss": 8.898, "step": 23872, "throughput": 17827.58528583542 }, { "epoch": 0.3746737827098958, "grad_norm": 0.07634943723678589, "learning_rate": 0.0001766370512838836, "loss": 8.8763, "step": 23904, "throughput": 17827.978884744767 }, { "epoch": 0.37517535403882474, "grad_norm": 0.07927807420492172, "learning_rate": 0.00017636389471852834, "loss": 8.877, "step": 23936, "throughput": 17828.118072038644 }, { "epoch": 0.37567692536775366, "grad_norm": 0.07263068854808807, "learning_rate": 0.0001760906912884376, "loss": 8.8773, "step": 23968, "throughput": 17828.520912972963 }, { "epoch": 0.3761784966966826, "grad_norm": 0.0758548453450203, "learning_rate": 0.00017581744212030308, "loss": 8.8739, "step": 24000, "throughput": 17829.06547779846 }, { "epoch": 0.3766800680256115, "grad_norm": 0.07255948334932327, "learning_rate": 0.00017554414834100525, "loss": 8.8659, "step": 24032, "throughput": 17828.482269473152 }, { "epoch": 0.3771816393545404, "grad_norm": 0.0759415328502655, "learning_rate": 0.00017527081107760834, "loss": 8.8667, "step": 24064, "throughput": 17828.472128577403 }, { "epoch": 0.37768321068346933, "grad_norm": 0.08043156564235687, "learning_rate": 0.00017499743145735615, "loss": 8.8799, "step": 24096, "throughput": 17828.171666149505 }, { "epoch": 0.3781847820123982, "grad_norm": 0.07601181417703629, "learning_rate": 0.00017472401060766697, "loss": 8.8775, "step": 24128, "throughput": 17828.48492127222 }, { "epoch": 0.3786863533413271, "grad_norm": 0.07197776436805725, "learning_rate": 0.0001744505496561292, "loss": 8.882, "step": 24160, "throughput": 17828.766273160792 }, { "epoch": 0.379187924670256, "grad_norm": 0.07243472337722778, "learning_rate": 0.00017417704973049668, "loss": 8.8917, "step": 24192, "throughput": 17829.07403289147 }, { "epoch": 0.37968949599918494, "grad_norm": 0.07945257425308228, "learning_rate": 0.00017390351195868385, "loss": 8.881, "step": 24224, "throughput": 17829.292229522 }, { "epoch": 0.38019106732811386, "grad_norm": 0.07232029736042023, "learning_rate": 0.00017362993746876135, "loss": 8.8789, "step": 24256, "throughput": 17829.50020712472 }, { "epoch": 0.3806926386570428, "grad_norm": 0.07614285498857498, "learning_rate": 0.00017335632738895113, "loss": 8.8773, "step": 24288, "throughput": 17830.137295477773 }, { "epoch": 0.3811942099859717, "grad_norm": 0.07729846239089966, "learning_rate": 0.000173082682847622, "loss": 8.8747, "step": 24320, "throughput": 17830.041901587563 }, { "epoch": 0.38169578131490056, "grad_norm": 0.07206852734088898, "learning_rate": 0.0001728090049732848, "loss": 8.8686, "step": 24352, "throughput": 17830.061222745466 }, { "epoch": 0.3821973526438295, "grad_norm": 0.07643686234951019, "learning_rate": 0.00017253529489458802, "loss": 8.8818, "step": 24384, "throughput": 17829.672580412058 }, { "epoch": 0.3826989239727584, "grad_norm": 0.07176980376243591, "learning_rate": 0.00017226155374031271, "loss": 8.8655, "step": 24416, "throughput": 17829.9882926581 }, { "epoch": 0.3832004953016873, "grad_norm": 0.07665753364562988, "learning_rate": 0.0001719877826393683, "loss": 8.8661, "step": 24448, "throughput": 17830.44935147498 }, { "epoch": 0.38370206663061623, "grad_norm": 0.07416357845067978, "learning_rate": 0.00017171398272078752, "loss": 8.8556, "step": 24480, "throughput": 17830.548068886743 }, { "epoch": 0.38420363795954515, "grad_norm": 0.0756995901465416, "learning_rate": 0.00017144015511372208, "loss": 8.8604, "step": 24512, "throughput": 17830.773188901458 }, { "epoch": 0.38470520928847407, "grad_norm": 0.0758114829659462, "learning_rate": 0.00017116630094743792, "loss": 8.859, "step": 24544, "throughput": 17830.930798570385 }, { "epoch": 0.38520678061740293, "grad_norm": 0.08073586970567703, "learning_rate": 0.00017089242135131036, "loss": 8.8594, "step": 24576, "throughput": 17831.43214908143 }, { "epoch": 0.38570835194633185, "grad_norm": 0.0700501948595047, "learning_rate": 0.0001706185174548197, "loss": 8.853, "step": 24608, "throughput": 17830.3106839198 }, { "epoch": 0.38620992327526077, "grad_norm": 0.07498278468847275, "learning_rate": 0.0001703445903875464, "loss": 8.8598, "step": 24640, "throughput": 17829.736843745333 }, { "epoch": 0.3867114946041897, "grad_norm": 0.07171520590782166, "learning_rate": 0.00017007064127916644, "loss": 8.864, "step": 24672, "throughput": 17829.833203307262 }, { "epoch": 0.3872130659331186, "grad_norm": 0.07245524972677231, "learning_rate": 0.0001697966712594469, "loss": 8.8756, "step": 24704, "throughput": 17829.564385638096 }, { "epoch": 0.3877146372620475, "grad_norm": 0.07057449966669083, "learning_rate": 0.00016952268145824082, "loss": 8.8591, "step": 24736, "throughput": 17830.03439866721 }, { "epoch": 0.38821620859097644, "grad_norm": 0.07579497247934341, "learning_rate": 0.00016924867300548304, "loss": 8.8497, "step": 24768, "throughput": 17830.14237986475 }, { "epoch": 0.3887177799199053, "grad_norm": 0.0772717073559761, "learning_rate": 0.00016897464703118515, "loss": 8.8757, "step": 24800, "throughput": 17830.60302461098 }, { "epoch": 0.3892193512488342, "grad_norm": 0.07301851361989975, "learning_rate": 0.00016870060466543112, "loss": 8.8347, "step": 24832, "throughput": 17830.62448111229 }, { "epoch": 0.38972092257776314, "grad_norm": 0.0722026452422142, "learning_rate": 0.0001684265470383725, "loss": 8.8762, "step": 24864, "throughput": 17830.99681553676 }, { "epoch": 0.39022249390669206, "grad_norm": 0.07057911902666092, "learning_rate": 0.0001681524752802237, "loss": 8.8822, "step": 24896, "throughput": 17831.52814856809 }, { "epoch": 0.390724065235621, "grad_norm": 0.07054942101240158, "learning_rate": 0.00016787839052125758, "loss": 8.8627, "step": 24928, "throughput": 17830.850969822153 }, { "epoch": 0.3912256365645499, "grad_norm": 0.07358486205339432, "learning_rate": 0.00016760429389180037, "loss": 8.8779, "step": 24960, "throughput": 17831.13179515518 }, { "epoch": 0.3917272078934788, "grad_norm": 0.07562954723834991, "learning_rate": 0.00016733018652222744, "loss": 8.8385, "step": 24992, "throughput": 17830.842157214578 }, { "epoch": 0.3922287792224077, "grad_norm": 0.0741063579916954, "learning_rate": 0.0001670560695429584, "loss": 8.8556, "step": 25024, "throughput": 17831.311424604814 }, { "epoch": 0.3927303505513366, "grad_norm": 0.07098574936389923, "learning_rate": 0.00016678194408445245, "loss": 8.8303, "step": 25056, "throughput": 17831.617559289924 }, { "epoch": 0.3932319218802655, "grad_norm": 0.07636234164237976, "learning_rate": 0.00016650781127720382, "loss": 8.8659, "step": 25088, "throughput": 17831.88957466774 }, { "epoch": 0.39373349320919443, "grad_norm": 0.07332395017147064, "learning_rate": 0.00016623367225173703, "loss": 8.8681, "step": 25120, "throughput": 17831.94164096608 }, { "epoch": 0.39423506453812335, "grad_norm": 0.07327907532453537, "learning_rate": 0.00016595952813860216, "loss": 8.8584, "step": 25152, "throughput": 17832.23279696902 }, { "epoch": 0.39473663586705227, "grad_norm": 0.07691916078329086, "learning_rate": 0.00016568538006837046, "loss": 8.8657, "step": 25184, "throughput": 17832.71533372163 }, { "epoch": 0.3952382071959812, "grad_norm": 0.07487235218286514, "learning_rate": 0.00016541122917162934, "loss": 8.8482, "step": 25216, "throughput": 17832.838322321524 }, { "epoch": 0.39573977852491005, "grad_norm": 0.06726781278848648, "learning_rate": 0.00016513707657897785, "loss": 8.8768, "step": 25248, "throughput": 17832.51545284579 }, { "epoch": 0.39624134985383896, "grad_norm": 0.075165756046772, "learning_rate": 0.00016486292342102215, "loss": 8.8569, "step": 25280, "throughput": 17832.19952718306 }, { "epoch": 0.3967429211827679, "grad_norm": 0.07332989573478699, "learning_rate": 0.0001645887708283707, "loss": 8.8522, "step": 25312, "throughput": 17832.349097094095 }, { "epoch": 0.3972444925116968, "grad_norm": 0.07408641278743744, "learning_rate": 0.00016431461993162954, "loss": 8.854, "step": 25344, "throughput": 17832.795997567115 }, { "epoch": 0.3977460638406257, "grad_norm": 0.08329582959413528, "learning_rate": 0.00016404047186139784, "loss": 8.8496, "step": 25376, "throughput": 17832.89803762926 }, { "epoch": 0.39824763516955464, "grad_norm": 0.07956679165363312, "learning_rate": 0.00016376632774826297, "loss": 8.8448, "step": 25408, "throughput": 17833.34230562835 }, { "epoch": 0.39874920649848355, "grad_norm": 0.0716254860162735, "learning_rate": 0.0001634921887227962, "loss": 8.8514, "step": 25440, "throughput": 17833.376285320894 }, { "epoch": 0.3992507778274124, "grad_norm": 0.07612968236207962, "learning_rate": 0.00016321805591554755, "loss": 8.8448, "step": 25472, "throughput": 17833.72117489287 }, { "epoch": 0.39975234915634134, "grad_norm": 0.0749460905790329, "learning_rate": 0.00016294393045704163, "loss": 8.8541, "step": 25504, "throughput": 17834.124936367323 }, { "epoch": 0.40025392048527025, "grad_norm": 0.0763927549123764, "learning_rate": 0.00016266981347777255, "loss": 8.8475, "step": 25536, "throughput": 17833.71051948569 }, { "epoch": 0.40075549181419917, "grad_norm": 0.07364343851804733, "learning_rate": 0.00016239570610819963, "loss": 8.837, "step": 25568, "throughput": 17833.615463015547 }, { "epoch": 0.4012570631431281, "grad_norm": 0.07207117974758148, "learning_rate": 0.00016212160947874242, "loss": 8.8394, "step": 25600, "throughput": 17833.49950524665 }, { "epoch": 0.401758634472057, "grad_norm": 0.07933421432971954, "learning_rate": 0.00016184752471977627, "loss": 8.8387, "step": 25632, "throughput": 17833.7823688976 }, { "epoch": 0.4022602058009859, "grad_norm": 0.07499121874570847, "learning_rate": 0.0001615734529616275, "loss": 8.8306, "step": 25664, "throughput": 17834.034646287568 }, { "epoch": 0.4027617771299148, "grad_norm": 0.07656212896108627, "learning_rate": 0.00016129939533456888, "loss": 8.8507, "step": 25696, "throughput": 17834.45549523096 }, { "epoch": 0.4032633484588437, "grad_norm": 0.08051002025604248, "learning_rate": 0.00016102535296881485, "loss": 8.8392, "step": 25728, "throughput": 17834.50003813568 }, { "epoch": 0.4037649197877726, "grad_norm": 0.07518472522497177, "learning_rate": 0.00016075132699451701, "loss": 8.8521, "step": 25760, "throughput": 17834.790457946634 }, { "epoch": 0.40426649111670154, "grad_norm": 0.07647351920604706, "learning_rate": 0.00016047731854175917, "loss": 8.8423, "step": 25792, "throughput": 17835.18758382088 }, { "epoch": 0.40476806244563046, "grad_norm": 0.07534997165203094, "learning_rate": 0.00016020332874055313, "loss": 8.8514, "step": 25824, "throughput": 17835.120857478418 }, { "epoch": 0.4052696337745594, "grad_norm": 0.08069568127393723, "learning_rate": 0.00015992935872083356, "loss": 8.8565, "step": 25856, "throughput": 17835.006479560674 }, { "epoch": 0.4057712051034883, "grad_norm": 0.08479923009872437, "learning_rate": 0.00015965540961245363, "loss": 8.8285, "step": 25888, "throughput": 17834.786300777007 }, { "epoch": 0.40627277643241716, "grad_norm": 0.07456418126821518, "learning_rate": 0.0001593814825451803, "loss": 8.8426, "step": 25920, "throughput": 17834.88195853212 }, { "epoch": 0.4067743477613461, "grad_norm": 0.08726909011602402, "learning_rate": 0.00015910757864868967, "loss": 8.8489, "step": 25952, "throughput": 17835.3150599101 }, { "epoch": 0.407275919090275, "grad_norm": 0.07202770560979843, "learning_rate": 0.0001588336990525621, "loss": 8.8519, "step": 25984, "throughput": 17835.409316446654 }, { "epoch": 0.4077774904192039, "grad_norm": 0.07821185886859894, "learning_rate": 0.00015855984488627792, "loss": 8.857, "step": 26016, "throughput": 17835.69390511231 }, { "epoch": 0.40827906174813283, "grad_norm": 0.07136929035186768, "learning_rate": 0.00015828601727921248, "loss": 8.8318, "step": 26048, "throughput": 17835.8249373439 }, { "epoch": 0.40878063307706175, "grad_norm": 0.08094098418951035, "learning_rate": 0.0001580122173606317, "loss": 8.8499, "step": 26080, "throughput": 17836.29278814811 }, { "epoch": 0.40928220440599067, "grad_norm": 0.06964492797851562, "learning_rate": 0.00015773844625968726, "loss": 8.8401, "step": 26112, "throughput": 17836.78874572426 }, { "epoch": 0.40978377573491953, "grad_norm": 0.07381853461265564, "learning_rate": 0.00015746470510541197, "loss": 8.8143, "step": 26144, "throughput": 17836.38282499106 }, { "epoch": 0.41028534706384845, "grad_norm": 0.07215822488069534, "learning_rate": 0.00015719099502671516, "loss": 8.8268, "step": 26176, "throughput": 17836.13412965979 }, { "epoch": 0.41078691839277737, "grad_norm": 0.08016388863325119, "learning_rate": 0.00015691731715237802, "loss": 8.8207, "step": 26208, "throughput": 17836.171230961085 }, { "epoch": 0.4112884897217063, "grad_norm": 0.06681036204099655, "learning_rate": 0.00015664367261104887, "loss": 8.8447, "step": 26240, "throughput": 17836.36960626293 }, { "epoch": 0.4117900610506352, "grad_norm": 0.07530402392148972, "learning_rate": 0.00015637006253123865, "loss": 8.8343, "step": 26272, "throughput": 17836.616848521204 }, { "epoch": 0.4122916323795641, "grad_norm": 0.08355733752250671, "learning_rate": 0.00015609648804131612, "loss": 8.8149, "step": 26304, "throughput": 17837.019406655218 }, { "epoch": 0.41279320370849304, "grad_norm": 0.07137879729270935, "learning_rate": 0.00015582295026950332, "loss": 8.8359, "step": 26336, "throughput": 17837.043601108417 }, { "epoch": 0.4132947750374219, "grad_norm": 0.0776265487074852, "learning_rate": 0.00015554945034387075, "loss": 8.8286, "step": 26368, "throughput": 17837.503639803843 }, { "epoch": 0.4137963463663508, "grad_norm": 0.07875608652830124, "learning_rate": 0.00015527598939233303, "loss": 8.8341, "step": 26400, "throughput": 17837.96173423606 }, { "epoch": 0.41429791769527974, "grad_norm": 0.07496542483568192, "learning_rate": 0.00015500256854264385, "loss": 8.8242, "step": 26432, "throughput": 17837.46333688345 }, { "epoch": 0.41479948902420866, "grad_norm": 0.07084860652685165, "learning_rate": 0.00015472918892239166, "loss": 8.8373, "step": 26464, "throughput": 17837.6371879048 }, { "epoch": 0.4153010603531376, "grad_norm": 0.07146702706813812, "learning_rate": 0.00015445585165899475, "loss": 8.8324, "step": 26496, "throughput": 17837.419916645686 }, { "epoch": 0.4158026316820665, "grad_norm": 0.07583361119031906, "learning_rate": 0.00015418255787969692, "loss": 8.8297, "step": 26528, "throughput": 17837.692998535804 }, { "epoch": 0.4163042030109954, "grad_norm": 0.06981997191905975, "learning_rate": 0.0001539093087115624, "loss": 8.8494, "step": 26560, "throughput": 17837.738224969555 }, { "epoch": 0.4168057743399243, "grad_norm": 0.07170698791742325, "learning_rate": 0.00015363610528147163, "loss": 8.8394, "step": 26592, "throughput": 17837.976665315815 }, { "epoch": 0.4173073456688532, "grad_norm": 0.07302544265985489, "learning_rate": 0.00015336294871611637, "loss": 8.8187, "step": 26624, "throughput": 17838.15509512615 }, { "epoch": 0.4178089169977821, "grad_norm": 0.07223688811063766, "learning_rate": 0.00015308984014199511, "loss": 8.8316, "step": 26656, "throughput": 17836.936852385057 }, { "epoch": 0.418310488326711, "grad_norm": 0.08481582999229431, "learning_rate": 0.00015281678068540836, "loss": 8.8157, "step": 26688, "throughput": 17837.39188111627 }, { "epoch": 0.41881205965563995, "grad_norm": 0.07136400043964386, "learning_rate": 0.00015254377147245424, "loss": 8.8116, "step": 26720, "throughput": 17837.555422270147 }, { "epoch": 0.41931363098456886, "grad_norm": 0.08144687861204147, "learning_rate": 0.00015227081362902343, "loss": 8.84, "step": 26752, "throughput": 17837.480406306255 }, { "epoch": 0.4198152023134978, "grad_norm": 0.07405370473861694, "learning_rate": 0.000151997908280795, "loss": 8.8126, "step": 26784, "throughput": 17837.0972853071 }, { "epoch": 0.42031677364242664, "grad_norm": 0.07629577070474625, "learning_rate": 0.0001517250565532313, "loss": 8.8234, "step": 26816, "throughput": 17837.121182248913 }, { "epoch": 0.42081834497135556, "grad_norm": 0.0791485458612442, "learning_rate": 0.00015145225957157373, "loss": 8.8142, "step": 26848, "throughput": 17837.28745575894 }, { "epoch": 0.4213199163002845, "grad_norm": 0.07299696654081345, "learning_rate": 0.00015117951846083786, "loss": 8.8262, "step": 26880, "throughput": 17837.537646571815 }, { "epoch": 0.4218214876292134, "grad_norm": 0.07210979610681534, "learning_rate": 0.0001509068343458088, "loss": 8.8114, "step": 26912, "throughput": 17837.956699491104 }, { "epoch": 0.4223230589581423, "grad_norm": 0.07374562323093414, "learning_rate": 0.00015063420835103667, "loss": 8.8362, "step": 26944, "throughput": 17837.984902015047 }, { "epoch": 0.42282463028707123, "grad_norm": 0.07789213955402374, "learning_rate": 0.0001503616416008319, "loss": 8.8372, "step": 26976, "throughput": 17838.440082370682 }, { "epoch": 0.42332620161600015, "grad_norm": 0.07159041613340378, "learning_rate": 0.00015008913521926052, "loss": 8.8298, "step": 27008, "throughput": 17838.896224178727 }, { "epoch": 0.423827772944929, "grad_norm": 0.06973559409379959, "learning_rate": 0.00014981669033013972, "loss": 8.8185, "step": 27040, "throughput": 17838.25389198717 }, { "epoch": 0.42432934427385793, "grad_norm": 0.07604040205478668, "learning_rate": 0.00014954430805703302, "loss": 8.7998, "step": 27072, "throughput": 17838.339137952018 }, { "epoch": 0.42483091560278685, "grad_norm": 0.07852691411972046, "learning_rate": 0.00014927198952324568, "loss": 8.7969, "step": 27104, "throughput": 17838.075705595023 }, { "epoch": 0.42533248693171577, "grad_norm": 0.07022686302661896, "learning_rate": 0.00014899973585182012, "loss": 8.8271, "step": 27136, "throughput": 17838.480286838687 }, { "epoch": 0.4258340582606447, "grad_norm": 0.07396269589662552, "learning_rate": 0.00014872754816553141, "loss": 8.8041, "step": 27168, "throughput": 17838.354805028463 }, { "epoch": 0.4263356295895736, "grad_norm": 0.07898204028606415, "learning_rate": 0.00014845542758688222, "loss": 8.8419, "step": 27200, "throughput": 17838.761892172806 }, { "epoch": 0.42683720091850247, "grad_norm": 0.0686897560954094, "learning_rate": 0.00014818337523809876, "loss": 8.8243, "step": 27232, "throughput": 17838.944068222234 }, { "epoch": 0.4273387722474314, "grad_norm": 0.0748598575592041, "learning_rate": 0.0001479113922411256, "loss": 8.8166, "step": 27264, "throughput": 17839.222390933104 }, { "epoch": 0.4278403435763603, "grad_norm": 0.07616803795099258, "learning_rate": 0.00014763947971762153, "loss": 8.811, "step": 27296, "throughput": 17839.67488400504 }, { "epoch": 0.4283419149052892, "grad_norm": 0.07271240651607513, "learning_rate": 0.00014736763878895457, "loss": 8.8095, "step": 27328, "throughput": 17839.756404138443 }, { "epoch": 0.42884348623421814, "grad_norm": 0.07320531457662582, "learning_rate": 0.00014709587057619748, "loss": 8.8312, "step": 27360, "throughput": 17839.60933717741 }, { "epoch": 0.42934505756314706, "grad_norm": 0.07407976686954498, "learning_rate": 0.0001468241762001232, "loss": 8.8142, "step": 27392, "throughput": 17839.139047725068 }, { "epoch": 0.429846628892076, "grad_norm": 0.07086745649576187, "learning_rate": 0.00014655255678120015, "loss": 8.8125, "step": 27424, "throughput": 17839.268285762308 }, { "epoch": 0.43034820022100484, "grad_norm": 0.08092939853668213, "learning_rate": 0.0001462810134395876, "loss": 8.7988, "step": 27456, "throughput": 17839.440178719484 }, { "epoch": 0.43084977154993376, "grad_norm": 0.07382786273956299, "learning_rate": 0.0001460095472951311, "loss": 8.8329, "step": 27488, "throughput": 17839.527812024506 }, { "epoch": 0.4313513428788627, "grad_norm": 0.07203220576047897, "learning_rate": 0.0001457381594673579, "loss": 8.8201, "step": 27520, "throughput": 17839.94144916028 }, { "epoch": 0.4318529142077916, "grad_norm": 0.07578306645154953, "learning_rate": 0.00014546685107547205, "loss": 8.8113, "step": 27552, "throughput": 17839.97989853177 }, { "epoch": 0.4323544855367205, "grad_norm": 0.0789005383849144, "learning_rate": 0.00014519562323835034, "loss": 8.8245, "step": 27584, "throughput": 17840.533749978855 }, { "epoch": 0.43285605686564943, "grad_norm": 0.08284716308116913, "learning_rate": 0.000144924477074537, "loss": 8.8109, "step": 27616, "throughput": 17840.8666391986 }, { "epoch": 0.43335762819457835, "grad_norm": 0.0708891823887825, "learning_rate": 0.00014465341370223977, "loss": 8.8163, "step": 27648, "throughput": 17840.34216738948 }, { "epoch": 0.4338591995235072, "grad_norm": 0.08031383901834488, "learning_rate": 0.00014438243423932476, "loss": 8.7976, "step": 27680, "throughput": 17840.27370184412 }, { "epoch": 0.43436077085243613, "grad_norm": 0.07380495965480804, "learning_rate": 0.00014411153980331198, "loss": 8.8106, "step": 27712, "throughput": 17840.149268080273 }, { "epoch": 0.43486234218136505, "grad_norm": 0.07391571253538132, "learning_rate": 0.00014384073151137104, "loss": 8.7863, "step": 27744, "throughput": 17840.4072067685 }, { "epoch": 0.43536391351029397, "grad_norm": 0.08950921148061752, "learning_rate": 0.00014357001048031603, "loss": 8.8092, "step": 27776, "throughput": 17840.50071104384 }, { "epoch": 0.4358654848392229, "grad_norm": 0.07263769209384918, "learning_rate": 0.00014329937782660136, "loss": 8.8016, "step": 27808, "throughput": 17840.89219816196 }, { "epoch": 0.4363670561681518, "grad_norm": 0.0754525363445282, "learning_rate": 0.00014302883466631676, "loss": 8.8272, "step": 27840, "throughput": 17841.071706232735 }, { "epoch": 0.4368686274970807, "grad_norm": 0.07414602488279343, "learning_rate": 0.0001427583821151832, "loss": 8.8118, "step": 27872, "throughput": 17841.343629060204 }, { "epoch": 0.4373701988260096, "grad_norm": 0.07408881932497025, "learning_rate": 0.0001424880212885477, "loss": 8.809, "step": 27904, "throughput": 17841.784914055388 }, { "epoch": 0.4378717701549385, "grad_norm": 0.08639470487833023, "learning_rate": 0.0001422177533013791, "loss": 8.8339, "step": 27936, "throughput": 17841.311274442694 }, { "epoch": 0.4383733414838674, "grad_norm": 0.07878611981868744, "learning_rate": 0.00014194757926826342, "loss": 8.8069, "step": 27968, "throughput": 17841.391454740937 }, { "epoch": 0.43887491281279634, "grad_norm": 0.07366620004177094, "learning_rate": 0.00014167750030339915, "loss": 8.8038, "step": 28000, "throughput": 17841.046629825334 }, { "epoch": 0.43937648414172525, "grad_norm": 0.07283695042133331, "learning_rate": 0.00014140751752059278, "loss": 8.7893, "step": 28032, "throughput": 17841.302318158687 }, { "epoch": 0.4398780554706542, "grad_norm": 0.0751243531703949, "learning_rate": 0.0001411376320332541, "loss": 8.8033, "step": 28064, "throughput": 17841.558918240662 }, { "epoch": 0.4403796267995831, "grad_norm": 0.07369616627693176, "learning_rate": 0.0001408678449543916, "loss": 8.8206, "step": 28096, "throughput": 17841.637707693066 }, { "epoch": 0.44088119812851195, "grad_norm": 0.08011516183614731, "learning_rate": 0.00014059815739660806, "loss": 8.7958, "step": 28128, "throughput": 17841.94797571962 }, { "epoch": 0.44138276945744087, "grad_norm": 0.07943718135356903, "learning_rate": 0.00014032857047209573, "loss": 8.7998, "step": 28160, "throughput": 17842.07065405487 }, { "epoch": 0.4418843407863698, "grad_norm": 0.07318083196878433, "learning_rate": 0.0001400590852926319, "loss": 8.7977, "step": 28192, "throughput": 17842.61186513062 }, { "epoch": 0.4423859121152987, "grad_norm": 0.07628615945577621, "learning_rate": 0.00013978970296957423, "loss": 8.7923, "step": 28224, "throughput": 17842.56852902371 }, { "epoch": 0.4428874834442276, "grad_norm": 0.07363478094339371, "learning_rate": 0.00013952042461385625, "loss": 8.7956, "step": 28256, "throughput": 17842.437356485003 }, { "epoch": 0.44338905477315654, "grad_norm": 0.07090167701244354, "learning_rate": 0.00013925125133598266, "loss": 8.799, "step": 28288, "throughput": 17842.061185790088 }, { "epoch": 0.44389062610208546, "grad_norm": 0.07324929535388947, "learning_rate": 0.0001389821842460249, "loss": 8.8019, "step": 28320, "throughput": 17842.027593597886 }, { "epoch": 0.4443921974310143, "grad_norm": 0.09164309501647949, "learning_rate": 0.00013871322445361642, "loss": 8.8076, "step": 28352, "throughput": 17842.283044428117 }, { "epoch": 0.44489376875994324, "grad_norm": 0.07631401717662811, "learning_rate": 0.00013844437306794822, "loss": 8.8011, "step": 28384, "throughput": 17842.51556671752 }, { "epoch": 0.44539534008887216, "grad_norm": 0.07143891602754593, "learning_rate": 0.00013817563119776415, "loss": 8.7796, "step": 28416, "throughput": 17842.907716703598 }, { "epoch": 0.4458969114178011, "grad_norm": 0.07868574559688568, "learning_rate": 0.00013790699995135658, "loss": 8.7872, "step": 28448, "throughput": 17843.080127484805 }, { "epoch": 0.44639848274673, "grad_norm": 0.07218185067176819, "learning_rate": 0.00013763848043656148, "loss": 8.8027, "step": 28480, "throughput": 17843.50201491266 }, { "epoch": 0.4469000540756589, "grad_norm": 0.0759633257985115, "learning_rate": 0.00013737007376075414, "loss": 8.7821, "step": 28512, "throughput": 17843.828462314785 }, { "epoch": 0.44740162540458783, "grad_norm": 0.06998586654663086, "learning_rate": 0.0001371017810308445, "loss": 8.7811, "step": 28544, "throughput": 17843.47047588364 }, { "epoch": 0.4479031967335167, "grad_norm": 0.07746990770101547, "learning_rate": 0.00013683360335327264, "loss": 8.8143, "step": 28576, "throughput": 17843.451209079303 }, { "epoch": 0.4484047680624456, "grad_norm": 0.07679734379053116, "learning_rate": 0.000136565541834004, "loss": 8.7907, "step": 28608, "throughput": 17843.31445435224 }, { "epoch": 0.44890633939137453, "grad_norm": 0.08982423692941666, "learning_rate": 0.00013629759757852512, "loss": 8.7976, "step": 28640, "throughput": 17843.562150233083 }, { "epoch": 0.44940791072030345, "grad_norm": 0.07646424323320389, "learning_rate": 0.00013602977169183884, "loss": 8.7614, "step": 28672, "throughput": 17843.535464475917 }, { "epoch": 0.44990948204923237, "grad_norm": 0.0840567946434021, "learning_rate": 0.00013576206527846004, "loss": 8.7836, "step": 28704, "throughput": 17842.371375433257 }, { "epoch": 0.4504110533781613, "grad_norm": 0.07550926506519318, "learning_rate": 0.00013549447944241066, "loss": 8.7972, "step": 28736, "throughput": 17842.5410602527 }, { "epoch": 0.4509126247070902, "grad_norm": 0.0738803967833519, "learning_rate": 0.00013522701528721553, "loss": 8.7884, "step": 28768, "throughput": 17842.826906183545 }, { "epoch": 0.45141419603601907, "grad_norm": 0.07417251914739609, "learning_rate": 0.00013495967391589757, "loss": 8.7819, "step": 28800, "throughput": 17843.3510901284 }, { "epoch": 0.451915767364948, "grad_norm": 0.07992105185985565, "learning_rate": 0.00013469245643097345, "loss": 8.7829, "step": 28832, "throughput": 17843.309647376955 }, { "epoch": 0.4524173386938769, "grad_norm": 0.0737965926527977, "learning_rate": 0.0001344253639344488, "loss": 8.7909, "step": 28864, "throughput": 17843.325713250953 }, { "epoch": 0.4529189100228058, "grad_norm": 0.07945975661277771, "learning_rate": 0.00013415839752781392, "loss": 8.7935, "step": 28896, "throughput": 17842.819398840486 }, { "epoch": 0.45342048135173474, "grad_norm": 0.07636261731386185, "learning_rate": 0.00013389155831203904, "loss": 8.7946, "step": 28928, "throughput": 17842.91498224683 }, { "epoch": 0.45392205268066366, "grad_norm": 0.0748453214764595, "learning_rate": 0.0001336248473875699, "loss": 8.7897, "step": 28960, "throughput": 17843.16853929137 }, { "epoch": 0.4544236240095926, "grad_norm": 0.08027852326631546, "learning_rate": 0.00013335826585432313, "loss": 8.7805, "step": 28992, "throughput": 17843.244006607554 }, { "epoch": 0.45492519533852144, "grad_norm": 0.07624673843383789, "learning_rate": 0.00013309181481168173, "loss": 8.7804, "step": 29024, "throughput": 17843.624131799017 }, { "epoch": 0.45542676666745036, "grad_norm": 0.07282182574272156, "learning_rate": 0.00013282549535849065, "loss": 8.7779, "step": 29056, "throughput": 17843.67300853686 }, { "epoch": 0.4559283379963793, "grad_norm": 0.0720539316534996, "learning_rate": 0.00013255930859305205, "loss": 8.7718, "step": 29088, "throughput": 17844.06408514807 }, { "epoch": 0.4564299093253082, "grad_norm": 0.07573386281728745, "learning_rate": 0.000132293255613121, "loss": 8.7923, "step": 29120, "throughput": 17844.48076906447 }, { "epoch": 0.4569314806542371, "grad_norm": 0.0778200700879097, "learning_rate": 0.00013202733751590067, "loss": 8.7759, "step": 29152, "throughput": 17844.122697295465 }, { "epoch": 0.45743305198316603, "grad_norm": 0.07196016609668732, "learning_rate": 0.00013176155539803818, "loss": 8.7838, "step": 29184, "throughput": 17843.96467194935 }, { "epoch": 0.45793462331209495, "grad_norm": 0.07072978466749191, "learning_rate": 0.00013149591035561977, "loss": 8.7751, "step": 29216, "throughput": 17843.97033476635 }, { "epoch": 0.4584361946410238, "grad_norm": 0.07341047376394272, "learning_rate": 0.00013123040348416633, "loss": 8.7723, "step": 29248, "throughput": 17844.07352378656 }, { "epoch": 0.4589377659699527, "grad_norm": 0.07667620480060577, "learning_rate": 0.00013096503587862906, "loss": 8.7972, "step": 29280, "throughput": 17844.010552861248 }, { "epoch": 0.45943933729888164, "grad_norm": 0.07243162393569946, "learning_rate": 0.00013069980863338466, "loss": 8.7733, "step": 29312, "throughput": 17844.385719318707 }, { "epoch": 0.45994090862781056, "grad_norm": 0.07885871082544327, "learning_rate": 0.00013043472284223113, "loss": 8.7854, "step": 29344, "throughput": 17844.54295589396 }, { "epoch": 0.4604424799567395, "grad_norm": 0.0747121199965477, "learning_rate": 0.00013016977959838305, "loss": 8.7859, "step": 29376, "throughput": 17844.818108001928 }, { "epoch": 0.4609440512856684, "grad_norm": 0.07451453059911728, "learning_rate": 0.00012990497999446714, "loss": 8.7836, "step": 29408, "throughput": 17845.338332184892 }, { "epoch": 0.4614456226145973, "grad_norm": 0.07636185735464096, "learning_rate": 0.00012964032512251773, "loss": 8.7831, "step": 29440, "throughput": 17844.991803756493 }, { "epoch": 0.4619471939435262, "grad_norm": 0.07705360651016235, "learning_rate": 0.00012937581607397236, "loss": 8.7793, "step": 29472, "throughput": 17845.200047982493 }, { "epoch": 0.4624487652724551, "grad_norm": 0.07874922454357147, "learning_rate": 0.00012911145393966703, "loss": 8.7927, "step": 29504, "throughput": 17844.878648881604 }, { "epoch": 0.462950336601384, "grad_norm": 0.06849963217973709, "learning_rate": 0.00012884723980983206, "loss": 8.7843, "step": 29536, "throughput": 17844.968116187938 }, { "epoch": 0.46345190793031293, "grad_norm": 0.0721622183918953, "learning_rate": 0.00012858317477408728, "loss": 8.7883, "step": 29568, "throughput": 17845.21399269392 }, { "epoch": 0.46395347925924185, "grad_norm": 0.07688286155462265, "learning_rate": 0.00012831925992143765, "loss": 8.7926, "step": 29600, "throughput": 17845.137648151827 }, { "epoch": 0.46445505058817077, "grad_norm": 0.07621218264102936, "learning_rate": 0.00012805549634026882, "loss": 8.7775, "step": 29632, "throughput": 17845.574416407122 }, { "epoch": 0.4649566219170997, "grad_norm": 0.07827426493167877, "learning_rate": 0.00012779188511834256, "loss": 8.7888, "step": 29664, "throughput": 17845.693004355682 }, { "epoch": 0.46545819324602855, "grad_norm": 0.0747198611497879, "learning_rate": 0.00012752842734279238, "loss": 8.7684, "step": 29696, "throughput": 17845.998682074493 }, { "epoch": 0.46595976457495747, "grad_norm": 0.07569344341754913, "learning_rate": 0.0001272651241001189, "loss": 8.7821, "step": 29728, "throughput": 17845.9520682368 }, { "epoch": 0.4664613359038864, "grad_norm": 0.07907281070947647, "learning_rate": 0.00012700197647618549, "loss": 8.7905, "step": 29760, "throughput": 17845.94405368657 }, { "epoch": 0.4669629072328153, "grad_norm": 0.07393976300954819, "learning_rate": 0.00012673898555621373, "loss": 8.779, "step": 29792, "throughput": 17845.896590258762 }, { "epoch": 0.4674644785617442, "grad_norm": 0.07429799437522888, "learning_rate": 0.00012647615242477887, "loss": 8.7434, "step": 29824, "throughput": 17845.9036226395 }, { "epoch": 0.46796604989067314, "grad_norm": 0.07908093184232712, "learning_rate": 0.0001262134781658056, "loss": 8.7652, "step": 29856, "throughput": 17846.01233548715 }, { "epoch": 0.46846762121960206, "grad_norm": 0.08475719392299652, "learning_rate": 0.00012595096386256336, "loss": 8.7671, "step": 29888, "throughput": 17846.08677863954 }, { "epoch": 0.4689691925485309, "grad_norm": 0.07956618070602417, "learning_rate": 0.0001256886105976619, "loss": 8.7915, "step": 29920, "throughput": 17846.44840357302 }, { "epoch": 0.46947076387745984, "grad_norm": 0.07686559110879898, "learning_rate": 0.0001254264194530468, "loss": 8.7806, "step": 29952, "throughput": 17846.60226272647 }, { "epoch": 0.46997233520638876, "grad_norm": 0.08286672830581665, "learning_rate": 0.00012516439150999525, "loss": 8.7785, "step": 29984, "throughput": 17846.87335613637 }, { "epoch": 0.4704739065353177, "grad_norm": 0.06802724301815033, "learning_rate": 0.00012490252784911113, "loss": 8.7459, "step": 30016, "throughput": 17847.064497361553 }, { "epoch": 0.4709754778642466, "grad_norm": 0.07914753258228302, "learning_rate": 0.000124640829550321, "loss": 8.7634, "step": 30048, "throughput": 17846.685453391063 }, { "epoch": 0.4714770491931755, "grad_norm": 0.0800432562828064, "learning_rate": 0.00012437929769286942, "loss": 8.7673, "step": 30080, "throughput": 17846.8999518748 }, { "epoch": 0.47197862052210443, "grad_norm": 0.07590536773204803, "learning_rate": 0.0001241179333553146, "loss": 8.7833, "step": 30112, "throughput": 17846.69810252754 }, { "epoch": 0.4724801918510333, "grad_norm": 0.07159872353076935, "learning_rate": 0.00012385673761552374, "loss": 8.7631, "step": 30144, "throughput": 17846.939274740744 }, { "epoch": 0.4729817631799622, "grad_norm": 0.07268593460321426, "learning_rate": 0.00012359571155066894, "loss": 8.7725, "step": 30176, "throughput": 17846.890446191093 }, { "epoch": 0.47348333450889113, "grad_norm": 0.10743585973978043, "learning_rate": 0.00012333485623722238, "loss": 8.7883, "step": 30208, "throughput": 17847.09068368804 }, { "epoch": 0.47398490583782005, "grad_norm": 0.07458017021417618, "learning_rate": 0.00012307417275095222, "loss": 8.7676, "step": 30240, "throughput": 17847.50596240503 }, { "epoch": 0.47448647716674897, "grad_norm": 0.06924610584974289, "learning_rate": 0.00012281366216691786, "loss": 8.7535, "step": 30272, "throughput": 17847.500111783294 }, { "epoch": 0.4749880484956779, "grad_norm": 0.07047650963068008, "learning_rate": 0.00012255332555946582, "loss": 8.7518, "step": 30304, "throughput": 17847.917419369074 }, { "epoch": 0.4754896198246068, "grad_norm": 0.07766906172037125, "learning_rate": 0.00012229316400222493, "loss": 8.7787, "step": 30336, "throughput": 17847.72215626349 }, { "epoch": 0.47599119115353566, "grad_norm": 0.07250562310218811, "learning_rate": 0.00012203317856810232, "loss": 8.7749, "step": 30368, "throughput": 17847.734631924355 }, { "epoch": 0.4764927624824646, "grad_norm": 0.07328185439109802, "learning_rate": 0.0001217733703292786, "loss": 8.7513, "step": 30400, "throughput": 17847.613508866787 }, { "epoch": 0.4769943338113935, "grad_norm": 0.07611878961324692, "learning_rate": 0.0001215137403572038, "loss": 8.765, "step": 30432, "throughput": 17847.5771101991 }, { "epoch": 0.4774959051403224, "grad_norm": 0.08048366755247116, "learning_rate": 0.00012125428972259264, "loss": 8.7593, "step": 30464, "throughput": 17847.820835981867 }, { "epoch": 0.47799747646925134, "grad_norm": 0.0731780081987381, "learning_rate": 0.0001209950194954203, "loss": 8.7708, "step": 30496, "throughput": 17847.883866344233 }, { "epoch": 0.47849904779818025, "grad_norm": 0.07422348856925964, "learning_rate": 0.00012073593074491802, "loss": 8.7856, "step": 30528, "throughput": 17848.238168548705 }, { "epoch": 0.4790006191271092, "grad_norm": 0.0718270093202591, "learning_rate": 0.0001204770245395685, "loss": 8.776, "step": 30560, "throughput": 17848.273120260463 }, { "epoch": 0.47950219045603804, "grad_norm": 0.07511728256940842, "learning_rate": 0.00012021830194710178, "loss": 8.7618, "step": 30592, "throughput": 17848.62970918359 }, { "epoch": 0.48000376178496695, "grad_norm": 0.07711270451545715, "learning_rate": 0.00011995976403449054, "loss": 8.7615, "step": 30624, "throughput": 17848.659560032636 }, { "epoch": 0.48050533311389587, "grad_norm": 0.07422070950269699, "learning_rate": 0.00011970141186794592, "loss": 8.7722, "step": 30656, "throughput": 17848.36810050533 }, { "epoch": 0.4810069044428248, "grad_norm": 0.09282661974430084, "learning_rate": 0.00011944324651291299, "loss": 8.7454, "step": 30688, "throughput": 17848.450950218277 }, { "epoch": 0.4815084757717537, "grad_norm": 0.07340681552886963, "learning_rate": 0.00011918526903406647, "loss": 8.7462, "step": 30720, "throughput": 17848.471110200808 }, { "epoch": 0.4820100471006826, "grad_norm": 0.07820796221494675, "learning_rate": 0.0001189274804953063, "loss": 8.7661, "step": 30752, "throughput": 17847.300770419086 }, { "epoch": 0.48251161842961154, "grad_norm": 0.07713815569877625, "learning_rate": 0.00011866988195975307, "loss": 8.7602, "step": 30784, "throughput": 17847.37876156958 }, { "epoch": 0.4830131897585404, "grad_norm": 0.07636034488677979, "learning_rate": 0.00011841247448974398, "loss": 8.7614, "step": 30816, "throughput": 17847.583265104164 }, { "epoch": 0.4835147610874693, "grad_norm": 0.07851573824882507, "learning_rate": 0.00011815525914682817, "loss": 8.7412, "step": 30848, "throughput": 17847.858389147463 }, { "epoch": 0.48401633241639824, "grad_norm": 0.07513809949159622, "learning_rate": 0.00011789823699176249, "loss": 8.7635, "step": 30880, "throughput": 17847.989358915052 }, { "epoch": 0.48451790374532716, "grad_norm": 0.0752979964017868, "learning_rate": 0.00011764140908450703, "loss": 8.7519, "step": 30912, "throughput": 17848.399698441695 }, { "epoch": 0.4850194750742561, "grad_norm": 0.07560551166534424, "learning_rate": 0.0001173847764842209, "loss": 8.7653, "step": 30944, "throughput": 17847.92848888303 }, { "epoch": 0.485521046403185, "grad_norm": 0.07036852836608887, "learning_rate": 0.00011712834024925766, "loss": 8.7608, "step": 30976, "throughput": 17848.211029288916 }, { "epoch": 0.4860226177321139, "grad_norm": 0.07992174476385117, "learning_rate": 0.00011687210143716116, "loss": 8.746, "step": 31008, "throughput": 17847.930009877455 }, { "epoch": 0.4865241890610428, "grad_norm": 0.07941529154777527, "learning_rate": 0.00011661606110466095, "loss": 8.7518, "step": 31040, "throughput": 17848.006630520344 }, { "epoch": 0.4870257603899717, "grad_norm": 0.07616245001554489, "learning_rate": 0.00011636022030766818, "loss": 8.7637, "step": 31072, "throughput": 17848.241233007302 }, { "epoch": 0.4875273317189006, "grad_norm": 0.0720774233341217, "learning_rate": 0.00011610458010127093, "loss": 8.7527, "step": 31104, "throughput": 17848.29619784518 }, { "epoch": 0.48802890304782953, "grad_norm": 0.07582742720842361, "learning_rate": 0.00011584914153973036, "loss": 8.7721, "step": 31136, "throughput": 17848.647265738666 }, { "epoch": 0.48853047437675845, "grad_norm": 0.07284478098154068, "learning_rate": 0.00011559390567647571, "loss": 8.7476, "step": 31168, "throughput": 17848.683273518756 }, { "epoch": 0.48903204570568737, "grad_norm": 0.07436570525169373, "learning_rate": 0.00011533887356410052, "loss": 8.7572, "step": 31200, "throughput": 17849.02940449858 }, { "epoch": 0.4895336170346163, "grad_norm": 0.07970885187387466, "learning_rate": 0.00011508404625435791, "loss": 8.7596, "step": 31232, "throughput": 17849.075126584816 }, { "epoch": 0.49003518836354515, "grad_norm": 0.07180914282798767, "learning_rate": 0.00011482942479815651, "loss": 8.7399, "step": 31264, "throughput": 17848.82705998219 }, { "epoch": 0.49053675969247407, "grad_norm": 0.096111960709095, "learning_rate": 0.00011457501024555593, "loss": 8.7578, "step": 31296, "throughput": 17848.779338285327 }, { "epoch": 0.491038331021403, "grad_norm": 0.07298924773931503, "learning_rate": 0.00011432080364576256, "loss": 8.7366, "step": 31328, "throughput": 17848.776528847295 }, { "epoch": 0.4915399023503319, "grad_norm": 0.07969135046005249, "learning_rate": 0.00011406680604712517, "loss": 8.7565, "step": 31360, "throughput": 17849.00646660255 }, { "epoch": 0.4920414736792608, "grad_norm": 0.06794929504394531, "learning_rate": 0.00011381301849713059, "loss": 8.7557, "step": 31392, "throughput": 17848.946513776882 }, { "epoch": 0.49254304500818974, "grad_norm": 0.0790972039103508, "learning_rate": 0.00011355944204239944, "loss": 8.7557, "step": 31424, "throughput": 17849.23098082752 }, { "epoch": 0.4930446163371186, "grad_norm": 0.07518981397151947, "learning_rate": 0.0001133060777286818, "loss": 8.7445, "step": 31456, "throughput": 17849.504941230134 }, { "epoch": 0.4935461876660475, "grad_norm": 0.08620373904705048, "learning_rate": 0.00011305292660085278, "loss": 8.738, "step": 31488, "throughput": 17849.625420218974 }, { "epoch": 0.49404775899497644, "grad_norm": 0.07720964401960373, "learning_rate": 0.00011279998970290844, "loss": 8.7659, "step": 31520, "throughput": 17850.015704816262 }, { "epoch": 0.49454933032390536, "grad_norm": 0.07925351709127426, "learning_rate": 0.0001125472680779613, "loss": 8.7505, "step": 31552, "throughput": 17849.624096047148 }, { "epoch": 0.4950509016528343, "grad_norm": 0.07546688616275787, "learning_rate": 0.00011229476276823608, "loss": 8.7393, "step": 31584, "throughput": 17849.69202524607 }, { "epoch": 0.4955524729817632, "grad_norm": 0.07865744829177856, "learning_rate": 0.00011204247481506535, "loss": 8.7376, "step": 31616, "throughput": 17849.62718762721 }, { "epoch": 0.4960540443106921, "grad_norm": 0.08163941651582718, "learning_rate": 0.00011179040525888552, "loss": 8.7449, "step": 31648, "throughput": 17849.711629491485 }, { "epoch": 0.496555615639621, "grad_norm": 0.0784422978758812, "learning_rate": 0.00011153855513923207, "loss": 8.7327, "step": 31680, "throughput": 17849.93584187844 }, { "epoch": 0.4970571869685499, "grad_norm": 0.07680987566709518, "learning_rate": 0.00011128692549473568, "loss": 8.7538, "step": 31712, "throughput": 17849.852166393823 }, { "epoch": 0.4975587582974788, "grad_norm": 0.07829587161540985, "learning_rate": 0.00011103551736311777, "loss": 8.737, "step": 31744, "throughput": 17850.173752304592 }, { "epoch": 0.4980603296264077, "grad_norm": 0.07024698704481125, "learning_rate": 0.0001107843317811862, "loss": 8.7312, "step": 31776, "throughput": 17850.1787368264 }, { "epoch": 0.49856190095533665, "grad_norm": 0.07402710616588593, "learning_rate": 0.00011053336978483102, "loss": 8.7585, "step": 31808, "throughput": 17850.64963191776 }, { "epoch": 0.49906347228426556, "grad_norm": 0.09956757724285126, "learning_rate": 0.00011028263240902033, "loss": 8.7274, "step": 31840, "throughput": 17850.708965571015 }, { "epoch": 0.4995650436131945, "grad_norm": 0.07462165504693985, "learning_rate": 0.0001100321206877957, "loss": 8.7316, "step": 31872, "throughput": 17850.575949234746 }, { "epoch": 0.5000666149421233, "grad_norm": 0.08074957132339478, "learning_rate": 0.00010978183565426832, "loss": 8.7449, "step": 31904, "throughput": 17850.43376010897 }, { "epoch": 0.5005681862710523, "grad_norm": 0.07831110805273056, "learning_rate": 0.00010953177834061435, "loss": 8.7583, "step": 31936, "throughput": 17850.51662941851 }, { "epoch": 0.5010697575999812, "grad_norm": 0.080203115940094, "learning_rate": 0.00010928194977807091, "loss": 8.7381, "step": 31968, "throughput": 17850.608029374747 }, { "epoch": 0.5015713289289101, "grad_norm": 0.076949343085289, "learning_rate": 0.00010903235099693174, "loss": 8.7264, "step": 32000, "throughput": 17850.67277233083 }, { "epoch": 0.502072900257839, "grad_norm": 0.07552843540906906, "learning_rate": 0.00010878298302654294, "loss": 8.7573, "step": 32032, "throughput": 17850.929673978953 }, { "epoch": 0.5025744715867679, "grad_norm": 0.0816895142197609, "learning_rate": 0.00010853384689529873, "loss": 8.7427, "step": 32064, "throughput": 17850.966147359748 }, { "epoch": 0.5030760429156969, "grad_norm": 0.07351840287446976, "learning_rate": 0.00010828494363063732, "loss": 8.7416, "step": 32096, "throughput": 17851.308279198405 }, { "epoch": 0.5035776142446258, "grad_norm": 0.10635778307914734, "learning_rate": 0.0001080362742590364, "loss": 8.7523, "step": 32128, "throughput": 17851.689731864455 }, { "epoch": 0.5040791855735547, "grad_norm": 0.07729581743478775, "learning_rate": 0.00010778783980600939, "loss": 8.7587, "step": 32160, "throughput": 17851.210394599766 }, { "epoch": 0.5045807569024836, "grad_norm": 0.07464662939310074, "learning_rate": 0.00010753964129610052, "loss": 8.7416, "step": 32192, "throughput": 17851.199375968507 }, { "epoch": 0.5050823282314125, "grad_norm": 0.07541036605834961, "learning_rate": 0.00010729167975288122, "loss": 8.7447, "step": 32224, "throughput": 17851.335817399635 }, { "epoch": 0.5055838995603413, "grad_norm": 0.08859999477863312, "learning_rate": 0.0001070439561989457, "loss": 8.7497, "step": 32256, "throughput": 17851.39972043009 }, { "epoch": 0.5060854708892703, "grad_norm": 0.077695332467556, "learning_rate": 0.00010679647165590659, "loss": 8.7294, "step": 32288, "throughput": 17851.360632048607 }, { "epoch": 0.5065870422181992, "grad_norm": 0.07810457050800323, "learning_rate": 0.00010654922714439083, "loss": 8.7304, "step": 32320, "throughput": 17851.553764208507 }, { "epoch": 0.5070886135471281, "grad_norm": 0.0815735012292862, "learning_rate": 0.00010630222368403561, "loss": 8.7205, "step": 32352, "throughput": 17851.862837365712 }, { "epoch": 0.507590184876057, "grad_norm": 0.07661870867013931, "learning_rate": 0.00010605546229348396, "loss": 8.7485, "step": 32384, "throughput": 17851.85927641683 }, { "epoch": 0.5080917562049859, "grad_norm": 0.07422985881567001, "learning_rate": 0.00010580894399038044, "loss": 8.7459, "step": 32416, "throughput": 17852.318482970753 }, { "epoch": 0.5085933275339148, "grad_norm": 0.07624523341655731, "learning_rate": 0.00010556266979136734, "loss": 8.7295, "step": 32448, "throughput": 17852.09570726072 }, { "epoch": 0.5090948988628438, "grad_norm": 0.07496540248394012, "learning_rate": 0.00010531664071208019, "loss": 8.7318, "step": 32480, "throughput": 17852.12903658571 }, { "epoch": 0.5095964701917727, "grad_norm": 0.07751981914043427, "learning_rate": 0.00010507085776714369, "loss": 8.7212, "step": 32512, "throughput": 17851.88275411316 }, { "epoch": 0.5100980415207016, "grad_norm": 0.07308991998434067, "learning_rate": 0.00010482532197016732, "loss": 8.7392, "step": 32544, "throughput": 17852.077468590258 }, { "epoch": 0.5105996128496305, "grad_norm": 0.07246117293834686, "learning_rate": 0.00010458003433374152, "loss": 8.731, "step": 32576, "throughput": 17852.16765980201 }, { "epoch": 0.5111011841785594, "grad_norm": 0.0796428844332695, "learning_rate": 0.00010433499586943319, "loss": 8.74, "step": 32608, "throughput": 17852.209242554592 }, { "epoch": 0.5116027555074883, "grad_norm": 0.07866678386926651, "learning_rate": 0.00010409020758778178, "loss": 8.7387, "step": 32640, "throughput": 17852.455320879468 }, { "epoch": 0.5121043268364173, "grad_norm": 0.07312561571598053, "learning_rate": 0.00010384567049829474, "loss": 8.7283, "step": 32672, "throughput": 17852.501019436742 }, { "epoch": 0.5126058981653461, "grad_norm": 0.07131937146186829, "learning_rate": 0.00010360138560944379, "loss": 8.7243, "step": 32704, "throughput": 17852.82589756935 }, { "epoch": 0.513107469494275, "grad_norm": 0.0743732750415802, "learning_rate": 0.00010335735392866061, "loss": 8.7208, "step": 32736, "throughput": 17852.87888747027 }, { "epoch": 0.5136090408232039, "grad_norm": 0.08018866181373596, "learning_rate": 0.00010311357646233255, "loss": 8.7379, "step": 32768, "throughput": 17852.64528871008 }, { "epoch": 0.5141106121521328, "grad_norm": 0.077493816614151, "learning_rate": 0.00010287005421579854, "loss": 8.7498, "step": 32800, "throughput": 17851.308068751474 }, { "epoch": 0.5146121834810617, "grad_norm": 0.07570572197437286, "learning_rate": 0.00010262678819334511, "loss": 8.7279, "step": 32832, "throughput": 17851.523873792336 }, { "epoch": 0.5151137548099907, "grad_norm": 0.08089492470026016, "learning_rate": 0.00010238377939820202, "loss": 8.734, "step": 32864, "throughput": 17851.60249667329 }, { "epoch": 0.5156153261389196, "grad_norm": 0.0890161320567131, "learning_rate": 0.00010214102883253832, "loss": 8.7312, "step": 32896, "throughput": 17851.5531851023 }, { "epoch": 0.5161168974678485, "grad_norm": 0.07664911448955536, "learning_rate": 0.00010189853749745799, "loss": 8.7183, "step": 32928, "throughput": 17851.795498924672 }, { "epoch": 0.5166184687967774, "grad_norm": 0.07410534471273422, "learning_rate": 0.00010165630639299606, "loss": 8.72, "step": 32960, "throughput": 17852.076292837388 }, { "epoch": 0.5171200401257063, "grad_norm": 0.0790855884552002, "learning_rate": 0.00010141433651811429, "loss": 8.7243, "step": 32992, "throughput": 17852.275533354878 }, { "epoch": 0.5176216114546353, "grad_norm": 0.08857249468564987, "learning_rate": 0.00010117262887069724, "loss": 8.733, "step": 33024, "throughput": 17852.648968705045 }, { "epoch": 0.5181231827835642, "grad_norm": 0.07627321034669876, "learning_rate": 0.00010093118444754784, "loss": 8.7357, "step": 33056, "throughput": 17852.408439897976 }, { "epoch": 0.5186247541124931, "grad_norm": 0.08038872480392456, "learning_rate": 0.0001006900042443837, "loss": 8.703, "step": 33088, "throughput": 17852.457808305855 }, { "epoch": 0.519126325441422, "grad_norm": 0.07928008586168289, "learning_rate": 0.00010044908925583264, "loss": 8.7416, "step": 33120, "throughput": 17852.280623087227 }, { "epoch": 0.5196278967703508, "grad_norm": 0.07952787727117538, "learning_rate": 0.00010020844047542886, "loss": 8.7166, "step": 33152, "throughput": 17852.45677832672 }, { "epoch": 0.5201294680992797, "grad_norm": 0.07665357738733292, "learning_rate": 9.996805889560857e-05, "loss": 8.7227, "step": 33184, "throughput": 17852.444686158015 }, { "epoch": 0.5206310394282087, "grad_norm": 0.07277513295412064, "learning_rate": 9.972794550770612e-05, "loss": 8.7162, "step": 33216, "throughput": 17852.49822943568 }, { "epoch": 0.5211326107571376, "grad_norm": 0.07704409956932068, "learning_rate": 9.948810130194984e-05, "loss": 8.7392, "step": 33248, "throughput": 17852.742232599747 }, { "epoch": 0.5216341820860665, "grad_norm": 0.07315832376480103, "learning_rate": 9.924852726745807e-05, "loss": 8.7116, "step": 33280, "throughput": 17852.894822436796 }, { "epoch": 0.5221357534149954, "grad_norm": 0.08210276067256927, "learning_rate": 9.900922439223464e-05, "loss": 8.7548, "step": 33312, "throughput": 17853.21314899139 }, { "epoch": 0.5226373247439243, "grad_norm": 0.0742005780339241, "learning_rate": 9.877019366316541e-05, "loss": 8.7031, "step": 33344, "throughput": 17853.130417455657 }, { "epoch": 0.5231388960728532, "grad_norm": 0.08997371792793274, "learning_rate": 9.85314360660138e-05, "loss": 8.708, "step": 33376, "throughput": 17853.135809429154 }, { "epoch": 0.5236404674017822, "grad_norm": 0.07334605604410172, "learning_rate": 9.829295258541692e-05, "loss": 8.7183, "step": 33408, "throughput": 17852.964725346992 }, { "epoch": 0.5241420387307111, "grad_norm": 0.08196054399013519, "learning_rate": 9.805474420488123e-05, "loss": 8.7397, "step": 33440, "throughput": 17853.29280629254 }, { "epoch": 0.52464361005964, "grad_norm": 0.07731211930513382, "learning_rate": 9.78168119067789e-05, "loss": 8.7371, "step": 33472, "throughput": 17853.354784763338 }, { "epoch": 0.5251451813885689, "grad_norm": 0.08011672645807266, "learning_rate": 9.757915667234339e-05, "loss": 8.7353, "step": 33504, "throughput": 17853.176714040168 }, { "epoch": 0.5256467527174978, "grad_norm": 0.08022370934486389, "learning_rate": 9.734177948166558e-05, "loss": 8.7286, "step": 33536, "throughput": 17853.414800485636 }, { "epoch": 0.5261483240464266, "grad_norm": 0.08035130053758621, "learning_rate": 9.710468131368968e-05, "loss": 8.6915, "step": 33568, "throughput": 17853.693576028792 }, { "epoch": 0.5266498953753556, "grad_norm": 0.07651187479496002, "learning_rate": 9.68678631462093e-05, "loss": 8.7352, "step": 33600, "throughput": 17853.902846549423 }, { "epoch": 0.5271514667042845, "grad_norm": 0.07013057917356491, "learning_rate": 9.66313259558633e-05, "loss": 8.7234, "step": 33632, "throughput": 17854.26304555664 }, { "epoch": 0.5276530380332134, "grad_norm": 0.08306749910116196, "learning_rate": 9.639507071813166e-05, "loss": 8.6926, "step": 33664, "throughput": 17853.92545712553 }, { "epoch": 0.5281546093621423, "grad_norm": 0.07922951132059097, "learning_rate": 9.615909840733167e-05, "loss": 8.7203, "step": 33696, "throughput": 17853.73114229308 }, { "epoch": 0.5286561806910712, "grad_norm": 0.07435827702283859, "learning_rate": 9.592340999661393e-05, "loss": 8.7194, "step": 33728, "throughput": 17853.912868666077 }, { "epoch": 0.5291577520200001, "grad_norm": 0.07166502624750137, "learning_rate": 9.568800645795812e-05, "loss": 8.7444, "step": 33760, "throughput": 17853.987128701727 }, { "epoch": 0.5296593233489291, "grad_norm": 0.08474232256412506, "learning_rate": 9.545288876216901e-05, "loss": 8.712, "step": 33792, "throughput": 17853.808485432633 }, { "epoch": 0.530160894677858, "grad_norm": 0.07914811372756958, "learning_rate": 9.521805787887285e-05, "loss": 8.7079, "step": 33824, "throughput": 17853.99280724263 }, { "epoch": 0.5306624660067869, "grad_norm": 0.07761990278959274, "learning_rate": 9.498351477651286e-05, "loss": 8.7217, "step": 33856, "throughput": 17854.293602626312 }, { "epoch": 0.5311640373357158, "grad_norm": 0.07397238910198212, "learning_rate": 9.47492604223454e-05, "loss": 8.7051, "step": 33888, "throughput": 17854.527119045713 }, { "epoch": 0.5316656086646447, "grad_norm": 0.08450540155172348, "learning_rate": 9.451529578243618e-05, "loss": 8.7161, "step": 33920, "throughput": 17854.96415529531 }, { "epoch": 0.5321671799935737, "grad_norm": 0.08447160571813583, "learning_rate": 9.428162182165607e-05, "loss": 8.7036, "step": 33952, "throughput": 17854.760755613323 }, { "epoch": 0.5326687513225026, "grad_norm": 0.08158237487077713, "learning_rate": 9.40482395036772e-05, "loss": 8.7126, "step": 33984, "throughput": 17854.879374147862 }, { "epoch": 0.5331703226514314, "grad_norm": 0.0762786716222763, "learning_rate": 9.381514979096888e-05, "loss": 8.6922, "step": 34016, "throughput": 17854.642009693944 }, { "epoch": 0.5336718939803603, "grad_norm": 0.07647755742073059, "learning_rate": 9.35823536447938e-05, "loss": 8.7242, "step": 34048, "throughput": 17854.82546758402 }, { "epoch": 0.5341734653092892, "grad_norm": 0.07419923692941666, "learning_rate": 9.334985202520395e-05, "loss": 8.6907, "step": 34080, "throughput": 17854.79020767972 }, { "epoch": 0.5346750366382181, "grad_norm": 0.0749330073595047, "learning_rate": 9.311764589103679e-05, "loss": 8.7215, "step": 34112, "throughput": 17854.728711581607 }, { "epoch": 0.5351766079671471, "grad_norm": 0.07686057686805725, "learning_rate": 9.288573619991096e-05, "loss": 8.7209, "step": 34144, "throughput": 17854.96625759833 }, { "epoch": 0.535678179296076, "grad_norm": 0.0778697207570076, "learning_rate": 9.265412390822278e-05, "loss": 8.7256, "step": 34176, "throughput": 17855.24001360787 }, { "epoch": 0.5361797506250049, "grad_norm": 0.07231098413467407, "learning_rate": 9.242280997114204e-05, "loss": 8.6967, "step": 34208, "throughput": 17855.55976553791 }, { "epoch": 0.5366813219539338, "grad_norm": 0.07696084678173065, "learning_rate": 9.219179534260811e-05, "loss": 8.7035, "step": 34240, "throughput": 17855.732613653654 }, { "epoch": 0.5371828932828627, "grad_norm": 0.07389093190431595, "learning_rate": 9.196108097532597e-05, "loss": 8.7006, "step": 34272, "throughput": 17855.563916979274 }, { "epoch": 0.5376844646117916, "grad_norm": 0.07730893790721893, "learning_rate": 9.173066782076236e-05, "loss": 8.7102, "step": 34304, "throughput": 17855.354474081796 }, { "epoch": 0.5381860359407206, "grad_norm": 0.0806819349527359, "learning_rate": 9.15005568291418e-05, "loss": 8.6878, "step": 34336, "throughput": 17855.554636712146 }, { "epoch": 0.5386876072696495, "grad_norm": 0.07983998954296112, "learning_rate": 9.12707489494428e-05, "loss": 8.6808, "step": 34368, "throughput": 17855.74473759405 }, { "epoch": 0.5391891785985784, "grad_norm": 0.07641927897930145, "learning_rate": 9.104124512939357e-05, "loss": 8.7275, "step": 34400, "throughput": 17855.438142912495 }, { "epoch": 0.5396907499275073, "grad_norm": 0.07484152913093567, "learning_rate": 9.081204631546867e-05, "loss": 8.6999, "step": 34432, "throughput": 17855.625684506198 }, { "epoch": 0.5401923212564361, "grad_norm": 0.07652326673269272, "learning_rate": 9.058315345288465e-05, "loss": 8.6852, "step": 34464, "throughput": 17855.802815831357 }, { "epoch": 0.540693892585365, "grad_norm": 0.08515553176403046, "learning_rate": 9.035456748559639e-05, "loss": 8.727, "step": 34496, "throughput": 17856.033432889442 }, { "epoch": 0.541195463914294, "grad_norm": 0.0841352790594101, "learning_rate": 9.012628935629299e-05, "loss": 8.6966, "step": 34528, "throughput": 17856.388586859128 }, { "epoch": 0.5416970352432229, "grad_norm": 0.0760938748717308, "learning_rate": 8.989832000639424e-05, "loss": 8.6885, "step": 34560, "throughput": 17856.362491268934 }, { "epoch": 0.5421986065721518, "grad_norm": 0.07298042625188828, "learning_rate": 8.967066037604637e-05, "loss": 8.7089, "step": 34592, "throughput": 17856.393811450194 }, { "epoch": 0.5427001779010807, "grad_norm": 0.0769927054643631, "learning_rate": 8.944331140411841e-05, "loss": 8.7142, "step": 34624, "throughput": 17856.153219747895 }, { "epoch": 0.5432017492300096, "grad_norm": 0.07509263604879379, "learning_rate": 8.921627402819813e-05, "loss": 8.7001, "step": 34656, "throughput": 17856.32991604068 }, { "epoch": 0.5437033205589386, "grad_norm": 0.07436039298772812, "learning_rate": 8.898954918458835e-05, "loss": 8.7097, "step": 34688, "throughput": 17856.305279807035 }, { "epoch": 0.5442048918878675, "grad_norm": 0.08183735609054565, "learning_rate": 8.876313780830305e-05, "loss": 8.7207, "step": 34720, "throughput": 17856.23398069712 }, { "epoch": 0.5447064632167964, "grad_norm": 0.07405205070972443, "learning_rate": 8.853704083306341e-05, "loss": 8.7145, "step": 34752, "throughput": 17856.469484585414 }, { "epoch": 0.5452080345457253, "grad_norm": 0.07916124910116196, "learning_rate": 8.831125919129397e-05, "loss": 8.7078, "step": 34784, "throughput": 17856.600001475075 }, { "epoch": 0.5457096058746542, "grad_norm": 0.07913485169410706, "learning_rate": 8.808579381411892e-05, "loss": 8.7067, "step": 34816, "throughput": 17856.912010010525 }, { "epoch": 0.5462111772035831, "grad_norm": 0.07325571030378342, "learning_rate": 8.786064563135815e-05, "loss": 8.7104, "step": 34848, "throughput": 17855.739869060257 }, { "epoch": 0.5467127485325121, "grad_norm": 0.08538568019866943, "learning_rate": 8.763581557152348e-05, "loss": 8.7083, "step": 34880, "throughput": 17855.6542419764 }, { "epoch": 0.5472143198614409, "grad_norm": 0.07836513221263885, "learning_rate": 8.741130456181463e-05, "loss": 8.6979, "step": 34912, "throughput": 17855.51185036211 }, { "epoch": 0.5477158911903698, "grad_norm": 0.07362423837184906, "learning_rate": 8.718711352811573e-05, "loss": 8.7047, "step": 34944, "throughput": 17855.706793913592 }, { "epoch": 0.5482174625192987, "grad_norm": 0.0696859061717987, "learning_rate": 8.696324339499135e-05, "loss": 8.7058, "step": 34976, "throughput": 17855.76802440867 }, { "epoch": 0.5487190338482276, "grad_norm": 0.08644992858171463, "learning_rate": 8.673969508568242e-05, "loss": 8.6796, "step": 35008, "throughput": 17855.577492330583 }, { "epoch": 0.5492206051771565, "grad_norm": 0.07907194644212723, "learning_rate": 8.651646952210293e-05, "loss": 8.7013, "step": 35040, "throughput": 17855.69019825646 }, { "epoch": 0.5497221765060855, "grad_norm": 0.07400470227003098, "learning_rate": 8.629356762483573e-05, "loss": 8.7043, "step": 35072, "throughput": 17855.8359775117 }, { "epoch": 0.5502237478350144, "grad_norm": 0.07419518381357193, "learning_rate": 8.607099031312901e-05, "loss": 8.6961, "step": 35104, "throughput": 17856.147141321344 }, { "epoch": 0.5507253191639433, "grad_norm": 0.08127579092979431, "learning_rate": 8.58487385048921e-05, "loss": 8.7208, "step": 35136, "throughput": 17856.493290845447 }, { "epoch": 0.5512268904928722, "grad_norm": 0.08192048966884613, "learning_rate": 8.562681311669218e-05, "loss": 8.7099, "step": 35168, "throughput": 17856.4989556675 }, { "epoch": 0.5517284618218011, "grad_norm": 0.07196959853172302, "learning_rate": 8.540521506375026e-05, "loss": 8.7026, "step": 35200, "throughput": 17856.195421085813 }, { "epoch": 0.55223003315073, "grad_norm": 0.0753261148929596, "learning_rate": 8.518394525993734e-05, "loss": 8.6949, "step": 35232, "throughput": 17856.353308312617 }, { "epoch": 0.552731604479659, "grad_norm": 0.07535768300294876, "learning_rate": 8.496300461777068e-05, "loss": 8.7055, "step": 35264, "throughput": 17856.536985682873 }, { "epoch": 0.5532331758085879, "grad_norm": 0.07252952456474304, "learning_rate": 8.474239404841023e-05, "loss": 8.6933, "step": 35296, "throughput": 17856.616172365262 }, { "epoch": 0.5537347471375168, "grad_norm": 0.07780899852514267, "learning_rate": 8.452211446165458e-05, "loss": 8.6953, "step": 35328, "throughput": 17856.527483171823 }, { "epoch": 0.5542363184664456, "grad_norm": 0.07243838161230087, "learning_rate": 8.430216676593744e-05, "loss": 8.7208, "step": 35360, "throughput": 17856.624340604638 }, { "epoch": 0.5547378897953745, "grad_norm": 0.07750379294157028, "learning_rate": 8.408255186832372e-05, "loss": 8.7109, "step": 35392, "throughput": 17856.957011109807 }, { "epoch": 0.5552394611243034, "grad_norm": 0.07306235283613205, "learning_rate": 8.386327067450593e-05, "loss": 8.6811, "step": 35424, "throughput": 17857.252341059837 }, { "epoch": 0.5557410324532324, "grad_norm": 0.07733822613954544, "learning_rate": 8.36443240888004e-05, "loss": 8.6948, "step": 35456, "throughput": 17857.16316162486 }, { "epoch": 0.5562426037821613, "grad_norm": 0.0769861564040184, "learning_rate": 8.342571301414342e-05, "loss": 8.7093, "step": 35488, "throughput": 17857.179033760753 }, { "epoch": 0.5567441751110902, "grad_norm": 0.08327770978212357, "learning_rate": 8.320743835208775e-05, "loss": 8.7121, "step": 35520, "throughput": 17857.11541407053 }, { "epoch": 0.5572457464400191, "grad_norm": 0.0749799981713295, "learning_rate": 8.298950100279872e-05, "loss": 8.7003, "step": 35552, "throughput": 17857.41542698246 }, { "epoch": 0.557747317768948, "grad_norm": 0.080297090113163, "learning_rate": 8.27719018650507e-05, "loss": 8.7133, "step": 35584, "throughput": 17857.57912236483 }, { "epoch": 0.558248889097877, "grad_norm": 0.07166144251823425, "learning_rate": 8.255464183622304e-05, "loss": 8.714, "step": 35616, "throughput": 17857.295799307114 }, { "epoch": 0.5587504604268059, "grad_norm": 0.08338946849107742, "learning_rate": 8.23377218122968e-05, "loss": 8.6983, "step": 35648, "throughput": 17857.516026578443 }, { "epoch": 0.5592520317557348, "grad_norm": 0.08566264808177948, "learning_rate": 8.212114268785083e-05, "loss": 8.6859, "step": 35680, "throughput": 17857.63804371969 }, { "epoch": 0.5597536030846637, "grad_norm": 0.07868161797523499, "learning_rate": 8.190490535605809e-05, "loss": 8.6673, "step": 35712, "throughput": 17857.938206572217 }, { "epoch": 0.5602551744135926, "grad_norm": 0.08599625527858734, "learning_rate": 8.16890107086819e-05, "loss": 8.6835, "step": 35744, "throughput": 17857.99435946068 }, { "epoch": 0.5607567457425215, "grad_norm": 0.08485583961009979, "learning_rate": 8.14734596360725e-05, "loss": 8.6944, "step": 35776, "throughput": 17858.055374471925 }, { "epoch": 0.5612583170714504, "grad_norm": 0.0788610652089119, "learning_rate": 8.12582530271631e-05, "loss": 8.7083, "step": 35808, "throughput": 17857.842233432566 }, { "epoch": 0.5617598884003793, "grad_norm": 0.08662694692611694, "learning_rate": 8.104339176946648e-05, "loss": 8.6672, "step": 35840, "throughput": 17858.02721694298 }, { "epoch": 0.5622614597293082, "grad_norm": 0.07397332787513733, "learning_rate": 8.082887674907099e-05, "loss": 8.6857, "step": 35872, "throughput": 17858.320675679395 }, { "epoch": 0.5627630310582371, "grad_norm": 0.07281418889760971, "learning_rate": 8.061470885063726e-05, "loss": 8.7033, "step": 35904, "throughput": 17858.03092891344 }, { "epoch": 0.563264602387166, "grad_norm": 0.07784990966320038, "learning_rate": 8.040088895739433e-05, "loss": 8.7075, "step": 35936, "throughput": 17858.204570293852 }, { "epoch": 0.5637661737160949, "grad_norm": 0.07610969245433807, "learning_rate": 8.018741795113614e-05, "loss": 8.6939, "step": 35968, "throughput": 17858.36220633285 }, { "epoch": 0.5642677450450239, "grad_norm": 0.0770319402217865, "learning_rate": 7.997429671221764e-05, "loss": 8.6916, "step": 36000, "throughput": 17858.675310114504 }, { "epoch": 0.5647693163739528, "grad_norm": 0.07430537790060043, "learning_rate": 7.97615261195515e-05, "loss": 8.699, "step": 36032, "throughput": 17858.865330860357 }, { "epoch": 0.5652708877028817, "grad_norm": 0.0736197680234909, "learning_rate": 7.95491070506043e-05, "loss": 8.7112, "step": 36064, "throughput": 17858.859739329604 }, { "epoch": 0.5657724590318106, "grad_norm": 0.08195227384567261, "learning_rate": 7.933704038139292e-05, "loss": 8.6762, "step": 36096, "throughput": 17859.00238340524 }, { "epoch": 0.5662740303607395, "grad_norm": 0.06957199424505234, "learning_rate": 7.912532698648089e-05, "loss": 8.682, "step": 36128, "throughput": 17858.941718203627 }, { "epoch": 0.5667756016896685, "grad_norm": 0.0705103799700737, "learning_rate": 7.891396773897487e-05, "loss": 8.6731, "step": 36160, "throughput": 17859.129372681775 }, { "epoch": 0.5672771730185974, "grad_norm": 0.07276476919651031, "learning_rate": 7.870296351052104e-05, "loss": 8.6685, "step": 36192, "throughput": 17859.072481472478 }, { "epoch": 0.5677787443475263, "grad_norm": 0.08280736953020096, "learning_rate": 7.849231517130151e-05, "loss": 8.6851, "step": 36224, "throughput": 17858.997460045422 }, { "epoch": 0.5682803156764551, "grad_norm": 0.08353912830352783, "learning_rate": 7.828202359003058e-05, "loss": 8.6826, "step": 36256, "throughput": 17859.26928275893 }, { "epoch": 0.568781887005384, "grad_norm": 0.07920809090137482, "learning_rate": 7.807208963395139e-05, "loss": 8.6754, "step": 36288, "throughput": 17859.399339313015 }, { "epoch": 0.5692834583343129, "grad_norm": 0.07387978583574295, "learning_rate": 7.786251416883218e-05, "loss": 8.6828, "step": 36320, "throughput": 17859.594711944774 }, { "epoch": 0.5697850296632418, "grad_norm": 0.07279038429260254, "learning_rate": 7.765329805896287e-05, "loss": 8.6943, "step": 36352, "throughput": 17859.60028157515 }, { "epoch": 0.5702866009921708, "grad_norm": 0.08469823747873306, "learning_rate": 7.744444216715117e-05, "loss": 8.6924, "step": 36384, "throughput": 17859.770193623226 }, { "epoch": 0.5707881723210997, "grad_norm": 0.08174408227205276, "learning_rate": 7.723594735471952e-05, "loss": 8.6946, "step": 36416, "throughput": 17859.459083652644 }, { "epoch": 0.5712897436500286, "grad_norm": 0.07326170057058334, "learning_rate": 7.702781448150109e-05, "loss": 8.6875, "step": 36448, "throughput": 17859.748036127872 }, { "epoch": 0.5717913149789575, "grad_norm": 0.0783335268497467, "learning_rate": 7.682004440583654e-05, "loss": 8.6803, "step": 36480, "throughput": 17859.800168291476 }, { "epoch": 0.5722928863078864, "grad_norm": 0.07939084619283676, "learning_rate": 7.661263798457014e-05, "loss": 8.6805, "step": 36512, "throughput": 17859.62193777413 }, { "epoch": 0.5727944576368154, "grad_norm": 0.0731113851070404, "learning_rate": 7.64055960730467e-05, "loss": 8.6622, "step": 36544, "throughput": 17859.843484605248 }, { "epoch": 0.5732960289657443, "grad_norm": 0.07936207205057144, "learning_rate": 7.619891952510763e-05, "loss": 8.6914, "step": 36576, "throughput": 17859.902867621913 }, { "epoch": 0.5737976002946732, "grad_norm": 0.07797619700431824, "learning_rate": 7.599260919308764e-05, "loss": 8.6794, "step": 36608, "throughput": 17860.196809289922 }, { "epoch": 0.5742991716236021, "grad_norm": 0.07891387492418289, "learning_rate": 7.578666592781114e-05, "loss": 8.6893, "step": 36640, "throughput": 17860.380665697347 }, { "epoch": 0.574800742952531, "grad_norm": 0.07759033888578415, "learning_rate": 7.558109057858874e-05, "loss": 8.6679, "step": 36672, "throughput": 17860.385192027556 }, { "epoch": 0.5753023142814598, "grad_norm": 0.07606584578752518, "learning_rate": 7.53758839932139e-05, "loss": 8.6838, "step": 36704, "throughput": 17860.52645770749 }, { "epoch": 0.5758038856103888, "grad_norm": 0.08316051214933395, "learning_rate": 7.517104701795905e-05, "loss": 8.6631, "step": 36736, "throughput": 17860.236486716658 }, { "epoch": 0.5763054569393177, "grad_norm": 0.07760239392518997, "learning_rate": 7.496658049757255e-05, "loss": 8.6755, "step": 36768, "throughput": 17860.515150122916 }, { "epoch": 0.5768070282682466, "grad_norm": 0.07454492896795273, "learning_rate": 7.476248527527492e-05, "loss": 8.6737, "step": 36800, "throughput": 17860.45777620071 }, { "epoch": 0.5773085995971755, "grad_norm": 0.07791450619697571, "learning_rate": 7.455876219275552e-05, "loss": 8.6667, "step": 36832, "throughput": 17860.3926783732 }, { "epoch": 0.5778101709261044, "grad_norm": 0.08069723844528198, "learning_rate": 7.435541209016885e-05, "loss": 8.6912, "step": 36864, "throughput": 17860.5391968819 }, { "epoch": 0.5783117422550333, "grad_norm": 0.07404716312885284, "learning_rate": 7.415243580613134e-05, "loss": 8.6685, "step": 36896, "throughput": 17859.50875842157 }, { "epoch": 0.5788133135839623, "grad_norm": 0.07672100514173508, "learning_rate": 7.394983417771791e-05, "loss": 8.6881, "step": 36928, "throughput": 17859.56777824398 }, { "epoch": 0.5793148849128912, "grad_norm": 0.07763490080833435, "learning_rate": 7.374760804045815e-05, "loss": 8.6952, "step": 36960, "throughput": 17859.563325141797 }, { "epoch": 0.5798164562418201, "grad_norm": 0.07308504730463028, "learning_rate": 7.354575822833331e-05, "loss": 8.6714, "step": 36992, "throughput": 17859.84326325879 }, { "epoch": 0.580318027570749, "grad_norm": 0.07863820344209671, "learning_rate": 7.334428557377258e-05, "loss": 8.6904, "step": 37024, "throughput": 17859.579503577177 }, { "epoch": 0.5808195988996779, "grad_norm": 0.07718608528375626, "learning_rate": 7.314319090764985e-05, "loss": 8.6855, "step": 37056, "throughput": 17859.864899200704 }, { "epoch": 0.5813211702286069, "grad_norm": 0.07392841577529907, "learning_rate": 7.294247505928003e-05, "loss": 8.6828, "step": 37088, "throughput": 17859.90820816274 }, { "epoch": 0.5818227415575358, "grad_norm": 0.07847581803798676, "learning_rate": 7.274213885641592e-05, "loss": 8.6726, "step": 37120, "throughput": 17859.732287565872 }, { "epoch": 0.5823243128864646, "grad_norm": 0.07694242149591446, "learning_rate": 7.254218312524461e-05, "loss": 8.6801, "step": 37152, "throughput": 17859.941978431052 }, { "epoch": 0.5828258842153935, "grad_norm": 0.08155602216720581, "learning_rate": 7.234260869038417e-05, "loss": 8.6924, "step": 37184, "throughput": 17860.002150369633 }, { "epoch": 0.5833274555443224, "grad_norm": 0.07697729766368866, "learning_rate": 7.214341637488007e-05, "loss": 8.6981, "step": 37216, "throughput": 17860.28969020785 }, { "epoch": 0.5838290268732513, "grad_norm": 0.07712637633085251, "learning_rate": 7.194460700020206e-05, "loss": 8.6688, "step": 37248, "throughput": 17860.287694656174 }, { "epoch": 0.5843305982021803, "grad_norm": 0.07649629563093185, "learning_rate": 7.174618138624058e-05, "loss": 8.688, "step": 37280, "throughput": 17860.456684093922 }, { "epoch": 0.5848321695311092, "grad_norm": 0.08143318444490433, "learning_rate": 7.154814035130351e-05, "loss": 8.6842, "step": 37312, "throughput": 17860.38516502371 }, { "epoch": 0.5853337408600381, "grad_norm": 0.07608351856470108, "learning_rate": 7.135048471211257e-05, "loss": 8.6752, "step": 37344, "throughput": 17860.40264555555 }, { "epoch": 0.585835312188967, "grad_norm": 0.0704997107386589, "learning_rate": 7.115321528380024e-05, "loss": 8.6978, "step": 37376, "throughput": 17860.682923717475 }, { "epoch": 0.5863368835178959, "grad_norm": 0.07427257299423218, "learning_rate": 7.095633287990622e-05, "loss": 8.6774, "step": 37408, "throughput": 17860.533350386562 }, { "epoch": 0.5868384548468248, "grad_norm": 0.07474479079246521, "learning_rate": 7.075983831237421e-05, "loss": 8.6564, "step": 37440, "throughput": 17860.572141327655 }, { "epoch": 0.5873400261757538, "grad_norm": 0.07758533209562302, "learning_rate": 7.056373239154826e-05, "loss": 8.6725, "step": 37472, "throughput": 17860.650359983818 }, { "epoch": 0.5878415975046827, "grad_norm": 0.08383120596408844, "learning_rate": 7.036801592616982e-05, "loss": 8.6471, "step": 37504, "throughput": 17860.956290430775 }, { "epoch": 0.5883431688336116, "grad_norm": 0.07858950644731522, "learning_rate": 7.017268972337419e-05, "loss": 8.6478, "step": 37536, "throughput": 17861.135781772904 }, { "epoch": 0.5888447401625405, "grad_norm": 0.07414372265338898, "learning_rate": 6.997775458868724e-05, "loss": 8.6781, "step": 37568, "throughput": 17861.13272811816 }, { "epoch": 0.5893463114914693, "grad_norm": 0.08369318395853043, "learning_rate": 6.978321132602197e-05, "loss": 8.6752, "step": 37600, "throughput": 17861.352014974746 }, { "epoch": 0.5898478828203982, "grad_norm": 0.08304564654827118, "learning_rate": 6.95890607376754e-05, "loss": 8.6589, "step": 37632, "throughput": 17861.165783111348 }, { "epoch": 0.5903494541493272, "grad_norm": 0.08017552644014359, "learning_rate": 6.939530362432513e-05, "loss": 8.6764, "step": 37664, "throughput": 17861.44463727893 }, { "epoch": 0.5908510254782561, "grad_norm": 0.07597630470991135, "learning_rate": 6.920194078502611e-05, "loss": 8.671, "step": 37696, "throughput": 17861.483137722276 }, { "epoch": 0.591352596807185, "grad_norm": 0.07199093699455261, "learning_rate": 6.900897301720721e-05, "loss": 8.6762, "step": 37728, "throughput": 17861.313925261547 }, { "epoch": 0.5918541681361139, "grad_norm": 0.07194902747869492, "learning_rate": 6.881640111666807e-05, "loss": 8.6917, "step": 37760, "throughput": 17861.45777088156 }, { "epoch": 0.5923557394650428, "grad_norm": 0.07729242742061615, "learning_rate": 6.862422587757581e-05, "loss": 8.6805, "step": 37792, "throughput": 17861.582855243618 }, { "epoch": 0.5928573107939717, "grad_norm": 0.0748252347111702, "learning_rate": 6.843244809246173e-05, "loss": 8.6948, "step": 37824, "throughput": 17861.868472076487 }, { "epoch": 0.5933588821229007, "grad_norm": 0.06912515312433243, "learning_rate": 6.824106855221788e-05, "loss": 8.6746, "step": 37856, "throughput": 17861.868699846593 }, { "epoch": 0.5938604534518296, "grad_norm": 0.08204159885644913, "learning_rate": 6.805008804609411e-05, "loss": 8.6705, "step": 37888, "throughput": 17862.034973142487 }, { "epoch": 0.5943620247807585, "grad_norm": 0.07541067898273468, "learning_rate": 6.78595073616946e-05, "loss": 8.6669, "step": 37920, "throughput": 17861.798222004218 }, { "epoch": 0.5948635961096874, "grad_norm": 0.07566790282726288, "learning_rate": 6.766932728497468e-05, "loss": 8.6908, "step": 37952, "throughput": 17862.07464676886 }, { "epoch": 0.5953651674386163, "grad_norm": 0.07250373810529709, "learning_rate": 6.747954860023746e-05, "loss": 8.6903, "step": 37984, "throughput": 17862.34271828335 }, { "epoch": 0.5958667387675451, "grad_norm": 0.07323428988456726, "learning_rate": 6.729017209013086e-05, "loss": 8.6791, "step": 38016, "throughput": 17862.1684501893 }, { "epoch": 0.5963683100964741, "grad_norm": 0.07525332272052765, "learning_rate": 6.710119853564422e-05, "loss": 8.6859, "step": 38048, "throughput": 17862.317622242426 }, { "epoch": 0.596869881425403, "grad_norm": 0.08609536290168762, "learning_rate": 6.69126287161049e-05, "loss": 8.695, "step": 38080, "throughput": 17862.192551926404 }, { "epoch": 0.5973714527543319, "grad_norm": 0.07468371838331223, "learning_rate": 6.672446340917553e-05, "loss": 8.6781, "step": 38112, "throughput": 17862.568143605156 }, { "epoch": 0.5978730240832608, "grad_norm": 0.08155862241983414, "learning_rate": 6.653670339085031e-05, "loss": 8.6713, "step": 38144, "throughput": 17862.652431778395 }, { "epoch": 0.5983745954121897, "grad_norm": 0.07647927850484848, "learning_rate": 6.634934943545217e-05, "loss": 8.6772, "step": 38176, "throughput": 17862.627789866117 }, { "epoch": 0.5988761667411187, "grad_norm": 0.07881610840559006, "learning_rate": 6.616240231562933e-05, "loss": 8.6527, "step": 38208, "throughput": 17862.73911744598 }, { "epoch": 0.5993777380700476, "grad_norm": 0.07397406548261642, "learning_rate": 6.597586280235227e-05, "loss": 8.6814, "step": 38240, "throughput": 17862.676804243994 }, { "epoch": 0.5998793093989765, "grad_norm": 0.08274087309837341, "learning_rate": 6.578973166491053e-05, "loss": 8.6655, "step": 38272, "throughput": 17862.84161762198 }, { "epoch": 0.6003808807279054, "grad_norm": 0.07987891137599945, "learning_rate": 6.560400967090948e-05, "loss": 8.6669, "step": 38304, "throughput": 17862.883476007497 }, { "epoch": 0.6008824520568343, "grad_norm": 0.09073396027088165, "learning_rate": 6.54186975862671e-05, "loss": 8.6597, "step": 38336, "throughput": 17862.81187994118 }, { "epoch": 0.6013840233857632, "grad_norm": 0.0760880559682846, "learning_rate": 6.523379617521104e-05, "loss": 8.6609, "step": 38368, "throughput": 17862.823933162377 }, { "epoch": 0.6018855947146922, "grad_norm": 0.08379305154085159, "learning_rate": 6.504930620027524e-05, "loss": 8.6619, "step": 38400, "throughput": 17863.056926972076 }, { "epoch": 0.6023871660436211, "grad_norm": 0.08088319003582001, "learning_rate": 6.486522842229692e-05, "loss": 8.6684, "step": 38432, "throughput": 17863.332135046952 }, { "epoch": 0.6028887373725499, "grad_norm": 0.07462088018655777, "learning_rate": 6.468156360041337e-05, "loss": 8.6695, "step": 38464, "throughput": 17863.224382775457 }, { "epoch": 0.6033903087014788, "grad_norm": 0.08881346881389618, "learning_rate": 6.449831249205887e-05, "loss": 8.6569, "step": 38496, "throughput": 17863.50051418473 }, { "epoch": 0.6038918800304077, "grad_norm": 0.07640784978866577, "learning_rate": 6.431547585296156e-05, "loss": 8.6554, "step": 38528, "throughput": 17863.268214463256 }, { "epoch": 0.6043934513593366, "grad_norm": 0.07738711684942245, "learning_rate": 6.413305443714022e-05, "loss": 8.6579, "step": 38560, "throughput": 17863.535238141452 }, { "epoch": 0.6048950226882656, "grad_norm": 0.07391881197690964, "learning_rate": 6.395104899690134e-05, "loss": 8.66, "step": 38592, "throughput": 17863.58042819578 }, { "epoch": 0.6053965940171945, "grad_norm": 0.07256294786930084, "learning_rate": 6.37694602828359e-05, "loss": 8.6675, "step": 38624, "throughput": 17863.51573004575 }, { "epoch": 0.6058981653461234, "grad_norm": 0.07247397303581238, "learning_rate": 6.358828904381632e-05, "loss": 8.656, "step": 38656, "throughput": 17863.72591574277 }, { "epoch": 0.6063997366750523, "grad_norm": 0.07626520097255707, "learning_rate": 6.340753602699327e-05, "loss": 8.6792, "step": 38688, "throughput": 17863.736278779154 }, { "epoch": 0.6069013080039812, "grad_norm": 0.0811394453048706, "learning_rate": 6.322720197779275e-05, "loss": 8.6834, "step": 38720, "throughput": 17864.114418141355 }, { "epoch": 0.6074028793329102, "grad_norm": 0.07609833031892776, "learning_rate": 6.304728763991291e-05, "loss": 8.6639, "step": 38752, "throughput": 17864.02078649924 }, { "epoch": 0.6079044506618391, "grad_norm": 0.07995349168777466, "learning_rate": 6.286779375532107e-05, "loss": 8.6797, "step": 38784, "throughput": 17864.17338739285 }, { "epoch": 0.608406021990768, "grad_norm": 0.07657574117183685, "learning_rate": 6.268872106425044e-05, "loss": 8.6393, "step": 38816, "throughput": 17864.181334000346 }, { "epoch": 0.6089075933196969, "grad_norm": 0.07328113168478012, "learning_rate": 6.25100703051974e-05, "loss": 8.6498, "step": 38848, "throughput": 17864.109557804193 }, { "epoch": 0.6094091646486258, "grad_norm": 0.0713716670870781, "learning_rate": 6.233184221491818e-05, "loss": 8.6694, "step": 38880, "throughput": 17864.327473130015 }, { "epoch": 0.6099107359775546, "grad_norm": 0.08026931434869766, "learning_rate": 6.2154037528426e-05, "loss": 8.6497, "step": 38912, "throughput": 17864.380336494443 }, { "epoch": 0.6104123073064835, "grad_norm": 0.07720151543617249, "learning_rate": 6.197665697898784e-05, "loss": 8.6616, "step": 38944, "throughput": 17863.307410331086 }, { "epoch": 0.6109138786354125, "grad_norm": 0.08285202085971832, "learning_rate": 6.179970129812166e-05, "loss": 8.6576, "step": 38976, "throughput": 17863.390790070513 }, { "epoch": 0.6114154499643414, "grad_norm": 0.07704629004001617, "learning_rate": 6.16231712155932e-05, "loss": 8.6726, "step": 39008, "throughput": 17863.601863733846 }, { "epoch": 0.6119170212932703, "grad_norm": 0.07858503609895706, "learning_rate": 6.144706745941308e-05, "loss": 8.652, "step": 39040, "throughput": 17863.759204938622 }, { "epoch": 0.6124185926221992, "grad_norm": 0.07336670160293579, "learning_rate": 6.127139075583363e-05, "loss": 8.6579, "step": 39072, "throughput": 17863.636421163745 }, { "epoch": 0.6129201639511281, "grad_norm": 0.07628186047077179, "learning_rate": 6.109614182934616e-05, "loss": 8.6587, "step": 39104, "throughput": 17863.897427308668 }, { "epoch": 0.6134217352800571, "grad_norm": 0.0749865397810936, "learning_rate": 6.092132140267775e-05, "loss": 8.6461, "step": 39136, "throughput": 17863.672068682863 }, { "epoch": 0.613923306608986, "grad_norm": 0.08119495958089828, "learning_rate": 6.074693019678839e-05, "loss": 8.667, "step": 39168, "throughput": 17863.94105634038 }, { "epoch": 0.6144248779379149, "grad_norm": 0.07770372927188873, "learning_rate": 6.0572968930867827e-05, "loss": 8.6489, "step": 39200, "throughput": 17863.909958673925 }, { "epoch": 0.6149264492668438, "grad_norm": 0.08431252092123032, "learning_rate": 6.039943832233293e-05, "loss": 8.6562, "step": 39232, "throughput": 17863.84274779138 }, { "epoch": 0.6154280205957727, "grad_norm": 0.07534243166446686, "learning_rate": 6.022633908682442e-05, "loss": 8.6567, "step": 39264, "throughput": 17864.039271383816 }, { "epoch": 0.6159295919247016, "grad_norm": 0.07038530707359314, "learning_rate": 6.005367193820408e-05, "loss": 8.6517, "step": 39296, "throughput": 17864.034321243686 }, { "epoch": 0.6164311632536306, "grad_norm": 0.07531935721635818, "learning_rate": 5.9881437588551675e-05, "loss": 8.657, "step": 39328, "throughput": 17864.300657879492 }, { "epoch": 0.6169327345825594, "grad_norm": 0.07444503158330917, "learning_rate": 5.970963674816224e-05, "loss": 8.6488, "step": 39360, "throughput": 17864.20170582491 }, { "epoch": 0.6174343059114883, "grad_norm": 0.07967787981033325, "learning_rate": 5.953827012554291e-05, "loss": 8.662, "step": 39392, "throughput": 17864.458779070905 }, { "epoch": 0.6179358772404172, "grad_norm": 0.07082749158143997, "learning_rate": 5.9367338427410197e-05, "loss": 8.6544, "step": 39424, "throughput": 17864.34052637765 }, { "epoch": 0.6184374485693461, "grad_norm": 0.07375257462263107, "learning_rate": 5.9196842358686866e-05, "loss": 8.6608, "step": 39456, "throughput": 17864.398214765588 }, { "epoch": 0.618939019898275, "grad_norm": 0.07579505443572998, "learning_rate": 5.902678262249923e-05, "loss": 8.6612, "step": 39488, "throughput": 17864.590355831428 }, { "epoch": 0.619440591227204, "grad_norm": 0.07662634551525116, "learning_rate": 5.885715992017419e-05, "loss": 8.6461, "step": 39520, "throughput": 17864.417285726835 }, { "epoch": 0.6199421625561329, "grad_norm": 0.08262430131435394, "learning_rate": 5.86879749512362e-05, "loss": 8.651, "step": 39552, "throughput": 17864.564681502638 }, { "epoch": 0.6204437338850618, "grad_norm": 0.09002744406461716, "learning_rate": 5.851922841340461e-05, "loss": 8.6551, "step": 39584, "throughput": 17864.489150612582 }, { "epoch": 0.6209453052139907, "grad_norm": 0.07560932636260986, "learning_rate": 5.835092100259063e-05, "loss": 8.6438, "step": 39616, "throughput": 17864.85466818457 }, { "epoch": 0.6214468765429196, "grad_norm": 0.07000603526830673, "learning_rate": 5.818305341289458e-05, "loss": 8.6616, "step": 39648, "throughput": 17865.022496325797 }, { "epoch": 0.6219484478718486, "grad_norm": 0.07508612424135208, "learning_rate": 5.8015626336602814e-05, "loss": 8.6451, "step": 39680, "throughput": 17865.015064796968 }, { "epoch": 0.6224500192007775, "grad_norm": 0.07781418412923813, "learning_rate": 5.7848640464185124e-05, "loss": 8.6664, "step": 39712, "throughput": 17865.122405643884 }, { "epoch": 0.6229515905297064, "grad_norm": 0.07823880016803741, "learning_rate": 5.768209648429174e-05, "loss": 8.662, "step": 39744, "throughput": 17864.99515528762 }, { "epoch": 0.6234531618586353, "grad_norm": 0.07522641867399216, "learning_rate": 5.751599508375059e-05, "loss": 8.6612, "step": 39776, "throughput": 17865.26393103338 }, { "epoch": 0.6239547331875641, "grad_norm": 0.07135847210884094, "learning_rate": 5.735033694756423e-05, "loss": 8.6458, "step": 39808, "throughput": 17865.114361056123 }, { "epoch": 0.624456304516493, "grad_norm": 0.07262608408927917, "learning_rate": 5.718512275890737e-05, "loss": 8.6467, "step": 39840, "throughput": 17865.039723312893 }, { "epoch": 0.624957875845422, "grad_norm": 0.08001768589019775, "learning_rate": 5.70203531991238e-05, "loss": 8.6502, "step": 39872, "throughput": 17865.138677653576 }, { "epoch": 0.6254594471743509, "grad_norm": 0.07615388929843903, "learning_rate": 5.6856028947723734e-05, "loss": 8.6538, "step": 39904, "throughput": 17865.24251195156 }, { "epoch": 0.6259610185032798, "grad_norm": 0.08307041972875595, "learning_rate": 5.669215068238075e-05, "loss": 8.6371, "step": 39936, "throughput": 17865.49912828423 }, { "epoch": 0.6264625898322087, "grad_norm": 0.1058107390999794, "learning_rate": 5.652871907892934e-05, "loss": 8.6577, "step": 39968, "throughput": 17865.508025420746 }, { "epoch": 0.6269641611611376, "grad_norm": 0.07321104407310486, "learning_rate": 5.6365734811362026e-05, "loss": 8.654, "step": 40000, "throughput": 17865.873432565157 }, { "epoch": 0.6274657324900665, "grad_norm": 0.07945670187473297, "learning_rate": 5.620319855182629e-05, "loss": 8.644, "step": 40032, "throughput": 17865.753371531107 }, { "epoch": 0.6279673038189955, "grad_norm": 0.07705490291118622, "learning_rate": 5.60411109706222e-05, "loss": 8.619, "step": 40064, "throughput": 17865.828809691076 }, { "epoch": 0.6284688751479244, "grad_norm": 0.07273361086845398, "learning_rate": 5.587947273619938e-05, "loss": 8.6327, "step": 40096, "throughput": 17865.806906101432 }, { "epoch": 0.6289704464768533, "grad_norm": 0.08058905601501465, "learning_rate": 5.5718284515154476e-05, "loss": 8.6399, "step": 40128, "throughput": 17865.742978327868 }, { "epoch": 0.6294720178057822, "grad_norm": 0.08202604949474335, "learning_rate": 5.5557546972228114e-05, "loss": 8.668, "step": 40160, "throughput": 17865.81605389241 }, { "epoch": 0.6299735891347111, "grad_norm": 0.07899637520313263, "learning_rate": 5.539726077030239e-05, "loss": 8.66, "step": 40192, "throughput": 17865.723897540585 }, { "epoch": 0.63047516046364, "grad_norm": 0.07868487387895584, "learning_rate": 5.523742657039809e-05, "loss": 8.6464, "step": 40224, "throughput": 17865.993595646247 }, { "epoch": 0.6309767317925689, "grad_norm": 0.0851953774690628, "learning_rate": 5.5078045031672005e-05, "loss": 8.6763, "step": 40256, "throughput": 17866.002235211934 }, { "epoch": 0.6314783031214978, "grad_norm": 0.0772816613316536, "learning_rate": 5.491911681141394e-05, "loss": 8.6405, "step": 40288, "throughput": 17866.25481727828 }, { "epoch": 0.6319798744504267, "grad_norm": 0.08760164678096771, "learning_rate": 5.476064256504443e-05, "loss": 8.6784, "step": 40320, "throughput": 17866.391651604456 }, { "epoch": 0.6324814457793556, "grad_norm": 0.0925959125161171, "learning_rate": 5.460262294611172e-05, "loss": 8.6622, "step": 40352, "throughput": 17866.315716446847 }, { "epoch": 0.6329830171082845, "grad_norm": 0.07837571203708649, "learning_rate": 5.444505860628923e-05, "loss": 8.6255, "step": 40384, "throughput": 17866.510661128217 }, { "epoch": 0.6334845884372134, "grad_norm": 0.07390439510345459, "learning_rate": 5.428795019537268e-05, "loss": 8.6348, "step": 40416, "throughput": 17866.438074365313 }, { "epoch": 0.6339861597661424, "grad_norm": 0.10645684599876404, "learning_rate": 5.413129836127766e-05, "loss": 8.6481, "step": 40448, "throughput": 17866.375502482675 }, { "epoch": 0.6344877310950713, "grad_norm": 0.07660505175590515, "learning_rate": 5.3975103750036805e-05, "loss": 8.6345, "step": 40480, "throughput": 17866.386718452814 }, { "epoch": 0.6349893024240002, "grad_norm": 0.08478232473134995, "learning_rate": 5.3819367005797186e-05, "loss": 8.631, "step": 40512, "throughput": 17866.570802730967 }, { "epoch": 0.6354908737529291, "grad_norm": 0.0727619156241417, "learning_rate": 5.366408877081752e-05, "loss": 8.6615, "step": 40544, "throughput": 17866.730174278655 }, { "epoch": 0.635992445081858, "grad_norm": 0.07355979830026627, "learning_rate": 5.3509269685465764e-05, "loss": 8.6351, "step": 40576, "throughput": 17866.643518617777 }, { "epoch": 0.636494016410787, "grad_norm": 0.08197405189275742, "learning_rate": 5.3354910388216274e-05, "loss": 8.6527, "step": 40608, "throughput": 17866.997496697804 }, { "epoch": 0.6369955877397159, "grad_norm": 0.07571222633123398, "learning_rate": 5.3201011515647276e-05, "loss": 8.658, "step": 40640, "throughput": 17866.88506691309 }, { "epoch": 0.6374971590686448, "grad_norm": 0.08034916967153549, "learning_rate": 5.304757370243811e-05, "loss": 8.6353, "step": 40672, "throughput": 17867.07195681159 }, { "epoch": 0.6379987303975736, "grad_norm": 0.09238483011722565, "learning_rate": 5.2894597581366835e-05, "loss": 8.6596, "step": 40704, "throughput": 17867.052423325236 }, { "epoch": 0.6385003017265025, "grad_norm": 0.07784246653318405, "learning_rate": 5.274208378330737e-05, "loss": 8.658, "step": 40736, "throughput": 17866.879184258778 }, { "epoch": 0.6390018730554314, "grad_norm": 0.08024382591247559, "learning_rate": 5.2590032937227154e-05, "loss": 8.6502, "step": 40768, "throughput": 17867.069276156857 }, { "epoch": 0.6395034443843604, "grad_norm": 0.08907058835029602, "learning_rate": 5.2438445670184244e-05, "loss": 8.6509, "step": 40800, "throughput": 17867.088888346483 }, { "epoch": 0.6400050157132893, "grad_norm": 0.07528900355100632, "learning_rate": 5.2287322607325e-05, "loss": 8.6323, "step": 40832, "throughput": 17867.34687568463 }, { "epoch": 0.6405065870422182, "grad_norm": 0.07476245611906052, "learning_rate": 5.213666437188141e-05, "loss": 8.6636, "step": 40864, "throughput": 17867.248490118967 }, { "epoch": 0.6410081583711471, "grad_norm": 0.07534540444612503, "learning_rate": 5.1986471585168485e-05, "loss": 8.6525, "step": 40896, "throughput": 17867.501511953807 }, { "epoch": 0.641509729700076, "grad_norm": 0.07289925962686539, "learning_rate": 5.183674486658167e-05, "loss": 8.6673, "step": 40928, "throughput": 17867.44215734667 }, { "epoch": 0.6420113010290049, "grad_norm": 0.07585127651691437, "learning_rate": 5.168748483359445e-05, "loss": 8.6576, "step": 40960, "throughput": 17867.563558104393 }, { "epoch": 0.6425128723579339, "grad_norm": 0.07819195091724396, "learning_rate": 5.153869210175563e-05, "loss": 8.6272, "step": 40992, "throughput": 17866.695754184875 }, { "epoch": 0.6430144436868628, "grad_norm": 0.07709189504384995, "learning_rate": 5.139036728468686e-05, "loss": 8.6352, "step": 41024, "throughput": 17866.540715824405 }, { "epoch": 0.6435160150157917, "grad_norm": 0.07325899600982666, "learning_rate": 5.124251099408012e-05, "loss": 8.6239, "step": 41056, "throughput": 17866.573961717964 }, { "epoch": 0.6440175863447206, "grad_norm": 0.07731231302022934, "learning_rate": 5.1095123839695224e-05, "loss": 8.6481, "step": 41088, "throughput": 17866.52019947398 }, { "epoch": 0.6445191576736495, "grad_norm": 0.07855982333421707, "learning_rate": 5.0948206429357224e-05, "loss": 8.6585, "step": 41120, "throughput": 17866.874899436556 }, { "epoch": 0.6450207290025783, "grad_norm": 0.08262521773576736, "learning_rate": 5.080175936895392e-05, "loss": 8.6673, "step": 41152, "throughput": 17866.97401668372 }, { "epoch": 0.6455223003315073, "grad_norm": 0.07248621433973312, "learning_rate": 5.065578326243348e-05, "loss": 8.6146, "step": 41184, "throughput": 17866.939932998972 }, { "epoch": 0.6460238716604362, "grad_norm": 0.08461394906044006, "learning_rate": 5.0510278711801735e-05, "loss": 8.6513, "step": 41216, "throughput": 17867.19175924385 }, { "epoch": 0.6465254429893651, "grad_norm": 0.08080363273620605, "learning_rate": 5.036524631711996e-05, "loss": 8.6439, "step": 41248, "throughput": 17867.07581491338 }, { "epoch": 0.647027014318294, "grad_norm": 0.07436025887727737, "learning_rate": 5.02206866765021e-05, "loss": 8.6421, "step": 41280, "throughput": 17867.35340200034 }, { "epoch": 0.6475285856472229, "grad_norm": 0.07391635328531265, "learning_rate": 5.007660038611259e-05, "loss": 8.6543, "step": 41312, "throughput": 17867.44467388088 }, { "epoch": 0.6480301569761518, "grad_norm": 0.08293146640062332, "learning_rate": 4.9932988040163726e-05, "loss": 8.6386, "step": 41344, "throughput": 17867.060722918603 }, { "epoch": 0.6485317283050808, "grad_norm": 0.08641663193702698, "learning_rate": 4.978985023091324e-05, "loss": 8.6454, "step": 41376, "throughput": 17867.136581283157 }, { "epoch": 0.6490332996340097, "grad_norm": 0.07811315357685089, "learning_rate": 4.964718754866186e-05, "loss": 8.6361, "step": 41408, "throughput": 17867.35349034328 }, { "epoch": 0.6495348709629386, "grad_norm": 0.07370807230472565, "learning_rate": 4.95050005817509e-05, "loss": 8.6499, "step": 41440, "throughput": 17867.511919709723 }, { "epoch": 0.6500364422918675, "grad_norm": 0.0777527242898941, "learning_rate": 4.936328991655988e-05, "loss": 8.6117, "step": 41472, "throughput": 17867.50111318367 }, { "epoch": 0.6505380136207964, "grad_norm": 0.0801592543721199, "learning_rate": 4.9222056137504e-05, "loss": 8.6319, "step": 41504, "throughput": 17867.853086445513 }, { "epoch": 0.6510395849497254, "grad_norm": 0.07333212345838547, "learning_rate": 4.908129982703169e-05, "loss": 8.6352, "step": 41536, "throughput": 17867.628775432426 }, { "epoch": 0.6515411562786543, "grad_norm": 0.07759763300418854, "learning_rate": 4.8941021565622516e-05, "loss": 8.6491, "step": 41568, "throughput": 17867.804595213962 }, { "epoch": 0.6520427276075831, "grad_norm": 0.07329500466585159, "learning_rate": 4.880122193178441e-05, "loss": 8.6472, "step": 41600, "throughput": 17867.99294167593 }, { "epoch": 0.652544298936512, "grad_norm": 0.07508452236652374, "learning_rate": 4.866190150205143e-05, "loss": 8.6398, "step": 41632, "throughput": 17867.830850863997 }, { "epoch": 0.6530458702654409, "grad_norm": 0.0712026059627533, "learning_rate": 4.8523060850981476e-05, "loss": 8.62, "step": 41664, "throughput": 17867.803101399382 }, { "epoch": 0.6535474415943698, "grad_norm": 0.09108668565750122, "learning_rate": 4.838470055115379e-05, "loss": 8.6402, "step": 41696, "throughput": 17867.811763701153 }, { "epoch": 0.6540490129232988, "grad_norm": 0.08174485713243484, "learning_rate": 4.82468211731667e-05, "loss": 8.6461, "step": 41728, "throughput": 17868.16016042303 }, { "epoch": 0.6545505842522277, "grad_norm": 0.08193115890026093, "learning_rate": 4.8109423285635116e-05, "loss": 8.6443, "step": 41760, "throughput": 17868.069688114592 }, { "epoch": 0.6550521555811566, "grad_norm": 0.0756475180387497, "learning_rate": 4.797250745518833e-05, "loss": 8.607, "step": 41792, "throughput": 17868.219940900595 }, { "epoch": 0.6555537269100855, "grad_norm": 0.07722701877355576, "learning_rate": 4.7836074246467685e-05, "loss": 8.6325, "step": 41824, "throughput": 17868.244047539516 }, { "epoch": 0.6560552982390144, "grad_norm": 0.07927604764699936, "learning_rate": 4.770012422212412e-05, "loss": 8.6342, "step": 41856, "throughput": 17868.25155414449 }, { "epoch": 0.6565568695679433, "grad_norm": 0.09216690063476562, "learning_rate": 4.756465794281592e-05, "loss": 8.6335, "step": 41888, "throughput": 17868.52465285224 }, { "epoch": 0.6570584408968723, "grad_norm": 0.07883986085653305, "learning_rate": 4.742967596720641e-05, "loss": 8.6454, "step": 41920, "throughput": 17868.603658253276 }, { "epoch": 0.6575600122258012, "grad_norm": 0.07400567084550858, "learning_rate": 4.729517885196169e-05, "loss": 8.6558, "step": 41952, "throughput": 17868.331232637775 }, { "epoch": 0.6580615835547301, "grad_norm": 0.07267136126756668, "learning_rate": 4.716116715174827e-05, "loss": 8.6288, "step": 41984, "throughput": 17868.341387047603 }, { "epoch": 0.6585631548836589, "grad_norm": 0.07420884817838669, "learning_rate": 4.702764141923075e-05, "loss": 8.6473, "step": 42016, "throughput": 17868.619800005705 }, { "epoch": 0.6590647262125878, "grad_norm": 0.07612421363592148, "learning_rate": 4.6894602205069674e-05, "loss": 8.6197, "step": 42048, "throughput": 17868.768551557903 }, { "epoch": 0.6595662975415167, "grad_norm": 0.09027434140443802, "learning_rate": 4.6762050057919165e-05, "loss": 8.6223, "step": 42080, "throughput": 17868.673524801692 }, { "epoch": 0.6600678688704457, "grad_norm": 0.0758587121963501, "learning_rate": 4.6629985524424686e-05, "loss": 8.6244, "step": 42112, "throughput": 17869.0184421209 }, { "epoch": 0.6605694401993746, "grad_norm": 0.08677732199430466, "learning_rate": 4.649840914922071e-05, "loss": 8.6343, "step": 42144, "throughput": 17868.714091906582 }, { "epoch": 0.6610710115283035, "grad_norm": 0.07691498100757599, "learning_rate": 4.636732147492863e-05, "loss": 8.6237, "step": 42176, "throughput": 17868.886598381487 }, { "epoch": 0.6615725828572324, "grad_norm": 0.07497978210449219, "learning_rate": 4.6236723042154424e-05, "loss": 8.6488, "step": 42208, "throughput": 17868.970009957025 }, { "epoch": 0.6620741541861613, "grad_norm": 0.07713934779167175, "learning_rate": 4.61066143894864e-05, "loss": 8.6389, "step": 42240, "throughput": 17868.90043001298 }, { "epoch": 0.6625757255150903, "grad_norm": 0.0759284496307373, "learning_rate": 4.5976996053492996e-05, "loss": 8.6253, "step": 42272, "throughput": 17868.98162581538 }, { "epoch": 0.6630772968440192, "grad_norm": 0.07967263460159302, "learning_rate": 4.5847868568720646e-05, "loss": 8.6185, "step": 42304, "throughput": 17868.993457005505 }, { "epoch": 0.6635788681729481, "grad_norm": 0.07670048624277115, "learning_rate": 4.571923246769147e-05, "loss": 8.6412, "step": 42336, "throughput": 17869.152099055646 }, { "epoch": 0.664080439501877, "grad_norm": 0.08368648588657379, "learning_rate": 4.559108828090115e-05, "loss": 8.632, "step": 42368, "throughput": 17869.164191981206 }, { "epoch": 0.6645820108308059, "grad_norm": 0.08008282631635666, "learning_rate": 4.546343653681667e-05, "loss": 8.6426, "step": 42400, "throughput": 17869.406153800315 }, { "epoch": 0.6650835821597348, "grad_norm": 0.07593069970607758, "learning_rate": 4.53362777618742e-05, "loss": 8.6291, "step": 42432, "throughput": 17869.34026576544 }, { "epoch": 0.6655851534886637, "grad_norm": 0.07599702477455139, "learning_rate": 4.52096124804769e-05, "loss": 8.6262, "step": 42464, "throughput": 17869.34267106005 }, { "epoch": 0.6660867248175926, "grad_norm": 0.08322039246559143, "learning_rate": 4.508344121499281e-05, "loss": 8.628, "step": 42496, "throughput": 17869.44801207976 }, { "epoch": 0.6665882961465215, "grad_norm": 0.08085814118385315, "learning_rate": 4.495776448575255e-05, "loss": 8.6247, "step": 42528, "throughput": 17869.685302503272 }, { "epoch": 0.6670898674754504, "grad_norm": 0.07521507143974304, "learning_rate": 4.483258281104734e-05, "loss": 8.6076, "step": 42560, "throughput": 17869.411012059038 }, { "epoch": 0.6675914388043793, "grad_norm": 0.08697548508644104, "learning_rate": 4.470789670712681e-05, "loss": 8.6259, "step": 42592, "throughput": 17869.41670981458 }, { "epoch": 0.6680930101333082, "grad_norm": 0.08319617062807083, "learning_rate": 4.458370668819676e-05, "loss": 8.6411, "step": 42624, "throughput": 17869.702199175634 }, { "epoch": 0.6685945814622372, "grad_norm": 0.074796162545681, "learning_rate": 4.4460013266417226e-05, "loss": 8.6154, "step": 42656, "throughput": 17869.84765573023 }, { "epoch": 0.6690961527911661, "grad_norm": 0.10984188318252563, "learning_rate": 4.433681695190027e-05, "loss": 8.6231, "step": 42688, "throughput": 17869.75924603985 }, { "epoch": 0.669597724120095, "grad_norm": 0.07607369124889374, "learning_rate": 4.421411825270785e-05, "loss": 8.614, "step": 42720, "throughput": 17869.892649707363 }, { "epoch": 0.6700992954490239, "grad_norm": 0.07601217180490494, "learning_rate": 4.4091917674849727e-05, "loss": 8.6327, "step": 42752, "throughput": 17869.76525839264 }, { "epoch": 0.6706008667779528, "grad_norm": 0.08114274591207504, "learning_rate": 4.397021572228147e-05, "loss": 8.6388, "step": 42784, "throughput": 17869.938173688686 }, { "epoch": 0.6711024381068817, "grad_norm": 0.09427239745855331, "learning_rate": 4.38490128969023e-05, "loss": 8.6407, "step": 42816, "throughput": 17870.02010821782 }, { "epoch": 0.6716040094358107, "grad_norm": 0.077272430062294, "learning_rate": 4.3728309698553056e-05, "loss": 8.6417, "step": 42848, "throughput": 17869.953129300473 }, { "epoch": 0.6721055807647396, "grad_norm": 0.07326050847768784, "learning_rate": 4.3608106625014014e-05, "loss": 8.6258, "step": 42880, "throughput": 17869.9248813899 }, { "epoch": 0.6726071520936684, "grad_norm": 0.07527446001768112, "learning_rate": 4.348840417200306e-05, "loss": 8.6325, "step": 42912, "throughput": 17870.028090881584 }, { "epoch": 0.6731087234225973, "grad_norm": 0.07910473644733429, "learning_rate": 4.336920283317343e-05, "loss": 8.6445, "step": 42944, "throughput": 17870.18679976226 }, { "epoch": 0.6736102947515262, "grad_norm": 0.07715103030204773, "learning_rate": 4.325050310011183e-05, "loss": 8.648, "step": 42976, "throughput": 17870.101235465474 }, { "epoch": 0.6741118660804551, "grad_norm": 0.09285691380500793, "learning_rate": 4.3132305462336306e-05, "loss": 8.6354, "step": 43008, "throughput": 17870.4344978396 }, { "epoch": 0.6746134374093841, "grad_norm": 0.07316839694976807, "learning_rate": 4.301461040729424e-05, "loss": 8.6547, "step": 43040, "throughput": 17829.53093573461 }, { "epoch": 0.675115008738313, "grad_norm": 0.08353468775749207, "learning_rate": 4.289741842036042e-05, "loss": 8.6261, "step": 43072, "throughput": 17828.00636019171 }, { "epoch": 0.6756165800672419, "grad_norm": 0.07400113344192505, "learning_rate": 4.2780729984834916e-05, "loss": 8.6173, "step": 43104, "throughput": 17827.94909204432 }, { "epoch": 0.6761181513961708, "grad_norm": 0.08323801308870316, "learning_rate": 4.266454558194122e-05, "loss": 8.6261, "step": 43136, "throughput": 17827.857852711364 }, { "epoch": 0.6766197227250997, "grad_norm": 0.08236182481050491, "learning_rate": 4.254886569082413e-05, "loss": 8.61, "step": 43168, "throughput": 17828.213728487226 }, { "epoch": 0.6771212940540287, "grad_norm": 0.08978724479675293, "learning_rate": 4.243369078854788e-05, "loss": 8.6226, "step": 43200, "throughput": 17828.47533696741 }, { "epoch": 0.6776228653829576, "grad_norm": 0.07722384482622147, "learning_rate": 4.231902135009407e-05, "loss": 8.6436, "step": 43232, "throughput": 17828.832063955277 }, { "epoch": 0.6781244367118865, "grad_norm": 0.07705316692590714, "learning_rate": 4.220485784835984e-05, "loss": 8.6338, "step": 43264, "throughput": 17829.196605309022 }, { "epoch": 0.6786260080408154, "grad_norm": 0.07429414242506027, "learning_rate": 4.209120075415577e-05, "loss": 8.6076, "step": 43296, "throughput": 17829.554240003934 }, { "epoch": 0.6791275793697443, "grad_norm": 0.07961345463991165, "learning_rate": 4.197805053620411e-05, "loss": 8.6129, "step": 43328, "throughput": 17829.906459575002 }, { "epoch": 0.6796291506986731, "grad_norm": 0.07683772593736649, "learning_rate": 4.186540766113665e-05, "loss": 8.6178, "step": 43360, "throughput": 17829.15771918659 }, { "epoch": 0.680130722027602, "grad_norm": 0.07365067303180695, "learning_rate": 4.1753272593492956e-05, "loss": 8.6282, "step": 43392, "throughput": 17828.201463559795 }, { "epoch": 0.680632293356531, "grad_norm": 0.07368257641792297, "learning_rate": 4.1641645795718364e-05, "loss": 8.6367, "step": 43424, "throughput": 17828.16390825877 }, { "epoch": 0.6811338646854599, "grad_norm": 0.08067294210195541, "learning_rate": 4.153052772816217e-05, "loss": 8.6053, "step": 43456, "throughput": 17828.43430354821 }, { "epoch": 0.6816354360143888, "grad_norm": 0.0738702118396759, "learning_rate": 4.141991884907555e-05, "loss": 8.6118, "step": 43488, "throughput": 17828.703134283922 }, { "epoch": 0.6821370073433177, "grad_norm": 0.07744862884283066, "learning_rate": 4.1309819614609865e-05, "loss": 8.6139, "step": 43520, "throughput": 17829.06611352555 }, { "epoch": 0.6826385786722466, "grad_norm": 0.08360811322927475, "learning_rate": 4.1200230478814695e-05, "loss": 8.6318, "step": 43552, "throughput": 17829.43111372124 }, { "epoch": 0.6831401500011756, "grad_norm": 0.08169478923082352, "learning_rate": 4.109115189363601e-05, "loss": 8.6276, "step": 43584, "throughput": 17829.79810554368 }, { "epoch": 0.6836417213301045, "grad_norm": 0.07911770045757294, "learning_rate": 4.0982584308914114e-05, "loss": 8.5991, "step": 43616, "throughput": 17830.156279143917 }, { "epoch": 0.6841432926590334, "grad_norm": 0.0773385539650917, "learning_rate": 4.0874528172382114e-05, "loss": 8.6271, "step": 43648, "throughput": 17830.127300317476 }, { "epoch": 0.6846448639879623, "grad_norm": 0.07815414667129517, "learning_rate": 4.0766983929663835e-05, "loss": 8.6012, "step": 43680, "throughput": 17829.028219723397 }, { "epoch": 0.6851464353168912, "grad_norm": 0.07514085620641708, "learning_rate": 4.065995202427206e-05, "loss": 8.6132, "step": 43712, "throughput": 17829.106705307986 }, { "epoch": 0.6856480066458202, "grad_norm": 0.07363571971654892, "learning_rate": 4.055343289760664e-05, "loss": 8.6209, "step": 43744, "throughput": 17829.082599309702 }, { "epoch": 0.6861495779747491, "grad_norm": 0.10579396039247513, "learning_rate": 4.0447426988952816e-05, "loss": 8.5954, "step": 43776, "throughput": 17829.43620231659 }, { "epoch": 0.6866511493036779, "grad_norm": 0.08076892048120499, "learning_rate": 4.0341934735479224e-05, "loss": 8.6166, "step": 43808, "throughput": 17829.703211476124 }, { "epoch": 0.6871527206326068, "grad_norm": 0.07535163313150406, "learning_rate": 4.02369565722363e-05, "loss": 8.6349, "step": 43840, "throughput": 17830.054087310255 }, { "epoch": 0.6876542919615357, "grad_norm": 0.07902940362691879, "learning_rate": 4.013249293215422e-05, "loss": 8.5933, "step": 43872, "throughput": 17830.408681836223 }, { "epoch": 0.6881558632904646, "grad_norm": 0.07208844274282455, "learning_rate": 4.0028544246041406e-05, "loss": 8.6062, "step": 43904, "throughput": 17830.762075605933 }, { "epoch": 0.6886574346193935, "grad_norm": 0.07734175026416779, "learning_rate": 3.99251109425825e-05, "loss": 8.6341, "step": 43936, "throughput": 17831.037793466916 }, { "epoch": 0.6891590059483225, "grad_norm": 0.07820406556129456, "learning_rate": 3.982219344833681e-05, "loss": 8.6247, "step": 43968, "throughput": 17829.944007343434 }, { "epoch": 0.6896605772772514, "grad_norm": 0.08111374080181122, "learning_rate": 3.971979218773634e-05, "loss": 8.6006, "step": 44000, "throughput": 17829.537729953823 }, { "epoch": 0.6901621486061803, "grad_norm": 0.0744817703962326, "learning_rate": 3.961790758308418e-05, "loss": 8.6149, "step": 44032, "throughput": 17829.513134364835 }, { "epoch": 0.6906637199351092, "grad_norm": 0.07969462871551514, "learning_rate": 3.951654005455281e-05, "loss": 8.61, "step": 44064, "throughput": 17829.86407906323 }, { "epoch": 0.6911652912640381, "grad_norm": 0.0778898149728775, "learning_rate": 3.9415690020182154e-05, "loss": 8.6197, "step": 44096, "throughput": 17830.128389522783 }, { "epoch": 0.6916668625929671, "grad_norm": 0.07783157378435135, "learning_rate": 3.9315357895878066e-05, "loss": 8.6106, "step": 44128, "throughput": 17830.48168304553 }, { "epoch": 0.692168433921896, "grad_norm": 0.08259106427431107, "learning_rate": 3.921554409541053e-05, "loss": 8.6088, "step": 44160, "throughput": 17830.83309655857 }, { "epoch": 0.6926700052508249, "grad_norm": 0.08314735442399979, "learning_rate": 3.911624903041198e-05, "loss": 8.6292, "step": 44192, "throughput": 17831.187801043976 }, { "epoch": 0.6931715765797538, "grad_norm": 0.07506611198186874, "learning_rate": 3.9017473110375525e-05, "loss": 8.6192, "step": 44224, "throughput": 17831.44545054685 }, { "epoch": 0.6936731479086826, "grad_norm": 0.08658742159605026, "learning_rate": 3.891921674265336e-05, "loss": 8.6018, "step": 44256, "throughput": 17831.315833658613 }, { "epoch": 0.6941747192376115, "grad_norm": 0.08220481127500534, "learning_rate": 3.8821480332455024e-05, "loss": 8.61, "step": 44288, "throughput": 17830.152042987287 }, { "epoch": 0.6946762905665405, "grad_norm": 0.08411680907011032, "learning_rate": 3.87242642828458e-05, "loss": 8.6295, "step": 44320, "throughput": 17830.21507840018 }, { "epoch": 0.6951778618954694, "grad_norm": 0.09514406323432922, "learning_rate": 3.862756899474493e-05, "loss": 8.6189, "step": 44352, "throughput": 17830.27413842766 }, { "epoch": 0.6956794332243983, "grad_norm": 0.07466054707765579, "learning_rate": 3.853139486692408e-05, "loss": 8.592, "step": 44384, "throughput": 17830.626133393966 }, { "epoch": 0.6961810045533272, "grad_norm": 0.07343938946723938, "learning_rate": 3.843574229600565e-05, "loss": 8.6085, "step": 44416, "throughput": 17830.88249626846 }, { "epoch": 0.6966825758822561, "grad_norm": 0.08367404341697693, "learning_rate": 3.834061167646112e-05, "loss": 8.6191, "step": 44448, "throughput": 17831.23872612226 }, { "epoch": 0.697184147211185, "grad_norm": 0.08425655215978622, "learning_rate": 3.8246003400609424e-05, "loss": 8.6116, "step": 44480, "throughput": 17831.58888669082 }, { "epoch": 0.697685718540114, "grad_norm": 0.09636666625738144, "learning_rate": 3.81519178586154e-05, "loss": 8.6087, "step": 44512, "throughput": 17831.944748926202 }, { "epoch": 0.6981872898690429, "grad_norm": 0.07186683267354965, "learning_rate": 3.805835543848809e-05, "loss": 8.6372, "step": 44544, "throughput": 17831.92925302466 }, { "epoch": 0.6986888611979718, "grad_norm": 0.07774645835161209, "learning_rate": 3.796531652607919e-05, "loss": 8.629, "step": 44576, "throughput": 17830.856019052466 }, { "epoch": 0.6991904325269007, "grad_norm": 0.07877375930547714, "learning_rate": 3.7872801505081434e-05, "loss": 8.6345, "step": 44608, "throughput": 17830.757211903798 }, { "epoch": 0.6996920038558296, "grad_norm": 0.07710936665534973, "learning_rate": 3.778081075702709e-05, "loss": 8.6043, "step": 44640, "throughput": 17830.732995255126 }, { "epoch": 0.7001935751847586, "grad_norm": 0.0777658000588417, "learning_rate": 3.7689344661286264e-05, "loss": 8.6328, "step": 44672, "throughput": 17831.086625902175 }, { "epoch": 0.7006951465136874, "grad_norm": 0.07222341746091843, "learning_rate": 3.759840359506536e-05, "loss": 8.6041, "step": 44704, "throughput": 17831.35073612898 }, { "epoch": 0.7011967178426163, "grad_norm": 0.07875847816467285, "learning_rate": 3.750798793340565e-05, "loss": 8.6201, "step": 44736, "throughput": 17831.704921569017 }, { "epoch": 0.7016982891715452, "grad_norm": 0.08122248947620392, "learning_rate": 3.7418098049181573e-05, "loss": 8.6311, "step": 44768, "throughput": 17832.053439285082 }, { "epoch": 0.7021998605004741, "grad_norm": 0.08080139756202698, "learning_rate": 3.732873431309929e-05, "loss": 8.6082, "step": 44800, "throughput": 17832.40082616424 }, { "epoch": 0.702701431829403, "grad_norm": 0.0753706693649292, "learning_rate": 3.7239897093695106e-05, "loss": 8.6183, "step": 44832, "throughput": 17832.662159292475 }, { "epoch": 0.703203003158332, "grad_norm": 0.07338026911020279, "learning_rate": 3.715158675733396e-05, "loss": 8.618, "step": 44864, "throughput": 17832.246083241713 }, { "epoch": 0.7037045744872609, "grad_norm": 0.07462576031684875, "learning_rate": 3.706380366820796e-05, "loss": 8.6148, "step": 44896, "throughput": 17831.393643303032 }, { "epoch": 0.7042061458161898, "grad_norm": 0.07330302149057388, "learning_rate": 3.6976548188334834e-05, "loss": 8.5964, "step": 44928, "throughput": 17831.456045595824 }, { "epoch": 0.7047077171451187, "grad_norm": 0.07165367156267166, "learning_rate": 3.688982067755642e-05, "loss": 8.5966, "step": 44960, "throughput": 17831.61336550786 }, { "epoch": 0.7052092884740476, "grad_norm": 0.07546305656433105, "learning_rate": 3.680362149353724e-05, "loss": 8.6257, "step": 44992, "throughput": 17831.95989946819 }, { "epoch": 0.7057108598029765, "grad_norm": 0.07777175307273865, "learning_rate": 3.671795099176297e-05, "loss": 8.6032, "step": 45024, "throughput": 17832.22300289001 }, { "epoch": 0.7062124311319055, "grad_norm": 0.08201725035905838, "learning_rate": 3.6632809525539055e-05, "loss": 8.6169, "step": 45056, "throughput": 17832.570589285373 }, { "epoch": 0.7067140024608344, "grad_norm": 0.07370094209909439, "learning_rate": 3.6548197445989086e-05, "loss": 8.6216, "step": 45088, "throughput": 17831.93015607742 }, { "epoch": 0.7072155737897633, "grad_norm": 0.09871018677949905, "learning_rate": 3.6464115102053596e-05, "loss": 8.6063, "step": 45120, "throughput": 17832.114820630442 }, { "epoch": 0.7077171451186921, "grad_norm": 0.08234908431768417, "learning_rate": 3.6380562840488376e-05, "loss": 8.6291, "step": 45152, "throughput": 17832.17183748897 }, { "epoch": 0.708218716447621, "grad_norm": 0.08061608672142029, "learning_rate": 3.629754100586323e-05, "loss": 8.6041, "step": 45184, "throughput": 17831.104851476342 }, { "epoch": 0.7087202877765499, "grad_norm": 0.07441861927509308, "learning_rate": 3.6215049940560433e-05, "loss": 8.6275, "step": 45216, "throughput": 17831.083314860523 }, { "epoch": 0.7092218591054789, "grad_norm": 0.07990828156471252, "learning_rate": 3.613308998477339e-05, "loss": 8.579, "step": 45248, "throughput": 17831.062522278604 }, { "epoch": 0.7097234304344078, "grad_norm": 0.07660244405269623, "learning_rate": 3.605166147650517e-05, "loss": 8.6059, "step": 45280, "throughput": 17831.40752875969 }, { "epoch": 0.7102250017633367, "grad_norm": 0.07470241189002991, "learning_rate": 3.597076475156726e-05, "loss": 8.629, "step": 45312, "throughput": 17831.660710762444 }, { "epoch": 0.7107265730922656, "grad_norm": 0.08347544074058533, "learning_rate": 3.589040014357791e-05, "loss": 8.6297, "step": 45344, "throughput": 17832.003144784834 }, { "epoch": 0.7112281444211945, "grad_norm": 0.08342598378658295, "learning_rate": 3.581056798396105e-05, "loss": 8.6175, "step": 45376, "throughput": 17832.345687120098 }, { "epoch": 0.7117297157501234, "grad_norm": 0.07837989181280136, "learning_rate": 3.57312686019447e-05, "loss": 8.6017, "step": 45408, "throughput": 17832.603146867266 }, { "epoch": 0.7122312870790524, "grad_norm": 0.07344204932451248, "learning_rate": 3.565250232455983e-05, "loss": 8.6169, "step": 45440, "throughput": 17832.851243073444 }, { "epoch": 0.7127328584079813, "grad_norm": 0.07202349603176117, "learning_rate": 3.55742694766387e-05, "loss": 8.6055, "step": 45472, "throughput": 17831.984718235526 }, { "epoch": 0.7132344297369102, "grad_norm": 0.08134770393371582, "learning_rate": 3.549657038081386e-05, "loss": 8.6118, "step": 45504, "throughput": 17831.51519479825 }, { "epoch": 0.7137360010658391, "grad_norm": 0.07818509638309479, "learning_rate": 3.5419405357516624e-05, "loss": 8.5977, "step": 45536, "throughput": 17831.574718349315 }, { "epoch": 0.714237572394768, "grad_norm": 0.07608740031719208, "learning_rate": 3.534277472497574e-05, "loss": 8.6037, "step": 45568, "throughput": 17831.82476007102 }, { "epoch": 0.7147391437236968, "grad_norm": 0.08183170855045319, "learning_rate": 3.52666787992162e-05, "loss": 8.6228, "step": 45600, "throughput": 17832.07426776666 }, { "epoch": 0.7152407150526258, "grad_norm": 0.07773195207118988, "learning_rate": 3.519111789405779e-05, "loss": 8.6376, "step": 45632, "throughput": 17832.416637519258 }, { "epoch": 0.7157422863815547, "grad_norm": 0.07735767215490341, "learning_rate": 3.5116092321113936e-05, "loss": 8.612, "step": 45664, "throughput": 17832.676189491172 }, { "epoch": 0.7162438577104836, "grad_norm": 0.07806604355573654, "learning_rate": 3.504160238979032e-05, "loss": 8.592, "step": 45696, "throughput": 17833.019040719744 }, { "epoch": 0.7167454290394125, "grad_norm": 0.07910951972007751, "learning_rate": 3.496764840728361e-05, "loss": 8.591, "step": 45728, "throughput": 17833.26704762432 }, { "epoch": 0.7172470003683414, "grad_norm": 0.08164380490779877, "learning_rate": 3.489423067858027e-05, "loss": 8.6037, "step": 45760, "throughput": 17833.137383345835 }, { "epoch": 0.7177485716972704, "grad_norm": 0.0812004879117012, "learning_rate": 3.4821349506455255e-05, "loss": 8.6192, "step": 45792, "throughput": 17832.10878133463 }, { "epoch": 0.7182501430261993, "grad_norm": 0.07716728746891022, "learning_rate": 3.47490051914707e-05, "loss": 8.5838, "step": 45824, "throughput": 17832.087124480226 }, { "epoch": 0.7187517143551282, "grad_norm": 0.07972914725542068, "learning_rate": 3.4677198031974784e-05, "loss": 8.6032, "step": 45856, "throughput": 17832.146003866314 }, { "epoch": 0.7192532856840571, "grad_norm": 0.07909884303808212, "learning_rate": 3.4605928324100444e-05, "loss": 8.6251, "step": 45888, "throughput": 17832.48324538191 }, { "epoch": 0.719754857012986, "grad_norm": 0.07721769064664841, "learning_rate": 3.45351963617642e-05, "loss": 8.5953, "step": 45920, "throughput": 17832.729419615276 }, { "epoch": 0.720256428341915, "grad_norm": 0.07230034470558167, "learning_rate": 3.446500243666481e-05, "loss": 8.6252, "step": 45952, "throughput": 17832.980164510158 }, { "epoch": 0.7207579996708439, "grad_norm": 0.0790412500500679, "learning_rate": 3.439534683828228e-05, "loss": 8.6085, "step": 45984, "throughput": 17833.314847562127 }, { "epoch": 0.7212595709997727, "grad_norm": 0.08125613629817963, "learning_rate": 3.4326229853876475e-05, "loss": 8.6184, "step": 46016, "throughput": 17833.57056758044 }, { "epoch": 0.7217611423287016, "grad_norm": 0.08089485764503479, "learning_rate": 3.425765176848607e-05, "loss": 8.6079, "step": 46048, "throughput": 17833.715059902177 }, { "epoch": 0.7222627136576305, "grad_norm": 0.08002211153507233, "learning_rate": 3.418961286492728e-05, "loss": 8.6169, "step": 46080, "throughput": 17832.842095556927 }, { "epoch": 0.7227642849865594, "grad_norm": 0.08807891607284546, "learning_rate": 3.412211342379273e-05, "loss": 8.6204, "step": 46112, "throughput": 17832.438760989724 }, { "epoch": 0.7232658563154883, "grad_norm": 0.07419393956661224, "learning_rate": 3.405515372345033e-05, "loss": 8.6076, "step": 46144, "throughput": 17832.408519828827 }, { "epoch": 0.7237674276444173, "grad_norm": 0.0870908796787262, "learning_rate": 3.398873404004209e-05, "loss": 8.6187, "step": 46176, "throughput": 17832.746373556896 }, { "epoch": 0.7242689989733462, "grad_norm": 0.07417533546686172, "learning_rate": 3.392285464748298e-05, "loss": 8.6184, "step": 46208, "throughput": 17832.997842482142 }, { "epoch": 0.7247705703022751, "grad_norm": 0.07752197980880737, "learning_rate": 3.385751581745979e-05, "loss": 8.6129, "step": 46240, "throughput": 17833.339610502808 }, { "epoch": 0.725272141631204, "grad_norm": 0.08416827768087387, "learning_rate": 3.379271781943007e-05, "loss": 8.5945, "step": 46272, "throughput": 17833.587420699234 }, { "epoch": 0.7257737129601329, "grad_norm": 0.1027483195066452, "learning_rate": 3.372846092062095e-05, "loss": 8.6091, "step": 46304, "throughput": 17833.920243728942 }, { "epoch": 0.7262752842890619, "grad_norm": 0.07594089955091476, "learning_rate": 3.366474538602806e-05, "loss": 8.6121, "step": 46336, "throughput": 17834.165193588993 }, { "epoch": 0.7267768556179908, "grad_norm": 0.07859986275434494, "learning_rate": 3.3601571478414455e-05, "loss": 8.5916, "step": 46368, "throughput": 17833.943095792438 }, { "epoch": 0.7272784269469197, "grad_norm": 0.07391414791345596, "learning_rate": 3.3538939458309556e-05, "loss": 8.6061, "step": 46400, "throughput": 17833.008350621407 }, { "epoch": 0.7277799982758486, "grad_norm": 0.07304394245147705, "learning_rate": 3.347684958400795e-05, "loss": 8.5964, "step": 46432, "throughput": 17832.974470104615 }, { "epoch": 0.7282815696047774, "grad_norm": 0.08239227533340454, "learning_rate": 3.341530211156847e-05, "loss": 8.6014, "step": 46464, "throughput": 17833.22047940315 }, { "epoch": 0.7287831409337063, "grad_norm": 0.07560481131076813, "learning_rate": 3.33542972948131e-05, "loss": 8.604, "step": 46496, "throughput": 17833.552998804513 }, { "epoch": 0.7292847122626352, "grad_norm": 0.08268658071756363, "learning_rate": 3.329383538532587e-05, "loss": 8.6096, "step": 46528, "throughput": 17833.803191749368 }, { "epoch": 0.7297862835915642, "grad_norm": 0.08136653900146484, "learning_rate": 3.323391663245188e-05, "loss": 8.6022, "step": 46560, "throughput": 17834.047552995467 }, { "epoch": 0.7302878549204931, "grad_norm": 0.07478020340204239, "learning_rate": 3.3174541283296225e-05, "loss": 8.5982, "step": 46592, "throughput": 17834.308879658493 }, { "epoch": 0.730789426249422, "grad_norm": 0.07832559198141098, "learning_rate": 3.311570958272303e-05, "loss": 8.5859, "step": 46624, "throughput": 17834.55308335101 }, { "epoch": 0.7312909975783509, "grad_norm": 0.07957372814416885, "learning_rate": 3.305742177335444e-05, "loss": 8.5923, "step": 46656, "throughput": 17834.52670424745 }, { "epoch": 0.7317925689072798, "grad_norm": 0.07402876764535904, "learning_rate": 3.29996780955695e-05, "loss": 8.591, "step": 46688, "throughput": 17833.69300948565 }, { "epoch": 0.7322941402362088, "grad_norm": 0.09786561131477356, "learning_rate": 3.294247878750333e-05, "loss": 8.6094, "step": 46720, "throughput": 17833.587017346064 }, { "epoch": 0.7327957115651377, "grad_norm": 0.07788746058940887, "learning_rate": 3.288582408504603e-05, "loss": 8.5917, "step": 46752, "throughput": 17833.55305593615 }, { "epoch": 0.7332972828940666, "grad_norm": 0.0738791972398758, "learning_rate": 3.2829714221841805e-05, "loss": 8.63, "step": 46784, "throughput": 17833.883406695128 }, { "epoch": 0.7337988542229955, "grad_norm": 0.07963655143976212, "learning_rate": 3.2774149429287854e-05, "loss": 8.606, "step": 46816, "throughput": 17834.13096712285 }, { "epoch": 0.7343004255519244, "grad_norm": 0.0802462100982666, "learning_rate": 3.271912993653357e-05, "loss": 8.6161, "step": 46848, "throughput": 17834.37435759249 }, { "epoch": 0.7348019968808533, "grad_norm": 0.07487773150205612, "learning_rate": 3.266465597047948e-05, "loss": 8.6063, "step": 46880, "throughput": 17834.613366413905 }, { "epoch": 0.7353035682097822, "grad_norm": 0.07707255333662033, "learning_rate": 3.261072775577641e-05, "loss": 8.609, "step": 46912, "throughput": 17834.93786758653 }, { "epoch": 0.7358051395387111, "grad_norm": 0.08311225473880768, "learning_rate": 3.255734551482446e-05, "loss": 8.5932, "step": 46944, "throughput": 17835.193244551054 }, { "epoch": 0.73630671086764, "grad_norm": 0.07928361743688583, "learning_rate": 3.2504509467772154e-05, "loss": 8.5993, "step": 46976, "throughput": 17834.624306488906 }, { "epoch": 0.7368082821965689, "grad_norm": 0.07688979804515839, "learning_rate": 3.24522198325155e-05, "loss": 8.5923, "step": 47008, "throughput": 17834.22779306419 }, { "epoch": 0.7373098535254978, "grad_norm": 0.08063149452209473, "learning_rate": 3.2400476824697126e-05, "loss": 8.5946, "step": 47040, "throughput": 17834.103907984365 }, { "epoch": 0.7378114248544267, "grad_norm": 0.0874280259013176, "learning_rate": 3.234928065770532e-05, "loss": 8.6212, "step": 47072, "throughput": 17834.343842706545 }, { "epoch": 0.7383129961833557, "grad_norm": 0.08335231989622116, "learning_rate": 3.2298631542673254e-05, "loss": 8.6064, "step": 47104, "throughput": 17834.663927160742 }, { "epoch": 0.7388145675122846, "grad_norm": 0.07642818242311478, "learning_rate": 3.2248529688478036e-05, "loss": 8.6317, "step": 47136, "throughput": 17834.044860553677 }, { "epoch": 0.7393161388412135, "grad_norm": 0.10783732682466507, "learning_rate": 3.2198975301739834e-05, "loss": 8.5986, "step": 47168, "throughput": 17834.201288398795 }, { "epoch": 0.7398177101701424, "grad_norm": 0.07692237198352814, "learning_rate": 3.214996858682109e-05, "loss": 8.6017, "step": 47200, "throughput": 17834.44666997345 }, { "epoch": 0.7403192814990713, "grad_norm": 0.07549633085727692, "learning_rate": 3.210150974582565e-05, "loss": 8.62, "step": 47232, "throughput": 17834.614133441388 }, { "epoch": 0.7408208528280003, "grad_norm": 0.08472277969121933, "learning_rate": 3.205359897859793e-05, "loss": 8.5948, "step": 47264, "throughput": 17834.48982830613 }, { "epoch": 0.7413224241569292, "grad_norm": 0.08152199536561966, "learning_rate": 3.2006236482722034e-05, "loss": 8.5738, "step": 47296, "throughput": 17833.84796111838 }, { "epoch": 0.7418239954858581, "grad_norm": 0.07427741587162018, "learning_rate": 3.195942245352108e-05, "loss": 8.6153, "step": 47328, "throughput": 17833.826142962356 }, { "epoch": 0.7423255668147869, "grad_norm": 0.07587329298257828, "learning_rate": 3.191315708405626e-05, "loss": 8.5985, "step": 47360, "throughput": 17833.884920516462 }, { "epoch": 0.7428271381437158, "grad_norm": 0.08033397793769836, "learning_rate": 3.1867440565126066e-05, "loss": 8.6256, "step": 47392, "throughput": 17834.21834739414 }, { "epoch": 0.7433287094726447, "grad_norm": 0.07397351413965225, "learning_rate": 3.182227308526557e-05, "loss": 8.5935, "step": 47424, "throughput": 17834.463669530487 }, { "epoch": 0.7438302808015737, "grad_norm": 0.0776250809431076, "learning_rate": 3.17776548307456e-05, "loss": 8.6178, "step": 47456, "throughput": 17834.702054140886 }, { "epoch": 0.7443318521305026, "grad_norm": 0.08041536062955856, "learning_rate": 3.173358598557196e-05, "loss": 8.5952, "step": 47488, "throughput": 17834.94603244263 }, { "epoch": 0.7448334234594315, "grad_norm": 0.09945892542600632, "learning_rate": 3.169006673148473e-05, "loss": 8.5852, "step": 47520, "throughput": 17835.013868235914 }, { "epoch": 0.7453349947883604, "grad_norm": 0.08749539405107498, "learning_rate": 3.1647097247957385e-05, "loss": 8.5971, "step": 47552, "throughput": 17835.153046151987 }, { "epoch": 0.7458365661172893, "grad_norm": 0.08924855291843414, "learning_rate": 3.160467771219624e-05, "loss": 8.6044, "step": 47584, "throughput": 17834.596442235495 }, { "epoch": 0.7463381374462182, "grad_norm": 0.07980576902627945, "learning_rate": 3.1562808299139596e-05, "loss": 8.6152, "step": 47616, "throughput": 17834.48573924514 }, { "epoch": 0.7468397087751472, "grad_norm": 0.07602015882730484, "learning_rate": 3.1521489181457005e-05, "loss": 8.607, "step": 47648, "throughput": 17834.36694344504 }, { "epoch": 0.7473412801040761, "grad_norm": 0.08380915224552155, "learning_rate": 3.1480720529548654e-05, "loss": 8.5986, "step": 47680, "throughput": 17834.694986968072 }, { "epoch": 0.747842851433005, "grad_norm": 0.07392337173223495, "learning_rate": 3.1440502511544566e-05, "loss": 8.5987, "step": 47712, "throughput": 17835.02278537719 }, { "epoch": 0.7483444227619339, "grad_norm": 0.08508224040269852, "learning_rate": 3.1400835293303984e-05, "loss": 8.6111, "step": 47744, "throughput": 17835.263740486364 }, { "epoch": 0.7488459940908628, "grad_norm": 0.08330404758453369, "learning_rate": 3.136171903841463e-05, "loss": 8.6165, "step": 47776, "throughput": 17835.428268281223 }, { "epoch": 0.7493475654197916, "grad_norm": 0.07605268806219101, "learning_rate": 3.1323153908192057e-05, "loss": 8.6041, "step": 47808, "throughput": 17835.6553983799 }, { "epoch": 0.7498491367487206, "grad_norm": 0.07306007295846939, "learning_rate": 3.128514006167897e-05, "loss": 8.6178, "step": 47840, "throughput": 17835.65838665241 }, { "epoch": 0.7503507080776495, "grad_norm": 0.0778181254863739, "learning_rate": 3.124767765564459e-05, "loss": 8.5957, "step": 47872, "throughput": 17835.34722429415 }, { "epoch": 0.7508522794065784, "grad_norm": 0.07451704889535904, "learning_rate": 3.121076684458398e-05, "loss": 8.6047, "step": 47904, "throughput": 17834.8868689467 }, { "epoch": 0.7513538507355073, "grad_norm": 0.08760581165552139, "learning_rate": 3.1174407780717433e-05, "loss": 8.6037, "step": 47936, "throughput": 17834.85964877205 }, { "epoch": 0.7518554220644362, "grad_norm": 0.07690441608428955, "learning_rate": 3.113860061398985e-05, "loss": 8.585, "step": 47968, "throughput": 17835.006197636758 }, { "epoch": 0.7523569933933651, "grad_norm": 0.07794877141714096, "learning_rate": 3.110334549207009e-05, "loss": 8.5965, "step": 48000, "throughput": 17835.327988956305 }, { "epoch": 0.7528585647222941, "grad_norm": 0.08219928294420242, "learning_rate": 3.1068642560350375e-05, "loss": 8.5826, "step": 48032, "throughput": 17835.570811645724 }, { "epoch": 0.753360136051223, "grad_norm": 0.08382223546504974, "learning_rate": 3.103449196194569e-05, "loss": 8.6001, "step": 48064, "throughput": 17835.806531516253 }, { "epoch": 0.7538617073801519, "grad_norm": 0.09056299924850464, "learning_rate": 3.1000893837693234e-05, "loss": 8.6214, "step": 48096, "throughput": 17836.045335655446 }, { "epoch": 0.7543632787090808, "grad_norm": 0.07294950634241104, "learning_rate": 3.096784832615175e-05, "loss": 8.5694, "step": 48128, "throughput": 17836.02776593902 }, { "epoch": 0.7548648500380097, "grad_norm": 0.08173788338899612, "learning_rate": 3.093535556360101e-05, "loss": 8.6219, "step": 48160, "throughput": 17836.069887898226 }, { "epoch": 0.7553664213669387, "grad_norm": 0.07701551169157028, "learning_rate": 3.0903415684041285e-05, "loss": 8.6025, "step": 48192, "throughput": 17835.41495806723 }, { "epoch": 0.7558679926958676, "grad_norm": 0.08350755274295807, "learning_rate": 3.087202881919273e-05, "loss": 8.6023, "step": 48224, "throughput": 17835.314069311517 }, { "epoch": 0.7563695640247964, "grad_norm": 0.08125930279493332, "learning_rate": 3.084119509849488e-05, "loss": 8.6016, "step": 48256, "throughput": 17835.284502123366 }, { "epoch": 0.7568711353537253, "grad_norm": 0.12003647536039352, "learning_rate": 3.081091464910606e-05, "loss": 8.6096, "step": 48288, "throughput": 17835.608629798637 }, { "epoch": 0.7573727066826542, "grad_norm": 0.07632914930582047, "learning_rate": 3.078118759590295e-05, "loss": 8.5979, "step": 48320, "throughput": 17835.931485541114 }, { "epoch": 0.7578742780115831, "grad_norm": 0.07661539316177368, "learning_rate": 3.075201406148001e-05, "loss": 8.5919, "step": 48352, "throughput": 17836.173041678863 }, { "epoch": 0.758375849340512, "grad_norm": 0.07512614876031876, "learning_rate": 3.072339416614899e-05, "loss": 8.5966, "step": 48384, "throughput": 17836.333431530336 }, { "epoch": 0.758877420669441, "grad_norm": 0.08936319500207901, "learning_rate": 3.069532802793839e-05, "loss": 8.585, "step": 48416, "throughput": 17836.487537129284 }, { "epoch": 0.7593789919983699, "grad_norm": 0.08849947899580002, "learning_rate": 3.066781576259309e-05, "loss": 8.6167, "step": 48448, "throughput": 17836.47048999063 }, { "epoch": 0.7598805633272988, "grad_norm": 0.08645664155483246, "learning_rate": 3.0640857483573714e-05, "loss": 8.6013, "step": 48480, "throughput": 17836.080591388745 }, { "epoch": 0.7603821346562277, "grad_norm": 0.07802789658308029, "learning_rate": 3.061445330205631e-05, "loss": 8.5903, "step": 48512, "throughput": 17835.79941755457 }, { "epoch": 0.7608837059851566, "grad_norm": 0.08236244320869446, "learning_rate": 3.0588603326931796e-05, "loss": 8.6045, "step": 48544, "throughput": 17835.773908487055 }, { "epoch": 0.7613852773140856, "grad_norm": 0.08064370602369308, "learning_rate": 3.056330766480554e-05, "loss": 8.5926, "step": 48576, "throughput": 17836.009046012525 }, { "epoch": 0.7618868486430145, "grad_norm": 0.07869768887758255, "learning_rate": 3.053856641999694e-05, "loss": 8.5944, "step": 48608, "throughput": 17836.32572679976 }, { "epoch": 0.7623884199719434, "grad_norm": 0.07431543618440628, "learning_rate": 3.0514379694538932e-05, "loss": 8.5902, "step": 48640, "throughput": 17836.554931513605 }, { "epoch": 0.7628899913008723, "grad_norm": 0.07379096746444702, "learning_rate": 3.0490747588177684e-05, "loss": 8.6127, "step": 48672, "throughput": 17836.709908699562 }, { "epoch": 0.7633915626298011, "grad_norm": 0.07593395560979843, "learning_rate": 3.0467670198372044e-05, "loss": 8.6016, "step": 48704, "throughput": 17836.941250927004 }, { "epoch": 0.76389313395873, "grad_norm": 0.07255639880895615, "learning_rate": 3.044514762029326e-05, "loss": 8.5867, "step": 48736, "throughput": 17837.08882129428 }, { "epoch": 0.764394705287659, "grad_norm": 0.0845591202378273, "learning_rate": 3.0423179946824494e-05, "loss": 8.6074, "step": 48768, "throughput": 17836.78557055867 }, { "epoch": 0.7648962766165879, "grad_norm": 0.07482326030731201, "learning_rate": 3.040176726856049e-05, "loss": 8.6001, "step": 48800, "throughput": 17836.485349372793 }, { "epoch": 0.7653978479455168, "grad_norm": 0.07807141542434692, "learning_rate": 3.0380909673807205e-05, "loss": 8.5799, "step": 48832, "throughput": 17836.38466703605 }, { "epoch": 0.7658994192744457, "grad_norm": 0.07797779142856598, "learning_rate": 3.0360607248581437e-05, "loss": 8.6014, "step": 48864, "throughput": 17836.43798871803 }, { "epoch": 0.7664009906033746, "grad_norm": 0.0804995745420456, "learning_rate": 3.0340860076610427e-05, "loss": 8.5887, "step": 48896, "throughput": 17836.7562656726 }, { "epoch": 0.7669025619323036, "grad_norm": 0.08541762083768845, "learning_rate": 3.0321668239331582e-05, "loss": 8.5996, "step": 48928, "throughput": 17837.069595176625 }, { "epoch": 0.7674041332612325, "grad_norm": 0.07710454612970352, "learning_rate": 3.030303181589207e-05, "loss": 8.5785, "step": 48960, "throughput": 17837.147894551657 }, { "epoch": 0.7679057045901614, "grad_norm": 0.07934178411960602, "learning_rate": 3.0284950883148598e-05, "loss": 8.5863, "step": 48992, "throughput": 17837.386582616426 }, { "epoch": 0.7684072759190903, "grad_norm": 0.0971083864569664, "learning_rate": 3.026742551566696e-05, "loss": 8.5905, "step": 49024, "throughput": 17837.461820329434 }, { "epoch": 0.7689088472480192, "grad_norm": 0.07783489674329758, "learning_rate": 3.0250455785721827e-05, "loss": 8.5961, "step": 49056, "throughput": 17837.42766907545 }, { "epoch": 0.7694104185769481, "grad_norm": 0.07902809977531433, "learning_rate": 3.023404176329643e-05, "loss": 8.6004, "step": 49088, "throughput": 17836.97276521561 }, { "epoch": 0.7699119899058771, "grad_norm": 0.07620695978403091, "learning_rate": 3.021818351608223e-05, "loss": 8.5928, "step": 49120, "throughput": 17836.862850153775 }, { "epoch": 0.7704135612348059, "grad_norm": 0.07504887133836746, "learning_rate": 3.0202881109478676e-05, "loss": 8.6057, "step": 49152, "throughput": 17836.916599803946 }, { "epoch": 0.7709151325637348, "grad_norm": 0.0752057358622551, "learning_rate": 3.0188134606592958e-05, "loss": 8.6044, "step": 49184, "throughput": 17836.313046066643 }, { "epoch": 0.7714167038926637, "grad_norm": 0.07765153795480728, "learning_rate": 3.017394406823969e-05, "loss": 8.5944, "step": 49216, "throughput": 17836.622213396313 }, { "epoch": 0.7719182752215926, "grad_norm": 0.08464544266462326, "learning_rate": 3.0160309552940704e-05, "loss": 8.62, "step": 49248, "throughput": 17836.858812627466 }, { "epoch": 0.7724198465505215, "grad_norm": 0.07419130206108093, "learning_rate": 3.014723111692476e-05, "loss": 8.5974, "step": 49280, "throughput": 17837.018146216167 }, { "epoch": 0.7729214178794505, "grad_norm": 0.08695698529481888, "learning_rate": 3.013470881412739e-05, "loss": 8.5758, "step": 49312, "throughput": 17837.094983627103 }, { "epoch": 0.7734229892083794, "grad_norm": 0.07773898541927338, "learning_rate": 3.0122742696190606e-05, "loss": 8.6104, "step": 49344, "throughput": 17837.158286582817 }, { "epoch": 0.7739245605373083, "grad_norm": 0.07133398205041885, "learning_rate": 3.0111332812462692e-05, "loss": 8.5956, "step": 49376, "throughput": 17836.952026592255 }, { "epoch": 0.7744261318662372, "grad_norm": 0.081082284450531, "learning_rate": 3.0100479209998055e-05, "loss": 8.5868, "step": 49408, "throughput": 17836.570468016125 }, { "epoch": 0.7749277031951661, "grad_norm": 0.09132739156484604, "learning_rate": 3.0090181933556994e-05, "loss": 8.6011, "step": 49440, "throughput": 17836.460989167073 }, { "epoch": 0.775429274524095, "grad_norm": 0.08725861459970474, "learning_rate": 3.0080441025605494e-05, "loss": 8.5824, "step": 49472, "throughput": 17836.59863661418 }, { "epoch": 0.775930845853024, "grad_norm": 0.07932783663272858, "learning_rate": 3.007125652631508e-05, "loss": 8.5797, "step": 49504, "throughput": 17836.909560824424 }, { "epoch": 0.7764324171819529, "grad_norm": 0.07793397456407547, "learning_rate": 3.006262847356269e-05, "loss": 8.586, "step": 49536, "throughput": 17837.2234703918 }, { "epoch": 0.7769339885108818, "grad_norm": 0.08320693671703339, "learning_rate": 3.0054556902930394e-05, "loss": 8.5985, "step": 49568, "throughput": 17837.373588519822 }, { "epoch": 0.7774355598398106, "grad_norm": 0.09126448631286621, "learning_rate": 3.0047041847705404e-05, "loss": 8.6029, "step": 49600, "throughput": 17837.602067526575 }, { "epoch": 0.7779371311687395, "grad_norm": 0.08450878411531448, "learning_rate": 3.0040083338879834e-05, "loss": 8.5736, "step": 49632, "throughput": 17837.58848053963 }, { "epoch": 0.7784387024976684, "grad_norm": 0.08500304073095322, "learning_rate": 3.0033681405150554e-05, "loss": 8.6081, "step": 49664, "throughput": 17837.466373705025 }, { "epoch": 0.7789402738265974, "grad_norm": 0.08071015775203705, "learning_rate": 3.0027836072919202e-05, "loss": 8.5804, "step": 49696, "throughput": 17837.098340151126 }, { "epoch": 0.7794418451555263, "grad_norm": 0.0792938694357872, "learning_rate": 3.002254736629194e-05, "loss": 8.6094, "step": 49728, "throughput": 17836.984875565544 }, { "epoch": 0.7799434164844552, "grad_norm": 0.08206379413604736, "learning_rate": 3.001781530707938e-05, "loss": 8.5797, "step": 49760, "throughput": 17836.954878495046 }, { "epoch": 0.7804449878133841, "grad_norm": 0.07426104694604874, "learning_rate": 3.0013639914796586e-05, "loss": 8.5966, "step": 49792, "throughput": 17837.265641517744 }, { "epoch": 0.780946559142313, "grad_norm": 0.07360176742076874, "learning_rate": 3.001002120666285e-05, "loss": 8.5894, "step": 49824, "throughput": 17837.56856457737 }, { "epoch": 0.781448130471242, "grad_norm": 0.0769825130701065, "learning_rate": 3.0006959197601765e-05, "loss": 8.6, "step": 49856, "throughput": 17837.795235918486 }, { "epoch": 0.7819497018001709, "grad_norm": 0.08089779317378998, "learning_rate": 3.000445390024106e-05, "loss": 8.5933, "step": 49888, "throughput": 17837.947136390696 }, { "epoch": 0.7824512731290998, "grad_norm": 0.07225056737661362, "learning_rate": 3.0002505324912582e-05, "loss": 8.5799, "step": 49920, "throughput": 17838.009145271175 }, { "epoch": 0.7829528444580287, "grad_norm": 0.08053544163703918, "learning_rate": 3.0001113479652246e-05, "loss": 8.5902, "step": 49952, "throughput": 17837.980418546384 }, { "epoch": 0.7834544157869576, "grad_norm": 0.07379666715860367, "learning_rate": 3.0000278370200057e-05, "loss": 8.5985, "step": 49984, "throughput": 17837.532480432266 }, { "epoch": 0.7839559871158865, "grad_norm": 0.08274323493242264, "learning_rate": 2.9999999999999997e-05, "loss": 8.5875, "step": 50016, "throughput": 17837.339637847817 }, { "epoch": 0.7839559871158865, "step": 50016, "throughput": 17836.525899553584, "total_flos": 8.182419259005201e+20, "train_loss": 1.2059035261548336, "train_runtime": 26049.049, "train_samples_per_second": 1966.152, "train_steps_per_second": 1.92 } ], "logging_steps": 32, "max_steps": 50016, "num_input_tokens_seen": 104891154432, "num_train_epochs": 1, "save_steps": 2048, "stateful_callbacks": { "LogCallback": { "elapsed_time": 183771.69379615784, "start_time": 1765630168.7857313 }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.182419259005201e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }