{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7839559871158865,
  "eval_steps": 500,
  "global_step": 50016,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005015713289289101,
      "grad_norm": 2.737861394882202,
      "learning_rate": 1.875e-05,
      "loss": 36.6669,
      "step": 32,
      "throughput": 4350.062995504972
    },
    {
      "epoch": 0.0010031426578578201,
      "grad_norm": 2.384377956390381,
      "learning_rate": 3.75e-05,
      "loss": 27.7836,
      "step": 64,
      "throughput": 6868.906038827848
    },
    {
      "epoch": 0.0015047139867867302,
      "grad_norm": 2.172982931137085,
      "learning_rate": 5.625e-05,
      "loss": 23.6446,
      "step": 96,
      "throughput": 8676.71146512722
    },
    {
      "epoch": 0.0020062853157156403,
      "grad_norm": 1.5305997133255005,
      "learning_rate": 7.5e-05,
      "loss": 21.199,
      "step": 128,
      "throughput": 9992.827972037452
    },
    {
      "epoch": 0.0025078566446445506,
      "grad_norm": 1.4041763544082642,
      "learning_rate": 9.374999999999999e-05,
      "loss": 19.6528,
      "step": 160,
      "throughput": 10982.420092196491
    },
    {
      "epoch": 0.0030094279735734604,
      "grad_norm": 1.2152384519577026,
      "learning_rate": 0.0001125,
      "loss": 18.5342,
      "step": 192,
      "throughput": 11769.216948894604
    },
    {
      "epoch": 0.0035109993025023707,
      "grad_norm": 0.9721791744232178,
      "learning_rate": 0.00013125,
      "loss": 17.5499,
      "step": 224,
      "throughput": 12404.199759486375
    },
    {
      "epoch": 0.0040125706314312806,
      "grad_norm": 0.8485270142555237,
      "learning_rate": 0.00015,
      "loss": 16.7178,
      "step": 256,
      "throughput": 12926.906808106636
    },
    {
      "epoch": 0.004514141960360191,
      "grad_norm": 0.879156231880188,
      "learning_rate": 0.00016874999999999998,
      "loss": 15.9278,
      "step": 288,
      "throughput": 13364.992559591625
    },
    {
      "epoch": 0.005015713289289101,
      "grad_norm": 0.7019696235656738,
      "learning_rate": 0.00018749999999999998,
      "loss": 15.2188,
      "step": 320,
      "throughput": 13738.019263643286
    },
    {
      "epoch": 0.005517284618218011,
      "grad_norm": 0.5537405014038086,
      "learning_rate": 0.00020624999999999997,
      "loss": 14.6855,
      "step": 352,
      "throughput": 13864.274977038647
    },
    {
      "epoch": 0.006018855947146921,
      "grad_norm": 0.5422670841217041,
      "learning_rate": 0.000225,
      "loss": 14.2397,
      "step": 384,
      "throughput": 14152.243480340532
    },
    {
      "epoch": 0.006520427276075831,
      "grad_norm": 0.5699282288551331,
      "learning_rate": 0.00024375,
      "loss": 13.9064,
      "step": 416,
      "throughput": 14404.992949736901
    },
    {
      "epoch": 0.007021998605004741,
      "grad_norm": 0.4603062868118286,
      "learning_rate": 0.0002625,
      "loss": 13.6128,
      "step": 448,
      "throughput": 14623.56986825748
    },
    {
      "epoch": 0.007523569933933652,
      "grad_norm": 0.4180799424648285,
      "learning_rate": 0.00028125,
      "loss": 13.3493,
      "step": 480,
      "throughput": 14824.190947537965
    },
    {
      "epoch": 0.008025141262862561,
      "grad_norm": 0.4132392108440399,
      "learning_rate": 0.0003,
      "loss": 13.1644,
      "step": 512,
      "throughput": 15004.16560878327
    },
    {
      "epoch": 0.008526712591791472,
      "grad_norm": 0.3716597557067871,
      "learning_rate": 0.00029999972162979993,
      "loss": 12.9831,
      "step": 544,
      "throughput": 15166.039125342944
    },
    {
      "epoch": 0.009028283920720382,
      "grad_norm": 0.31194815039634705,
      "learning_rate": 0.00029999888652034774,
      "loss": 12.7736,
      "step": 576,
      "throughput": 15312.97548988371
    },
    {
      "epoch": 0.009529855249649291,
      "grad_norm": 0.27463942766189575,
      "learning_rate": 0.00029999749467508744,
      "loss": 12.6429,
      "step": 608,
      "throughput": 15447.179519135016
    },
    {
      "epoch": 0.010031426578578202,
      "grad_norm": 0.35916781425476074,
      "learning_rate": 0.0002999955460997589,
      "loss": 12.4571,
      "step": 640,
      "throughput": 15515.372038285244
    },
    {
      "epoch": 0.010532997907507112,
      "grad_norm": 0.2248268574476242,
      "learning_rate": 0.0002999930408023982,
      "loss": 12.3474,
      "step": 672,
      "throughput": 15565.854800125042
    },
    {
      "epoch": 0.011034569236436023,
      "grad_norm": 0.2440202385187149,
      "learning_rate": 0.00029998997879333714,
      "loss": 12.2072,
      "step": 704,
      "throughput": 15673.428153679059
    },
    {
      "epoch": 0.011536140565364932,
      "grad_norm": 0.22323162853717804,
      "learning_rate": 0.0002999863600852034,
      "loss": 12.0949,
      "step": 736,
      "throughput": 15772.819065073185
    },
    {
      "epoch": 0.012037711894293842,
      "grad_norm": 0.21207238733768463,
      "learning_rate": 0.0002999821846929206,
      "loss": 12.0205,
      "step": 768,
      "throughput": 15861.48978900848
    },
    {
      "epoch": 0.012539283223222753,
      "grad_norm": 0.21534381806850433,
      "learning_rate": 0.000299977452633708,
      "loss": 11.9246,
      "step": 800,
      "throughput": 15947.834714894536
    },
    {
      "epoch": 0.013040854552151662,
      "grad_norm": 0.1891532838344574,
      "learning_rate": 0.00029997216392708075,
      "loss": 11.8366,
      "step": 832,
      "throughput": 16028.38745709301
    },
    {
      "epoch": 0.013542425881080573,
      "grad_norm": 0.26148706674575806,
      "learning_rate": 0.00029996631859484943,
      "loss": 11.7597,
      "step": 864,
      "throughput": 16103.565898562283
    },
    {
      "epoch": 0.014043997210009483,
      "grad_norm": 0.19636370241641998,
      "learning_rate": 0.00029995991666112014,
      "loss": 11.6889,
      "step": 896,
      "throughput": 16174.401229490417
    },
    {
      "epoch": 0.014545568538938392,
      "grad_norm": 0.3416622281074524,
      "learning_rate": 0.0002999529581522946,
      "loss": 11.622,
      "step": 928,
      "throughput": 16233.68864359473
    },
    {
      "epoch": 0.015047139867867303,
      "grad_norm": 0.18073995411396027,
      "learning_rate": 0.0002999454430970696,
      "loss": 11.5889,
      "step": 960,
      "throughput": 16210.364274523525
    },
    {
      "epoch": 0.015548711196796213,
      "grad_norm": 0.1570938527584076,
      "learning_rate": 0.0002999373715264373,
      "loss": 11.5215,
      "step": 992,
      "throughput": 16271.492628215825
    },
    {
      "epoch": 0.016050282525725122,
      "grad_norm": 0.20865876972675323,
      "learning_rate": 0.0002999287434736849,
      "loss": 11.4671,
      "step": 1024,
      "throughput": 16329.189967431246
    },
    {
      "epoch": 0.016551853854654033,
      "grad_norm": 0.15938478708267212,
      "learning_rate": 0.0002999195589743945,
      "loss": 11.4095,
      "step": 1056,
      "throughput": 16380.195823306925
    },
    {
      "epoch": 0.017053425183582945,
      "grad_norm": 0.16895660758018494,
      "learning_rate": 0.000299909818066443,
      "loss": 11.3584,
      "step": 1088,
      "throughput": 16431.633737801927
    },
    {
      "epoch": 0.017554996512511852,
      "grad_norm": 0.21328414976596832,
      "learning_rate": 0.00029989952079000195,
      "loss": 11.3362,
      "step": 1120,
      "throughput": 16480.600441523216
    },
    {
      "epoch": 0.018056567841440763,
      "grad_norm": 0.17861202359199524,
      "learning_rate": 0.0002998886671875373,
      "loss": 11.2748,
      "step": 1152,
      "throughput": 16527.308305360624
    },
    {
      "epoch": 0.018558139170369675,
      "grad_norm": 0.1585921049118042,
      "learning_rate": 0.0002998772573038094,
      "loss": 11.2309,
      "step": 1184,
      "throughput": 16571.695879414754
    },
    {
      "epoch": 0.019059710499298582,
      "grad_norm": 0.18928000330924988,
      "learning_rate": 0.0002998652911858726,
      "loss": 11.1846,
      "step": 1216,
      "throughput": 16613.799461460218
    },
    {
      "epoch": 0.019561281828227493,
      "grad_norm": 0.15984398126602173,
      "learning_rate": 0.00029985276888307524,
      "loss": 11.1471,
      "step": 1248,
      "throughput": 16609.892953842762
    },
    {
      "epoch": 0.020062853157156404,
      "grad_norm": 0.21660014986991882,
      "learning_rate": 0.00029983969044705927,
      "loss": 11.1291,
      "step": 1280,
      "throughput": 16622.300930347366
    },
    {
      "epoch": 0.020564424486085316,
      "grad_norm": 0.14777784049510956,
      "learning_rate": 0.0002998260559317603,
      "loss": 11.0892,
      "step": 1312,
      "throughput": 16660.521579844877
    },
    {
      "epoch": 0.021065995815014223,
      "grad_norm": 0.14789365231990814,
      "learning_rate": 0.00029981186539340703,
      "loss": 11.0426,
      "step": 1344,
      "throughput": 16697.111978019177
    },
    {
      "epoch": 0.021567567143943134,
      "grad_norm": 0.14859367907047272,
      "learning_rate": 0.0002997971188905213,
      "loss": 11.0245,
      "step": 1376,
      "throughput": 16729.518871853692
    },
    {
      "epoch": 0.022069138472872046,
      "grad_norm": 0.15714970231056213,
      "learning_rate": 0.0002997818164839178,
      "loss": 10.9909,
      "step": 1408,
      "throughput": 16762.959924093906
    },
    {
      "epoch": 0.022570709801800953,
      "grad_norm": 0.15366333723068237,
      "learning_rate": 0.00029976595823670354,
      "loss": 10.9599,
      "step": 1440,
      "throughput": 16795.25507278253
    },
    {
      "epoch": 0.023072281130729864,
      "grad_norm": 0.12857754528522491,
      "learning_rate": 0.0002997495442142781,
      "loss": 10.9525,
      "step": 1472,
      "throughput": 16826.31322241198
    },
    {
      "epoch": 0.023573852459658776,
      "grad_norm": 0.14877088367938995,
      "learning_rate": 0.000299732574484333,
      "loss": 10.9111,
      "step": 1504,
      "throughput": 16856.042827265162
    },
    {
      "epoch": 0.024075423788587683,
      "grad_norm": 0.13609924912452698,
      "learning_rate": 0.0002997150491168514,
      "loss": 10.8869,
      "step": 1536,
      "throughput": 16860.23517737338
    },
    {
      "epoch": 0.024576995117516594,
      "grad_norm": 0.13336826860904694,
      "learning_rate": 0.0002996969681841079,
      "loss": 10.8515,
      "step": 1568,
      "throughput": 16849.334342204478
    },
    {
      "epoch": 0.025078566446445506,
      "grad_norm": 0.1425756961107254,
      "learning_rate": 0.0002996783317606684,
      "loss": 10.8343,
      "step": 1600,
      "throughput": 16876.970940638224
    },
    {
      "epoch": 0.025580137775374417,
      "grad_norm": 0.12339065968990326,
      "learning_rate": 0.0002996591399233895,
      "loss": 10.8043,
      "step": 1632,
      "throughput": 16903.458674886315
    },
    {
      "epoch": 0.026081709104303324,
      "grad_norm": 0.1654420793056488,
      "learning_rate": 0.00029963939275141855,
      "loss": 10.7661,
      "step": 1664,
      "throughput": 16926.983914307886
    },
    {
      "epoch": 0.026583280433232236,
      "grad_norm": 0.1435118317604065,
      "learning_rate": 0.00029961909032619275,
      "loss": 10.7588,
      "step": 1696,
      "throughput": 16951.641196906367
    },
    {
      "epoch": 0.027084851762161147,
      "grad_norm": 0.13442976772785187,
      "learning_rate": 0.00029959823273143947,
      "loss": 10.7364,
      "step": 1728,
      "throughput": 16975.48019723633
    },
    {
      "epoch": 0.027586423091090054,
      "grad_norm": 0.13002510368824005,
      "learning_rate": 0.0002995768200531755,
      "loss": 10.7439,
      "step": 1760,
      "throughput": 16998.619261377895
    },
    {
      "epoch": 0.028087994420018966,
      "grad_norm": 0.13215093314647675,
      "learning_rate": 0.00029955485237970675,
      "loss": 10.7027,
      "step": 1792,
      "throughput": 17020.925154476437
    },
    {
      "epoch": 0.028589565748947877,
      "grad_norm": 0.11973442882299423,
      "learning_rate": 0.00029953232980162793,
      "loss": 10.6846,
      "step": 1824,
      "throughput": 17042.518070188875
    },
    {
      "epoch": 0.029091137077876784,
      "grad_norm": 0.1303834766149521,
      "learning_rate": 0.0002995092524118223,
      "loss": 10.6449,
      "step": 1856,
      "throughput": 17016.161749213952
    },
    {
      "epoch": 0.029592708406805696,
      "grad_norm": 0.12708888947963715,
      "learning_rate": 0.00029948562030546107,
      "loss": 10.6539,
      "step": 1888,
      "throughput": 17035.112974831827
    },
    {
      "epoch": 0.030094279735734607,
      "grad_norm": 0.12121839076280594,
      "learning_rate": 0.00029946143358000306,
      "loss": 10.6242,
      "step": 1920,
      "throughput": 17055.38715248678
    },
    {
      "epoch": 0.030595851064663518,
      "grad_norm": 0.11469029635190964,
      "learning_rate": 0.0002994366923351945,
      "loss": 10.6108,
      "step": 1952,
      "throughput": 17075.084240308945
    },
    {
      "epoch": 0.031097422393592426,
      "grad_norm": 0.12286818772554398,
      "learning_rate": 0.00029941139667306817,
      "loss": 10.5874,
      "step": 1984,
      "throughput": 17092.458306452812
    },
    {
      "epoch": 0.03159899372252133,
      "grad_norm": 0.1339081972837448,
      "learning_rate": 0.00029938554669794364,
      "loss": 10.5631,
      "step": 2016,
      "throughput": 17111.09462915212
    },
    {
      "epoch": 0.032100565051450244,
      "grad_norm": 0.11653994768857956,
      "learning_rate": 0.00029935914251642625,
      "loss": 10.5505,
      "step": 2048,
      "throughput": 17129.118733264822
    },
    {
      "epoch": 0.032602136380379156,
      "grad_norm": 0.11728750914335251,
      "learning_rate": 0.0002993321842374069,
      "loss": 10.5368,
      "step": 2080,
      "throughput": 17125.597353447316
    },
    {
      "epoch": 0.03310370770930807,
      "grad_norm": 0.14113257825374603,
      "learning_rate": 0.00029930467197206156,
      "loss": 10.507,
      "step": 2112,
      "throughput": 17142.736351591233
    },
    {
      "epoch": 0.03360527903823698,
      "grad_norm": 0.1468958556652069,
      "learning_rate": 0.000299276605833851,
      "loss": 10.4866,
      "step": 2144,
      "throughput": 17138.43663722723
    },
    {
      "epoch": 0.03410685036716589,
      "grad_norm": 0.11644583195447922,
      "learning_rate": 0.00029924798593851994,
      "loss": 10.4893,
      "step": 2176,
      "throughput": 17133.26269190115
    },
    {
      "epoch": 0.0346084216960948,
      "grad_norm": 0.13621202111244202,
      "learning_rate": 0.00029921881240409703,
      "loss": 10.4784,
      "step": 2208,
      "throughput": 17149.698906784317
    },
    {
      "epoch": 0.035109993025023704,
      "grad_norm": 0.13037540018558502,
      "learning_rate": 0.00029918908535089394,
      "loss": 10.4713,
      "step": 2240,
      "throughput": 17165.729847041897
    },
    {
      "epoch": 0.035611564353952616,
      "grad_norm": 0.10882680118083954,
      "learning_rate": 0.00029915880490150515,
      "loss": 10.4456,
      "step": 2272,
      "throughput": 17179.659850846612
    },
    {
      "epoch": 0.03611313568288153,
      "grad_norm": 0.1198781281709671,
      "learning_rate": 0.0002991279711808072,
      "loss": 10.4441,
      "step": 2304,
      "throughput": 17194.730227287117
    },
    {
      "epoch": 0.03661470701181044,
      "grad_norm": 0.123746857047081,
      "learning_rate": 0.0002990965843159587,
      "loss": 10.4084,
      "step": 2336,
      "throughput": 17209.49343914183
    },
    {
      "epoch": 0.03711627834073935,
      "grad_norm": 0.11193118989467621,
      "learning_rate": 0.000299064644436399,
      "loss": 10.4024,
      "step": 2368,
      "throughput": 17223.8604387402
    },
    {
      "epoch": 0.03761784966966826,
      "grad_norm": 0.12986549735069275,
      "learning_rate": 0.0002990321516738482,
      "loss": 10.3658,
      "step": 2400,
      "throughput": 17234.548263588465
    },
    {
      "epoch": 0.038119420998597164,
      "grad_norm": 0.1199018731713295,
      "learning_rate": 0.00029899910616230674,
      "loss": 10.3805,
      "step": 2432,
      "throughput": 17245.132224434677
    },
    {
      "epoch": 0.038620992327526076,
      "grad_norm": 0.11385921388864517,
      "learning_rate": 0.0002989655080380543,
      "loss": 10.3796,
      "step": 2464,
      "throughput": 17225.296922944664
    },
    {
      "epoch": 0.03912256365645499,
      "grad_norm": 0.11526582390069962,
      "learning_rate": 0.0002989313574396496,
      "loss": 10.3479,
      "step": 2496,
      "throughput": 17236.683727834174
    },
    {
      "epoch": 0.0396241349853839,
      "grad_norm": 0.12292210012674332,
      "learning_rate": 0.00029889665450792983,
      "loss": 10.3344,
      "step": 2528,
      "throughput": 17249.837851508317
    },
    {
      "epoch": 0.04012570631431281,
      "grad_norm": 0.11053690314292908,
      "learning_rate": 0.0002988613993860101,
      "loss": 10.327,
      "step": 2560,
      "throughput": 17262.691844135206
    },
    {
      "epoch": 0.04062727764324172,
      "grad_norm": 0.12507599592208862,
      "learning_rate": 0.0002988255922192825,
      "loss": 10.318,
      "step": 2592,
      "throughput": 17273.835852105618
    },
    {
      "epoch": 0.04112884897217063,
      "grad_norm": 0.14252890646457672,
      "learning_rate": 0.000298789233155416,
      "loss": 10.305,
      "step": 2624,
      "throughput": 17286.01479157816
    },
    {
      "epoch": 0.041630420301099536,
      "grad_norm": 0.12239658832550049,
      "learning_rate": 0.0002987523223443554,
      "loss": 10.2977,
      "step": 2656,
      "throughput": 17298.06003124777
    },
    {
      "epoch": 0.04213199163002845,
      "grad_norm": 0.11500866711139679,
      "learning_rate": 0.000298714859938321,
      "loss": 10.2824,
      "step": 2688,
      "throughput": 17306.5386805577
    },
    {
      "epoch": 0.04263356295895736,
      "grad_norm": 0.13773761689662933,
      "learning_rate": 0.0002986768460918079,
      "loss": 10.269,
      "step": 2720,
      "throughput": 17318.022733030513
    },
    {
      "epoch": 0.04313513428788627,
      "grad_norm": 0.11034831404685974,
      "learning_rate": 0.0002986382809615853,
      "loss": 10.2613,
      "step": 2752,
      "throughput": 17310.182294333277
    },
    {
      "epoch": 0.04363670561681518,
      "grad_norm": 0.12043263763189316,
      "learning_rate": 0.00029859916470669596,
      "loss": 10.2641,
      "step": 2784,
      "throughput": 17305.36764489267
    },
    {
      "epoch": 0.04413827694574409,
      "grad_norm": 0.12668395042419434,
      "learning_rate": 0.0002985594974884554,
      "loss": 10.2143,
      "step": 2816,
      "throughput": 17316.46325404571
    },
    {
      "epoch": 0.044639848274673,
      "grad_norm": 0.12551531195640564,
      "learning_rate": 0.00029851927947045136,
      "loss": 10.2288,
      "step": 2848,
      "throughput": 17327.313985046396
    },
    {
      "epoch": 0.04514141960360191,
      "grad_norm": 0.1118423193693161,
      "learning_rate": 0.000298478510818543,
      "loss": 10.2037,
      "step": 2880,
      "throughput": 17336.701168332987
    },
    {
      "epoch": 0.04564299093253082,
      "grad_norm": 0.11913536489009857,
      "learning_rate": 0.0002984371917008604,
      "loss": 10.1908,
      "step": 2912,
      "throughput": 17347.173504797236
    },
    {
      "epoch": 0.04614456226145973,
      "grad_norm": 0.11208291351795197,
      "learning_rate": 0.0002983953222878037,
      "loss": 10.2091,
      "step": 2944,
      "throughput": 17357.365218601015
    },
    {
      "epoch": 0.04664613359038864,
      "grad_norm": 0.13201284408569336,
      "learning_rate": 0.0002983529027520426,
      "loss": 10.1697,
      "step": 2976,
      "throughput": 17367.380788820446
    },
    {
      "epoch": 0.04714770491931755,
      "grad_norm": 0.10959289968013763,
      "learning_rate": 0.0002983099332685153,
      "loss": 10.1807,
      "step": 3008,
      "throughput": 17374.438712058905
    },
    {
      "epoch": 0.04764927624824646,
      "grad_norm": 0.12129059433937073,
      "learning_rate": 0.000298266414014428,
      "loss": 10.1717,
      "step": 3040,
      "throughput": 17374.21131079343
    },
    {
      "epoch": 0.04815084757717537,
      "grad_norm": 0.11781725287437439,
      "learning_rate": 0.0002982223451692544,
      "loss": 10.1645,
      "step": 3072,
      "throughput": 17362.983274354043
    },
    {
      "epoch": 0.04865241890610428,
      "grad_norm": 0.11948370188474655,
      "learning_rate": 0.0002981777269147344,
      "loss": 10.1535,
      "step": 3104,
      "throughput": 17372.49721677286
    },
    {
      "epoch": 0.04915399023503319,
      "grad_norm": 0.11986897140741348,
      "learning_rate": 0.0002981325594348739,
      "loss": 10.1651,
      "step": 3136,
      "throughput": 17381.819105870687
    },
    {
      "epoch": 0.0496555615639621,
      "grad_norm": 0.1096222773194313,
      "learning_rate": 0.00029808684291594373,
      "loss": 10.1223,
      "step": 3168,
      "throughput": 17391.02283950976
    },
    {
      "epoch": 0.05015713289289101,
      "grad_norm": 0.11586567759513855,
      "learning_rate": 0.0002980405775464789,
      "loss": 10.1181,
      "step": 3200,
      "throughput": 17398.88477420086
    },
    {
      "epoch": 0.05065870422181992,
      "grad_norm": 0.11325585842132568,
      "learning_rate": 0.00029799376351727797,
      "loss": 10.1027,
      "step": 3232,
      "throughput": 17407.772020075172
    },
    {
      "epoch": 0.051160275550748834,
      "grad_norm": 0.11115550249814987,
      "learning_rate": 0.00029794640102140206,
      "loss": 10.0934,
      "step": 3264,
      "throughput": 17416.48282303885
    },
    {
      "epoch": 0.05166184687967774,
      "grad_norm": 0.1690579205751419,
      "learning_rate": 0.00029789849025417433,
      "loss": 10.0908,
      "step": 3296,
      "throughput": 17422.67145275401
    },
    {
      "epoch": 0.05216341820860665,
      "grad_norm": 0.1091977134346962,
      "learning_rate": 0.0002978500314131789,
      "loss": 10.1244,
      "step": 3328,
      "throughput": 17431.123336344346
    },
    {
      "epoch": 0.05266498953753556,
      "grad_norm": 0.12021326273679733,
      "learning_rate": 0.00029780102469826014,
      "loss": 10.0776,
      "step": 3360,
      "throughput": 17418.5921345674
    },
    {
      "epoch": 0.05316656086646447,
      "grad_norm": 0.11450007557868958,
      "learning_rate": 0.00029775147031152195,
      "loss": 10.0661,
      "step": 3392,
      "throughput": 17419.855544322516
    },
    {
      "epoch": 0.05366813219539338,
      "grad_norm": 0.11600250005722046,
      "learning_rate": 0.0002977013684573267,
      "loss": 10.068,
      "step": 3424,
      "throughput": 17428.08526860908
    },
    {
      "epoch": 0.054169703524322294,
      "grad_norm": 0.1143857091665268,
      "learning_rate": 0.0002976507193422946,
      "loss": 10.0618,
      "step": 3456,
      "throughput": 17436.174986582257
    },
    {
      "epoch": 0.0546712748532512,
      "grad_norm": 0.11679524928331375,
      "learning_rate": 0.00029759952317530284,
      "loss": 10.0712,
      "step": 3488,
      "throughput": 17443.11412749548
    },
    {
      "epoch": 0.05517284618218011,
      "grad_norm": 0.10890108346939087,
      "learning_rate": 0.0002975477801674845,
      "loss": 10.0198,
      "step": 3520,
      "throughput": 17449.86598475358
    },
    {
      "epoch": 0.05567441751110902,
      "grad_norm": 0.12132527679204941,
      "learning_rate": 0.00029749549053222784,
      "loss": 10.0404,
      "step": 3552,
      "throughput": 17457.54716352314
    },
    {
      "epoch": 0.05617598884003793,
      "grad_norm": 0.10533833503723145,
      "learning_rate": 0.0002974426544851755,
      "loss": 10.0162,
      "step": 3584,
      "throughput": 17461.716367238747
    },
    {
      "epoch": 0.05667756016896684,
      "grad_norm": 0.1043761745095253,
      "learning_rate": 0.00029738927224422354,
      "loss": 10.019,
      "step": 3616,
      "throughput": 17469.163920337327
    },
    {
      "epoch": 0.057179131497895753,
      "grad_norm": 0.12880338728427887,
      "learning_rate": 0.0002973353440295205,
      "loss": 9.9935,
      "step": 3648,
      "throughput": 17467.133660286712
    },
    {
      "epoch": 0.057680702826824665,
      "grad_norm": 0.11385416239500046,
      "learning_rate": 0.0002972808700634664,
      "loss": 9.9976,
      "step": 3680,
      "throughput": 17457.92126652086
    },
    {
      "epoch": 0.05818227415575357,
      "grad_norm": 0.10410229116678238,
      "learning_rate": 0.0002972258505707121,
      "loss": 9.9902,
      "step": 3712,
      "throughput": 17465.242572055144
    },
    {
      "epoch": 0.05868384548468248,
      "grad_norm": 0.11616440117359161,
      "learning_rate": 0.00029717028577815817,
      "loss": 9.978,
      "step": 3744,
      "throughput": 17472.43441514092
    },
    {
      "epoch": 0.05918541681361139,
      "grad_norm": 0.10885748267173767,
      "learning_rate": 0.0002971141759149539,
      "loss": 9.9977,
      "step": 3776,
      "throughput": 17479.47152805395
    },
    {
      "epoch": 0.0596869881425403,
      "grad_norm": 0.09921612590551376,
      "learning_rate": 0.00029705752121249665,
      "loss": 9.9735,
      "step": 3808,
      "throughput": 17485.442570293268
    },
    {
      "epoch": 0.060188559471469213,
      "grad_norm": 0.10691357403993607,
      "learning_rate": 0.0002970003219044305,
      "loss": 9.96,
      "step": 3840,
      "throughput": 17491.197596983668
    },
    {
      "epoch": 0.060690130800398125,
      "grad_norm": 0.11703182011842728,
      "learning_rate": 0.0002969425782266455,
      "loss": 9.9753,
      "step": 3872,
      "throughput": 17496.026084267538
    },
    {
      "epoch": 0.061191702129327036,
      "grad_norm": 0.11055783927440643,
      "learning_rate": 0.0002968842904172769,
      "loss": 9.9648,
      "step": 3904,
      "throughput": 17500.58649689582
    },
    {
      "epoch": 0.06169327345825594,
      "grad_norm": 0.10314410924911499,
      "learning_rate": 0.00029682545871670375,
      "loss": 9.9586,
      "step": 3936,
      "throughput": 17504.020001952344
    },
    {
      "epoch": 0.06219484478718485,
      "grad_norm": 0.10210820287466049,
      "learning_rate": 0.0002967660833675481,
      "loss": 9.9413,
      "step": 3968,
      "throughput": 17493.79839046395
    },
    {
      "epoch": 0.06269641611611376,
      "grad_norm": 0.11651694774627686,
      "learning_rate": 0.0002967061646146741,
      "loss": 9.9297,
      "step": 4000,
      "throughput": 17497.19311474326
    },
    {
      "epoch": 0.06319798744504267,
      "grad_norm": 0.11037289351224899,
      "learning_rate": 0.00029664570270518685,
      "loss": 9.9138,
      "step": 4032,
      "throughput": 17503.607958066386
    },
    {
      "epoch": 0.06369955877397158,
      "grad_norm": 0.1176404356956482,
      "learning_rate": 0.00029658469788843147,
      "loss": 9.9193,
      "step": 4064,
      "throughput": 17509.946216930257
    },
    {
      "epoch": 0.06420113010290049,
      "grad_norm": 0.10412602126598358,
      "learning_rate": 0.00029652315041599203,
      "loss": 9.908,
      "step": 4096,
      "throughput": 17515.229704243564
    },
    {
      "epoch": 0.0647027014318294,
      "grad_norm": 0.10687946528196335,
      "learning_rate": 0.00029646106054169046,
      "loss": 9.9116,
      "step": 4128,
      "throughput": 17510.8929672248
    },
    {
      "epoch": 0.06520427276075831,
      "grad_norm": 0.1051303818821907,
      "learning_rate": 0.00029639842852158553,
      "loss": 9.9078,
      "step": 4160,
      "throughput": 17516.986313956397
    },
    {
      "epoch": 0.06570584408968723,
      "grad_norm": 0.09516575187444687,
      "learning_rate": 0.00029633525461397194,
      "loss": 9.9009,
      "step": 4192,
      "throughput": 17518.336116573944
    },
    {
      "epoch": 0.06620741541861613,
      "grad_norm": 0.10216325521469116,
      "learning_rate": 0.00029627153907937903,
      "loss": 9.9047,
      "step": 4224,
      "throughput": 17524.402411826053
    },
    {
      "epoch": 0.06670898674754504,
      "grad_norm": 0.09496073424816132,
      "learning_rate": 0.0002962072821805699,
      "loss": 9.8684,
      "step": 4256,
      "throughput": 17516.880157408814
    },
    {
      "epoch": 0.06721055807647396,
      "grad_norm": 0.09801316261291504,
      "learning_rate": 0.0002961424841825402,
      "loss": 9.8765,
      "step": 4288,
      "throughput": 17514.939707148635
    },
    {
      "epoch": 0.06771212940540286,
      "grad_norm": 0.10971728712320328,
      "learning_rate": 0.00029607714535251703,
      "loss": 9.8709,
      "step": 4320,
      "throughput": 17520.7930113957
    },
    {
      "epoch": 0.06821370073433178,
      "grad_norm": 0.10133639723062515,
      "learning_rate": 0.00029601126595995794,
      "loss": 9.8745,
      "step": 4352,
      "throughput": 17526.298259170235
    },
    {
      "epoch": 0.06871527206326068,
      "grad_norm": 0.11306998878717422,
      "learning_rate": 0.0002959448462765497,
      "loss": 9.8557,
      "step": 4384,
      "throughput": 17532.004624718895
    },
    {
      "epoch": 0.0692168433921896,
      "grad_norm": 0.09759490936994553,
      "learning_rate": 0.0002958778865762072,
      "loss": 9.8712,
      "step": 4416,
      "throughput": 17537.640257356656
    },
    {
      "epoch": 0.0697184147211185,
      "grad_norm": 0.10575896501541138,
      "learning_rate": 0.0002958103871350727,
      "loss": 9.8513,
      "step": 4448,
      "throughput": 17542.25457088177
    },
    {
      "epoch": 0.07021998605004741,
      "grad_norm": 0.10681546479463577,
      "learning_rate": 0.0002957423482315139,
      "loss": 9.8529,
      "step": 4480,
      "throughput": 17545.144703490852
    },
    {
      "epoch": 0.07072155737897633,
      "grad_norm": 0.1152559444308281,
      "learning_rate": 0.0002956737701461235,
      "loss": 9.8385,
      "step": 4512,
      "throughput": 17548.74305341976
    },
    {
      "epoch": 0.07122312870790523,
      "grad_norm": 0.10631895065307617,
      "learning_rate": 0.00029560465316171773,
      "loss": 9.8269,
      "step": 4544,
      "throughput": 17550.01474642025
    },
    {
      "epoch": 0.07172470003683415,
      "grad_norm": 0.10484053939580917,
      "learning_rate": 0.0002955349975633352,
      "loss": 9.8415,
      "step": 4576,
      "throughput": 17542.430141402656
    },
    {
      "epoch": 0.07222627136576305,
      "grad_norm": 0.12255118787288666,
      "learning_rate": 0.00029546480363823577,
      "loss": 9.837,
      "step": 4608,
      "throughput": 17545.271449204676
    },
    {
      "epoch": 0.07272784269469197,
      "grad_norm": 0.09655202925205231,
      "learning_rate": 0.0002953940716758995,
      "loss": 9.8122,
      "step": 4640,
      "throughput": 17550.573230292368
    },
    {
      "epoch": 0.07322941402362088,
      "grad_norm": 0.0936635285615921,
      "learning_rate": 0.0002953228019680252,
      "loss": 9.8208,
      "step": 4672,
      "throughput": 17555.763552451823
    },
    {
      "epoch": 0.07373098535254978,
      "grad_norm": 0.10081731528043747,
      "learning_rate": 0.0002952509948085293,
      "loss": 9.7989,
      "step": 4704,
      "throughput": 17560.945999053136
    },
    {
      "epoch": 0.0742325566814787,
      "grad_norm": 0.11177249252796173,
      "learning_rate": 0.00029517865049354477,
      "loss": 9.8218,
      "step": 4736,
      "throughput": 17564.308658957896
    },
    {
      "epoch": 0.0747341280104076,
      "grad_norm": 0.10516203194856644,
      "learning_rate": 0.0002951057693214197,
      "loss": 9.7971,
      "step": 4768,
      "throughput": 17568.519454363795
    },
    {
      "epoch": 0.07523569933933652,
      "grad_norm": 0.09306059777736664,
      "learning_rate": 0.0002950323515927164,
      "loss": 9.7782,
      "step": 4800,
      "throughput": 17570.196634560925
    },
    {
      "epoch": 0.07573727066826542,
      "grad_norm": 0.10436815023422241,
      "learning_rate": 0.0002949583976102097,
      "loss": 9.7929,
      "step": 4832,
      "throughput": 17574.36304626455
    },
    {
      "epoch": 0.07623884199719433,
      "grad_norm": 0.10673332214355469,
      "learning_rate": 0.00029488390767888606,
      "loss": 9.7824,
      "step": 4864,
      "throughput": 17568.283545826413
    },
    {
      "epoch": 0.07674041332612325,
      "grad_norm": 0.10187188535928726,
      "learning_rate": 0.0002948088821059422,
      "loss": 9.7773,
      "step": 4896,
      "throughput": 17567.223444270698
    },
    {
      "epoch": 0.07724198465505215,
      "grad_norm": 0.09946262836456299,
      "learning_rate": 0.0002947333212007838,
      "loss": 9.7803,
      "step": 4928,
      "throughput": 17571.22188826861
    },
    {
      "epoch": 0.07774355598398107,
      "grad_norm": 0.11062668263912201,
      "learning_rate": 0.0002946572252750242,
      "loss": 9.7851,
      "step": 4960,
      "throughput": 17575.99999033726
    },
    {
      "epoch": 0.07824512731290997,
      "grad_norm": 0.1022370308637619,
      "learning_rate": 0.0002945805946424834,
      "loss": 9.7647,
      "step": 4992,
      "throughput": 17580.71820594211
    },
    {
      "epoch": 0.07874669864183889,
      "grad_norm": 0.10585004836320877,
      "learning_rate": 0.0002945034296191861,
      "loss": 9.7739,
      "step": 5024,
      "throughput": 17585.3697487813
    },
    {
      "epoch": 0.0792482699707678,
      "grad_norm": 0.10841728746891022,
      "learning_rate": 0.00029442573052336127,
      "loss": 9.7694,
      "step": 5056,
      "throughput": 17588.24348090029
    },
    {
      "epoch": 0.0797498412996967,
      "grad_norm": 0.10807620733976364,
      "learning_rate": 0.0002943474976754401,
      "loss": 9.726,
      "step": 5088,
      "throughput": 17590.41015953132
    },
    {
      "epoch": 0.08025141262862562,
      "grad_norm": 0.10132047533988953,
      "learning_rate": 0.0002942687313980552,
      "loss": 9.7531,
      "step": 5120,
      "throughput": 17594.225674820857
    },
    {
      "epoch": 0.08075298395755452,
      "grad_norm": 0.10171724855899811,
      "learning_rate": 0.0002941894320160389,
      "loss": 9.7544,
      "step": 5152,
      "throughput": 17592.732609094946
    },
    {
      "epoch": 0.08125455528648344,
      "grad_norm": 0.1025429293513298,
      "learning_rate": 0.00029410959985642205,
      "loss": 9.7367,
      "step": 5184,
      "throughput": 17588.42501942629
    },
    {
      "epoch": 0.08175612661541234,
      "grad_norm": 0.10521358251571655,
      "learning_rate": 0.0002940292352484327,
      "loss": 9.7222,
      "step": 5216,
      "throughput": 17591.457901596536
    },
    {
      "epoch": 0.08225769794434126,
      "grad_norm": 0.10784083604812622,
      "learning_rate": 0.0002939483385234948,
      "loss": 9.7218,
      "step": 5248,
      "throughput": 17595.868519367257
    },
    {
      "epoch": 0.08275926927327017,
      "grad_norm": 0.10504278540611267,
      "learning_rate": 0.0002938669100152266,
      "loss": 9.7445,
      "step": 5280,
      "throughput": 17600.25831414381
    },
    {
      "epoch": 0.08326084060219907,
      "grad_norm": 0.09491508454084396,
      "learning_rate": 0.00029378495005943954,
      "loss": 9.7135,
      "step": 5312,
      "throughput": 17603.869297250032
    },
    {
      "epoch": 0.08376241193112799,
      "grad_norm": 0.11275453120470047,
      "learning_rate": 0.00029370245899413677,
      "loss": 9.7141,
      "step": 5344,
      "throughput": 17606.548134823293
    },
    {
      "epoch": 0.0842639832600569,
      "grad_norm": 0.10523293912410736,
      "learning_rate": 0.0002936194371595116,
      "loss": 9.7171,
      "step": 5376,
      "throughput": 17609.9973105787
    },
    {
      "epoch": 0.08476555458898581,
      "grad_norm": 0.10381247848272324,
      "learning_rate": 0.00029353588489794636,
      "loss": 9.707,
      "step": 5408,
      "throughput": 17611.932878916054
    },
    {
      "epoch": 0.08526712591791472,
      "grad_norm": 0.08856762945652008,
      "learning_rate": 0.0002934518025540109,
      "loss": 9.7049,
      "step": 5440,
      "throughput": 17614.62864714934
    },
    {
      "epoch": 0.08576869724684363,
      "grad_norm": 0.09817332029342651,
      "learning_rate": 0.00029336719047446096,
      "loss": 9.7237,
      "step": 5472,
      "throughput": 17606.804804142805
    },
    {
      "epoch": 0.08627026857577254,
      "grad_norm": 0.10031472146511078,
      "learning_rate": 0.000293282049008237,
      "loss": 9.695,
      "step": 5504,
      "throughput": 17608.69165022491
    },
    {
      "epoch": 0.08677183990470144,
      "grad_norm": 0.1021127700805664,
      "learning_rate": 0.00029319637850646273,
      "loss": 9.6985,
      "step": 5536,
      "throughput": 17612.11712008491
    },
    {
      "epoch": 0.08727341123363036,
      "grad_norm": 0.10871855914592743,
      "learning_rate": 0.0002931101793224435,
      "loss": 9.7016,
      "step": 5568,
      "throughput": 17616.15297625078
    },
    {
      "epoch": 0.08777498256255926,
      "grad_norm": 0.09045758843421936,
      "learning_rate": 0.0002930234518116651,
      "loss": 9.7002,
      "step": 5600,
      "throughput": 17619.38128133957
    },
    {
      "epoch": 0.08827655389148818,
      "grad_norm": 0.091500423848629,
      "learning_rate": 0.000292936196331792,
      "loss": 9.6671,
      "step": 5632,
      "throughput": 17622.575227982983
    },
    {
      "epoch": 0.08877812522041709,
      "grad_norm": 0.1074434295296669,
      "learning_rate": 0.000292848413242666,
      "loss": 9.6926,
      "step": 5664,
      "throughput": 17626.52213908722
    },
    {
      "epoch": 0.089279696549346,
      "grad_norm": 0.09811190515756607,
      "learning_rate": 0.0002927601029063049,
      "loss": 9.6708,
      "step": 5696,
      "throughput": 17628.319110769124
    },
    {
      "epoch": 0.08978126787827491,
      "grad_norm": 0.1028069257736206,
      "learning_rate": 0.0002926712656869007,
      "loss": 9.6649,
      "step": 5728,
      "throughput": 17632.157170322804
    },
    {
      "epoch": 0.09028283920720381,
      "grad_norm": 0.09495889395475388,
      "learning_rate": 0.0002925819019508184,
      "loss": 9.6708,
      "step": 5760,
      "throughput": 17627.799875140157
    },
    {
      "epoch": 0.09078441053613273,
      "grad_norm": 0.08912132680416107,
      "learning_rate": 0.0002924920120665943,
      "loss": 9.6776,
      "step": 5792,
      "throughput": 17626.529906001404
    },
    {
      "epoch": 0.09128598186506164,
      "grad_norm": 0.10304176807403564,
      "learning_rate": 0.00029240159640493463,
      "loss": 9.6722,
      "step": 5824,
      "throughput": 17629.08179239365
    },
    {
      "epoch": 0.09178755319399055,
      "grad_norm": 0.09453807771205902,
      "learning_rate": 0.00029231065533871374,
      "loss": 9.661,
      "step": 5856,
      "throughput": 17632.87142952827
    },
    {
      "epoch": 0.09228912452291946,
      "grad_norm": 0.09665724635124207,
      "learning_rate": 0.0002922191892429729,
      "loss": 9.6408,
      "step": 5888,
      "throughput": 17635.962107195053
    },
    {
      "epoch": 0.09279069585184836,
      "grad_norm": 0.10424266010522842,
      "learning_rate": 0.0002921271984949185,
      "loss": 9.6516,
      "step": 5920,
      "throughput": 17638.31696854064
    },
    {
      "epoch": 0.09329226718077728,
      "grad_norm": 0.1087077185511589,
      "learning_rate": 0.0002920346834739208,
      "loss": 9.6378,
      "step": 5952,
      "throughput": 17641.284236489937
    },
    {
      "epoch": 0.09379383850970618,
      "grad_norm": 0.09325496107339859,
      "learning_rate": 0.0002919416445615119,
      "loss": 9.641,
      "step": 5984,
      "throughput": 17643.60922814501
    },
    {
      "epoch": 0.0942954098386351,
      "grad_norm": 0.08783965557813644,
      "learning_rate": 0.0002918480821413846,
      "loss": 9.6218,
      "step": 6016,
      "throughput": 17645.8784832565
    },
    {
      "epoch": 0.094796981167564,
      "grad_norm": 0.09440628439188004,
      "learning_rate": 0.0002917539965993906,
      "loss": 9.624,
      "step": 6048,
      "throughput": 17646.755073945893
    },
    {
      "epoch": 0.09529855249649292,
      "grad_norm": 0.09520844370126724,
      "learning_rate": 0.00029165938832353885,
      "loss": 9.6299,
      "step": 6080,
      "throughput": 17641.485981629125
    },
    {
      "epoch": 0.09580012382542183,
      "grad_norm": 0.09581635892391205,
      "learning_rate": 0.00029156425770399434,
      "loss": 9.6167,
      "step": 6112,
      "throughput": 17643.078829361555
    },
    {
      "epoch": 0.09630169515435073,
      "grad_norm": 0.10582980513572693,
      "learning_rate": 0.0002914686051330759,
      "loss": 9.6188,
      "step": 6144,
      "throughput": 17645.96076286583
    },
    {
      "epoch": 0.09680326648327965,
      "grad_norm": 0.10588561743497849,
      "learning_rate": 0.00029137243100525506,
      "loss": 9.6429,
      "step": 6176,
      "throughput": 17642.181888559946
    },
    {
      "epoch": 0.09730483781220856,
      "grad_norm": 0.1067621037364006,
      "learning_rate": 0.00029127573571715416,
      "loss": 9.613,
      "step": 6208,
      "throughput": 17645.000134548154
    },
    {
      "epoch": 0.09780640914113747,
      "grad_norm": 0.09926958382129669,
      "learning_rate": 0.00029117851966754495,
      "loss": 9.6089,
      "step": 6240,
      "throughput": 17647.194567637944
    },
    {
      "epoch": 0.09830798047006638,
      "grad_norm": 0.09142431616783142,
      "learning_rate": 0.00029108078325734666,
      "loss": 9.6167,
      "step": 6272,
      "throughput": 17650.62803497812
    },
    {
      "epoch": 0.0988095517989953,
      "grad_norm": 0.10240490734577179,
      "learning_rate": 0.0002909825268896245,
      "loss": 9.6053,
      "step": 6304,
      "throughput": 17651.4597674561
    },
    {
      "epoch": 0.0993111231279242,
      "grad_norm": 0.09140589088201523,
      "learning_rate": 0.000290883750969588,
      "loss": 9.5989,
      "step": 6336,
      "throughput": 17654.249897680278
    },
    {
      "epoch": 0.0998126944568531,
      "grad_norm": 0.1017196998000145,
      "learning_rate": 0.00029078445590458946,
      "loss": 9.5878,
      "step": 6368,
      "throughput": 17649.12258178832
    },
    {
      "epoch": 0.10031426578578202,
      "grad_norm": 0.0923992320895195,
      "learning_rate": 0.0002906846421041219,
      "loss": 9.625,
      "step": 6400,
      "throughput": 17648.517260039112
    },
    {
      "epoch": 0.10081583711471093,
      "grad_norm": 0.09159571677446365,
      "learning_rate": 0.00029058430997981784,
      "loss": 9.571,
      "step": 6432,
      "throughput": 17651.87687688665
    },
    {
      "epoch": 0.10131740844363984,
      "grad_norm": 0.09523748606443405,
      "learning_rate": 0.0002904834599454472,
      "loss": 9.576,
      "step": 6464,
      "throughput": 17655.226269402574
    },
    {
      "epoch": 0.10181897977256875,
      "grad_norm": 0.09053566306829453,
      "learning_rate": 0.00029038209241691575,
      "loss": 9.6027,
      "step": 6496,
      "throughput": 17657.85699659635
    },
    {
      "epoch": 0.10232055110149767,
      "grad_norm": 0.10799683630466461,
      "learning_rate": 0.0002902802078122636,
      "loss": 9.558,
      "step": 6528,
      "throughput": 17659.877539796056
    },
    {
      "epoch": 0.10282212243042657,
      "grad_norm": 0.09402478486299515,
      "learning_rate": 0.00029017780655166315,
      "loss": 9.5786,
      "step": 6560,
      "throughput": 17662.439138601498
    },
    {
      "epoch": 0.10332369375935548,
      "grad_norm": 0.10318920761346817,
      "learning_rate": 0.0002900748890574175,
      "loss": 9.5855,
      "step": 6592,
      "throughput": 17664.441358775166
    },
    {
      "epoch": 0.1038252650882844,
      "grad_norm": 0.09061913937330246,
      "learning_rate": 0.0002899714557539586,
      "loss": 9.5826,
      "step": 6624,
      "throughput": 17666.35326495104
    },
    {
      "epoch": 0.1043268364172133,
      "grad_norm": 0.11016388982534409,
      "learning_rate": 0.00028986750706784574,
      "loss": 9.5847,
      "step": 6656,
      "throughput": 17663.995249747324
    },
    {
      "epoch": 0.10482840774614222,
      "grad_norm": 0.09011092782020569,
      "learning_rate": 0.0002897630434277637,
      "loss": 9.5616,
      "step": 6688,
      "throughput": 17661.589803465566
    },
    {
      "epoch": 0.10532997907507112,
      "grad_norm": 0.09123651683330536,
      "learning_rate": 0.0002896580652645207,
      "loss": 9.555,
      "step": 6720,
      "throughput": 17662.85464981392
    },
    {
      "epoch": 0.10583155040400004,
      "grad_norm": 0.08848977088928223,
      "learning_rate": 0.00028955257301104714,
      "loss": 9.5467,
      "step": 6752,
      "throughput": 17665.994535257018
    },
    {
      "epoch": 0.10633312173292894,
      "grad_norm": 0.09953072667121887,
      "learning_rate": 0.00028944656710239337,
      "loss": 9.538,
      "step": 6784,
      "throughput": 17669.094778991184
    },
    {
      "epoch": 0.10683469306185785,
      "grad_norm": 0.08808404952287674,
      "learning_rate": 0.00028934004797572795,
      "loss": 9.5748,
      "step": 6816,
      "throughput": 17669.885900216705
    },
    {
      "epoch": 0.10733626439078676,
      "grad_norm": 0.09777748584747314,
      "learning_rate": 0.00028923301607033616,
      "loss": 9.5246,
      "step": 6848,
      "throughput": 17671.560145856194
    },
    {
      "epoch": 0.10783783571971567,
      "grad_norm": 0.10776592046022415,
      "learning_rate": 0.0002891254718276178,
      "loss": 9.5752,
      "step": 6880,
      "throughput": 17673.816226508443
    },
    {
      "epoch": 0.10833940704864459,
      "grad_norm": 0.08375327289104462,
      "learning_rate": 0.00028901741569108586,
      "loss": 9.5443,
      "step": 6912,
      "throughput": 17674.9136385758
    },
    {
      "epoch": 0.10884097837757349,
      "grad_norm": 0.0973740741610527,
      "learning_rate": 0.00028890884810636394,
      "loss": 9.5495,
      "step": 6944,
      "throughput": 17675.986629077877
    },
    {
      "epoch": 0.1093425497065024,
      "grad_norm": 0.08603812754154205,
      "learning_rate": 0.00028879976952118523,
      "loss": 9.5444,
      "step": 6976,
      "throughput": 17671.644097643108
    },
    {
      "epoch": 0.10984412103543131,
      "grad_norm": 0.08715157955884933,
      "learning_rate": 0.0002886901803853901,
      "loss": 9.5646,
      "step": 7008,
      "throughput": 17671.419285394746
    },
    {
      "epoch": 0.11034569236436022,
      "grad_norm": 0.08621218055486679,
      "learning_rate": 0.00028858008115092445,
      "loss": 9.5315,
      "step": 7040,
      "throughput": 17674.205815983045
    },
    {
      "epoch": 0.11084726369328914,
      "grad_norm": 0.10020724684000015,
      "learning_rate": 0.0002884694722718378,
      "loss": 9.5324,
      "step": 7072,
      "throughput": 17676.921071529414
    },
    {
      "epoch": 0.11134883502221804,
      "grad_norm": 0.09769771993160248,
      "learning_rate": 0.00028835835420428163,
      "loss": 9.5231,
      "step": 7104,
      "throughput": 17678.41386225938
    },
    {
      "epoch": 0.11185040635114696,
      "grad_norm": 0.08455855399370193,
      "learning_rate": 0.000288246727406507,
      "loss": 9.5243,
      "step": 7136,
      "throughput": 17680.610998948523
    },
    {
      "epoch": 0.11235197768007586,
      "grad_norm": 0.08644996583461761,
      "learning_rate": 0.00028813459233886335,
      "loss": 9.514,
      "step": 7168,
      "throughput": 17682.161967136737
    },
    {
      "epoch": 0.11285354900900477,
      "grad_norm": 0.10697755962610245,
      "learning_rate": 0.00028802194946379585,
      "loss": 9.4924,
      "step": 7200,
      "throughput": 17682.65781775484
    },
    {
      "epoch": 0.11335512033793368,
      "grad_norm": 0.09193835407495499,
      "learning_rate": 0.0002879087992458442,
      "loss": 9.509,
      "step": 7232,
      "throughput": 17685.310101258376
    },
    {
      "epoch": 0.11385669166686259,
      "grad_norm": 0.09078823775053024,
      "learning_rate": 0.00028779514215164015,
      "loss": 9.4963,
      "step": 7264,
      "throughput": 17680.549920329966
    },
    {
      "epoch": 0.11435826299579151,
      "grad_norm": 0.09351503849029541,
      "learning_rate": 0.0002876809786499059,
      "loss": 9.5108,
      "step": 7296,
      "throughput": 17680.890002033695
    },
    {
      "epoch": 0.11485983432472041,
      "grad_norm": 0.08877700567245483,
      "learning_rate": 0.0002875663092114521,
      "loss": 9.5187,
      "step": 7328,
      "throughput": 17682.398823294523
    },
    {
      "epoch": 0.11536140565364933,
      "grad_norm": 0.0977354496717453,
      "learning_rate": 0.0002874511343091758,
      "loss": 9.5053,
      "step": 7360,
      "throughput": 17685.051078982924
    },
    {
      "epoch": 0.11586297698257823,
      "grad_norm": 0.09951066970825195,
      "learning_rate": 0.00028733545441805874,
      "loss": 9.5097,
      "step": 7392,
      "throughput": 17687.66218146485
    },
    {
      "epoch": 0.11636454831150714,
      "grad_norm": 0.08795120567083359,
      "learning_rate": 0.00028721927001516503,
      "loss": 9.518,
      "step": 7424,
      "throughput": 17688.555650311624
    },
    {
      "epoch": 0.11686611964043606,
      "grad_norm": 0.0988912284374237,
      "learning_rate": 0.00028710258157963955,
      "loss": 9.5067,
      "step": 7456,
      "throughput": 17690.55321163933
    },
    {
      "epoch": 0.11736769096936496,
      "grad_norm": 0.09112031012773514,
      "learning_rate": 0.00028698538959270577,
      "loss": 9.5045,
      "step": 7488,
      "throughput": 17691.52051917983
    },
    {
      "epoch": 0.11786926229829388,
      "grad_norm": 0.1025838628411293,
      "learning_rate": 0.00028686769453766366,
      "loss": 9.5023,
      "step": 7520,
      "throughput": 17692.930602701097
    },
    {
      "epoch": 0.11837083362722278,
      "grad_norm": 0.09629788249731064,
      "learning_rate": 0.00028674949689988814,
      "loss": 9.4773,
      "step": 7552,
      "throughput": 17693.81868079842
    },
    {
      "epoch": 0.1188724049561517,
      "grad_norm": 0.09582464396953583,
      "learning_rate": 0.00028663079716682654,
      "loss": 9.4727,
      "step": 7584,
      "throughput": 17689.672053808215
    },
    {
      "epoch": 0.1193739762850806,
      "grad_norm": 0.08803921192884445,
      "learning_rate": 0.00028651159582799695,
      "loss": 9.4801,
      "step": 7616,
      "throughput": 17691.050960385746
    },
    {
      "epoch": 0.11987554761400951,
      "grad_norm": 0.08849391341209412,
      "learning_rate": 0.000286391893374986,
      "loss": 9.4867,
      "step": 7648,
      "throughput": 17693.53900562614
    },
    {
      "epoch": 0.12037711894293843,
      "grad_norm": 0.087612085044384,
      "learning_rate": 0.0002862716903014469,
      "loss": 9.4833,
      "step": 7680,
      "throughput": 17696.03728518093
    },
    {
      "epoch": 0.12087869027186733,
      "grad_norm": 0.09365695714950562,
      "learning_rate": 0.0002861509871030977,
      "loss": 9.4686,
      "step": 7712,
      "throughput": 17697.358179243107
    },
    {
      "epoch": 0.12138026160079625,
      "grad_norm": 0.08572287112474442,
      "learning_rate": 0.0002860297842777185,
      "loss": 9.451,
      "step": 7744,
      "throughput": 17698.270939143044
    },
    {
      "epoch": 0.12188183292972515,
      "grad_norm": 0.0876513198018074,
      "learning_rate": 0.00028590808232515025,
      "loss": 9.4644,
      "step": 7776,
      "throughput": 17700.200087927802
    },
    {
      "epoch": 0.12238340425865407,
      "grad_norm": 0.0888097882270813,
      "learning_rate": 0.00028578588174729214,
      "loss": 9.4579,
      "step": 7808,
      "throughput": 17700.51634556709
    },
    {
      "epoch": 0.12288497558758298,
      "grad_norm": 0.0886104628443718,
      "learning_rate": 0.0002856631830480997,
      "loss": 9.4607,
      "step": 7840,
      "throughput": 17701.386413357985
    },
    {
      "epoch": 0.12338654691651188,
      "grad_norm": 0.09121379256248474,
      "learning_rate": 0.0002855399867335827,
      "loss": 9.4552,
      "step": 7872,
      "throughput": 17697.91403017303
    },
    {
      "epoch": 0.1238881182454408,
      "grad_norm": 0.1071939542889595,
      "learning_rate": 0.0002854162933118032,
      "loss": 9.4628,
      "step": 7904,
      "throughput": 17698.688838735907
    },
    {
      "epoch": 0.1243896895743697,
      "grad_norm": 0.10231801867485046,
      "learning_rate": 0.0002852921032928732,
      "loss": 9.4493,
      "step": 7936,
      "throughput": 17700.060224950434
    },
    {
      "epoch": 0.12489126090329862,
      "grad_norm": 0.08605187386274338,
      "learning_rate": 0.0002851674171889526,
      "loss": 9.4498,
      "step": 7968,
      "throughput": 17702.402029974543
    },
    {
      "epoch": 0.12539283223222752,
      "grad_norm": 0.08886358886957169,
      "learning_rate": 0.0002850422355142474,
      "loss": 9.4466,
      "step": 8000,
      "throughput": 17704.7022624541
    },
    {
      "epoch": 0.12589440356115644,
      "grad_norm": 0.08829627186059952,
      "learning_rate": 0.00028491655878500716,
      "loss": 9.4512,
      "step": 8032,
      "throughput": 17704.931370729046
    },
    {
      "epoch": 0.12639597489008533,
      "grad_norm": 0.09064590930938721,
      "learning_rate": 0.0002847903875195231,
      "loss": 9.4378,
      "step": 8064,
      "throughput": 17706.702573880393
    },
    {
      "epoch": 0.12689754621901425,
      "grad_norm": 0.08949542790651321,
      "learning_rate": 0.00028466372223812575,
      "loss": 9.4081,
      "step": 8096,
      "throughput": 17707.012920911453
    },
    {
      "epoch": 0.12739911754794317,
      "grad_norm": 0.0910944789648056,
      "learning_rate": 0.0002845365634631833,
      "loss": 9.4723,
      "step": 8128,
      "throughput": 17708.223008042412
    },
    {
      "epoch": 0.1279006888768721,
      "grad_norm": 0.10030682384967804,
      "learning_rate": 0.0002844089117190988,
      "loss": 9.4345,
      "step": 8160,
      "throughput": 17707.217328548944
    },
    {
      "epoch": 0.12840226020580098,
      "grad_norm": 0.08720734715461731,
      "learning_rate": 0.0002842807675323085,
      "loss": 9.4463,
      "step": 8192,
      "throughput": 17704.633840665756
    },
    {
      "epoch": 0.1289038315347299,
      "grad_norm": 0.09187284857034683,
      "learning_rate": 0.00028415213143127935,
      "loss": 9.4482,
      "step": 8224,
      "throughput": 17701.090390522568
    },
    {
      "epoch": 0.1294054028636588,
      "grad_norm": 0.09016137570142746,
      "learning_rate": 0.00028402300394650697,
      "loss": 9.4399,
      "step": 8256,
      "throughput": 17703.362725097726
    },
    {
      "epoch": 0.1299069741925877,
      "grad_norm": 0.1010221391916275,
      "learning_rate": 0.0002838933856105136,
      "loss": 9.4222,
      "step": 8288,
      "throughput": 17705.6091619322
    },
    {
      "epoch": 0.13040854552151662,
      "grad_norm": 0.08562653511762619,
      "learning_rate": 0.0002837632769578455,
      "loss": 9.4371,
      "step": 8320,
      "throughput": 17706.843163637994
    },
    {
      "epoch": 0.13091011685044554,
      "grad_norm": 0.08788871765136719,
      "learning_rate": 0.00028363267852507133,
      "loss": 9.4206,
      "step": 8352,
      "throughput": 17707.560741404828
    },
    {
      "epoch": 0.13141168817937446,
      "grad_norm": 0.08620458841323853,
      "learning_rate": 0.0002835015908507793,
      "loss": 9.4404,
      "step": 8384,
      "throughput": 17709.317740773935
    },
    {
      "epoch": 0.13191325950830335,
      "grad_norm": 0.09066160023212433,
      "learning_rate": 0.0002833700144755753,
      "loss": 9.414,
      "step": 8416,
      "throughput": 17709.55050982401
    },
    {
      "epoch": 0.13241483083723227,
      "grad_norm": 0.0891546979546547,
      "learning_rate": 0.0002832379499420808,
      "loss": 9.4323,
      "step": 8448,
      "throughput": 17709.75111213038
    },
    {
      "epoch": 0.13291640216616118,
      "grad_norm": 0.09241317957639694,
      "learning_rate": 0.0002831053977949303,
      "loss": 9.4121,
      "step": 8480,
      "throughput": 17707.765444295415
    },
    {
      "epoch": 0.13341797349509008,
      "grad_norm": 0.08092867583036423,
      "learning_rate": 0.00028297235858076923,
      "loss": 9.42,
      "step": 8512,
      "throughput": 17707.93522306093
    },
    {
      "epoch": 0.133919544824019,
      "grad_norm": 0.08822212368249893,
      "learning_rate": 0.0002828388328482517,
      "loss": 9.4055,
      "step": 8544,
      "throughput": 17709.673360506327
    },
    {
      "epoch": 0.1344211161529479,
      "grad_norm": 0.09241366386413574,
      "learning_rate": 0.0002827048211480383,
      "loss": 9.4052,
      "step": 8576,
      "throughput": 17711.83138890004
    },
    {
      "epoch": 0.13492268748187683,
      "grad_norm": 0.08627436310052872,
      "learning_rate": 0.00028257032403279354,
      "loss": 9.4124,
      "step": 8608,
      "throughput": 17713.428728467305
    },
    {
      "epoch": 0.13542425881080572,
      "grad_norm": 0.09174727648496628,
      "learning_rate": 0.00028243534205718405,
      "loss": 9.4067,
      "step": 8640,
      "throughput": 17713.749902611682
    },
    {
      "epoch": 0.13592583013973464,
      "grad_norm": 0.08742675185203552,
      "learning_rate": 0.00028229987577787585,
      "loss": 9.4009,
      "step": 8672,
      "throughput": 17714.910490991344
    },
    {
      "epoch": 0.13642740146866356,
      "grad_norm": 0.0941416397690773,
      "learning_rate": 0.00028216392575353225,
      "loss": 9.381,
      "step": 8704,
      "throughput": 17715.589063631214
    },
    {
      "epoch": 0.13692897279759245,
      "grad_norm": 0.09356024861335754,
      "learning_rate": 0.00028202749254481165,
      "loss": 9.3877,
      "step": 8736,
      "throughput": 17716.64766294759
    },
    {
      "epoch": 0.13743054412652136,
      "grad_norm": 0.0842645913362503,
      "learning_rate": 0.0002818905767143649,
      "loss": 9.3993,
      "step": 8768,
      "throughput": 17714.34060053566
    },
    {
      "epoch": 0.13793211545545028,
      "grad_norm": 0.09103891253471375,
      "learning_rate": 0.0002817531788268333,
      "loss": 9.396,
      "step": 8800,
      "throughput": 17713.960152419957
    },
    {
      "epoch": 0.1384336867843792,
      "grad_norm": 0.09892462193965912,
      "learning_rate": 0.0002816152994488462,
      "loss": 9.3945,
      "step": 8832,
      "throughput": 17715.097766552306
    },
    {
      "epoch": 0.1389352581133081,
      "grad_norm": 0.08912087231874466,
      "learning_rate": 0.0002814769391490185,
      "loss": 9.4177,
      "step": 8864,
      "throughput": 17717.138805374783
    },
    {
      "epoch": 0.139436829442237,
      "grad_norm": 0.08828168362379074,
      "learning_rate": 0.0002813380984979486,
      "loss": 9.3854,
      "step": 8896,
      "throughput": 17719.203861962458
    },
    {
      "epoch": 0.13993840077116593,
      "grad_norm": 0.08851828426122665,
      "learning_rate": 0.00028119877806821557,
      "loss": 9.3854,
      "step": 8928,
      "throughput": 17720.284647074564
    },
    {
      "epoch": 0.14043997210009482,
      "grad_norm": 0.09857714176177979,
      "learning_rate": 0.00028105897843437746,
      "loss": 9.3973,
      "step": 8960,
      "throughput": 17720.4507104652
    },
    {
      "epoch": 0.14094154342902374,
      "grad_norm": 0.08587909489870071,
      "learning_rate": 0.0002809187001729683,
      "loss": 9.4025,
      "step": 8992,
      "throughput": 17720.728277247388
    },
    {
      "epoch": 0.14144311475795265,
      "grad_norm": 0.09501481056213379,
      "learning_rate": 0.00028077794386249604,
      "loss": 9.3793,
      "step": 9024,
      "throughput": 17721.806096925364
    },
    {
      "epoch": 0.14194468608688157,
      "grad_norm": 0.08143424242734909,
      "learning_rate": 0.0002806367100834401,
      "loss": 9.374,
      "step": 9056,
      "throughput": 17721.533381466965
    },
    {
      "epoch": 0.14244625741581046,
      "grad_norm": 0.09524306654930115,
      "learning_rate": 0.00028049499941824906,
      "loss": 9.3819,
      "step": 9088,
      "throughput": 17718.38976010663
    },
    {
      "epoch": 0.14294782874473938,
      "grad_norm": 0.08763439953327179,
      "learning_rate": 0.0002803528124513382,
      "loss": 9.3596,
      "step": 9120,
      "throughput": 17719.890196370536
    },
    {
      "epoch": 0.1434494000736683,
      "grad_norm": 0.08398011326789856,
      "learning_rate": 0.00028021014976908676,
      "loss": 9.386,
      "step": 9152,
      "throughput": 17721.433472284
    },
    {
      "epoch": 0.1439509714025972,
      "grad_norm": 0.08198576420545578,
      "learning_rate": 0.0002800670119598363,
      "loss": 9.3502,
      "step": 9184,
      "throughput": 17723.374378071185
    },
    {
      "epoch": 0.1444525427315261,
      "grad_norm": 0.08458232134580612,
      "learning_rate": 0.0002799233996138874,
      "loss": 9.3845,
      "step": 9216,
      "throughput": 17724.399011975012
    },
    {
      "epoch": 0.14495411406045502,
      "grad_norm": 0.08208563178777695,
      "learning_rate": 0.00027977931332349786,
      "loss": 9.3633,
      "step": 9248,
      "throughput": 17724.60412540917
    },
    {
      "epoch": 0.14545568538938394,
      "grad_norm": 0.0929563045501709,
      "learning_rate": 0.00027963475368288006,
      "loss": 9.3822,
      "step": 9280,
      "throughput": 17726.127441229473
    },
    {
      "epoch": 0.14595725671831283,
      "grad_norm": 0.08427176624536514,
      "learning_rate": 0.00027948972128819823,
      "loss": 9.3594,
      "step": 9312,
      "throughput": 17725.839948487617
    },
    {
      "epoch": 0.14645882804724175,
      "grad_norm": 0.08081962168216705,
      "learning_rate": 0.0002793442167375665,
      "loss": 9.354,
      "step": 9344,
      "throughput": 17726.891969935805
    },
    {
      "epoch": 0.14696039937617067,
      "grad_norm": 0.08913140743970871,
      "learning_rate": 0.0002791982406310461,
      "loss": 9.365,
      "step": 9376,
      "throughput": 17723.89602326518
    },
    {
      "epoch": 0.14746197070509956,
      "grad_norm": 0.08514046669006348,
      "learning_rate": 0.0002790517935706428,
      "loss": 9.3629,
      "step": 9408,
      "throughput": 17724.411805093445
    },
    {
      "epoch": 0.14796354203402848,
      "grad_norm": 0.09198344498872757,
      "learning_rate": 0.00027890487616030475,
      "loss": 9.3655,
      "step": 9440,
      "throughput": 17725.455035647767
    },
    {
      "epoch": 0.1484651133629574,
      "grad_norm": 0.08742505311965942,
      "learning_rate": 0.0002787574890059199,
      "loss": 9.3334,
      "step": 9472,
      "throughput": 17727.366510384505
    },
    {
      "epoch": 0.1489666846918863,
      "grad_norm": 0.08325749635696411,
      "learning_rate": 0.0002786096327153131,
      "loss": 9.379,
      "step": 9504,
      "throughput": 17729.253232463434
    },
    {
      "epoch": 0.1494682560208152,
      "grad_norm": 0.09257443994283676,
      "learning_rate": 0.00027846130789824437,
      "loss": 9.3373,
      "step": 9536,
      "throughput": 17729.405691552933
    },
    {
      "epoch": 0.14996982734974412,
      "grad_norm": 0.0873776450753212,
      "learning_rate": 0.00027831251516640553,
      "loss": 9.3589,
      "step": 9568,
      "throughput": 17730.377847042964
    },
    {
      "epoch": 0.15047139867867304,
      "grad_norm": 0.08066736161708832,
      "learning_rate": 0.00027816325513341835,
      "loss": 9.3495,
      "step": 9600,
      "throughput": 17730.087161016774
    },
    {
      "epoch": 0.15097297000760193,
      "grad_norm": 0.08983741700649261,
      "learning_rate": 0.0002780135284148315,
      "loss": 9.3558,
      "step": 9632,
      "throughput": 17731.48408846505
    },
    {
      "epoch": 0.15147454133653085,
      "grad_norm": 0.08716747164726257,
      "learning_rate": 0.00027786333562811855,
      "loss": 9.3411,
      "step": 9664,
      "throughput": 17729.663061758318
    },
    {
      "epoch": 0.15197611266545977,
      "grad_norm": 0.09069626778364182,
      "learning_rate": 0.00027771267739267494,
      "loss": 9.3363,
      "step": 9696,
      "throughput": 17728.220288648616
    },
    {
      "epoch": 0.15247768399438866,
      "grad_norm": 0.0872938483953476,
      "learning_rate": 0.0002775615543298157,
      "loss": 9.3263,
      "step": 9728,
      "throughput": 17729.20781459165
    },
    {
      "epoch": 0.15297925532331758,
      "grad_norm": 0.08429323136806488,
      "learning_rate": 0.0002774099670627728,
      "loss": 9.344,
      "step": 9760,
      "throughput": 17731.034461616182
    },
    {
      "epoch": 0.1534808266522465,
      "grad_norm": 0.08956008404493332,
      "learning_rate": 0.00027725791621669257,
      "loss": 9.3532,
      "step": 9792,
      "throughput": 17732.87619972755
    },
    {
      "epoch": 0.1539823979811754,
      "grad_norm": 0.08699870854616165,
      "learning_rate": 0.0002771054024186331,
      "loss": 9.3539,
      "step": 9824,
      "throughput": 17733.819344468047
    },
    {
      "epoch": 0.1544839693101043,
      "grad_norm": 0.08837584406137466,
      "learning_rate": 0.0002769524262975618,
      "loss": 9.3225,
      "step": 9856,
      "throughput": 17733.988379634797
    },
    {
      "epoch": 0.15498554063903322,
      "grad_norm": 0.08523304760456085,
      "learning_rate": 0.0002767989884843527,
      "loss": 9.3234,
      "step": 9888,
      "throughput": 17734.564621286165
    },
    {
      "epoch": 0.15548711196796214,
      "grad_norm": 0.0892610102891922,
      "learning_rate": 0.0002766450896117837,
      "loss": 9.3348,
      "step": 9920,
      "throughput": 17735.113290039546
    },
    {
      "epoch": 0.15598868329689103,
      "grad_norm": 0.08581508696079254,
      "learning_rate": 0.0002764907303145342,
      "loss": 9.3469,
      "step": 9952,
      "throughput": 17735.232375741027
    },
    {
      "epoch": 0.15649025462581995,
      "grad_norm": 0.08464840799570084,
      "learning_rate": 0.00027633591122918244,
      "loss": 9.3182,
      "step": 9984,
      "throughput": 17733.241903899874
    },
    {
      "epoch": 0.15699182595474886,
      "grad_norm": 0.08995112776756287,
      "learning_rate": 0.0002761806329942028,
      "loss": 9.3363,
      "step": 10016,
      "throughput": 17733.762479490895
    },
    {
      "epoch": 0.15749339728367778,
      "grad_norm": 0.08774518221616745,
      "learning_rate": 0.0002760248962499632,
      "loss": 9.3135,
      "step": 10048,
      "throughput": 17734.737383423486
    },
    {
      "epoch": 0.15799496861260667,
      "grad_norm": 0.09354528039693832,
      "learning_rate": 0.0002758687016387223,
      "loss": 9.3394,
      "step": 10080,
      "throughput": 17736.51638569282
    },
    {
      "epoch": 0.1584965399415356,
      "grad_norm": 0.08741112053394318,
      "learning_rate": 0.0002757120498046273,
      "loss": 9.3364,
      "step": 10112,
      "throughput": 17738.275112121057
    },
    {
      "epoch": 0.1589981112704645,
      "grad_norm": 0.09841817617416382,
      "learning_rate": 0.00027555494139371077,
      "loss": 9.3244,
      "step": 10144,
      "throughput": 17738.358233489358
    },
    {
      "epoch": 0.1594996825993934,
      "grad_norm": 0.09093420952558517,
      "learning_rate": 0.0002753973770538882,
      "loss": 9.2967,
      "step": 10176,
      "throughput": 17738.822258680775
    },
    {
      "epoch": 0.16000125392832232,
      "grad_norm": 0.09394460171461105,
      "learning_rate": 0.00027523935743495553,
      "loss": 9.2852,
      "step": 10208,
      "throughput": 17738.97295402453
    },
    {
      "epoch": 0.16050282525725124,
      "grad_norm": 0.08894991129636765,
      "learning_rate": 0.00027508088318858604,
      "loss": 9.324,
      "step": 10240,
      "throughput": 17739.884708109348
    },
    {
      "epoch": 0.16100439658618015,
      "grad_norm": 0.07993625104427338,
      "learning_rate": 0.000274921954968328,
      "loss": 9.3121,
      "step": 10272,
      "throughput": 17734.377506200843
    },
    {
      "epoch": 0.16150596791510904,
      "grad_norm": 0.08242437988519669,
      "learning_rate": 0.0002747625734296019,
      "loss": 9.3121,
      "step": 10304,
      "throughput": 17733.655948355277
    },
    {
      "epoch": 0.16200753924403796,
      "grad_norm": 0.08217553049325943,
      "learning_rate": 0.00027460273922969757,
      "loss": 9.338,
      "step": 10336,
      "throughput": 17734.54701699429
    },
    {
      "epoch": 0.16250911057296688,
      "grad_norm": 0.08616854250431061,
      "learning_rate": 0.0002744424530277719,
      "loss": 9.307,
      "step": 10368,
      "throughput": 17736.269304977854
    },
    {
      "epoch": 0.16301068190189577,
      "grad_norm": 0.08809614181518555,
      "learning_rate": 0.0002742817154848455,
      "loss": 9.2938,
      "step": 10400,
      "throughput": 17737.939030764315
    },
    {
      "epoch": 0.1635122532308247,
      "grad_norm": 0.08409152179956436,
      "learning_rate": 0.00027412052726380053,
      "loss": 9.3157,
      "step": 10432,
      "throughput": 17738.799611961324
    },
    {
      "epoch": 0.1640138245597536,
      "grad_norm": 0.07925765216350555,
      "learning_rate": 0.00027395888902937777,
      "loss": 9.2993,
      "step": 10464,
      "throughput": 17739.300958390246
    },
    {
      "epoch": 0.16451539588868253,
      "grad_norm": 0.09361526370048523,
      "learning_rate": 0.0002737968014481737,
      "loss": 9.2984,
      "step": 10496,
      "throughput": 17739.45763139594
    },
    {
      "epoch": 0.16501696721761142,
      "grad_norm": 0.07672687619924545,
      "learning_rate": 0.000273634265188638,
      "loss": 9.2926,
      "step": 10528,
      "throughput": 17740.348861311355
    },
    {
      "epoch": 0.16551853854654033,
      "grad_norm": 0.07868822664022446,
      "learning_rate": 0.0002734712809210706,
      "loss": 9.2988,
      "step": 10560,
      "throughput": 17740.410090470832
    },
    {
      "epoch": 0.16602010987546925,
      "grad_norm": 0.08165296912193298,
      "learning_rate": 0.00027330784931761925,
      "loss": 9.2755,
      "step": 10592,
      "throughput": 17738.860810980197
    },
    {
      "epoch": 0.16652168120439814,
      "grad_norm": 0.07798882573843002,
      "learning_rate": 0.0002731439710522763,
      "loss": 9.2713,
      "step": 10624,
      "throughput": 17739.33997914266
    },
    {
      "epoch": 0.16702325253332706,
      "grad_norm": 0.08346995711326599,
      "learning_rate": 0.00027297964680087617,
      "loss": 9.2922,
      "step": 10656,
      "throughput": 17740.22205949218
    },
    {
      "epoch": 0.16752482386225598,
      "grad_norm": 0.08133242279291153,
      "learning_rate": 0.0002728148772410926,
      "loss": 9.3008,
      "step": 10688,
      "throughput": 17741.884007279037
    },
    {
      "epoch": 0.1680263951911849,
      "grad_norm": 0.08343921601772308,
      "learning_rate": 0.0002726496630524358,
      "loss": 9.316,
      "step": 10720,
      "throughput": 17743.119356231484
    },
    {
      "epoch": 0.1685279665201138,
      "grad_norm": 0.08336193114519119,
      "learning_rate": 0.00027248400491624946,
      "loss": 9.262,
      "step": 10752,
      "throughput": 17743.58509175239
    },
    {
      "epoch": 0.1690295378490427,
      "grad_norm": 0.08541908860206604,
      "learning_rate": 0.00027231790351570827,
      "loss": 9.2852,
      "step": 10784,
      "throughput": 17744.05931240815
    },
    {
      "epoch": 0.16953110917797162,
      "grad_norm": 0.08818753808736801,
      "learning_rate": 0.00027215135953581485,
      "loss": 9.2897,
      "step": 10816,
      "throughput": 17743.408459750997
    },
    {
      "epoch": 0.1700326805069005,
      "grad_norm": 0.0927952229976654,
      "learning_rate": 0.00027198437366339717,
      "loss": 9.2722,
      "step": 10848,
      "throughput": 17744.664686468273
    },
    {
      "epoch": 0.17053425183582943,
      "grad_norm": 0.07848216593265533,
      "learning_rate": 0.00027181694658710544,
      "loss": 9.2647,
      "step": 10880,
      "throughput": 17742.74908535243
    },
    {
      "epoch": 0.17103582316475835,
      "grad_norm": 0.08359099179506302,
      "learning_rate": 0.00027164907899740936,
      "loss": 9.2732,
      "step": 10912,
      "throughput": 17742.44229337719
    },
    {
      "epoch": 0.17153739449368727,
      "grad_norm": 0.08627219498157501,
      "learning_rate": 0.0002714807715865954,
      "loss": 9.2909,
      "step": 10944,
      "throughput": 17743.22525761864
    },
    {
      "epoch": 0.17203896582261616,
      "grad_norm": 0.08329717814922333,
      "learning_rate": 0.0002713120250487638,
      "loss": 9.2677,
      "step": 10976,
      "throughput": 17744.796276143617
    },
    {
      "epoch": 0.17254053715154508,
      "grad_norm": 0.0816192626953125,
      "learning_rate": 0.0002711428400798258,
      "loss": 9.2592,
      "step": 11008,
      "throughput": 17746.35511626247
    },
    {
      "epoch": 0.173042108480474,
      "grad_norm": 0.08471696078777313,
      "learning_rate": 0.00027097321737750075,
      "loss": 9.2676,
      "step": 11040,
      "throughput": 17747.514092651345
    },
    {
      "epoch": 0.17354367980940288,
      "grad_norm": 0.08643633127212524,
      "learning_rate": 0.00027080315764131316,
      "loss": 9.2539,
      "step": 11072,
      "throughput": 17747.237051226777
    },
    {
      "epoch": 0.1740452511383318,
      "grad_norm": 0.08549510687589645,
      "learning_rate": 0.0002706326615725898,
      "loss": 9.266,
      "step": 11104,
      "throughput": 17746.947223038034
    },
    {
      "epoch": 0.17454682246726072,
      "grad_norm": 0.09334033727645874,
      "learning_rate": 0.0002704617298744571,
      "loss": 9.2492,
      "step": 11136,
      "throughput": 17748.528601371454
    },
    {
      "epoch": 0.17504839379618964,
      "grad_norm": 0.08416300266981125,
      "learning_rate": 0.00027029036325183775,
      "loss": 9.2546,
      "step": 11168,
      "throughput": 17748.02754972764
    },
    {
      "epoch": 0.17554996512511853,
      "grad_norm": 0.07855620980262756,
      "learning_rate": 0.0002701185624114483,
      "loss": 9.2792,
      "step": 11200,
      "throughput": 17746.755300402074
    },
    {
      "epoch": 0.17605153645404745,
      "grad_norm": 0.08049018681049347,
      "learning_rate": 0.0002699463280617959,
      "loss": 9.2799,
      "step": 11232,
      "throughput": 17747.988327536557
    },
    {
      "epoch": 0.17655310778297637,
      "grad_norm": 0.09454195201396942,
      "learning_rate": 0.00026977366091317554,
      "loss": 9.2456,
      "step": 11264,
      "throughput": 17748.456910978788
    },
    {
      "epoch": 0.17705467911190526,
      "grad_norm": 0.07567203044891357,
      "learning_rate": 0.00026960056167766704,
      "loss": 9.2549,
      "step": 11296,
      "throughput": 17750.01303683647
    },
    {
      "epoch": 0.17755625044083417,
      "grad_norm": 0.08688521385192871,
      "learning_rate": 0.0002694270310691321,
      "loss": 9.2444,
      "step": 11328,
      "throughput": 17751.15787430835
    },
    {
      "epoch": 0.1780578217697631,
      "grad_norm": 0.09056143462657928,
      "learning_rate": 0.0002692530698032116,
      "loss": 9.2426,
      "step": 11360,
      "throughput": 17751.207510816206
    },
    {
      "epoch": 0.178559393098692,
      "grad_norm": 0.08120916038751602,
      "learning_rate": 0.00026907867859732223,
      "loss": 9.2452,
      "step": 11392,
      "throughput": 17751.31664040642
    },
    {
      "epoch": 0.1790609644276209,
      "grad_norm": 0.07514616847038269,
      "learning_rate": 0.0002689038581706538,
      "loss": 9.2491,
      "step": 11424,
      "throughput": 17751.360375468852
    },
    {
      "epoch": 0.17956253575654982,
      "grad_norm": 0.08787547796964645,
      "learning_rate": 0.0002687286092441664,
      "loss": 9.227,
      "step": 11456,
      "throughput": 17751.43941832824
    },
    {
      "epoch": 0.18006410708547874,
      "grad_norm": 0.08200137317180634,
      "learning_rate": 0.00026855293254058693,
      "loss": 9.2413,
      "step": 11488,
      "throughput": 17750.73067845772
    },
    {
      "epoch": 0.18056567841440763,
      "grad_norm": 0.09474249184131622,
      "learning_rate": 0.0002683768287844068,
      "loss": 9.2374,
      "step": 11520,
      "throughput": 17750.830930191118
    },
    {
      "epoch": 0.18106724974333654,
      "grad_norm": 0.08020896464586258,
      "learning_rate": 0.0002682002987018783,
      "loss": 9.258,
      "step": 11552,
      "throughput": 17751.605636322933
    },
    {
      "epoch": 0.18156882107226546,
      "grad_norm": 0.08732938766479492,
      "learning_rate": 0.00026802334302101214,
      "loss": 9.2477,
      "step": 11584,
      "throughput": 17752.74366557252
    },
    {
      "epoch": 0.18207039240119435,
      "grad_norm": 0.08201635628938675,
      "learning_rate": 0.000267845962471574,
      "loss": 9.2412,
      "step": 11616,
      "throughput": 17754.248561484445
    },
    {
      "epoch": 0.18257196373012327,
      "grad_norm": 0.08246797323226929,
      "learning_rate": 0.0002676681577850818,
      "loss": 9.2311,
      "step": 11648,
      "throughput": 17754.32764483168
    },
    {
      "epoch": 0.1830735350590522,
      "grad_norm": 0.08599203079938889,
      "learning_rate": 0.0002674899296948026,
      "loss": 9.2294,
      "step": 11680,
      "throughput": 17754.719613925416
    },
    {
      "epoch": 0.1835751063879811,
      "grad_norm": 0.08059228956699371,
      "learning_rate": 0.00026731127893574955,
      "loss": 9.2423,
      "step": 11712,
      "throughput": 17754.07333906634
    },
    {
      "epoch": 0.18407667771691,
      "grad_norm": 0.08479016274213791,
      "learning_rate": 0.00026713220624467894,
      "loss": 9.2528,
      "step": 11744,
      "throughput": 17755.54729374782
    },
    {
      "epoch": 0.18457824904583892,
      "grad_norm": 0.08023982495069504,
      "learning_rate": 0.00026695271236008703,
      "loss": 9.2301,
      "step": 11776,
      "throughput": 17754.15601091206
    },
    {
      "epoch": 0.18507982037476783,
      "grad_norm": 0.0974150151014328,
      "learning_rate": 0.00026677279802220726,
      "loss": 9.2395,
      "step": 11808,
      "throughput": 17753.80019657531
    },
    {
      "epoch": 0.18558139170369672,
      "grad_norm": 0.09254009276628494,
      "learning_rate": 0.00026659246397300673,
      "loss": 9.229,
      "step": 11840,
      "throughput": 17754.905195390864
    },
    {
      "epoch": 0.18608296303262564,
      "grad_norm": 0.0819011852145195,
      "learning_rate": 0.00026641171095618366,
      "loss": 9.219,
      "step": 11872,
      "throughput": 17755.662459115352
    },
    {
      "epoch": 0.18658453436155456,
      "grad_norm": 0.07982365041971207,
      "learning_rate": 0.0002662305397171641,
      "loss": 9.2316,
      "step": 11904,
      "throughput": 17757.072989102187
    },
    {
      "epoch": 0.18708610569048348,
      "grad_norm": 0.0868970975279808,
      "learning_rate": 0.0002660489510030986,
      "loss": 9.2386,
      "step": 11936,
      "throughput": 17758.149529276874
    },
    {
      "epoch": 0.18758767701941237,
      "grad_norm": 0.08935050666332245,
      "learning_rate": 0.00026586694556285975,
      "loss": 9.2364,
      "step": 11968,
      "throughput": 17758.176495549968
    },
    {
      "epoch": 0.1880892483483413,
      "grad_norm": 0.08183032274246216,
      "learning_rate": 0.0002656845241470384,
      "loss": 9.2169,
      "step": 12000,
      "throughput": 17757.1726961844
    },
    {
      "epoch": 0.1885908196772702,
      "grad_norm": 0.09355923533439636,
      "learning_rate": 0.0002655016875079411,
      "loss": 9.2237,
      "step": 12032,
      "throughput": 17758.286084059786
    },
    {
      "epoch": 0.1890923910061991,
      "grad_norm": 0.08387905359268188,
      "learning_rate": 0.00026531843639958656,
      "loss": 9.2133,
      "step": 12064,
      "throughput": 17758.027452995793
    },
    {
      "epoch": 0.189593962335128,
      "grad_norm": 0.08702760189771652,
      "learning_rate": 0.00026513477157770303,
      "loss": 9.2201,
      "step": 12096,
      "throughput": 17757.35661152563
    },
    {
      "epoch": 0.19009553366405693,
      "grad_norm": 0.0785302221775055,
      "learning_rate": 0.0002649506937997248,
      "loss": 9.2206,
      "step": 12128,
      "throughput": 17757.42597187838
    },
    {
      "epoch": 0.19059710499298585,
      "grad_norm": 0.08639495074748993,
      "learning_rate": 0.00026476620382478896,
      "loss": 9.2288,
      "step": 12160,
      "throughput": 17758.528128623464
    },
    {
      "epoch": 0.19109867632191474,
      "grad_norm": 0.08326099067926407,
      "learning_rate": 0.0002645813024137329,
      "loss": 9.231,
      "step": 12192,
      "throughput": 17759.527708457354
    },
    {
      "epoch": 0.19160024765084366,
      "grad_norm": 0.09123198688030243,
      "learning_rate": 0.00026439599032909055,
      "loss": 9.2303,
      "step": 12224,
      "throughput": 17760.933629831925
    },
    {
      "epoch": 0.19210181897977258,
      "grad_norm": 0.08471406996250153,
      "learning_rate": 0.0002642102683350894,
      "loss": 9.2284,
      "step": 12256,
      "throughput": 17760.95910159503
    },
    {
      "epoch": 0.19260339030870147,
      "grad_norm": 0.09966867417097092,
      "learning_rate": 0.00026402413719764774,
      "loss": 9.2111,
      "step": 12288,
      "throughput": 17761.297990507992
    },
    {
      "epoch": 0.19310496163763038,
      "grad_norm": 0.08580002933740616,
      "learning_rate": 0.0002638375976843707,
      "loss": 9.2135,
      "step": 12320,
      "throughput": 17756.89295395398
    },
    {
      "epoch": 0.1936065329665593,
      "grad_norm": 0.08772596716880798,
      "learning_rate": 0.0002636506505645478,
      "loss": 9.199,
      "step": 12352,
      "throughput": 17757.2471773537
    },
    {
      "epoch": 0.19410810429548822,
      "grad_norm": 0.0860821008682251,
      "learning_rate": 0.00026346329660914964,
      "loss": 9.2145,
      "step": 12384,
      "throughput": 17756.55196902713
    },
    {
      "epoch": 0.1946096756244171,
      "grad_norm": 0.08097507804632187,
      "learning_rate": 0.00026327553659082444,
      "loss": 9.2198,
      "step": 12416,
      "throughput": 17756.248802078728
    },
    {
      "epoch": 0.19511124695334603,
      "grad_norm": 0.08351437002420425,
      "learning_rate": 0.00026308737128389513,
      "loss": 9.1913,
      "step": 12448,
      "throughput": 17757.62806257604
    },
    {
      "epoch": 0.19561281828227495,
      "grad_norm": 0.08402379602193832,
      "learning_rate": 0.0002628988014643558,
      "loss": 9.2191,
      "step": 12480,
      "throughput": 17758.34908027362
    },
    {
      "epoch": 0.19611438961120384,
      "grad_norm": 0.08989959210157394,
      "learning_rate": 0.00026270982790986916,
      "loss": 9.218,
      "step": 12512,
      "throughput": 17759.728538211333
    },
    {
      "epoch": 0.19661596094013276,
      "grad_norm": 0.09197131544351578,
      "learning_rate": 0.00026252045139976254,
      "loss": 9.199,
      "step": 12544,
      "throughput": 17760.454666729
    },
    {
      "epoch": 0.19711753226906167,
      "grad_norm": 0.08772056549787521,
      "learning_rate": 0.00026233067271502536,
      "loss": 9.1786,
      "step": 12576,
      "throughput": 17760.168491821252
    },
    {
      "epoch": 0.1976191035979906,
      "grad_norm": 0.07788512855768204,
      "learning_rate": 0.0002621404926383054,
      "loss": 9.2083,
      "step": 12608,
      "throughput": 17759.559491562166
    },
    {
      "epoch": 0.19812067492691948,
      "grad_norm": 0.08134134113788605,
      "learning_rate": 0.0002619499119539059,
      "loss": 9.1859,
      "step": 12640,
      "throughput": 17760.580955267505
    },
    {
      "epoch": 0.1986222462558484,
      "grad_norm": 0.08216018974781036,
      "learning_rate": 0.0002617589314477821,
      "loss": 9.1733,
      "step": 12672,
      "throughput": 17760.070863293106
    },
    {
      "epoch": 0.19912381758477732,
      "grad_norm": 0.08295486867427826,
      "learning_rate": 0.0002615675519075383,
      "loss": 9.1955,
      "step": 12704,
      "throughput": 17758.912070185866
    },
    {
      "epoch": 0.1996253889137062,
      "grad_norm": 0.08604070544242859,
      "learning_rate": 0.00026137577412242415,
      "loss": 9.1792,
      "step": 12736,
      "throughput": 17759.930171949552
    },
    {
      "epoch": 0.20012696024263513,
      "grad_norm": 0.0887996256351471,
      "learning_rate": 0.00026118359888333193,
      "loss": 9.1603,
      "step": 12768,
      "throughput": 17760.97519962203
    },
    {
      "epoch": 0.20062853157156404,
      "grad_norm": 0.0748458057641983,
      "learning_rate": 0.00026099102698279276,
      "loss": 9.1849,
      "step": 12800,
      "throughput": 17761.979821963785
    },
    {
      "epoch": 0.20113010290049296,
      "grad_norm": 0.08445336669683456,
      "learning_rate": 0.0002607980592149739,
      "loss": 9.1846,
      "step": 12832,
      "throughput": 17762.663567510866
    },
    {
      "epoch": 0.20163167422942185,
      "grad_norm": 0.07447773963212967,
      "learning_rate": 0.00026060469637567484,
      "loss": 9.1943,
      "step": 12864,
      "throughput": 17763.02923012481
    },
    {
      "epoch": 0.20213324555835077,
      "grad_norm": 0.08066857606172562,
      "learning_rate": 0.0002604109392623246,
      "loss": 9.2074,
      "step": 12896,
      "throughput": 17762.771752956647
    },
    {
      "epoch": 0.2026348168872797,
      "grad_norm": 0.07313714921474457,
      "learning_rate": 0.00026021678867397803,
      "loss": 9.188,
      "step": 12928,
      "throughput": 17762.7475778211
    },
    {
      "epoch": 0.20313638821620858,
      "grad_norm": 0.07853822410106659,
      "learning_rate": 0.00026002224541131274,
      "loss": 9.1701,
      "step": 12960,
      "throughput": 17762.784594845412
    },
    {
      "epoch": 0.2036379595451375,
      "grad_norm": 0.08614935725927353,
      "learning_rate": 0.00025982731027662575,
      "loss": 9.1813,
      "step": 12992,
      "throughput": 17762.106150200903
    },
    {
      "epoch": 0.20413953087406642,
      "grad_norm": 0.07838352769613266,
      "learning_rate": 0.00025963198407383015,
      "loss": 9.1915,
      "step": 13024,
      "throughput": 17762.128170875214
    },
    {
      "epoch": 0.20464110220299533,
      "grad_norm": 0.07841797918081284,
      "learning_rate": 0.0002594362676084517,
      "loss": 9.1748,
      "step": 13056,
      "throughput": 17763.430365428718
    },
    {
      "epoch": 0.20514267353192422,
      "grad_norm": 0.07841688394546509,
      "learning_rate": 0.0002592401616876258,
      "loss": 9.1785,
      "step": 13088,
      "throughput": 17764.113988290395
    },
    {
      "epoch": 0.20564424486085314,
      "grad_norm": 0.08778318762779236,
      "learning_rate": 0.00025904366712009374,
      "loss": 9.1896,
      "step": 13120,
      "throughput": 17765.416229046143
    },
    {
      "epoch": 0.20614581618978206,
      "grad_norm": 0.07848533242940903,
      "learning_rate": 0.00025884678471619976,
      "loss": 9.1745,
      "step": 13152,
      "throughput": 17765.756664442815
    },
    {
      "epoch": 0.20664738751871095,
      "grad_norm": 0.082659512758255,
      "learning_rate": 0.0002586495152878874,
      "loss": 9.1685,
      "step": 13184,
      "throughput": 17765.91336427311
    },
    {
      "epoch": 0.20714895884763987,
      "grad_norm": 0.08469443023204803,
      "learning_rate": 0.0002584518596486965,
      "loss": 9.1722,
      "step": 13216,
      "throughput": 17765.018102965587
    },
    {
      "epoch": 0.2076505301765688,
      "grad_norm": 0.07865361869335175,
      "learning_rate": 0.00025825381861375936,
      "loss": 9.1817,
      "step": 13248,
      "throughput": 17766.303462840322
    },
    {
      "epoch": 0.2081521015054977,
      "grad_norm": 0.0782107338309288,
      "learning_rate": 0.00025805539299979794,
      "loss": 9.1819,
      "step": 13280,
      "throughput": 17765.005851088183
    },
    {
      "epoch": 0.2086536728344266,
      "grad_norm": 0.07893738895654678,
      "learning_rate": 0.0002578565836251199,
      "loss": 9.1728,
      "step": 13312,
      "throughput": 17764.704666216006
    },
    {
      "epoch": 0.2091552441633555,
      "grad_norm": 0.08694314956665039,
      "learning_rate": 0.0002576573913096158,
      "loss": 9.1774,
      "step": 13344,
      "throughput": 17765.994528548148
    },
    {
      "epoch": 0.20965681549228443,
      "grad_norm": 0.07928625494241714,
      "learning_rate": 0.00025745781687475534,
      "loss": 9.1696,
      "step": 13376,
      "throughput": 17766.961987929495
    },
    {
      "epoch": 0.21015838682121332,
      "grad_norm": 0.0769500657916069,
      "learning_rate": 0.000257257861143584,
      "loss": 9.1649,
      "step": 13408,
      "throughput": 17768.194245190854
    },
    {
      "epoch": 0.21065995815014224,
      "grad_norm": 0.08769873529672623,
      "learning_rate": 0.00025705752494071995,
      "loss": 9.1646,
      "step": 13440,
      "throughput": 17768.8347317131
    },
    {
      "epoch": 0.21116152947907116,
      "grad_norm": 0.07801324874162674,
      "learning_rate": 0.0002568568090923501,
      "loss": 9.1581,
      "step": 13472,
      "throughput": 17768.84446426064
    },
    {
      "epoch": 0.21166310080800008,
      "grad_norm": 0.07609668374061584,
      "learning_rate": 0.0002566557144262273,
      "loss": 9.1687,
      "step": 13504,
      "throughput": 17768.408898357007
    },
    {
      "epoch": 0.21216467213692897,
      "grad_norm": 0.09769364446401596,
      "learning_rate": 0.00025645424177166663,
      "loss": 9.1753,
      "step": 13536,
      "throughput": 17769.075062014672
    },
    {
      "epoch": 0.21266624346585788,
      "grad_norm": 0.08589643239974976,
      "learning_rate": 0.0002562523919595418,
      "loss": 9.163,
      "step": 13568,
      "throughput": 17769.10919087997
    },
    {
      "epoch": 0.2131678147947868,
      "grad_norm": 0.07752927392721176,
      "learning_rate": 0.0002560501658222821,
      "loss": 9.1467,
      "step": 13600,
      "throughput": 17768.465756585483
    },
    {
      "epoch": 0.2136693861237157,
      "grad_norm": 0.0796670913696289,
      "learning_rate": 0.0002558475641938686,
      "loss": 9.1517,
      "step": 13632,
      "throughput": 17768.478881713232
    },
    {
      "epoch": 0.2141709574526446,
      "grad_norm": 0.08221141248941422,
      "learning_rate": 0.00025564458790983114,
      "loss": 9.1704,
      "step": 13664,
      "throughput": 17769.42073647477
    },
    {
      "epoch": 0.21467252878157353,
      "grad_norm": 0.08998142182826996,
      "learning_rate": 0.0002554412378072445,
      "loss": 9.1552,
      "step": 13696,
      "throughput": 17770.666764438174
    },
    {
      "epoch": 0.21517410011050242,
      "grad_norm": 0.07725099474191666,
      "learning_rate": 0.0002552375147247251,
      "loss": 9.1454,
      "step": 13728,
      "throughput": 17771.883556423592
    },
    {
      "epoch": 0.21567567143943134,
      "grad_norm": 0.07587715983390808,
      "learning_rate": 0.0002550334195024275,
      "loss": 9.1491,
      "step": 13760,
      "throughput": 17771.352719333623
    },
    {
      "epoch": 0.21617724276836026,
      "grad_norm": 0.07606406509876251,
      "learning_rate": 0.00025482895298204096,
      "loss": 9.1445,
      "step": 13792,
      "throughput": 17771.476023673233
    },
    {
      "epoch": 0.21667881409728917,
      "grad_norm": 0.08447536081075668,
      "learning_rate": 0.0002546241160067861,
      "loss": 9.1462,
      "step": 13824,
      "throughput": 17771.204977664245
    },
    {
      "epoch": 0.21718038542621806,
      "grad_norm": 0.08394841849803925,
      "learning_rate": 0.00025441890942141124,
      "loss": 9.1635,
      "step": 13856,
      "throughput": 17771.853567391616
    },
    {
      "epoch": 0.21768195675514698,
      "grad_norm": 0.07255728542804718,
      "learning_rate": 0.00025421333407218884,
      "loss": 9.159,
      "step": 13888,
      "throughput": 17771.198852016052
    },
    {
      "epoch": 0.2181835280840759,
      "grad_norm": 0.0749092549085617,
      "learning_rate": 0.0002540073908069124,
      "loss": 9.1475,
      "step": 13920,
      "throughput": 17770.585107716557
    },
    {
      "epoch": 0.2186850994130048,
      "grad_norm": 0.0870911180973053,
      "learning_rate": 0.0002538010804748924,
      "loss": 9.1077,
      "step": 13952,
      "throughput": 17771.804267479227
    },
    {
      "epoch": 0.2191866707419337,
      "grad_norm": 0.08277792483568192,
      "learning_rate": 0.0002535944039269533,
      "loss": 9.1564,
      "step": 13984,
      "throughput": 17772.738838380068
    },
    {
      "epoch": 0.21968824207086263,
      "grad_norm": 0.07313404232263565,
      "learning_rate": 0.0002533873620154299,
      "loss": 9.1437,
      "step": 14016,
      "throughput": 17773.947692743845
    },
    {
      "epoch": 0.22018981339979155,
      "grad_norm": 0.09033776819705963,
      "learning_rate": 0.0002531799555941635,
      "loss": 9.1409,
      "step": 14048,
      "throughput": 17774.267928277182
    },
    {
      "epoch": 0.22069138472872044,
      "grad_norm": 0.07937667518854141,
      "learning_rate": 0.00025297218551849885,
      "loss": 9.1185,
      "step": 14080,
      "throughput": 17774.06221653404
    },
    {
      "epoch": 0.22119295605764935,
      "grad_norm": 0.08419618755578995,
      "learning_rate": 0.00025276405264528044,
      "loss": 9.1448,
      "step": 14112,
      "throughput": 17773.220136563268
    },
    {
      "epoch": 0.22169452738657827,
      "grad_norm": 0.07552050799131393,
      "learning_rate": 0.00025255555783284877,
      "loss": 9.1397,
      "step": 14144,
      "throughput": 17774.375940588343
    },
    {
      "epoch": 0.22219609871550716,
      "grad_norm": 0.07970742881298065,
      "learning_rate": 0.0002523467019410371,
      "loss": 9.1421,
      "step": 14176,
      "throughput": 17774.247620828053
    },
    {
      "epoch": 0.22269767004443608,
      "grad_norm": 0.08620994538068771,
      "learning_rate": 0.00025213748583116776,
      "loss": 9.1541,
      "step": 14208,
      "throughput": 17773.484717298426
    },
    {
      "epoch": 0.223199241373365,
      "grad_norm": 0.07768417149782181,
      "learning_rate": 0.0002519279103660486,
      "loss": 9.1098,
      "step": 14240,
      "throughput": 17773.73404081062
    },
    {
      "epoch": 0.22370081270229392,
      "grad_norm": 0.0926952138543129,
      "learning_rate": 0.0002517179764099694,
      "loss": 9.1067,
      "step": 14272,
      "throughput": 17774.638398139723
    },
    {
      "epoch": 0.2242023840312228,
      "grad_norm": 0.08634334802627563,
      "learning_rate": 0.00025150768482869846,
      "loss": 9.1326,
      "step": 14304,
      "throughput": 17775.821062374456
    },
    {
      "epoch": 0.22470395536015172,
      "grad_norm": 0.07841115444898605,
      "learning_rate": 0.0002512970364894789,
      "loss": 9.1293,
      "step": 14336,
      "throughput": 17776.7217812672
    },
    {
      "epoch": 0.22520552668908064,
      "grad_norm": 0.08082298189401627,
      "learning_rate": 0.00025108603226102515,
      "loss": 9.135,
      "step": 14368,
      "throughput": 17773.678824021947
    },
    {
      "epoch": 0.22570709801800953,
      "grad_norm": 0.07413540780544281,
      "learning_rate": 0.0002508746730135191,
      "loss": 9.1279,
      "step": 14400,
      "throughput": 17773.24651322649
    },
    {
      "epoch": 0.22620866934693845,
      "grad_norm": 0.08838255703449249,
      "learning_rate": 0.00025066295961860704,
      "loss": 9.1154,
      "step": 14432,
      "throughput": 17773.554209760005
    },
    {
      "epoch": 0.22671024067586737,
      "grad_norm": 0.08930821716785431,
      "learning_rate": 0.0002504508929493957,
      "loss": 9.1236,
      "step": 14464,
      "throughput": 17773.91288923869
    },
    {
      "epoch": 0.2272118120047963,
      "grad_norm": 0.0821206346154213,
      "learning_rate": 0.00025023847388044846,
      "loss": 9.1102,
      "step": 14496,
      "throughput": 17773.57951423693
    },
    {
      "epoch": 0.22771338333372518,
      "grad_norm": 0.08669077605009079,
      "learning_rate": 0.0002500257032877823,
      "loss": 9.1255,
      "step": 14528,
      "throughput": 17772.976340971953
    },
    {
      "epoch": 0.2282149546626541,
      "grad_norm": 0.08576954901218414,
      "learning_rate": 0.0002498125820488639,
      "loss": 9.1264,
      "step": 14560,
      "throughput": 17774.128793335887
    },
    {
      "epoch": 0.22871652599158301,
      "grad_norm": 0.08853522688150406,
      "learning_rate": 0.00024959911104260565,
      "loss": 9.1215,
      "step": 14592,
      "throughput": 17775.021770221105
    },
    {
      "epoch": 0.2292180973205119,
      "grad_norm": 0.08846239745616913,
      "learning_rate": 0.00024938529114936273,
      "loss": 9.133,
      "step": 14624,
      "throughput": 17776.182280312998
    },
    {
      "epoch": 0.22971966864944082,
      "grad_norm": 0.08885422348976135,
      "learning_rate": 0.000249171123250929,
      "loss": 9.127,
      "step": 14656,
      "throughput": 17776.489694815533
    },
    {
      "epoch": 0.23022123997836974,
      "grad_norm": 0.082110695540905,
      "learning_rate": 0.00024895660823053353,
      "loss": 9.1152,
      "step": 14688,
      "throughput": 17776.58724845879
    },
    {
      "epoch": 0.23072281130729866,
      "grad_norm": 0.07240453362464905,
      "learning_rate": 0.00024874174697283685,
      "loss": 9.1367,
      "step": 14720,
      "throughput": 17775.82717047677
    },
    {
      "epoch": 0.23122438263622755,
      "grad_norm": 0.07088429480791092,
      "learning_rate": 0.0002485265403639275,
      "loss": 9.1151,
      "step": 14752,
      "throughput": 17776.735107879464
    },
    {
      "epoch": 0.23172595396515647,
      "grad_norm": 0.07287611812353134,
      "learning_rate": 0.0002483109892913181,
      "loss": 9.1342,
      "step": 14784,
      "throughput": 17776.719121417333
    },
    {
      "epoch": 0.23222752529408539,
      "grad_norm": 0.08337391912937164,
      "learning_rate": 0.0002480950946439419,
      "loss": 9.1155,
      "step": 14816,
      "throughput": 17775.983219174625
    },
    {
      "epoch": 0.23272909662301428,
      "grad_norm": 0.08093888312578201,
      "learning_rate": 0.0002478788573121491,
      "loss": 9.0951,
      "step": 14848,
      "throughput": 17776.824699384837
    },
    {
      "epoch": 0.2332306679519432,
      "grad_norm": 0.0842365100979805,
      "learning_rate": 0.0002476622781877031,
      "loss": 9.0956,
      "step": 14880,
      "throughput": 17777.474284685857
    },
    {
      "epoch": 0.2337322392808721,
      "grad_norm": 0.07578255981206894,
      "learning_rate": 0.0002474453581637769,
      "loss": 9.112,
      "step": 14912,
      "throughput": 17778.586747839425
    },
    {
      "epoch": 0.23423381060980103,
      "grad_norm": 0.07239358872175217,
      "learning_rate": 0.00024722809813494933,
      "loss": 9.1123,
      "step": 14944,
      "throughput": 17779.466436439874
    },
    {
      "epoch": 0.23473538193872992,
      "grad_norm": 0.0819094255566597,
      "learning_rate": 0.00024701049899720123,
      "loss": 9.1053,
      "step": 14976,
      "throughput": 17779.444766337834
    },
    {
      "epoch": 0.23523695326765884,
      "grad_norm": 0.08021069318056107,
      "learning_rate": 0.0002467925616479122,
      "loss": 9.1304,
      "step": 15008,
      "throughput": 17779.002898580602
    },
    {
      "epoch": 0.23573852459658776,
      "grad_norm": 0.08905062824487686,
      "learning_rate": 0.0002465742869858566,
      "loss": 9.1156,
      "step": 15040,
      "throughput": 17779.548275387704
    },
    {
      "epoch": 0.23624009592551665,
      "grad_norm": 0.07807237654924393,
      "learning_rate": 0.0002463556759111996,
      "loss": 9.0916,
      "step": 15072,
      "throughput": 17779.240741012156
    },
    {
      "epoch": 0.23674166725444556,
      "grad_norm": 0.08749555796384811,
      "learning_rate": 0.00024613672932549403,
      "loss": 9.1059,
      "step": 15104,
      "throughput": 17778.825360570343
    },
    {
      "epoch": 0.23724323858337448,
      "grad_norm": 0.07883931696414948,
      "learning_rate": 0.00024591744813167625,
      "loss": 9.0943,
      "step": 15136,
      "throughput": 17779.06169946807
    },
    {
      "epoch": 0.2377448099123034,
      "grad_norm": 0.07457397878170013,
      "learning_rate": 0.00024569783323406255,
      "loss": 9.0915,
      "step": 15168,
      "throughput": 17779.64674349068
    },
    {
      "epoch": 0.2382463812412323,
      "grad_norm": 0.07606396824121475,
      "learning_rate": 0.00024547788553834536,
      "loss": 9.1019,
      "step": 15200,
      "throughput": 17780.496551560656
    },
    {
      "epoch": 0.2387479525701612,
      "grad_norm": 0.08174512535333633,
      "learning_rate": 0.00024525760595158977,
      "loss": 9.1145,
      "step": 15232,
      "throughput": 17781.56913269253
    },
    {
      "epoch": 0.23924952389909013,
      "grad_norm": 0.07189542055130005,
      "learning_rate": 0.0002450369953822293,
      "loss": 9.102,
      "step": 15264,
      "throughput": 17781.8745032226
    },
    {
      "epoch": 0.23975109522801902,
      "grad_norm": 0.07562036067247391,
      "learning_rate": 0.0002448160547400627,
      "loss": 9.0934,
      "step": 15296,
      "throughput": 17781.990741428832
    },
    {
      "epoch": 0.24025266655694794,
      "grad_norm": 0.07461810111999512,
      "learning_rate": 0.00024459478493624973,
      "loss": 9.0946,
      "step": 15328,
      "throughput": 17781.57031575691
    },
    {
      "epoch": 0.24075423788587685,
      "grad_norm": 0.07911355048418045,
      "learning_rate": 0.0002443731868833078,
      "loss": 9.097,
      "step": 15360,
      "throughput": 17781.693753365977
    },
    {
      "epoch": 0.24125580921480577,
      "grad_norm": 0.07777303457260132,
      "learning_rate": 0.0002441512614951079,
      "loss": 9.0662,
      "step": 15392,
      "throughput": 17781.314575011525
    },
    {
      "epoch": 0.24175738054373466,
      "grad_norm": 0.07520275563001633,
      "learning_rate": 0.00024392900968687103,
      "loss": 9.1051,
      "step": 15424,
      "throughput": 17781.036177664922
    },
    {
      "epoch": 0.24225895187266358,
      "grad_norm": 0.07553788274526596,
      "learning_rate": 0.00024370643237516426,
      "loss": 9.0889,
      "step": 15456,
      "throughput": 17782.035424535097
    },
    {
      "epoch": 0.2427605232015925,
      "grad_norm": 0.07838892936706543,
      "learning_rate": 0.00024348353047789708,
      "loss": 9.109,
      "step": 15488,
      "throughput": 17782.28704776579
    },
    {
      "epoch": 0.2432620945305214,
      "grad_norm": 0.0778832957148552,
      "learning_rate": 0.0002432603049143176,
      "loss": 9.1041,
      "step": 15520,
      "throughput": 17783.362660835668
    },
    {
      "epoch": 0.2437636658594503,
      "grad_norm": 0.07655086368322372,
      "learning_rate": 0.0002430367566050087,
      "loss": 9.0952,
      "step": 15552,
      "throughput": 17784.176518185708
    },
    {
      "epoch": 0.24426523718837923,
      "grad_norm": 0.08006106317043304,
      "learning_rate": 0.00024281288647188425,
      "loss": 9.0803,
      "step": 15584,
      "throughput": 17784.009025098036
    },
    {
      "epoch": 0.24476680851730814,
      "grad_norm": 0.07509180903434753,
      "learning_rate": 0.00024258869543818535,
      "loss": 9.0623,
      "step": 15616,
      "throughput": 17783.405734593525
    },
    {
      "epoch": 0.24526837984623703,
      "grad_norm": 0.07064583152532578,
      "learning_rate": 0.00024236418442847652,
      "loss": 9.1093,
      "step": 15648,
      "throughput": 17784.47659002254
    },
    {
      "epoch": 0.24576995117516595,
      "grad_norm": 0.0784299373626709,
      "learning_rate": 0.0002421393543686418,
      "loss": 9.0985,
      "step": 15680,
      "throughput": 17784.183071227122
    },
    {
      "epoch": 0.24627152250409487,
      "grad_norm": 0.07890692353248596,
      "learning_rate": 0.00024191420618588103,
      "loss": 9.0849,
      "step": 15712,
      "throughput": 17784.028872968156
    },
    {
      "epoch": 0.24677309383302376,
      "grad_norm": 0.07363320887088776,
      "learning_rate": 0.000241688740808706,
      "loss": 9.0832,
      "step": 15744,
      "throughput": 17784.2727540409
    },
    {
      "epoch": 0.24727466516195268,
      "grad_norm": 0.0745813325047493,
      "learning_rate": 0.0002414629591669366,
      "loss": 9.084,
      "step": 15776,
      "throughput": 17784.80146127646
    },
    {
      "epoch": 0.2477762364908816,
      "grad_norm": 0.07824543118476868,
      "learning_rate": 0.0002412368621916969,
      "loss": 9.0837,
      "step": 15808,
      "throughput": 17785.59480057255
    },
    {
      "epoch": 0.2482778078198105,
      "grad_norm": 0.08016502857208252,
      "learning_rate": 0.0002410104508154116,
      "loss": 9.0857,
      "step": 15840,
      "throughput": 17786.62932005166
    },
    {
      "epoch": 0.2487793791487394,
      "grad_norm": 0.08209695667028427,
      "learning_rate": 0.00024078372597180183,
      "loss": 9.0813,
      "step": 15872,
      "throughput": 17786.653180076366
    },
    {
      "epoch": 0.24928095047766832,
      "grad_norm": 0.0860067754983902,
      "learning_rate": 0.00024055668859588157,
      "loss": 9.062,
      "step": 15904,
      "throughput": 17786.153065215716
    },
    {
      "epoch": 0.24978252180659724,
      "grad_norm": 0.07782717794179916,
      "learning_rate": 0.0002403293396239536,
      "loss": 9.0747,
      "step": 15936,
      "throughput": 17786.550824802533
    },
    {
      "epoch": 0.25028409313552613,
      "grad_norm": 0.08756222575902939,
      "learning_rate": 0.00024010167999360575,
      "loss": 9.0941,
      "step": 15968,
      "throughput": 17786.37941455417
    },
    {
      "epoch": 0.25078566446445505,
      "grad_norm": 0.07098475843667984,
      "learning_rate": 0.00023987371064370698,
      "loss": 9.0685,
      "step": 16000,
      "throughput": 17786.590935662654
    },
    {
      "epoch": 0.25128723579338397,
      "grad_norm": 0.07569174468517303,
      "learning_rate": 0.00023964543251440363,
      "loss": 9.0477,
      "step": 16032,
      "throughput": 17786.305093450927
    },
    {
      "epoch": 0.2517888071223129,
      "grad_norm": 0.08683817833662033,
      "learning_rate": 0.00023941684654711534,
      "loss": 9.0784,
      "step": 16064,
      "throughput": 17787.327044746777
    },
    {
      "epoch": 0.2522903784512418,
      "grad_norm": 0.07383278012275696,
      "learning_rate": 0.0002391879536845313,
      "loss": 9.075,
      "step": 16096,
      "throughput": 17787.623320426683
    },
    {
      "epoch": 0.25279194978017067,
      "grad_norm": 0.08801043033599854,
      "learning_rate": 0.0002389587548706064,
      "loss": 9.0607,
      "step": 16128,
      "throughput": 17788.640494622367
    },
    {
      "epoch": 0.2532935211090996,
      "grad_norm": 0.07545439898967743,
      "learning_rate": 0.0002387292510505572,
      "loss": 9.0544,
      "step": 16160,
      "throughput": 17789.424450237166
    },
    {
      "epoch": 0.2537950924380285,
      "grad_norm": 0.08476603776216507,
      "learning_rate": 0.00023849944317085812,
      "loss": 9.0844,
      "step": 16192,
      "throughput": 17788.941098313495
    },
    {
      "epoch": 0.2542966637669574,
      "grad_norm": 0.08219964802265167,
      "learning_rate": 0.0002382693321792376,
      "loss": 9.0726,
      "step": 16224,
      "throughput": 17788.35583765858
    },
    {
      "epoch": 0.25479823509588634,
      "grad_norm": 0.07952090352773666,
      "learning_rate": 0.00023803891902467406,
      "loss": 9.0836,
      "step": 16256,
      "throughput": 17789.353238515407
    },
    {
      "epoch": 0.25529980642481526,
      "grad_norm": 0.07744229584932327,
      "learning_rate": 0.0002378082046573919,
      "loss": 9.0545,
      "step": 16288,
      "throughput": 17788.79759968533
    },
    {
      "epoch": 0.2558013777537442,
      "grad_norm": 0.07623753696680069,
      "learning_rate": 0.00023757719002885793,
      "loss": 9.0517,
      "step": 16320,
      "throughput": 17788.39778661232
    },
    {
      "epoch": 0.25630294908267304,
      "grad_norm": 0.07981929183006287,
      "learning_rate": 0.00023734587609177725,
      "loss": 9.0688,
      "step": 16352,
      "throughput": 17789.159511362217
    },
    {
      "epoch": 0.25680452041160196,
      "grad_norm": 0.08757334202528,
      "learning_rate": 0.000237114263800089,
      "loss": 9.0724,
      "step": 16384,
      "throughput": 17789.704377586713
    },
    {
      "epoch": 0.2573060917405309,
      "grad_norm": 0.07343178987503052,
      "learning_rate": 0.0002368823541089632,
      "loss": 9.0719,
      "step": 16416,
      "throughput": 17788.00287521869
    },
    {
      "epoch": 0.2578076630694598,
      "grad_norm": 0.07238256186246872,
      "learning_rate": 0.00023665014797479602,
      "loss": 9.0621,
      "step": 16448,
      "throughput": 17789.03773857688
    },
    {
      "epoch": 0.2583092343983887,
      "grad_norm": 0.07762811332941055,
      "learning_rate": 0.00023641764635520617,
      "loss": 9.0527,
      "step": 16480,
      "throughput": 17788.906456268425
    },
    {
      "epoch": 0.2588108057273176,
      "grad_norm": 0.0722971260547638,
      "learning_rate": 0.0002361848502090311,
      "loss": 9.0536,
      "step": 16512,
      "throughput": 17788.41421709581
    },
    {
      "epoch": 0.25931237705624655,
      "grad_norm": 0.07550311833620071,
      "learning_rate": 0.0002359517604963228,
      "loss": 9.0705,
      "step": 16544,
      "throughput": 17788.812063779653
    },
    {
      "epoch": 0.2598139483851754,
      "grad_norm": 0.08055978268384933,
      "learning_rate": 0.0002357183781783439,
      "loss": 9.0531,
      "step": 16576,
      "throughput": 17788.431551853282
    },
    {
      "epoch": 0.2603155197141043,
      "grad_norm": 0.08193892985582352,
      "learning_rate": 0.0002354847042175638,
      "loss": 9.0396,
      "step": 16608,
      "throughput": 17788.409081313366
    },
    {
      "epoch": 0.26081709104303324,
      "grad_norm": 0.07315579056739807,
      "learning_rate": 0.0002352507395776546,
      "loss": 9.0484,
      "step": 16640,
      "throughput": 17788.41089261413
    },
    {
      "epoch": 0.26131866237196216,
      "grad_norm": 0.07734058052301407,
      "learning_rate": 0.00023501648522348715,
      "loss": 9.0634,
      "step": 16672,
      "throughput": 17789.22038944418
    },
    {
      "epoch": 0.2618202337008911,
      "grad_norm": 0.07491450756788254,
      "learning_rate": 0.0002347819421211271,
      "loss": 9.0661,
      "step": 16704,
      "throughput": 17789.977336385524
    },
    {
      "epoch": 0.26232180502982,
      "grad_norm": 0.07738371938467026,
      "learning_rate": 0.00023454711123783092,
      "loss": 9.0493,
      "step": 16736,
      "throughput": 17790.974012187293
    },
    {
      "epoch": 0.2628233763587489,
      "grad_norm": 0.07885603606700897,
      "learning_rate": 0.00023431199354204192,
      "loss": 9.0511,
      "step": 16768,
      "throughput": 17791.490265431854
    },
    {
      "epoch": 0.2633249476876778,
      "grad_norm": 0.07946806401014328,
      "learning_rate": 0.00023407659000338607,
      "loss": 9.0613,
      "step": 16800,
      "throughput": 17791.105061305985
    },
    {
      "epoch": 0.2638265190166067,
      "grad_norm": 0.07436434924602509,
      "learning_rate": 0.00023384090159266833,
      "loss": 9.056,
      "step": 16832,
      "throughput": 17790.75878448174
    },
    {
      "epoch": 0.2643280903455356,
      "grad_norm": 0.07219117879867554,
      "learning_rate": 0.00023360492928186838,
      "loss": 9.0472,
      "step": 16864,
      "throughput": 17791.095584204028
    },
    {
      "epoch": 0.26482966167446453,
      "grad_norm": 0.0752549022436142,
      "learning_rate": 0.00023336867404413674,
      "loss": 9.0551,
      "step": 16896,
      "throughput": 17790.32914453402
    },
    {
      "epoch": 0.26533123300339345,
      "grad_norm": 0.07986687868833542,
      "learning_rate": 0.0002331321368537907,
      "loss": 9.0342,
      "step": 16928,
      "throughput": 17790.30320047984
    },
    {
      "epoch": 0.26583280433232237,
      "grad_norm": 0.07936165481805801,
      "learning_rate": 0.0002328953186863103,
      "loss": 9.0429,
      "step": 16960,
      "throughput": 17791.25445204999
    },
    {
      "epoch": 0.2663343756612513,
      "grad_norm": 0.07726788520812988,
      "learning_rate": 0.00023265822051833442,
      "loss": 9.0546,
      "step": 16992,
      "throughput": 17792.094337143873
    },
    {
      "epoch": 0.26683594699018015,
      "grad_norm": 0.07570113986730576,
      "learning_rate": 0.00023242084332765662,
      "loss": 9.0216,
      "step": 17024,
      "throughput": 17792.85045569564
    },
    {
      "epoch": 0.26733751831910907,
      "grad_norm": 0.07200782001018524,
      "learning_rate": 0.0002321831880932211,
      "loss": 9.0211,
      "step": 17056,
      "throughput": 17793.602824151738
    },
    {
      "epoch": 0.267839089648038,
      "grad_norm": 0.0791451558470726,
      "learning_rate": 0.00023194525579511876,
      "loss": 9.0484,
      "step": 17088,
      "throughput": 17793.50755081012
    },
    {
      "epoch": 0.2683406609769669,
      "grad_norm": 0.07487615942955017,
      "learning_rate": 0.00023170704741458308,
      "loss": 9.0559,
      "step": 17120,
      "throughput": 17793.055755669182
    },
    {
      "epoch": 0.2688422323058958,
      "grad_norm": 0.0783098042011261,
      "learning_rate": 0.00023146856393398615,
      "loss": 9.0623,
      "step": 17152,
      "throughput": 17793.38732326507
    },
    {
      "epoch": 0.26934380363482474,
      "grad_norm": 0.07308696955442429,
      "learning_rate": 0.0002312298063368346,
      "loss": 9.05,
      "step": 17184,
      "throughput": 17792.883408825306
    },
    {
      "epoch": 0.26984537496375366,
      "grad_norm": 0.080522820353508,
      "learning_rate": 0.00023099077560776536,
      "loss": 9.0426,
      "step": 17216,
      "throughput": 17792.804267937678
    },
    {
      "epoch": 0.2703469462926825,
      "grad_norm": 0.07421231269836426,
      "learning_rate": 0.00023075147273254195,
      "loss": 9.0254,
      "step": 17248,
      "throughput": 17793.020427664535
    },
    {
      "epoch": 0.27084851762161144,
      "grad_norm": 0.0802403911948204,
      "learning_rate": 0.0002305118986980501,
      "loss": 9.0529,
      "step": 17280,
      "throughput": 17793.81386712117
    },
    {
      "epoch": 0.27135008895054036,
      "grad_norm": 0.07557597011327744,
      "learning_rate": 0.00023027205449229388,
      "loss": 9.0459,
      "step": 17312,
      "throughput": 17794.53885751692
    },
    {
      "epoch": 0.2718516602794693,
      "grad_norm": 0.07545724511146545,
      "learning_rate": 0.00023003194110439145,
      "loss": 9.0391,
      "step": 17344,
      "throughput": 17795.47098513452
    },
    {
      "epoch": 0.2723532316083982,
      "grad_norm": 0.07856345921754837,
      "learning_rate": 0.00022979155952457118,
      "loss": 9.0403,
      "step": 17376,
      "throughput": 17795.81009923486
    },
    {
      "epoch": 0.2728548029373271,
      "grad_norm": 0.08439384400844574,
      "learning_rate": 0.00022955091074416733,
      "loss": 9.0333,
      "step": 17408,
      "throughput": 17795.18094316347
    },
    {
      "epoch": 0.27335637426625603,
      "grad_norm": 0.06949391961097717,
      "learning_rate": 0.0002293099957556163,
      "loss": 9.0498,
      "step": 17440,
      "throughput": 17795.422632099973
    },
    {
      "epoch": 0.2738579455951849,
      "grad_norm": 0.07778200507164001,
      "learning_rate": 0.00022906881555245212,
      "loss": 8.9999,
      "step": 17472,
      "throughput": 17795.314290608025
    },
    {
      "epoch": 0.2743595169241138,
      "grad_norm": 0.07648744434118271,
      "learning_rate": 0.0002288273711293028,
      "loss": 9.0106,
      "step": 17504,
      "throughput": 17795.054320179566
    },
    {
      "epoch": 0.27486108825304273,
      "grad_norm": 0.07675327360630035,
      "learning_rate": 0.00022858566348188568,
      "loss": 9.0532,
      "step": 17536,
      "throughput": 17795.260115755576
    },
    {
      "epoch": 0.27536265958197165,
      "grad_norm": 0.07408589869737625,
      "learning_rate": 0.00022834369360700394,
      "loss": 9.0316,
      "step": 17568,
      "throughput": 17796.207604447347
    },
    {
      "epoch": 0.27586423091090057,
      "grad_norm": 0.08209360390901566,
      "learning_rate": 0.00022810146250254196,
      "loss": 9.0422,
      "step": 17600,
      "throughput": 17797.01285422199
    },
    {
      "epoch": 0.2763658022398295,
      "grad_norm": 0.08367981016635895,
      "learning_rate": 0.00022785897116746166,
      "loss": 9.0116,
      "step": 17632,
      "throughput": 17797.561393204396
    },
    {
      "epoch": 0.2768673735687584,
      "grad_norm": 0.07375655323266983,
      "learning_rate": 0.00022761622060179793,
      "loss": 9.0219,
      "step": 17664,
      "throughput": 17798.27372285632
    },
    {
      "epoch": 0.27736894489768726,
      "grad_norm": 0.08418888598680496,
      "learning_rate": 0.00022737321180665488,
      "loss": 9.0296,
      "step": 17696,
      "throughput": 17798.215734831665
    },
    {
      "epoch": 0.2778705162266162,
      "grad_norm": 0.07620551437139511,
      "learning_rate": 0.00022712994578420143,
      "loss": 9.0558,
      "step": 17728,
      "throughput": 17797.602086991534
    },
    {
      "epoch": 0.2783720875555451,
      "grad_norm": 0.07998618483543396,
      "learning_rate": 0.00022688642353766746,
      "loss": 9.0022,
      "step": 17760,
      "throughput": 17797.891019187224
    },
    {
      "epoch": 0.278873658884474,
      "grad_norm": 0.08344753086566925,
      "learning_rate": 0.00022664264607133937,
      "loss": 9.0276,
      "step": 17792,
      "throughput": 17797.38464950342
    },
    {
      "epoch": 0.27937523021340294,
      "grad_norm": 0.08326306939125061,
      "learning_rate": 0.00022639861439055617,
      "loss": 9.0268,
      "step": 17824,
      "throughput": 17797.49075609581
    },
    {
      "epoch": 0.27987680154233185,
      "grad_norm": 0.08046148717403412,
      "learning_rate": 0.00022615432950170528,
      "loss": 9.0105,
      "step": 17856,
      "throughput": 17797.925671450626
    },
    {
      "epoch": 0.2803783728712608,
      "grad_norm": 0.07936503738164902,
      "learning_rate": 0.00022590979241221825,
      "loss": 9.0122,
      "step": 17888,
      "throughput": 17798.723556141715
    },
    {
      "epoch": 0.28087994420018964,
      "grad_norm": 0.0750616043806076,
      "learning_rate": 0.00022566500413056677,
      "loss": 8.9999,
      "step": 17920,
      "throughput": 17799.426296168647
    },
    {
      "epoch": 0.28138151552911855,
      "grad_norm": 0.08700020611286163,
      "learning_rate": 0.00022541996566625841,
      "loss": 9.027,
      "step": 17952,
      "throughput": 17800.17336464227
    },
    {
      "epoch": 0.28188308685804747,
      "grad_norm": 0.0861596092581749,
      "learning_rate": 0.00022517467802983266,
      "loss": 9.0228,
      "step": 17984,
      "throughput": 17800.27227871042
    },
    {
      "epoch": 0.2823846581869764,
      "grad_norm": 0.18219846487045288,
      "learning_rate": 0.0002249291422328563,
      "loss": 9.0184,
      "step": 18016,
      "throughput": 17799.87955037822
    },
    {
      "epoch": 0.2828862295159053,
      "grad_norm": 0.07688847929239273,
      "learning_rate": 0.00022468335928791977,
      "loss": 9.0133,
      "step": 18048,
      "throughput": 17799.79042344303
    },
    {
      "epoch": 0.2833878008448342,
      "grad_norm": 0.07489870488643646,
      "learning_rate": 0.00022443733020863262,
      "loss": 9.002,
      "step": 18080,
      "throughput": 17799.522150841196
    },
    {
      "epoch": 0.28388937217376314,
      "grad_norm": 0.07628989219665527,
      "learning_rate": 0.00022419105600961955,
      "loss": 9.0073,
      "step": 18112,
      "throughput": 17799.63748606668
    },
    {
      "epoch": 0.284390943502692,
      "grad_norm": 0.08326411247253418,
      "learning_rate": 0.00022394453770651607,
      "loss": 9.017,
      "step": 18144,
      "throughput": 17799.836091760237
    },
    {
      "epoch": 0.2848925148316209,
      "grad_norm": 0.07968247681856155,
      "learning_rate": 0.00022369777631596436,
      "loss": 8.9849,
      "step": 18176,
      "throughput": 17800.613434453368
    },
    {
      "epoch": 0.28539408616054984,
      "grad_norm": 0.07687773555517197,
      "learning_rate": 0.00022345077285560914,
      "loss": 9.0194,
      "step": 18208,
      "throughput": 17801.295220398395
    },
    {
      "epoch": 0.28589565748947876,
      "grad_norm": 0.07676286995410919,
      "learning_rate": 0.00022320352834409343,
      "loss": 9.0141,
      "step": 18240,
      "throughput": 17802.02864196113
    },
    {
      "epoch": 0.2863972288184077,
      "grad_norm": 0.08311989158391953,
      "learning_rate": 0.0002229560438010543,
      "loss": 8.9896,
      "step": 18272,
      "throughput": 17802.725834806886
    },
    {
      "epoch": 0.2868988001473366,
      "grad_norm": 0.08136092871427536,
      "learning_rate": 0.00022270832024711882,
      "loss": 9.0212,
      "step": 18304,
      "throughput": 17802.193303962875
    },
    {
      "epoch": 0.2874003714762655,
      "grad_norm": 0.06818056106567383,
      "learning_rate": 0.00022246035870389952,
      "loss": 8.9862,
      "step": 18336,
      "throughput": 17801.6183171822
    },
    {
      "epoch": 0.2879019428051944,
      "grad_norm": 0.0861053392291069,
      "learning_rate": 0.00022221216019399067,
      "loss": 9.0066,
      "step": 18368,
      "throughput": 17801.87988758641
    },
    {
      "epoch": 0.2884035141341233,
      "grad_norm": 0.07756201177835464,
      "learning_rate": 0.00022196372574096357,
      "loss": 8.9965,
      "step": 18400,
      "throughput": 17801.849397082373
    },
    {
      "epoch": 0.2889050854630522,
      "grad_norm": 0.08043848723173141,
      "learning_rate": 0.00022171505636936272,
      "loss": 9.0147,
      "step": 18432,
      "throughput": 17801.73744307053
    },
    {
      "epoch": 0.28940665679198113,
      "grad_norm": 0.07838977873325348,
      "learning_rate": 0.00022146615310470125,
      "loss": 9.0194,
      "step": 18464,
      "throughput": 17800.34864497776
    },
    {
      "epoch": 0.28990822812091005,
      "grad_norm": 0.07929672300815582,
      "learning_rate": 0.0002212170169734571,
      "loss": 9.0049,
      "step": 18496,
      "throughput": 17801.112254143158
    },
    {
      "epoch": 0.29040979944983897,
      "grad_norm": 0.08226138353347778,
      "learning_rate": 0.0002209676490030683,
      "loss": 8.9909,
      "step": 18528,
      "throughput": 17801.7744118373
    },
    {
      "epoch": 0.2909113707787679,
      "grad_norm": 0.0832921713590622,
      "learning_rate": 0.0002207180502219291,
      "loss": 9.0125,
      "step": 18560,
      "throughput": 17802.297882049992
    },
    {
      "epoch": 0.29141294210769675,
      "grad_norm": 0.07955853641033173,
      "learning_rate": 0.00022046822165938565,
      "loss": 8.9916,
      "step": 18592,
      "throughput": 17802.42861817214
    },
    {
      "epoch": 0.29191451343662567,
      "grad_norm": 0.07068558037281036,
      "learning_rate": 0.00022021816434573168,
      "loss": 8.9989,
      "step": 18624,
      "throughput": 17801.93784729312
    },
    {
      "epoch": 0.2924160847655546,
      "grad_norm": 0.07164032757282257,
      "learning_rate": 0.0002199678793122043,
      "loss": 9.0068,
      "step": 18656,
      "throughput": 17801.901509302465
    },
    {
      "epoch": 0.2929176560944835,
      "grad_norm": 0.07843668013811111,
      "learning_rate": 0.0002197173675909797,
      "loss": 8.9828,
      "step": 18688,
      "throughput": 17801.66895807248
    },
    {
      "epoch": 0.2934192274234124,
      "grad_norm": 0.07328581064939499,
      "learning_rate": 0.00021946663021516895,
      "loss": 9.0062,
      "step": 18720,
      "throughput": 17801.961052826777
    },
    {
      "epoch": 0.29392079875234134,
      "grad_norm": 0.08058661222457886,
      "learning_rate": 0.0002192156682188138,
      "loss": 9.017,
      "step": 18752,
      "throughput": 17802.15378270491
    },
    {
      "epoch": 0.29442237008127026,
      "grad_norm": 0.07562687247991562,
      "learning_rate": 0.00021896448263688224,
      "loss": 8.9795,
      "step": 18784,
      "throughput": 17802.902654737656
    },
    {
      "epoch": 0.2949239414101991,
      "grad_norm": 0.07606592029333115,
      "learning_rate": 0.00021871307450526428,
      "loss": 8.9991,
      "step": 18816,
      "throughput": 17803.556862594272
    },
    {
      "epoch": 0.29542551273912804,
      "grad_norm": 0.0791717916727066,
      "learning_rate": 0.00021846144486076794,
      "loss": 8.9762,
      "step": 18848,
      "throughput": 17804.259838788174
    },
    {
      "epoch": 0.29592708406805696,
      "grad_norm": 0.07694504410028458,
      "learning_rate": 0.00021820959474111448,
      "loss": 8.9946,
      "step": 18880,
      "throughput": 17804.901312659757
    },
    {
      "epoch": 0.2964286553969859,
      "grad_norm": 0.07806772738695145,
      "learning_rate": 0.00021795752518493462,
      "loss": 8.9957,
      "step": 18912,
      "throughput": 17804.04277097383
    },
    {
      "epoch": 0.2969302267259148,
      "grad_norm": 0.07575007528066635,
      "learning_rate": 0.0002177052372317639,
      "loss": 8.9875,
      "step": 18944,
      "throughput": 17803.77740896515
    },
    {
      "epoch": 0.2974317980548437,
      "grad_norm": 0.06974538415670395,
      "learning_rate": 0.00021745273192203871,
      "loss": 8.979,
      "step": 18976,
      "throughput": 17803.878554230883
    },
    {
      "epoch": 0.2979333693837726,
      "grad_norm": 0.07206161320209503,
      "learning_rate": 0.00021720001029709152,
      "loss": 8.9952,
      "step": 19008,
      "throughput": 17803.84715675061
    },
    {
      "epoch": 0.2984349407127015,
      "grad_norm": 0.07531093060970306,
      "learning_rate": 0.00021694707339914722,
      "loss": 8.9861,
      "step": 19040,
      "throughput": 17804.01661455889
    },
    {
      "epoch": 0.2989365120416304,
      "grad_norm": 0.07219666242599487,
      "learning_rate": 0.00021669392227131816,
      "loss": 8.9951,
      "step": 19072,
      "throughput": 17804.87759250451
    },
    {
      "epoch": 0.2994380833705593,
      "grad_norm": 0.07833381742238998,
      "learning_rate": 0.0002164405579576005,
      "loss": 9.0,
      "step": 19104,
      "throughput": 17805.427632362112
    },
    {
      "epoch": 0.29993965469948825,
      "grad_norm": 0.07842086255550385,
      "learning_rate": 0.0002161869815028694,
      "loss": 8.9848,
      "step": 19136,
      "throughput": 17805.883723570336
    },
    {
      "epoch": 0.30044122602841716,
      "grad_norm": 0.0783037543296814,
      "learning_rate": 0.00021593319395287483,
      "loss": 8.9859,
      "step": 19168,
      "throughput": 17806.746396470768
    },
    {
      "epoch": 0.3009427973573461,
      "grad_norm": 0.07206156104803085,
      "learning_rate": 0.0002156791963542374,
      "loss": 8.9808,
      "step": 19200,
      "throughput": 17806.462975098348
    },
    {
      "epoch": 0.30144436868627494,
      "grad_norm": 0.07156243175268173,
      "learning_rate": 0.00021542498975444404,
      "loss": 8.9698,
      "step": 19232,
      "throughput": 17806.145865857052
    },
    {
      "epoch": 0.30194594001520386,
      "grad_norm": 0.07291711866855621,
      "learning_rate": 0.0002151705752018435,
      "loss": 8.9754,
      "step": 19264,
      "throughput": 17806.223016558117
    },
    {
      "epoch": 0.3024475113441328,
      "grad_norm": 0.07339274883270264,
      "learning_rate": 0.0002149159537456421,
      "loss": 8.9839,
      "step": 19296,
      "throughput": 17805.957976066922
    },
    {
      "epoch": 0.3029490826730617,
      "grad_norm": 0.07498586177825928,
      "learning_rate": 0.00021466112643589948,
      "loss": 8.9323,
      "step": 19328,
      "throughput": 17806.27011152514
    },
    {
      "epoch": 0.3034506540019906,
      "grad_norm": 0.07056141644716263,
      "learning_rate": 0.00021440609432352427,
      "loss": 8.9969,
      "step": 19360,
      "throughput": 17806.671457033535
    },
    {
      "epoch": 0.30395222533091953,
      "grad_norm": 0.07289967685937881,
      "learning_rate": 0.00021415085846026961,
      "loss": 8.991,
      "step": 19392,
      "throughput": 17807.179791054605
    },
    {
      "epoch": 0.30445379665984845,
      "grad_norm": 0.07861388474702835,
      "learning_rate": 0.00021389541989872904,
      "loss": 8.9598,
      "step": 19424,
      "throughput": 17807.824663972242
    },
    {
      "epoch": 0.3049553679887773,
      "grad_norm": 0.08246038854122162,
      "learning_rate": 0.00021363977969233186,
      "loss": 8.9862,
      "step": 19456,
      "throughput": 17808.475116730137
    },
    {
      "epoch": 0.30545693931770623,
      "grad_norm": 0.07363611459732056,
      "learning_rate": 0.000213383938895339,
      "loss": 8.9798,
      "step": 19488,
      "throughput": 17808.86721984606
    },
    {
      "epoch": 0.30595851064663515,
      "grad_norm": 0.07611941546201706,
      "learning_rate": 0.00021312789856283885,
      "loss": 8.9645,
      "step": 19520,
      "throughput": 17807.974704074484
    },
    {
      "epoch": 0.30646008197556407,
      "grad_norm": 0.06836125254631042,
      "learning_rate": 0.0002128716597507423,
      "loss": 8.9942,
      "step": 19552,
      "throughput": 17807.922191509897
    },
    {
      "epoch": 0.306961653304493,
      "grad_norm": 0.07298135757446289,
      "learning_rate": 0.00021261522351577906,
      "loss": 8.9902,
      "step": 19584,
      "throughput": 17807.67964169082
    },
    {
      "epoch": 0.3074632246334219,
      "grad_norm": 0.06927632540464401,
      "learning_rate": 0.00021235859091549294,
      "loss": 8.9864,
      "step": 19616,
      "throughput": 17807.969765938007
    },
    {
      "epoch": 0.3079647959623508,
      "grad_norm": 0.07625886052846909,
      "learning_rate": 0.0002121017630082375,
      "loss": 8.9775,
      "step": 19648,
      "throughput": 17808.127902746375
    },
    {
      "epoch": 0.3084663672912797,
      "grad_norm": 0.07400345057249069,
      "learning_rate": 0.0002118447408531718,
      "loss": 8.9469,
      "step": 19680,
      "throughput": 17808.94989418138
    },
    {
      "epoch": 0.3089679386202086,
      "grad_norm": 0.07475277781486511,
      "learning_rate": 0.00021158752551025603,
      "loss": 8.9637,
      "step": 19712,
      "throughput": 17809.472429741916
    },
    {
      "epoch": 0.3094695099491375,
      "grad_norm": 0.07137515395879745,
      "learning_rate": 0.0002113301180402469,
      "loss": 8.9916,
      "step": 19744,
      "throughput": 17809.935530919967
    },
    {
      "epoch": 0.30997108127806644,
      "grad_norm": 0.09154373407363892,
      "learning_rate": 0.0002110725195046937,
      "loss": 8.9795,
      "step": 19776,
      "throughput": 17810.551977206345
    },
    {
      "epoch": 0.31047265260699536,
      "grad_norm": 0.07624326646327972,
      "learning_rate": 0.00021081473096593348,
      "loss": 8.9766,
      "step": 19808,
      "throughput": 17810.2789124244
    },
    {
      "epoch": 0.3109742239359243,
      "grad_norm": 0.07699087262153625,
      "learning_rate": 0.000210556753487087,
      "loss": 8.968,
      "step": 19840,
      "throughput": 17809.766829334505
    },
    {
      "epoch": 0.3114757952648532,
      "grad_norm": 0.0898560956120491,
      "learning_rate": 0.00021029858813205408,
      "loss": 8.9447,
      "step": 19872,
      "throughput": 17809.71709658218
    },
    {
      "epoch": 0.31197736659378206,
      "grad_norm": 0.06971347332000732,
      "learning_rate": 0.00021004023596550946,
      "loss": 8.9711,
      "step": 19904,
      "throughput": 17809.805281913414
    },
    {
      "epoch": 0.312478937922711,
      "grad_norm": 0.08007395267486572,
      "learning_rate": 0.00020978169805289823,
      "loss": 8.9766,
      "step": 19936,
      "throughput": 17809.85262690025
    },
    {
      "epoch": 0.3129805092516399,
      "grad_norm": 0.07413521409034729,
      "learning_rate": 0.0002095229754604315,
      "loss": 8.9644,
      "step": 19968,
      "throughput": 17810.456333872407
    },
    {
      "epoch": 0.3134820805805688,
      "grad_norm": 0.0724678784608841,
      "learning_rate": 0.00020926406925508202,
      "loss": 8.9704,
      "step": 20000,
      "throughput": 17810.95911060627
    },
    {
      "epoch": 0.31398365190949773,
      "grad_norm": 0.07824688404798508,
      "learning_rate": 0.00020900498050457973,
      "loss": 8.9808,
      "step": 20032,
      "throughput": 17811.40221341032
    },
    {
      "epoch": 0.31448522323842665,
      "grad_norm": 0.07604194432497025,
      "learning_rate": 0.0002087457102774074,
      "loss": 8.9667,
      "step": 20064,
      "throughput": 17812.197594582045
    },
    {
      "epoch": 0.31498679456735557,
      "grad_norm": 0.07452570647001266,
      "learning_rate": 0.00020848625964279622,
      "loss": 8.9568,
      "step": 20096,
      "throughput": 17812.116222839028
    },
    {
      "epoch": 0.31548836589628443,
      "grad_norm": 0.07305281609296799,
      "learning_rate": 0.0002082266296707214,
      "loss": 8.9635,
      "step": 20128,
      "throughput": 17811.703231534724
    },
    {
      "epoch": 0.31598993722521335,
      "grad_norm": 0.07430999726057053,
      "learning_rate": 0.0002079668214318977,
      "loss": 8.9571,
      "step": 20160,
      "throughput": 17811.648548123165
    },
    {
      "epoch": 0.31649150855414226,
      "grad_norm": 0.07261121273040771,
      "learning_rate": 0.00020770683599777507,
      "loss": 8.9543,
      "step": 20192,
      "throughput": 17811.40851628757
    },
    {
      "epoch": 0.3169930798830712,
      "grad_norm": 0.07133432477712631,
      "learning_rate": 0.0002074466744405342,
      "loss": 8.9533,
      "step": 20224,
      "throughput": 17811.67342260343
    },
    {
      "epoch": 0.3174946512120001,
      "grad_norm": 0.07755530625581741,
      "learning_rate": 0.00020718633783308214,
      "loss": 8.9451,
      "step": 20256,
      "throughput": 17811.81537866219
    },
    {
      "epoch": 0.317996222540929,
      "grad_norm": 0.07869453728199005,
      "learning_rate": 0.00020692582724904778,
      "loss": 8.9563,
      "step": 20288,
      "throughput": 17812.308371027375
    },
    {
      "epoch": 0.31849779386985794,
      "grad_norm": 0.07447408884763718,
      "learning_rate": 0.00020666514376277762,
      "loss": 8.9567,
      "step": 20320,
      "throughput": 17813.096559118872
    },
    {
      "epoch": 0.3189993651987868,
      "grad_norm": 0.07978935539722443,
      "learning_rate": 0.00020640428844933108,
      "loss": 8.9608,
      "step": 20352,
      "throughput": 17813.55181688389
    },
    {
      "epoch": 0.3195009365277157,
      "grad_norm": 0.07883100211620331,
      "learning_rate": 0.00020614326238447623,
      "loss": 8.9578,
      "step": 20384,
      "throughput": 17814.156716354308
    },
    {
      "epoch": 0.32000250785664464,
      "grad_norm": 0.07453079521656036,
      "learning_rate": 0.0002058820666446854,
      "loss": 8.9556,
      "step": 20416,
      "throughput": 17813.428897268856
    },
    {
      "epoch": 0.32050407918557355,
      "grad_norm": 0.07906678318977356,
      "learning_rate": 0.00020562070230713058,
      "loss": 8.9644,
      "step": 20448,
      "throughput": 17813.228445550754
    },
    {
      "epoch": 0.32100565051450247,
      "grad_norm": 0.08273608237504959,
      "learning_rate": 0.00020535917044967899,
      "loss": 8.946,
      "step": 20480,
      "throughput": 17812.901296804946
    },
    {
      "epoch": 0.3215072218434314,
      "grad_norm": 0.07959649711847305,
      "learning_rate": 0.00020509747215088887,
      "loss": 8.9523,
      "step": 20512,
      "throughput": 17811.100061077435
    },
    {
      "epoch": 0.3220087931723603,
      "grad_norm": 0.07556446641683578,
      "learning_rate": 0.00020483560849000475,
      "loss": 8.9512,
      "step": 20544,
      "throughput": 17811.46613679104
    },
    {
      "epoch": 0.32251036450128917,
      "grad_norm": 0.0763557106256485,
      "learning_rate": 0.00020457358054695317,
      "loss": 8.9434,
      "step": 20576,
      "throughput": 17812.04490257042
    },
    {
      "epoch": 0.3230119358302181,
      "grad_norm": 0.07173081487417221,
      "learning_rate": 0.00020431138940233808,
      "loss": 8.9412,
      "step": 20608,
      "throughput": 17812.52434196172
    },
    {
      "epoch": 0.323513507159147,
      "grad_norm": 0.0748804360628128,
      "learning_rate": 0.00020404903613743664,
      "loss": 8.9418,
      "step": 20640,
      "throughput": 17812.95504034032
    },
    {
      "epoch": 0.3240150784880759,
      "grad_norm": 0.07947655022144318,
      "learning_rate": 0.0002037865218341944,
      "loss": 8.951,
      "step": 20672,
      "throughput": 17813.552276554165
    },
    {
      "epoch": 0.32451664981700484,
      "grad_norm": 0.07350742071866989,
      "learning_rate": 0.00020352384757522113,
      "loss": 8.9345,
      "step": 20704,
      "throughput": 17813.469451543082
    },
    {
      "epoch": 0.32501822114593376,
      "grad_norm": 0.07518761605024338,
      "learning_rate": 0.00020326101444378633,
      "loss": 8.9597,
      "step": 20736,
      "throughput": 17813.428817463646
    },
    {
      "epoch": 0.3255197924748627,
      "grad_norm": 0.07858512550592422,
      "learning_rate": 0.0002029980235238145,
      "loss": 8.9569,
      "step": 20768,
      "throughput": 17812.8848706178
    },
    {
      "epoch": 0.32602136380379154,
      "grad_norm": 0.07931273430585861,
      "learning_rate": 0.0002027348758998811,
      "loss": 8.9508,
      "step": 20800,
      "throughput": 17812.926669883913
    },
    {
      "epoch": 0.32652293513272046,
      "grad_norm": 0.0770784467458725,
      "learning_rate": 0.0002024715726572076,
      "loss": 8.9619,
      "step": 20832,
      "throughput": 17813.494705234196
    },
    {
      "epoch": 0.3270245064616494,
      "grad_norm": 0.0742030069231987,
      "learning_rate": 0.0002022081148816574,
      "loss": 8.936,
      "step": 20864,
      "throughput": 17813.64573655942
    },
    {
      "epoch": 0.3275260777905783,
      "grad_norm": 0.07879988849163055,
      "learning_rate": 0.0002019445036597312,
      "loss": 8.9358,
      "step": 20896,
      "throughput": 17814.11509360599
    },
    {
      "epoch": 0.3280276491195072,
      "grad_norm": 0.07346165180206299,
      "learning_rate": 0.00020168074007856232,
      "loss": 8.9437,
      "step": 20928,
      "throughput": 17814.758227227496
    },
    {
      "epoch": 0.32852922044843613,
      "grad_norm": 0.06962357461452484,
      "learning_rate": 0.00020141682522591272,
      "loss": 8.9292,
      "step": 20960,
      "throughput": 17815.185618338328
    },
    {
      "epoch": 0.32903079177736505,
      "grad_norm": 0.07394640147686005,
      "learning_rate": 0.0002011527601901679,
      "loss": 8.948,
      "step": 20992,
      "throughput": 17815.820375428073
    },
    {
      "epoch": 0.3295323631062939,
      "grad_norm": 0.07294956594705582,
      "learning_rate": 0.00020088854606033292,
      "loss": 8.954,
      "step": 21024,
      "throughput": 17815.48353320163
    },
    {
      "epoch": 0.33003393443522283,
      "grad_norm": 0.0731930062174797,
      "learning_rate": 0.00020062418392602767,
      "loss": 8.9372,
      "step": 21056,
      "throughput": 17815.312534164874
    },
    {
      "epoch": 0.33053550576415175,
      "grad_norm": 0.07782046496868134,
      "learning_rate": 0.00020035967487748226,
      "loss": 8.9486,
      "step": 21088,
      "throughput": 17814.861235838223
    },
    {
      "epoch": 0.33103707709308067,
      "grad_norm": 0.07795765995979309,
      "learning_rate": 0.00020009502000553286,
      "loss": 8.9382,
      "step": 21120,
      "throughput": 17815.23092475671
    },
    {
      "epoch": 0.3315386484220096,
      "grad_norm": 0.07017985731363297,
      "learning_rate": 0.00019983022040161692,
      "loss": 8.9257,
      "step": 21152,
      "throughput": 17815.382657071077
    },
    {
      "epoch": 0.3320402197509385,
      "grad_norm": 0.07129698246717453,
      "learning_rate": 0.00019956527715776887,
      "loss": 8.9459,
      "step": 21184,
      "throughput": 17816.146885634007
    },
    {
      "epoch": 0.3325417910798674,
      "grad_norm": 0.07461407780647278,
      "learning_rate": 0.0001993001913666153,
      "loss": 8.9444,
      "step": 21216,
      "throughput": 17816.08373663044
    },
    {
      "epoch": 0.3330433624087963,
      "grad_norm": 0.08224450051784515,
      "learning_rate": 0.00019903496412137093,
      "loss": 8.9408,
      "step": 21248,
      "throughput": 17816.49273855996
    },
    {
      "epoch": 0.3335449337377252,
      "grad_norm": 0.07270831614732742,
      "learning_rate": 0.00019876959651583362,
      "loss": 8.9426,
      "step": 21280,
      "throughput": 17817.24483864838
    },
    {
      "epoch": 0.3340465050666541,
      "grad_norm": 0.07324141263961792,
      "learning_rate": 0.00019850408964438023,
      "loss": 8.9502,
      "step": 21312,
      "throughput": 17817.18968453246
    },
    {
      "epoch": 0.33454807639558304,
      "grad_norm": 0.0847543254494667,
      "learning_rate": 0.00019823844460196177,
      "loss": 8.9443,
      "step": 21344,
      "throughput": 17816.919887196673
    },
    {
      "epoch": 0.33504964772451196,
      "grad_norm": 0.07064050436019897,
      "learning_rate": 0.00019797266248409932,
      "loss": 8.9184,
      "step": 21376,
      "throughput": 17816.675415472295
    },
    {
      "epoch": 0.3355512190534409,
      "grad_norm": 0.0750681459903717,
      "learning_rate": 0.000197706744386879,
      "loss": 8.9338,
      "step": 21408,
      "throughput": 17816.857082441475
    },
    {
      "epoch": 0.3360527903823698,
      "grad_norm": 0.07319337129592896,
      "learning_rate": 0.00019744069140694795,
      "loss": 8.9523,
      "step": 21440,
      "throughput": 17817.227002143307
    },
    {
      "epoch": 0.33655436171129866,
      "grad_norm": 0.0720682218670845,
      "learning_rate": 0.00019717450464150935,
      "loss": 8.9081,
      "step": 21472,
      "throughput": 17817.55143137617
    },
    {
      "epoch": 0.3370559330402276,
      "grad_norm": 0.07472842186689377,
      "learning_rate": 0.00019690818518831827,
      "loss": 8.9401,
      "step": 21504,
      "throughput": 17817.80429730904
    },
    {
      "epoch": 0.3375575043691565,
      "grad_norm": 0.07875826954841614,
      "learning_rate": 0.0001966417341456769,
      "loss": 8.9229,
      "step": 21536,
      "throughput": 17818.05151405433
    },
    {
      "epoch": 0.3380590756980854,
      "grad_norm": 0.074205681681633,
      "learning_rate": 0.0001963751526124301,
      "loss": 8.9331,
      "step": 21568,
      "throughput": 17818.59926377971
    },
    {
      "epoch": 0.3385606470270143,
      "grad_norm": 0.07358483225107193,
      "learning_rate": 0.00019610844168796096,
      "loss": 8.9475,
      "step": 21600,
      "throughput": 17819.0439804744
    },
    {
      "epoch": 0.33906221835594325,
      "grad_norm": 0.07578427344560623,
      "learning_rate": 0.0001958416024721861,
      "loss": 8.9204,
      "step": 21632,
      "throughput": 17818.595125911124
    },
    {
      "epoch": 0.33956378968487216,
      "grad_norm": 0.0762067437171936,
      "learning_rate": 0.00019557463606555118,
      "loss": 8.9177,
      "step": 21664,
      "throughput": 17818.425541998775
    },
    {
      "epoch": 0.340065361013801,
      "grad_norm": 0.07589062303304672,
      "learning_rate": 0.0001953075435690266,
      "loss": 8.9473,
      "step": 21696,
      "throughput": 17818.298150910818
    },
    {
      "epoch": 0.34056693234272994,
      "grad_norm": 0.07703171670436859,
      "learning_rate": 0.0001950403260841024,
      "loss": 8.9025,
      "step": 21728,
      "throughput": 17818.641348897618
    },
    {
      "epoch": 0.34106850367165886,
      "grad_norm": 0.07283373922109604,
      "learning_rate": 0.0001947729847127845,
      "loss": 8.9463,
      "step": 21760,
      "throughput": 17818.782797105105
    },
    {
      "epoch": 0.3415700750005878,
      "grad_norm": 0.07369955629110336,
      "learning_rate": 0.00019450552055758934,
      "loss": 8.9105,
      "step": 21792,
      "throughput": 17819.51243392188
    },
    {
      "epoch": 0.3420716463295167,
      "grad_norm": 0.0815635547041893,
      "learning_rate": 0.00019423793472153996,
      "loss": 8.9103,
      "step": 21824,
      "throughput": 17819.43615014987
    },
    {
      "epoch": 0.3425732176584456,
      "grad_norm": 0.07449345290660858,
      "learning_rate": 0.0001939702283081611,
      "loss": 8.9171,
      "step": 21856,
      "throughput": 17819.837183645075
    },
    {
      "epoch": 0.34307478898737453,
      "grad_norm": 0.07299656420946121,
      "learning_rate": 0.00019370240242147488,
      "loss": 8.9307,
      "step": 21888,
      "throughput": 17820.560184313057
    },
    {
      "epoch": 0.3435763603163034,
      "grad_norm": 0.07765787839889526,
      "learning_rate": 0.000193434458165996,
      "loss": 8.9367,
      "step": 21920,
      "throughput": 17819.982451899494
    },
    {
      "epoch": 0.3440779316452323,
      "grad_norm": 0.07395743578672409,
      "learning_rate": 0.00019316639664672733,
      "loss": 8.9071,
      "step": 21952,
      "throughput": 17820.094119130765
    },
    {
      "epoch": 0.34457950297416123,
      "grad_norm": 0.07570222020149231,
      "learning_rate": 0.00019289821896915544,
      "loss": 8.9106,
      "step": 21984,
      "throughput": 17819.68260988534
    },
    {
      "epoch": 0.34508107430309015,
      "grad_norm": 0.07392715662717819,
      "learning_rate": 0.00019262992623924585,
      "loss": 8.9192,
      "step": 22016,
      "throughput": 17820.021151294266
    },
    {
      "epoch": 0.34558264563201907,
      "grad_norm": 0.07445602118968964,
      "learning_rate": 0.00019236151956343852,
      "loss": 8.8954,
      "step": 22048,
      "throughput": 17820.156965333827
    },
    {
      "epoch": 0.346084216960948,
      "grad_norm": 0.07157503068447113,
      "learning_rate": 0.00019209300004864341,
      "loss": 8.9212,
      "step": 22080,
      "throughput": 17820.683172522855
    },
    {
      "epoch": 0.3465857882898769,
      "grad_norm": 0.08508722484111786,
      "learning_rate": 0.00019182436880223585,
      "loss": 8.9188,
      "step": 22112,
      "throughput": 17820.930225618704
    },
    {
      "epoch": 0.34708735961880577,
      "grad_norm": 0.08076149970293045,
      "learning_rate": 0.00019155562693205178,
      "loss": 8.9127,
      "step": 22144,
      "throughput": 17821.16633543189
    },
    {
      "epoch": 0.3475889309477347,
      "grad_norm": 0.07283724844455719,
      "learning_rate": 0.00019128677554638355,
      "loss": 8.8887,
      "step": 22176,
      "throughput": 17821.709052753864
    },
    {
      "epoch": 0.3480905022766636,
      "grad_norm": 0.07913101464509964,
      "learning_rate": 0.0001910178157539751,
      "loss": 8.9094,
      "step": 22208,
      "throughput": 17821.848150108508
    },
    {
      "epoch": 0.3485920736055925,
      "grad_norm": 0.07167515158653259,
      "learning_rate": 0.00019074874866401733,
      "loss": 8.9367,
      "step": 22240,
      "throughput": 17821.58256896794
    },
    {
      "epoch": 0.34909364493452144,
      "grad_norm": 0.07268569618463516,
      "learning_rate": 0.00019047957538614375,
      "loss": 8.9112,
      "step": 22272,
      "throughput": 17821.33667359995
    },
    {
      "epoch": 0.34959521626345036,
      "grad_norm": 0.07488091289997101,
      "learning_rate": 0.00019021029703042576,
      "loss": 8.9092,
      "step": 22304,
      "throughput": 17821.224577891684
    },
    {
      "epoch": 0.3500967875923793,
      "grad_norm": 0.07319794595241547,
      "learning_rate": 0.0001899409147073681,
      "loss": 8.9162,
      "step": 22336,
      "throughput": 17821.760640674343
    },
    {
      "epoch": 0.35059835892130814,
      "grad_norm": 0.07335125654935837,
      "learning_rate": 0.0001896714295279043,
      "loss": 8.9058,
      "step": 22368,
      "throughput": 17821.86835973504
    },
    {
      "epoch": 0.35109993025023706,
      "grad_norm": 0.08088002353906631,
      "learning_rate": 0.00018940184260339194,
      "loss": 8.9161,
      "step": 22400,
      "throughput": 17822.288636338348
    },
    {
      "epoch": 0.351601501579166,
      "grad_norm": 0.0779116079211235,
      "learning_rate": 0.00018913215504560838,
      "loss": 8.9302,
      "step": 22432,
      "throughput": 17822.461043873136
    },
    {
      "epoch": 0.3521030729080949,
      "grad_norm": 0.07497800886631012,
      "learning_rate": 0.0001888623679667459,
      "loss": 8.9022,
      "step": 22464,
      "throughput": 17822.847998012865
    },
    {
      "epoch": 0.3526046442370238,
      "grad_norm": 0.06929473578929901,
      "learning_rate": 0.00018859248247940722,
      "loss": 8.911,
      "step": 22496,
      "throughput": 17823.555725739072
    },
    {
      "epoch": 0.35310621556595273,
      "grad_norm": 0.07567939162254333,
      "learning_rate": 0.0001883224996966008,
      "loss": 8.9249,
      "step": 22528,
      "throughput": 17823.010673609624
    },
    {
      "epoch": 0.35360778689488165,
      "grad_norm": 0.08010558784008026,
      "learning_rate": 0.00018805242073173653,
      "loss": 8.9136,
      "step": 22560,
      "throughput": 17821.25216698874
    },
    {
      "epoch": 0.3541093582238105,
      "grad_norm": 0.0750737190246582,
      "learning_rate": 0.00018778224669862087,
      "loss": 8.9159,
      "step": 22592,
      "throughput": 17820.76067321445
    },
    {
      "epoch": 0.35461092955273943,
      "grad_norm": 0.07348627597093582,
      "learning_rate": 0.0001875119787114523,
      "loss": 8.8968,
      "step": 22624,
      "throughput": 17821.09990511972
    },
    {
      "epoch": 0.35511250088166835,
      "grad_norm": 0.06908921897411346,
      "learning_rate": 0.00018724161788481676,
      "loss": 8.9083,
      "step": 22656,
      "throughput": 17821.23808504927
    },
    {
      "epoch": 0.35561407221059727,
      "grad_norm": 0.07111164182424545,
      "learning_rate": 0.00018697116533368316,
      "loss": 8.8927,
      "step": 22688,
      "throughput": 17821.93493660733
    },
    {
      "epoch": 0.3561156435395262,
      "grad_norm": 0.07290966063737869,
      "learning_rate": 0.00018670062217339867,
      "loss": 8.9259,
      "step": 22720,
      "throughput": 17821.967956572396
    },
    {
      "epoch": 0.3566172148684551,
      "grad_norm": 0.07786906510591507,
      "learning_rate": 0.0001864299895196839,
      "loss": 8.9351,
      "step": 22752,
      "throughput": 17822.36912099535
    },
    {
      "epoch": 0.357118786197384,
      "grad_norm": 0.08353696018457413,
      "learning_rate": 0.00018615926848862893,
      "loss": 8.9286,
      "step": 22784,
      "throughput": 17822.891229334833
    },
    {
      "epoch": 0.3576203575263129,
      "grad_norm": 0.07277407497167587,
      "learning_rate": 0.00018588846019668793,
      "loss": 8.8903,
      "step": 22816,
      "throughput": 17822.741576530676
    },
    {
      "epoch": 0.3581219288552418,
      "grad_norm": 0.07241684943437576,
      "learning_rate": 0.00018561756576067524,
      "loss": 8.8931,
      "step": 22848,
      "throughput": 17822.475657843017
    },
    {
      "epoch": 0.3586235001841707,
      "grad_norm": 0.07565472275018692,
      "learning_rate": 0.0001853465862977602,
      "loss": 8.8852,
      "step": 22880,
      "throughput": 17822.20926050296
    },
    {
      "epoch": 0.35912507151309964,
      "grad_norm": 0.07836824655532837,
      "learning_rate": 0.00018507552292546295,
      "loss": 8.8783,
      "step": 22912,
      "throughput": 17822.34557228031
    },
    {
      "epoch": 0.35962664284202855,
      "grad_norm": 0.07098279893398285,
      "learning_rate": 0.00018480437676164968,
      "loss": 8.8742,
      "step": 22944,
      "throughput": 17822.67608358776
    },
    {
      "epoch": 0.3601282141709575,
      "grad_norm": 0.07533206045627594,
      "learning_rate": 0.00018453314892452795,
      "loss": 8.9083,
      "step": 22976,
      "throughput": 17822.99284955358
    },
    {
      "epoch": 0.36062978549988634,
      "grad_norm": 0.07069644331932068,
      "learning_rate": 0.00018426184053264215,
      "loss": 8.9058,
      "step": 23008,
      "throughput": 17823.223058884796
    },
    {
      "epoch": 0.36113135682881525,
      "grad_norm": 0.07935863733291626,
      "learning_rate": 0.0001839904527048689,
      "loss": 8.9095,
      "step": 23040,
      "throughput": 17823.443737012367
    },
    {
      "epoch": 0.36163292815774417,
      "grad_norm": 0.07350348681211472,
      "learning_rate": 0.0001837189865604124,
      "loss": 8.8807,
      "step": 23072,
      "throughput": 17823.957248393664
    },
    {
      "epoch": 0.3621344994866731,
      "grad_norm": 0.07933742552995682,
      "learning_rate": 0.00018344744321879987,
      "loss": 8.9032,
      "step": 23104,
      "throughput": 17824.52610640502
    },
    {
      "epoch": 0.362636070815602,
      "grad_norm": 0.07533203065395355,
      "learning_rate": 0.0001831758237998768,
      "loss": 8.8809,
      "step": 23136,
      "throughput": 17824.12672581524
    },
    {
      "epoch": 0.3631376421445309,
      "grad_norm": 0.07438133656978607,
      "learning_rate": 0.00018290412942380252,
      "loss": 8.8638,
      "step": 23168,
      "throughput": 17823.96099488507
    },
    {
      "epoch": 0.36363921347345984,
      "grad_norm": 0.08473438024520874,
      "learning_rate": 0.00018263236121104543,
      "loss": 8.8881,
      "step": 23200,
      "throughput": 17823.641345838707
    },
    {
      "epoch": 0.3641407848023887,
      "grad_norm": 0.07578590512275696,
      "learning_rate": 0.00018236052028237847,
      "loss": 8.8879,
      "step": 23232,
      "throughput": 17823.96706337484
    },
    {
      "epoch": 0.3646423561313176,
      "grad_norm": 0.07169584929943085,
      "learning_rate": 0.0001820886077588744,
      "loss": 8.89,
      "step": 23264,
      "throughput": 17824.111644749042
    },
    {
      "epoch": 0.36514392746024654,
      "grad_norm": 0.07907114177942276,
      "learning_rate": 0.00018181662476190127,
      "loss": 8.9048,
      "step": 23296,
      "throughput": 17824.790422560367
    },
    {
      "epoch": 0.36564549878917546,
      "grad_norm": 0.07309950143098831,
      "learning_rate": 0.00018154457241311773,
      "loss": 8.8882,
      "step": 23328,
      "throughput": 17824.842153808437
    },
    {
      "epoch": 0.3661470701181044,
      "grad_norm": 0.08225353062152863,
      "learning_rate": 0.00018127245183446858,
      "loss": 8.8804,
      "step": 23360,
      "throughput": 17825.248228130444
    },
    {
      "epoch": 0.3666486414470333,
      "grad_norm": 0.07341925799846649,
      "learning_rate": 0.00018100026414817987,
      "loss": 8.899,
      "step": 23392,
      "throughput": 17825.926457473488
    },
    {
      "epoch": 0.3671502127759622,
      "grad_norm": 0.07902863621711731,
      "learning_rate": 0.00018072801047675432,
      "loss": 8.8896,
      "step": 23424,
      "throughput": 17825.396406786957
    },
    {
      "epoch": 0.3676517841048911,
      "grad_norm": 0.07868482172489166,
      "learning_rate": 0.00018045569194296697,
      "loss": 8.8864,
      "step": 23456,
      "throughput": 17825.498447999318
    },
    {
      "epoch": 0.36815335543382,
      "grad_norm": 0.07887911796569824,
      "learning_rate": 0.00018018330966986022,
      "loss": 8.8781,
      "step": 23488,
      "throughput": 17825.08292114403
    },
    {
      "epoch": 0.3686549267627489,
      "grad_norm": 0.07060116529464722,
      "learning_rate": 0.00017991086478073943,
      "loss": 8.9005,
      "step": 23520,
      "throughput": 17825.578620517335
    },
    {
      "epoch": 0.36915649809167783,
      "grad_norm": 0.07441214472055435,
      "learning_rate": 0.0001796383583991681,
      "loss": 8.8852,
      "step": 23552,
      "throughput": 17825.713155723053
    },
    {
      "epoch": 0.36965806942060675,
      "grad_norm": 0.07870756089687347,
      "learning_rate": 0.00017936579164896333,
      "loss": 8.8856,
      "step": 23584,
      "throughput": 17826.056147290332
    },
    {
      "epoch": 0.37015964074953567,
      "grad_norm": 0.07043929398059845,
      "learning_rate": 0.0001790931656541912,
      "loss": 8.8971,
      "step": 23616,
      "throughput": 17826.278076249124
    },
    {
      "epoch": 0.3706612120784646,
      "grad_norm": 0.07437200844287872,
      "learning_rate": 0.00017882048153916214,
      "loss": 8.9109,
      "step": 23648,
      "throughput": 17826.49189349421
    },
    {
      "epoch": 0.37116278340739345,
      "grad_norm": 0.07189071178436279,
      "learning_rate": 0.00017854774042842626,
      "loss": 8.901,
      "step": 23680,
      "throughput": 17827.162394924104
    },
    {
      "epoch": 0.37166435473632237,
      "grad_norm": 0.07663335651159286,
      "learning_rate": 0.00017827494344676873,
      "loss": 8.8979,
      "step": 23712,
      "throughput": 17827.442078451364
    },
    {
      "epoch": 0.3721659260652513,
      "grad_norm": 0.07194948941469193,
      "learning_rate": 0.000178002091719205,
      "loss": 8.8743,
      "step": 23744,
      "throughput": 17827.293276727123
    },
    {
      "epoch": 0.3726674973941802,
      "grad_norm": 0.07944053411483765,
      "learning_rate": 0.00017772918637097657,
      "loss": 8.8836,
      "step": 23776,
      "throughput": 17826.95173690492
    },
    {
      "epoch": 0.3731690687231091,
      "grad_norm": 0.09098808467388153,
      "learning_rate": 0.00017745622852754575,
      "loss": 8.8857,
      "step": 23808,
      "throughput": 17826.989096990383
    },
    {
      "epoch": 0.37367064005203804,
      "grad_norm": 0.07665561884641647,
      "learning_rate": 0.00017718321931459163,
      "loss": 8.8787,
      "step": 23840,
      "throughput": 17827.46315318843
    },
    {
      "epoch": 0.37417221138096696,
      "grad_norm": 0.08578639477491379,
      "learning_rate": 0.00017691015985800488,
      "loss": 8.898,
      "step": 23872,
      "throughput": 17827.58528583542
    },
    {
      "epoch": 0.3746737827098958,
      "grad_norm": 0.07634943723678589,
      "learning_rate": 0.0001766370512838836,
      "loss": 8.8763,
      "step": 23904,
      "throughput": 17827.978884744767
    },
    {
      "epoch": 0.37517535403882474,
      "grad_norm": 0.07927807420492172,
      "learning_rate": 0.00017636389471852834,
      "loss": 8.877,
      "step": 23936,
      "throughput": 17828.118072038644
    },
    {
      "epoch": 0.37567692536775366,
      "grad_norm": 0.07263068854808807,
      "learning_rate": 0.0001760906912884376,
      "loss": 8.8773,
      "step": 23968,
      "throughput": 17828.520912972963
    },
    {
      "epoch": 0.3761784966966826,
      "grad_norm": 0.0758548453450203,
      "learning_rate": 0.00017581744212030308,
      "loss": 8.8739,
      "step": 24000,
      "throughput": 17829.06547779846
    },
    {
      "epoch": 0.3766800680256115,
      "grad_norm": 0.07255948334932327,
      "learning_rate": 0.00017554414834100525,
      "loss": 8.8659,
      "step": 24032,
      "throughput": 17828.482269473152
    },
    {
      "epoch": 0.3771816393545404,
      "grad_norm": 0.0759415328502655,
      "learning_rate": 0.00017527081107760834,
      "loss": 8.8667,
      "step": 24064,
      "throughput": 17828.472128577403
    },
    {
      "epoch": 0.37768321068346933,
      "grad_norm": 0.08043156564235687,
      "learning_rate": 0.00017499743145735615,
      "loss": 8.8799,
      "step": 24096,
      "throughput": 17828.171666149505
    },
    {
      "epoch": 0.3781847820123982,
      "grad_norm": 0.07601181417703629,
      "learning_rate": 0.00017472401060766697,
      "loss": 8.8775,
      "step": 24128,
      "throughput": 17828.48492127222
    },
    {
      "epoch": 0.3786863533413271,
      "grad_norm": 0.07197776436805725,
      "learning_rate": 0.0001744505496561292,
      "loss": 8.882,
      "step": 24160,
      "throughput": 17828.766273160792
    },
    {
      "epoch": 0.379187924670256,
      "grad_norm": 0.07243472337722778,
      "learning_rate": 0.00017417704973049668,
      "loss": 8.8917,
      "step": 24192,
      "throughput": 17829.07403289147
    },
    {
      "epoch": 0.37968949599918494,
      "grad_norm": 0.07945257425308228,
      "learning_rate": 0.00017390351195868385,
      "loss": 8.881,
      "step": 24224,
      "throughput": 17829.292229522
    },
    {
      "epoch": 0.38019106732811386,
      "grad_norm": 0.07232029736042023,
      "learning_rate": 0.00017362993746876135,
      "loss": 8.8789,
      "step": 24256,
      "throughput": 17829.50020712472
    },
    {
      "epoch": 0.3806926386570428,
      "grad_norm": 0.07614285498857498,
      "learning_rate": 0.00017335632738895113,
      "loss": 8.8773,
      "step": 24288,
      "throughput": 17830.137295477773
    },
    {
      "epoch": 0.3811942099859717,
      "grad_norm": 0.07729846239089966,
      "learning_rate": 0.000173082682847622,
      "loss": 8.8747,
      "step": 24320,
      "throughput": 17830.041901587563
    },
    {
      "epoch": 0.38169578131490056,
      "grad_norm": 0.07206852734088898,
      "learning_rate": 0.0001728090049732848,
      "loss": 8.8686,
      "step": 24352,
      "throughput": 17830.061222745466
    },
    {
      "epoch": 0.3821973526438295,
      "grad_norm": 0.07643686234951019,
      "learning_rate": 0.00017253529489458802,
      "loss": 8.8818,
      "step": 24384,
      "throughput": 17829.672580412058
    },
    {
      "epoch": 0.3826989239727584,
      "grad_norm": 0.07176980376243591,
      "learning_rate": 0.00017226155374031271,
      "loss": 8.8655,
      "step": 24416,
      "throughput": 17829.9882926581
    },
    {
      "epoch": 0.3832004953016873,
      "grad_norm": 0.07665753364562988,
      "learning_rate": 0.0001719877826393683,
      "loss": 8.8661,
      "step": 24448,
      "throughput": 17830.44935147498
    },
    {
      "epoch": 0.38370206663061623,
      "grad_norm": 0.07416357845067978,
      "learning_rate": 0.00017171398272078752,
      "loss": 8.8556,
      "step": 24480,
      "throughput": 17830.548068886743
    },
    {
      "epoch": 0.38420363795954515,
      "grad_norm": 0.0756995901465416,
      "learning_rate": 0.00017144015511372208,
      "loss": 8.8604,
      "step": 24512,
      "throughput": 17830.773188901458
    },
    {
      "epoch": 0.38470520928847407,
      "grad_norm": 0.0758114829659462,
      "learning_rate": 0.00017116630094743792,
      "loss": 8.859,
      "step": 24544,
      "throughput": 17830.930798570385
    },
    {
      "epoch": 0.38520678061740293,
      "grad_norm": 0.08073586970567703,
      "learning_rate": 0.00017089242135131036,
      "loss": 8.8594,
      "step": 24576,
      "throughput": 17831.43214908143
    },
    {
      "epoch": 0.38570835194633185,
      "grad_norm": 0.0700501948595047,
      "learning_rate": 0.0001706185174548197,
      "loss": 8.853,
      "step": 24608,
      "throughput": 17830.3106839198
    },
    {
      "epoch": 0.38620992327526077,
      "grad_norm": 0.07498278468847275,
      "learning_rate": 0.0001703445903875464,
      "loss": 8.8598,
      "step": 24640,
      "throughput": 17829.736843745333
    },
    {
      "epoch": 0.3867114946041897,
      "grad_norm": 0.07171520590782166,
      "learning_rate": 0.00017007064127916644,
      "loss": 8.864,
      "step": 24672,
      "throughput": 17829.833203307262
    },
    {
      "epoch": 0.3872130659331186,
      "grad_norm": 0.07245524972677231,
      "learning_rate": 0.0001697966712594469,
      "loss": 8.8756,
      "step": 24704,
      "throughput": 17829.564385638096
    },
    {
      "epoch": 0.3877146372620475,
      "grad_norm": 0.07057449966669083,
      "learning_rate": 0.00016952268145824082,
      "loss": 8.8591,
      "step": 24736,
      "throughput": 17830.03439866721
    },
    {
      "epoch": 0.38821620859097644,
      "grad_norm": 0.07579497247934341,
      "learning_rate": 0.00016924867300548304,
      "loss": 8.8497,
      "step": 24768,
      "throughput": 17830.14237986475
    },
    {
      "epoch": 0.3887177799199053,
      "grad_norm": 0.0772717073559761,
      "learning_rate": 0.00016897464703118515,
      "loss": 8.8757,
      "step": 24800,
      "throughput": 17830.60302461098
    },
    {
      "epoch": 0.3892193512488342,
      "grad_norm": 0.07301851361989975,
      "learning_rate": 0.00016870060466543112,
      "loss": 8.8347,
      "step": 24832,
      "throughput": 17830.62448111229
    },
    {
      "epoch": 0.38972092257776314,
      "grad_norm": 0.0722026452422142,
      "learning_rate": 0.0001684265470383725,
      "loss": 8.8762,
      "step": 24864,
      "throughput": 17830.99681553676
    },
    {
      "epoch": 0.39022249390669206,
      "grad_norm": 0.07057911902666092,
      "learning_rate": 0.0001681524752802237,
      "loss": 8.8822,
      "step": 24896,
      "throughput": 17831.52814856809
    },
    {
      "epoch": 0.390724065235621,
      "grad_norm": 0.07054942101240158,
      "learning_rate": 0.00016787839052125758,
      "loss": 8.8627,
      "step": 24928,
      "throughput": 17830.850969822153
    },
    {
      "epoch": 0.3912256365645499,
      "grad_norm": 0.07358486205339432,
      "learning_rate": 0.00016760429389180037,
      "loss": 8.8779,
      "step": 24960,
      "throughput": 17831.13179515518
    },
    {
      "epoch": 0.3917272078934788,
      "grad_norm": 0.07562954723834991,
      "learning_rate": 0.00016733018652222744,
      "loss": 8.8385,
      "step": 24992,
      "throughput": 17830.842157214578
    },
    {
      "epoch": 0.3922287792224077,
      "grad_norm": 0.0741063579916954,
      "learning_rate": 0.0001670560695429584,
      "loss": 8.8556,
      "step": 25024,
      "throughput": 17831.311424604814
    },
    {
      "epoch": 0.3927303505513366,
      "grad_norm": 0.07098574936389923,
      "learning_rate": 0.00016678194408445245,
      "loss": 8.8303,
      "step": 25056,
      "throughput": 17831.617559289924
    },
    {
      "epoch": 0.3932319218802655,
      "grad_norm": 0.07636234164237976,
      "learning_rate": 0.00016650781127720382,
      "loss": 8.8659,
      "step": 25088,
      "throughput": 17831.88957466774
    },
    {
      "epoch": 0.39373349320919443,
      "grad_norm": 0.07332395017147064,
      "learning_rate": 0.00016623367225173703,
      "loss": 8.8681,
      "step": 25120,
      "throughput": 17831.94164096608
    },
    {
      "epoch": 0.39423506453812335,
      "grad_norm": 0.07327907532453537,
      "learning_rate": 0.00016595952813860216,
      "loss": 8.8584,
      "step": 25152,
      "throughput": 17832.23279696902
    },
    {
      "epoch": 0.39473663586705227,
      "grad_norm": 0.07691916078329086,
      "learning_rate": 0.00016568538006837046,
      "loss": 8.8657,
      "step": 25184,
      "throughput": 17832.71533372163
    },
    {
      "epoch": 0.3952382071959812,
      "grad_norm": 0.07487235218286514,
      "learning_rate": 0.00016541122917162934,
      "loss": 8.8482,
      "step": 25216,
      "throughput": 17832.838322321524
    },
    {
      "epoch": 0.39573977852491005,
      "grad_norm": 0.06726781278848648,
      "learning_rate": 0.00016513707657897785,
      "loss": 8.8768,
      "step": 25248,
      "throughput": 17832.51545284579
    },
    {
      "epoch": 0.39624134985383896,
      "grad_norm": 0.075165756046772,
      "learning_rate": 0.00016486292342102215,
      "loss": 8.8569,
      "step": 25280,
      "throughput": 17832.19952718306
    },
    {
      "epoch": 0.3967429211827679,
      "grad_norm": 0.07332989573478699,
      "learning_rate": 0.0001645887708283707,
      "loss": 8.8522,
      "step": 25312,
      "throughput": 17832.349097094095
    },
    {
      "epoch": 0.3972444925116968,
      "grad_norm": 0.07408641278743744,
      "learning_rate": 0.00016431461993162954,
      "loss": 8.854,
      "step": 25344,
      "throughput": 17832.795997567115
    },
    {
      "epoch": 0.3977460638406257,
      "grad_norm": 0.08329582959413528,
      "learning_rate": 0.00016404047186139784,
      "loss": 8.8496,
      "step": 25376,
      "throughput": 17832.89803762926
    },
    {
      "epoch": 0.39824763516955464,
      "grad_norm": 0.07956679165363312,
      "learning_rate": 0.00016376632774826297,
      "loss": 8.8448,
      "step": 25408,
      "throughput": 17833.34230562835
    },
    {
      "epoch": 0.39874920649848355,
      "grad_norm": 0.0716254860162735,
      "learning_rate": 0.0001634921887227962,
      "loss": 8.8514,
      "step": 25440,
      "throughput": 17833.376285320894
    },
    {
      "epoch": 0.3992507778274124,
      "grad_norm": 0.07612968236207962,
      "learning_rate": 0.00016321805591554755,
      "loss": 8.8448,
      "step": 25472,
      "throughput": 17833.72117489287
    },
    {
      "epoch": 0.39975234915634134,
      "grad_norm": 0.0749460905790329,
      "learning_rate": 0.00016294393045704163,
      "loss": 8.8541,
      "step": 25504,
      "throughput": 17834.124936367323
    },
    {
      "epoch": 0.40025392048527025,
      "grad_norm": 0.0763927549123764,
      "learning_rate": 0.00016266981347777255,
      "loss": 8.8475,
      "step": 25536,
      "throughput": 17833.71051948569
    },
    {
      "epoch": 0.40075549181419917,
      "grad_norm": 0.07364343851804733,
      "learning_rate": 0.00016239570610819963,
      "loss": 8.837,
      "step": 25568,
      "throughput": 17833.615463015547
    },
    {
      "epoch": 0.4012570631431281,
      "grad_norm": 0.07207117974758148,
      "learning_rate": 0.00016212160947874242,
      "loss": 8.8394,
      "step": 25600,
      "throughput": 17833.49950524665
    },
    {
      "epoch": 0.401758634472057,
      "grad_norm": 0.07933421432971954,
      "learning_rate": 0.00016184752471977627,
      "loss": 8.8387,
      "step": 25632,
      "throughput": 17833.7823688976
    },
    {
      "epoch": 0.4022602058009859,
      "grad_norm": 0.07499121874570847,
      "learning_rate": 0.0001615734529616275,
      "loss": 8.8306,
      "step": 25664,
      "throughput": 17834.034646287568
    },
    {
      "epoch": 0.4027617771299148,
      "grad_norm": 0.07656212896108627,
      "learning_rate": 0.00016129939533456888,
      "loss": 8.8507,
      "step": 25696,
      "throughput": 17834.45549523096
    },
    {
      "epoch": 0.4032633484588437,
      "grad_norm": 0.08051002025604248,
      "learning_rate": 0.00016102535296881485,
      "loss": 8.8392,
      "step": 25728,
      "throughput": 17834.50003813568
    },
    {
      "epoch": 0.4037649197877726,
      "grad_norm": 0.07518472522497177,
      "learning_rate": 0.00016075132699451701,
      "loss": 8.8521,
      "step": 25760,
      "throughput": 17834.790457946634
    },
    {
      "epoch": 0.40426649111670154,
      "grad_norm": 0.07647351920604706,
      "learning_rate": 0.00016047731854175917,
      "loss": 8.8423,
      "step": 25792,
      "throughput": 17835.18758382088
    },
    {
      "epoch": 0.40476806244563046,
      "grad_norm": 0.07534997165203094,
      "learning_rate": 0.00016020332874055313,
      "loss": 8.8514,
      "step": 25824,
      "throughput": 17835.120857478418
    },
    {
      "epoch": 0.4052696337745594,
      "grad_norm": 0.08069568127393723,
      "learning_rate": 0.00015992935872083356,
      "loss": 8.8565,
      "step": 25856,
      "throughput": 17835.006479560674
    },
    {
      "epoch": 0.4057712051034883,
      "grad_norm": 0.08479923009872437,
      "learning_rate": 0.00015965540961245363,
      "loss": 8.8285,
      "step": 25888,
      "throughput": 17834.786300777007
    },
    {
      "epoch": 0.40627277643241716,
      "grad_norm": 0.07456418126821518,
      "learning_rate": 0.0001593814825451803,
      "loss": 8.8426,
      "step": 25920,
      "throughput": 17834.88195853212
    },
    {
      "epoch": 0.4067743477613461,
      "grad_norm": 0.08726909011602402,
      "learning_rate": 0.00015910757864868967,
      "loss": 8.8489,
      "step": 25952,
      "throughput": 17835.3150599101
    },
    {
      "epoch": 0.407275919090275,
      "grad_norm": 0.07202770560979843,
      "learning_rate": 0.0001588336990525621,
      "loss": 8.8519,
      "step": 25984,
      "throughput": 17835.409316446654
    },
    {
      "epoch": 0.4077774904192039,
      "grad_norm": 0.07821185886859894,
      "learning_rate": 0.00015855984488627792,
      "loss": 8.857,
      "step": 26016,
      "throughput": 17835.69390511231
    },
    {
      "epoch": 0.40827906174813283,
      "grad_norm": 0.07136929035186768,
      "learning_rate": 0.00015828601727921248,
      "loss": 8.8318,
      "step": 26048,
      "throughput": 17835.8249373439
    },
    {
      "epoch": 0.40878063307706175,
      "grad_norm": 0.08094098418951035,
      "learning_rate": 0.0001580122173606317,
      "loss": 8.8499,
      "step": 26080,
      "throughput": 17836.29278814811
    },
    {
      "epoch": 0.40928220440599067,
      "grad_norm": 0.06964492797851562,
      "learning_rate": 0.00015773844625968726,
      "loss": 8.8401,
      "step": 26112,
      "throughput": 17836.78874572426
    },
    {
      "epoch": 0.40978377573491953,
      "grad_norm": 0.07381853461265564,
      "learning_rate": 0.00015746470510541197,
      "loss": 8.8143,
      "step": 26144,
      "throughput": 17836.38282499106
    },
    {
      "epoch": 0.41028534706384845,
      "grad_norm": 0.07215822488069534,
      "learning_rate": 0.00015719099502671516,
      "loss": 8.8268,
      "step": 26176,
      "throughput": 17836.13412965979
    },
    {
      "epoch": 0.41078691839277737,
      "grad_norm": 0.08016388863325119,
      "learning_rate": 0.00015691731715237802,
      "loss": 8.8207,
      "step": 26208,
      "throughput": 17836.171230961085
    },
    {
      "epoch": 0.4112884897217063,
      "grad_norm": 0.06681036204099655,
      "learning_rate": 0.00015664367261104887,
      "loss": 8.8447,
      "step": 26240,
      "throughput": 17836.36960626293
    },
    {
      "epoch": 0.4117900610506352,
      "grad_norm": 0.07530402392148972,
      "learning_rate": 0.00015637006253123865,
      "loss": 8.8343,
      "step": 26272,
      "throughput": 17836.616848521204
    },
    {
      "epoch": 0.4122916323795641,
      "grad_norm": 0.08355733752250671,
      "learning_rate": 0.00015609648804131612,
      "loss": 8.8149,
      "step": 26304,
      "throughput": 17837.019406655218
    },
    {
      "epoch": 0.41279320370849304,
      "grad_norm": 0.07137879729270935,
      "learning_rate": 0.00015582295026950332,
      "loss": 8.8359,
      "step": 26336,
      "throughput": 17837.043601108417
    },
    {
      "epoch": 0.4132947750374219,
      "grad_norm": 0.0776265487074852,
      "learning_rate": 0.00015554945034387075,
      "loss": 8.8286,
      "step": 26368,
      "throughput": 17837.503639803843
    },
    {
      "epoch": 0.4137963463663508,
      "grad_norm": 0.07875608652830124,
      "learning_rate": 0.00015527598939233303,
      "loss": 8.8341,
      "step": 26400,
      "throughput": 17837.96173423606
    },
    {
      "epoch": 0.41429791769527974,
      "grad_norm": 0.07496542483568192,
      "learning_rate": 0.00015500256854264385,
      "loss": 8.8242,
      "step": 26432,
      "throughput": 17837.46333688345
    },
    {
      "epoch": 0.41479948902420866,
      "grad_norm": 0.07084860652685165,
      "learning_rate": 0.00015472918892239166,
      "loss": 8.8373,
      "step": 26464,
      "throughput": 17837.6371879048
    },
    {
      "epoch": 0.4153010603531376,
      "grad_norm": 0.07146702706813812,
      "learning_rate": 0.00015445585165899475,
      "loss": 8.8324,
      "step": 26496,
      "throughput": 17837.419916645686
    },
    {
      "epoch": 0.4158026316820665,
      "grad_norm": 0.07583361119031906,
      "learning_rate": 0.00015418255787969692,
      "loss": 8.8297,
      "step": 26528,
      "throughput": 17837.692998535804
    },
    {
      "epoch": 0.4163042030109954,
      "grad_norm": 0.06981997191905975,
      "learning_rate": 0.0001539093087115624,
      "loss": 8.8494,
      "step": 26560,
      "throughput": 17837.738224969555
    },
    {
      "epoch": 0.4168057743399243,
      "grad_norm": 0.07170698791742325,
      "learning_rate": 0.00015363610528147163,
      "loss": 8.8394,
      "step": 26592,
      "throughput": 17837.976665315815
    },
    {
      "epoch": 0.4173073456688532,
      "grad_norm": 0.07302544265985489,
      "learning_rate": 0.00015336294871611637,
      "loss": 8.8187,
      "step": 26624,
      "throughput": 17838.15509512615
    },
    {
      "epoch": 0.4178089169977821,
      "grad_norm": 0.07223688811063766,
      "learning_rate": 0.00015308984014199511,
      "loss": 8.8316,
      "step": 26656,
      "throughput": 17836.936852385057
    },
    {
      "epoch": 0.418310488326711,
      "grad_norm": 0.08481582999229431,
      "learning_rate": 0.00015281678068540836,
      "loss": 8.8157,
      "step": 26688,
      "throughput": 17837.39188111627
    },
    {
      "epoch": 0.41881205965563995,
      "grad_norm": 0.07136400043964386,
      "learning_rate": 0.00015254377147245424,
      "loss": 8.8116,
      "step": 26720,
      "throughput": 17837.555422270147
    },
    {
      "epoch": 0.41931363098456886,
      "grad_norm": 0.08144687861204147,
      "learning_rate": 0.00015227081362902343,
      "loss": 8.84,
      "step": 26752,
      "throughput": 17837.480406306255
    },
    {
      "epoch": 0.4198152023134978,
      "grad_norm": 0.07405370473861694,
      "learning_rate": 0.000151997908280795,
      "loss": 8.8126,
      "step": 26784,
      "throughput": 17837.0972853071
    },
    {
      "epoch": 0.42031677364242664,
      "grad_norm": 0.07629577070474625,
      "learning_rate": 0.0001517250565532313,
      "loss": 8.8234,
      "step": 26816,
      "throughput": 17837.121182248913
    },
    {
      "epoch": 0.42081834497135556,
      "grad_norm": 0.0791485458612442,
      "learning_rate": 0.00015145225957157373,
      "loss": 8.8142,
      "step": 26848,
      "throughput": 17837.28745575894
    },
    {
      "epoch": 0.4213199163002845,
      "grad_norm": 0.07299696654081345,
      "learning_rate": 0.00015117951846083786,
      "loss": 8.8262,
      "step": 26880,
      "throughput": 17837.537646571815
    },
    {
      "epoch": 0.4218214876292134,
      "grad_norm": 0.07210979610681534,
      "learning_rate": 0.0001509068343458088,
      "loss": 8.8114,
      "step": 26912,
      "throughput": 17837.956699491104
    },
    {
      "epoch": 0.4223230589581423,
      "grad_norm": 0.07374562323093414,
      "learning_rate": 0.00015063420835103667,
      "loss": 8.8362,
      "step": 26944,
      "throughput": 17837.984902015047
    },
    {
      "epoch": 0.42282463028707123,
      "grad_norm": 0.07789213955402374,
      "learning_rate": 0.0001503616416008319,
      "loss": 8.8372,
      "step": 26976,
      "throughput": 17838.440082370682
    },
    {
      "epoch": 0.42332620161600015,
      "grad_norm": 0.07159041613340378,
      "learning_rate": 0.00015008913521926052,
      "loss": 8.8298,
      "step": 27008,
      "throughput": 17838.896224178727
    },
    {
      "epoch": 0.423827772944929,
      "grad_norm": 0.06973559409379959,
      "learning_rate": 0.00014981669033013972,
      "loss": 8.8185,
      "step": 27040,
      "throughput": 17838.25389198717
    },
    {
      "epoch": 0.42432934427385793,
      "grad_norm": 0.07604040205478668,
      "learning_rate": 0.00014954430805703302,
      "loss": 8.7998,
      "step": 27072,
      "throughput": 17838.339137952018
    },
    {
      "epoch": 0.42483091560278685,
      "grad_norm": 0.07852691411972046,
      "learning_rate": 0.00014927198952324568,
      "loss": 8.7969,
      "step": 27104,
      "throughput": 17838.075705595023
    },
    {
      "epoch": 0.42533248693171577,
      "grad_norm": 0.07022686302661896,
      "learning_rate": 0.00014899973585182012,
      "loss": 8.8271,
      "step": 27136,
      "throughput": 17838.480286838687
    },
    {
      "epoch": 0.4258340582606447,
      "grad_norm": 0.07396269589662552,
      "learning_rate": 0.00014872754816553141,
      "loss": 8.8041,
      "step": 27168,
      "throughput": 17838.354805028463
    },
    {
      "epoch": 0.4263356295895736,
      "grad_norm": 0.07898204028606415,
      "learning_rate": 0.00014845542758688222,
      "loss": 8.8419,
      "step": 27200,
      "throughput": 17838.761892172806
    },
    {
      "epoch": 0.42683720091850247,
      "grad_norm": 0.0686897560954094,
      "learning_rate": 0.00014818337523809876,
      "loss": 8.8243,
      "step": 27232,
      "throughput": 17838.944068222234
    },
    {
      "epoch": 0.4273387722474314,
      "grad_norm": 0.0748598575592041,
      "learning_rate": 0.0001479113922411256,
      "loss": 8.8166,
      "step": 27264,
      "throughput": 17839.222390933104
    },
    {
      "epoch": 0.4278403435763603,
      "grad_norm": 0.07616803795099258,
      "learning_rate": 0.00014763947971762153,
      "loss": 8.811,
      "step": 27296,
      "throughput": 17839.67488400504
    },
    {
      "epoch": 0.4283419149052892,
      "grad_norm": 0.07271240651607513,
      "learning_rate": 0.00014736763878895457,
      "loss": 8.8095,
      "step": 27328,
      "throughput": 17839.756404138443
    },
    {
      "epoch": 0.42884348623421814,
      "grad_norm": 0.07320531457662582,
      "learning_rate": 0.00014709587057619748,
      "loss": 8.8312,
      "step": 27360,
      "throughput": 17839.60933717741
    },
    {
      "epoch": 0.42934505756314706,
      "grad_norm": 0.07407976686954498,
      "learning_rate": 0.0001468241762001232,
      "loss": 8.8142,
      "step": 27392,
      "throughput": 17839.139047725068
    },
    {
      "epoch": 0.429846628892076,
      "grad_norm": 0.07086745649576187,
      "learning_rate": 0.00014655255678120015,
      "loss": 8.8125,
      "step": 27424,
      "throughput": 17839.268285762308
    },
    {
      "epoch": 0.43034820022100484,
      "grad_norm": 0.08092939853668213,
      "learning_rate": 0.0001462810134395876,
      "loss": 8.7988,
      "step": 27456,
      "throughput": 17839.440178719484
    },
    {
      "epoch": 0.43084977154993376,
      "grad_norm": 0.07382786273956299,
      "learning_rate": 0.0001460095472951311,
      "loss": 8.8329,
      "step": 27488,
      "throughput": 17839.527812024506
    },
    {
      "epoch": 0.4313513428788627,
      "grad_norm": 0.07203220576047897,
      "learning_rate": 0.0001457381594673579,
      "loss": 8.8201,
      "step": 27520,
      "throughput": 17839.94144916028
    },
    {
      "epoch": 0.4318529142077916,
      "grad_norm": 0.07578306645154953,
      "learning_rate": 0.00014546685107547205,
      "loss": 8.8113,
      "step": 27552,
      "throughput": 17839.97989853177
    },
    {
      "epoch": 0.4323544855367205,
      "grad_norm": 0.0789005383849144,
      "learning_rate": 0.00014519562323835034,
      "loss": 8.8245,
      "step": 27584,
      "throughput": 17840.533749978855
    },
    {
      "epoch": 0.43285605686564943,
      "grad_norm": 0.08284716308116913,
      "learning_rate": 0.000144924477074537,
      "loss": 8.8109,
      "step": 27616,
      "throughput": 17840.8666391986
    },
    {
      "epoch": 0.43335762819457835,
      "grad_norm": 0.0708891823887825,
      "learning_rate": 0.00014465341370223977,
      "loss": 8.8163,
      "step": 27648,
      "throughput": 17840.34216738948
    },
    {
      "epoch": 0.4338591995235072,
      "grad_norm": 0.08031383901834488,
      "learning_rate": 0.00014438243423932476,
      "loss": 8.7976,
      "step": 27680,
      "throughput": 17840.27370184412
    },
    {
      "epoch": 0.43436077085243613,
      "grad_norm": 0.07380495965480804,
      "learning_rate": 0.00014411153980331198,
      "loss": 8.8106,
      "step": 27712,
      "throughput": 17840.149268080273
    },
    {
      "epoch": 0.43486234218136505,
      "grad_norm": 0.07391571253538132,
      "learning_rate": 0.00014384073151137104,
      "loss": 8.7863,
      "step": 27744,
      "throughput": 17840.4072067685
    },
    {
      "epoch": 0.43536391351029397,
      "grad_norm": 0.08950921148061752,
      "learning_rate": 0.00014357001048031603,
      "loss": 8.8092,
      "step": 27776,
      "throughput": 17840.50071104384
    },
    {
      "epoch": 0.4358654848392229,
      "grad_norm": 0.07263769209384918,
      "learning_rate": 0.00014329937782660136,
      "loss": 8.8016,
      "step": 27808,
      "throughput": 17840.89219816196
    },
    {
      "epoch": 0.4363670561681518,
      "grad_norm": 0.0754525363445282,
      "learning_rate": 0.00014302883466631676,
      "loss": 8.8272,
      "step": 27840,
      "throughput": 17841.071706232735
    },
    {
      "epoch": 0.4368686274970807,
      "grad_norm": 0.07414602488279343,
      "learning_rate": 0.0001427583821151832,
      "loss": 8.8118,
      "step": 27872,
      "throughput": 17841.343629060204
    },
    {
      "epoch": 0.4373701988260096,
      "grad_norm": 0.07408881932497025,
      "learning_rate": 0.0001424880212885477,
      "loss": 8.809,
      "step": 27904,
      "throughput": 17841.784914055388
    },
    {
      "epoch": 0.4378717701549385,
      "grad_norm": 0.08639470487833023,
      "learning_rate": 0.0001422177533013791,
      "loss": 8.8339,
      "step": 27936,
      "throughput": 17841.311274442694
    },
    {
      "epoch": 0.4383733414838674,
      "grad_norm": 0.07878611981868744,
      "learning_rate": 0.00014194757926826342,
      "loss": 8.8069,
      "step": 27968,
      "throughput": 17841.391454740937
    },
    {
      "epoch": 0.43887491281279634,
      "grad_norm": 0.07366620004177094,
      "learning_rate": 0.00014167750030339915,
      "loss": 8.8038,
      "step": 28000,
      "throughput": 17841.046629825334
    },
    {
      "epoch": 0.43937648414172525,
      "grad_norm": 0.07283695042133331,
      "learning_rate": 0.00014140751752059278,
      "loss": 8.7893,
      "step": 28032,
      "throughput": 17841.302318158687
    },
    {
      "epoch": 0.4398780554706542,
      "grad_norm": 0.0751243531703949,
      "learning_rate": 0.0001411376320332541,
      "loss": 8.8033,
      "step": 28064,
      "throughput": 17841.558918240662
    },
    {
      "epoch": 0.4403796267995831,
      "grad_norm": 0.07369616627693176,
      "learning_rate": 0.0001408678449543916,
      "loss": 8.8206,
      "step": 28096,
      "throughput": 17841.637707693066
    },
    {
      "epoch": 0.44088119812851195,
      "grad_norm": 0.08011516183614731,
      "learning_rate": 0.00014059815739660806,
      "loss": 8.7958,
      "step": 28128,
      "throughput": 17841.94797571962
    },
    {
      "epoch": 0.44138276945744087,
      "grad_norm": 0.07943718135356903,
      "learning_rate": 0.00014032857047209573,
      "loss": 8.7998,
      "step": 28160,
      "throughput": 17842.07065405487
    },
    {
      "epoch": 0.4418843407863698,
      "grad_norm": 0.07318083196878433,
      "learning_rate": 0.0001400590852926319,
      "loss": 8.7977,
      "step": 28192,
      "throughput": 17842.61186513062
    },
    {
      "epoch": 0.4423859121152987,
      "grad_norm": 0.07628615945577621,
      "learning_rate": 0.00013978970296957423,
      "loss": 8.7923,
      "step": 28224,
      "throughput": 17842.56852902371
    },
    {
      "epoch": 0.4428874834442276,
      "grad_norm": 0.07363478094339371,
      "learning_rate": 0.00013952042461385625,
      "loss": 8.7956,
      "step": 28256,
      "throughput": 17842.437356485003
    },
    {
      "epoch": 0.44338905477315654,
      "grad_norm": 0.07090167701244354,
      "learning_rate": 0.00013925125133598266,
      "loss": 8.799,
      "step": 28288,
      "throughput": 17842.061185790088
    },
    {
      "epoch": 0.44389062610208546,
      "grad_norm": 0.07324929535388947,
      "learning_rate": 0.0001389821842460249,
      "loss": 8.8019,
      "step": 28320,
      "throughput": 17842.027593597886
    },
    {
      "epoch": 0.4443921974310143,
      "grad_norm": 0.09164309501647949,
      "learning_rate": 0.00013871322445361642,
      "loss": 8.8076,
      "step": 28352,
      "throughput": 17842.283044428117
    },
    {
      "epoch": 0.44489376875994324,
      "grad_norm": 0.07631401717662811,
      "learning_rate": 0.00013844437306794822,
      "loss": 8.8011,
      "step": 28384,
      "throughput": 17842.51556671752
    },
    {
      "epoch": 0.44539534008887216,
      "grad_norm": 0.07143891602754593,
      "learning_rate": 0.00013817563119776415,
      "loss": 8.7796,
      "step": 28416,
      "throughput": 17842.907716703598
    },
    {
      "epoch": 0.4458969114178011,
      "grad_norm": 0.07868574559688568,
      "learning_rate": 0.00013790699995135658,
      "loss": 8.7872,
      "step": 28448,
      "throughput": 17843.080127484805
    },
    {
      "epoch": 0.44639848274673,
      "grad_norm": 0.07218185067176819,
      "learning_rate": 0.00013763848043656148,
      "loss": 8.8027,
      "step": 28480,
      "throughput": 17843.50201491266
    },
    {
      "epoch": 0.4469000540756589,
      "grad_norm": 0.0759633257985115,
      "learning_rate": 0.00013737007376075414,
      "loss": 8.7821,
      "step": 28512,
      "throughput": 17843.828462314785
    },
    {
      "epoch": 0.44740162540458783,
      "grad_norm": 0.06998586654663086,
      "learning_rate": 0.0001371017810308445,
      "loss": 8.7811,
      "step": 28544,
      "throughput": 17843.47047588364
    },
    {
      "epoch": 0.4479031967335167,
      "grad_norm": 0.07746990770101547,
      "learning_rate": 0.00013683360335327264,
      "loss": 8.8143,
      "step": 28576,
      "throughput": 17843.451209079303
    },
    {
      "epoch": 0.4484047680624456,
      "grad_norm": 0.07679734379053116,
      "learning_rate": 0.000136565541834004,
      "loss": 8.7907,
      "step": 28608,
      "throughput": 17843.31445435224
    },
    {
      "epoch": 0.44890633939137453,
      "grad_norm": 0.08982423692941666,
      "learning_rate": 0.00013629759757852512,
      "loss": 8.7976,
      "step": 28640,
      "throughput": 17843.562150233083
    },
    {
      "epoch": 0.44940791072030345,
      "grad_norm": 0.07646424323320389,
      "learning_rate": 0.00013602977169183884,
      "loss": 8.7614,
      "step": 28672,
      "throughput": 17843.535464475917
    },
    {
      "epoch": 0.44990948204923237,
      "grad_norm": 0.0840567946434021,
      "learning_rate": 0.00013576206527846004,
      "loss": 8.7836,
      "step": 28704,
      "throughput": 17842.371375433257
    },
    {
      "epoch": 0.4504110533781613,
      "grad_norm": 0.07550926506519318,
      "learning_rate": 0.00013549447944241066,
      "loss": 8.7972,
      "step": 28736,
      "throughput": 17842.5410602527
    },
    {
      "epoch": 0.4509126247070902,
      "grad_norm": 0.0738803967833519,
      "learning_rate": 0.00013522701528721553,
      "loss": 8.7884,
      "step": 28768,
      "throughput": 17842.826906183545
    },
    {
      "epoch": 0.45141419603601907,
      "grad_norm": 0.07417251914739609,
      "learning_rate": 0.00013495967391589757,
      "loss": 8.7819,
      "step": 28800,
      "throughput": 17843.3510901284
    },
    {
      "epoch": 0.451915767364948,
      "grad_norm": 0.07992105185985565,
      "learning_rate": 0.00013469245643097345,
      "loss": 8.7829,
      "step": 28832,
      "throughput": 17843.309647376955
    },
    {
      "epoch": 0.4524173386938769,
      "grad_norm": 0.0737965926527977,
      "learning_rate": 0.0001344253639344488,
      "loss": 8.7909,
      "step": 28864,
      "throughput": 17843.325713250953
    },
    {
      "epoch": 0.4529189100228058,
      "grad_norm": 0.07945975661277771,
      "learning_rate": 0.00013415839752781392,
      "loss": 8.7935,
      "step": 28896,
      "throughput": 17842.819398840486
    },
    {
      "epoch": 0.45342048135173474,
      "grad_norm": 0.07636261731386185,
      "learning_rate": 0.00013389155831203904,
      "loss": 8.7946,
      "step": 28928,
      "throughput": 17842.91498224683
    },
    {
      "epoch": 0.45392205268066366,
      "grad_norm": 0.0748453214764595,
      "learning_rate": 0.0001336248473875699,
      "loss": 8.7897,
      "step": 28960,
      "throughput": 17843.16853929137
    },
    {
      "epoch": 0.4544236240095926,
      "grad_norm": 0.08027852326631546,
      "learning_rate": 0.00013335826585432313,
      "loss": 8.7805,
      "step": 28992,
      "throughput": 17843.244006607554
    },
    {
      "epoch": 0.45492519533852144,
      "grad_norm": 0.07624673843383789,
      "learning_rate": 0.00013309181481168173,
      "loss": 8.7804,
      "step": 29024,
      "throughput": 17843.624131799017
    },
    {
      "epoch": 0.45542676666745036,
      "grad_norm": 0.07282182574272156,
      "learning_rate": 0.00013282549535849065,
      "loss": 8.7779,
      "step": 29056,
      "throughput": 17843.67300853686
    },
    {
      "epoch": 0.4559283379963793,
      "grad_norm": 0.0720539316534996,
      "learning_rate": 0.00013255930859305205,
      "loss": 8.7718,
      "step": 29088,
      "throughput": 17844.06408514807
    },
    {
      "epoch": 0.4564299093253082,
      "grad_norm": 0.07573386281728745,
      "learning_rate": 0.000132293255613121,
      "loss": 8.7923,
      "step": 29120,
      "throughput": 17844.48076906447
    },
    {
      "epoch": 0.4569314806542371,
      "grad_norm": 0.0778200700879097,
      "learning_rate": 0.00013202733751590067,
      "loss": 8.7759,
      "step": 29152,
      "throughput": 17844.122697295465
    },
    {
      "epoch": 0.45743305198316603,
      "grad_norm": 0.07196016609668732,
      "learning_rate": 0.00013176155539803818,
      "loss": 8.7838,
      "step": 29184,
      "throughput": 17843.96467194935
    },
    {
      "epoch": 0.45793462331209495,
      "grad_norm": 0.07072978466749191,
      "learning_rate": 0.00013149591035561977,
      "loss": 8.7751,
      "step": 29216,
      "throughput": 17843.97033476635
    },
    {
      "epoch": 0.4584361946410238,
      "grad_norm": 0.07341047376394272,
      "learning_rate": 0.00013123040348416633,
      "loss": 8.7723,
      "step": 29248,
      "throughput": 17844.07352378656
    },
    {
      "epoch": 0.4589377659699527,
      "grad_norm": 0.07667620480060577,
      "learning_rate": 0.00013096503587862906,
      "loss": 8.7972,
      "step": 29280,
      "throughput": 17844.010552861248
    },
    {
      "epoch": 0.45943933729888164,
      "grad_norm": 0.07243162393569946,
      "learning_rate": 0.00013069980863338466,
      "loss": 8.7733,
      "step": 29312,
      "throughput": 17844.385719318707
    },
    {
      "epoch": 0.45994090862781056,
      "grad_norm": 0.07885871082544327,
      "learning_rate": 0.00013043472284223113,
      "loss": 8.7854,
      "step": 29344,
      "throughput": 17844.54295589396
    },
    {
      "epoch": 0.4604424799567395,
      "grad_norm": 0.0747121199965477,
      "learning_rate": 0.00013016977959838305,
      "loss": 8.7859,
      "step": 29376,
      "throughput": 17844.818108001928
    },
    {
      "epoch": 0.4609440512856684,
      "grad_norm": 0.07451453059911728,
      "learning_rate": 0.00012990497999446714,
      "loss": 8.7836,
      "step": 29408,
      "throughput": 17845.338332184892
    },
    {
      "epoch": 0.4614456226145973,
      "grad_norm": 0.07636185735464096,
      "learning_rate": 0.00012964032512251773,
      "loss": 8.7831,
      "step": 29440,
      "throughput": 17844.991803756493
    },
    {
      "epoch": 0.4619471939435262,
      "grad_norm": 0.07705360651016235,
      "learning_rate": 0.00012937581607397236,
      "loss": 8.7793,
      "step": 29472,
      "throughput": 17845.200047982493
    },
    {
      "epoch": 0.4624487652724551,
      "grad_norm": 0.07874922454357147,
      "learning_rate": 0.00012911145393966703,
      "loss": 8.7927,
      "step": 29504,
      "throughput": 17844.878648881604
    },
    {
      "epoch": 0.462950336601384,
      "grad_norm": 0.06849963217973709,
      "learning_rate": 0.00012884723980983206,
      "loss": 8.7843,
      "step": 29536,
      "throughput": 17844.968116187938
    },
    {
      "epoch": 0.46345190793031293,
      "grad_norm": 0.0721622183918953,
      "learning_rate": 0.00012858317477408728,
      "loss": 8.7883,
      "step": 29568,
      "throughput": 17845.21399269392
    },
    {
      "epoch": 0.46395347925924185,
      "grad_norm": 0.07688286155462265,
      "learning_rate": 0.00012831925992143765,
      "loss": 8.7926,
      "step": 29600,
      "throughput": 17845.137648151827
    },
    {
      "epoch": 0.46445505058817077,
      "grad_norm": 0.07621218264102936,
      "learning_rate": 0.00012805549634026882,
      "loss": 8.7775,
      "step": 29632,
      "throughput": 17845.574416407122
    },
    {
      "epoch": 0.4649566219170997,
      "grad_norm": 0.07827426493167877,
      "learning_rate": 0.00012779188511834256,
      "loss": 8.7888,
      "step": 29664,
      "throughput": 17845.693004355682
    },
    {
      "epoch": 0.46545819324602855,
      "grad_norm": 0.0747198611497879,
      "learning_rate": 0.00012752842734279238,
      "loss": 8.7684,
      "step": 29696,
      "throughput": 17845.998682074493
    },
    {
      "epoch": 0.46595976457495747,
      "grad_norm": 0.07569344341754913,
      "learning_rate": 0.0001272651241001189,
      "loss": 8.7821,
      "step": 29728,
      "throughput": 17845.9520682368
    },
    {
      "epoch": 0.4664613359038864,
      "grad_norm": 0.07907281070947647,
      "learning_rate": 0.00012700197647618549,
      "loss": 8.7905,
      "step": 29760,
      "throughput": 17845.94405368657
    },
    {
      "epoch": 0.4669629072328153,
      "grad_norm": 0.07393976300954819,
      "learning_rate": 0.00012673898555621373,
      "loss": 8.779,
      "step": 29792,
      "throughput": 17845.896590258762
    },
    {
      "epoch": 0.4674644785617442,
      "grad_norm": 0.07429799437522888,
      "learning_rate": 0.00012647615242477887,
      "loss": 8.7434,
      "step": 29824,
      "throughput": 17845.9036226395
    },
    {
      "epoch": 0.46796604989067314,
      "grad_norm": 0.07908093184232712,
      "learning_rate": 0.0001262134781658056,
      "loss": 8.7652,
      "step": 29856,
      "throughput": 17846.01233548715
    },
    {
      "epoch": 0.46846762121960206,
      "grad_norm": 0.08475719392299652,
      "learning_rate": 0.00012595096386256336,
      "loss": 8.7671,
      "step": 29888,
      "throughput": 17846.08677863954
    },
    {
      "epoch": 0.4689691925485309,
      "grad_norm": 0.07956618070602417,
      "learning_rate": 0.0001256886105976619,
      "loss": 8.7915,
      "step": 29920,
      "throughput": 17846.44840357302
    },
    {
      "epoch": 0.46947076387745984,
      "grad_norm": 0.07686559110879898,
      "learning_rate": 0.0001254264194530468,
      "loss": 8.7806,
      "step": 29952,
      "throughput": 17846.60226272647
    },
    {
      "epoch": 0.46997233520638876,
      "grad_norm": 0.08286672830581665,
      "learning_rate": 0.00012516439150999525,
      "loss": 8.7785,
      "step": 29984,
      "throughput": 17846.87335613637
    },
    {
      "epoch": 0.4704739065353177,
      "grad_norm": 0.06802724301815033,
      "learning_rate": 0.00012490252784911113,
      "loss": 8.7459,
      "step": 30016,
      "throughput": 17847.064497361553
    },
    {
      "epoch": 0.4709754778642466,
      "grad_norm": 0.07914753258228302,
      "learning_rate": 0.000124640829550321,
      "loss": 8.7634,
      "step": 30048,
      "throughput": 17846.685453391063
    },
    {
      "epoch": 0.4714770491931755,
      "grad_norm": 0.0800432562828064,
      "learning_rate": 0.00012437929769286942,
      "loss": 8.7673,
      "step": 30080,
      "throughput": 17846.8999518748
    },
    {
      "epoch": 0.47197862052210443,
      "grad_norm": 0.07590536773204803,
      "learning_rate": 0.0001241179333553146,
      "loss": 8.7833,
      "step": 30112,
      "throughput": 17846.69810252754
    },
    {
      "epoch": 0.4724801918510333,
      "grad_norm": 0.07159872353076935,
      "learning_rate": 0.00012385673761552374,
      "loss": 8.7631,
      "step": 30144,
      "throughput": 17846.939274740744
    },
    {
      "epoch": 0.4729817631799622,
      "grad_norm": 0.07268593460321426,
      "learning_rate": 0.00012359571155066894,
      "loss": 8.7725,
      "step": 30176,
      "throughput": 17846.890446191093
    },
    {
      "epoch": 0.47348333450889113,
      "grad_norm": 0.10743585973978043,
      "learning_rate": 0.00012333485623722238,
      "loss": 8.7883,
      "step": 30208,
      "throughput": 17847.09068368804
    },
    {
      "epoch": 0.47398490583782005,
      "grad_norm": 0.07458017021417618,
      "learning_rate": 0.00012307417275095222,
      "loss": 8.7676,
      "step": 30240,
      "throughput": 17847.50596240503
    },
    {
      "epoch": 0.47448647716674897,
      "grad_norm": 0.06924610584974289,
      "learning_rate": 0.00012281366216691786,
      "loss": 8.7535,
      "step": 30272,
      "throughput": 17847.500111783294
    },
    {
      "epoch": 0.4749880484956779,
      "grad_norm": 0.07047650963068008,
      "learning_rate": 0.00012255332555946582,
      "loss": 8.7518,
      "step": 30304,
      "throughput": 17847.917419369074
    },
    {
      "epoch": 0.4754896198246068,
      "grad_norm": 0.07766906172037125,
      "learning_rate": 0.00012229316400222493,
      "loss": 8.7787,
      "step": 30336,
      "throughput": 17847.72215626349
    },
    {
      "epoch": 0.47599119115353566,
      "grad_norm": 0.07250562310218811,
      "learning_rate": 0.00012203317856810232,
      "loss": 8.7749,
      "step": 30368,
      "throughput": 17847.734631924355
    },
    {
      "epoch": 0.4764927624824646,
      "grad_norm": 0.07328185439109802,
      "learning_rate": 0.0001217733703292786,
      "loss": 8.7513,
      "step": 30400,
      "throughput": 17847.613508866787
    },
    {
      "epoch": 0.4769943338113935,
      "grad_norm": 0.07611878961324692,
      "learning_rate": 0.0001215137403572038,
      "loss": 8.765,
      "step": 30432,
      "throughput": 17847.5771101991
    },
    {
      "epoch": 0.4774959051403224,
      "grad_norm": 0.08048366755247116,
      "learning_rate": 0.00012125428972259264,
      "loss": 8.7593,
      "step": 30464,
      "throughput": 17847.820835981867
    },
    {
      "epoch": 0.47799747646925134,
      "grad_norm": 0.0731780081987381,
      "learning_rate": 0.0001209950194954203,
      "loss": 8.7708,
      "step": 30496,
      "throughput": 17847.883866344233
    },
    {
      "epoch": 0.47849904779818025,
      "grad_norm": 0.07422348856925964,
      "learning_rate": 0.00012073593074491802,
      "loss": 8.7856,
      "step": 30528,
      "throughput": 17848.238168548705
    },
    {
      "epoch": 0.4790006191271092,
      "grad_norm": 0.0718270093202591,
      "learning_rate": 0.0001204770245395685,
      "loss": 8.776,
      "step": 30560,
      "throughput": 17848.273120260463
    },
    {
      "epoch": 0.47950219045603804,
      "grad_norm": 0.07511728256940842,
      "learning_rate": 0.00012021830194710178,
      "loss": 8.7618,
      "step": 30592,
      "throughput": 17848.62970918359
    },
    {
      "epoch": 0.48000376178496695,
      "grad_norm": 0.07711270451545715,
      "learning_rate": 0.00011995976403449054,
      "loss": 8.7615,
      "step": 30624,
      "throughput": 17848.659560032636
    },
    {
      "epoch": 0.48050533311389587,
      "grad_norm": 0.07422070950269699,
      "learning_rate": 0.00011970141186794592,
      "loss": 8.7722,
      "step": 30656,
      "throughput": 17848.36810050533
    },
    {
      "epoch": 0.4810069044428248,
      "grad_norm": 0.09282661974430084,
      "learning_rate": 0.00011944324651291299,
      "loss": 8.7454,
      "step": 30688,
      "throughput": 17848.450950218277
    },
    {
      "epoch": 0.4815084757717537,
      "grad_norm": 0.07340681552886963,
      "learning_rate": 0.00011918526903406647,
      "loss": 8.7462,
      "step": 30720,
      "throughput": 17848.471110200808
    },
    {
      "epoch": 0.4820100471006826,
      "grad_norm": 0.07820796221494675,
      "learning_rate": 0.0001189274804953063,
      "loss": 8.7661,
      "step": 30752,
      "throughput": 17847.300770419086
    },
    {
      "epoch": 0.48251161842961154,
      "grad_norm": 0.07713815569877625,
      "learning_rate": 0.00011866988195975307,
      "loss": 8.7602,
      "step": 30784,
      "throughput": 17847.37876156958
    },
    {
      "epoch": 0.4830131897585404,
      "grad_norm": 0.07636034488677979,
      "learning_rate": 0.00011841247448974398,
      "loss": 8.7614,
      "step": 30816,
      "throughput": 17847.583265104164
    },
    {
      "epoch": 0.4835147610874693,
      "grad_norm": 0.07851573824882507,
      "learning_rate": 0.00011815525914682817,
      "loss": 8.7412,
      "step": 30848,
      "throughput": 17847.858389147463
    },
    {
      "epoch": 0.48401633241639824,
      "grad_norm": 0.07513809949159622,
      "learning_rate": 0.00011789823699176249,
      "loss": 8.7635,
      "step": 30880,
      "throughput": 17847.989358915052
    },
    {
      "epoch": 0.48451790374532716,
      "grad_norm": 0.0752979964017868,
      "learning_rate": 0.00011764140908450703,
      "loss": 8.7519,
      "step": 30912,
      "throughput": 17848.399698441695
    },
    {
      "epoch": 0.4850194750742561,
      "grad_norm": 0.07560551166534424,
      "learning_rate": 0.0001173847764842209,
      "loss": 8.7653,
      "step": 30944,
      "throughput": 17847.92848888303
    },
    {
      "epoch": 0.485521046403185,
      "grad_norm": 0.07036852836608887,
      "learning_rate": 0.00011712834024925766,
      "loss": 8.7608,
      "step": 30976,
      "throughput": 17848.211029288916
    },
    {
      "epoch": 0.4860226177321139,
      "grad_norm": 0.07992174476385117,
      "learning_rate": 0.00011687210143716116,
      "loss": 8.746,
      "step": 31008,
      "throughput": 17847.930009877455
    },
    {
      "epoch": 0.4865241890610428,
      "grad_norm": 0.07941529154777527,
      "learning_rate": 0.00011661606110466095,
      "loss": 8.7518,
      "step": 31040,
      "throughput": 17848.006630520344
    },
    {
      "epoch": 0.4870257603899717,
      "grad_norm": 0.07616245001554489,
      "learning_rate": 0.00011636022030766818,
      "loss": 8.7637,
      "step": 31072,
      "throughput": 17848.241233007302
    },
    {
      "epoch": 0.4875273317189006,
      "grad_norm": 0.0720774233341217,
      "learning_rate": 0.00011610458010127093,
      "loss": 8.7527,
      "step": 31104,
      "throughput": 17848.29619784518
    },
    {
      "epoch": 0.48802890304782953,
      "grad_norm": 0.07582742720842361,
      "learning_rate": 0.00011584914153973036,
      "loss": 8.7721,
      "step": 31136,
      "throughput": 17848.647265738666
    },
    {
      "epoch": 0.48853047437675845,
      "grad_norm": 0.07284478098154068,
      "learning_rate": 0.00011559390567647571,
      "loss": 8.7476,
      "step": 31168,
      "throughput": 17848.683273518756
    },
    {
      "epoch": 0.48903204570568737,
      "grad_norm": 0.07436570525169373,
      "learning_rate": 0.00011533887356410052,
      "loss": 8.7572,
      "step": 31200,
      "throughput": 17849.02940449858
    },
    {
      "epoch": 0.4895336170346163,
      "grad_norm": 0.07970885187387466,
      "learning_rate": 0.00011508404625435791,
      "loss": 8.7596,
      "step": 31232,
      "throughput": 17849.075126584816
    },
    {
      "epoch": 0.49003518836354515,
      "grad_norm": 0.07180914282798767,
      "learning_rate": 0.00011482942479815651,
      "loss": 8.7399,
      "step": 31264,
      "throughput": 17848.82705998219
    },
    {
      "epoch": 0.49053675969247407,
      "grad_norm": 0.096111960709095,
      "learning_rate": 0.00011457501024555593,
      "loss": 8.7578,
      "step": 31296,
      "throughput": 17848.779338285327
    },
    {
      "epoch": 0.491038331021403,
      "grad_norm": 0.07298924773931503,
      "learning_rate": 0.00011432080364576256,
      "loss": 8.7366,
      "step": 31328,
      "throughput": 17848.776528847295
    },
    {
      "epoch": 0.4915399023503319,
      "grad_norm": 0.07969135046005249,
      "learning_rate": 0.00011406680604712517,
      "loss": 8.7565,
      "step": 31360,
      "throughput": 17849.00646660255
    },
    {
      "epoch": 0.4920414736792608,
      "grad_norm": 0.06794929504394531,
      "learning_rate": 0.00011381301849713059,
      "loss": 8.7557,
      "step": 31392,
      "throughput": 17848.946513776882
    },
    {
      "epoch": 0.49254304500818974,
      "grad_norm": 0.0790972039103508,
      "learning_rate": 0.00011355944204239944,
      "loss": 8.7557,
      "step": 31424,
      "throughput": 17849.23098082752
    },
    {
      "epoch": 0.4930446163371186,
      "grad_norm": 0.07518981397151947,
      "learning_rate": 0.0001133060777286818,
      "loss": 8.7445,
      "step": 31456,
      "throughput": 17849.504941230134
    },
    {
      "epoch": 0.4935461876660475,
      "grad_norm": 0.08620373904705048,
      "learning_rate": 0.00011305292660085278,
      "loss": 8.738,
      "step": 31488,
      "throughput": 17849.625420218974
    },
    {
      "epoch": 0.49404775899497644,
      "grad_norm": 0.07720964401960373,
      "learning_rate": 0.00011279998970290844,
      "loss": 8.7659,
      "step": 31520,
      "throughput": 17850.015704816262
    },
    {
      "epoch": 0.49454933032390536,
      "grad_norm": 0.07925351709127426,
      "learning_rate": 0.0001125472680779613,
      "loss": 8.7505,
      "step": 31552,
      "throughput": 17849.624096047148
    },
    {
      "epoch": 0.4950509016528343,
      "grad_norm": 0.07546688616275787,
      "learning_rate": 0.00011229476276823608,
      "loss": 8.7393,
      "step": 31584,
      "throughput": 17849.69202524607
    },
    {
      "epoch": 0.4955524729817632,
      "grad_norm": 0.07865744829177856,
      "learning_rate": 0.00011204247481506535,
      "loss": 8.7376,
      "step": 31616,
      "throughput": 17849.62718762721
    },
    {
      "epoch": 0.4960540443106921,
      "grad_norm": 0.08163941651582718,
      "learning_rate": 0.00011179040525888552,
      "loss": 8.7449,
      "step": 31648,
      "throughput": 17849.711629491485
    },
    {
      "epoch": 0.496555615639621,
      "grad_norm": 0.0784422978758812,
      "learning_rate": 0.00011153855513923207,
      "loss": 8.7327,
      "step": 31680,
      "throughput": 17849.93584187844
    },
    {
      "epoch": 0.4970571869685499,
      "grad_norm": 0.07680987566709518,
      "learning_rate": 0.00011128692549473568,
      "loss": 8.7538,
      "step": 31712,
      "throughput": 17849.852166393823
    },
    {
      "epoch": 0.4975587582974788,
      "grad_norm": 0.07829587161540985,
      "learning_rate": 0.00011103551736311777,
      "loss": 8.737,
      "step": 31744,
      "throughput": 17850.173752304592
    },
    {
      "epoch": 0.4980603296264077,
      "grad_norm": 0.07024698704481125,
      "learning_rate": 0.0001107843317811862,
      "loss": 8.7312,
      "step": 31776,
      "throughput": 17850.1787368264
    },
    {
      "epoch": 0.49856190095533665,
      "grad_norm": 0.07402710616588593,
      "learning_rate": 0.00011053336978483102,
      "loss": 8.7585,
      "step": 31808,
      "throughput": 17850.64963191776
    },
    {
      "epoch": 0.49906347228426556,
      "grad_norm": 0.09956757724285126,
      "learning_rate": 0.00011028263240902033,
      "loss": 8.7274,
      "step": 31840,
      "throughput": 17850.708965571015
    },
    {
      "epoch": 0.4995650436131945,
      "grad_norm": 0.07462165504693985,
      "learning_rate": 0.0001100321206877957,
      "loss": 8.7316,
      "step": 31872,
      "throughput": 17850.575949234746
    },
    {
      "epoch": 0.5000666149421233,
      "grad_norm": 0.08074957132339478,
      "learning_rate": 0.00010978183565426832,
      "loss": 8.7449,
      "step": 31904,
      "throughput": 17850.43376010897
    },
    {
      "epoch": 0.5005681862710523,
      "grad_norm": 0.07831110805273056,
      "learning_rate": 0.00010953177834061435,
      "loss": 8.7583,
      "step": 31936,
      "throughput": 17850.51662941851
    },
    {
      "epoch": 0.5010697575999812,
      "grad_norm": 0.080203115940094,
      "learning_rate": 0.00010928194977807091,
      "loss": 8.7381,
      "step": 31968,
      "throughput": 17850.608029374747
    },
    {
      "epoch": 0.5015713289289101,
      "grad_norm": 0.076949343085289,
      "learning_rate": 0.00010903235099693174,
      "loss": 8.7264,
      "step": 32000,
      "throughput": 17850.67277233083
    },
    {
      "epoch": 0.502072900257839,
      "grad_norm": 0.07552843540906906,
      "learning_rate": 0.00010878298302654294,
      "loss": 8.7573,
      "step": 32032,
      "throughput": 17850.929673978953
    },
    {
      "epoch": 0.5025744715867679,
      "grad_norm": 0.0816895142197609,
      "learning_rate": 0.00010853384689529873,
      "loss": 8.7427,
      "step": 32064,
      "throughput": 17850.966147359748
    },
    {
      "epoch": 0.5030760429156969,
      "grad_norm": 0.07351840287446976,
      "learning_rate": 0.00010828494363063732,
      "loss": 8.7416,
      "step": 32096,
      "throughput": 17851.308279198405
    },
    {
      "epoch": 0.5035776142446258,
      "grad_norm": 0.10635778307914734,
      "learning_rate": 0.0001080362742590364,
      "loss": 8.7523,
      "step": 32128,
      "throughput": 17851.689731864455
    },
    {
      "epoch": 0.5040791855735547,
      "grad_norm": 0.07729581743478775,
      "learning_rate": 0.00010778783980600939,
      "loss": 8.7587,
      "step": 32160,
      "throughput": 17851.210394599766
    },
    {
      "epoch": 0.5045807569024836,
      "grad_norm": 0.07464662939310074,
      "learning_rate": 0.00010753964129610052,
      "loss": 8.7416,
      "step": 32192,
      "throughput": 17851.199375968507
    },
    {
      "epoch": 0.5050823282314125,
      "grad_norm": 0.07541036605834961,
      "learning_rate": 0.00010729167975288122,
      "loss": 8.7447,
      "step": 32224,
      "throughput": 17851.335817399635
    },
    {
      "epoch": 0.5055838995603413,
      "grad_norm": 0.08859999477863312,
      "learning_rate": 0.0001070439561989457,
      "loss": 8.7497,
      "step": 32256,
      "throughput": 17851.39972043009
    },
    {
      "epoch": 0.5060854708892703,
      "grad_norm": 0.077695332467556,
      "learning_rate": 0.00010679647165590659,
      "loss": 8.7294,
      "step": 32288,
      "throughput": 17851.360632048607
    },
    {
      "epoch": 0.5065870422181992,
      "grad_norm": 0.07810457050800323,
      "learning_rate": 0.00010654922714439083,
      "loss": 8.7304,
      "step": 32320,
      "throughput": 17851.553764208507
    },
    {
      "epoch": 0.5070886135471281,
      "grad_norm": 0.0815735012292862,
      "learning_rate": 0.00010630222368403561,
      "loss": 8.7205,
      "step": 32352,
      "throughput": 17851.862837365712
    },
    {
      "epoch": 0.507590184876057,
      "grad_norm": 0.07661870867013931,
      "learning_rate": 0.00010605546229348396,
      "loss": 8.7485,
      "step": 32384,
      "throughput": 17851.85927641683
    },
    {
      "epoch": 0.5080917562049859,
      "grad_norm": 0.07422985881567001,
      "learning_rate": 0.00010580894399038044,
      "loss": 8.7459,
      "step": 32416,
      "throughput": 17852.318482970753
    },
    {
      "epoch": 0.5085933275339148,
      "grad_norm": 0.07624523341655731,
      "learning_rate": 0.00010556266979136734,
      "loss": 8.7295,
      "step": 32448,
      "throughput": 17852.09570726072
    },
    {
      "epoch": 0.5090948988628438,
      "grad_norm": 0.07496540248394012,
      "learning_rate": 0.00010531664071208019,
      "loss": 8.7318,
      "step": 32480,
      "throughput": 17852.12903658571
    },
    {
      "epoch": 0.5095964701917727,
      "grad_norm": 0.07751981914043427,
      "learning_rate": 0.00010507085776714369,
      "loss": 8.7212,
      "step": 32512,
      "throughput": 17851.88275411316
    },
    {
      "epoch": 0.5100980415207016,
      "grad_norm": 0.07308991998434067,
      "learning_rate": 0.00010482532197016732,
      "loss": 8.7392,
      "step": 32544,
      "throughput": 17852.077468590258
    },
    {
      "epoch": 0.5105996128496305,
      "grad_norm": 0.07246117293834686,
      "learning_rate": 0.00010458003433374152,
      "loss": 8.731,
      "step": 32576,
      "throughput": 17852.16765980201
    },
    {
      "epoch": 0.5111011841785594,
      "grad_norm": 0.0796428844332695,
      "learning_rate": 0.00010433499586943319,
      "loss": 8.74,
      "step": 32608,
      "throughput": 17852.209242554592
    },
    {
      "epoch": 0.5116027555074883,
      "grad_norm": 0.07866678386926651,
      "learning_rate": 0.00010409020758778178,
      "loss": 8.7387,
      "step": 32640,
      "throughput": 17852.455320879468
    },
    {
      "epoch": 0.5121043268364173,
      "grad_norm": 0.07312561571598053,
      "learning_rate": 0.00010384567049829474,
      "loss": 8.7283,
      "step": 32672,
      "throughput": 17852.501019436742
    },
    {
      "epoch": 0.5126058981653461,
      "grad_norm": 0.07131937146186829,
      "learning_rate": 0.00010360138560944379,
      "loss": 8.7243,
      "step": 32704,
      "throughput": 17852.82589756935
    },
    {
      "epoch": 0.513107469494275,
      "grad_norm": 0.0743732750415802,
      "learning_rate": 0.00010335735392866061,
      "loss": 8.7208,
      "step": 32736,
      "throughput": 17852.87888747027
    },
    {
      "epoch": 0.5136090408232039,
      "grad_norm": 0.08018866181373596,
      "learning_rate": 0.00010311357646233255,
      "loss": 8.7379,
      "step": 32768,
      "throughput": 17852.64528871008
    },
    {
      "epoch": 0.5141106121521328,
      "grad_norm": 0.077493816614151,
      "learning_rate": 0.00010287005421579854,
      "loss": 8.7498,
      "step": 32800,
      "throughput": 17851.308068751474
    },
    {
      "epoch": 0.5146121834810617,
      "grad_norm": 0.07570572197437286,
      "learning_rate": 0.00010262678819334511,
      "loss": 8.7279,
      "step": 32832,
      "throughput": 17851.523873792336
    },
    {
      "epoch": 0.5151137548099907,
      "grad_norm": 0.08089492470026016,
      "learning_rate": 0.00010238377939820202,
      "loss": 8.734,
      "step": 32864,
      "throughput": 17851.60249667329
    },
    {
      "epoch": 0.5156153261389196,
      "grad_norm": 0.0890161320567131,
      "learning_rate": 0.00010214102883253832,
      "loss": 8.7312,
      "step": 32896,
      "throughput": 17851.5531851023
    },
    {
      "epoch": 0.5161168974678485,
      "grad_norm": 0.07664911448955536,
      "learning_rate": 0.00010189853749745799,
      "loss": 8.7183,
      "step": 32928,
      "throughput": 17851.795498924672
    },
    {
      "epoch": 0.5166184687967774,
      "grad_norm": 0.07410534471273422,
      "learning_rate": 0.00010165630639299606,
      "loss": 8.72,
      "step": 32960,
      "throughput": 17852.076292837388
    },
    {
      "epoch": 0.5171200401257063,
      "grad_norm": 0.0790855884552002,
      "learning_rate": 0.00010141433651811429,
      "loss": 8.7243,
      "step": 32992,
      "throughput": 17852.275533354878
    },
    {
      "epoch": 0.5176216114546353,
      "grad_norm": 0.08857249468564987,
      "learning_rate": 0.00010117262887069724,
      "loss": 8.733,
      "step": 33024,
      "throughput": 17852.648968705045
    },
    {
      "epoch": 0.5181231827835642,
      "grad_norm": 0.07627321034669876,
      "learning_rate": 0.00010093118444754784,
      "loss": 8.7357,
      "step": 33056,
      "throughput": 17852.408439897976
    },
    {
      "epoch": 0.5186247541124931,
      "grad_norm": 0.08038872480392456,
      "learning_rate": 0.0001006900042443837,
      "loss": 8.703,
      "step": 33088,
      "throughput": 17852.457808305855
    },
    {
      "epoch": 0.519126325441422,
      "grad_norm": 0.07928008586168289,
      "learning_rate": 0.00010044908925583264,
      "loss": 8.7416,
      "step": 33120,
      "throughput": 17852.280623087227
    },
    {
      "epoch": 0.5196278967703508,
      "grad_norm": 0.07952787727117538,
      "learning_rate": 0.00010020844047542886,
      "loss": 8.7166,
      "step": 33152,
      "throughput": 17852.45677832672
    },
    {
      "epoch": 0.5201294680992797,
      "grad_norm": 0.07665357738733292,
      "learning_rate": 9.996805889560857e-05,
      "loss": 8.7227,
      "step": 33184,
      "throughput": 17852.444686158015
    },
    {
      "epoch": 0.5206310394282087,
      "grad_norm": 0.07277513295412064,
      "learning_rate": 9.972794550770612e-05,
      "loss": 8.7162,
      "step": 33216,
      "throughput": 17852.49822943568
    },
    {
      "epoch": 0.5211326107571376,
      "grad_norm": 0.07704409956932068,
      "learning_rate": 9.948810130194984e-05,
      "loss": 8.7392,
      "step": 33248,
      "throughput": 17852.742232599747
    },
    {
      "epoch": 0.5216341820860665,
      "grad_norm": 0.07315832376480103,
      "learning_rate": 9.924852726745807e-05,
      "loss": 8.7116,
      "step": 33280,
      "throughput": 17852.894822436796
    },
    {
      "epoch": 0.5221357534149954,
      "grad_norm": 0.08210276067256927,
      "learning_rate": 9.900922439223464e-05,
      "loss": 8.7548,
      "step": 33312,
      "throughput": 17853.21314899139
    },
    {
      "epoch": 0.5226373247439243,
      "grad_norm": 0.0742005780339241,
      "learning_rate": 9.877019366316541e-05,
      "loss": 8.7031,
      "step": 33344,
      "throughput": 17853.130417455657
    },
    {
      "epoch": 0.5231388960728532,
      "grad_norm": 0.08997371792793274,
      "learning_rate": 9.85314360660138e-05,
      "loss": 8.708,
      "step": 33376,
      "throughput": 17853.135809429154
    },
    {
      "epoch": 0.5236404674017822,
      "grad_norm": 0.07334605604410172,
      "learning_rate": 9.829295258541692e-05,
      "loss": 8.7183,
      "step": 33408,
      "throughput": 17852.964725346992
    },
    {
      "epoch": 0.5241420387307111,
      "grad_norm": 0.08196054399013519,
      "learning_rate": 9.805474420488123e-05,
      "loss": 8.7397,
      "step": 33440,
      "throughput": 17853.29280629254
    },
    {
      "epoch": 0.52464361005964,
      "grad_norm": 0.07731211930513382,
      "learning_rate": 9.78168119067789e-05,
      "loss": 8.7371,
      "step": 33472,
      "throughput": 17853.354784763338
    },
    {
      "epoch": 0.5251451813885689,
      "grad_norm": 0.08011672645807266,
      "learning_rate": 9.757915667234339e-05,
      "loss": 8.7353,
      "step": 33504,
      "throughput": 17853.176714040168
    },
    {
      "epoch": 0.5256467527174978,
      "grad_norm": 0.08022370934486389,
      "learning_rate": 9.734177948166558e-05,
      "loss": 8.7286,
      "step": 33536,
      "throughput": 17853.414800485636
    },
    {
      "epoch": 0.5261483240464266,
      "grad_norm": 0.08035130053758621,
      "learning_rate": 9.710468131368968e-05,
      "loss": 8.6915,
      "step": 33568,
      "throughput": 17853.693576028792
    },
    {
      "epoch": 0.5266498953753556,
      "grad_norm": 0.07651187479496002,
      "learning_rate": 9.68678631462093e-05,
      "loss": 8.7352,
      "step": 33600,
      "throughput": 17853.902846549423
    },
    {
      "epoch": 0.5271514667042845,
      "grad_norm": 0.07013057917356491,
      "learning_rate": 9.66313259558633e-05,
      "loss": 8.7234,
      "step": 33632,
      "throughput": 17854.26304555664
    },
    {
      "epoch": 0.5276530380332134,
      "grad_norm": 0.08306749910116196,
      "learning_rate": 9.639507071813166e-05,
      "loss": 8.6926,
      "step": 33664,
      "throughput": 17853.92545712553
    },
    {
      "epoch": 0.5281546093621423,
      "grad_norm": 0.07922951132059097,
      "learning_rate": 9.615909840733167e-05,
      "loss": 8.7203,
      "step": 33696,
      "throughput": 17853.73114229308
    },
    {
      "epoch": 0.5286561806910712,
      "grad_norm": 0.07435827702283859,
      "learning_rate": 9.592340999661393e-05,
      "loss": 8.7194,
      "step": 33728,
      "throughput": 17853.912868666077
    },
    {
      "epoch": 0.5291577520200001,
      "grad_norm": 0.07166502624750137,
      "learning_rate": 9.568800645795812e-05,
      "loss": 8.7444,
      "step": 33760,
      "throughput": 17853.987128701727
    },
    {
      "epoch": 0.5296593233489291,
      "grad_norm": 0.08474232256412506,
      "learning_rate": 9.545288876216901e-05,
      "loss": 8.712,
      "step": 33792,
      "throughput": 17853.808485432633
    },
    {
      "epoch": 0.530160894677858,
      "grad_norm": 0.07914811372756958,
      "learning_rate": 9.521805787887285e-05,
      "loss": 8.7079,
      "step": 33824,
      "throughput": 17853.99280724263
    },
    {
      "epoch": 0.5306624660067869,
      "grad_norm": 0.07761990278959274,
      "learning_rate": 9.498351477651286e-05,
      "loss": 8.7217,
      "step": 33856,
      "throughput": 17854.293602626312
    },
    {
      "epoch": 0.5311640373357158,
      "grad_norm": 0.07397238910198212,
      "learning_rate": 9.47492604223454e-05,
      "loss": 8.7051,
      "step": 33888,
      "throughput": 17854.527119045713
    },
    {
      "epoch": 0.5316656086646447,
      "grad_norm": 0.08450540155172348,
      "learning_rate": 9.451529578243618e-05,
      "loss": 8.7161,
      "step": 33920,
      "throughput": 17854.96415529531
    },
    {
      "epoch": 0.5321671799935737,
      "grad_norm": 0.08447160571813583,
      "learning_rate": 9.428162182165607e-05,
      "loss": 8.7036,
      "step": 33952,
      "throughput": 17854.760755613323
    },
    {
      "epoch": 0.5326687513225026,
      "grad_norm": 0.08158237487077713,
      "learning_rate": 9.40482395036772e-05,
      "loss": 8.7126,
      "step": 33984,
      "throughput": 17854.879374147862
    },
    {
      "epoch": 0.5331703226514314,
      "grad_norm": 0.0762786716222763,
      "learning_rate": 9.381514979096888e-05,
      "loss": 8.6922,
      "step": 34016,
      "throughput": 17854.642009693944
    },
    {
      "epoch": 0.5336718939803603,
      "grad_norm": 0.07647755742073059,
      "learning_rate": 9.35823536447938e-05,
      "loss": 8.7242,
      "step": 34048,
      "throughput": 17854.82546758402
    },
    {
      "epoch": 0.5341734653092892,
      "grad_norm": 0.07419923692941666,
      "learning_rate": 9.334985202520395e-05,
      "loss": 8.6907,
      "step": 34080,
      "throughput": 17854.79020767972
    },
    {
      "epoch": 0.5346750366382181,
      "grad_norm": 0.0749330073595047,
      "learning_rate": 9.311764589103679e-05,
      "loss": 8.7215,
      "step": 34112,
      "throughput": 17854.728711581607
    },
    {
      "epoch": 0.5351766079671471,
      "grad_norm": 0.07686057686805725,
      "learning_rate": 9.288573619991096e-05,
      "loss": 8.7209,
      "step": 34144,
      "throughput": 17854.96625759833
    },
    {
      "epoch": 0.535678179296076,
      "grad_norm": 0.0778697207570076,
      "learning_rate": 9.265412390822278e-05,
      "loss": 8.7256,
      "step": 34176,
      "throughput": 17855.24001360787
    },
    {
      "epoch": 0.5361797506250049,
      "grad_norm": 0.07231098413467407,
      "learning_rate": 9.242280997114204e-05,
      "loss": 8.6967,
      "step": 34208,
      "throughput": 17855.55976553791
    },
    {
      "epoch": 0.5366813219539338,
      "grad_norm": 0.07696084678173065,
      "learning_rate": 9.219179534260811e-05,
      "loss": 8.7035,
      "step": 34240,
      "throughput": 17855.732613653654
    },
    {
      "epoch": 0.5371828932828627,
      "grad_norm": 0.07389093190431595,
      "learning_rate": 9.196108097532597e-05,
      "loss": 8.7006,
      "step": 34272,
      "throughput": 17855.563916979274
    },
    {
      "epoch": 0.5376844646117916,
      "grad_norm": 0.07730893790721893,
      "learning_rate": 9.173066782076236e-05,
      "loss": 8.7102,
      "step": 34304,
      "throughput": 17855.354474081796
    },
    {
      "epoch": 0.5381860359407206,
      "grad_norm": 0.0806819349527359,
      "learning_rate": 9.15005568291418e-05,
      "loss": 8.6878,
      "step": 34336,
      "throughput": 17855.554636712146
    },
    {
      "epoch": 0.5386876072696495,
      "grad_norm": 0.07983998954296112,
      "learning_rate": 9.12707489494428e-05,
      "loss": 8.6808,
      "step": 34368,
      "throughput": 17855.74473759405
    },
    {
      "epoch": 0.5391891785985784,
      "grad_norm": 0.07641927897930145,
      "learning_rate": 9.104124512939357e-05,
      "loss": 8.7275,
      "step": 34400,
      "throughput": 17855.438142912495
    },
    {
      "epoch": 0.5396907499275073,
      "grad_norm": 0.07484152913093567,
      "learning_rate": 9.081204631546867e-05,
      "loss": 8.6999,
      "step": 34432,
      "throughput": 17855.625684506198
    },
    {
      "epoch": 0.5401923212564361,
      "grad_norm": 0.07652326673269272,
      "learning_rate": 9.058315345288465e-05,
      "loss": 8.6852,
      "step": 34464,
      "throughput": 17855.802815831357
    },
    {
      "epoch": 0.540693892585365,
      "grad_norm": 0.08515553176403046,
      "learning_rate": 9.035456748559639e-05,
      "loss": 8.727,
      "step": 34496,
      "throughput": 17856.033432889442
    },
    {
      "epoch": 0.541195463914294,
      "grad_norm": 0.0841352790594101,
      "learning_rate": 9.012628935629299e-05,
      "loss": 8.6966,
      "step": 34528,
      "throughput": 17856.388586859128
    },
    {
      "epoch": 0.5416970352432229,
      "grad_norm": 0.0760938748717308,
      "learning_rate": 8.989832000639424e-05,
      "loss": 8.6885,
      "step": 34560,
      "throughput": 17856.362491268934
    },
    {
      "epoch": 0.5421986065721518,
      "grad_norm": 0.07298042625188828,
      "learning_rate": 8.967066037604637e-05,
      "loss": 8.7089,
      "step": 34592,
      "throughput": 17856.393811450194
    },
    {
      "epoch": 0.5427001779010807,
      "grad_norm": 0.0769927054643631,
      "learning_rate": 8.944331140411841e-05,
      "loss": 8.7142,
      "step": 34624,
      "throughput": 17856.153219747895
    },
    {
      "epoch": 0.5432017492300096,
      "grad_norm": 0.07509263604879379,
      "learning_rate": 8.921627402819813e-05,
      "loss": 8.7001,
      "step": 34656,
      "throughput": 17856.32991604068
    },
    {
      "epoch": 0.5437033205589386,
      "grad_norm": 0.07436039298772812,
      "learning_rate": 8.898954918458835e-05,
      "loss": 8.7097,
      "step": 34688,
      "throughput": 17856.305279807035
    },
    {
      "epoch": 0.5442048918878675,
      "grad_norm": 0.08183735609054565,
      "learning_rate": 8.876313780830305e-05,
      "loss": 8.7207,
      "step": 34720,
      "throughput": 17856.23398069712
    },
    {
      "epoch": 0.5447064632167964,
      "grad_norm": 0.07405205070972443,
      "learning_rate": 8.853704083306341e-05,
      "loss": 8.7145,
      "step": 34752,
      "throughput": 17856.469484585414
    },
    {
      "epoch": 0.5452080345457253,
      "grad_norm": 0.07916124910116196,
      "learning_rate": 8.831125919129397e-05,
      "loss": 8.7078,
      "step": 34784,
      "throughput": 17856.600001475075
    },
    {
      "epoch": 0.5457096058746542,
      "grad_norm": 0.07913485169410706,
      "learning_rate": 8.808579381411892e-05,
      "loss": 8.7067,
      "step": 34816,
      "throughput": 17856.912010010525
    },
    {
      "epoch": 0.5462111772035831,
      "grad_norm": 0.07325571030378342,
      "learning_rate": 8.786064563135815e-05,
      "loss": 8.7104,
      "step": 34848,
      "throughput": 17855.739869060257
    },
    {
      "epoch": 0.5467127485325121,
      "grad_norm": 0.08538568019866943,
      "learning_rate": 8.763581557152348e-05,
      "loss": 8.7083,
      "step": 34880,
      "throughput": 17855.6542419764
    },
    {
      "epoch": 0.5472143198614409,
      "grad_norm": 0.07836513221263885,
      "learning_rate": 8.741130456181463e-05,
      "loss": 8.6979,
      "step": 34912,
      "throughput": 17855.51185036211
    },
    {
      "epoch": 0.5477158911903698,
      "grad_norm": 0.07362423837184906,
      "learning_rate": 8.718711352811573e-05,
      "loss": 8.7047,
      "step": 34944,
      "throughput": 17855.706793913592
    },
    {
      "epoch": 0.5482174625192987,
      "grad_norm": 0.0696859061717987,
      "learning_rate": 8.696324339499135e-05,
      "loss": 8.7058,
      "step": 34976,
      "throughput": 17855.76802440867
    },
    {
      "epoch": 0.5487190338482276,
      "grad_norm": 0.08644992858171463,
      "learning_rate": 8.673969508568242e-05,
      "loss": 8.6796,
      "step": 35008,
      "throughput": 17855.577492330583
    },
    {
      "epoch": 0.5492206051771565,
      "grad_norm": 0.07907194644212723,
      "learning_rate": 8.651646952210293e-05,
      "loss": 8.7013,
      "step": 35040,
      "throughput": 17855.69019825646
    },
    {
      "epoch": 0.5497221765060855,
      "grad_norm": 0.07400470227003098,
      "learning_rate": 8.629356762483573e-05,
      "loss": 8.7043,
      "step": 35072,
      "throughput": 17855.8359775117
    },
    {
      "epoch": 0.5502237478350144,
      "grad_norm": 0.07419518381357193,
      "learning_rate": 8.607099031312901e-05,
      "loss": 8.6961,
      "step": 35104,
      "throughput": 17856.147141321344
    },
    {
      "epoch": 0.5507253191639433,
      "grad_norm": 0.08127579092979431,
      "learning_rate": 8.58487385048921e-05,
      "loss": 8.7208,
      "step": 35136,
      "throughput": 17856.493290845447
    },
    {
      "epoch": 0.5512268904928722,
      "grad_norm": 0.08192048966884613,
      "learning_rate": 8.562681311669218e-05,
      "loss": 8.7099,
      "step": 35168,
      "throughput": 17856.4989556675
    },
    {
      "epoch": 0.5517284618218011,
      "grad_norm": 0.07196959853172302,
      "learning_rate": 8.540521506375026e-05,
      "loss": 8.7026,
      "step": 35200,
      "throughput": 17856.195421085813
    },
    {
      "epoch": 0.55223003315073,
      "grad_norm": 0.0753261148929596,
      "learning_rate": 8.518394525993734e-05,
      "loss": 8.6949,
      "step": 35232,
      "throughput": 17856.353308312617
    },
    {
      "epoch": 0.552731604479659,
      "grad_norm": 0.07535768300294876,
      "learning_rate": 8.496300461777068e-05,
      "loss": 8.7055,
      "step": 35264,
      "throughput": 17856.536985682873
    },
    {
      "epoch": 0.5532331758085879,
      "grad_norm": 0.07252952456474304,
      "learning_rate": 8.474239404841023e-05,
      "loss": 8.6933,
      "step": 35296,
      "throughput": 17856.616172365262
    },
    {
      "epoch": 0.5537347471375168,
      "grad_norm": 0.07780899852514267,
      "learning_rate": 8.452211446165458e-05,
      "loss": 8.6953,
      "step": 35328,
      "throughput": 17856.527483171823
    },
    {
      "epoch": 0.5542363184664456,
      "grad_norm": 0.07243838161230087,
      "learning_rate": 8.430216676593744e-05,
      "loss": 8.7208,
      "step": 35360,
      "throughput": 17856.624340604638
    },
    {
      "epoch": 0.5547378897953745,
      "grad_norm": 0.07750379294157028,
      "learning_rate": 8.408255186832372e-05,
      "loss": 8.7109,
      "step": 35392,
      "throughput": 17856.957011109807
    },
    {
      "epoch": 0.5552394611243034,
      "grad_norm": 0.07306235283613205,
      "learning_rate": 8.386327067450593e-05,
      "loss": 8.6811,
      "step": 35424,
      "throughput": 17857.252341059837
    },
    {
      "epoch": 0.5557410324532324,
      "grad_norm": 0.07733822613954544,
      "learning_rate": 8.36443240888004e-05,
      "loss": 8.6948,
      "step": 35456,
      "throughput": 17857.16316162486
    },
    {
      "epoch": 0.5562426037821613,
      "grad_norm": 0.0769861564040184,
      "learning_rate": 8.342571301414342e-05,
      "loss": 8.7093,
      "step": 35488,
      "throughput": 17857.179033760753
    },
    {
      "epoch": 0.5567441751110902,
      "grad_norm": 0.08327770978212357,
      "learning_rate": 8.320743835208775e-05,
      "loss": 8.7121,
      "step": 35520,
      "throughput": 17857.11541407053
    },
    {
      "epoch": 0.5572457464400191,
      "grad_norm": 0.0749799981713295,
      "learning_rate": 8.298950100279872e-05,
      "loss": 8.7003,
      "step": 35552,
      "throughput": 17857.41542698246
    },
    {
      "epoch": 0.557747317768948,
      "grad_norm": 0.080297090113163,
      "learning_rate": 8.27719018650507e-05,
      "loss": 8.7133,
      "step": 35584,
      "throughput": 17857.57912236483
    },
    {
      "epoch": 0.558248889097877,
      "grad_norm": 0.07166144251823425,
      "learning_rate": 8.255464183622304e-05,
      "loss": 8.714,
      "step": 35616,
      "throughput": 17857.295799307114
    },
    {
      "epoch": 0.5587504604268059,
      "grad_norm": 0.08338946849107742,
      "learning_rate": 8.23377218122968e-05,
      "loss": 8.6983,
      "step": 35648,
      "throughput": 17857.516026578443
    },
    {
      "epoch": 0.5592520317557348,
      "grad_norm": 0.08566264808177948,
      "learning_rate": 8.212114268785083e-05,
      "loss": 8.6859,
      "step": 35680,
      "throughput": 17857.63804371969
    },
    {
      "epoch": 0.5597536030846637,
      "grad_norm": 0.07868161797523499,
      "learning_rate": 8.190490535605809e-05,
      "loss": 8.6673,
      "step": 35712,
      "throughput": 17857.938206572217
    },
    {
      "epoch": 0.5602551744135926,
      "grad_norm": 0.08599625527858734,
      "learning_rate": 8.16890107086819e-05,
      "loss": 8.6835,
      "step": 35744,
      "throughput": 17857.99435946068
    },
    {
      "epoch": 0.5607567457425215,
      "grad_norm": 0.08485583961009979,
      "learning_rate": 8.14734596360725e-05,
      "loss": 8.6944,
      "step": 35776,
      "throughput": 17858.055374471925
    },
    {
      "epoch": 0.5612583170714504,
      "grad_norm": 0.0788610652089119,
      "learning_rate": 8.12582530271631e-05,
      "loss": 8.7083,
      "step": 35808,
      "throughput": 17857.842233432566
    },
    {
      "epoch": 0.5617598884003793,
      "grad_norm": 0.08662694692611694,
      "learning_rate": 8.104339176946648e-05,
      "loss": 8.6672,
      "step": 35840,
      "throughput": 17858.02721694298
    },
    {
      "epoch": 0.5622614597293082,
      "grad_norm": 0.07397332787513733,
      "learning_rate": 8.082887674907099e-05,
      "loss": 8.6857,
      "step": 35872,
      "throughput": 17858.320675679395
    },
    {
      "epoch": 0.5627630310582371,
      "grad_norm": 0.07281418889760971,
      "learning_rate": 8.061470885063726e-05,
      "loss": 8.7033,
      "step": 35904,
      "throughput": 17858.03092891344
    },
    {
      "epoch": 0.563264602387166,
      "grad_norm": 0.07784990966320038,
      "learning_rate": 8.040088895739433e-05,
      "loss": 8.7075,
      "step": 35936,
      "throughput": 17858.204570293852
    },
    {
      "epoch": 0.5637661737160949,
      "grad_norm": 0.07610969245433807,
      "learning_rate": 8.018741795113614e-05,
      "loss": 8.6939,
      "step": 35968,
      "throughput": 17858.36220633285
    },
    {
      "epoch": 0.5642677450450239,
      "grad_norm": 0.0770319402217865,
      "learning_rate": 7.997429671221764e-05,
      "loss": 8.6916,
      "step": 36000,
      "throughput": 17858.675310114504
    },
    {
      "epoch": 0.5647693163739528,
      "grad_norm": 0.07430537790060043,
      "learning_rate": 7.97615261195515e-05,
      "loss": 8.699,
      "step": 36032,
      "throughput": 17858.865330860357
    },
    {
      "epoch": 0.5652708877028817,
      "grad_norm": 0.0736197680234909,
      "learning_rate": 7.95491070506043e-05,
      "loss": 8.7112,
      "step": 36064,
      "throughput": 17858.859739329604
    },
    {
      "epoch": 0.5657724590318106,
      "grad_norm": 0.08195227384567261,
      "learning_rate": 7.933704038139292e-05,
      "loss": 8.6762,
      "step": 36096,
      "throughput": 17859.00238340524
    },
    {
      "epoch": 0.5662740303607395,
      "grad_norm": 0.06957199424505234,
      "learning_rate": 7.912532698648089e-05,
      "loss": 8.682,
      "step": 36128,
      "throughput": 17858.941718203627
    },
    {
      "epoch": 0.5667756016896685,
      "grad_norm": 0.0705103799700737,
      "learning_rate": 7.891396773897487e-05,
      "loss": 8.6731,
      "step": 36160,
      "throughput": 17859.129372681775
    },
    {
      "epoch": 0.5672771730185974,
      "grad_norm": 0.07276476919651031,
      "learning_rate": 7.870296351052104e-05,
      "loss": 8.6685,
      "step": 36192,
      "throughput": 17859.072481472478
    },
    {
      "epoch": 0.5677787443475263,
      "grad_norm": 0.08280736953020096,
      "learning_rate": 7.849231517130151e-05,
      "loss": 8.6851,
      "step": 36224,
      "throughput": 17858.997460045422
    },
    {
      "epoch": 0.5682803156764551,
      "grad_norm": 0.08353912830352783,
      "learning_rate": 7.828202359003058e-05,
      "loss": 8.6826,
      "step": 36256,
      "throughput": 17859.26928275893
    },
    {
      "epoch": 0.568781887005384,
      "grad_norm": 0.07920809090137482,
      "learning_rate": 7.807208963395139e-05,
      "loss": 8.6754,
      "step": 36288,
      "throughput": 17859.399339313015
    },
    {
      "epoch": 0.5692834583343129,
      "grad_norm": 0.07387978583574295,
      "learning_rate": 7.786251416883218e-05,
      "loss": 8.6828,
      "step": 36320,
      "throughput": 17859.594711944774
    },
    {
      "epoch": 0.5697850296632418,
      "grad_norm": 0.07279038429260254,
      "learning_rate": 7.765329805896287e-05,
      "loss": 8.6943,
      "step": 36352,
      "throughput": 17859.60028157515
    },
    {
      "epoch": 0.5702866009921708,
      "grad_norm": 0.08469823747873306,
      "learning_rate": 7.744444216715117e-05,
      "loss": 8.6924,
      "step": 36384,
      "throughput": 17859.770193623226
    },
    {
      "epoch": 0.5707881723210997,
      "grad_norm": 0.08174408227205276,
      "learning_rate": 7.723594735471952e-05,
      "loss": 8.6946,
      "step": 36416,
      "throughput": 17859.459083652644
    },
    {
      "epoch": 0.5712897436500286,
      "grad_norm": 0.07326170057058334,
      "learning_rate": 7.702781448150109e-05,
      "loss": 8.6875,
      "step": 36448,
      "throughput": 17859.748036127872
    },
    {
      "epoch": 0.5717913149789575,
      "grad_norm": 0.0783335268497467,
      "learning_rate": 7.682004440583654e-05,
      "loss": 8.6803,
      "step": 36480,
      "throughput": 17859.800168291476
    },
    {
      "epoch": 0.5722928863078864,
      "grad_norm": 0.07939084619283676,
      "learning_rate": 7.661263798457014e-05,
      "loss": 8.6805,
      "step": 36512,
      "throughput": 17859.62193777413
    },
    {
      "epoch": 0.5727944576368154,
      "grad_norm": 0.0731113851070404,
      "learning_rate": 7.64055960730467e-05,
      "loss": 8.6622,
      "step": 36544,
      "throughput": 17859.843484605248
    },
    {
      "epoch": 0.5732960289657443,
      "grad_norm": 0.07936207205057144,
      "learning_rate": 7.619891952510763e-05,
      "loss": 8.6914,
      "step": 36576,
      "throughput": 17859.902867621913
    },
    {
      "epoch": 0.5737976002946732,
      "grad_norm": 0.07797619700431824,
      "learning_rate": 7.599260919308764e-05,
      "loss": 8.6794,
      "step": 36608,
      "throughput": 17860.196809289922
    },
    {
      "epoch": 0.5742991716236021,
      "grad_norm": 0.07891387492418289,
      "learning_rate": 7.578666592781114e-05,
      "loss": 8.6893,
      "step": 36640,
      "throughput": 17860.380665697347
    },
    {
      "epoch": 0.574800742952531,
      "grad_norm": 0.07759033888578415,
      "learning_rate": 7.558109057858874e-05,
      "loss": 8.6679,
      "step": 36672,
      "throughput": 17860.385192027556
    },
    {
      "epoch": 0.5753023142814598,
      "grad_norm": 0.07606584578752518,
      "learning_rate": 7.53758839932139e-05,
      "loss": 8.6838,
      "step": 36704,
      "throughput": 17860.52645770749
    },
    {
      "epoch": 0.5758038856103888,
      "grad_norm": 0.08316051214933395,
      "learning_rate": 7.517104701795905e-05,
      "loss": 8.6631,
      "step": 36736,
      "throughput": 17860.236486716658
    },
    {
      "epoch": 0.5763054569393177,
      "grad_norm": 0.07760239392518997,
      "learning_rate": 7.496658049757255e-05,
      "loss": 8.6755,
      "step": 36768,
      "throughput": 17860.515150122916
    },
    {
      "epoch": 0.5768070282682466,
      "grad_norm": 0.07454492896795273,
      "learning_rate": 7.476248527527492e-05,
      "loss": 8.6737,
      "step": 36800,
      "throughput": 17860.45777620071
    },
    {
      "epoch": 0.5773085995971755,
      "grad_norm": 0.07791450619697571,
      "learning_rate": 7.455876219275552e-05,
      "loss": 8.6667,
      "step": 36832,
      "throughput": 17860.3926783732
    },
    {
      "epoch": 0.5778101709261044,
      "grad_norm": 0.08069723844528198,
      "learning_rate": 7.435541209016885e-05,
      "loss": 8.6912,
      "step": 36864,
      "throughput": 17860.5391968819
    },
    {
      "epoch": 0.5783117422550333,
      "grad_norm": 0.07404716312885284,
      "learning_rate": 7.415243580613134e-05,
      "loss": 8.6685,
      "step": 36896,
      "throughput": 17859.50875842157
    },
    {
      "epoch": 0.5788133135839623,
      "grad_norm": 0.07672100514173508,
      "learning_rate": 7.394983417771791e-05,
      "loss": 8.6881,
      "step": 36928,
      "throughput": 17859.56777824398
    },
    {
      "epoch": 0.5793148849128912,
      "grad_norm": 0.07763490080833435,
      "learning_rate": 7.374760804045815e-05,
      "loss": 8.6952,
      "step": 36960,
      "throughput": 17859.563325141797
    },
    {
      "epoch": 0.5798164562418201,
      "grad_norm": 0.07308504730463028,
      "learning_rate": 7.354575822833331e-05,
      "loss": 8.6714,
      "step": 36992,
      "throughput": 17859.84326325879
    },
    {
      "epoch": 0.580318027570749,
      "grad_norm": 0.07863820344209671,
      "learning_rate": 7.334428557377258e-05,
      "loss": 8.6904,
      "step": 37024,
      "throughput": 17859.579503577177
    },
    {
      "epoch": 0.5808195988996779,
      "grad_norm": 0.07718608528375626,
      "learning_rate": 7.314319090764985e-05,
      "loss": 8.6855,
      "step": 37056,
      "throughput": 17859.864899200704
    },
    {
      "epoch": 0.5813211702286069,
      "grad_norm": 0.07392841577529907,
      "learning_rate": 7.294247505928003e-05,
      "loss": 8.6828,
      "step": 37088,
      "throughput": 17859.90820816274
    },
    {
      "epoch": 0.5818227415575358,
      "grad_norm": 0.07847581803798676,
      "learning_rate": 7.274213885641592e-05,
      "loss": 8.6726,
      "step": 37120,
      "throughput": 17859.732287565872
    },
    {
      "epoch": 0.5823243128864646,
      "grad_norm": 0.07694242149591446,
      "learning_rate": 7.254218312524461e-05,
      "loss": 8.6801,
      "step": 37152,
      "throughput": 17859.941978431052
    },
    {
      "epoch": 0.5828258842153935,
      "grad_norm": 0.08155602216720581,
      "learning_rate": 7.234260869038417e-05,
      "loss": 8.6924,
      "step": 37184,
      "throughput": 17860.002150369633
    },
    {
      "epoch": 0.5833274555443224,
      "grad_norm": 0.07697729766368866,
      "learning_rate": 7.214341637488007e-05,
      "loss": 8.6981,
      "step": 37216,
      "throughput": 17860.28969020785
    },
    {
      "epoch": 0.5838290268732513,
      "grad_norm": 0.07712637633085251,
      "learning_rate": 7.194460700020206e-05,
      "loss": 8.6688,
      "step": 37248,
      "throughput": 17860.287694656174
    },
    {
      "epoch": 0.5843305982021803,
      "grad_norm": 0.07649629563093185,
      "learning_rate": 7.174618138624058e-05,
      "loss": 8.688,
      "step": 37280,
      "throughput": 17860.456684093922
    },
    {
      "epoch": 0.5848321695311092,
      "grad_norm": 0.08143318444490433,
      "learning_rate": 7.154814035130351e-05,
      "loss": 8.6842,
      "step": 37312,
      "throughput": 17860.38516502371
    },
    {
      "epoch": 0.5853337408600381,
      "grad_norm": 0.07608351856470108,
      "learning_rate": 7.135048471211257e-05,
      "loss": 8.6752,
      "step": 37344,
      "throughput": 17860.40264555555
    },
    {
      "epoch": 0.585835312188967,
      "grad_norm": 0.0704997107386589,
      "learning_rate": 7.115321528380024e-05,
      "loss": 8.6978,
      "step": 37376,
      "throughput": 17860.682923717475
    },
    {
      "epoch": 0.5863368835178959,
      "grad_norm": 0.07427257299423218,
      "learning_rate": 7.095633287990622e-05,
      "loss": 8.6774,
      "step": 37408,
      "throughput": 17860.533350386562
    },
    {
      "epoch": 0.5868384548468248,
      "grad_norm": 0.07474479079246521,
      "learning_rate": 7.075983831237421e-05,
      "loss": 8.6564,
      "step": 37440,
      "throughput": 17860.572141327655
    },
    {
      "epoch": 0.5873400261757538,
      "grad_norm": 0.07758533209562302,
      "learning_rate": 7.056373239154826e-05,
      "loss": 8.6725,
      "step": 37472,
      "throughput": 17860.650359983818
    },
    {
      "epoch": 0.5878415975046827,
      "grad_norm": 0.08383120596408844,
      "learning_rate": 7.036801592616982e-05,
      "loss": 8.6471,
      "step": 37504,
      "throughput": 17860.956290430775
    },
    {
      "epoch": 0.5883431688336116,
      "grad_norm": 0.07858950644731522,
      "learning_rate": 7.017268972337419e-05,
      "loss": 8.6478,
      "step": 37536,
      "throughput": 17861.135781772904
    },
    {
      "epoch": 0.5888447401625405,
      "grad_norm": 0.07414372265338898,
      "learning_rate": 6.997775458868724e-05,
      "loss": 8.6781,
      "step": 37568,
      "throughput": 17861.13272811816
    },
    {
      "epoch": 0.5893463114914693,
      "grad_norm": 0.08369318395853043,
      "learning_rate": 6.978321132602197e-05,
      "loss": 8.6752,
      "step": 37600,
      "throughput": 17861.352014974746
    },
    {
      "epoch": 0.5898478828203982,
      "grad_norm": 0.08304564654827118,
      "learning_rate": 6.95890607376754e-05,
      "loss": 8.6589,
      "step": 37632,
      "throughput": 17861.165783111348
    },
    {
      "epoch": 0.5903494541493272,
      "grad_norm": 0.08017552644014359,
      "learning_rate": 6.939530362432513e-05,
      "loss": 8.6764,
      "step": 37664,
      "throughput": 17861.44463727893
    },
    {
      "epoch": 0.5908510254782561,
      "grad_norm": 0.07597630470991135,
      "learning_rate": 6.920194078502611e-05,
      "loss": 8.671,
      "step": 37696,
      "throughput": 17861.483137722276
    },
    {
      "epoch": 0.591352596807185,
      "grad_norm": 0.07199093699455261,
      "learning_rate": 6.900897301720721e-05,
      "loss": 8.6762,
      "step": 37728,
      "throughput": 17861.313925261547
    },
    {
      "epoch": 0.5918541681361139,
      "grad_norm": 0.07194902747869492,
      "learning_rate": 6.881640111666807e-05,
      "loss": 8.6917,
      "step": 37760,
      "throughput": 17861.45777088156
    },
    {
      "epoch": 0.5923557394650428,
      "grad_norm": 0.07729242742061615,
      "learning_rate": 6.862422587757581e-05,
      "loss": 8.6805,
      "step": 37792,
      "throughput": 17861.582855243618
    },
    {
      "epoch": 0.5928573107939717,
      "grad_norm": 0.0748252347111702,
      "learning_rate": 6.843244809246173e-05,
      "loss": 8.6948,
      "step": 37824,
      "throughput": 17861.868472076487
    },
    {
      "epoch": 0.5933588821229007,
      "grad_norm": 0.06912515312433243,
      "learning_rate": 6.824106855221788e-05,
      "loss": 8.6746,
      "step": 37856,
      "throughput": 17861.868699846593
    },
    {
      "epoch": 0.5938604534518296,
      "grad_norm": 0.08204159885644913,
      "learning_rate": 6.805008804609411e-05,
      "loss": 8.6705,
      "step": 37888,
      "throughput": 17862.034973142487
    },
    {
      "epoch": 0.5943620247807585,
      "grad_norm": 0.07541067898273468,
      "learning_rate": 6.78595073616946e-05,
      "loss": 8.6669,
      "step": 37920,
      "throughput": 17861.798222004218
    },
    {
      "epoch": 0.5948635961096874,
      "grad_norm": 0.07566790282726288,
      "learning_rate": 6.766932728497468e-05,
      "loss": 8.6908,
      "step": 37952,
      "throughput": 17862.07464676886
    },
    {
      "epoch": 0.5953651674386163,
      "grad_norm": 0.07250373810529709,
      "learning_rate": 6.747954860023746e-05,
      "loss": 8.6903,
      "step": 37984,
      "throughput": 17862.34271828335
    },
    {
      "epoch": 0.5958667387675451,
      "grad_norm": 0.07323428988456726,
      "learning_rate": 6.729017209013086e-05,
      "loss": 8.6791,
      "step": 38016,
      "throughput": 17862.1684501893
    },
    {
      "epoch": 0.5963683100964741,
      "grad_norm": 0.07525332272052765,
      "learning_rate": 6.710119853564422e-05,
      "loss": 8.6859,
      "step": 38048,
      "throughput": 17862.317622242426
    },
    {
      "epoch": 0.596869881425403,
      "grad_norm": 0.08609536290168762,
      "learning_rate": 6.69126287161049e-05,
      "loss": 8.695,
      "step": 38080,
      "throughput": 17862.192551926404
    },
    {
      "epoch": 0.5973714527543319,
      "grad_norm": 0.07468371838331223,
      "learning_rate": 6.672446340917553e-05,
      "loss": 8.6781,
      "step": 38112,
      "throughput": 17862.568143605156
    },
    {
      "epoch": 0.5978730240832608,
      "grad_norm": 0.08155862241983414,
      "learning_rate": 6.653670339085031e-05,
      "loss": 8.6713,
      "step": 38144,
      "throughput": 17862.652431778395
    },
    {
      "epoch": 0.5983745954121897,
      "grad_norm": 0.07647927850484848,
      "learning_rate": 6.634934943545217e-05,
      "loss": 8.6772,
      "step": 38176,
      "throughput": 17862.627789866117
    },
    {
      "epoch": 0.5988761667411187,
      "grad_norm": 0.07881610840559006,
      "learning_rate": 6.616240231562933e-05,
      "loss": 8.6527,
      "step": 38208,
      "throughput": 17862.73911744598
    },
    {
      "epoch": 0.5993777380700476,
      "grad_norm": 0.07397406548261642,
      "learning_rate": 6.597586280235227e-05,
      "loss": 8.6814,
      "step": 38240,
      "throughput": 17862.676804243994
    },
    {
      "epoch": 0.5998793093989765,
      "grad_norm": 0.08274087309837341,
      "learning_rate": 6.578973166491053e-05,
      "loss": 8.6655,
      "step": 38272,
      "throughput": 17862.84161762198
    },
    {
      "epoch": 0.6003808807279054,
      "grad_norm": 0.07987891137599945,
      "learning_rate": 6.560400967090948e-05,
      "loss": 8.6669,
      "step": 38304,
      "throughput": 17862.883476007497
    },
    {
      "epoch": 0.6008824520568343,
      "grad_norm": 0.09073396027088165,
      "learning_rate": 6.54186975862671e-05,
      "loss": 8.6597,
      "step": 38336,
      "throughput": 17862.81187994118
    },
    {
      "epoch": 0.6013840233857632,
      "grad_norm": 0.0760880559682846,
      "learning_rate": 6.523379617521104e-05,
      "loss": 8.6609,
      "step": 38368,
      "throughput": 17862.823933162377
    },
    {
      "epoch": 0.6018855947146922,
      "grad_norm": 0.08379305154085159,
      "learning_rate": 6.504930620027524e-05,
      "loss": 8.6619,
      "step": 38400,
      "throughput": 17863.056926972076
    },
    {
      "epoch": 0.6023871660436211,
      "grad_norm": 0.08088319003582001,
      "learning_rate": 6.486522842229692e-05,
      "loss": 8.6684,
      "step": 38432,
      "throughput": 17863.332135046952
    },
    {
      "epoch": 0.6028887373725499,
      "grad_norm": 0.07462088018655777,
      "learning_rate": 6.468156360041337e-05,
      "loss": 8.6695,
      "step": 38464,
      "throughput": 17863.224382775457
    },
    {
      "epoch": 0.6033903087014788,
      "grad_norm": 0.08881346881389618,
      "learning_rate": 6.449831249205887e-05,
      "loss": 8.6569,
      "step": 38496,
      "throughput": 17863.50051418473
    },
    {
      "epoch": 0.6038918800304077,
      "grad_norm": 0.07640784978866577,
      "learning_rate": 6.431547585296156e-05,
      "loss": 8.6554,
      "step": 38528,
      "throughput": 17863.268214463256
    },
    {
      "epoch": 0.6043934513593366,
      "grad_norm": 0.07738711684942245,
      "learning_rate": 6.413305443714022e-05,
      "loss": 8.6579,
      "step": 38560,
      "throughput": 17863.535238141452
    },
    {
      "epoch": 0.6048950226882656,
      "grad_norm": 0.07391881197690964,
      "learning_rate": 6.395104899690134e-05,
      "loss": 8.66,
      "step": 38592,
      "throughput": 17863.58042819578
    },
    {
      "epoch": 0.6053965940171945,
      "grad_norm": 0.07256294786930084,
      "learning_rate": 6.37694602828359e-05,
      "loss": 8.6675,
      "step": 38624,
      "throughput": 17863.51573004575
    },
    {
      "epoch": 0.6058981653461234,
      "grad_norm": 0.07247397303581238,
      "learning_rate": 6.358828904381632e-05,
      "loss": 8.656,
      "step": 38656,
      "throughput": 17863.72591574277
    },
    {
      "epoch": 0.6063997366750523,
      "grad_norm": 0.07626520097255707,
      "learning_rate": 6.340753602699327e-05,
      "loss": 8.6792,
      "step": 38688,
      "throughput": 17863.736278779154
    },
    {
      "epoch": 0.6069013080039812,
      "grad_norm": 0.0811394453048706,
      "learning_rate": 6.322720197779275e-05,
      "loss": 8.6834,
      "step": 38720,
      "throughput": 17864.114418141355
    },
    {
      "epoch": 0.6074028793329102,
      "grad_norm": 0.07609833031892776,
      "learning_rate": 6.304728763991291e-05,
      "loss": 8.6639,
      "step": 38752,
      "throughput": 17864.02078649924
    },
    {
      "epoch": 0.6079044506618391,
      "grad_norm": 0.07995349168777466,
      "learning_rate": 6.286779375532107e-05,
      "loss": 8.6797,
      "step": 38784,
      "throughput": 17864.17338739285
    },
    {
      "epoch": 0.608406021990768,
      "grad_norm": 0.07657574117183685,
      "learning_rate": 6.268872106425044e-05,
      "loss": 8.6393,
      "step": 38816,
      "throughput": 17864.181334000346
    },
    {
      "epoch": 0.6089075933196969,
      "grad_norm": 0.07328113168478012,
      "learning_rate": 6.25100703051974e-05,
      "loss": 8.6498,
      "step": 38848,
      "throughput": 17864.109557804193
    },
    {
      "epoch": 0.6094091646486258,
      "grad_norm": 0.0713716670870781,
      "learning_rate": 6.233184221491818e-05,
      "loss": 8.6694,
      "step": 38880,
      "throughput": 17864.327473130015
    },
    {
      "epoch": 0.6099107359775546,
      "grad_norm": 0.08026931434869766,
      "learning_rate": 6.2154037528426e-05,
      "loss": 8.6497,
      "step": 38912,
      "throughput": 17864.380336494443
    },
    {
      "epoch": 0.6104123073064835,
      "grad_norm": 0.07720151543617249,
      "learning_rate": 6.197665697898784e-05,
      "loss": 8.6616,
      "step": 38944,
      "throughput": 17863.307410331086
    },
    {
      "epoch": 0.6109138786354125,
      "grad_norm": 0.08285202085971832,
      "learning_rate": 6.179970129812166e-05,
      "loss": 8.6576,
      "step": 38976,
      "throughput": 17863.390790070513
    },
    {
      "epoch": 0.6114154499643414,
      "grad_norm": 0.07704629004001617,
      "learning_rate": 6.16231712155932e-05,
      "loss": 8.6726,
      "step": 39008,
      "throughput": 17863.601863733846
    },
    {
      "epoch": 0.6119170212932703,
      "grad_norm": 0.07858503609895706,
      "learning_rate": 6.144706745941308e-05,
      "loss": 8.652,
      "step": 39040,
      "throughput": 17863.759204938622
    },
    {
      "epoch": 0.6124185926221992,
      "grad_norm": 0.07336670160293579,
      "learning_rate": 6.127139075583363e-05,
      "loss": 8.6579,
      "step": 39072,
      "throughput": 17863.636421163745
    },
    {
      "epoch": 0.6129201639511281,
      "grad_norm": 0.07628186047077179,
      "learning_rate": 6.109614182934616e-05,
      "loss": 8.6587,
      "step": 39104,
      "throughput": 17863.897427308668
    },
    {
      "epoch": 0.6134217352800571,
      "grad_norm": 0.0749865397810936,
      "learning_rate": 6.092132140267775e-05,
      "loss": 8.6461,
      "step": 39136,
      "throughput": 17863.672068682863
    },
    {
      "epoch": 0.613923306608986,
      "grad_norm": 0.08119495958089828,
      "learning_rate": 6.074693019678839e-05,
      "loss": 8.667,
      "step": 39168,
      "throughput": 17863.94105634038
    },
    {
      "epoch": 0.6144248779379149,
      "grad_norm": 0.07770372927188873,
      "learning_rate": 6.0572968930867827e-05,
      "loss": 8.6489,
      "step": 39200,
      "throughput": 17863.909958673925
    },
    {
      "epoch": 0.6149264492668438,
      "grad_norm": 0.08431252092123032,
      "learning_rate": 6.039943832233293e-05,
      "loss": 8.6562,
      "step": 39232,
      "throughput": 17863.84274779138
    },
    {
      "epoch": 0.6154280205957727,
      "grad_norm": 0.07534243166446686,
      "learning_rate": 6.022633908682442e-05,
      "loss": 8.6567,
      "step": 39264,
      "throughput": 17864.039271383816
    },
    {
      "epoch": 0.6159295919247016,
      "grad_norm": 0.07038530707359314,
      "learning_rate": 6.005367193820408e-05,
      "loss": 8.6517,
      "step": 39296,
      "throughput": 17864.034321243686
    },
    {
      "epoch": 0.6164311632536306,
      "grad_norm": 0.07531935721635818,
      "learning_rate": 5.9881437588551675e-05,
      "loss": 8.657,
      "step": 39328,
      "throughput": 17864.300657879492
    },
    {
      "epoch": 0.6169327345825594,
      "grad_norm": 0.07444503158330917,
      "learning_rate": 5.970963674816224e-05,
      "loss": 8.6488,
      "step": 39360,
      "throughput": 17864.20170582491
    },
    {
      "epoch": 0.6174343059114883,
      "grad_norm": 0.07967787981033325,
      "learning_rate": 5.953827012554291e-05,
      "loss": 8.662,
      "step": 39392,
      "throughput": 17864.458779070905
    },
    {
      "epoch": 0.6179358772404172,
      "grad_norm": 0.07082749158143997,
      "learning_rate": 5.9367338427410197e-05,
      "loss": 8.6544,
      "step": 39424,
      "throughput": 17864.34052637765
    },
    {
      "epoch": 0.6184374485693461,
      "grad_norm": 0.07375257462263107,
      "learning_rate": 5.9196842358686866e-05,
      "loss": 8.6608,
      "step": 39456,
      "throughput": 17864.398214765588
    },
    {
      "epoch": 0.618939019898275,
      "grad_norm": 0.07579505443572998,
      "learning_rate": 5.902678262249923e-05,
      "loss": 8.6612,
      "step": 39488,
      "throughput": 17864.590355831428
    },
    {
      "epoch": 0.619440591227204,
      "grad_norm": 0.07662634551525116,
      "learning_rate": 5.885715992017419e-05,
      "loss": 8.6461,
      "step": 39520,
      "throughput": 17864.417285726835
    },
    {
      "epoch": 0.6199421625561329,
      "grad_norm": 0.08262430131435394,
      "learning_rate": 5.86879749512362e-05,
      "loss": 8.651,
      "step": 39552,
      "throughput": 17864.564681502638
    },
    {
      "epoch": 0.6204437338850618,
      "grad_norm": 0.09002744406461716,
      "learning_rate": 5.851922841340461e-05,
      "loss": 8.6551,
      "step": 39584,
      "throughput": 17864.489150612582
    },
    {
      "epoch": 0.6209453052139907,
      "grad_norm": 0.07560932636260986,
      "learning_rate": 5.835092100259063e-05,
      "loss": 8.6438,
      "step": 39616,
      "throughput": 17864.85466818457
    },
    {
      "epoch": 0.6214468765429196,
      "grad_norm": 0.07000603526830673,
      "learning_rate": 5.818305341289458e-05,
      "loss": 8.6616,
      "step": 39648,
      "throughput": 17865.022496325797
    },
    {
      "epoch": 0.6219484478718486,
      "grad_norm": 0.07508612424135208,
      "learning_rate": 5.8015626336602814e-05,
      "loss": 8.6451,
      "step": 39680,
      "throughput": 17865.015064796968
    },
    {
      "epoch": 0.6224500192007775,
      "grad_norm": 0.07781418412923813,
      "learning_rate": 5.7848640464185124e-05,
      "loss": 8.6664,
      "step": 39712,
      "throughput": 17865.122405643884
    },
    {
      "epoch": 0.6229515905297064,
      "grad_norm": 0.07823880016803741,
      "learning_rate": 5.768209648429174e-05,
      "loss": 8.662,
      "step": 39744,
      "throughput": 17864.99515528762
    },
    {
      "epoch": 0.6234531618586353,
      "grad_norm": 0.07522641867399216,
      "learning_rate": 5.751599508375059e-05,
      "loss": 8.6612,
      "step": 39776,
      "throughput": 17865.26393103338
    },
    {
      "epoch": 0.6239547331875641,
      "grad_norm": 0.07135847210884094,
      "learning_rate": 5.735033694756423e-05,
      "loss": 8.6458,
      "step": 39808,
      "throughput": 17865.114361056123
    },
    {
      "epoch": 0.624456304516493,
      "grad_norm": 0.07262608408927917,
      "learning_rate": 5.718512275890737e-05,
      "loss": 8.6467,
      "step": 39840,
      "throughput": 17865.039723312893
    },
    {
      "epoch": 0.624957875845422,
      "grad_norm": 0.08001768589019775,
      "learning_rate": 5.70203531991238e-05,
      "loss": 8.6502,
      "step": 39872,
      "throughput": 17865.138677653576
    },
    {
      "epoch": 0.6254594471743509,
      "grad_norm": 0.07615388929843903,
      "learning_rate": 5.6856028947723734e-05,
      "loss": 8.6538,
      "step": 39904,
      "throughput": 17865.24251195156
    },
    {
      "epoch": 0.6259610185032798,
      "grad_norm": 0.08307041972875595,
      "learning_rate": 5.669215068238075e-05,
      "loss": 8.6371,
      "step": 39936,
      "throughput": 17865.49912828423
    },
    {
      "epoch": 0.6264625898322087,
      "grad_norm": 0.1058107390999794,
      "learning_rate": 5.652871907892934e-05,
      "loss": 8.6577,
      "step": 39968,
      "throughput": 17865.508025420746
    },
    {
      "epoch": 0.6269641611611376,
      "grad_norm": 0.07321104407310486,
      "learning_rate": 5.6365734811362026e-05,
      "loss": 8.654,
      "step": 40000,
      "throughput": 17865.873432565157
    },
    {
      "epoch": 0.6274657324900665,
      "grad_norm": 0.07945670187473297,
      "learning_rate": 5.620319855182629e-05,
      "loss": 8.644,
      "step": 40032,
      "throughput": 17865.753371531107
    },
    {
      "epoch": 0.6279673038189955,
      "grad_norm": 0.07705490291118622,
      "learning_rate": 5.60411109706222e-05,
      "loss": 8.619,
      "step": 40064,
      "throughput": 17865.828809691076
    },
    {
      "epoch": 0.6284688751479244,
      "grad_norm": 0.07273361086845398,
      "learning_rate": 5.587947273619938e-05,
      "loss": 8.6327,
      "step": 40096,
      "throughput": 17865.806906101432
    },
    {
      "epoch": 0.6289704464768533,
      "grad_norm": 0.08058905601501465,
      "learning_rate": 5.5718284515154476e-05,
      "loss": 8.6399,
      "step": 40128,
      "throughput": 17865.742978327868
    },
    {
      "epoch": 0.6294720178057822,
      "grad_norm": 0.08202604949474335,
      "learning_rate": 5.5557546972228114e-05,
      "loss": 8.668,
      "step": 40160,
      "throughput": 17865.81605389241
    },
    {
      "epoch": 0.6299735891347111,
      "grad_norm": 0.07899637520313263,
      "learning_rate": 5.539726077030239e-05,
      "loss": 8.66,
      "step": 40192,
      "throughput": 17865.723897540585
    },
    {
      "epoch": 0.63047516046364,
      "grad_norm": 0.07868487387895584,
      "learning_rate": 5.523742657039809e-05,
      "loss": 8.6464,
      "step": 40224,
      "throughput": 17865.993595646247
    },
    {
      "epoch": 0.6309767317925689,
      "grad_norm": 0.0851953774690628,
      "learning_rate": 5.5078045031672005e-05,
      "loss": 8.6763,
      "step": 40256,
      "throughput": 17866.002235211934
    },
    {
      "epoch": 0.6314783031214978,
      "grad_norm": 0.0772816613316536,
      "learning_rate": 5.491911681141394e-05,
      "loss": 8.6405,
      "step": 40288,
      "throughput": 17866.25481727828
    },
    {
      "epoch": 0.6319798744504267,
      "grad_norm": 0.08760164678096771,
      "learning_rate": 5.476064256504443e-05,
      "loss": 8.6784,
      "step": 40320,
      "throughput": 17866.391651604456
    },
    {
      "epoch": 0.6324814457793556,
      "grad_norm": 0.0925959125161171,
      "learning_rate": 5.460262294611172e-05,
      "loss": 8.6622,
      "step": 40352,
      "throughput": 17866.315716446847
    },
    {
      "epoch": 0.6329830171082845,
      "grad_norm": 0.07837571203708649,
      "learning_rate": 5.444505860628923e-05,
      "loss": 8.6255,
      "step": 40384,
      "throughput": 17866.510661128217
    },
    {
      "epoch": 0.6334845884372134,
      "grad_norm": 0.07390439510345459,
      "learning_rate": 5.428795019537268e-05,
      "loss": 8.6348,
      "step": 40416,
      "throughput": 17866.438074365313
    },
    {
      "epoch": 0.6339861597661424,
      "grad_norm": 0.10645684599876404,
      "learning_rate": 5.413129836127766e-05,
      "loss": 8.6481,
      "step": 40448,
      "throughput": 17866.375502482675
    },
    {
      "epoch": 0.6344877310950713,
      "grad_norm": 0.07660505175590515,
      "learning_rate": 5.3975103750036805e-05,
      "loss": 8.6345,
      "step": 40480,
      "throughput": 17866.386718452814
    },
    {
      "epoch": 0.6349893024240002,
      "grad_norm": 0.08478232473134995,
      "learning_rate": 5.3819367005797186e-05,
      "loss": 8.631,
      "step": 40512,
      "throughput": 17866.570802730967
    },
    {
      "epoch": 0.6354908737529291,
      "grad_norm": 0.0727619156241417,
      "learning_rate": 5.366408877081752e-05,
      "loss": 8.6615,
      "step": 40544,
      "throughput": 17866.730174278655
    },
    {
      "epoch": 0.635992445081858,
      "grad_norm": 0.07355979830026627,
      "learning_rate": 5.3509269685465764e-05,
      "loss": 8.6351,
      "step": 40576,
      "throughput": 17866.643518617777
    },
    {
      "epoch": 0.636494016410787,
      "grad_norm": 0.08197405189275742,
      "learning_rate": 5.3354910388216274e-05,
      "loss": 8.6527,
      "step": 40608,
      "throughput": 17866.997496697804
    },
    {
      "epoch": 0.6369955877397159,
      "grad_norm": 0.07571222633123398,
      "learning_rate": 5.3201011515647276e-05,
      "loss": 8.658,
      "step": 40640,
      "throughput": 17866.88506691309
    },
    {
      "epoch": 0.6374971590686448,
      "grad_norm": 0.08034916967153549,
      "learning_rate": 5.304757370243811e-05,
      "loss": 8.6353,
      "step": 40672,
      "throughput": 17867.07195681159
    },
    {
      "epoch": 0.6379987303975736,
      "grad_norm": 0.09238483011722565,
      "learning_rate": 5.2894597581366835e-05,
      "loss": 8.6596,
      "step": 40704,
      "throughput": 17867.052423325236
    },
    {
      "epoch": 0.6385003017265025,
      "grad_norm": 0.07784246653318405,
      "learning_rate": 5.274208378330737e-05,
      "loss": 8.658,
      "step": 40736,
      "throughput": 17866.879184258778
    },
    {
      "epoch": 0.6390018730554314,
      "grad_norm": 0.08024382591247559,
      "learning_rate": 5.2590032937227154e-05,
      "loss": 8.6502,
      "step": 40768,
      "throughput": 17867.069276156857
    },
    {
      "epoch": 0.6395034443843604,
      "grad_norm": 0.08907058835029602,
      "learning_rate": 5.2438445670184244e-05,
      "loss": 8.6509,
      "step": 40800,
      "throughput": 17867.088888346483
    },
    {
      "epoch": 0.6400050157132893,
      "grad_norm": 0.07528900355100632,
      "learning_rate": 5.2287322607325e-05,
      "loss": 8.6323,
      "step": 40832,
      "throughput": 17867.34687568463
    },
    {
      "epoch": 0.6405065870422182,
      "grad_norm": 0.07476245611906052,
      "learning_rate": 5.213666437188141e-05,
      "loss": 8.6636,
      "step": 40864,
      "throughput": 17867.248490118967
    },
    {
      "epoch": 0.6410081583711471,
      "grad_norm": 0.07534540444612503,
      "learning_rate": 5.1986471585168485e-05,
      "loss": 8.6525,
      "step": 40896,
      "throughput": 17867.501511953807
    },
    {
      "epoch": 0.641509729700076,
      "grad_norm": 0.07289925962686539,
      "learning_rate": 5.183674486658167e-05,
      "loss": 8.6673,
      "step": 40928,
      "throughput": 17867.44215734667
    },
    {
      "epoch": 0.6420113010290049,
      "grad_norm": 0.07585127651691437,
      "learning_rate": 5.168748483359445e-05,
      "loss": 8.6576,
      "step": 40960,
      "throughput": 17867.563558104393
    },
    {
      "epoch": 0.6425128723579339,
      "grad_norm": 0.07819195091724396,
      "learning_rate": 5.153869210175563e-05,
      "loss": 8.6272,
      "step": 40992,
      "throughput": 17866.695754184875
    },
    {
      "epoch": 0.6430144436868628,
      "grad_norm": 0.07709189504384995,
      "learning_rate": 5.139036728468686e-05,
      "loss": 8.6352,
      "step": 41024,
      "throughput": 17866.540715824405
    },
    {
      "epoch": 0.6435160150157917,
      "grad_norm": 0.07325899600982666,
      "learning_rate": 5.124251099408012e-05,
      "loss": 8.6239,
      "step": 41056,
      "throughput": 17866.573961717964
    },
    {
      "epoch": 0.6440175863447206,
      "grad_norm": 0.07731231302022934,
      "learning_rate": 5.1095123839695224e-05,
      "loss": 8.6481,
      "step": 41088,
      "throughput": 17866.52019947398
    },
    {
      "epoch": 0.6445191576736495,
      "grad_norm": 0.07855982333421707,
      "learning_rate": 5.0948206429357224e-05,
      "loss": 8.6585,
      "step": 41120,
      "throughput": 17866.874899436556
    },
    {
      "epoch": 0.6450207290025783,
      "grad_norm": 0.08262521773576736,
      "learning_rate": 5.080175936895392e-05,
      "loss": 8.6673,
      "step": 41152,
      "throughput": 17866.97401668372
    },
    {
      "epoch": 0.6455223003315073,
      "grad_norm": 0.07248621433973312,
      "learning_rate": 5.065578326243348e-05,
      "loss": 8.6146,
      "step": 41184,
      "throughput": 17866.939932998972
    },
    {
      "epoch": 0.6460238716604362,
      "grad_norm": 0.08461394906044006,
      "learning_rate": 5.0510278711801735e-05,
      "loss": 8.6513,
      "step": 41216,
      "throughput": 17867.19175924385
    },
    {
      "epoch": 0.6465254429893651,
      "grad_norm": 0.08080363273620605,
      "learning_rate": 5.036524631711996e-05,
      "loss": 8.6439,
      "step": 41248,
      "throughput": 17867.07581491338
    },
    {
      "epoch": 0.647027014318294,
      "grad_norm": 0.07436025887727737,
      "learning_rate": 5.02206866765021e-05,
      "loss": 8.6421,
      "step": 41280,
      "throughput": 17867.35340200034
    },
    {
      "epoch": 0.6475285856472229,
      "grad_norm": 0.07391635328531265,
      "learning_rate": 5.007660038611259e-05,
      "loss": 8.6543,
      "step": 41312,
      "throughput": 17867.44467388088
    },
    {
      "epoch": 0.6480301569761518,
      "grad_norm": 0.08293146640062332,
      "learning_rate": 4.9932988040163726e-05,
      "loss": 8.6386,
      "step": 41344,
      "throughput": 17867.060722918603
    },
    {
      "epoch": 0.6485317283050808,
      "grad_norm": 0.08641663193702698,
      "learning_rate": 4.978985023091324e-05,
      "loss": 8.6454,
      "step": 41376,
      "throughput": 17867.136581283157
    },
    {
      "epoch": 0.6490332996340097,
      "grad_norm": 0.07811315357685089,
      "learning_rate": 4.964718754866186e-05,
      "loss": 8.6361,
      "step": 41408,
      "throughput": 17867.35349034328
    },
    {
      "epoch": 0.6495348709629386,
      "grad_norm": 0.07370807230472565,
      "learning_rate": 4.95050005817509e-05,
      "loss": 8.6499,
      "step": 41440,
      "throughput": 17867.511919709723
    },
    {
      "epoch": 0.6500364422918675,
      "grad_norm": 0.0777527242898941,
      "learning_rate": 4.936328991655988e-05,
      "loss": 8.6117,
      "step": 41472,
      "throughput": 17867.50111318367
    },
    {
      "epoch": 0.6505380136207964,
      "grad_norm": 0.0801592543721199,
      "learning_rate": 4.9222056137504e-05,
      "loss": 8.6319,
      "step": 41504,
      "throughput": 17867.853086445513
    },
    {
      "epoch": 0.6510395849497254,
      "grad_norm": 0.07333212345838547,
      "learning_rate": 4.908129982703169e-05,
      "loss": 8.6352,
      "step": 41536,
      "throughput": 17867.628775432426
    },
    {
      "epoch": 0.6515411562786543,
      "grad_norm": 0.07759763300418854,
      "learning_rate": 4.8941021565622516e-05,
      "loss": 8.6491,
      "step": 41568,
      "throughput": 17867.804595213962
    },
    {
      "epoch": 0.6520427276075831,
      "grad_norm": 0.07329500466585159,
      "learning_rate": 4.880122193178441e-05,
      "loss": 8.6472,
      "step": 41600,
      "throughput": 17867.99294167593
    },
    {
      "epoch": 0.652544298936512,
      "grad_norm": 0.07508452236652374,
      "learning_rate": 4.866190150205143e-05,
      "loss": 8.6398,
      "step": 41632,
      "throughput": 17867.830850863997
    },
    {
      "epoch": 0.6530458702654409,
      "grad_norm": 0.0712026059627533,
      "learning_rate": 4.8523060850981476e-05,
      "loss": 8.62,
      "step": 41664,
      "throughput": 17867.803101399382
    },
    {
      "epoch": 0.6535474415943698,
      "grad_norm": 0.09108668565750122,
      "learning_rate": 4.838470055115379e-05,
      "loss": 8.6402,
      "step": 41696,
      "throughput": 17867.811763701153
    },
    {
      "epoch": 0.6540490129232988,
      "grad_norm": 0.08174485713243484,
      "learning_rate": 4.82468211731667e-05,
      "loss": 8.6461,
      "step": 41728,
      "throughput": 17868.16016042303
    },
    {
      "epoch": 0.6545505842522277,
      "grad_norm": 0.08193115890026093,
      "learning_rate": 4.8109423285635116e-05,
      "loss": 8.6443,
      "step": 41760,
      "throughput": 17868.069688114592
    },
    {
      "epoch": 0.6550521555811566,
      "grad_norm": 0.0756475180387497,
      "learning_rate": 4.797250745518833e-05,
      "loss": 8.607,
      "step": 41792,
      "throughput": 17868.219940900595
    },
    {
      "epoch": 0.6555537269100855,
      "grad_norm": 0.07722701877355576,
      "learning_rate": 4.7836074246467685e-05,
      "loss": 8.6325,
      "step": 41824,
      "throughput": 17868.244047539516
    },
    {
      "epoch": 0.6560552982390144,
      "grad_norm": 0.07927604764699936,
      "learning_rate": 4.770012422212412e-05,
      "loss": 8.6342,
      "step": 41856,
      "throughput": 17868.25155414449
    },
    {
      "epoch": 0.6565568695679433,
      "grad_norm": 0.09216690063476562,
      "learning_rate": 4.756465794281592e-05,
      "loss": 8.6335,
      "step": 41888,
      "throughput": 17868.52465285224
    },
    {
      "epoch": 0.6570584408968723,
      "grad_norm": 0.07883986085653305,
      "learning_rate": 4.742967596720641e-05,
      "loss": 8.6454,
      "step": 41920,
      "throughput": 17868.603658253276
    },
    {
      "epoch": 0.6575600122258012,
      "grad_norm": 0.07400567084550858,
      "learning_rate": 4.729517885196169e-05,
      "loss": 8.6558,
      "step": 41952,
      "throughput": 17868.331232637775
    },
    {
      "epoch": 0.6580615835547301,
      "grad_norm": 0.07267136126756668,
      "learning_rate": 4.716116715174827e-05,
      "loss": 8.6288,
      "step": 41984,
      "throughput": 17868.341387047603
    },
    {
      "epoch": 0.6585631548836589,
      "grad_norm": 0.07420884817838669,
      "learning_rate": 4.702764141923075e-05,
      "loss": 8.6473,
      "step": 42016,
      "throughput": 17868.619800005705
    },
    {
      "epoch": 0.6590647262125878,
      "grad_norm": 0.07612421363592148,
      "learning_rate": 4.6894602205069674e-05,
      "loss": 8.6197,
      "step": 42048,
      "throughput": 17868.768551557903
    },
    {
      "epoch": 0.6595662975415167,
      "grad_norm": 0.09027434140443802,
      "learning_rate": 4.6762050057919165e-05,
      "loss": 8.6223,
      "step": 42080,
      "throughput": 17868.673524801692
    },
    {
      "epoch": 0.6600678688704457,
      "grad_norm": 0.0758587121963501,
      "learning_rate": 4.6629985524424686e-05,
      "loss": 8.6244,
      "step": 42112,
      "throughput": 17869.0184421209
    },
    {
      "epoch": 0.6605694401993746,
      "grad_norm": 0.08677732199430466,
      "learning_rate": 4.649840914922071e-05,
      "loss": 8.6343,
      "step": 42144,
      "throughput": 17868.714091906582
    },
    {
      "epoch": 0.6610710115283035,
      "grad_norm": 0.07691498100757599,
      "learning_rate": 4.636732147492863e-05,
      "loss": 8.6237,
      "step": 42176,
      "throughput": 17868.886598381487
    },
    {
      "epoch": 0.6615725828572324,
      "grad_norm": 0.07497978210449219,
      "learning_rate": 4.6236723042154424e-05,
      "loss": 8.6488,
      "step": 42208,
      "throughput": 17868.970009957025
    },
    {
      "epoch": 0.6620741541861613,
      "grad_norm": 0.07713934779167175,
      "learning_rate": 4.61066143894864e-05,
      "loss": 8.6389,
      "step": 42240,
      "throughput": 17868.90043001298
    },
    {
      "epoch": 0.6625757255150903,
      "grad_norm": 0.0759284496307373,
      "learning_rate": 4.5976996053492996e-05,
      "loss": 8.6253,
      "step": 42272,
      "throughput": 17868.98162581538
    },
    {
      "epoch": 0.6630772968440192,
      "grad_norm": 0.07967263460159302,
      "learning_rate": 4.5847868568720646e-05,
      "loss": 8.6185,
      "step": 42304,
      "throughput": 17868.993457005505
    },
    {
      "epoch": 0.6635788681729481,
      "grad_norm": 0.07670048624277115,
      "learning_rate": 4.571923246769147e-05,
      "loss": 8.6412,
      "step": 42336,
      "throughput": 17869.152099055646
    },
    {
      "epoch": 0.664080439501877,
      "grad_norm": 0.08368648588657379,
      "learning_rate": 4.559108828090115e-05,
      "loss": 8.632,
      "step": 42368,
      "throughput": 17869.164191981206
    },
    {
      "epoch": 0.6645820108308059,
      "grad_norm": 0.08008282631635666,
      "learning_rate": 4.546343653681667e-05,
      "loss": 8.6426,
      "step": 42400,
      "throughput": 17869.406153800315
    },
    {
      "epoch": 0.6650835821597348,
      "grad_norm": 0.07593069970607758,
      "learning_rate": 4.53362777618742e-05,
      "loss": 8.6291,
      "step": 42432,
      "throughput": 17869.34026576544
    },
    {
      "epoch": 0.6655851534886637,
      "grad_norm": 0.07599702477455139,
      "learning_rate": 4.52096124804769e-05,
      "loss": 8.6262,
      "step": 42464,
      "throughput": 17869.34267106005
    },
    {
      "epoch": 0.6660867248175926,
      "grad_norm": 0.08322039246559143,
      "learning_rate": 4.508344121499281e-05,
      "loss": 8.628,
      "step": 42496,
      "throughput": 17869.44801207976
    },
    {
      "epoch": 0.6665882961465215,
      "grad_norm": 0.08085814118385315,
      "learning_rate": 4.495776448575255e-05,
      "loss": 8.6247,
      "step": 42528,
      "throughput": 17869.685302503272
    },
    {
      "epoch": 0.6670898674754504,
      "grad_norm": 0.07521507143974304,
      "learning_rate": 4.483258281104734e-05,
      "loss": 8.6076,
      "step": 42560,
      "throughput": 17869.411012059038
    },
    {
      "epoch": 0.6675914388043793,
      "grad_norm": 0.08697548508644104,
      "learning_rate": 4.470789670712681e-05,
      "loss": 8.6259,
      "step": 42592,
      "throughput": 17869.41670981458
    },
    {
      "epoch": 0.6680930101333082,
      "grad_norm": 0.08319617062807083,
      "learning_rate": 4.458370668819676e-05,
      "loss": 8.6411,
      "step": 42624,
      "throughput": 17869.702199175634
    },
    {
      "epoch": 0.6685945814622372,
      "grad_norm": 0.074796162545681,
      "learning_rate": 4.4460013266417226e-05,
      "loss": 8.6154,
      "step": 42656,
      "throughput": 17869.84765573023
    },
    {
      "epoch": 0.6690961527911661,
      "grad_norm": 0.10984188318252563,
      "learning_rate": 4.433681695190027e-05,
      "loss": 8.6231,
      "step": 42688,
      "throughput": 17869.75924603985
    },
    {
      "epoch": 0.669597724120095,
      "grad_norm": 0.07607369124889374,
      "learning_rate": 4.421411825270785e-05,
      "loss": 8.614,
      "step": 42720,
      "throughput": 17869.892649707363
    },
    {
      "epoch": 0.6700992954490239,
      "grad_norm": 0.07601217180490494,
      "learning_rate": 4.4091917674849727e-05,
      "loss": 8.6327,
      "step": 42752,
      "throughput": 17869.76525839264
    },
    {
      "epoch": 0.6706008667779528,
      "grad_norm": 0.08114274591207504,
      "learning_rate": 4.397021572228147e-05,
      "loss": 8.6388,
      "step": 42784,
      "throughput": 17869.938173688686
    },
    {
      "epoch": 0.6711024381068817,
      "grad_norm": 0.09427239745855331,
      "learning_rate": 4.38490128969023e-05,
      "loss": 8.6407,
      "step": 42816,
      "throughput": 17870.02010821782
    },
    {
      "epoch": 0.6716040094358107,
      "grad_norm": 0.077272430062294,
      "learning_rate": 4.3728309698553056e-05,
      "loss": 8.6417,
      "step": 42848,
      "throughput": 17869.953129300473
    },
    {
      "epoch": 0.6721055807647396,
      "grad_norm": 0.07326050847768784,
      "learning_rate": 4.3608106625014014e-05,
      "loss": 8.6258,
      "step": 42880,
      "throughput": 17869.9248813899
    },
    {
      "epoch": 0.6726071520936684,
      "grad_norm": 0.07527446001768112,
      "learning_rate": 4.348840417200306e-05,
      "loss": 8.6325,
      "step": 42912,
      "throughput": 17870.028090881584
    },
    {
      "epoch": 0.6731087234225973,
      "grad_norm": 0.07910473644733429,
      "learning_rate": 4.336920283317343e-05,
      "loss": 8.6445,
      "step": 42944,
      "throughput": 17870.18679976226
    },
    {
      "epoch": 0.6736102947515262,
      "grad_norm": 0.07715103030204773,
      "learning_rate": 4.325050310011183e-05,
      "loss": 8.648,
      "step": 42976,
      "throughput": 17870.101235465474
    },
    {
      "epoch": 0.6741118660804551,
      "grad_norm": 0.09285691380500793,
      "learning_rate": 4.3132305462336306e-05,
      "loss": 8.6354,
      "step": 43008,
      "throughput": 17870.4344978396
    },
    {
      "epoch": 0.6746134374093841,
      "grad_norm": 0.07316839694976807,
      "learning_rate": 4.301461040729424e-05,
      "loss": 8.6547,
      "step": 43040,
      "throughput": 17829.53093573461
    },
    {
      "epoch": 0.675115008738313,
      "grad_norm": 0.08353468775749207,
      "learning_rate": 4.289741842036042e-05,
      "loss": 8.6261,
      "step": 43072,
      "throughput": 17828.00636019171
    },
    {
      "epoch": 0.6756165800672419,
      "grad_norm": 0.07400113344192505,
      "learning_rate": 4.2780729984834916e-05,
      "loss": 8.6173,
      "step": 43104,
      "throughput": 17827.94909204432
    },
    {
      "epoch": 0.6761181513961708,
      "grad_norm": 0.08323801308870316,
      "learning_rate": 4.266454558194122e-05,
      "loss": 8.6261,
      "step": 43136,
      "throughput": 17827.857852711364
    },
    {
      "epoch": 0.6766197227250997,
      "grad_norm": 0.08236182481050491,
      "learning_rate": 4.254886569082413e-05,
      "loss": 8.61,
      "step": 43168,
      "throughput": 17828.213728487226
    },
    {
      "epoch": 0.6771212940540287,
      "grad_norm": 0.08978724479675293,
      "learning_rate": 4.243369078854788e-05,
      "loss": 8.6226,
      "step": 43200,
      "throughput": 17828.47533696741
    },
    {
      "epoch": 0.6776228653829576,
      "grad_norm": 0.07722384482622147,
      "learning_rate": 4.231902135009407e-05,
      "loss": 8.6436,
      "step": 43232,
      "throughput": 17828.832063955277
    },
    {
      "epoch": 0.6781244367118865,
      "grad_norm": 0.07705316692590714,
      "learning_rate": 4.220485784835984e-05,
      "loss": 8.6338,
      "step": 43264,
      "throughput": 17829.196605309022
    },
    {
      "epoch": 0.6786260080408154,
      "grad_norm": 0.07429414242506027,
      "learning_rate": 4.209120075415577e-05,
      "loss": 8.6076,
      "step": 43296,
      "throughput": 17829.554240003934
    },
    {
      "epoch": 0.6791275793697443,
      "grad_norm": 0.07961345463991165,
      "learning_rate": 4.197805053620411e-05,
      "loss": 8.6129,
      "step": 43328,
      "throughput": 17829.906459575002
    },
    {
      "epoch": 0.6796291506986731,
      "grad_norm": 0.07683772593736649,
      "learning_rate": 4.186540766113665e-05,
      "loss": 8.6178,
      "step": 43360,
      "throughput": 17829.15771918659
    },
    {
      "epoch": 0.680130722027602,
      "grad_norm": 0.07365067303180695,
      "learning_rate": 4.1753272593492956e-05,
      "loss": 8.6282,
      "step": 43392,
      "throughput": 17828.201463559795
    },
    {
      "epoch": 0.680632293356531,
      "grad_norm": 0.07368257641792297,
      "learning_rate": 4.1641645795718364e-05,
      "loss": 8.6367,
      "step": 43424,
      "throughput": 17828.16390825877
    },
    {
      "epoch": 0.6811338646854599,
      "grad_norm": 0.08067294210195541,
      "learning_rate": 4.153052772816217e-05,
      "loss": 8.6053,
      "step": 43456,
      "throughput": 17828.43430354821
    },
    {
      "epoch": 0.6816354360143888,
      "grad_norm": 0.0738702118396759,
      "learning_rate": 4.141991884907555e-05,
      "loss": 8.6118,
      "step": 43488,
      "throughput": 17828.703134283922
    },
    {
      "epoch": 0.6821370073433177,
      "grad_norm": 0.07744862884283066,
      "learning_rate": 4.1309819614609865e-05,
      "loss": 8.6139,
      "step": 43520,
      "throughput": 17829.06611352555
    },
    {
      "epoch": 0.6826385786722466,
      "grad_norm": 0.08360811322927475,
      "learning_rate": 4.1200230478814695e-05,
      "loss": 8.6318,
      "step": 43552,
      "throughput": 17829.43111372124
    },
    {
      "epoch": 0.6831401500011756,
      "grad_norm": 0.08169478923082352,
      "learning_rate": 4.109115189363601e-05,
      "loss": 8.6276,
      "step": 43584,
      "throughput": 17829.79810554368
    },
    {
      "epoch": 0.6836417213301045,
      "grad_norm": 0.07911770045757294,
      "learning_rate": 4.0982584308914114e-05,
      "loss": 8.5991,
      "step": 43616,
      "throughput": 17830.156279143917
    },
    {
      "epoch": 0.6841432926590334,
      "grad_norm": 0.0773385539650917,
      "learning_rate": 4.0874528172382114e-05,
      "loss": 8.6271,
      "step": 43648,
      "throughput": 17830.127300317476
    },
    {
      "epoch": 0.6846448639879623,
      "grad_norm": 0.07815414667129517,
      "learning_rate": 4.0766983929663835e-05,
      "loss": 8.6012,
      "step": 43680,
      "throughput": 17829.028219723397
    },
    {
      "epoch": 0.6851464353168912,
      "grad_norm": 0.07514085620641708,
      "learning_rate": 4.065995202427206e-05,
      "loss": 8.6132,
      "step": 43712,
      "throughput": 17829.106705307986
    },
    {
      "epoch": 0.6856480066458202,
      "grad_norm": 0.07363571971654892,
      "learning_rate": 4.055343289760664e-05,
      "loss": 8.6209,
      "step": 43744,
      "throughput": 17829.082599309702
    },
    {
      "epoch": 0.6861495779747491,
      "grad_norm": 0.10579396039247513,
      "learning_rate": 4.0447426988952816e-05,
      "loss": 8.5954,
      "step": 43776,
      "throughput": 17829.43620231659
    },
    {
      "epoch": 0.6866511493036779,
      "grad_norm": 0.08076892048120499,
      "learning_rate": 4.0341934735479224e-05,
      "loss": 8.6166,
      "step": 43808,
      "throughput": 17829.703211476124
    },
    {
      "epoch": 0.6871527206326068,
      "grad_norm": 0.07535163313150406,
      "learning_rate": 4.02369565722363e-05,
      "loss": 8.6349,
      "step": 43840,
      "throughput": 17830.054087310255
    },
    {
      "epoch": 0.6876542919615357,
      "grad_norm": 0.07902940362691879,
      "learning_rate": 4.013249293215422e-05,
      "loss": 8.5933,
      "step": 43872,
      "throughput": 17830.408681836223
    },
    {
      "epoch": 0.6881558632904646,
      "grad_norm": 0.07208844274282455,
      "learning_rate": 4.0028544246041406e-05,
      "loss": 8.6062,
      "step": 43904,
      "throughput": 17830.762075605933
    },
    {
      "epoch": 0.6886574346193935,
      "grad_norm": 0.07734175026416779,
      "learning_rate": 3.99251109425825e-05,
      "loss": 8.6341,
      "step": 43936,
      "throughput": 17831.037793466916
    },
    {
      "epoch": 0.6891590059483225,
      "grad_norm": 0.07820406556129456,
      "learning_rate": 3.982219344833681e-05,
      "loss": 8.6247,
      "step": 43968,
      "throughput": 17829.944007343434
    },
    {
      "epoch": 0.6896605772772514,
      "grad_norm": 0.08111374080181122,
      "learning_rate": 3.971979218773634e-05,
      "loss": 8.6006,
      "step": 44000,
      "throughput": 17829.537729953823
    },
    {
      "epoch": 0.6901621486061803,
      "grad_norm": 0.0744817703962326,
      "learning_rate": 3.961790758308418e-05,
      "loss": 8.6149,
      "step": 44032,
      "throughput": 17829.513134364835
    },
    {
      "epoch": 0.6906637199351092,
      "grad_norm": 0.07969462871551514,
      "learning_rate": 3.951654005455281e-05,
      "loss": 8.61,
      "step": 44064,
      "throughput": 17829.86407906323
    },
    {
      "epoch": 0.6911652912640381,
      "grad_norm": 0.0778898149728775,
      "learning_rate": 3.9415690020182154e-05,
      "loss": 8.6197,
      "step": 44096,
      "throughput": 17830.128389522783
    },
    {
      "epoch": 0.6916668625929671,
      "grad_norm": 0.07783157378435135,
      "learning_rate": 3.9315357895878066e-05,
      "loss": 8.6106,
      "step": 44128,
      "throughput": 17830.48168304553
    },
    {
      "epoch": 0.692168433921896,
      "grad_norm": 0.08259106427431107,
      "learning_rate": 3.921554409541053e-05,
      "loss": 8.6088,
      "step": 44160,
      "throughput": 17830.83309655857
    },
    {
      "epoch": 0.6926700052508249,
      "grad_norm": 0.08314735442399979,
      "learning_rate": 3.911624903041198e-05,
      "loss": 8.6292,
      "step": 44192,
      "throughput": 17831.187801043976
    },
    {
      "epoch": 0.6931715765797538,
      "grad_norm": 0.07506611198186874,
      "learning_rate": 3.9017473110375525e-05,
      "loss": 8.6192,
      "step": 44224,
      "throughput": 17831.44545054685
    },
    {
      "epoch": 0.6936731479086826,
      "grad_norm": 0.08658742159605026,
      "learning_rate": 3.891921674265336e-05,
      "loss": 8.6018,
      "step": 44256,
      "throughput": 17831.315833658613
    },
    {
      "epoch": 0.6941747192376115,
      "grad_norm": 0.08220481127500534,
      "learning_rate": 3.8821480332455024e-05,
      "loss": 8.61,
      "step": 44288,
      "throughput": 17830.152042987287
    },
    {
      "epoch": 0.6946762905665405,
      "grad_norm": 0.08411680907011032,
      "learning_rate": 3.87242642828458e-05,
      "loss": 8.6295,
      "step": 44320,
      "throughput": 17830.21507840018
    },
    {
      "epoch": 0.6951778618954694,
      "grad_norm": 0.09514406323432922,
      "learning_rate": 3.862756899474493e-05,
      "loss": 8.6189,
      "step": 44352,
      "throughput": 17830.27413842766
    },
    {
      "epoch": 0.6956794332243983,
      "grad_norm": 0.07466054707765579,
      "learning_rate": 3.853139486692408e-05,
      "loss": 8.592,
      "step": 44384,
      "throughput": 17830.626133393966
    },
    {
      "epoch": 0.6961810045533272,
      "grad_norm": 0.07343938946723938,
      "learning_rate": 3.843574229600565e-05,
      "loss": 8.6085,
      "step": 44416,
      "throughput": 17830.88249626846
    },
    {
      "epoch": 0.6966825758822561,
      "grad_norm": 0.08367404341697693,
      "learning_rate": 3.834061167646112e-05,
      "loss": 8.6191,
      "step": 44448,
      "throughput": 17831.23872612226
    },
    {
      "epoch": 0.697184147211185,
      "grad_norm": 0.08425655215978622,
      "learning_rate": 3.8246003400609424e-05,
      "loss": 8.6116,
      "step": 44480,
      "throughput": 17831.58888669082
    },
    {
      "epoch": 0.697685718540114,
      "grad_norm": 0.09636666625738144,
      "learning_rate": 3.81519178586154e-05,
      "loss": 8.6087,
      "step": 44512,
      "throughput": 17831.944748926202
    },
    {
      "epoch": 0.6981872898690429,
      "grad_norm": 0.07186683267354965,
      "learning_rate": 3.805835543848809e-05,
      "loss": 8.6372,
      "step": 44544,
      "throughput": 17831.92925302466
    },
    {
      "epoch": 0.6986888611979718,
      "grad_norm": 0.07774645835161209,
      "learning_rate": 3.796531652607919e-05,
      "loss": 8.629,
      "step": 44576,
      "throughput": 17830.856019052466
    },
    {
      "epoch": 0.6991904325269007,
      "grad_norm": 0.07877375930547714,
      "learning_rate": 3.7872801505081434e-05,
      "loss": 8.6345,
      "step": 44608,
      "throughput": 17830.757211903798
    },
    {
      "epoch": 0.6996920038558296,
      "grad_norm": 0.07710936665534973,
      "learning_rate": 3.778081075702709e-05,
      "loss": 8.6043,
      "step": 44640,
      "throughput": 17830.732995255126
    },
    {
      "epoch": 0.7001935751847586,
      "grad_norm": 0.0777658000588417,
      "learning_rate": 3.7689344661286264e-05,
      "loss": 8.6328,
      "step": 44672,
      "throughput": 17831.086625902175
    },
    {
      "epoch": 0.7006951465136874,
      "grad_norm": 0.07222341746091843,
      "learning_rate": 3.759840359506536e-05,
      "loss": 8.6041,
      "step": 44704,
      "throughput": 17831.35073612898
    },
    {
      "epoch": 0.7011967178426163,
      "grad_norm": 0.07875847816467285,
      "learning_rate": 3.750798793340565e-05,
      "loss": 8.6201,
      "step": 44736,
      "throughput": 17831.704921569017
    },
    {
      "epoch": 0.7016982891715452,
      "grad_norm": 0.08122248947620392,
      "learning_rate": 3.7418098049181573e-05,
      "loss": 8.6311,
      "step": 44768,
      "throughput": 17832.053439285082
    },
    {
      "epoch": 0.7021998605004741,
      "grad_norm": 0.08080139756202698,
      "learning_rate": 3.732873431309929e-05,
      "loss": 8.6082,
      "step": 44800,
      "throughput": 17832.40082616424
    },
    {
      "epoch": 0.702701431829403,
      "grad_norm": 0.0753706693649292,
      "learning_rate": 3.7239897093695106e-05,
      "loss": 8.6183,
      "step": 44832,
      "throughput": 17832.662159292475
    },
    {
      "epoch": 0.703203003158332,
      "grad_norm": 0.07338026911020279,
      "learning_rate": 3.715158675733396e-05,
      "loss": 8.618,
      "step": 44864,
      "throughput": 17832.246083241713
    },
    {
      "epoch": 0.7037045744872609,
      "grad_norm": 0.07462576031684875,
      "learning_rate": 3.706380366820796e-05,
      "loss": 8.6148,
      "step": 44896,
      "throughput": 17831.393643303032
    },
    {
      "epoch": 0.7042061458161898,
      "grad_norm": 0.07330302149057388,
      "learning_rate": 3.6976548188334834e-05,
      "loss": 8.5964,
      "step": 44928,
      "throughput": 17831.456045595824
    },
    {
      "epoch": 0.7047077171451187,
      "grad_norm": 0.07165367156267166,
      "learning_rate": 3.688982067755642e-05,
      "loss": 8.5966,
      "step": 44960,
      "throughput": 17831.61336550786
    },
    {
      "epoch": 0.7052092884740476,
      "grad_norm": 0.07546305656433105,
      "learning_rate": 3.680362149353724e-05,
      "loss": 8.6257,
      "step": 44992,
      "throughput": 17831.95989946819
    },
    {
      "epoch": 0.7057108598029765,
      "grad_norm": 0.07777175307273865,
      "learning_rate": 3.671795099176297e-05,
      "loss": 8.6032,
      "step": 45024,
      "throughput": 17832.22300289001
    },
    {
      "epoch": 0.7062124311319055,
      "grad_norm": 0.08201725035905838,
      "learning_rate": 3.6632809525539055e-05,
      "loss": 8.6169,
      "step": 45056,
      "throughput": 17832.570589285373
    },
    {
      "epoch": 0.7067140024608344,
      "grad_norm": 0.07370094209909439,
      "learning_rate": 3.6548197445989086e-05,
      "loss": 8.6216,
      "step": 45088,
      "throughput": 17831.93015607742
    },
    {
      "epoch": 0.7072155737897633,
      "grad_norm": 0.09871018677949905,
      "learning_rate": 3.6464115102053596e-05,
      "loss": 8.6063,
      "step": 45120,
      "throughput": 17832.114820630442
    },
    {
      "epoch": 0.7077171451186921,
      "grad_norm": 0.08234908431768417,
      "learning_rate": 3.6380562840488376e-05,
      "loss": 8.6291,
      "step": 45152,
      "throughput": 17832.17183748897
    },
    {
      "epoch": 0.708218716447621,
      "grad_norm": 0.08061608672142029,
      "learning_rate": 3.629754100586323e-05,
      "loss": 8.6041,
      "step": 45184,
      "throughput": 17831.104851476342
    },
    {
      "epoch": 0.7087202877765499,
      "grad_norm": 0.07441861927509308,
      "learning_rate": 3.6215049940560433e-05,
      "loss": 8.6275,
      "step": 45216,
      "throughput": 17831.083314860523
    },
    {
      "epoch": 0.7092218591054789,
      "grad_norm": 0.07990828156471252,
      "learning_rate": 3.613308998477339e-05,
      "loss": 8.579,
      "step": 45248,
      "throughput": 17831.062522278604
    },
    {
      "epoch": 0.7097234304344078,
      "grad_norm": 0.07660244405269623,
      "learning_rate": 3.605166147650517e-05,
      "loss": 8.6059,
      "step": 45280,
      "throughput": 17831.40752875969
    },
    {
      "epoch": 0.7102250017633367,
      "grad_norm": 0.07470241189002991,
      "learning_rate": 3.597076475156726e-05,
      "loss": 8.629,
      "step": 45312,
      "throughput": 17831.660710762444
    },
    {
      "epoch": 0.7107265730922656,
      "grad_norm": 0.08347544074058533,
      "learning_rate": 3.589040014357791e-05,
      "loss": 8.6297,
      "step": 45344,
      "throughput": 17832.003144784834
    },
    {
      "epoch": 0.7112281444211945,
      "grad_norm": 0.08342598378658295,
      "learning_rate": 3.581056798396105e-05,
      "loss": 8.6175,
      "step": 45376,
      "throughput": 17832.345687120098
    },
    {
      "epoch": 0.7117297157501234,
      "grad_norm": 0.07837989181280136,
      "learning_rate": 3.57312686019447e-05,
      "loss": 8.6017,
      "step": 45408,
      "throughput": 17832.603146867266
    },
    {
      "epoch": 0.7122312870790524,
      "grad_norm": 0.07344204932451248,
      "learning_rate": 3.565250232455983e-05,
      "loss": 8.6169,
      "step": 45440,
      "throughput": 17832.851243073444
    },
    {
      "epoch": 0.7127328584079813,
      "grad_norm": 0.07202349603176117,
      "learning_rate": 3.55742694766387e-05,
      "loss": 8.6055,
      "step": 45472,
      "throughput": 17831.984718235526
    },
    {
      "epoch": 0.7132344297369102,
      "grad_norm": 0.08134770393371582,
      "learning_rate": 3.549657038081386e-05,
      "loss": 8.6118,
      "step": 45504,
      "throughput": 17831.51519479825
    },
    {
      "epoch": 0.7137360010658391,
      "grad_norm": 0.07818509638309479,
      "learning_rate": 3.5419405357516624e-05,
      "loss": 8.5977,
      "step": 45536,
      "throughput": 17831.574718349315
    },
    {
      "epoch": 0.714237572394768,
      "grad_norm": 0.07608740031719208,
      "learning_rate": 3.534277472497574e-05,
      "loss": 8.6037,
      "step": 45568,
      "throughput": 17831.82476007102
    },
    {
      "epoch": 0.7147391437236968,
      "grad_norm": 0.08183170855045319,
      "learning_rate": 3.52666787992162e-05,
      "loss": 8.6228,
      "step": 45600,
      "throughput": 17832.07426776666
    },
    {
      "epoch": 0.7152407150526258,
      "grad_norm": 0.07773195207118988,
      "learning_rate": 3.519111789405779e-05,
      "loss": 8.6376,
      "step": 45632,
      "throughput": 17832.416637519258
    },
    {
      "epoch": 0.7157422863815547,
      "grad_norm": 0.07735767215490341,
      "learning_rate": 3.5116092321113936e-05,
      "loss": 8.612,
      "step": 45664,
      "throughput": 17832.676189491172
    },
    {
      "epoch": 0.7162438577104836,
      "grad_norm": 0.07806604355573654,
      "learning_rate": 3.504160238979032e-05,
      "loss": 8.592,
      "step": 45696,
      "throughput": 17833.019040719744
    },
    {
      "epoch": 0.7167454290394125,
      "grad_norm": 0.07910951972007751,
      "learning_rate": 3.496764840728361e-05,
      "loss": 8.591,
      "step": 45728,
      "throughput": 17833.26704762432
    },
    {
      "epoch": 0.7172470003683414,
      "grad_norm": 0.08164380490779877,
      "learning_rate": 3.489423067858027e-05,
      "loss": 8.6037,
      "step": 45760,
      "throughput": 17833.137383345835
    },
    {
      "epoch": 0.7177485716972704,
      "grad_norm": 0.0812004879117012,
      "learning_rate": 3.4821349506455255e-05,
      "loss": 8.6192,
      "step": 45792,
      "throughput": 17832.10878133463
    },
    {
      "epoch": 0.7182501430261993,
      "grad_norm": 0.07716728746891022,
      "learning_rate": 3.47490051914707e-05,
      "loss": 8.5838,
      "step": 45824,
      "throughput": 17832.087124480226
    },
    {
      "epoch": 0.7187517143551282,
      "grad_norm": 0.07972914725542068,
      "learning_rate": 3.4677198031974784e-05,
      "loss": 8.6032,
      "step": 45856,
      "throughput": 17832.146003866314
    },
    {
      "epoch": 0.7192532856840571,
      "grad_norm": 0.07909884303808212,
      "learning_rate": 3.4605928324100444e-05,
      "loss": 8.6251,
      "step": 45888,
      "throughput": 17832.48324538191
    },
    {
      "epoch": 0.719754857012986,
      "grad_norm": 0.07721769064664841,
      "learning_rate": 3.45351963617642e-05,
      "loss": 8.5953,
      "step": 45920,
      "throughput": 17832.729419615276
    },
    {
      "epoch": 0.720256428341915,
      "grad_norm": 0.07230034470558167,
      "learning_rate": 3.446500243666481e-05,
      "loss": 8.6252,
      "step": 45952,
      "throughput": 17832.980164510158
    },
    {
      "epoch": 0.7207579996708439,
      "grad_norm": 0.0790412500500679,
      "learning_rate": 3.439534683828228e-05,
      "loss": 8.6085,
      "step": 45984,
      "throughput": 17833.314847562127
    },
    {
      "epoch": 0.7212595709997727,
      "grad_norm": 0.08125613629817963,
      "learning_rate": 3.4326229853876475e-05,
      "loss": 8.6184,
      "step": 46016,
      "throughput": 17833.57056758044
    },
    {
      "epoch": 0.7217611423287016,
      "grad_norm": 0.08089485764503479,
      "learning_rate": 3.425765176848607e-05,
      "loss": 8.6079,
      "step": 46048,
      "throughput": 17833.715059902177
    },
    {
      "epoch": 0.7222627136576305,
      "grad_norm": 0.08002211153507233,
      "learning_rate": 3.418961286492728e-05,
      "loss": 8.6169,
      "step": 46080,
      "throughput": 17832.842095556927
    },
    {
      "epoch": 0.7227642849865594,
      "grad_norm": 0.08807891607284546,
      "learning_rate": 3.412211342379273e-05,
      "loss": 8.6204,
      "step": 46112,
      "throughput": 17832.438760989724
    },
    {
      "epoch": 0.7232658563154883,
      "grad_norm": 0.07419393956661224,
      "learning_rate": 3.405515372345033e-05,
      "loss": 8.6076,
      "step": 46144,
      "throughput": 17832.408519828827
    },
    {
      "epoch": 0.7237674276444173,
      "grad_norm": 0.0870908796787262,
      "learning_rate": 3.398873404004209e-05,
      "loss": 8.6187,
      "step": 46176,
      "throughput": 17832.746373556896
    },
    {
      "epoch": 0.7242689989733462,
      "grad_norm": 0.07417533546686172,
      "learning_rate": 3.392285464748298e-05,
      "loss": 8.6184,
      "step": 46208,
      "throughput": 17832.997842482142
    },
    {
      "epoch": 0.7247705703022751,
      "grad_norm": 0.07752197980880737,
      "learning_rate": 3.385751581745979e-05,
      "loss": 8.6129,
      "step": 46240,
      "throughput": 17833.339610502808
    },
    {
      "epoch": 0.725272141631204,
      "grad_norm": 0.08416827768087387,
      "learning_rate": 3.379271781943007e-05,
      "loss": 8.5945,
      "step": 46272,
      "throughput": 17833.587420699234
    },
    {
      "epoch": 0.7257737129601329,
      "grad_norm": 0.1027483195066452,
      "learning_rate": 3.372846092062095e-05,
      "loss": 8.6091,
      "step": 46304,
      "throughput": 17833.920243728942
    },
    {
      "epoch": 0.7262752842890619,
      "grad_norm": 0.07594089955091476,
      "learning_rate": 3.366474538602806e-05,
      "loss": 8.6121,
      "step": 46336,
      "throughput": 17834.165193588993
    },
    {
      "epoch": 0.7267768556179908,
      "grad_norm": 0.07859986275434494,
      "learning_rate": 3.3601571478414455e-05,
      "loss": 8.5916,
      "step": 46368,
      "throughput": 17833.943095792438
    },
    {
      "epoch": 0.7272784269469197,
      "grad_norm": 0.07391414791345596,
      "learning_rate": 3.3538939458309556e-05,
      "loss": 8.6061,
      "step": 46400,
      "throughput": 17833.008350621407
    },
    {
      "epoch": 0.7277799982758486,
      "grad_norm": 0.07304394245147705,
      "learning_rate": 3.347684958400795e-05,
      "loss": 8.5964,
      "step": 46432,
      "throughput": 17832.974470104615
    },
    {
      "epoch": 0.7282815696047774,
      "grad_norm": 0.08239227533340454,
      "learning_rate": 3.341530211156847e-05,
      "loss": 8.6014,
      "step": 46464,
      "throughput": 17833.22047940315
    },
    {
      "epoch": 0.7287831409337063,
      "grad_norm": 0.07560481131076813,
      "learning_rate": 3.33542972948131e-05,
      "loss": 8.604,
      "step": 46496,
      "throughput": 17833.552998804513
    },
    {
      "epoch": 0.7292847122626352,
      "grad_norm": 0.08268658071756363,
      "learning_rate": 3.329383538532587e-05,
      "loss": 8.6096,
      "step": 46528,
      "throughput": 17833.803191749368
    },
    {
      "epoch": 0.7297862835915642,
      "grad_norm": 0.08136653900146484,
      "learning_rate": 3.323391663245188e-05,
      "loss": 8.6022,
      "step": 46560,
      "throughput": 17834.047552995467
    },
    {
      "epoch": 0.7302878549204931,
      "grad_norm": 0.07478020340204239,
      "learning_rate": 3.3174541283296225e-05,
      "loss": 8.5982,
      "step": 46592,
      "throughput": 17834.308879658493
    },
    {
      "epoch": 0.730789426249422,
      "grad_norm": 0.07832559198141098,
      "learning_rate": 3.311570958272303e-05,
      "loss": 8.5859,
      "step": 46624,
      "throughput": 17834.55308335101
    },
    {
      "epoch": 0.7312909975783509,
      "grad_norm": 0.07957372814416885,
      "learning_rate": 3.305742177335444e-05,
      "loss": 8.5923,
      "step": 46656,
      "throughput": 17834.52670424745
    },
    {
      "epoch": 0.7317925689072798,
      "grad_norm": 0.07402876764535904,
      "learning_rate": 3.29996780955695e-05,
      "loss": 8.591,
      "step": 46688,
      "throughput": 17833.69300948565
    },
    {
      "epoch": 0.7322941402362088,
      "grad_norm": 0.09786561131477356,
      "learning_rate": 3.294247878750333e-05,
      "loss": 8.6094,
      "step": 46720,
      "throughput": 17833.587017346064
    },
    {
      "epoch": 0.7327957115651377,
      "grad_norm": 0.07788746058940887,
      "learning_rate": 3.288582408504603e-05,
      "loss": 8.5917,
      "step": 46752,
      "throughput": 17833.55305593615
    },
    {
      "epoch": 0.7332972828940666,
      "grad_norm": 0.0738791972398758,
      "learning_rate": 3.2829714221841805e-05,
      "loss": 8.63,
      "step": 46784,
      "throughput": 17833.883406695128
    },
    {
      "epoch": 0.7337988542229955,
      "grad_norm": 0.07963655143976212,
      "learning_rate": 3.2774149429287854e-05,
      "loss": 8.606,
      "step": 46816,
      "throughput": 17834.13096712285
    },
    {
      "epoch": 0.7343004255519244,
      "grad_norm": 0.0802462100982666,
      "learning_rate": 3.271912993653357e-05,
      "loss": 8.6161,
      "step": 46848,
      "throughput": 17834.37435759249
    },
    {
      "epoch": 0.7348019968808533,
      "grad_norm": 0.07487773150205612,
      "learning_rate": 3.266465597047948e-05,
      "loss": 8.6063,
      "step": 46880,
      "throughput": 17834.613366413905
    },
    {
      "epoch": 0.7353035682097822,
      "grad_norm": 0.07707255333662033,
      "learning_rate": 3.261072775577641e-05,
      "loss": 8.609,
      "step": 46912,
      "throughput": 17834.93786758653
    },
    {
      "epoch": 0.7358051395387111,
      "grad_norm": 0.08311225473880768,
      "learning_rate": 3.255734551482446e-05,
      "loss": 8.5932,
      "step": 46944,
      "throughput": 17835.193244551054
    },
    {
      "epoch": 0.73630671086764,
      "grad_norm": 0.07928361743688583,
      "learning_rate": 3.2504509467772154e-05,
      "loss": 8.5993,
      "step": 46976,
      "throughput": 17834.624306488906
    },
    {
      "epoch": 0.7368082821965689,
      "grad_norm": 0.07688979804515839,
      "learning_rate": 3.24522198325155e-05,
      "loss": 8.5923,
      "step": 47008,
      "throughput": 17834.22779306419
    },
    {
      "epoch": 0.7373098535254978,
      "grad_norm": 0.08063149452209473,
      "learning_rate": 3.2400476824697126e-05,
      "loss": 8.5946,
      "step": 47040,
      "throughput": 17834.103907984365
    },
    {
      "epoch": 0.7378114248544267,
      "grad_norm": 0.0874280259013176,
      "learning_rate": 3.234928065770532e-05,
      "loss": 8.6212,
      "step": 47072,
      "throughput": 17834.343842706545
    },
    {
      "epoch": 0.7383129961833557,
      "grad_norm": 0.08335231989622116,
      "learning_rate": 3.2298631542673254e-05,
      "loss": 8.6064,
      "step": 47104,
      "throughput": 17834.663927160742
    },
    {
      "epoch": 0.7388145675122846,
      "grad_norm": 0.07642818242311478,
      "learning_rate": 3.2248529688478036e-05,
      "loss": 8.6317,
      "step": 47136,
      "throughput": 17834.044860553677
    },
    {
      "epoch": 0.7393161388412135,
      "grad_norm": 0.10783732682466507,
      "learning_rate": 3.2198975301739834e-05,
      "loss": 8.5986,
      "step": 47168,
      "throughput": 17834.201288398795
    },
    {
      "epoch": 0.7398177101701424,
      "grad_norm": 0.07692237198352814,
      "learning_rate": 3.214996858682109e-05,
      "loss": 8.6017,
      "step": 47200,
      "throughput": 17834.44666997345
    },
    {
      "epoch": 0.7403192814990713,
      "grad_norm": 0.07549633085727692,
      "learning_rate": 3.210150974582565e-05,
      "loss": 8.62,
      "step": 47232,
      "throughput": 17834.614133441388
    },
    {
      "epoch": 0.7408208528280003,
      "grad_norm": 0.08472277969121933,
      "learning_rate": 3.205359897859793e-05,
      "loss": 8.5948,
      "step": 47264,
      "throughput": 17834.48982830613
    },
    {
      "epoch": 0.7413224241569292,
      "grad_norm": 0.08152199536561966,
      "learning_rate": 3.2006236482722034e-05,
      "loss": 8.5738,
      "step": 47296,
      "throughput": 17833.84796111838
    },
    {
      "epoch": 0.7418239954858581,
      "grad_norm": 0.07427741587162018,
      "learning_rate": 3.195942245352108e-05,
      "loss": 8.6153,
      "step": 47328,
      "throughput": 17833.826142962356
    },
    {
      "epoch": 0.7423255668147869,
      "grad_norm": 0.07587329298257828,
      "learning_rate": 3.191315708405626e-05,
      "loss": 8.5985,
      "step": 47360,
      "throughput": 17833.884920516462
    },
    {
      "epoch": 0.7428271381437158,
      "grad_norm": 0.08033397793769836,
      "learning_rate": 3.1867440565126066e-05,
      "loss": 8.6256,
      "step": 47392,
      "throughput": 17834.21834739414
    },
    {
      "epoch": 0.7433287094726447,
      "grad_norm": 0.07397351413965225,
      "learning_rate": 3.182227308526557e-05,
      "loss": 8.5935,
      "step": 47424,
      "throughput": 17834.463669530487
    },
    {
      "epoch": 0.7438302808015737,
      "grad_norm": 0.0776250809431076,
      "learning_rate": 3.17776548307456e-05,
      "loss": 8.6178,
      "step": 47456,
      "throughput": 17834.702054140886
    },
    {
      "epoch": 0.7443318521305026,
      "grad_norm": 0.08041536062955856,
      "learning_rate": 3.173358598557196e-05,
      "loss": 8.5952,
      "step": 47488,
      "throughput": 17834.94603244263
    },
    {
      "epoch": 0.7448334234594315,
      "grad_norm": 0.09945892542600632,
      "learning_rate": 3.169006673148473e-05,
      "loss": 8.5852,
      "step": 47520,
      "throughput": 17835.013868235914
    },
    {
      "epoch": 0.7453349947883604,
      "grad_norm": 0.08749539405107498,
      "learning_rate": 3.1647097247957385e-05,
      "loss": 8.5971,
      "step": 47552,
      "throughput": 17835.153046151987
    },
    {
      "epoch": 0.7458365661172893,
      "grad_norm": 0.08924855291843414,
      "learning_rate": 3.160467771219624e-05,
      "loss": 8.6044,
      "step": 47584,
      "throughput": 17834.596442235495
    },
    {
      "epoch": 0.7463381374462182,
      "grad_norm": 0.07980576902627945,
      "learning_rate": 3.1562808299139596e-05,
      "loss": 8.6152,
      "step": 47616,
      "throughput": 17834.48573924514
    },
    {
      "epoch": 0.7468397087751472,
      "grad_norm": 0.07602015882730484,
      "learning_rate": 3.1521489181457005e-05,
      "loss": 8.607,
      "step": 47648,
      "throughput": 17834.36694344504
    },
    {
      "epoch": 0.7473412801040761,
      "grad_norm": 0.08380915224552155,
      "learning_rate": 3.1480720529548654e-05,
      "loss": 8.5986,
      "step": 47680,
      "throughput": 17834.694986968072
    },
    {
      "epoch": 0.747842851433005,
      "grad_norm": 0.07392337173223495,
      "learning_rate": 3.1440502511544566e-05,
      "loss": 8.5987,
      "step": 47712,
      "throughput": 17835.02278537719
    },
    {
      "epoch": 0.7483444227619339,
      "grad_norm": 0.08508224040269852,
      "learning_rate": 3.1400835293303984e-05,
      "loss": 8.6111,
      "step": 47744,
      "throughput": 17835.263740486364
    },
    {
      "epoch": 0.7488459940908628,
      "grad_norm": 0.08330404758453369,
      "learning_rate": 3.136171903841463e-05,
      "loss": 8.6165,
      "step": 47776,
      "throughput": 17835.428268281223
    },
    {
      "epoch": 0.7493475654197916,
      "grad_norm": 0.07605268806219101,
      "learning_rate": 3.1323153908192057e-05,
      "loss": 8.6041,
      "step": 47808,
      "throughput": 17835.6553983799
    },
    {
      "epoch": 0.7498491367487206,
      "grad_norm": 0.07306007295846939,
      "learning_rate": 3.128514006167897e-05,
      "loss": 8.6178,
      "step": 47840,
      "throughput": 17835.65838665241
    },
    {
      "epoch": 0.7503507080776495,
      "grad_norm": 0.0778181254863739,
      "learning_rate": 3.124767765564459e-05,
      "loss": 8.5957,
      "step": 47872,
      "throughput": 17835.34722429415
    },
    {
      "epoch": 0.7508522794065784,
      "grad_norm": 0.07451704889535904,
      "learning_rate": 3.121076684458398e-05,
      "loss": 8.6047,
      "step": 47904,
      "throughput": 17834.8868689467
    },
    {
      "epoch": 0.7513538507355073,
      "grad_norm": 0.08760581165552139,
      "learning_rate": 3.1174407780717433e-05,
      "loss": 8.6037,
      "step": 47936,
      "throughput": 17834.85964877205
    },
    {
      "epoch": 0.7518554220644362,
      "grad_norm": 0.07690441608428955,
      "learning_rate": 3.113860061398985e-05,
      "loss": 8.585,
      "step": 47968,
      "throughput": 17835.006197636758
    },
    {
      "epoch": 0.7523569933933651,
      "grad_norm": 0.07794877141714096,
      "learning_rate": 3.110334549207009e-05,
      "loss": 8.5965,
      "step": 48000,
      "throughput": 17835.327988956305
    },
    {
      "epoch": 0.7528585647222941,
      "grad_norm": 0.08219928294420242,
      "learning_rate": 3.1068642560350375e-05,
      "loss": 8.5826,
      "step": 48032,
      "throughput": 17835.570811645724
    },
    {
      "epoch": 0.753360136051223,
      "grad_norm": 0.08382223546504974,
      "learning_rate": 3.103449196194569e-05,
      "loss": 8.6001,
      "step": 48064,
      "throughput": 17835.806531516253
    },
    {
      "epoch": 0.7538617073801519,
      "grad_norm": 0.09056299924850464,
      "learning_rate": 3.1000893837693234e-05,
      "loss": 8.6214,
      "step": 48096,
      "throughput": 17836.045335655446
    },
    {
      "epoch": 0.7543632787090808,
      "grad_norm": 0.07294950634241104,
      "learning_rate": 3.096784832615175e-05,
      "loss": 8.5694,
      "step": 48128,
      "throughput": 17836.02776593902
    },
    {
      "epoch": 0.7548648500380097,
      "grad_norm": 0.08173788338899612,
      "learning_rate": 3.093535556360101e-05,
      "loss": 8.6219,
      "step": 48160,
      "throughput": 17836.069887898226
    },
    {
      "epoch": 0.7553664213669387,
      "grad_norm": 0.07701551169157028,
      "learning_rate": 3.0903415684041285e-05,
      "loss": 8.6025,
      "step": 48192,
      "throughput": 17835.41495806723
    },
    {
      "epoch": 0.7558679926958676,
      "grad_norm": 0.08350755274295807,
      "learning_rate": 3.087202881919273e-05,
      "loss": 8.6023,
      "step": 48224,
      "throughput": 17835.314069311517
    },
    {
      "epoch": 0.7563695640247964,
      "grad_norm": 0.08125930279493332,
      "learning_rate": 3.084119509849488e-05,
      "loss": 8.6016,
      "step": 48256,
      "throughput": 17835.284502123366
    },
    {
      "epoch": 0.7568711353537253,
      "grad_norm": 0.12003647536039352,
      "learning_rate": 3.081091464910606e-05,
      "loss": 8.6096,
      "step": 48288,
      "throughput": 17835.608629798637
    },
    {
      "epoch": 0.7573727066826542,
      "grad_norm": 0.07632914930582047,
      "learning_rate": 3.078118759590295e-05,
      "loss": 8.5979,
      "step": 48320,
      "throughput": 17835.931485541114
    },
    {
      "epoch": 0.7578742780115831,
      "grad_norm": 0.07661539316177368,
      "learning_rate": 3.075201406148001e-05,
      "loss": 8.5919,
      "step": 48352,
      "throughput": 17836.173041678863
    },
    {
      "epoch": 0.758375849340512,
      "grad_norm": 0.07512614876031876,
      "learning_rate": 3.072339416614899e-05,
      "loss": 8.5966,
      "step": 48384,
      "throughput": 17836.333431530336
    },
    {
      "epoch": 0.758877420669441,
      "grad_norm": 0.08936319500207901,
      "learning_rate": 3.069532802793839e-05,
      "loss": 8.585,
      "step": 48416,
      "throughput": 17836.487537129284
    },
    {
      "epoch": 0.7593789919983699,
      "grad_norm": 0.08849947899580002,
      "learning_rate": 3.066781576259309e-05,
      "loss": 8.6167,
      "step": 48448,
      "throughput": 17836.47048999063
    },
    {
      "epoch": 0.7598805633272988,
      "grad_norm": 0.08645664155483246,
      "learning_rate": 3.0640857483573714e-05,
      "loss": 8.6013,
      "step": 48480,
      "throughput": 17836.080591388745
    },
    {
      "epoch": 0.7603821346562277,
      "grad_norm": 0.07802789658308029,
      "learning_rate": 3.061445330205631e-05,
      "loss": 8.5903,
      "step": 48512,
      "throughput": 17835.79941755457
    },
    {
      "epoch": 0.7608837059851566,
      "grad_norm": 0.08236244320869446,
      "learning_rate": 3.0588603326931796e-05,
      "loss": 8.6045,
      "step": 48544,
      "throughput": 17835.773908487055
    },
    {
      "epoch": 0.7613852773140856,
      "grad_norm": 0.08064370602369308,
      "learning_rate": 3.056330766480554e-05,
      "loss": 8.5926,
      "step": 48576,
      "throughput": 17836.009046012525
    },
    {
      "epoch": 0.7618868486430145,
      "grad_norm": 0.07869768887758255,
      "learning_rate": 3.053856641999694e-05,
      "loss": 8.5944,
      "step": 48608,
      "throughput": 17836.32572679976
    },
    {
      "epoch": 0.7623884199719434,
      "grad_norm": 0.07431543618440628,
      "learning_rate": 3.0514379694538932e-05,
      "loss": 8.5902,
      "step": 48640,
      "throughput": 17836.554931513605
    },
    {
      "epoch": 0.7628899913008723,
      "grad_norm": 0.07379096746444702,
      "learning_rate": 3.0490747588177684e-05,
      "loss": 8.6127,
      "step": 48672,
      "throughput": 17836.709908699562
    },
    {
      "epoch": 0.7633915626298011,
      "grad_norm": 0.07593395560979843,
      "learning_rate": 3.0467670198372044e-05,
      "loss": 8.6016,
      "step": 48704,
      "throughput": 17836.941250927004
    },
    {
      "epoch": 0.76389313395873,
      "grad_norm": 0.07255639880895615,
      "learning_rate": 3.044514762029326e-05,
      "loss": 8.5867,
      "step": 48736,
      "throughput": 17837.08882129428
    },
    {
      "epoch": 0.764394705287659,
      "grad_norm": 0.0845591202378273,
      "learning_rate": 3.0423179946824494e-05,
      "loss": 8.6074,
      "step": 48768,
      "throughput": 17836.78557055867
    },
    {
      "epoch": 0.7648962766165879,
      "grad_norm": 0.07482326030731201,
      "learning_rate": 3.040176726856049e-05,
      "loss": 8.6001,
      "step": 48800,
      "throughput": 17836.485349372793
    },
    {
      "epoch": 0.7653978479455168,
      "grad_norm": 0.07807141542434692,
      "learning_rate": 3.0380909673807205e-05,
      "loss": 8.5799,
      "step": 48832,
      "throughput": 17836.38466703605
    },
    {
      "epoch": 0.7658994192744457,
      "grad_norm": 0.07797779142856598,
      "learning_rate": 3.0360607248581437e-05,
      "loss": 8.6014,
      "step": 48864,
      "throughput": 17836.43798871803
    },
    {
      "epoch": 0.7664009906033746,
      "grad_norm": 0.0804995745420456,
      "learning_rate": 3.0340860076610427e-05,
      "loss": 8.5887,
      "step": 48896,
      "throughput": 17836.7562656726
    },
    {
      "epoch": 0.7669025619323036,
      "grad_norm": 0.08541762083768845,
      "learning_rate": 3.0321668239331582e-05,
      "loss": 8.5996,
      "step": 48928,
      "throughput": 17837.069595176625
    },
    {
      "epoch": 0.7674041332612325,
      "grad_norm": 0.07710454612970352,
      "learning_rate": 3.030303181589207e-05,
      "loss": 8.5785,
      "step": 48960,
      "throughput": 17837.147894551657
    },
    {
      "epoch": 0.7679057045901614,
      "grad_norm": 0.07934178411960602,
      "learning_rate": 3.0284950883148598e-05,
      "loss": 8.5863,
      "step": 48992,
      "throughput": 17837.386582616426
    },
    {
      "epoch": 0.7684072759190903,
      "grad_norm": 0.0971083864569664,
      "learning_rate": 3.026742551566696e-05,
      "loss": 8.5905,
      "step": 49024,
      "throughput": 17837.461820329434
    },
    {
      "epoch": 0.7689088472480192,
      "grad_norm": 0.07783489674329758,
      "learning_rate": 3.0250455785721827e-05,
      "loss": 8.5961,
      "step": 49056,
      "throughput": 17837.42766907545
    },
    {
      "epoch": 0.7694104185769481,
      "grad_norm": 0.07902809977531433,
      "learning_rate": 3.023404176329643e-05,
      "loss": 8.6004,
      "step": 49088,
      "throughput": 17836.97276521561
    },
    {
      "epoch": 0.7699119899058771,
      "grad_norm": 0.07620695978403091,
      "learning_rate": 3.021818351608223e-05,
      "loss": 8.5928,
      "step": 49120,
      "throughput": 17836.862850153775
    },
    {
      "epoch": 0.7704135612348059,
      "grad_norm": 0.07504887133836746,
      "learning_rate": 3.0202881109478676e-05,
      "loss": 8.6057,
      "step": 49152,
      "throughput": 17836.916599803946
    },
    {
      "epoch": 0.7709151325637348,
      "grad_norm": 0.0752057358622551,
      "learning_rate": 3.0188134606592958e-05,
      "loss": 8.6044,
      "step": 49184,
      "throughput": 17836.313046066643
    },
    {
      "epoch": 0.7714167038926637,
      "grad_norm": 0.07765153795480728,
      "learning_rate": 3.017394406823969e-05,
      "loss": 8.5944,
      "step": 49216,
      "throughput": 17836.622213396313
    },
    {
      "epoch": 0.7719182752215926,
      "grad_norm": 0.08464544266462326,
      "learning_rate": 3.0160309552940704e-05,
      "loss": 8.62,
      "step": 49248,
      "throughput": 17836.858812627466
    },
    {
      "epoch": 0.7724198465505215,
      "grad_norm": 0.07419130206108093,
      "learning_rate": 3.014723111692476e-05,
      "loss": 8.5974,
      "step": 49280,
      "throughput": 17837.018146216167
    },
    {
      "epoch": 0.7729214178794505,
      "grad_norm": 0.08695698529481888,
      "learning_rate": 3.013470881412739e-05,
      "loss": 8.5758,
      "step": 49312,
      "throughput": 17837.094983627103
    },
    {
      "epoch": 0.7734229892083794,
      "grad_norm": 0.07773898541927338,
      "learning_rate": 3.0122742696190606e-05,
      "loss": 8.6104,
      "step": 49344,
      "throughput": 17837.158286582817
    },
    {
      "epoch": 0.7739245605373083,
      "grad_norm": 0.07133398205041885,
      "learning_rate": 3.0111332812462692e-05,
      "loss": 8.5956,
      "step": 49376,
      "throughput": 17836.952026592255
    },
    {
      "epoch": 0.7744261318662372,
      "grad_norm": 0.081082284450531,
      "learning_rate": 3.0100479209998055e-05,
      "loss": 8.5868,
      "step": 49408,
      "throughput": 17836.570468016125
    },
    {
      "epoch": 0.7749277031951661,
      "grad_norm": 0.09132739156484604,
      "learning_rate": 3.0090181933556994e-05,
      "loss": 8.6011,
      "step": 49440,
      "throughput": 17836.460989167073
    },
    {
      "epoch": 0.775429274524095,
      "grad_norm": 0.08725861459970474,
      "learning_rate": 3.0080441025605494e-05,
      "loss": 8.5824,
      "step": 49472,
      "throughput": 17836.59863661418
    },
    {
      "epoch": 0.775930845853024,
      "grad_norm": 0.07932783663272858,
      "learning_rate": 3.007125652631508e-05,
      "loss": 8.5797,
      "step": 49504,
      "throughput": 17836.909560824424
    },
    {
      "epoch": 0.7764324171819529,
      "grad_norm": 0.07793397456407547,
      "learning_rate": 3.006262847356269e-05,
      "loss": 8.586,
      "step": 49536,
      "throughput": 17837.2234703918
    },
    {
      "epoch": 0.7769339885108818,
      "grad_norm": 0.08320693671703339,
      "learning_rate": 3.0054556902930394e-05,
      "loss": 8.5985,
      "step": 49568,
      "throughput": 17837.373588519822
    },
    {
      "epoch": 0.7774355598398106,
      "grad_norm": 0.09126448631286621,
      "learning_rate": 3.0047041847705404e-05,
      "loss": 8.6029,
      "step": 49600,
      "throughput": 17837.602067526575
    },
    {
      "epoch": 0.7779371311687395,
      "grad_norm": 0.08450878411531448,
      "learning_rate": 3.0040083338879834e-05,
      "loss": 8.5736,
      "step": 49632,
      "throughput": 17837.58848053963
    },
    {
      "epoch": 0.7784387024976684,
      "grad_norm": 0.08500304073095322,
      "learning_rate": 3.0033681405150554e-05,
      "loss": 8.6081,
      "step": 49664,
      "throughput": 17837.466373705025
    },
    {
      "epoch": 0.7789402738265974,
      "grad_norm": 0.08071015775203705,
      "learning_rate": 3.0027836072919202e-05,
      "loss": 8.5804,
      "step": 49696,
      "throughput": 17837.098340151126
    },
    {
      "epoch": 0.7794418451555263,
      "grad_norm": 0.0792938694357872,
      "learning_rate": 3.002254736629194e-05,
      "loss": 8.6094,
      "step": 49728,
      "throughput": 17836.984875565544
    },
    {
      "epoch": 0.7799434164844552,
      "grad_norm": 0.08206379413604736,
      "learning_rate": 3.001781530707938e-05,
      "loss": 8.5797,
      "step": 49760,
      "throughput": 17836.954878495046
    },
    {
      "epoch": 0.7804449878133841,
      "grad_norm": 0.07426104694604874,
      "learning_rate": 3.0013639914796586e-05,
      "loss": 8.5966,
      "step": 49792,
      "throughput": 17837.265641517744
    },
    {
      "epoch": 0.780946559142313,
      "grad_norm": 0.07360176742076874,
      "learning_rate": 3.001002120666285e-05,
      "loss": 8.5894,
      "step": 49824,
      "throughput": 17837.56856457737
    },
    {
      "epoch": 0.781448130471242,
      "grad_norm": 0.0769825130701065,
      "learning_rate": 3.0006959197601765e-05,
      "loss": 8.6,
      "step": 49856,
      "throughput": 17837.795235918486
    },
    {
      "epoch": 0.7819497018001709,
      "grad_norm": 0.08089779317378998,
      "learning_rate": 3.000445390024106e-05,
      "loss": 8.5933,
      "step": 49888,
      "throughput": 17837.947136390696
    },
    {
      "epoch": 0.7824512731290998,
      "grad_norm": 0.07225056737661362,
      "learning_rate": 3.0002505324912582e-05,
      "loss": 8.5799,
      "step": 49920,
      "throughput": 17838.009145271175
    },
    {
      "epoch": 0.7829528444580287,
      "grad_norm": 0.08053544163703918,
      "learning_rate": 3.0001113479652246e-05,
      "loss": 8.5902,
      "step": 49952,
      "throughput": 17837.980418546384
    },
    {
      "epoch": 0.7834544157869576,
      "grad_norm": 0.07379666715860367,
      "learning_rate": 3.0000278370200057e-05,
      "loss": 8.5985,
      "step": 49984,
      "throughput": 17837.532480432266
    },
    {
      "epoch": 0.7839559871158865,
      "grad_norm": 0.08274323493242264,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 8.5875,
      "step": 50016,
      "throughput": 17837.339637847817
    },
    {
      "epoch": 0.7839559871158865,
      "step": 50016,
      "throughput": 17836.525899553584,
      "total_flos": 8.182419259005201e+20,
      "train_loss": 1.2059035261548336,
      "train_runtime": 26049.049,
      "train_samples_per_second": 1966.152,
      "train_steps_per_second": 1.92
    }
  ],
  "logging_steps": 32,
  "max_steps": 50016,
  "num_input_tokens_seen": 104891154432,
  "num_train_epochs": 1,
  "save_steps": 2048,
  "stateful_callbacks": {
    "LogCallback": {
      "elapsed_time": 183771.69379615784,
      "start_time": 1765630168.7857313
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.182419259005201e+20,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}