{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7839559871158865,
  "eval_steps": 500,
  "global_step": 50016,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005015713289289101,
      "grad_norm": 2.6373515129089355,
      "learning_rate": 1.875e-05,
      "loss": 39.2137,
      "step": 32,
      "throughput": 3031.484591979932
    },
    {
      "epoch": 0.0010031426578578201,
      "grad_norm": 1.6154510974884033,
      "learning_rate": 3.75e-05,
      "loss": 30.9048,
      "step": 64,
      "throughput": 4533.5848217950215
    },
    {
      "epoch": 0.0015047139867867302,
      "grad_norm": 2.1769161224365234,
      "learning_rate": 5.625e-05,
      "loss": 27.3343,
      "step": 96,
      "throughput": 5442.436560864993
    },
    {
      "epoch": 0.0020062853157156403,
      "grad_norm": 1.8507866859436035,
      "learning_rate": 7.5e-05,
      "loss": 24.8344,
      "step": 128,
      "throughput": 6048.333849188605
    },
    {
      "epoch": 0.0025078566446445506,
      "grad_norm": 1.6263043880462646,
      "learning_rate": 9.374999999999999e-05,
      "loss": 23.0449,
      "step": 160,
      "throughput": 6383.579958146075
    },
    {
      "epoch": 0.0030094279735734604,
      "grad_norm": 1.5903853178024292,
      "learning_rate": 0.0001125,
      "loss": 21.6192,
      "step": 192,
      "throughput": 6716.015359285796
    },
    {
      "epoch": 0.0035109993025023707,
      "grad_norm": 1.3139172792434692,
      "learning_rate": 0.00013125,
      "loss": 20.2952,
      "step": 224,
      "throughput": 6973.364827539361
    },
    {
      "epoch": 0.0040125706314312806,
      "grad_norm": 1.414817214012146,
      "learning_rate": 0.00015,
      "loss": 19.0528,
      "step": 256,
      "throughput": 7182.123779353329
    },
    {
      "epoch": 0.004514141960360191,
      "grad_norm": 0.9963351488113403,
      "learning_rate": 0.00016874999999999998,
      "loss": 17.8896,
      "step": 288,
      "throughput": 7289.056846050263
    },
    {
      "epoch": 0.005015713289289101,
      "grad_norm": 0.9738964438438416,
      "learning_rate": 0.00018749999999999998,
      "loss": 16.8125,
      "step": 320,
      "throughput": 7435.817597231107
    },
    {
      "epoch": 0.005517284618218011,
      "grad_norm": 0.8014869689941406,
      "learning_rate": 0.00020624999999999997,
      "loss": 15.9875,
      "step": 352,
      "throughput": 7558.026515468645
    },
    {
      "epoch": 0.006018855947146921,
      "grad_norm": 0.6334019899368286,
      "learning_rate": 0.000225,
      "loss": 15.2491,
      "step": 384,
      "throughput": 7665.0553985229435
    },
    {
      "epoch": 0.006520427276075831,
      "grad_norm": 0.691703736782074,
      "learning_rate": 0.00024375,
      "loss": 14.7027,
      "step": 416,
      "throughput": 7706.13283153309
    },
    {
      "epoch": 0.007021998605004741,
      "grad_norm": 0.6111452579498291,
      "learning_rate": 0.0002625,
      "loss": 14.2629,
      "step": 448,
      "throughput": 7788.969234107833
    },
    {
      "epoch": 0.007523569933933652,
      "grad_norm": 0.453713983297348,
      "learning_rate": 0.00028125,
      "loss": 13.8273,
      "step": 480,
      "throughput": 7861.532758662085
    },
    {
      "epoch": 0.008025141262862561,
      "grad_norm": 0.6347829103469849,
      "learning_rate": 0.0003,
      "loss": 13.5631,
      "step": 512,
      "throughput": 7927.929938236498
    },
    {
      "epoch": 0.008526712591791472,
      "grad_norm": 0.36244717240333557,
      "learning_rate": 0.00029999972162979993,
      "loss": 13.2913,
      "step": 544,
      "throughput": 7950.168975118933
    },
    {
      "epoch": 0.009028283920720382,
      "grad_norm": 0.3927552402019501,
      "learning_rate": 0.00029999888652034774,
      "loss": 13.0637,
      "step": 576,
      "throughput": 8002.683417615384
    },
    {
      "epoch": 0.009529855249649291,
      "grad_norm": 0.34958720207214355,
      "learning_rate": 0.00029999749467508744,
      "loss": 12.8461,
      "step": 608,
      "throughput": 8051.561816817223
    },
    {
      "epoch": 0.010031426578578202,
      "grad_norm": 0.39206886291503906,
      "learning_rate": 0.0002999955460997589,
      "loss": 12.6701,
      "step": 640,
      "throughput": 8097.459941385641
    },
    {
      "epoch": 0.010532997907507112,
      "grad_norm": 0.32412078976631165,
      "learning_rate": 0.0002999930408023982,
      "loss": 12.5112,
      "step": 672,
      "throughput": 8110.693951807436
    },
    {
      "epoch": 0.011034569236436023,
      "grad_norm": 0.28792303800582886,
      "learning_rate": 0.00029998997879333714,
      "loss": 12.3516,
      "step": 704,
      "throughput": 8146.659534494293
    },
    {
      "epoch": 0.011536140565364932,
      "grad_norm": 0.27620336413383484,
      "learning_rate": 0.0002999863600852034,
      "loss": 12.2134,
      "step": 736,
      "throughput": 8181.849297422524
    },
    {
      "epoch": 0.012037711894293842,
      "grad_norm": 0.298735648393631,
      "learning_rate": 0.0002999821846929206,
      "loss": 12.1216,
      "step": 768,
      "throughput": 8215.768342687106
    },
    {
      "epoch": 0.012539283223222753,
      "grad_norm": 0.2960178256034851,
      "learning_rate": 0.000299977452633708,
      "loss": 12.0131,
      "step": 800,
      "throughput": 8224.10883755775
    },
    {
      "epoch": 0.013040854552151662,
      "grad_norm": 0.2541326582431793,
      "learning_rate": 0.00029997216392708075,
      "loss": 11.9168,
      "step": 832,
      "throughput": 8250.757327240604
    },
    {
      "epoch": 0.013542425881080573,
      "grad_norm": 0.31256556510925293,
      "learning_rate": 0.00029996631859484943,
      "loss": 11.8253,
      "step": 864,
      "throughput": 8277.59484237759
    },
    {
      "epoch": 0.014043997210009483,
      "grad_norm": 0.2150149643421173,
      "learning_rate": 0.00029995991666112014,
      "loss": 11.7395,
      "step": 896,
      "throughput": 8303.820467973554
    },
    {
      "epoch": 0.014545568538938392,
      "grad_norm": 0.20100632309913635,
      "learning_rate": 0.0002999529581522946,
      "loss": 11.6624,
      "step": 928,
      "throughput": 8307.53804985821
    },
    {
      "epoch": 0.015047139867867303,
      "grad_norm": 0.23360204696655273,
      "learning_rate": 0.0002999454430970696,
      "loss": 11.6254,
      "step": 960,
      "throughput": 8328.20061532659
    },
    {
      "epoch": 0.015548711196796213,
      "grad_norm": 0.19596298038959503,
      "learning_rate": 0.0002999373715264373,
      "loss": 11.5469,
      "step": 992,
      "throughput": 8349.693540163496
    },
    {
      "epoch": 0.016050282525725122,
      "grad_norm": 0.23281055688858032,
      "learning_rate": 0.0002999287434736849,
      "loss": 11.4881,
      "step": 1024,
      "throughput": 8370.74705624476
    },
    {
      "epoch": 0.016551853854654033,
      "grad_norm": 0.178004190325737,
      "learning_rate": 0.0002999195589743945,
      "loss": 11.4228,
      "step": 1056,
      "throughput": 8371.927394946062
    },
    {
      "epoch": 0.017053425183582945,
      "grad_norm": 0.17390933632850647,
      "learning_rate": 0.000299909818066443,
      "loss": 11.3634,
      "step": 1088,
      "throughput": 8388.5787076317
    },
    {
      "epoch": 0.017554996512511852,
      "grad_norm": 0.20782887935638428,
      "learning_rate": 0.00029989952079000195,
      "loss": 11.3356,
      "step": 1120,
      "throughput": 8406.042588855575
    },
    {
      "epoch": 0.018056567841440763,
      "grad_norm": 0.17906507849693298,
      "learning_rate": 0.0002998886671875373,
      "loss": 11.268,
      "step": 1152,
      "throughput": 8423.431878637526
    },
    {
      "epoch": 0.018558139170369675,
      "grad_norm": 0.2125791758298874,
      "learning_rate": 0.0002998772573038094,
      "loss": 11.2191,
      "step": 1184,
      "throughput": 8423.562959023457
    },
    {
      "epoch": 0.019059710499298582,
      "grad_norm": 0.1875011920928955,
      "learning_rate": 0.0002998652911858726,
      "loss": 11.166,
      "step": 1216,
      "throughput": 8437.670481903486
    },
    {
      "epoch": 0.019561281828227493,
      "grad_norm": 0.18936695158481598,
      "learning_rate": 0.00029985276888307524,
      "loss": 11.1251,
      "step": 1248,
      "throughput": 8452.193237807254
    },
    {
      "epoch": 0.020062853157156404,
      "grad_norm": 0.177974134683609,
      "learning_rate": 0.00029983969044705927,
      "loss": 11.1019,
      "step": 1280,
      "throughput": 8466.85665649742
    },
    {
      "epoch": 0.020564424486085316,
      "grad_norm": 0.16055719554424286,
      "learning_rate": 0.0002998260559317603,
      "loss": 11.0582,
      "step": 1312,
      "throughput": 8466.517167981028
    },
    {
      "epoch": 0.021065995815014223,
      "grad_norm": 0.16099485754966736,
      "learning_rate": 0.00029981186539340703,
      "loss": 11.0095,
      "step": 1344,
      "throughput": 8478.566566937574
    },
    {
      "epoch": 0.021567567143943134,
      "grad_norm": 0.1568784862756729,
      "learning_rate": 0.0002997971188905213,
      "loss": 10.9878,
      "step": 1376,
      "throughput": 8491.080428955667
    },
    {
      "epoch": 0.022069138472872046,
      "grad_norm": 0.15591877698898315,
      "learning_rate": 0.0002997818164839178,
      "loss": 10.95,
      "step": 1408,
      "throughput": 8503.772964081749
    },
    {
      "epoch": 0.022570709801800953,
      "grad_norm": 0.1584455668926239,
      "learning_rate": 0.00029976595823670354,
      "loss": 10.9177,
      "step": 1440,
      "throughput": 8502.49272473002
    },
    {
      "epoch": 0.023072281130729864,
      "grad_norm": 0.1538284718990326,
      "learning_rate": 0.0002997495442142781,
      "loss": 10.9034,
      "step": 1472,
      "throughput": 8512.850418983315
    },
    {
      "epoch": 0.023573852459658776,
      "grad_norm": 0.16589871048927307,
      "learning_rate": 0.000299732574484333,
      "loss": 10.862,
      "step": 1504,
      "throughput": 8523.613773945295
    },
    {
      "epoch": 0.024075423788587683,
      "grad_norm": 0.1705697625875473,
      "learning_rate": 0.0002997150491168514,
      "loss": 10.8344,
      "step": 1536,
      "throughput": 8534.577834014373
    },
    {
      "epoch": 0.024576995117516594,
      "grad_norm": 0.15539616346359253,
      "learning_rate": 0.0002996969681841079,
      "loss": 10.7975,
      "step": 1568,
      "throughput": 8532.849110741738
    },
    {
      "epoch": 0.025078566446445506,
      "grad_norm": 0.13422518968582153,
      "learning_rate": 0.0002996783317606684,
      "loss": 10.7751,
      "step": 1600,
      "throughput": 8540.6999977051
    },
    {
      "epoch": 0.025580137775374417,
      "grad_norm": 0.14098893105983734,
      "learning_rate": 0.0002996591399233895,
      "loss": 10.742,
      "step": 1632,
      "throughput": 8550.000233087334
    },
    {
      "epoch": 0.026081709104303324,
      "grad_norm": 0.13725745677947998,
      "learning_rate": 0.00029963939275141855,
      "loss": 10.7043,
      "step": 1664,
      "throughput": 8559.634576291046
    },
    {
      "epoch": 0.026583280433232236,
      "grad_norm": 0.15282359719276428,
      "learning_rate": 0.00029961909032619275,
      "loss": 10.6942,
      "step": 1696,
      "throughput": 8557.213588623437
    },
    {
      "epoch": 0.027084851762161147,
      "grad_norm": 0.15533699095249176,
      "learning_rate": 0.00029959823273143947,
      "loss": 10.6708,
      "step": 1728,
      "throughput": 8563.929337010282
    },
    {
      "epoch": 0.027586423091090054,
      "grad_norm": 0.1476975679397583,
      "learning_rate": 0.0002995768200531755,
      "loss": 10.6754,
      "step": 1760,
      "throughput": 8572.364628252106
    },
    {
      "epoch": 0.028087994420018966,
      "grad_norm": 0.13373318314552307,
      "learning_rate": 0.00029955485237970675,
      "loss": 10.633,
      "step": 1792,
      "throughput": 8580.991220450485
    },
    {
      "epoch": 0.028589565748947877,
      "grad_norm": 0.13255077600479126,
      "learning_rate": 0.00029953232980162793,
      "loss": 10.612,
      "step": 1824,
      "throughput": 8579.270567797352
    },
    {
      "epoch": 0.029091137077876784,
      "grad_norm": 0.1498062014579773,
      "learning_rate": 0.0002995092524118223,
      "loss": 10.57,
      "step": 1856,
      "throughput": 8584.96901712662
    },
    {
      "epoch": 0.029592708406805696,
      "grad_norm": 0.1309068202972412,
      "learning_rate": 0.00029948562030546107,
      "loss": 10.5787,
      "step": 1888,
      "throughput": 8592.560589507188
    },
    {
      "epoch": 0.030094279735734607,
      "grad_norm": 0.13275641202926636,
      "learning_rate": 0.00029946143358000306,
      "loss": 10.5466,
      "step": 1920,
      "throughput": 8600.325823833216
    },
    {
      "epoch": 0.030595851064663518,
      "grad_norm": 0.13608723878860474,
      "learning_rate": 0.0002994366923351945,
      "loss": 10.531,
      "step": 1952,
      "throughput": 8598.203774351727
    },
    {
      "epoch": 0.031097422393592426,
      "grad_norm": 0.12380360066890717,
      "learning_rate": 0.00029941139667306817,
      "loss": 10.5066,
      "step": 1984,
      "throughput": 8603.133954877543
    },
    {
      "epoch": 0.03159899372252133,
      "grad_norm": 0.145261749625206,
      "learning_rate": 0.00029938554669794364,
      "loss": 10.4803,
      "step": 2016,
      "throughput": 8609.89138038132
    },
    {
      "epoch": 0.032100565051450244,
      "grad_norm": 0.13648687303066254,
      "learning_rate": 0.00029935914251642625,
      "loss": 10.4657,
      "step": 2048,
      "throughput": 8616.946997737774
    },
    {
      "epoch": 0.032602136380379156,
      "grad_norm": 0.14016900956630707,
      "learning_rate": 0.0002993321842374069,
      "loss": 10.4494,
      "step": 2080,
      "throughput": 8604.296026764225
    },
    {
      "epoch": 0.03310370770930807,
      "grad_norm": 0.14942410588264465,
      "learning_rate": 0.00029930467197206156,
      "loss": 10.4193,
      "step": 2112,
      "throughput": 8608.41214757555
    },
    {
      "epoch": 0.03360527903823698,
      "grad_norm": 0.14700458943843842,
      "learning_rate": 0.000299276605833851,
      "loss": 10.3977,
      "step": 2144,
      "throughput": 8614.543771421539
    },
    {
      "epoch": 0.03410685036716589,
      "grad_norm": 0.12929627299308777,
      "learning_rate": 0.00029924798593851994,
      "loss": 10.3986,
      "step": 2176,
      "throughput": 8621.065638504637
    },
    {
      "epoch": 0.0346084216960948,
      "grad_norm": 0.14943945407867432,
      "learning_rate": 0.00029921881240409703,
      "loss": 10.3857,
      "step": 2208,
      "throughput": 8618.170501024351
    },
    {
      "epoch": 0.035109993025023704,
      "grad_norm": 0.1490052491426468,
      "learning_rate": 0.00029918908535089394,
      "loss": 10.3782,
      "step": 2240,
      "throughput": 8621.5559803982
    },
    {
      "epoch": 0.035611564353952616,
      "grad_norm": 0.1199193075299263,
      "learning_rate": 0.00029915880490150515,
      "loss": 10.3513,
      "step": 2272,
      "throughput": 8627.148534803291
    },
    {
      "epoch": 0.03611313568288153,
      "grad_norm": 0.12391626089811325,
      "learning_rate": 0.0002991279711808072,
      "loss": 10.3492,
      "step": 2304,
      "throughput": 8633.14469700088
    },
    {
      "epoch": 0.03661470701181044,
      "grad_norm": 0.1489352434873581,
      "learning_rate": 0.0002990965843159587,
      "loss": 10.3098,
      "step": 2336,
      "throughput": 8631.454564886973
    },
    {
      "epoch": 0.03711627834073935,
      "grad_norm": 0.12652361392974854,
      "learning_rate": 0.000299064644436399,
      "loss": 10.3038,
      "step": 2368,
      "throughput": 8634.488781208483
    },
    {
      "epoch": 0.03761784966966826,
      "grad_norm": 0.13568635284900665,
      "learning_rate": 0.0002990321516738482,
      "loss": 10.2685,
      "step": 2400,
      "throughput": 8640.191384361931
    },
    {
      "epoch": 0.038119420998597164,
      "grad_norm": 0.1313263326883316,
      "learning_rate": 0.00029899910616230674,
      "loss": 10.2806,
      "step": 2432,
      "throughput": 8645.23888018826
    },
    {
      "epoch": 0.038620992327526076,
      "grad_norm": 0.12250286340713501,
      "learning_rate": 0.0002989655080380543,
      "loss": 10.2797,
      "step": 2464,
      "throughput": 8642.636898666538
    },
    {
      "epoch": 0.03912256365645499,
      "grad_norm": 0.11763161420822144,
      "learning_rate": 0.0002989313574396496,
      "loss": 10.2454,
      "step": 2496,
      "throughput": 8645.384442994879
    },
    {
      "epoch": 0.0396241349853839,
      "grad_norm": 0.13227710127830505,
      "learning_rate": 0.00029889665450792983,
      "loss": 10.2309,
      "step": 2528,
      "throughput": 8650.663248170937
    },
    {
      "epoch": 0.04012570631431281,
      "grad_norm": 0.12891387939453125,
      "learning_rate": 0.0002988613993860101,
      "loss": 10.2252,
      "step": 2560,
      "throughput": 8655.331219016643
    },
    {
      "epoch": 0.04062727764324172,
      "grad_norm": 0.13591551780700684,
      "learning_rate": 0.0002988255922192825,
      "loss": 10.2132,
      "step": 2592,
      "throughput": 8652.438713629675
    },
    {
      "epoch": 0.04112884897217063,
      "grad_norm": 0.13800325989723206,
      "learning_rate": 0.000298789233155416,
      "loss": 10.1996,
      "step": 2624,
      "throughput": 8655.184002646449
    },
    {
      "epoch": 0.041630420301099536,
      "grad_norm": 0.13499251008033752,
      "learning_rate": 0.0002987523223443554,
      "loss": 10.1903,
      "step": 2656,
      "throughput": 8659.739091286676
    },
    {
      "epoch": 0.04213199163002845,
      "grad_norm": 0.12725511193275452,
      "learning_rate": 0.000298714859938321,
      "loss": 10.1742,
      "step": 2688,
      "throughput": 8664.175771510912
    },
    {
      "epoch": 0.04263356295895736,
      "grad_norm": 0.13918425142765045,
      "learning_rate": 0.0002986768460918079,
      "loss": 10.1607,
      "step": 2720,
      "throughput": 8661.906185594662
    },
    {
      "epoch": 0.04313513428788627,
      "grad_norm": 0.12420102953910828,
      "learning_rate": 0.0002986382809615853,
      "loss": 10.1532,
      "step": 2752,
      "throughput": 8664.108344168615
    },
    {
      "epoch": 0.04363670561681518,
      "grad_norm": 0.13506534695625305,
      "learning_rate": 0.00029859916470669596,
      "loss": 10.1531,
      "step": 2784,
      "throughput": 8668.340072989078
    },
    {
      "epoch": 0.04413827694574409,
      "grad_norm": 0.11485131084918976,
      "learning_rate": 0.0002985594974884554,
      "loss": 10.1036,
      "step": 2816,
      "throughput": 8672.514168355223
    },
    {
      "epoch": 0.044639848274673,
      "grad_norm": 0.1429208666086197,
      "learning_rate": 0.00029851927947045136,
      "loss": 10.1181,
      "step": 2848,
      "throughput": 8670.511688031254
    },
    {
      "epoch": 0.04514141960360191,
      "grad_norm": 0.11533421277999878,
      "learning_rate": 0.000298478510818543,
      "loss": 10.0922,
      "step": 2880,
      "throughput": 8672.608976050276
    },
    {
      "epoch": 0.04564299093253082,
      "grad_norm": 0.12951959669589996,
      "learning_rate": 0.0002984371917008604,
      "loss": 10.0784,
      "step": 2912,
      "throughput": 8676.554281193294
    },
    {
      "epoch": 0.04614456226145973,
      "grad_norm": 0.1254311501979828,
      "learning_rate": 0.0002983953222878037,
      "loss": 10.096,
      "step": 2944,
      "throughput": 8680.355487940858
    },
    {
      "epoch": 0.04664613359038864,
      "grad_norm": 0.15013115108013153,
      "learning_rate": 0.0002983529027520426,
      "loss": 10.0558,
      "step": 2976,
      "throughput": 8678.506209881712
    },
    {
      "epoch": 0.04714770491931755,
      "grad_norm": 0.12344110757112503,
      "learning_rate": 0.0002983099332685153,
      "loss": 10.065,
      "step": 3008,
      "throughput": 8680.447547364309
    },
    {
      "epoch": 0.04764927624824646,
      "grad_norm": 0.12854412198066711,
      "learning_rate": 0.000298266414014428,
      "loss": 10.0563,
      "step": 3040,
      "throughput": 8684.174220917293
    },
    {
      "epoch": 0.04815084757717537,
      "grad_norm": 0.12837445735931396,
      "learning_rate": 0.0002982223451692544,
      "loss": 10.0484,
      "step": 3072,
      "throughput": 8687.846008837005
    },
    {
      "epoch": 0.04865241890610428,
      "grad_norm": 0.13306200504302979,
      "learning_rate": 0.0002981777269147344,
      "loss": 10.0373,
      "step": 3104,
      "throughput": 8686.269605240876
    },
    {
      "epoch": 0.04915399023503319,
      "grad_norm": 0.1327691376209259,
      "learning_rate": 0.0002981325594348739,
      "loss": 10.0474,
      "step": 3136,
      "throughput": 8687.881093275713
    },
    {
      "epoch": 0.0496555615639621,
      "grad_norm": 0.11801597476005554,
      "learning_rate": 0.00029808684291594373,
      "loss": 10.0057,
      "step": 3168,
      "throughput": 8691.366927033056
    },
    {
      "epoch": 0.05015713289289101,
      "grad_norm": 0.114040307700634,
      "learning_rate": 0.0002980405775464789,
      "loss": 9.9989,
      "step": 3200,
      "throughput": 8694.78405139367
    },
    {
      "epoch": 0.05065870422181992,
      "grad_norm": 0.11511870473623276,
      "learning_rate": 0.00029799376351727797,
      "loss": 9.9831,
      "step": 3232,
      "throughput": 8692.988671152094
    },
    {
      "epoch": 0.051160275550748834,
      "grad_norm": 0.13341563940048218,
      "learning_rate": 0.00029794640102140206,
      "loss": 9.9744,
      "step": 3264,
      "throughput": 8694.304613157901
    },
    {
      "epoch": 0.05166184687967774,
      "grad_norm": 0.14193041622638702,
      "learning_rate": 0.00029789849025417433,
      "loss": 9.9716,
      "step": 3296,
      "throughput": 8697.71995228885
    },
    {
      "epoch": 0.05216341820860665,
      "grad_norm": 0.11138767004013062,
      "learning_rate": 0.0002978500314131789,
      "loss": 10.0049,
      "step": 3328,
      "throughput": 8700.995142735857
    },
    {
      "epoch": 0.05266498953753556,
      "grad_norm": 0.13181596994400024,
      "learning_rate": 0.00029780102469826014,
      "loss": 9.9559,
      "step": 3360,
      "throughput": 8699.412860118453
    },
    {
      "epoch": 0.05316656086646447,
      "grad_norm": 0.11928830295801163,
      "learning_rate": 0.00029775147031152195,
      "loss": 9.9436,
      "step": 3392,
      "throughput": 8700.922524938454
    },
    {
      "epoch": 0.05366813219539338,
      "grad_norm": 0.12960603833198547,
      "learning_rate": 0.0002977013684573267,
      "loss": 9.9464,
      "step": 3424,
      "throughput": 8704.083537777888
    },
    {
      "epoch": 0.054169703524322294,
      "grad_norm": 0.12624278664588928,
      "learning_rate": 0.0002976507193422946,
      "loss": 9.939,
      "step": 3456,
      "throughput": 8707.186611773384
    },
    {
      "epoch": 0.0546712748532512,
      "grad_norm": 0.12646783888339996,
      "learning_rate": 0.00029759952317530284,
      "loss": 9.9485,
      "step": 3488,
      "throughput": 8705.7946206228
    },
    {
      "epoch": 0.05517284618218011,
      "grad_norm": 0.1135874092578888,
      "learning_rate": 0.0002975477801674845,
      "loss": 9.8956,
      "step": 3520,
      "throughput": 8706.898540237127
    },
    {
      "epoch": 0.05567441751110902,
      "grad_norm": 0.11660734564065933,
      "learning_rate": 0.00029749549053222784,
      "loss": 9.9178,
      "step": 3552,
      "throughput": 8709.896727910347
    },
    {
      "epoch": 0.05617598884003793,
      "grad_norm": 0.11348798871040344,
      "learning_rate": 0.0002974426544851755,
      "loss": 9.8913,
      "step": 3584,
      "throughput": 8712.842678586643
    },
    {
      "epoch": 0.05667756016896684,
      "grad_norm": 0.11837482452392578,
      "learning_rate": 0.00029738927224422354,
      "loss": 9.8937,
      "step": 3616,
      "throughput": 8711.675362629981
    },
    {
      "epoch": 0.057179131497895753,
      "grad_norm": 0.1253383755683899,
      "learning_rate": 0.0002973353440295205,
      "loss": 9.8689,
      "step": 3648,
      "throughput": 8712.58627493939
    },
    {
      "epoch": 0.057680702826824665,
      "grad_norm": 0.12361543625593185,
      "learning_rate": 0.0002972808700634664,
      "loss": 9.8713,
      "step": 3680,
      "throughput": 8715.511098762165
    },
    {
      "epoch": 0.05818227415575357,
      "grad_norm": 0.1185847818851471,
      "learning_rate": 0.0002972258505707121,
      "loss": 9.8642,
      "step": 3712,
      "throughput": 8718.259214378111
    },
    {
      "epoch": 0.05868384548468248,
      "grad_norm": 0.11482404917478561,
      "learning_rate": 0.00029717028577815817,
      "loss": 9.8517,
      "step": 3744,
      "throughput": 8716.86528152881
    },
    {
      "epoch": 0.05918541681361139,
      "grad_norm": 0.12030131369829178,
      "learning_rate": 0.0002971141759149539,
      "loss": 9.8704,
      "step": 3776,
      "throughput": 8717.99026040238
    },
    {
      "epoch": 0.0596869881425403,
      "grad_norm": 0.11121921986341476,
      "learning_rate": 0.00029705752121249665,
      "loss": 9.846,
      "step": 3808,
      "throughput": 8720.784126088147
    },
    {
      "epoch": 0.060188559471469213,
      "grad_norm": 0.11496740579605103,
      "learning_rate": 0.0002970003219044305,
      "loss": 9.833,
      "step": 3840,
      "throughput": 8723.510819994406
    },
    {
      "epoch": 0.060690130800398125,
      "grad_norm": 0.11950097233057022,
      "learning_rate": 0.0002969425782266455,
      "loss": 9.847,
      "step": 3872,
      "throughput": 8722.220559962287
    },
    {
      "epoch": 0.061191702129327036,
      "grad_norm": 0.11807090044021606,
      "learning_rate": 0.0002968842904172769,
      "loss": 9.837,
      "step": 3904,
      "throughput": 8722.88462889842
    },
    {
      "epoch": 0.06169327345825594,
      "grad_norm": 0.11705534160137177,
      "learning_rate": 0.00029682545871670375,
      "loss": 9.8289,
      "step": 3936,
      "throughput": 8725.452473969277
    },
    {
      "epoch": 0.06219484478718485,
      "grad_norm": 0.11206234246492386,
      "learning_rate": 0.0002967660833675481,
      "loss": 9.814,
      "step": 3968,
      "throughput": 8728.053752565047
    },
    {
      "epoch": 0.06269641611611376,
      "grad_norm": 0.10935520380735397,
      "learning_rate": 0.0002967061646146741,
      "loss": 9.8,
      "step": 4000,
      "throughput": 8726.928482257368
    },
    {
      "epoch": 0.06319798744504267,
      "grad_norm": 0.12385314702987671,
      "learning_rate": 0.00029664570270518685,
      "loss": 9.7849,
      "step": 4032,
      "throughput": 8727.5696353496
    },
    {
      "epoch": 0.06369955877397158,
      "grad_norm": 0.11679193377494812,
      "learning_rate": 0.00029658469788843147,
      "loss": 9.7898,
      "step": 4064,
      "throughput": 8730.008881642094
    },
    {
      "epoch": 0.06420113010290049,
      "grad_norm": 0.11991138756275177,
      "learning_rate": 0.00029652315041599203,
      "loss": 9.7781,
      "step": 4096,
      "throughput": 8732.473879711068
    },
    {
      "epoch": 0.0647027014318294,
      "grad_norm": 0.13184259831905365,
      "learning_rate": 0.00029646106054169046,
      "loss": 9.7812,
      "step": 4128,
      "throughput": 8726.75567838056
    },
    {
      "epoch": 0.06520427276075831,
      "grad_norm": 0.11864672601222992,
      "learning_rate": 0.00029639842852158553,
      "loss": 9.7764,
      "step": 4160,
      "throughput": 8727.531637802169
    },
    {
      "epoch": 0.06570584408968723,
      "grad_norm": 0.10902588814496994,
      "learning_rate": 0.00029633525461397194,
      "loss": 9.7707,
      "step": 4192,
      "throughput": 8729.976476059877
    },
    {
      "epoch": 0.06620741541861613,
      "grad_norm": 0.10904784500598907,
      "learning_rate": 0.00029627153907937903,
      "loss": 9.7731,
      "step": 4224,
      "throughput": 8732.42270047259
    },
    {
      "epoch": 0.06670898674754504,
      "grad_norm": 0.1083018109202385,
      "learning_rate": 0.0002962072821805699,
      "loss": 9.7378,
      "step": 4256,
      "throughput": 8731.099681392934
    },
    {
      "epoch": 0.06721055807647396,
      "grad_norm": 0.1126420721411705,
      "learning_rate": 0.0002961424841825402,
      "loss": 9.7451,
      "step": 4288,
      "throughput": 8731.601959832633
    },
    {
      "epoch": 0.06771212940540286,
      "grad_norm": 0.11158367246389389,
      "learning_rate": 0.00029607714535251703,
      "loss": 9.7397,
      "step": 4320,
      "throughput": 8733.870236599323
    },
    {
      "epoch": 0.06821370073433178,
      "grad_norm": 0.10763044655323029,
      "learning_rate": 0.00029601126595995794,
      "loss": 9.7427,
      "step": 4352,
      "throughput": 8736.180002336438
    },
    {
      "epoch": 0.06871527206326068,
      "grad_norm": 0.11694357544183731,
      "learning_rate": 0.0002959448462765497,
      "loss": 9.7229,
      "step": 4384,
      "throughput": 8734.995267095126
    },
    {
      "epoch": 0.0692168433921896,
      "grad_norm": 0.1064562126994133,
      "learning_rate": 0.0002958778865762072,
      "loss": 9.7388,
      "step": 4416,
      "throughput": 8735.564838968481
    },
    {
      "epoch": 0.0697184147211185,
      "grad_norm": 0.10848239809274673,
      "learning_rate": 0.0002958103871350727,
      "loss": 9.7179,
      "step": 4448,
      "throughput": 8737.615081448372
    },
    {
      "epoch": 0.07021998605004741,
      "grad_norm": 0.10709454864263535,
      "learning_rate": 0.0002957423482315139,
      "loss": 9.7198,
      "step": 4480,
      "throughput": 8739.853794654147
    },
    {
      "epoch": 0.07072155737897633,
      "grad_norm": 0.12926509976387024,
      "learning_rate": 0.0002956737701461235,
      "loss": 9.7057,
      "step": 4512,
      "throughput": 8739.03085797585
    },
    {
      "epoch": 0.07122312870790523,
      "grad_norm": 0.1091700941324234,
      "learning_rate": 0.00029560465316171773,
      "loss": 9.6923,
      "step": 4544,
      "throughput": 8739.543393337932
    },
    {
      "epoch": 0.07172470003683415,
      "grad_norm": 0.11444082856178284,
      "learning_rate": 0.0002955349975633352,
      "loss": 9.7073,
      "step": 4576,
      "throughput": 8741.510902754704
    },
    {
      "epoch": 0.07222627136576305,
      "grad_norm": 0.1193537712097168,
      "learning_rate": 0.00029546480363823577,
      "loss": 9.7022,
      "step": 4608,
      "throughput": 8743.724679614512
    },
    {
      "epoch": 0.07272784269469197,
      "grad_norm": 0.11008062213659286,
      "learning_rate": 0.0002953940716758995,
      "loss": 9.6769,
      "step": 4640,
      "throughput": 8742.567475185164
    },
    {
      "epoch": 0.07322941402362088,
      "grad_norm": 0.11342762410640717,
      "learning_rate": 0.0002953228019680252,
      "loss": 9.6866,
      "step": 4672,
      "throughput": 8743.06418128608
    },
    {
      "epoch": 0.07373098535254978,
      "grad_norm": 0.10978548973798752,
      "learning_rate": 0.0002952509948085293,
      "loss": 9.6647,
      "step": 4704,
      "throughput": 8744.934773526928
    },
    {
      "epoch": 0.0742325566814787,
      "grad_norm": 0.10810253769159317,
      "learning_rate": 0.00029517865049354477,
      "loss": 9.6861,
      "step": 4736,
      "throughput": 8747.043262691786
    },
    {
      "epoch": 0.0747341280104076,
      "grad_norm": 0.12077952176332474,
      "learning_rate": 0.0002951057693214197,
      "loss": 9.6609,
      "step": 4768,
      "throughput": 8745.863832630565
    },
    {
      "epoch": 0.07523569933933652,
      "grad_norm": 0.12620708346366882,
      "learning_rate": 0.0002950323515927164,
      "loss": 9.6417,
      "step": 4800,
      "throughput": 8746.33913872354
    },
    {
      "epoch": 0.07573727066826542,
      "grad_norm": 0.10805953294038773,
      "learning_rate": 0.0002949583976102097,
      "loss": 9.6571,
      "step": 4832,
      "throughput": 8748.119129389326
    },
    {
      "epoch": 0.07623884199719433,
      "grad_norm": 0.10737334191799164,
      "learning_rate": 0.00029488390767888606,
      "loss": 9.6458,
      "step": 4864,
      "throughput": 8750.2710186966
    },
    {
      "epoch": 0.07674041332612325,
      "grad_norm": 0.10387426614761353,
      "learning_rate": 0.0002948088821059422,
      "loss": 9.6404,
      "step": 4896,
      "throughput": 8749.044764652741
    },
    {
      "epoch": 0.07724198465505215,
      "grad_norm": 0.10420756787061691,
      "learning_rate": 0.0002947333212007838,
      "loss": 9.6436,
      "step": 4928,
      "throughput": 8749.480158538867
    },
    {
      "epoch": 0.07774355598398107,
      "grad_norm": 0.11674252152442932,
      "learning_rate": 0.0002946572252750242,
      "loss": 9.6466,
      "step": 4960,
      "throughput": 8751.365321390707
    },
    {
      "epoch": 0.07824512731290997,
      "grad_norm": 0.11858739703893661,
      "learning_rate": 0.0002945805946424834,
      "loss": 9.6272,
      "step": 4992,
      "throughput": 8753.230851191058
    },
    {
      "epoch": 0.07874669864183889,
      "grad_norm": 0.12305985391139984,
      "learning_rate": 0.0002945034296191861,
      "loss": 9.636,
      "step": 5024,
      "throughput": 8751.780277949114
    },
    {
      "epoch": 0.0792482699707678,
      "grad_norm": 0.11536737531423569,
      "learning_rate": 0.00029442573052336127,
      "loss": 9.6316,
      "step": 5056,
      "throughput": 8752.300934613511
    },
    {
      "epoch": 0.0797498412996967,
      "grad_norm": 0.10255605727434158,
      "learning_rate": 0.0002943474976754401,
      "loss": 9.5882,
      "step": 5088,
      "throughput": 8754.111959971262
    },
    {
      "epoch": 0.08025141262862562,
      "grad_norm": 0.1140480637550354,
      "learning_rate": 0.0002942687313980552,
      "loss": 9.6156,
      "step": 5120,
      "throughput": 8755.960772925051
    },
    {
      "epoch": 0.08075298395755452,
      "grad_norm": 0.10552536696195602,
      "learning_rate": 0.0002941894320160389,
      "loss": 9.6169,
      "step": 5152,
      "throughput": 8754.902346274297
    },
    {
      "epoch": 0.08125455528648344,
      "grad_norm": 0.10546742379665375,
      "learning_rate": 0.00029410959985642205,
      "loss": 9.5985,
      "step": 5184,
      "throughput": 8755.319127740058
    },
    {
      "epoch": 0.08175612661541234,
      "grad_norm": 0.10557737201452255,
      "learning_rate": 0.0002940292352484327,
      "loss": 9.5846,
      "step": 5216,
      "throughput": 8757.070036568484
    },
    {
      "epoch": 0.08225769794434126,
      "grad_norm": 0.10859539359807968,
      "learning_rate": 0.0002939483385234948,
      "loss": 9.5832,
      "step": 5248,
      "throughput": 8758.805477156033
    },
    {
      "epoch": 0.08275926927327017,
      "grad_norm": 0.1116238459944725,
      "learning_rate": 0.0002938669100152266,
      "loss": 9.605,
      "step": 5280,
      "throughput": 8758.001263340446
    },
    {
      "epoch": 0.08326084060219907,
      "grad_norm": 0.10401325672864914,
      "learning_rate": 0.00029378495005943954,
      "loss": 9.5738,
      "step": 5312,
      "throughput": 8758.390589772524
    },
    {
      "epoch": 0.08376241193112799,
      "grad_norm": 0.11249975115060806,
      "learning_rate": 0.00029370245899413677,
      "loss": 9.5761,
      "step": 5344,
      "throughput": 8759.943487108056
    },
    {
      "epoch": 0.0842639832600569,
      "grad_norm": 0.10400120913982391,
      "learning_rate": 0.0002936194371595116,
      "loss": 9.5772,
      "step": 5376,
      "throughput": 8761.705736556785
    },
    {
      "epoch": 0.08476555458898581,
      "grad_norm": 0.11678726971149445,
      "learning_rate": 0.00029353588489794636,
      "loss": 9.5678,
      "step": 5408,
      "throughput": 8760.785024398701
    },
    {
      "epoch": 0.08526712591791472,
      "grad_norm": 0.10207447409629822,
      "learning_rate": 0.0002934518025540109,
      "loss": 9.5648,
      "step": 5440,
      "throughput": 8761.207794467886
    },
    {
      "epoch": 0.08576869724684363,
      "grad_norm": 0.10971741378307343,
      "learning_rate": 0.00029336719047446096,
      "loss": 9.5842,
      "step": 5472,
      "throughput": 8762.6991526964
    },
    {
      "epoch": 0.08627026857577254,
      "grad_norm": 0.10587865859270096,
      "learning_rate": 0.000293282049008237,
      "loss": 9.5558,
      "step": 5504,
      "throughput": 8764.382572374267
    },
    {
      "epoch": 0.08677183990470144,
      "grad_norm": 0.10832160711288452,
      "learning_rate": 0.00029319637850646273,
      "loss": 9.5593,
      "step": 5536,
      "throughput": 8763.682026107219
    },
    {
      "epoch": 0.08727341123363036,
      "grad_norm": 0.1197381466627121,
      "learning_rate": 0.0002931101793224435,
      "loss": 9.5612,
      "step": 5568,
      "throughput": 8764.26764075061
    },
    {
      "epoch": 0.08777498256255926,
      "grad_norm": 0.10048508644104004,
      "learning_rate": 0.0002930234518116651,
      "loss": 9.5599,
      "step": 5600,
      "throughput": 8765.526614902168
    },
    {
      "epoch": 0.08827655389148818,
      "grad_norm": 0.11333134025335312,
      "learning_rate": 0.000292936196331792,
      "loss": 9.5267,
      "step": 5632,
      "throughput": 8767.121347324892
    },
    {
      "epoch": 0.08877812522041709,
      "grad_norm": 0.10634933412075043,
      "learning_rate": 0.000292848413242666,
      "loss": 9.5516,
      "step": 5664,
      "throughput": 8766.28425052529
    },
    {
      "epoch": 0.089279696549346,
      "grad_norm": 0.1022300124168396,
      "learning_rate": 0.0002927601029063049,
      "loss": 9.5298,
      "step": 5696,
      "throughput": 8766.822494400065
    },
    {
      "epoch": 0.08978126787827491,
      "grad_norm": 0.11001124978065491,
      "learning_rate": 0.0002926712656869007,
      "loss": 9.5238,
      "step": 5728,
      "throughput": 8768.014349855302
    },
    {
      "epoch": 0.09028283920720381,
      "grad_norm": 0.10148325562477112,
      "learning_rate": 0.0002925819019508184,
      "loss": 9.5295,
      "step": 5760,
      "throughput": 8769.51986607424
    },
    {
      "epoch": 0.09078441053613273,
      "grad_norm": 0.11986260861158371,
      "learning_rate": 0.0002924920120665943,
      "loss": 9.5362,
      "step": 5792,
      "throughput": 8768.542280819205
    },
    {
      "epoch": 0.09128598186506164,
      "grad_norm": 0.11802539229393005,
      "learning_rate": 0.00029240159640493463,
      "loss": 9.5297,
      "step": 5824,
      "throughput": 8769.037029142866
    },
    {
      "epoch": 0.09178755319399055,
      "grad_norm": 0.10226628929376602,
      "learning_rate": 0.00029231065533871374,
      "loss": 9.5186,
      "step": 5856,
      "throughput": 8770.189301563876
    },
    {
      "epoch": 0.09228912452291946,
      "grad_norm": 0.09981340169906616,
      "learning_rate": 0.0002922191892429729,
      "loss": 9.4993,
      "step": 5888,
      "throughput": 8771.715936979588
    },
    {
      "epoch": 0.09279069585184836,
      "grad_norm": 0.11034058779478073,
      "learning_rate": 0.0002921271984949185,
      "loss": 9.5075,
      "step": 5920,
      "throughput": 8771.140273309473
    },
    {
      "epoch": 0.09329226718077728,
      "grad_norm": 0.09680221229791641,
      "learning_rate": 0.0002920346834739208,
      "loss": 9.4944,
      "step": 5952,
      "throughput": 8771.453342478308
    },
    {
      "epoch": 0.09379383850970618,
      "grad_norm": 0.10602926462888718,
      "learning_rate": 0.0002919416445615119,
      "loss": 9.4971,
      "step": 5984,
      "throughput": 8772.612423388544
    },
    {
      "epoch": 0.0942954098386351,
      "grad_norm": 0.09529194980859756,
      "learning_rate": 0.0002918480821413846,
      "loss": 9.4783,
      "step": 6016,
      "throughput": 8774.060226804862
    },
    {
      "epoch": 0.094796981167564,
      "grad_norm": 0.11012410372495651,
      "learning_rate": 0.0002917539965993906,
      "loss": 9.4814,
      "step": 6048,
      "throughput": 8773.541619500458
    },
    {
      "epoch": 0.09529855249649292,
      "grad_norm": 0.11098303645849228,
      "learning_rate": 0.00029165938832353885,
      "loss": 9.486,
      "step": 6080,
      "throughput": 8773.753906713271
    },
    {
      "epoch": 0.09580012382542183,
      "grad_norm": 0.10786409676074982,
      "learning_rate": 0.00029156425770399434,
      "loss": 9.4732,
      "step": 6112,
      "throughput": 8774.859206813278
    },
    {
      "epoch": 0.09630169515435073,
      "grad_norm": 0.10273081809282303,
      "learning_rate": 0.0002914686051330759,
      "loss": 9.4749,
      "step": 6144,
      "throughput": 8776.312027347161
    },
    {
      "epoch": 0.09680326648327965,
      "grad_norm": 0.10742319375276566,
      "learning_rate": 0.00029137243100525506,
      "loss": 9.4978,
      "step": 6176,
      "throughput": 8772.277513786172
    },
    {
      "epoch": 0.09730483781220856,
      "grad_norm": 0.10680707544088364,
      "learning_rate": 0.00029127573571715416,
      "loss": 9.4679,
      "step": 6208,
      "throughput": 8772.23422342053
    },
    {
      "epoch": 0.09780640914113747,
      "grad_norm": 0.10717196762561798,
      "learning_rate": 0.00029117851966754495,
      "loss": 9.4652,
      "step": 6240,
      "throughput": 8773.386896101358
    },
    {
      "epoch": 0.09830798047006638,
      "grad_norm": 0.09716420620679855,
      "learning_rate": 0.00029108078325734666,
      "loss": 9.4707,
      "step": 6272,
      "throughput": 8774.817011223837
    },
    {
      "epoch": 0.0988095517989953,
      "grad_norm": 0.10774450749158859,
      "learning_rate": 0.0002909825268896245,
      "loss": 9.4607,
      "step": 6304,
      "throughput": 8774.22740411009
    },
    {
      "epoch": 0.0993111231279242,
      "grad_norm": 0.09601946175098419,
      "learning_rate": 0.000290883750969588,
      "loss": 9.4527,
      "step": 6336,
      "throughput": 8774.561502422925
    },
    {
      "epoch": 0.0998126944568531,
      "grad_norm": 0.10894615203142166,
      "learning_rate": 0.00029078445590458946,
      "loss": 9.442,
      "step": 6368,
      "throughput": 8775.561618802984
    },
    {
      "epoch": 0.10031426578578202,
      "grad_norm": 0.09985128045082092,
      "learning_rate": 0.0002906846421041219,
      "loss": 9.479,
      "step": 6400,
      "throughput": 8776.897730433244
    },
    {
      "epoch": 0.10081583711471093,
      "grad_norm": 0.09858572483062744,
      "learning_rate": 0.00029058430997981784,
      "loss": 9.4263,
      "step": 6432,
      "throughput": 8776.367922921861
    },
    {
      "epoch": 0.10131740844363984,
      "grad_norm": 0.10523559898138046,
      "learning_rate": 0.0002904834599454472,
      "loss": 9.4307,
      "step": 6464,
      "throughput": 8776.785085883334
    },
    {
      "epoch": 0.10181897977256875,
      "grad_norm": 0.0995122492313385,
      "learning_rate": 0.00029038209241691575,
      "loss": 9.4566,
      "step": 6496,
      "throughput": 8777.531086805837
    },
    {
      "epoch": 0.10232055110149767,
      "grad_norm": 0.10437292605638504,
      "learning_rate": 0.0002902802078122636,
      "loss": 9.4127,
      "step": 6528,
      "throughput": 8778.849186284346
    },
    {
      "epoch": 0.10282212243042657,
      "grad_norm": 0.10644105076789856,
      "learning_rate": 0.00029017780655166315,
      "loss": 9.4328,
      "step": 6560,
      "throughput": 8778.169575081867
    },
    {
      "epoch": 0.10332369375935548,
      "grad_norm": 0.10638494044542313,
      "learning_rate": 0.0002900748890574175,
      "loss": 9.4391,
      "step": 6592,
      "throughput": 8778.733712323921
    },
    {
      "epoch": 0.1038252650882844,
      "grad_norm": 0.09863536059856415,
      "learning_rate": 0.0002899714557539586,
      "loss": 9.4357,
      "step": 6624,
      "throughput": 8779.33307552037
    },
    {
      "epoch": 0.1043268364172133,
      "grad_norm": 0.11023005098104477,
      "learning_rate": 0.00028986750706784574,
      "loss": 9.4383,
      "step": 6656,
      "throughput": 8780.65893379113
    },
    {
      "epoch": 0.10482840774614222,
      "grad_norm": 0.10076352208852768,
      "learning_rate": 0.0002897630434277637,
      "loss": 9.4148,
      "step": 6688,
      "throughput": 8780.240505214717
    },
    {
      "epoch": 0.10532997907507112,
      "grad_norm": 0.09036028385162354,
      "learning_rate": 0.0002896580652645207,
      "loss": 9.4093,
      "step": 6720,
      "throughput": 8780.66309519438
    },
    {
      "epoch": 0.10583155040400004,
      "grad_norm": 0.09312684834003448,
      "learning_rate": 0.00028955257301104714,
      "loss": 9.3995,
      "step": 6752,
      "throughput": 8781.240044175389
    },
    {
      "epoch": 0.10633312173292894,
      "grad_norm": 0.114221952855587,
      "learning_rate": 0.00028944656710239337,
      "loss": 9.3911,
      "step": 6784,
      "throughput": 8782.48113812559
    },
    {
      "epoch": 0.10683469306185785,
      "grad_norm": 0.0931376963853836,
      "learning_rate": 0.00028934004797572795,
      "loss": 9.427,
      "step": 6816,
      "throughput": 8782.077289376892
    },
    {
      "epoch": 0.10733626439078676,
      "grad_norm": 0.10255315154790878,
      "learning_rate": 0.00028923301607033616,
      "loss": 9.3771,
      "step": 6848,
      "throughput": 8782.590252463471
    },
    {
      "epoch": 0.10783783571971567,
      "grad_norm": 0.10455886274576187,
      "learning_rate": 0.0002891254718276178,
      "loss": 9.4268,
      "step": 6880,
      "throughput": 8783.169961134565
    },
    {
      "epoch": 0.10833940704864459,
      "grad_norm": 0.09569968283176422,
      "learning_rate": 0.00028901741569108586,
      "loss": 9.3963,
      "step": 6912,
      "throughput": 8784.373364351874
    },
    {
      "epoch": 0.10884097837757349,
      "grad_norm": 0.10171248763799667,
      "learning_rate": 0.00028890884810636394,
      "loss": 9.4016,
      "step": 6944,
      "throughput": 8783.993265016647
    },
    {
      "epoch": 0.1093425497065024,
      "grad_norm": 0.0940864086151123,
      "learning_rate": 0.00028879976952118523,
      "loss": 9.3953,
      "step": 6976,
      "throughput": 8784.41693816875
    },
    {
      "epoch": 0.10984412103543131,
      "grad_norm": 0.09311022609472275,
      "learning_rate": 0.0002886901803853901,
      "loss": 9.4155,
      "step": 7008,
      "throughput": 8784.969029180918
    },
    {
      "epoch": 0.11034569236436022,
      "grad_norm": 0.10037797689437866,
      "learning_rate": 0.00028858008115092445,
      "loss": 9.3822,
      "step": 7040,
      "throughput": 8786.190972002953
    },
    {
      "epoch": 0.11084726369328914,
      "grad_norm": 0.09330254048109055,
      "learning_rate": 0.0002884694722718378,
      "loss": 9.3832,
      "step": 7072,
      "throughput": 8785.815264238152
    },
    {
      "epoch": 0.11134883502221804,
      "grad_norm": 0.10277887433767319,
      "learning_rate": 0.00028835835420428163,
      "loss": 9.3735,
      "step": 7104,
      "throughput": 8786.055555264302
    },
    {
      "epoch": 0.11185040635114696,
      "grad_norm": 0.09488580375909805,
      "learning_rate": 0.000288246727406507,
      "loss": 9.3748,
      "step": 7136,
      "throughput": 8786.588604106677
    },
    {
      "epoch": 0.11235197768007586,
      "grad_norm": 0.10583002865314484,
      "learning_rate": 0.00028813459233886335,
      "loss": 9.3646,
      "step": 7168,
      "throughput": 8787.867037853663
    },
    {
      "epoch": 0.11285354900900477,
      "grad_norm": 0.10686413943767548,
      "learning_rate": 0.00028802194946379585,
      "loss": 9.3436,
      "step": 7200,
      "throughput": 8787.70036951069
    },
    {
      "epoch": 0.11335512033793368,
      "grad_norm": 0.09465917944908142,
      "learning_rate": 0.0002879087992458442,
      "loss": 9.3593,
      "step": 7232,
      "throughput": 8787.736361138324
    },
    {
      "epoch": 0.11385669166686259,
      "grad_norm": 0.09683835506439209,
      "learning_rate": 0.00028779514215164015,
      "loss": 9.3462,
      "step": 7264,
      "throughput": 8788.262568915294
    },
    {
      "epoch": 0.11435826299579151,
      "grad_norm": 0.09295953065156937,
      "learning_rate": 0.0002876809786499059,
      "loss": 9.3604,
      "step": 7296,
      "throughput": 8789.55067224747
    },
    {
      "epoch": 0.11485983432472041,
      "grad_norm": 0.08935536444187164,
      "learning_rate": 0.0002875663092114521,
      "loss": 9.3685,
      "step": 7328,
      "throughput": 8789.450310964028
    },
    {
      "epoch": 0.11536140565364933,
      "grad_norm": 0.1067778691649437,
      "learning_rate": 0.0002874511343091758,
      "loss": 9.3559,
      "step": 7360,
      "throughput": 8789.533171952675
    },
    {
      "epoch": 0.11586297698257823,
      "grad_norm": 0.1031869426369667,
      "learning_rate": 0.00028733545441805874,
      "loss": 9.359,
      "step": 7392,
      "throughput": 8790.117743777615
    },
    {
      "epoch": 0.11636454831150714,
      "grad_norm": 0.09314560145139694,
      "learning_rate": 0.00028721927001516503,
      "loss": 9.3671,
      "step": 7424,
      "throughput": 8791.3475967336
    },
    {
      "epoch": 0.11686611964043606,
      "grad_norm": 0.10189881175756454,
      "learning_rate": 0.00028710258157963955,
      "loss": 9.356,
      "step": 7456,
      "throughput": 8791.08301501178
    },
    {
      "epoch": 0.11736769096936496,
      "grad_norm": 0.10202765464782715,
      "learning_rate": 0.00028698538959270577,
      "loss": 9.3532,
      "step": 7488,
      "throughput": 8791.172960360063
    },
    {
      "epoch": 0.11786926229829388,
      "grad_norm": 0.11254678666591644,
      "learning_rate": 0.00028686769453766366,
      "loss": 9.3508,
      "step": 7520,
      "throughput": 8791.70103738155
    },
    {
      "epoch": 0.11837083362722278,
      "grad_norm": 0.09686455875635147,
      "learning_rate": 0.00028674949689988814,
      "loss": 9.3269,
      "step": 7552,
      "throughput": 8792.904480615553
    },
    {
      "epoch": 0.1188724049561517,
      "grad_norm": 0.0967174768447876,
      "learning_rate": 0.00028663079716682654,
      "loss": 9.3219,
      "step": 7584,
      "throughput": 8792.668287370536
    },
    {
      "epoch": 0.1193739762850806,
      "grad_norm": 0.09918548911809921,
      "learning_rate": 0.00028651159582799695,
      "loss": 9.3291,
      "step": 7616,
      "throughput": 8792.746124169227
    },
    {
      "epoch": 0.11987554761400951,
      "grad_norm": 0.09100424498319626,
      "learning_rate": 0.000286391893374986,
      "loss": 9.3354,
      "step": 7648,
      "throughput": 8793.249523980605
    },
    {
      "epoch": 0.12037711894293843,
      "grad_norm": 0.10109537839889526,
      "learning_rate": 0.0002862716903014469,
      "loss": 9.3325,
      "step": 7680,
      "throughput": 8794.43398400681
    },
    {
      "epoch": 0.12087869027186733,
      "grad_norm": 0.09545495361089706,
      "learning_rate": 0.0002861509871030977,
      "loss": 9.3165,
      "step": 7712,
      "throughput": 8794.374266231913
    },
    {
      "epoch": 0.12138026160079625,
      "grad_norm": 0.09386946260929108,
      "learning_rate": 0.0002860297842777185,
      "loss": 9.2992,
      "step": 7744,
      "throughput": 8794.424310030801
    },
    {
      "epoch": 0.12188183292972515,
      "grad_norm": 0.09427899122238159,
      "learning_rate": 0.00028590808232515025,
      "loss": 9.3118,
      "step": 7776,
      "throughput": 8794.856080279087
    },
    {
      "epoch": 0.12238340425865407,
      "grad_norm": 0.09586696326732635,
      "learning_rate": 0.00028578588174729214,
      "loss": 9.3064,
      "step": 7808,
      "throughput": 8796.0053929776
    },
    {
      "epoch": 0.12288497558758298,
      "grad_norm": 0.10378427803516388,
      "learning_rate": 0.0002856631830480997,
      "loss": 9.3089,
      "step": 7840,
      "throughput": 8795.877169587182
    },
    {
      "epoch": 0.12338654691651188,
      "grad_norm": 0.09206092357635498,
      "learning_rate": 0.0002855399867335827,
      "loss": 9.303,
      "step": 7872,
      "throughput": 8795.95365876067
    },
    {
      "epoch": 0.1238881182454408,
      "grad_norm": 0.1028638631105423,
      "learning_rate": 0.0002854162933118032,
      "loss": 9.3107,
      "step": 7904,
      "throughput": 8796.369941584839
    },
    {
      "epoch": 0.1243896895743697,
      "grad_norm": 0.09411998093128204,
      "learning_rate": 0.0002852921032928732,
      "loss": 9.2964,
      "step": 7936,
      "throughput": 8797.50387293777
    },
    {
      "epoch": 0.12489126090329862,
      "grad_norm": 0.08698836714029312,
      "learning_rate": 0.0002851674171889526,
      "loss": 9.2972,
      "step": 7968,
      "throughput": 8797.140561187089
    },
    {
      "epoch": 0.12539283223222752,
      "grad_norm": 0.09941025078296661,
      "learning_rate": 0.0002850422355142474,
      "loss": 9.2937,
      "step": 8000,
      "throughput": 8797.22027781598
    },
    {
      "epoch": 0.12589440356115644,
      "grad_norm": 0.0902884230017662,
      "learning_rate": 0.00028491655878500716,
      "loss": 9.2986,
      "step": 8032,
      "throughput": 8797.70882325548
    },
    {
      "epoch": 0.12639597489008533,
      "grad_norm": 0.09279583394527435,
      "learning_rate": 0.0002847903875195231,
      "loss": 9.2844,
      "step": 8064,
      "throughput": 8798.813097833674
    },
    {
      "epoch": 0.12689754621901425,
      "grad_norm": 0.08902335166931152,
      "learning_rate": 0.00028466372223812575,
      "loss": 9.2554,
      "step": 8096,
      "throughput": 8798.64995728701
    },
    {
      "epoch": 0.12739911754794317,
      "grad_norm": 0.09631629288196564,
      "learning_rate": 0.0002845365634631833,
      "loss": 9.3199,
      "step": 8128,
      "throughput": 8798.723076061977
    },
    {
      "epoch": 0.1279006888768721,
      "grad_norm": 0.09583239257335663,
      "learning_rate": 0.0002844089117190988,
      "loss": 9.281,
      "step": 8160,
      "throughput": 8799.30489082074
    },
    {
      "epoch": 0.12840226020580098,
      "grad_norm": 0.1015479788184166,
      "learning_rate": 0.0002842807675323085,
      "loss": 9.2922,
      "step": 8192,
      "throughput": 8800.297249952806
    },
    {
      "epoch": 0.1289038315347299,
      "grad_norm": 0.08587797731161118,
      "learning_rate": 0.00028415213143127935,
      "loss": 9.2946,
      "step": 8224,
      "throughput": 8797.422031741422
    },
    {
      "epoch": 0.1294054028636588,
      "grad_norm": 0.08758696168661118,
      "learning_rate": 0.00028402300394650697,
      "loss": 9.2858,
      "step": 8256,
      "throughput": 8797.608642785463
    },
    {
      "epoch": 0.1299069741925877,
      "grad_norm": 0.09429500997066498,
      "learning_rate": 0.0002838933856105136,
      "loss": 9.2681,
      "step": 8288,
      "throughput": 8798.190467266051
    },
    {
      "epoch": 0.13040854552151662,
      "grad_norm": 0.09859396517276764,
      "learning_rate": 0.0002837632769578455,
      "loss": 9.284,
      "step": 8320,
      "throughput": 8798.986546394839
    },
    {
      "epoch": 0.13091011685044554,
      "grad_norm": 0.0930311530828476,
      "learning_rate": 0.00028363267852507133,
      "loss": 9.2665,
      "step": 8352,
      "throughput": 8798.865374036568
    },
    {
      "epoch": 0.13141168817937446,
      "grad_norm": 0.09189429879188538,
      "learning_rate": 0.0002835015908507793,
      "loss": 9.2858,
      "step": 8384,
      "throughput": 8799.037145915567
    },
    {
      "epoch": 0.13191325950830335,
      "grad_norm": 0.09416882693767548,
      "learning_rate": 0.0002833700144755753,
      "loss": 9.2591,
      "step": 8416,
      "throughput": 8799.647256916047
    },
    {
      "epoch": 0.13241483083723227,
      "grad_norm": 0.09771085530519485,
      "learning_rate": 0.0002832379499420808,
      "loss": 9.2772,
      "step": 8448,
      "throughput": 8800.302025027004
    },
    {
      "epoch": 0.13291640216616118,
      "grad_norm": 0.09605992585420609,
      "learning_rate": 0.0002831053977949303,
      "loss": 9.2571,
      "step": 8480,
      "throughput": 8800.269159153138
    },
    {
      "epoch": 0.13341797349509008,
      "grad_norm": 0.09242798388004303,
      "learning_rate": 0.00028297235858076923,
      "loss": 9.265,
      "step": 8512,
      "throughput": 8800.330415544971
    },
    {
      "epoch": 0.133919544824019,
      "grad_norm": 0.0932585746049881,
      "learning_rate": 0.0002828388328482517,
      "loss": 9.2515,
      "step": 8544,
      "throughput": 8800.918277345787
    },
    {
      "epoch": 0.1344211161529479,
      "grad_norm": 0.09092196822166443,
      "learning_rate": 0.0002827048211480383,
      "loss": 9.2499,
      "step": 8576,
      "throughput": 8801.608592729954
    },
    {
      "epoch": 0.13492268748187683,
      "grad_norm": 0.09770791232585907,
      "learning_rate": 0.00028257032403279354,
      "loss": 9.2567,
      "step": 8608,
      "throughput": 8801.558844237528
    },
    {
      "epoch": 0.13542425881080572,
      "grad_norm": 0.10055698454380035,
      "learning_rate": 0.00028243534205718405,
      "loss": 9.2512,
      "step": 8640,
      "throughput": 8801.763149454433
    },
    {
      "epoch": 0.13592583013973464,
      "grad_norm": 0.09112855792045593,
      "learning_rate": 0.00028229987577787585,
      "loss": 9.2453,
      "step": 8672,
      "throughput": 8802.337256009518
    },
    {
      "epoch": 0.13642740146866356,
      "grad_norm": 0.09379726648330688,
      "learning_rate": 0.00028216392575353225,
      "loss": 9.2256,
      "step": 8704,
      "throughput": 8803.026187051975
    },
    {
      "epoch": 0.13692897279759245,
      "grad_norm": 0.09850838780403137,
      "learning_rate": 0.00028202749254481165,
      "loss": 9.2331,
      "step": 8736,
      "throughput": 8803.125678082812
    },
    {
      "epoch": 0.13743054412652136,
      "grad_norm": 0.09180594235658646,
      "learning_rate": 0.0002818905767143649,
      "loss": 9.2448,
      "step": 8768,
      "throughput": 8803.326981019853
    },
    {
      "epoch": 0.13793211545545028,
      "grad_norm": 0.0894036740064621,
      "learning_rate": 0.0002817531788268333,
      "loss": 9.2408,
      "step": 8800,
      "throughput": 8803.89868187386
    },
    {
      "epoch": 0.1384336867843792,
      "grad_norm": 0.10017931461334229,
      "learning_rate": 0.0002816152994488462,
      "loss": 9.2397,
      "step": 8832,
      "throughput": 8804.577001273188
    },
    {
      "epoch": 0.1389352581133081,
      "grad_norm": 0.08872395753860474,
      "learning_rate": 0.0002814769391490185,
      "loss": 9.2626,
      "step": 8864,
      "throughput": 8804.650554415306
    },
    {
      "epoch": 0.139436829442237,
      "grad_norm": 0.08980926871299744,
      "learning_rate": 0.0002813380984979486,
      "loss": 9.2282,
      "step": 8896,
      "throughput": 8804.558633045863
    },
    {
      "epoch": 0.13993840077116593,
      "grad_norm": 0.08992139995098114,
      "learning_rate": 0.00028119877806821557,
      "loss": 9.2294,
      "step": 8928,
      "throughput": 8805.109496685498
    },
    {
      "epoch": 0.14043997210009482,
      "grad_norm": 0.09270741790533066,
      "learning_rate": 0.00028105897843437746,
      "loss": 9.2416,
      "step": 8960,
      "throughput": 8805.72966199901
    },
    {
      "epoch": 0.14094154342902374,
      "grad_norm": 0.09832657873630524,
      "learning_rate": 0.0002809187001729683,
      "loss": 9.2475,
      "step": 8992,
      "throughput": 8805.730741485399
    },
    {
      "epoch": 0.14144311475795265,
      "grad_norm": 0.09440915286540985,
      "learning_rate": 0.00028077794386249604,
      "loss": 9.2224,
      "step": 9024,
      "throughput": 8805.796344473592
    },
    {
      "epoch": 0.14194468608688157,
      "grad_norm": 0.09356938302516937,
      "learning_rate": 0.0002806367100834401,
      "loss": 9.2184,
      "step": 9056,
      "throughput": 8806.347742433114
    },
    {
      "epoch": 0.14244625741581046,
      "grad_norm": 0.08971024304628372,
      "learning_rate": 0.00028049499941824906,
      "loss": 9.225,
      "step": 9088,
      "throughput": 8806.948924234961
    },
    {
      "epoch": 0.14294782874473938,
      "grad_norm": 0.09602756053209305,
      "learning_rate": 0.0002803528124513382,
      "loss": 9.2023,
      "step": 9120,
      "throughput": 8807.236542892864
    },
    {
      "epoch": 0.1434494000736683,
      "grad_norm": 0.08790814876556396,
      "learning_rate": 0.00028021014976908676,
      "loss": 9.2285,
      "step": 9152,
      "throughput": 8807.245979642503
    },
    {
      "epoch": 0.1439509714025972,
      "grad_norm": 0.09404074400663376,
      "learning_rate": 0.0002800670119598363,
      "loss": 9.1934,
      "step": 9184,
      "throughput": 8807.76363103956
    },
    {
      "epoch": 0.1444525427315261,
      "grad_norm": 0.09263089299201965,
      "learning_rate": 0.0002799233996138874,
      "loss": 9.2266,
      "step": 9216,
      "throughput": 8808.38551889481
    },
    {
      "epoch": 0.14495411406045502,
      "grad_norm": 0.10047102719545364,
      "learning_rate": 0.00027977931332349786,
      "loss": 9.2069,
      "step": 9248,
      "throughput": 8808.654971911983
    },
    {
      "epoch": 0.14545568538938394,
      "grad_norm": 0.08788640052080154,
      "learning_rate": 0.00027963475368288006,
      "loss": 9.2235,
      "step": 9280,
      "throughput": 8808.410717545134
    },
    {
      "epoch": 0.14595725671831283,
      "grad_norm": 0.09000838547945023,
      "learning_rate": 0.00027948972128819823,
      "loss": 9.2016,
      "step": 9312,
      "throughput": 8808.989347922286
    },
    {
      "epoch": 0.14645882804724175,
      "grad_norm": 0.08718439191579819,
      "learning_rate": 0.0002793442167375665,
      "loss": 9.1963,
      "step": 9344,
      "throughput": 8809.608424197246
    },
    {
      "epoch": 0.14696039937617067,
      "grad_norm": 0.09730090200901031,
      "learning_rate": 0.0002791982406310461,
      "loss": 9.2075,
      "step": 9376,
      "throughput": 8809.664159961862
    },
    {
      "epoch": 0.14746197070509956,
      "grad_norm": 0.08973324298858643,
      "learning_rate": 0.0002790517935706428,
      "loss": 9.2052,
      "step": 9408,
      "throughput": 8809.583264650293
    },
    {
      "epoch": 0.14796354203402848,
      "grad_norm": 0.0930982455611229,
      "learning_rate": 0.00027890487616030475,
      "loss": 9.2064,
      "step": 9440,
      "throughput": 8810.203150029063
    },
    {
      "epoch": 0.1484651133629574,
      "grad_norm": 0.0945277065038681,
      "learning_rate": 0.0002787574890059199,
      "loss": 9.1756,
      "step": 9472,
      "throughput": 8810.752868527594
    },
    {
      "epoch": 0.1489666846918863,
      "grad_norm": 0.09098446369171143,
      "learning_rate": 0.0002786096327153131,
      "loss": 9.2186,
      "step": 9504,
      "throughput": 8810.953288912602
    },
    {
      "epoch": 0.1494682560208152,
      "grad_norm": 0.09192486852407455,
      "learning_rate": 0.00027846130789824437,
      "loss": 9.1797,
      "step": 9536,
      "throughput": 8811.010016247727
    },
    {
      "epoch": 0.14996982734974412,
      "grad_norm": 0.09026999771595001,
      "learning_rate": 0.00027831251516640553,
      "loss": 9.2007,
      "step": 9568,
      "throughput": 8811.586749223503
    },
    {
      "epoch": 0.15047139867867304,
      "grad_norm": 0.08644837141036987,
      "learning_rate": 0.00027816325513341835,
      "loss": 9.1898,
      "step": 9600,
      "throughput": 8812.119277191767
    },
    {
      "epoch": 0.15097297000760193,
      "grad_norm": 0.09695939719676971,
      "learning_rate": 0.0002780135284148315,
      "loss": 9.1974,
      "step": 9632,
      "throughput": 8812.412326591517
    },
    {
      "epoch": 0.15147454133653085,
      "grad_norm": 0.09113366156816483,
      "learning_rate": 0.00027786333562811855,
      "loss": 9.1829,
      "step": 9664,
      "throughput": 8812.383500445432
    },
    {
      "epoch": 0.15197611266545977,
      "grad_norm": 0.08756538480520248,
      "learning_rate": 0.00027771267739267494,
      "loss": 9.1776,
      "step": 9696,
      "throughput": 8813.011200739571
    },
    {
      "epoch": 0.15247768399438866,
      "grad_norm": 0.10144542157649994,
      "learning_rate": 0.0002775615543298157,
      "loss": 9.1674,
      "step": 9728,
      "throughput": 8813.579825339031
    },
    {
      "epoch": 0.15297925532331758,
      "grad_norm": 0.0889199823141098,
      "learning_rate": 0.0002774099670627728,
      "loss": 9.185,
      "step": 9760,
      "throughput": 8813.850669007317
    },
    {
      "epoch": 0.1534808266522465,
      "grad_norm": 0.08933806419372559,
      "learning_rate": 0.00027725791621669257,
      "loss": 9.1948,
      "step": 9792,
      "throughput": 8813.74460647051
    },
    {
      "epoch": 0.1539823979811754,
      "grad_norm": 0.08577853441238403,
      "learning_rate": 0.0002771054024186331,
      "loss": 9.1949,
      "step": 9824,
      "throughput": 8814.22588833097
    },
    {
      "epoch": 0.1544839693101043,
      "grad_norm": 0.08825614303350449,
      "learning_rate": 0.0002769524262975618,
      "loss": 9.1621,
      "step": 9856,
      "throughput": 8814.707823396035
    },
    {
      "epoch": 0.15498554063903322,
      "grad_norm": 0.08949641138315201,
      "learning_rate": 0.0002767989884843527,
      "loss": 9.1642,
      "step": 9888,
      "throughput": 8814.907522626361
    },
    {
      "epoch": 0.15548711196796214,
      "grad_norm": 0.09024640172719955,
      "learning_rate": 0.0002766450896117837,
      "loss": 9.1762,
      "step": 9920,
      "throughput": 8814.85194309132
    },
    {
      "epoch": 0.15598868329689103,
      "grad_norm": 0.09068801254034042,
      "learning_rate": 0.0002764907303145342,
      "loss": 9.1875,
      "step": 9952,
      "throughput": 8815.33405831657
    },
    {
      "epoch": 0.15649025462581995,
      "grad_norm": 0.08363509178161621,
      "learning_rate": 0.00027633591122918244,
      "loss": 9.159,
      "step": 9984,
      "throughput": 8815.8473612818
    },
    {
      "epoch": 0.15699182595474886,
      "grad_norm": 0.09737946093082428,
      "learning_rate": 0.0002761806329942028,
      "loss": 9.1766,
      "step": 10016,
      "throughput": 8815.942484982741
    },
    {
      "epoch": 0.15749339728367778,
      "grad_norm": 0.0928775891661644,
      "learning_rate": 0.0002760248962499632,
      "loss": 9.1529,
      "step": 10048,
      "throughput": 8815.921196414467
    },
    {
      "epoch": 0.15799496861260667,
      "grad_norm": 0.08588163554668427,
      "learning_rate": 0.0002758687016387223,
      "loss": 9.1796,
      "step": 10080,
      "throughput": 8816.397065306171
    },
    {
      "epoch": 0.1584965399415356,
      "grad_norm": 0.09236335009336472,
      "learning_rate": 0.0002757120498046273,
      "loss": 9.1775,
      "step": 10112,
      "throughput": 8816.860867169062
    },
    {
      "epoch": 0.1589981112704645,
      "grad_norm": 0.09708454459905624,
      "learning_rate": 0.00027555494139371077,
      "loss": 9.1648,
      "step": 10144,
      "throughput": 8816.976222054867
    },
    {
      "epoch": 0.1594996825993934,
      "grad_norm": 0.09289638698101044,
      "learning_rate": 0.0002753973770538882,
      "loss": 9.1374,
      "step": 10176,
      "throughput": 8816.974242370972
    },
    {
      "epoch": 0.16000125392832232,
      "grad_norm": 0.08685285598039627,
      "learning_rate": 0.00027523935743495553,
      "loss": 9.1247,
      "step": 10208,
      "throughput": 8817.429767353415
    },
    {
      "epoch": 0.16050282525725124,
      "grad_norm": 0.09010959416627884,
      "learning_rate": 0.00027508088318858604,
      "loss": 9.1647,
      "step": 10240,
      "throughput": 8817.910904094293
    },
    {
      "epoch": 0.16100439658618015,
      "grad_norm": 0.08543059229850769,
      "learning_rate": 0.000274921954968328,
      "loss": 9.1521,
      "step": 10272,
      "throughput": 8816.138866181464
    },
    {
      "epoch": 0.16150596791510904,
      "grad_norm": 0.08699269592761993,
      "learning_rate": 0.0002747625734296019,
      "loss": 9.1511,
      "step": 10304,
      "throughput": 8816.057816004723
    },
    {
      "epoch": 0.16200753924403796,
      "grad_norm": 0.08303535729646683,
      "learning_rate": 0.00027460273922969757,
      "loss": 9.1778,
      "step": 10336,
      "throughput": 8816.583570439654
    },
    {
      "epoch": 0.16250911057296688,
      "grad_norm": 0.08931925892829895,
      "learning_rate": 0.0002744424530277719,
      "loss": 9.1468,
      "step": 10368,
      "throughput": 8817.054724984973
    },
    {
      "epoch": 0.16301068190189577,
      "grad_norm": 0.08841574937105179,
      "learning_rate": 0.0002742817154848455,
      "loss": 9.1337,
      "step": 10400,
      "throughput": 8817.296466945818
    },
    {
      "epoch": 0.1635122532308247,
      "grad_norm": 0.08364134281873703,
      "learning_rate": 0.00027412052726380053,
      "loss": 9.1558,
      "step": 10432,
      "throughput": 8817.129164380318
    },
    {
      "epoch": 0.1640138245597536,
      "grad_norm": 0.0879436731338501,
      "learning_rate": 0.00027395888902937777,
      "loss": 9.1385,
      "step": 10464,
      "throughput": 8817.69940152977
    },
    {
      "epoch": 0.16451539588868253,
      "grad_norm": 0.08633450418710709,
      "learning_rate": 0.0002737968014481737,
      "loss": 9.1373,
      "step": 10496,
      "throughput": 8818.078687782507
    },
    {
      "epoch": 0.16501696721761142,
      "grad_norm": 0.08205156028270721,
      "learning_rate": 0.000273634265188638,
      "loss": 9.132,
      "step": 10528,
      "throughput": 8818.33960536414
    },
    {
      "epoch": 0.16551853854654033,
      "grad_norm": 0.0833672285079956,
      "learning_rate": 0.0002734712809210706,
      "loss": 9.1376,
      "step": 10560,
      "throughput": 8818.221824229719
    },
    {
      "epoch": 0.16602010987546925,
      "grad_norm": 0.0895625501871109,
      "learning_rate": 0.00027330784931761925,
      "loss": 9.1149,
      "step": 10592,
      "throughput": 8818.894932316287
    },
    {
      "epoch": 0.16652168120439814,
      "grad_norm": 0.0861629992723465,
      "learning_rate": 0.0002731439710522763,
      "loss": 9.1102,
      "step": 10624,
      "throughput": 8819.223744484703
    },
    {
      "epoch": 0.16702325253332706,
      "grad_norm": 0.0922650396823883,
      "learning_rate": 0.00027297964680087617,
      "loss": 9.1304,
      "step": 10656,
      "throughput": 8819.425250401511
    },
    {
      "epoch": 0.16752482386225598,
      "grad_norm": 0.08803316205739975,
      "learning_rate": 0.0002728148772410926,
      "loss": 9.1387,
      "step": 10688,
      "throughput": 8819.313232610373
    },
    {
      "epoch": 0.1680263951911849,
      "grad_norm": 0.08782891929149628,
      "learning_rate": 0.0002726496630524358,
      "loss": 9.1549,
      "step": 10720,
      "throughput": 8820.06652653723
    },
    {
      "epoch": 0.1685279665201138,
      "grad_norm": 0.08598551899194717,
      "learning_rate": 0.00027248400491624946,
      "loss": 9.1005,
      "step": 10752,
      "throughput": 8820.29782491265
    },
    {
      "epoch": 0.1690295378490427,
      "grad_norm": 0.09307502955198288,
      "learning_rate": 0.00027231790351570827,
      "loss": 9.1229,
      "step": 10784,
      "throughput": 8820.491573445923
    },
    {
      "epoch": 0.16953110917797162,
      "grad_norm": 0.08863917738199234,
      "learning_rate": 0.00027215135953581485,
      "loss": 9.1274,
      "step": 10816,
      "throughput": 8820.347253560274
    },
    {
      "epoch": 0.1700326805069005,
      "grad_norm": 0.08433914929628372,
      "learning_rate": 0.00027198437366339717,
      "loss": 9.1096,
      "step": 10848,
      "throughput": 8821.061915668972
    },
    {
      "epoch": 0.17053425183582943,
      "grad_norm": 0.08366360515356064,
      "learning_rate": 0.00027181694658710544,
      "loss": 9.1036,
      "step": 10880,
      "throughput": 8821.259467346279
    },
    {
      "epoch": 0.17103582316475835,
      "grad_norm": 0.09207843244075775,
      "learning_rate": 0.00027164907899740936,
      "loss": 9.1121,
      "step": 10912,
      "throughput": 8821.515016943495
    },
    {
      "epoch": 0.17153739449368727,
      "grad_norm": 0.08775728940963745,
      "learning_rate": 0.0002714807715865954,
      "loss": 9.1289,
      "step": 10944,
      "throughput": 8821.438910822044
    },
    {
      "epoch": 0.17203896582261616,
      "grad_norm": 0.08356799185276031,
      "learning_rate": 0.0002713120250487638,
      "loss": 9.1053,
      "step": 10976,
      "throughput": 8822.131257270616
    },
    {
      "epoch": 0.17254053715154508,
      "grad_norm": 0.09217944741249084,
      "learning_rate": 0.0002711428400798258,
      "loss": 9.0969,
      "step": 11008,
      "throughput": 8822.297615116977
    },
    {
      "epoch": 0.173042108480474,
      "grad_norm": 0.0949367806315422,
      "learning_rate": 0.00027097321737750075,
      "loss": 9.1049,
      "step": 11040,
      "throughput": 8822.518257508536
    },
    {
      "epoch": 0.17354367980940288,
      "grad_norm": 0.08590497821569443,
      "learning_rate": 0.00027080315764131316,
      "loss": 9.0914,
      "step": 11072,
      "throughput": 8822.459541071326
    },
    {
      "epoch": 0.1740452511383318,
      "grad_norm": 0.08156461268663406,
      "learning_rate": 0.0002706326615725898,
      "loss": 9.1036,
      "step": 11104,
      "throughput": 8823.149445206382
    },
    {
      "epoch": 0.17454682246726072,
      "grad_norm": 0.08349745720624924,
      "learning_rate": 0.0002704617298744571,
      "loss": 9.087,
      "step": 11136,
      "throughput": 8823.319571727785
    },
    {
      "epoch": 0.17504839379618964,
      "grad_norm": 0.08500142395496368,
      "learning_rate": 0.00027029036325183775,
      "loss": 9.0931,
      "step": 11168,
      "throughput": 8823.57281056996
    },
    {
      "epoch": 0.17554996512511853,
      "grad_norm": 0.07930979877710342,
      "learning_rate": 0.0002701185624114483,
      "loss": 9.1172,
      "step": 11200,
      "throughput": 8823.489514513913
    },
    {
      "epoch": 0.17605153645404745,
      "grad_norm": 0.08875308930873871,
      "learning_rate": 0.0002699463280617959,
      "loss": 9.1167,
      "step": 11232,
      "throughput": 8824.160138599997
    },
    {
      "epoch": 0.17655310778297637,
      "grad_norm": 0.09016279131174088,
      "learning_rate": 0.00026977366091317554,
      "loss": 9.0826,
      "step": 11264,
      "throughput": 8824.325303855396
    },
    {
      "epoch": 0.17705467911190526,
      "grad_norm": 0.08394233882427216,
      "learning_rate": 0.00026960056167766704,
      "loss": 9.0927,
      "step": 11296,
      "throughput": 8824.564024855992
    },
    {
      "epoch": 0.17755625044083417,
      "grad_norm": 0.08371694386005402,
      "learning_rate": 0.0002694270310691321,
      "loss": 9.0813,
      "step": 11328,
      "throughput": 8824.520854810047
    },
    {
      "epoch": 0.1780578217697631,
      "grad_norm": 0.08177390694618225,
      "learning_rate": 0.0002692530698032116,
      "loss": 9.0806,
      "step": 11360,
      "throughput": 8825.18968171211
    },
    {
      "epoch": 0.178559393098692,
      "grad_norm": 0.08288709819316864,
      "learning_rate": 0.00026907867859732223,
      "loss": 9.0814,
      "step": 11392,
      "throughput": 8825.334697789634
    },
    {
      "epoch": 0.1790609644276209,
      "grad_norm": 0.08150959759950638,
      "learning_rate": 0.0002689038581706538,
      "loss": 9.0865,
      "step": 11424,
      "throughput": 8825.619281128274
    },
    {
      "epoch": 0.17956253575654982,
      "grad_norm": 0.0917833000421524,
      "learning_rate": 0.0002687286092441664,
      "loss": 9.0634,
      "step": 11456,
      "throughput": 8825.46108093822
    },
    {
      "epoch": 0.18006410708547874,
      "grad_norm": 0.08228582888841629,
      "learning_rate": 0.00026855293254058693,
      "loss": 9.078,
      "step": 11488,
      "throughput": 8826.122405390453
    },
    {
      "epoch": 0.18056567841440763,
      "grad_norm": 0.09273815155029297,
      "learning_rate": 0.0002683768287844068,
      "loss": 9.0745,
      "step": 11520,
      "throughput": 8826.264264292311
    },
    {
      "epoch": 0.18106724974333654,
      "grad_norm": 0.08710981905460358,
      "learning_rate": 0.0002682002987018783,
      "loss": 9.0937,
      "step": 11552,
      "throughput": 8826.403465179152
    },
    {
      "epoch": 0.18156882107226546,
      "grad_norm": 0.08702866733074188,
      "learning_rate": 0.00026802334302101214,
      "loss": 9.0829,
      "step": 11584,
      "throughput": 8826.163922195796
    },
    {
      "epoch": 0.18207039240119435,
      "grad_norm": 0.08332011848688126,
      "learning_rate": 0.000267845962471574,
      "loss": 9.0764,
      "step": 11616,
      "throughput": 8826.810681241455
    },
    {
      "epoch": 0.18257196373012327,
      "grad_norm": 0.08716616779565811,
      "learning_rate": 0.0002676681577850818,
      "loss": 9.0667,
      "step": 11648,
      "throughput": 8827.000958585257
    },
    {
      "epoch": 0.1830735350590522,
      "grad_norm": 0.09485767781734467,
      "learning_rate": 0.0002674899296948026,
      "loss": 9.0654,
      "step": 11680,
      "throughput": 8827.257014002527
    },
    {
      "epoch": 0.1835751063879811,
      "grad_norm": 0.08811061829328537,
      "learning_rate": 0.00026731127893574955,
      "loss": 9.0763,
      "step": 11712,
      "throughput": 8827.252253691711
    },
    {
      "epoch": 0.18407667771691,
      "grad_norm": 0.0871390700340271,
      "learning_rate": 0.00026713220624467894,
      "loss": 9.0888,
      "step": 11744,
      "throughput": 8827.888916068738
    },
    {
      "epoch": 0.18457824904583892,
      "grad_norm": 0.09058859199285507,
      "learning_rate": 0.00026695271236008703,
      "loss": 9.0656,
      "step": 11776,
      "throughput": 8828.18880745889
    },
    {
      "epoch": 0.18507982037476783,
      "grad_norm": 0.09240787476301193,
      "learning_rate": 0.00026677279802220726,
      "loss": 9.0753,
      "step": 11808,
      "throughput": 8828.28059056515
    },
    {
      "epoch": 0.18558139170369672,
      "grad_norm": 0.08470363169908524,
      "learning_rate": 0.00026659246397300673,
      "loss": 9.0631,
      "step": 11840,
      "throughput": 8828.482871548256
    },
    {
      "epoch": 0.18608296303262564,
      "grad_norm": 0.08286084234714508,
      "learning_rate": 0.00026641171095618366,
      "loss": 9.0549,
      "step": 11872,
      "throughput": 8829.017412682424
    },
    {
      "epoch": 0.18658453436155456,
      "grad_norm": 0.08882605284452438,
      "learning_rate": 0.0002662305397171641,
      "loss": 9.067,
      "step": 11904,
      "throughput": 8829.305455070786
    },
    {
      "epoch": 0.18708610569048348,
      "grad_norm": 0.08985249698162079,
      "learning_rate": 0.0002660489510030986,
      "loss": 9.0736,
      "step": 11936,
      "throughput": 8829.388474243517
    },
    {
      "epoch": 0.18758767701941237,
      "grad_norm": 0.08082137256860733,
      "learning_rate": 0.00026586694556285975,
      "loss": 9.0721,
      "step": 11968,
      "throughput": 8829.531639227724
    },
    {
      "epoch": 0.1880892483483413,
      "grad_norm": 0.0825573056936264,
      "learning_rate": 0.0002656845241470384,
      "loss": 9.0508,
      "step": 12000,
      "throughput": 8830.087587061433
    },
    {
      "epoch": 0.1885908196772702,
      "grad_norm": 0.08357170969247818,
      "learning_rate": 0.0002655016875079411,
      "loss": 9.0589,
      "step": 12032,
      "throughput": 8830.444982736646
    },
    {
      "epoch": 0.1890923910061991,
      "grad_norm": 0.08905555307865143,
      "learning_rate": 0.00026531843639958656,
      "loss": 9.048,
      "step": 12064,
      "throughput": 8830.452768499454
    },
    {
      "epoch": 0.189593962335128,
      "grad_norm": 0.09543339163064957,
      "learning_rate": 0.00026513477157770303,
      "loss": 9.0543,
      "step": 12096,
      "throughput": 8830.603060387728
    },
    {
      "epoch": 0.19009553366405693,
      "grad_norm": 0.08120942860841751,
      "learning_rate": 0.0002649506937997248,
      "loss": 9.0557,
      "step": 12128,
      "throughput": 8831.166615808197
    },
    {
      "epoch": 0.19059710499298585,
      "grad_norm": 0.0824967548251152,
      "learning_rate": 0.00026476620382478896,
      "loss": 9.065,
      "step": 12160,
      "throughput": 8831.508908054715
    },
    {
      "epoch": 0.19109867632191474,
      "grad_norm": 0.08354393392801285,
      "learning_rate": 0.0002645813024137329,
      "loss": 9.0662,
      "step": 12192,
      "throughput": 8831.518942983914
    },
    {
      "epoch": 0.19160024765084366,
      "grad_norm": 0.08198045194149017,
      "learning_rate": 0.00026439599032909055,
      "loss": 9.0654,
      "step": 12224,
      "throughput": 8831.612563602785
    },
    {
      "epoch": 0.19210181897977258,
      "grad_norm": 0.08542396128177643,
      "learning_rate": 0.0002642102683350894,
      "loss": 9.0627,
      "step": 12256,
      "throughput": 8832.15725508659
    },
    {
      "epoch": 0.19260339030870147,
      "grad_norm": 0.0902579054236412,
      "learning_rate": 0.00026402413719764774,
      "loss": 9.0462,
      "step": 12288,
      "throughput": 8832.467440454431
    },
    {
      "epoch": 0.19310496163763038,
      "grad_norm": 0.08987751603126526,
      "learning_rate": 0.0002638375976843707,
      "loss": 9.0471,
      "step": 12320,
      "throughput": 8830.962024657469
    },
    {
      "epoch": 0.1936065329665593,
      "grad_norm": 0.08961810916662216,
      "learning_rate": 0.0002636506505645478,
      "loss": 9.0339,
      "step": 12352,
      "throughput": 8830.759731801822
    },
    {
      "epoch": 0.19410810429548822,
      "grad_norm": 0.08138346672058105,
      "learning_rate": 0.00026346329660914964,
      "loss": 9.0483,
      "step": 12384,
      "throughput": 8831.283922611019
    },
    {
      "epoch": 0.1946096756244171,
      "grad_norm": 0.07983388006687164,
      "learning_rate": 0.00026327553659082444,
      "loss": 9.0541,
      "step": 12416,
      "throughput": 8831.629960002096
    },
    {
      "epoch": 0.19511124695334603,
      "grad_norm": 0.07903102785348892,
      "learning_rate": 0.00026308737128389513,
      "loss": 9.0251,
      "step": 12448,
      "throughput": 8831.828895401724
    },
    {
      "epoch": 0.19561281828227495,
      "grad_norm": 0.08130428940057755,
      "learning_rate": 0.0002628988014643558,
      "loss": 9.0535,
      "step": 12480,
      "throughput": 8831.631966129466
    },
    {
      "epoch": 0.19611438961120384,
      "grad_norm": 0.09019070118665695,
      "learning_rate": 0.00026270982790986916,
      "loss": 9.0521,
      "step": 12512,
      "throughput": 8832.183929204004
    },
    {
      "epoch": 0.19661596094013276,
      "grad_norm": 0.08807606995105743,
      "learning_rate": 0.00026252045139976254,
      "loss": 9.034,
      "step": 12544,
      "throughput": 8832.506918598896
    },
    {
      "epoch": 0.19711753226906167,
      "grad_norm": 0.08415339887142181,
      "learning_rate": 0.00026233067271502536,
      "loss": 9.0125,
      "step": 12576,
      "throughput": 8832.614296363512
    },
    {
      "epoch": 0.1976191035979906,
      "grad_norm": 0.07860168814659119,
      "learning_rate": 0.0002621404926383054,
      "loss": 9.0416,
      "step": 12608,
      "throughput": 8832.509196753348
    },
    {
      "epoch": 0.19812067492691948,
      "grad_norm": 0.08424630761146545,
      "learning_rate": 0.0002619499119539059,
      "loss": 9.0189,
      "step": 12640,
      "throughput": 8833.027633810203
    },
    {
      "epoch": 0.1986222462558484,
      "grad_norm": 0.08129549771547318,
      "learning_rate": 0.0002617589314477821,
      "loss": 9.0077,
      "step": 12672,
      "throughput": 8833.345538127445
    },
    {
      "epoch": 0.19912381758477732,
      "grad_norm": 0.08588721603155136,
      "learning_rate": 0.0002615675519075383,
      "loss": 9.0287,
      "step": 12704,
      "throughput": 8833.38708333183
    },
    {
      "epoch": 0.1996253889137062,
      "grad_norm": 0.08470887690782547,
      "learning_rate": 0.00026137577412242415,
      "loss": 9.0132,
      "step": 12736,
      "throughput": 8833.346991276674
    },
    {
      "epoch": 0.20012696024263513,
      "grad_norm": 0.0842299535870552,
      "learning_rate": 0.00026118359888333193,
      "loss": 8.9919,
      "step": 12768,
      "throughput": 8833.900822243395
    },
    {
      "epoch": 0.20062853157156404,
      "grad_norm": 0.09161315113306046,
      "learning_rate": 0.00026099102698279276,
      "loss": 9.0182,
      "step": 12800,
      "throughput": 8834.136623124325
    },
    {
      "epoch": 0.20113010290049296,
      "grad_norm": 0.0936942771077156,
      "learning_rate": 0.0002607980592149739,
      "loss": 9.0176,
      "step": 12832,
      "throughput": 8834.322026360836
    },
    {
      "epoch": 0.20163167422942185,
      "grad_norm": 0.08174356073141098,
      "learning_rate": 0.00026060469637567484,
      "loss": 9.0272,
      "step": 12864,
      "throughput": 8834.21636981973
    },
    {
      "epoch": 0.20213324555835077,
      "grad_norm": 0.0919305831193924,
      "learning_rate": 0.0002604109392623246,
      "loss": 9.0392,
      "step": 12896,
      "throughput": 8834.751212132414
    },
    {
      "epoch": 0.2026348168872797,
      "grad_norm": 0.0808793231844902,
      "learning_rate": 0.00026021678867397803,
      "loss": 9.022,
      "step": 12928,
      "throughput": 8835.025467652535
    },
    {
      "epoch": 0.20313638821620858,
      "grad_norm": 0.07867848128080368,
      "learning_rate": 0.00026002224541131274,
      "loss": 9.005,
      "step": 12960,
      "throughput": 8835.151417184361
    },
    {
      "epoch": 0.2036379595451375,
      "grad_norm": 0.08789924532175064,
      "learning_rate": 0.00025982731027662575,
      "loss": 9.0143,
      "step": 12992,
      "throughput": 8835.07073633031
    },
    {
      "epoch": 0.20413953087406642,
      "grad_norm": 0.08775952458381653,
      "learning_rate": 0.00025963198407383015,
      "loss": 9.0259,
      "step": 13024,
      "throughput": 8835.597813814413
    },
    {
      "epoch": 0.20464110220299533,
      "grad_norm": 0.07885041832923889,
      "learning_rate": 0.0002594362676084517,
      "loss": 9.0086,
      "step": 13056,
      "throughput": 8835.860231370563
    },
    {
      "epoch": 0.20514267353192422,
      "grad_norm": 0.08015837520360947,
      "learning_rate": 0.0002592401616876258,
      "loss": 9.0122,
      "step": 13088,
      "throughput": 8835.986147542233
    },
    {
      "epoch": 0.20564424486085314,
      "grad_norm": 0.08334717154502869,
      "learning_rate": 0.00025904366712009374,
      "loss": 9.0219,
      "step": 13120,
      "throughput": 8835.944234812112
    },
    {
      "epoch": 0.20614581618978206,
      "grad_norm": 0.08041632175445557,
      "learning_rate": 0.00025884678471619976,
      "loss": 9.0075,
      "step": 13152,
      "throughput": 8836.489174584438
    },
    {
      "epoch": 0.20664738751871095,
      "grad_norm": 0.08151724934577942,
      "learning_rate": 0.0002586495152878874,
      "loss": 9.0009,
      "step": 13184,
      "throughput": 8836.720480927283
    },
    {
      "epoch": 0.20714895884763987,
      "grad_norm": 0.08173686265945435,
      "learning_rate": 0.0002584518596486965,
      "loss": 9.0035,
      "step": 13216,
      "throughput": 8836.863723253064
    },
    {
      "epoch": 0.2076505301765688,
      "grad_norm": 0.08243660628795624,
      "learning_rate": 0.00025825381861375936,
      "loss": 9.0124,
      "step": 13248,
      "throughput": 8836.739052834788
    },
    {
      "epoch": 0.2081521015054977,
      "grad_norm": 0.08186870068311691,
      "learning_rate": 0.00025805539299979794,
      "loss": 9.0141,
      "step": 13280,
      "throughput": 8837.27394297609
    },
    {
      "epoch": 0.2086536728344266,
      "grad_norm": 0.08113693445920944,
      "learning_rate": 0.0002578565836251199,
      "loss": 9.0042,
      "step": 13312,
      "throughput": 8837.517877764802
    },
    {
      "epoch": 0.2091552441633555,
      "grad_norm": 0.08530589938163757,
      "learning_rate": 0.0002576573913096158,
      "loss": 9.0082,
      "step": 13344,
      "throughput": 8837.662081132275
    },
    {
      "epoch": 0.20965681549228443,
      "grad_norm": 0.0843268632888794,
      "learning_rate": 0.00025745781687475534,
      "loss": 9.0011,
      "step": 13376,
      "throughput": 8837.451410811751
    },
    {
      "epoch": 0.21015838682121332,
      "grad_norm": 0.08069983124732971,
      "learning_rate": 0.000257257861143584,
      "loss": 8.9965,
      "step": 13408,
      "throughput": 8837.967265466526
    },
    {
      "epoch": 0.21065995815014224,
      "grad_norm": 0.08797997981309891,
      "learning_rate": 0.00025705752494071995,
      "loss": 8.9972,
      "step": 13440,
      "throughput": 8838.164365564864
    },
    {
      "epoch": 0.21116152947907116,
      "grad_norm": 0.07875420898199081,
      "learning_rate": 0.0002568568090923501,
      "loss": 8.9905,
      "step": 13472,
      "throughput": 8838.29210044764
    },
    {
      "epoch": 0.21166310080800008,
      "grad_norm": 0.07931997627019882,
      "learning_rate": 0.0002566557144262273,
      "loss": 8.9995,
      "step": 13504,
      "throughput": 8838.121292638185
    },
    {
      "epoch": 0.21216467213692897,
      "grad_norm": 0.0929732397198677,
      "learning_rate": 0.00025645424177166663,
      "loss": 9.0055,
      "step": 13536,
      "throughput": 8838.629544739428
    },
    {
      "epoch": 0.21266624346585788,
      "grad_norm": 0.08669450134038925,
      "learning_rate": 0.0002562523919595418,
      "loss": 8.9944,
      "step": 13568,
      "throughput": 8838.865832350326
    },
    {
      "epoch": 0.2131678147947868,
      "grad_norm": 0.07837585359811783,
      "learning_rate": 0.0002560501658222821,
      "loss": 8.9782,
      "step": 13600,
      "throughput": 8839.040956514082
    },
    {
      "epoch": 0.2136693861237157,
      "grad_norm": 0.08947234600782394,
      "learning_rate": 0.0002558475641938686,
      "loss": 8.9823,
      "step": 13632,
      "throughput": 8838.921683629353
    },
    {
      "epoch": 0.2141709574526446,
      "grad_norm": 0.08249003440141678,
      "learning_rate": 0.00025564458790983114,
      "loss": 9.0016,
      "step": 13664,
      "throughput": 8839.415178957333
    },
    {
      "epoch": 0.21467252878157353,
      "grad_norm": 0.08924390375614166,
      "learning_rate": 0.0002554412378072445,
      "loss": 8.985,
      "step": 13696,
      "throughput": 8839.758654670273
    },
    {
      "epoch": 0.21517410011050242,
      "grad_norm": 0.08233807235956192,
      "learning_rate": 0.0002552375147247251,
      "loss": 8.9774,
      "step": 13728,
      "throughput": 8839.705210842236
    },
    {
      "epoch": 0.21567567143943134,
      "grad_norm": 0.0813618078827858,
      "learning_rate": 0.0002550334195024275,
      "loss": 8.9801,
      "step": 13760,
      "throughput": 8839.59999048092
    },
    {
      "epoch": 0.21617724276836026,
      "grad_norm": 0.07620636373758316,
      "learning_rate": 0.00025482895298204096,
      "loss": 8.9757,
      "step": 13792,
      "throughput": 8840.157377923262
    },
    {
      "epoch": 0.21667881409728917,
      "grad_norm": 0.08532450348138809,
      "learning_rate": 0.0002546241160067861,
      "loss": 8.9766,
      "step": 13824,
      "throughput": 8840.425867617763
    },
    {
      "epoch": 0.21718038542621806,
      "grad_norm": 0.08044885098934174,
      "learning_rate": 0.00025441890942141124,
      "loss": 8.994,
      "step": 13856,
      "throughput": 8840.400033321612
    },
    {
      "epoch": 0.21768195675514698,
      "grad_norm": 0.08269073814153671,
      "learning_rate": 0.00025421333407218884,
      "loss": 8.9888,
      "step": 13888,
      "throughput": 8840.231364870235
    },
    {
      "epoch": 0.2181835280840759,
      "grad_norm": 0.08159191161394119,
      "learning_rate": 0.0002540073908069124,
      "loss": 8.978,
      "step": 13920,
      "throughput": 8840.79464592254
    },
    {
      "epoch": 0.2186850994130048,
      "grad_norm": 0.08631081134080887,
      "learning_rate": 0.0002538010804748924,
      "loss": 8.9384,
      "step": 13952,
      "throughput": 8841.080385514952
    },
    {
      "epoch": 0.2191866707419337,
      "grad_norm": 0.07901880890130997,
      "learning_rate": 0.0002535944039269533,
      "loss": 8.986,
      "step": 13984,
      "throughput": 8841.048159496497
    },
    {
      "epoch": 0.21968824207086263,
      "grad_norm": 0.07891877740621567,
      "learning_rate": 0.0002533873620154299,
      "loss": 8.9743,
      "step": 14016,
      "throughput": 8840.959387551367
    },
    {
      "epoch": 0.22018981339979155,
      "grad_norm": 0.07669687271118164,
      "learning_rate": 0.0002531799555941635,
      "loss": 8.9712,
      "step": 14048,
      "throughput": 8841.50220321759
    },
    {
      "epoch": 0.22069138472872044,
      "grad_norm": 0.08152962476015091,
      "learning_rate": 0.00025297218551849885,
      "loss": 8.9487,
      "step": 14080,
      "throughput": 8841.774650330875
    },
    {
      "epoch": 0.22119295605764935,
      "grad_norm": 0.09327496588230133,
      "learning_rate": 0.00025276405264528044,
      "loss": 8.9746,
      "step": 14112,
      "throughput": 8841.79885493629
    },
    {
      "epoch": 0.22169452738657827,
      "grad_norm": 0.07541276514530182,
      "learning_rate": 0.00025255555783284877,
      "loss": 8.9697,
      "step": 14144,
      "throughput": 8841.727072238291
    },
    {
      "epoch": 0.22219609871550716,
      "grad_norm": 0.08202675729990005,
      "learning_rate": 0.0002523467019410371,
      "loss": 8.9714,
      "step": 14176,
      "throughput": 8842.26026429414
    },
    {
      "epoch": 0.22269767004443608,
      "grad_norm": 0.08090902864933014,
      "learning_rate": 0.00025213748583116776,
      "loss": 8.9849,
      "step": 14208,
      "throughput": 8842.548919521794
    },
    {
      "epoch": 0.223199241373365,
      "grad_norm": 0.080088309943676,
      "learning_rate": 0.0002519279103660486,
      "loss": 8.9407,
      "step": 14240,
      "throughput": 8842.441244363596
    },
    {
      "epoch": 0.22370081270229392,
      "grad_norm": 0.09118451923131943,
      "learning_rate": 0.0002517179764099694,
      "loss": 8.937,
      "step": 14272,
      "throughput": 8842.283163338127
    },
    {
      "epoch": 0.2242023840312228,
      "grad_norm": 0.0921279564499855,
      "learning_rate": 0.00025150768482869846,
      "loss": 8.9623,
      "step": 14304,
      "throughput": 8842.817465690732
    },
    {
      "epoch": 0.22470395536015172,
      "grad_norm": 0.07893610000610352,
      "learning_rate": 0.0002512970364894789,
      "loss": 8.9598,
      "step": 14336,
      "throughput": 8843.119331329157
    },
    {
      "epoch": 0.22520552668908064,
      "grad_norm": 0.08488957583904266,
      "learning_rate": 0.00025108603226102515,
      "loss": 8.9652,
      "step": 14368,
      "throughput": 8841.600361125842
    },
    {
      "epoch": 0.22570709801800953,
      "grad_norm": 0.07883718609809875,
      "learning_rate": 0.0002508746730135191,
      "loss": 8.9593,
      "step": 14400,
      "throughput": 8841.445944335217
    },
    {
      "epoch": 0.22620866934693845,
      "grad_norm": 0.08114926517009735,
      "learning_rate": 0.00025066295961860704,
      "loss": 8.9453,
      "step": 14432,
      "throughput": 8841.984887459164
    },
    {
      "epoch": 0.22671024067586737,
      "grad_norm": 0.0800662562251091,
      "learning_rate": 0.0002504508929493957,
      "loss": 8.9527,
      "step": 14464,
      "throughput": 8842.348536699352
    },
    {
      "epoch": 0.2272118120047963,
      "grad_norm": 0.0862034410238266,
      "learning_rate": 0.00025023847388044846,
      "loss": 8.9404,
      "step": 14496,
      "throughput": 8842.28608146023
    },
    {
      "epoch": 0.22771338333372518,
      "grad_norm": 0.09161315113306046,
      "learning_rate": 0.0002500257032877823,
      "loss": 8.9541,
      "step": 14528,
      "throughput": 8842.1214286272
    },
    {
      "epoch": 0.2282149546626541,
      "grad_norm": 0.08155594021081924,
      "learning_rate": 0.0002498125820488639,
      "loss": 8.9548,
      "step": 14560,
      "throughput": 8842.656952860514
    },
    {
      "epoch": 0.22871652599158301,
      "grad_norm": 0.09579102694988251,
      "learning_rate": 0.00024959911104260565,
      "loss": 8.9496,
      "step": 14592,
      "throughput": 8843.03001049117
    },
    {
      "epoch": 0.2292180973205119,
      "grad_norm": 0.0857420563697815,
      "learning_rate": 0.00024938529114936273,
      "loss": 8.963,
      "step": 14624,
      "throughput": 8842.954005956697
    },
    {
      "epoch": 0.22971966864944082,
      "grad_norm": 0.08455734699964523,
      "learning_rate": 0.000249171123250929,
      "loss": 8.9558,
      "step": 14656,
      "throughput": 8842.871556219898
    },
    {
      "epoch": 0.23022123997836974,
      "grad_norm": 0.08038202673196793,
      "learning_rate": 0.00024895660823053353,
      "loss": 8.944,
      "step": 14688,
      "throughput": 8843.389256958373
    },
    {
      "epoch": 0.23072281130729866,
      "grad_norm": 0.07368004322052002,
      "learning_rate": 0.00024874174697283685,
      "loss": 8.966,
      "step": 14720,
      "throughput": 8843.724505638962
    },
    {
      "epoch": 0.23122438263622755,
      "grad_norm": 0.07424993813037872,
      "learning_rate": 0.0002485265403639275,
      "loss": 8.9445,
      "step": 14752,
      "throughput": 8843.614720535039
    },
    {
      "epoch": 0.23172595396515647,
      "grad_norm": 0.07684794813394547,
      "learning_rate": 0.0002483109892913181,
      "loss": 8.9629,
      "step": 14784,
      "throughput": 8843.554306116497
    },
    {
      "epoch": 0.23222752529408539,
      "grad_norm": 0.08115751296281815,
      "learning_rate": 0.0002480950946439419,
      "loss": 8.9452,
      "step": 14816,
      "throughput": 8844.063542567186
    },
    {
      "epoch": 0.23272909662301428,
      "grad_norm": 0.07495855540037155,
      "learning_rate": 0.0002478788573121491,
      "loss": 8.9245,
      "step": 14848,
      "throughput": 8844.431926105013
    },
    {
      "epoch": 0.2332306679519432,
      "grad_norm": 0.08068421483039856,
      "learning_rate": 0.0002476622781877031,
      "loss": 8.925,
      "step": 14880,
      "throughput": 8844.360587283169
    },
    {
      "epoch": 0.2337322392808721,
      "grad_norm": 0.07673174142837524,
      "learning_rate": 0.0002474453581637769,
      "loss": 8.94,
      "step": 14912,
      "throughput": 8844.243253170607
    },
    {
      "epoch": 0.23423381060980103,
      "grad_norm": 0.08002306520938873,
      "learning_rate": 0.00024722809813494933,
      "loss": 8.9419,
      "step": 14944,
      "throughput": 8844.749124458962
    },
    {
      "epoch": 0.23473538193872992,
      "grad_norm": 0.08444210141897202,
      "learning_rate": 0.00024701049899720123,
      "loss": 8.9346,
      "step": 14976,
      "throughput": 8845.12059524771
    },
    {
      "epoch": 0.23523695326765884,
      "grad_norm": 0.07796745747327805,
      "learning_rate": 0.0002467925616479122,
      "loss": 8.9592,
      "step": 15008,
      "throughput": 8845.064143363543
    },
    {
      "epoch": 0.23573852459658776,
      "grad_norm": 0.0815286934375763,
      "learning_rate": 0.0002465742869858566,
      "loss": 8.9434,
      "step": 15040,
      "throughput": 8845.028768403501
    },
    {
      "epoch": 0.23624009592551665,
      "grad_norm": 0.07568243145942688,
      "learning_rate": 0.0002463556759111996,
      "loss": 8.9199,
      "step": 15072,
      "throughput": 8845.53829814696
    },
    {
      "epoch": 0.23674166725444556,
      "grad_norm": 0.08475536853075027,
      "learning_rate": 0.00024613672932549403,
      "loss": 8.9335,
      "step": 15104,
      "throughput": 8845.914556859072
    },
    {
      "epoch": 0.23724323858337448,
      "grad_norm": 0.08863034099340439,
      "learning_rate": 0.00024591744813167625,
      "loss": 8.9221,
      "step": 15136,
      "throughput": 8845.680743425255
    },
    {
      "epoch": 0.2377448099123034,
      "grad_norm": 0.09319442510604858,
      "learning_rate": 0.00024569783323406255,
      "loss": 8.9204,
      "step": 15168,
      "throughput": 8845.717517430303
    },
    {
      "epoch": 0.2382463812412323,
      "grad_norm": 0.0772862657904625,
      "learning_rate": 0.00024547788553834536,
      "loss": 8.9287,
      "step": 15200,
      "throughput": 8846.219617937282
    },
    {
      "epoch": 0.2387479525701612,
      "grad_norm": 0.09362047165632248,
      "learning_rate": 0.00024525760595158977,
      "loss": 8.9426,
      "step": 15232,
      "throughput": 8846.58514079462
    },
    {
      "epoch": 0.23924952389909013,
      "grad_norm": 0.07392299920320511,
      "learning_rate": 0.0002450369953822293,
      "loss": 8.93,
      "step": 15264,
      "throughput": 8846.413348386604
    },
    {
      "epoch": 0.23975109522801902,
      "grad_norm": 0.07865717262029648,
      "learning_rate": 0.0002448160547400627,
      "loss": 8.9214,
      "step": 15296,
      "throughput": 8846.330000239195
    },
    {
      "epoch": 0.24025266655694794,
      "grad_norm": 0.0818992331624031,
      "learning_rate": 0.00024459478493624973,
      "loss": 8.9222,
      "step": 15328,
      "throughput": 8846.830431784812
    },
    {
      "epoch": 0.24075423788587685,
      "grad_norm": 0.08360916376113892,
      "learning_rate": 0.0002443731868833078,
      "loss": 8.9249,
      "step": 15360,
      "throughput": 8847.183830926322
    },
    {
      "epoch": 0.24125580921480577,
      "grad_norm": 0.08100838959217072,
      "learning_rate": 0.0002441512614951079,
      "loss": 8.8944,
      "step": 15392,
      "throughput": 8847.052000253154
    },
    {
      "epoch": 0.24175738054373466,
      "grad_norm": 0.07800208777189255,
      "learning_rate": 0.00024392900968687103,
      "loss": 8.9333,
      "step": 15424,
      "throughput": 8846.88050180888
    },
    {
      "epoch": 0.24225895187266358,
      "grad_norm": 0.08001139760017395,
      "learning_rate": 0.00024370643237516426,
      "loss": 8.9166,
      "step": 15456,
      "throughput": 8847.353385177725
    },
    {
      "epoch": 0.2427605232015925,
      "grad_norm": 0.07767579704523087,
      "learning_rate": 0.00024348353047789708,
      "loss": 8.9362,
      "step": 15488,
      "throughput": 8847.707394107481
    },
    {
      "epoch": 0.2432620945305214,
      "grad_norm": 0.08075279742479324,
      "learning_rate": 0.0002432603049143176,
      "loss": 8.9314,
      "step": 15520,
      "throughput": 8847.557036979117
    },
    {
      "epoch": 0.2437636658594503,
      "grad_norm": 0.08244162052869797,
      "learning_rate": 0.0002430367566050087,
      "loss": 8.9232,
      "step": 15552,
      "throughput": 8847.42904210372
    },
    {
      "epoch": 0.24426523718837923,
      "grad_norm": 0.08184251189231873,
      "learning_rate": 0.00024281288647188425,
      "loss": 8.9084,
      "step": 15584,
      "throughput": 8847.911019319828
    },
    {
      "epoch": 0.24476680851730814,
      "grad_norm": 0.07141350954771042,
      "learning_rate": 0.00024258869543818535,
      "loss": 8.8898,
      "step": 15616,
      "throughput": 8848.260056243274
    },
    {
      "epoch": 0.24526837984623703,
      "grad_norm": 0.09621914476156235,
      "learning_rate": 0.00024236418442847652,
      "loss": 8.9365,
      "step": 15648,
      "throughput": 8848.086390788978
    },
    {
      "epoch": 0.24576995117516595,
      "grad_norm": 0.08253178000450134,
      "learning_rate": 0.0002421393543686418,
      "loss": 8.9261,
      "step": 15680,
      "throughput": 8847.997217683149
    },
    {
      "epoch": 0.24627152250409487,
      "grad_norm": 0.08006506413221359,
      "learning_rate": 0.00024191420618588103,
      "loss": 8.9107,
      "step": 15712,
      "throughput": 8848.472194979533
    },
    {
      "epoch": 0.24677309383302376,
      "grad_norm": 0.07812228798866272,
      "learning_rate": 0.000241688740808706,
      "loss": 8.9108,
      "step": 15744,
      "throughput": 8848.844196536425
    },
    {
      "epoch": 0.24727466516195268,
      "grad_norm": 0.0766172781586647,
      "learning_rate": 0.0002414629591669366,
      "loss": 8.9101,
      "step": 15776,
      "throughput": 8848.763152880012
    },
    {
      "epoch": 0.2477762364908816,
      "grad_norm": 0.07769637554883957,
      "learning_rate": 0.0002412368621916969,
      "loss": 8.9097,
      "step": 15808,
      "throughput": 8848.663138376132
    },
    {
      "epoch": 0.2482778078198105,
      "grad_norm": 0.07883188128471375,
      "learning_rate": 0.0002410104508154116,
      "loss": 8.913,
      "step": 15840,
      "throughput": 8849.133796430357
    },
    {
      "epoch": 0.2487793791487394,
      "grad_norm": 0.08420856297016144,
      "learning_rate": 0.00024078372597180183,
      "loss": 8.907,
      "step": 15872,
      "throughput": 8849.475555093464
    },
    {
      "epoch": 0.24928095047766832,
      "grad_norm": 0.08431058377027512,
      "learning_rate": 0.00024055668859588157,
      "loss": 8.8894,
      "step": 15904,
      "throughput": 8849.408986160002
    },
    {
      "epoch": 0.24978252180659724,
      "grad_norm": 0.07932168245315552,
      "learning_rate": 0.0002403293396239536,
      "loss": 8.9009,
      "step": 15936,
      "throughput": 8849.197175271445
    },
    {
      "epoch": 0.25028409313552613,
      "grad_norm": 0.08460178226232529,
      "learning_rate": 0.00024010167999360575,
      "loss": 8.9212,
      "step": 15968,
      "throughput": 8849.662358621626
    },
    {
      "epoch": 0.25078566446445505,
      "grad_norm": 0.08003672957420349,
      "learning_rate": 0.00023987371064370698,
      "loss": 8.8948,
      "step": 16000,
      "throughput": 8850.04432114083
    },
    {
      "epoch": 0.25128723579338397,
      "grad_norm": 0.08153244853019714,
      "learning_rate": 0.00023964543251440363,
      "loss": 8.8742,
      "step": 16032,
      "throughput": 8849.954581929667
    },
    {
      "epoch": 0.2517888071223129,
      "grad_norm": 0.08619936555624008,
      "learning_rate": 0.00023941684654711534,
      "loss": 8.9046,
      "step": 16064,
      "throughput": 8849.672440477376
    },
    {
      "epoch": 0.2522903784512418,
      "grad_norm": 0.07380315661430359,
      "learning_rate": 0.0002391879536845313,
      "loss": 8.9018,
      "step": 16096,
      "throughput": 8850.142241698979
    },
    {
      "epoch": 0.25279194978017067,
      "grad_norm": 0.08683207631111145,
      "learning_rate": 0.0002389587548706064,
      "loss": 8.8875,
      "step": 16128,
      "throughput": 8850.546625557658
    },
    {
      "epoch": 0.2532935211090996,
      "grad_norm": 0.07719036191701889,
      "learning_rate": 0.0002387292510505572,
      "loss": 8.8818,
      "step": 16160,
      "throughput": 8850.427029930328
    },
    {
      "epoch": 0.2537950924380285,
      "grad_norm": 0.07802041620016098,
      "learning_rate": 0.00023849944317085812,
      "loss": 8.9115,
      "step": 16192,
      "throughput": 8850.281862064956
    },
    {
      "epoch": 0.2542966637669574,
      "grad_norm": 0.07789981365203857,
      "learning_rate": 0.0002382693321792376,
      "loss": 8.898,
      "step": 16224,
      "throughput": 8850.741104033605
    },
    {
      "epoch": 0.25479823509588634,
      "grad_norm": 0.07908471673727036,
      "learning_rate": 0.00023803891902467406,
      "loss": 8.9094,
      "step": 16256,
      "throughput": 8851.116113257427
    },
    {
      "epoch": 0.25529980642481526,
      "grad_norm": 0.08057553321123123,
      "learning_rate": 0.0002378082046573919,
      "loss": 8.8815,
      "step": 16288,
      "throughput": 8851.002029177316
    },
    {
      "epoch": 0.2558013777537442,
      "grad_norm": 0.07993035018444061,
      "learning_rate": 0.00023757719002885793,
      "loss": 8.8784,
      "step": 16320,
      "throughput": 8850.852525962286
    },
    {
      "epoch": 0.25630294908267304,
      "grad_norm": 0.08261415362358093,
      "learning_rate": 0.00023734587609177725,
      "loss": 8.8955,
      "step": 16352,
      "throughput": 8851.312677135089
    },
    {
      "epoch": 0.25680452041160196,
      "grad_norm": 0.08019208908081055,
      "learning_rate": 0.000237114263800089,
      "loss": 8.8989,
      "step": 16384,
      "throughput": 8851.708561726255
    },
    {
      "epoch": 0.2573060917405309,
      "grad_norm": 0.08069220185279846,
      "learning_rate": 0.0002368823541089632,
      "loss": 8.8979,
      "step": 16416,
      "throughput": 8850.402276688073
    },
    {
      "epoch": 0.2578076630694598,
      "grad_norm": 0.08015625178813934,
      "learning_rate": 0.00023665014797479602,
      "loss": 8.8898,
      "step": 16448,
      "throughput": 8850.273118023713
    },
    {
      "epoch": 0.2583092343983887,
      "grad_norm": 0.08302200585603714,
      "learning_rate": 0.00023641764635520617,
      "loss": 8.879,
      "step": 16480,
      "throughput": 8850.730562782745
    },
    {
      "epoch": 0.2588108057273176,
      "grad_norm": 0.07490982115268707,
      "learning_rate": 0.0002361848502090311,
      "loss": 8.8805,
      "step": 16512,
      "throughput": 8851.111261480031
    },
    {
      "epoch": 0.25931237705624655,
      "grad_norm": 0.0764228031039238,
      "learning_rate": 0.0002359517604963228,
      "loss": 8.8969,
      "step": 16544,
      "throughput": 8851.001378740231
    },
    {
      "epoch": 0.2598139483851754,
      "grad_norm": 0.07773542404174805,
      "learning_rate": 0.0002357183781783439,
      "loss": 8.8794,
      "step": 16576,
      "throughput": 8850.781342052993
    },
    {
      "epoch": 0.2603155197141043,
      "grad_norm": 0.09659688174724579,
      "learning_rate": 0.0002354847042175638,
      "loss": 8.8671,
      "step": 16608,
      "throughput": 8851.225868462627
    },
    {
      "epoch": 0.26081709104303324,
      "grad_norm": 0.0778978168964386,
      "learning_rate": 0.0002352507395776546,
      "loss": 8.8743,
      "step": 16640,
      "throughput": 8851.597799348436
    },
    {
      "epoch": 0.26131866237196216,
      "grad_norm": 0.0769830197095871,
      "learning_rate": 0.00023501648522348715,
      "loss": 8.8877,
      "step": 16672,
      "throughput": 8851.56506268054
    },
    {
      "epoch": 0.2618202337008911,
      "grad_norm": 0.08127515017986298,
      "learning_rate": 0.0002347819421211271,
      "loss": 8.8921,
      "step": 16704,
      "throughput": 8851.318214920258
    },
    {
      "epoch": 0.26232180502982,
      "grad_norm": 0.08417029678821564,
      "learning_rate": 0.00023454711123783092,
      "loss": 8.8755,
      "step": 16736,
      "throughput": 8851.757418120564
    },
    {
      "epoch": 0.2628233763587489,
      "grad_norm": 0.07545661926269531,
      "learning_rate": 0.00023431199354204192,
      "loss": 8.8772,
      "step": 16768,
      "throughput": 8852.122258397247
    },
    {
      "epoch": 0.2633249476876778,
      "grad_norm": 0.07834376394748688,
      "learning_rate": 0.00023407659000338607,
      "loss": 8.8865,
      "step": 16800,
      "throughput": 8852.051755423026
    },
    {
      "epoch": 0.2638265190166067,
      "grad_norm": 0.07725591957569122,
      "learning_rate": 0.00023384090159266833,
      "loss": 8.8804,
      "step": 16832,
      "throughput": 8851.755112202223
    },
    {
      "epoch": 0.2643280903455356,
      "grad_norm": 0.07641211152076721,
      "learning_rate": 0.00023360492928186838,
      "loss": 8.8721,
      "step": 16864,
      "throughput": 8852.189476055588
    },
    {
      "epoch": 0.26482966167446453,
      "grad_norm": 0.07766464352607727,
      "learning_rate": 0.00023336867404413674,
      "loss": 8.8797,
      "step": 16896,
      "throughput": 8852.561389159348
    },
    {
      "epoch": 0.26533123300339345,
      "grad_norm": 0.08463136851787567,
      "learning_rate": 0.0002331321368537907,
      "loss": 8.8606,
      "step": 16928,
      "throughput": 8852.384328747887
    },
    {
      "epoch": 0.26583280433232237,
      "grad_norm": 0.08805921673774719,
      "learning_rate": 0.0002328953186863103,
      "loss": 8.8684,
      "step": 16960,
      "throughput": 8852.249964482584
    },
    {
      "epoch": 0.2663343756612513,
      "grad_norm": 0.07233244925737381,
      "learning_rate": 0.00023265822051833442,
      "loss": 8.8793,
      "step": 16992,
      "throughput": 8852.687555471475
    },
    {
      "epoch": 0.26683594699018015,
      "grad_norm": 0.0757635310292244,
      "learning_rate": 0.00023242084332765662,
      "loss": 8.8467,
      "step": 17024,
      "throughput": 8853.059822154499
    },
    {
      "epoch": 0.26733751831910907,
      "grad_norm": 0.07592292875051498,
      "learning_rate": 0.0002321831880932211,
      "loss": 8.8471,
      "step": 17056,
      "throughput": 8852.917810217152
    },
    {
      "epoch": 0.267839089648038,
      "grad_norm": 0.08107979595661163,
      "learning_rate": 0.00023194525579511876,
      "loss": 8.8732,
      "step": 17088,
      "throughput": 8852.652403313208
    },
    {
      "epoch": 0.2683406609769669,
      "grad_norm": 0.07222038507461548,
      "learning_rate": 0.00023170704741458308,
      "loss": 8.8802,
      "step": 17120,
      "throughput": 8853.078328328778
    },
    {
      "epoch": 0.2688422323058958,
      "grad_norm": 0.07747363299131393,
      "learning_rate": 0.00023146856393398615,
      "loss": 8.8865,
      "step": 17152,
      "throughput": 8853.434671798588
    },
    {
      "epoch": 0.26934380363482474,
      "grad_norm": 0.08264749497175217,
      "learning_rate": 0.0002312298063368346,
      "loss": 8.874,
      "step": 17184,
      "throughput": 8853.327615268681
    },
    {
      "epoch": 0.26984537496375366,
      "grad_norm": 0.07909268140792847,
      "learning_rate": 0.00023099077560776536,
      "loss": 8.8685,
      "step": 17216,
      "throughput": 8853.066307208373
    },
    {
      "epoch": 0.2703469462926825,
      "grad_norm": 0.07460527122020721,
      "learning_rate": 0.00023075147273254195,
      "loss": 8.8518,
      "step": 17248,
      "throughput": 8853.494637244397
    },
    {
      "epoch": 0.27084851762161144,
      "grad_norm": 0.0753212720155716,
      "learning_rate": 0.0002305118986980501,
      "loss": 8.8774,
      "step": 17280,
      "throughput": 8853.857709105841
    },
    {
      "epoch": 0.27135008895054036,
      "grad_norm": 0.0831172987818718,
      "learning_rate": 0.00023027205449229388,
      "loss": 8.8699,
      "step": 17312,
      "throughput": 8853.796085376076
    },
    {
      "epoch": 0.2718516602794693,
      "grad_norm": 0.08725563436746597,
      "learning_rate": 0.00023003194110439145,
      "loss": 8.8637,
      "step": 17344,
      "throughput": 8853.553669857007
    },
    {
      "epoch": 0.2723532316083982,
      "grad_norm": 0.075401172041893,
      "learning_rate": 0.00022979155952457118,
      "loss": 8.8637,
      "step": 17376,
      "throughput": 8853.973439032789
    },
    {
      "epoch": 0.2728548029373271,
      "grad_norm": 0.07143397629261017,
      "learning_rate": 0.00022955091074416733,
      "loss": 8.8577,
      "step": 17408,
      "throughput": 8854.336355288591
    },
    {
      "epoch": 0.27335637426625603,
      "grad_norm": 0.07553057372570038,
      "learning_rate": 0.0002293099957556163,
      "loss": 8.8733,
      "step": 17440,
      "throughput": 8854.379570846144
    },
    {
      "epoch": 0.2738579455951849,
      "grad_norm": 0.08055870234966278,
      "learning_rate": 0.00022906881555245212,
      "loss": 8.8247,
      "step": 17472,
      "throughput": 8854.08924615641
    },
    {
      "epoch": 0.2743595169241138,
      "grad_norm": 0.0758543387055397,
      "learning_rate": 0.0002288273711293028,
      "loss": 8.8353,
      "step": 17504,
      "throughput": 8854.501014264812
    },
    {
      "epoch": 0.27486108825304273,
      "grad_norm": 0.07926324754953384,
      "learning_rate": 0.00022858566348188568,
      "loss": 8.8772,
      "step": 17536,
      "throughput": 8854.84422521045
    },
    {
      "epoch": 0.27536265958197165,
      "grad_norm": 0.07983296364545822,
      "learning_rate": 0.00022834369360700394,
      "loss": 8.8558,
      "step": 17568,
      "throughput": 8854.904033354123
    },
    {
      "epoch": 0.27586423091090057,
      "grad_norm": 0.07504521310329437,
      "learning_rate": 0.00022810146250254196,
      "loss": 8.8663,
      "step": 17600,
      "throughput": 8854.581894360635
    },
    {
      "epoch": 0.2763658022398295,
      "grad_norm": 0.0762917697429657,
      "learning_rate": 0.00022785897116746166,
      "loss": 8.836,
      "step": 17632,
      "throughput": 8854.990553638641
    },
    {
      "epoch": 0.2768673735687584,
      "grad_norm": 0.07859618216753006,
      "learning_rate": 0.00022761622060179793,
      "loss": 8.8458,
      "step": 17664,
      "throughput": 8855.338111904854
    },
    {
      "epoch": 0.27736894489768726,
      "grad_norm": 0.07929467409849167,
      "learning_rate": 0.00022737321180665488,
      "loss": 8.8529,
      "step": 17696,
      "throughput": 8855.424227586012
    },
    {
      "epoch": 0.2778705162266162,
      "grad_norm": 0.08522498607635498,
      "learning_rate": 0.00022712994578420143,
      "loss": 8.879,
      "step": 17728,
      "throughput": 8855.043453434968
    },
    {
      "epoch": 0.2783720875555451,
      "grad_norm": 0.08172594010829926,
      "learning_rate": 0.00022688642353766746,
      "loss": 8.8265,
      "step": 17760,
      "throughput": 8855.448267391399
    },
    {
      "epoch": 0.278873658884474,
      "grad_norm": 0.07379541546106339,
      "learning_rate": 0.00022664264607133937,
      "loss": 8.8516,
      "step": 17792,
      "throughput": 8855.78500977693
    },
    {
      "epoch": 0.27937523021340294,
      "grad_norm": 0.08074238151311874,
      "learning_rate": 0.00022639861439055617,
      "loss": 8.8508,
      "step": 17824,
      "throughput": 8855.85539062389
    },
    {
      "epoch": 0.27987680154233185,
      "grad_norm": 0.07020998746156693,
      "learning_rate": 0.00022615432950170528,
      "loss": 8.8343,
      "step": 17856,
      "throughput": 8855.62056538393
    },
    {
      "epoch": 0.2803783728712608,
      "grad_norm": 0.07520820200443268,
      "learning_rate": 0.00022590979241221825,
      "loss": 8.8356,
      "step": 17888,
      "throughput": 8855.967147418854
    },
    {
      "epoch": 0.28087994420018964,
      "grad_norm": 0.07711929082870483,
      "learning_rate": 0.00022566500413056677,
      "loss": 8.8239,
      "step": 17920,
      "throughput": 8856.296819007126
    },
    {
      "epoch": 0.28138151552911855,
      "grad_norm": 0.082347571849823,
      "learning_rate": 0.00022541996566625841,
      "loss": 8.8508,
      "step": 17952,
      "throughput": 8856.407864410561
    },
    {
      "epoch": 0.28188308685804747,
      "grad_norm": 0.07675802707672119,
      "learning_rate": 0.00022517467802983266,
      "loss": 8.8465,
      "step": 17984,
      "throughput": 8855.985756478674
    },
    {
      "epoch": 0.2823846581869764,
      "grad_norm": 0.07771243155002594,
      "learning_rate": 0.0002249291422328563,
      "loss": 8.8411,
      "step": 18016,
      "throughput": 8856.335225590043
    },
    {
      "epoch": 0.2828862295159053,
      "grad_norm": 0.07669182121753693,
      "learning_rate": 0.00022468335928791977,
      "loss": 8.8348,
      "step": 18048,
      "throughput": 8856.675590260706
    },
    {
      "epoch": 0.2833878008448342,
      "grad_norm": 0.07644347846508026,
      "learning_rate": 0.00022443733020863262,
      "loss": 8.8249,
      "step": 18080,
      "throughput": 8856.790422663576
    },
    {
      "epoch": 0.28388937217376314,
      "grad_norm": 0.07570263743400574,
      "learning_rate": 0.00022419105600961955,
      "loss": 8.8312,
      "step": 18112,
      "throughput": 8856.386037032886
    },
    {
      "epoch": 0.284390943502692,
      "grad_norm": 0.07852896302938461,
      "learning_rate": 0.00022394453770651607,
      "loss": 8.8398,
      "step": 18144,
      "throughput": 8856.740806178015
    },
    {
      "epoch": 0.2848925148316209,
      "grad_norm": 0.07880748063325882,
      "learning_rate": 0.00022369777631596436,
      "loss": 8.8073,
      "step": 18176,
      "throughput": 8857.067865452158
    },
    {
      "epoch": 0.28539408616054984,
      "grad_norm": 0.08232726156711578,
      "learning_rate": 0.00022345077285560914,
      "loss": 8.8411,
      "step": 18208,
      "throughput": 8857.218494518329
    },
    {
      "epoch": 0.28589565748947876,
      "grad_norm": 0.07877877354621887,
      "learning_rate": 0.00022320352834409343,
      "loss": 8.8383,
      "step": 18240,
      "throughput": 8856.812395985982
    },
    {
      "epoch": 0.2863972288184077,
      "grad_norm": 0.08451380580663681,
      "learning_rate": 0.0002229560438010543,
      "loss": 8.8075,
      "step": 18272,
      "throughput": 8857.160473869671
    },
    {
      "epoch": 0.2868988001473366,
      "grad_norm": 0.0770159438252449,
      "learning_rate": 0.00022270832024711882,
      "loss": 8.8476,
      "step": 18304,
      "throughput": 8857.499612175583
    },
    {
      "epoch": 0.2874003714762655,
      "grad_norm": 0.07113203406333923,
      "learning_rate": 0.00022246035870389952,
      "loss": 8.8182,
      "step": 18336,
      "throughput": 8857.645947663099
    },
    {
      "epoch": 0.2879019428051944,
      "grad_norm": 0.07758703827857971,
      "learning_rate": 0.00022221216019399067,
      "loss": 8.8354,
      "step": 18368,
      "throughput": 8857.217151383928
    },
    {
      "epoch": 0.2884035141341233,
      "grad_norm": 0.07979318499565125,
      "learning_rate": 0.00022196372574096357,
      "loss": 8.8236,
      "step": 18400,
      "throughput": 8857.567966195786
    },
    {
      "epoch": 0.2889050854630522,
      "grad_norm": 0.08021023869514465,
      "learning_rate": 0.00022171505636936272,
      "loss": 8.8404,
      "step": 18432,
      "throughput": 8857.892187582856
    },
    {
      "epoch": 0.28940665679198113,
      "grad_norm": 0.07425079494714737,
      "learning_rate": 0.00022146615310470125,
      "loss": 8.8425,
      "step": 18464,
      "throughput": 8856.808418432038
    },
    {
      "epoch": 0.28990822812091005,
      "grad_norm": 0.07755149155855179,
      "learning_rate": 0.0002212170169734571,
      "loss": 8.8284,
      "step": 18496,
      "throughput": 8856.443410239268
    },
    {
      "epoch": 0.29040979944983897,
      "grad_norm": 0.07777094095945358,
      "learning_rate": 0.0002209676490030683,
      "loss": 8.8119,
      "step": 18528,
      "throughput": 8856.778676716774
    },
    {
      "epoch": 0.2909113707787679,
      "grad_norm": 0.08375611156225204,
      "learning_rate": 0.0002207180502219291,
      "loss": 8.8353,
      "step": 18560,
      "throughput": 8857.11272009533
    },
    {
      "epoch": 0.29141294210769675,
      "grad_norm": 0.08027869462966919,
      "learning_rate": 0.00022046822165938565,
      "loss": 8.8138,
      "step": 18592,
      "throughput": 8857.243246798447
    },
    {
      "epoch": 0.29191451343662567,
      "grad_norm": 0.07487241923809052,
      "learning_rate": 0.00022021816434573168,
      "loss": 8.8214,
      "step": 18624,
      "throughput": 8856.791305351167
    },
    {
      "epoch": 0.2924160847655546,
      "grad_norm": 0.07839448004961014,
      "learning_rate": 0.0002199678793122043,
      "loss": 8.828,
      "step": 18656,
      "throughput": 8857.088713588137
    },
    {
      "epoch": 0.2929176560944835,
      "grad_norm": 0.07757352292537689,
      "learning_rate": 0.0002197173675909797,
      "loss": 8.8034,
      "step": 18688,
      "throughput": 8857.407527915473
    },
    {
      "epoch": 0.2934192274234124,
      "grad_norm": 0.0852331817150116,
      "learning_rate": 0.00021946663021516895,
      "loss": 8.8277,
      "step": 18720,
      "throughput": 8857.545830289808
    },
    {
      "epoch": 0.29392079875234134,
      "grad_norm": 0.07976188510656357,
      "learning_rate": 0.0002192156682188138,
      "loss": 8.8383,
      "step": 18752,
      "throughput": 8857.14916930161
    },
    {
      "epoch": 0.29442237008127026,
      "grad_norm": 0.08353574573993683,
      "learning_rate": 0.00021896448263688224,
      "loss": 8.8016,
      "step": 18784,
      "throughput": 8857.460271507598
    },
    {
      "epoch": 0.2949239414101991,
      "grad_norm": 0.07478364557027817,
      "learning_rate": 0.00021871307450526428,
      "loss": 8.819,
      "step": 18816,
      "throughput": 8857.789192633714
    },
    {
      "epoch": 0.29542551273912804,
      "grad_norm": 0.08134711533784866,
      "learning_rate": 0.00021846144486076794,
      "loss": 8.7982,
      "step": 18848,
      "throughput": 8858.006440622918
    },
    {
      "epoch": 0.29592708406805696,
      "grad_norm": 0.07802871614694595,
      "learning_rate": 0.00021820959474111448,
      "loss": 8.8167,
      "step": 18880,
      "throughput": 8857.656923278422
    },
    {
      "epoch": 0.2964286553969859,
      "grad_norm": 0.07292959839105606,
      "learning_rate": 0.00021795752518493462,
      "loss": 8.8177,
      "step": 18912,
      "throughput": 8857.952320622437
    },
    {
      "epoch": 0.2969302267259148,
      "grad_norm": 0.07705529034137726,
      "learning_rate": 0.0002177052372317639,
      "loss": 8.809,
      "step": 18944,
      "throughput": 8858.283256023147
    },
    {
      "epoch": 0.2974317980548437,
      "grad_norm": 0.07219377160072327,
      "learning_rate": 0.00021745273192203871,
      "loss": 8.7992,
      "step": 18976,
      "throughput": 8858.485744890328
    },
    {
      "epoch": 0.2979333693837726,
      "grad_norm": 0.07287011295557022,
      "learning_rate": 0.00021720001029709152,
      "loss": 8.8159,
      "step": 19008,
      "throughput": 8858.189442588646
    },
    {
      "epoch": 0.2984349407127015,
      "grad_norm": 0.08055272698402405,
      "learning_rate": 0.00021694707339914722,
      "loss": 8.8072,
      "step": 19040,
      "throughput": 8858.477380933875
    },
    {
      "epoch": 0.2989365120416304,
      "grad_norm": 0.07618486136198044,
      "learning_rate": 0.00021669392227131816,
      "loss": 8.8157,
      "step": 19072,
      "throughput": 8858.800639897141
    },
    {
      "epoch": 0.2994380833705593,
      "grad_norm": 0.08146921545267105,
      "learning_rate": 0.0002164405579576005,
      "loss": 8.8208,
      "step": 19104,
      "throughput": 8859.004658024229
    },
    {
      "epoch": 0.29993965469948825,
      "grad_norm": 0.07948251813650131,
      "learning_rate": 0.0002161869815028694,
      "loss": 8.806,
      "step": 19136,
      "throughput": 8858.673377763416
    },
    {
      "epoch": 0.30044122602841716,
      "grad_norm": 0.07705122232437134,
      "learning_rate": 0.00021593319395287483,
      "loss": 8.8067,
      "step": 19168,
      "throughput": 8858.923571471372
    },
    {
      "epoch": 0.3009427973573461,
      "grad_norm": 0.07735167443752289,
      "learning_rate": 0.0002156791963542374,
      "loss": 8.8013,
      "step": 19200,
      "throughput": 8859.241962821041
    },
    {
      "epoch": 0.30144436868627494,
      "grad_norm": 0.07171089202165604,
      "learning_rate": 0.00021542498975444404,
      "loss": 8.7911,
      "step": 19232,
      "throughput": 8859.40818699571
    },
    {
      "epoch": 0.30194594001520386,
      "grad_norm": 0.07420752197504044,
      "learning_rate": 0.0002151705752018435,
      "loss": 8.7965,
      "step": 19264,
      "throughput": 8859.045418316844
    },
    {
      "epoch": 0.3024475113441328,
      "grad_norm": 0.07215207815170288,
      "learning_rate": 0.0002149159537456421,
      "loss": 8.8041,
      "step": 19296,
      "throughput": 8859.322351742645
    },
    {
      "epoch": 0.3029490826730617,
      "grad_norm": 0.0752219557762146,
      "learning_rate": 0.00021466112643589948,
      "loss": 8.7525,
      "step": 19328,
      "throughput": 8859.64663023597
    },
    {
      "epoch": 0.3034506540019906,
      "grad_norm": 0.07393915206193924,
      "learning_rate": 0.00021440609432352427,
      "loss": 8.8176,
      "step": 19360,
      "throughput": 8859.80417616545
    },
    {
      "epoch": 0.30395222533091953,
      "grad_norm": 0.071009062230587,
      "learning_rate": 0.00021415085846026961,
      "loss": 8.8123,
      "step": 19392,
      "throughput": 8859.451551701533
    },
    {
      "epoch": 0.30445379665984845,
      "grad_norm": 0.07098925113677979,
      "learning_rate": 0.00021389541989872904,
      "loss": 8.7806,
      "step": 19424,
      "throughput": 8859.725222106339
    },
    {
      "epoch": 0.3049553679887773,
      "grad_norm": 0.07782450318336487,
      "learning_rate": 0.00021363977969233186,
      "loss": 8.806,
      "step": 19456,
      "throughput": 8860.046117312131
    },
    {
      "epoch": 0.30545693931770623,
      "grad_norm": 0.07848142087459564,
      "learning_rate": 0.000213383938895339,
      "loss": 8.8004,
      "step": 19488,
      "throughput": 8860.240900857605
    },
    {
      "epoch": 0.30595851064663515,
      "grad_norm": 0.07593582570552826,
      "learning_rate": 0.00021312789856283885,
      "loss": 8.7832,
      "step": 19520,
      "throughput": 8859.933631407544
    },
    {
      "epoch": 0.30646008197556407,
      "grad_norm": 0.07840114831924438,
      "learning_rate": 0.0002128716597507423,
      "loss": 8.8136,
      "step": 19552,
      "throughput": 8860.162457203185
    },
    {
      "epoch": 0.306961653304493,
      "grad_norm": 0.07489600032567978,
      "learning_rate": 0.00021261522351577906,
      "loss": 8.8106,
      "step": 19584,
      "throughput": 8860.481862699764
    },
    {
      "epoch": 0.3074632246334219,
      "grad_norm": 0.07132317125797272,
      "learning_rate": 0.00021235859091549294,
      "loss": 8.8066,
      "step": 19616,
      "throughput": 8860.70040022265
    },
    {
      "epoch": 0.3079647959623508,
      "grad_norm": 0.07415090501308441,
      "learning_rate": 0.0002121017630082375,
      "loss": 8.7978,
      "step": 19648,
      "throughput": 8860.37478055252
    },
    {
      "epoch": 0.3084663672912797,
      "grad_norm": 0.07169399410486221,
      "learning_rate": 0.0002118447408531718,
      "loss": 8.7662,
      "step": 19680,
      "throughput": 8860.571476568459
    },
    {
      "epoch": 0.3089679386202086,
      "grad_norm": 0.07761473208665848,
      "learning_rate": 0.00021158752551025603,
      "loss": 8.784,
      "step": 19712,
      "throughput": 8860.875005279453
    },
    {
      "epoch": 0.3094695099491375,
      "grad_norm": 0.06970509141683578,
      "learning_rate": 0.0002113301180402469,
      "loss": 8.8111,
      "step": 19744,
      "throughput": 8861.079905140443
    },
    {
      "epoch": 0.30997108127806644,
      "grad_norm": 0.08040442317724228,
      "learning_rate": 0.0002110725195046937,
      "loss": 8.8005,
      "step": 19776,
      "throughput": 8860.789572291105
    },
    {
      "epoch": 0.31047265260699536,
      "grad_norm": 0.0767935961484909,
      "learning_rate": 0.00021081473096593348,
      "loss": 8.7964,
      "step": 19808,
      "throughput": 8861.053776804249
    },
    {
      "epoch": 0.3109742239359243,
      "grad_norm": 0.08461003750562668,
      "learning_rate": 0.000210556753487087,
      "loss": 8.788,
      "step": 19840,
      "throughput": 8861.359350502951
    },
    {
      "epoch": 0.3114757952648532,
      "grad_norm": 0.07828964293003082,
      "learning_rate": 0.00021029858813205408,
      "loss": 8.7645,
      "step": 19872,
      "throughput": 8861.55610888214
    },
    {
      "epoch": 0.31197736659378206,
      "grad_norm": 0.07688968628644943,
      "learning_rate": 0.00021004023596550946,
      "loss": 8.7912,
      "step": 19904,
      "throughput": 8861.254054605806
    },
    {
      "epoch": 0.312478937922711,
      "grad_norm": 0.07793660461902618,
      "learning_rate": 0.00020978169805289823,
      "loss": 8.7965,
      "step": 19936,
      "throughput": 8861.43085922013
    },
    {
      "epoch": 0.3129805092516399,
      "grad_norm": 0.08023947477340698,
      "learning_rate": 0.0002095229754604315,
      "loss": 8.7836,
      "step": 19968,
      "throughput": 8861.73919396938
    },
    {
      "epoch": 0.3134820805805688,
      "grad_norm": 0.06911280006170273,
      "learning_rate": 0.00020926406925508202,
      "loss": 8.7903,
      "step": 20000,
      "throughput": 8861.956996963781
    },
    {
      "epoch": 0.31398365190949773,
      "grad_norm": 0.07631926983594894,
      "learning_rate": 0.00020900498050457973,
      "loss": 8.8002,
      "step": 20032,
      "throughput": 8861.663725932234
    },
    {
      "epoch": 0.31448522323842665,
      "grad_norm": 0.07329485565423965,
      "learning_rate": 0.0002087457102774074,
      "loss": 8.7874,
      "step": 20064,
      "throughput": 8861.784496418903
    },
    {
      "epoch": 0.31498679456735557,
      "grad_norm": 0.0776711255311966,
      "learning_rate": 0.00020848625964279622,
      "loss": 8.7753,
      "step": 20096,
      "throughput": 8862.072151779965
    },
    {
      "epoch": 0.31548836589628443,
      "grad_norm": 0.07343296706676483,
      "learning_rate": 0.0002082266296707214,
      "loss": 8.7819,
      "step": 20128,
      "throughput": 8862.30263300387
    },
    {
      "epoch": 0.31598993722521335,
      "grad_norm": 0.07486861944198608,
      "learning_rate": 0.0002079668214318977,
      "loss": 8.7772,
      "step": 20160,
      "throughput": 8862.023287445149
    },
    {
      "epoch": 0.31649150855414226,
      "grad_norm": 0.07053958624601364,
      "learning_rate": 0.00020770683599777507,
      "loss": 8.7748,
      "step": 20192,
      "throughput": 8862.130224170123
    },
    {
      "epoch": 0.3169930798830712,
      "grad_norm": 0.07576991617679596,
      "learning_rate": 0.0002074466744405342,
      "loss": 8.7726,
      "step": 20224,
      "throughput": 8862.436196256274
    },
    {
      "epoch": 0.3174946512120001,
      "grad_norm": 0.07820528000593185,
      "learning_rate": 0.00020718633783308214,
      "loss": 8.764,
      "step": 20256,
      "throughput": 8862.643837958087
    },
    {
      "epoch": 0.317996222540929,
      "grad_norm": 0.07988286018371582,
      "learning_rate": 0.00020692582724904778,
      "loss": 8.7765,
      "step": 20288,
      "throughput": 8862.373019248727
    },
    {
      "epoch": 0.31849779386985794,
      "grad_norm": 0.07375448942184448,
      "learning_rate": 0.00020666514376277762,
      "loss": 8.7767,
      "step": 20320,
      "throughput": 8862.507849895823
    },
    {
      "epoch": 0.3189993651987868,
      "grad_norm": 0.07463452219963074,
      "learning_rate": 0.00020640428844933108,
      "loss": 8.7799,
      "step": 20352,
      "throughput": 8862.80857270762
    },
    {
      "epoch": 0.3195009365277157,
      "grad_norm": 0.0773720070719719,
      "learning_rate": 0.00020614326238447623,
      "loss": 8.7773,
      "step": 20384,
      "throughput": 8863.007704410107
    },
    {
      "epoch": 0.32000250785664464,
      "grad_norm": 0.07711444795131683,
      "learning_rate": 0.0002058820666446854,
      "loss": 8.7749,
      "step": 20416,
      "throughput": 8862.702690685419
    },
    {
      "epoch": 0.32050407918557355,
      "grad_norm": 0.07666298002004623,
      "learning_rate": 0.00020562070230713058,
      "loss": 8.7838,
      "step": 20448,
      "throughput": 8862.813511884808
    },
    {
      "epoch": 0.32100565051450247,
      "grad_norm": 0.08265794813632965,
      "learning_rate": 0.00020535917044967899,
      "loss": 8.7659,
      "step": 20480,
      "throughput": 8863.104435326033
    },
    {
      "epoch": 0.3215072218434314,
      "grad_norm": 0.07821661978960037,
      "learning_rate": 0.00020509747215088887,
      "loss": 8.7716,
      "step": 20512,
      "throughput": 8862.248408416432
    },
    {
      "epoch": 0.3220087931723603,
      "grad_norm": 0.07673295587301254,
      "learning_rate": 0.00020483560849000475,
      "loss": 8.7724,
      "step": 20544,
      "throughput": 8861.950632851338
    },
    {
      "epoch": 0.32251036450128917,
      "grad_norm": 0.07887155562639236,
      "learning_rate": 0.00020457358054695317,
      "loss": 8.7633,
      "step": 20576,
      "throughput": 8862.104116942202
    },
    {
      "epoch": 0.3230119358302181,
      "grad_norm": 0.0740322545170784,
      "learning_rate": 0.00020431138940233808,
      "loss": 8.7607,
      "step": 20608,
      "throughput": 8862.405699218747
    },
    {
      "epoch": 0.323513507159147,
      "grad_norm": 0.07571806013584137,
      "learning_rate": 0.00020404903613743664,
      "loss": 8.7607,
      "step": 20640,
      "throughput": 8862.580925632892
    },
    {
      "epoch": 0.3240150784880759,
      "grad_norm": 0.07117355614900589,
      "learning_rate": 0.0002037865218341944,
      "loss": 8.7698,
      "step": 20672,
      "throughput": 8862.345848437657
    },
    {
      "epoch": 0.32451664981700484,
      "grad_norm": 0.07989663630723953,
      "learning_rate": 0.00020352384757522113,
      "loss": 8.7533,
      "step": 20704,
      "throughput": 8862.46471189943
    },
    {
      "epoch": 0.32501822114593376,
      "grad_norm": 0.07916408777236938,
      "learning_rate": 0.00020326101444378633,
      "loss": 8.7779,
      "step": 20736,
      "throughput": 8862.762289104425
    },
    {
      "epoch": 0.3255197924748627,
      "grad_norm": 0.07352910190820694,
      "learning_rate": 0.0002029980235238145,
      "loss": 8.7761,
      "step": 20768,
      "throughput": 8862.900620640583
    },
    {
      "epoch": 0.32602136380379154,
      "grad_norm": 0.0776766985654831,
      "learning_rate": 0.0002027348758998811,
      "loss": 8.7684,
      "step": 20800,
      "throughput": 8862.731017971897
    },
    {
      "epoch": 0.32652293513272046,
      "grad_norm": 0.07036112248897552,
      "learning_rate": 0.0002024715726572076,
      "loss": 8.781,
      "step": 20832,
      "throughput": 8862.792925614236
    },
    {
      "epoch": 0.3270245064616494,
      "grad_norm": 0.07312353700399399,
      "learning_rate": 0.0002022081148816574,
      "loss": 8.7549,
      "step": 20864,
      "throughput": 8863.089293067325
    },
    {
      "epoch": 0.3275260777905783,
      "grad_norm": 0.07475027441978455,
      "learning_rate": 0.0002019445036597312,
      "loss": 8.7529,
      "step": 20896,
      "throughput": 8863.261879082387
    },
    {
      "epoch": 0.3280276491195072,
      "grad_norm": 0.0749066025018692,
      "learning_rate": 0.00020168074007856232,
      "loss": 8.7625,
      "step": 20928,
      "throughput": 8863.12822142747
    },
    {
      "epoch": 0.32852922044843613,
      "grad_norm": 0.07506958395242691,
      "learning_rate": 0.00020141682522591272,
      "loss": 8.7479,
      "step": 20960,
      "throughput": 8863.186783284209
    },
    {
      "epoch": 0.32903079177736505,
      "grad_norm": 0.07391706109046936,
      "learning_rate": 0.0002011527601901679,
      "loss": 8.767,
      "step": 20992,
      "throughput": 8863.46507968349
    },
    {
      "epoch": 0.3295323631062939,
      "grad_norm": 0.08027796447277069,
      "learning_rate": 0.00020088854606033292,
      "loss": 8.7727,
      "step": 21024,
      "throughput": 8863.629659452417
    },
    {
      "epoch": 0.33003393443522283,
      "grad_norm": 0.07504010945558548,
      "learning_rate": 0.00020062418392602767,
      "loss": 8.7545,
      "step": 21056,
      "throughput": 8863.450811395789
    },
    {
      "epoch": 0.33053550576415175,
      "grad_norm": 0.07410600036382675,
      "learning_rate": 0.00020035967487748226,
      "loss": 8.7675,
      "step": 21088,
      "throughput": 8863.513552246986
    },
    {
      "epoch": 0.33103707709308067,
      "grad_norm": 0.07737985253334045,
      "learning_rate": 0.00020009502000553286,
      "loss": 8.7567,
      "step": 21120,
      "throughput": 8863.794578497278
    },
    {
      "epoch": 0.3315386484220096,
      "grad_norm": 0.07133954018354416,
      "learning_rate": 0.00019983022040161692,
      "loss": 8.7443,
      "step": 21152,
      "throughput": 8863.957209664966
    },
    {
      "epoch": 0.3320402197509385,
      "grad_norm": 0.07269904017448425,
      "learning_rate": 0.00019956527715776887,
      "loss": 8.7645,
      "step": 21184,
      "throughput": 8863.802577653145
    },
    {
      "epoch": 0.3325417910798674,
      "grad_norm": 0.07274427264928818,
      "learning_rate": 0.0001993001913666153,
      "loss": 8.7624,
      "step": 21216,
      "throughput": 8863.817185842325
    },
    {
      "epoch": 0.3330433624087963,
      "grad_norm": 0.09224282950162888,
      "learning_rate": 0.00019903496412137093,
      "loss": 8.7587,
      "step": 21248,
      "throughput": 8864.10281088239
    },
    {
      "epoch": 0.3335449337377252,
      "grad_norm": 0.08311135321855545,
      "learning_rate": 0.00019876959651583362,
      "loss": 8.7614,
      "step": 21280,
      "throughput": 8864.230523668733
    },
    {
      "epoch": 0.3340465050666541,
      "grad_norm": 0.07675494253635406,
      "learning_rate": 0.00019850408964438023,
      "loss": 8.769,
      "step": 21312,
      "throughput": 8864.095079543797
    },
    {
      "epoch": 0.33454807639558304,
      "grad_norm": 0.08694007992744446,
      "learning_rate": 0.00019823844460196177,
      "loss": 8.763,
      "step": 21344,
      "throughput": 8864.0769825026
    },
    {
      "epoch": 0.33504964772451196,
      "grad_norm": 0.07510685920715332,
      "learning_rate": 0.00019797266248409932,
      "loss": 8.7363,
      "step": 21376,
      "throughput": 8864.363601091953
    },
    {
      "epoch": 0.3355512190534409,
      "grad_norm": 0.07192866504192352,
      "learning_rate": 0.000197706744386879,
      "loss": 8.7521,
      "step": 21408,
      "throughput": 8864.56809397559
    },
    {
      "epoch": 0.3360527903823698,
      "grad_norm": 0.07726701349020004,
      "learning_rate": 0.00019744069140694795,
      "loss": 8.7686,
      "step": 21440,
      "throughput": 8864.399078833221
    },
    {
      "epoch": 0.33655436171129866,
      "grad_norm": 0.07312531024217606,
      "learning_rate": 0.00019717450464150935,
      "loss": 8.7255,
      "step": 21472,
      "throughput": 8864.395924493345
    },
    {
      "epoch": 0.3370559330402276,
      "grad_norm": 0.07701530307531357,
      "learning_rate": 0.00019690818518831827,
      "loss": 8.7591,
      "step": 21504,
      "throughput": 8864.67034622098
    },
    {
      "epoch": 0.3375575043691565,
      "grad_norm": 0.0789533257484436,
      "learning_rate": 0.0001966417341456769,
      "loss": 8.7412,
      "step": 21536,
      "throughput": 8864.858013910027
    },
    {
      "epoch": 0.3380590756980854,
      "grad_norm": 0.07506786286830902,
      "learning_rate": 0.0001963751526124301,
      "loss": 8.7517,
      "step": 21568,
      "throughput": 8864.705015491547
    },
    {
      "epoch": 0.3385606470270143,
      "grad_norm": 0.08154984563589096,
      "learning_rate": 0.00019610844168796096,
      "loss": 8.766,
      "step": 21600,
      "throughput": 8864.701365582961
    },
    {
      "epoch": 0.33906221835594325,
      "grad_norm": 0.07672927528619766,
      "learning_rate": 0.0001958416024721861,
      "loss": 8.74,
      "step": 21632,
      "throughput": 8864.980959781919
    },
    {
      "epoch": 0.33956378968487216,
      "grad_norm": 0.07608082890510559,
      "learning_rate": 0.00019557463606555118,
      "loss": 8.7345,
      "step": 21664,
      "throughput": 8865.14812470399
    },
    {
      "epoch": 0.340065361013801,
      "grad_norm": 0.07585503160953522,
      "learning_rate": 0.0001953075435690266,
      "loss": 8.7663,
      "step": 21696,
      "throughput": 8865.01654954425
    },
    {
      "epoch": 0.34056693234272994,
      "grad_norm": 0.07558625936508179,
      "learning_rate": 0.0001950403260841024,
      "loss": 8.7208,
      "step": 21728,
      "throughput": 8865.01576011177
    },
    {
      "epoch": 0.34106850367165886,
      "grad_norm": 0.07169059664011002,
      "learning_rate": 0.0001947729847127845,
      "loss": 8.7642,
      "step": 21760,
      "throughput": 8865.297653033553
    },
    {
      "epoch": 0.3415700750005878,
      "grad_norm": 0.07630694657564163,
      "learning_rate": 0.00019450552055758934,
      "loss": 8.7271,
      "step": 21792,
      "throughput": 8865.502595810825
    },
    {
      "epoch": 0.3420716463295167,
      "grad_norm": 0.07808644324541092,
      "learning_rate": 0.00019423793472153996,
      "loss": 8.7277,
      "step": 21824,
      "throughput": 8865.337814499435
    },
    {
      "epoch": 0.3425732176584456,
      "grad_norm": 0.07151120156049728,
      "learning_rate": 0.0001939702283081611,
      "loss": 8.7362,
      "step": 21856,
      "throughput": 8865.367864029218
    },
    {
      "epoch": 0.34307478898737453,
      "grad_norm": 0.07521945238113403,
      "learning_rate": 0.00019370240242147488,
      "loss": 8.748,
      "step": 21888,
      "throughput": 8865.610709442726
    },
    {
      "epoch": 0.3435763603163034,
      "grad_norm": 0.0807991772890091,
      "learning_rate": 0.000193434458165996,
      "loss": 8.7549,
      "step": 21920,
      "throughput": 8865.805759537128
    },
    {
      "epoch": 0.3440779316452323,
      "grad_norm": 0.0741187185049057,
      "learning_rate": 0.00019316639664672733,
      "loss": 8.7242,
      "step": 21952,
      "throughput": 8865.651708866617
    },
    {
      "epoch": 0.34457950297416123,
      "grad_norm": 0.07587863504886627,
      "learning_rate": 0.00019289821896915544,
      "loss": 8.7291,
      "step": 21984,
      "throughput": 8865.681779431436
    },
    {
      "epoch": 0.34508107430309015,
      "grad_norm": 0.07360213249921799,
      "learning_rate": 0.00019262992623924585,
      "loss": 8.7376,
      "step": 22016,
      "throughput": 8865.925156901621
    },
    {
      "epoch": 0.34558264563201907,
      "grad_norm": 0.07608392089605331,
      "learning_rate": 0.00019236151956343852,
      "loss": 8.7118,
      "step": 22048,
      "throughput": 8866.107911752862
    },
    {
      "epoch": 0.346084216960948,
      "grad_norm": 0.07788034528493881,
      "learning_rate": 0.00019209300004864341,
      "loss": 8.7388,
      "step": 22080,
      "throughput": 8866.039446435672
    },
    {
      "epoch": 0.3465857882898769,
      "grad_norm": 0.07935154438018799,
      "learning_rate": 0.00019182436880223585,
      "loss": 8.7374,
      "step": 22112,
      "throughput": 8865.998122149582
    },
    {
      "epoch": 0.34708735961880577,
      "grad_norm": 0.07901440560817719,
      "learning_rate": 0.00019155562693205178,
      "loss": 8.7314,
      "step": 22144,
      "throughput": 8866.230615305081
    },
    {
      "epoch": 0.3475889309477347,
      "grad_norm": 0.07936616241931915,
      "learning_rate": 0.00019128677554638355,
      "loss": 8.7069,
      "step": 22176,
      "throughput": 8866.37955423216
    },
    {
      "epoch": 0.3480905022766636,
      "grad_norm": 0.08413133025169373,
      "learning_rate": 0.0001910178157539751,
      "loss": 8.7277,
      "step": 22208,
      "throughput": 8866.289248236735
    },
    {
      "epoch": 0.3485920736055925,
      "grad_norm": 0.07251808792352676,
      "learning_rate": 0.00019074874866401733,
      "loss": 8.7532,
      "step": 22240,
      "throughput": 8866.223294338466
    },
    {
      "epoch": 0.34909364493452144,
      "grad_norm": 0.07380632311105728,
      "learning_rate": 0.00019047957538614375,
      "loss": 8.7284,
      "step": 22272,
      "throughput": 8866.443971829462
    },
    {
      "epoch": 0.34959521626345036,
      "grad_norm": 0.08033832162618637,
      "learning_rate": 0.00019021029703042576,
      "loss": 8.7286,
      "step": 22304,
      "throughput": 8866.646670464452
    },
    {
      "epoch": 0.3500967875923793,
      "grad_norm": 0.07755397260189056,
      "learning_rate": 0.0001899409147073681,
      "loss": 8.7335,
      "step": 22336,
      "throughput": 8866.487407628047
    },
    {
      "epoch": 0.35059835892130814,
      "grad_norm": 0.0786450207233429,
      "learning_rate": 0.0001896714295279043,
      "loss": 8.7219,
      "step": 22368,
      "throughput": 8866.459515548839
    },
    {
      "epoch": 0.35109993025023706,
      "grad_norm": 0.07564543187618256,
      "learning_rate": 0.00018940184260339194,
      "loss": 8.7401,
      "step": 22400,
      "throughput": 8866.67087849643
    },
    {
      "epoch": 0.351601501579166,
      "grad_norm": 0.07476343214511871,
      "learning_rate": 0.00018913215504560838,
      "loss": 8.7529,
      "step": 22432,
      "throughput": 8866.867201927287
    },
    {
      "epoch": 0.3521030729080949,
      "grad_norm": 0.07755191624164581,
      "learning_rate": 0.0001888623679667459,
      "loss": 8.7243,
      "step": 22464,
      "throughput": 8866.767721777669
    },
    {
      "epoch": 0.3526046442370238,
      "grad_norm": 0.07201996445655823,
      "learning_rate": 0.00018859248247940722,
      "loss": 8.731,
      "step": 22496,
      "throughput": 8866.769220643666
    },
    {
      "epoch": 0.35310621556595273,
      "grad_norm": 0.07190986722707748,
      "learning_rate": 0.0001883224996966008,
      "loss": 8.7428,
      "step": 22528,
      "throughput": 8866.943729486306
    },
    {
      "epoch": 0.35360778689488165,
      "grad_norm": 0.07117670029401779,
      "learning_rate": 0.00018805242073173653,
      "loss": 8.7311,
      "step": 22560,
      "throughput": 8866.144862009913
    },
    {
      "epoch": 0.3541093582238105,
      "grad_norm": 0.07088455557823181,
      "learning_rate": 0.00018778224669862087,
      "loss": 8.7338,
      "step": 22592,
      "throughput": 8866.042481283248
    },
    {
      "epoch": 0.35461092955273943,
      "grad_norm": 0.07562088221311569,
      "learning_rate": 0.0001875119787114523,
      "loss": 8.7135,
      "step": 22624,
      "throughput": 8866.069381839432
    },
    {
      "epoch": 0.35511250088166835,
      "grad_norm": 0.07029681652784348,
      "learning_rate": 0.00018724161788481676,
      "loss": 8.7263,
      "step": 22656,
      "throughput": 8866.261751059588
    },
    {
      "epoch": 0.35561407221059727,
      "grad_norm": 0.07344888150691986,
      "learning_rate": 0.00018697116533368316,
      "loss": 8.7098,
      "step": 22688,
      "throughput": 8866.397560376685
    },
    {
      "epoch": 0.3561156435395262,
      "grad_norm": 0.07342462986707687,
      "learning_rate": 0.00018670062217339867,
      "loss": 8.7426,
      "step": 22720,
      "throughput": 8866.329069189647
    },
    {
      "epoch": 0.3566172148684551,
      "grad_norm": 0.07215454429388046,
      "learning_rate": 0.0001864299895196839,
      "loss": 8.7513,
      "step": 22752,
      "throughput": 8866.362397261639
    },
    {
      "epoch": 0.357118786197384,
      "grad_norm": 0.07690445333719254,
      "learning_rate": 0.00018615926848862893,
      "loss": 8.7467,
      "step": 22784,
      "throughput": 8866.53666383422
    },
    {
      "epoch": 0.3576203575263129,
      "grad_norm": 0.07840703427791595,
      "learning_rate": 0.00018588846019668793,
      "loss": 8.7088,
      "step": 22816,
      "throughput": 8866.691489926814
    },
    {
      "epoch": 0.3581219288552418,
      "grad_norm": 0.0753103718161583,
      "learning_rate": 0.00018561756576067524,
      "loss": 8.7098,
      "step": 22848,
      "throughput": 8866.619402757455
    },
    {
      "epoch": 0.3586235001841707,
      "grad_norm": 0.07334914058446884,
      "learning_rate": 0.0001853465862977602,
      "loss": 8.7045,
      "step": 22880,
      "throughput": 8866.65715732974
    },
    {
      "epoch": 0.35912507151309964,
      "grad_norm": 0.07309671491384506,
      "learning_rate": 0.00018507552292546295,
      "loss": 8.6958,
      "step": 22912,
      "throughput": 8866.789061564716
    },
    {
      "epoch": 0.35962664284202855,
      "grad_norm": 0.07427306473255157,
      "learning_rate": 0.00018480437676164968,
      "loss": 8.6913,
      "step": 22944,
      "throughput": 8866.966795257267
    },
    {
      "epoch": 0.3601282141709575,
      "grad_norm": 0.07504558563232422,
      "learning_rate": 0.00018453314892452795,
      "loss": 8.7261,
      "step": 22976,
      "throughput": 8866.94648140496
    },
    {
      "epoch": 0.36062978549988634,
      "grad_norm": 0.07671528309583664,
      "learning_rate": 0.00018426184053264215,
      "loss": 8.7222,
      "step": 23008,
      "throughput": 8866.981521554353
    },
    {
      "epoch": 0.36113135682881525,
      "grad_norm": 0.07723363488912582,
      "learning_rate": 0.0001839904527048689,
      "loss": 8.7267,
      "step": 23040,
      "throughput": 8867.111959507398
    },
    {
      "epoch": 0.36163292815774417,
      "grad_norm": 0.07163956016302109,
      "learning_rate": 0.0001837189865604124,
      "loss": 8.698,
      "step": 23072,
      "throughput": 8867.259393831302
    },
    {
      "epoch": 0.3621344994866731,
      "grad_norm": 0.07297907024621964,
      "learning_rate": 0.00018344744321879987,
      "loss": 8.7214,
      "step": 23104,
      "throughput": 8867.24415376364
    },
    {
      "epoch": 0.362636070815602,
      "grad_norm": 0.07717998325824738,
      "learning_rate": 0.0001831758237998768,
      "loss": 8.6994,
      "step": 23136,
      "throughput": 8867.269568652184
    },
    {
      "epoch": 0.3631376421445309,
      "grad_norm": 0.06853833794593811,
      "learning_rate": 0.00018290412942380252,
      "loss": 8.6805,
      "step": 23168,
      "throughput": 8867.423731234427
    },
    {
      "epoch": 0.36363921347345984,
      "grad_norm": 0.08529718220233917,
      "learning_rate": 0.00018263236121104543,
      "loss": 8.7047,
      "step": 23200,
      "throughput": 8867.61257424937
    },
    {
      "epoch": 0.3641407848023887,
      "grad_norm": 0.07281111925840378,
      "learning_rate": 0.00018236052028237847,
      "loss": 8.7055,
      "step": 23232,
      "throughput": 8867.513943135138
    },
    {
      "epoch": 0.3646423561313176,
      "grad_norm": 0.07215279340744019,
      "learning_rate": 0.0001820886077588744,
      "loss": 8.7075,
      "step": 23264,
      "throughput": 8867.534972717942
    },
    {
      "epoch": 0.36514392746024654,
      "grad_norm": 0.07825150340795517,
      "learning_rate": 0.00018181662476190127,
      "loss": 8.7221,
      "step": 23296,
      "throughput": 8867.653692389635
    },
    {
      "epoch": 0.36564549878917546,
      "grad_norm": 0.06967805325984955,
      "learning_rate": 0.00018154457241311773,
      "loss": 8.7052,
      "step": 23328,
      "throughput": 8867.854170707304
    },
    {
      "epoch": 0.3661470701181044,
      "grad_norm": 0.07603953033685684,
      "learning_rate": 0.00018127245183446858,
      "loss": 8.6969,
      "step": 23360,
      "throughput": 8867.761518804655
    },
    {
      "epoch": 0.3666486414470333,
      "grad_norm": 0.07054081559181213,
      "learning_rate": 0.00018100026414817987,
      "loss": 8.715,
      "step": 23392,
      "throughput": 8867.792561097536
    },
    {
      "epoch": 0.3671502127759622,
      "grad_norm": 0.0735711082816124,
      "learning_rate": 0.00018072801047675432,
      "loss": 8.7058,
      "step": 23424,
      "throughput": 8867.919506011427
    },
    {
      "epoch": 0.3676517841048911,
      "grad_norm": 0.07133655995130539,
      "learning_rate": 0.00018045569194296697,
      "loss": 8.7024,
      "step": 23456,
      "throughput": 8868.07518636643
    },
    {
      "epoch": 0.36815335543382,
      "grad_norm": 0.07236292213201523,
      "learning_rate": 0.00018018330966986022,
      "loss": 8.6952,
      "step": 23488,
      "throughput": 8868.049974110492
    },
    {
      "epoch": 0.3686549267627489,
      "grad_norm": 0.07108893990516663,
      "learning_rate": 0.00017991086478073943,
      "loss": 8.7161,
      "step": 23520,
      "throughput": 8868.052097936794
    },
    {
      "epoch": 0.36915649809167783,
      "grad_norm": 0.08245235681533813,
      "learning_rate": 0.0001796383583991681,
      "loss": 8.7005,
      "step": 23552,
      "throughput": 8868.188362773484
    },
    {
      "epoch": 0.36965806942060675,
      "grad_norm": 0.07327444851398468,
      "learning_rate": 0.00017936579164896333,
      "loss": 8.7022,
      "step": 23584,
      "throughput": 8868.340869115167
    },
    {
      "epoch": 0.37015964074953567,
      "grad_norm": 0.0701148584485054,
      "learning_rate": 0.0001790931656541912,
      "loss": 8.7133,
      "step": 23616,
      "throughput": 8868.29974006014
    },
    {
      "epoch": 0.3706612120784646,
      "grad_norm": 0.07192771881818771,
      "learning_rate": 0.00017882048153916214,
      "loss": 8.7273,
      "step": 23648,
      "throughput": 8868.346506488127
    },
    {
      "epoch": 0.37116278340739345,
      "grad_norm": 0.07537954300642014,
      "learning_rate": 0.00017854774042842626,
      "loss": 8.7159,
      "step": 23680,
      "throughput": 8868.482967951622
    },
    {
      "epoch": 0.37166435473632237,
      "grad_norm": 0.07714162021875381,
      "learning_rate": 0.00017827494344676873,
      "loss": 8.7134,
      "step": 23712,
      "throughput": 8868.679327198255
    },
    {
      "epoch": 0.3721659260652513,
      "grad_norm": 0.0708460807800293,
      "learning_rate": 0.000178002091719205,
      "loss": 8.6908,
      "step": 23744,
      "throughput": 8868.578581239224
    },
    {
      "epoch": 0.3726674973941802,
      "grad_norm": 0.070463165640831,
      "learning_rate": 0.00017772918637097657,
      "loss": 8.7001,
      "step": 23776,
      "throughput": 8868.63710866715
    },
    {
      "epoch": 0.3731690687231091,
      "grad_norm": 0.07860637456178665,
      "learning_rate": 0.00017745622852754575,
      "loss": 8.7026,
      "step": 23808,
      "throughput": 8868.75675413827
    },
    {
      "epoch": 0.37367064005203804,
      "grad_norm": 0.07869873940944672,
      "learning_rate": 0.00017718321931459163,
      "loss": 8.6933,
      "step": 23840,
      "throughput": 8868.94579659867
    },
    {
      "epoch": 0.37417221138096696,
      "grad_norm": 0.08232009410858154,
      "learning_rate": 0.00017691015985800488,
      "loss": 8.7128,
      "step": 23872,
      "throughput": 8868.846136915936
    },
    {
      "epoch": 0.3746737827098958,
      "grad_norm": 0.07898972928524017,
      "learning_rate": 0.0001766370512838836,
      "loss": 8.693,
      "step": 23904,
      "throughput": 8868.847818264216
    },
    {
      "epoch": 0.37517535403882474,
      "grad_norm": 0.0719093382358551,
      "learning_rate": 0.00017636389471852834,
      "loss": 8.6928,
      "step": 23936,
      "throughput": 8869.011717480415
    },
    {
      "epoch": 0.37567692536775366,
      "grad_norm": 0.07355839759111404,
      "learning_rate": 0.0001760906912884376,
      "loss": 8.6936,
      "step": 23968,
      "throughput": 8869.146767865981
    },
    {
      "epoch": 0.3761784966966826,
      "grad_norm": 0.07554195076227188,
      "learning_rate": 0.00017581744212030308,
      "loss": 8.6896,
      "step": 24000,
      "throughput": 8869.047406839176
    },
    {
      "epoch": 0.3766800680256115,
      "grad_norm": 0.07405807077884674,
      "learning_rate": 0.00017554414834100525,
      "loss": 8.681,
      "step": 24032,
      "throughput": 8869.049468682455
    },
    {
      "epoch": 0.3771816393545404,
      "grad_norm": 0.07666285336017609,
      "learning_rate": 0.00017527081107760834,
      "loss": 8.6829,
      "step": 24064,
      "throughput": 8869.210629828673
    },
    {
      "epoch": 0.37768321068346933,
      "grad_norm": 0.08341163396835327,
      "learning_rate": 0.00017499743145735615,
      "loss": 8.695,
      "step": 24096,
      "throughput": 8869.362543407795
    },
    {
      "epoch": 0.3781847820123982,
      "grad_norm": 0.07192159444093704,
      "learning_rate": 0.00017472401060766697,
      "loss": 8.6939,
      "step": 24128,
      "throughput": 8869.239088716518
    },
    {
      "epoch": 0.3786863533413271,
      "grad_norm": 0.07966649532318115,
      "learning_rate": 0.0001744505496561292,
      "loss": 8.698,
      "step": 24160,
      "throughput": 8869.33513356526
    },
    {
      "epoch": 0.379187924670256,
      "grad_norm": 0.07046504318714142,
      "learning_rate": 0.00017417704973049668,
      "loss": 8.7069,
      "step": 24192,
      "throughput": 8869.457953968098
    },
    {
      "epoch": 0.37968949599918494,
      "grad_norm": 0.07870694249868393,
      "learning_rate": 0.00017390351195868385,
      "loss": 8.6969,
      "step": 24224,
      "throughput": 8869.603073077134
    },
    {
      "epoch": 0.38019106732811386,
      "grad_norm": 0.07509059458971024,
      "learning_rate": 0.00017362993746876135,
      "loss": 8.695,
      "step": 24256,
      "throughput": 8869.509359643629
    },
    {
      "epoch": 0.3806926386570428,
      "grad_norm": 0.0709197148680687,
      "learning_rate": 0.00017335632738895113,
      "loss": 8.694,
      "step": 24288,
      "throughput": 8869.57627443901
    },
    {
      "epoch": 0.3811942099859717,
      "grad_norm": 0.07527151703834534,
      "learning_rate": 0.000173082682847622,
      "loss": 8.6916,
      "step": 24320,
      "throughput": 8869.696624010772
    },
    {
      "epoch": 0.38169578131490056,
      "grad_norm": 0.07428012043237686,
      "learning_rate": 0.0001728090049732848,
      "loss": 8.684,
      "step": 24352,
      "throughput": 8869.84031541236
    },
    {
      "epoch": 0.3821973526438295,
      "grad_norm": 0.07557906955480576,
      "learning_rate": 0.00017253529489458802,
      "loss": 8.6978,
      "step": 24384,
      "throughput": 8869.77798243609
    },
    {
      "epoch": 0.3826989239727584,
      "grad_norm": 0.07178075611591339,
      "learning_rate": 0.00017226155374031271,
      "loss": 8.6802,
      "step": 24416,
      "throughput": 8869.83705986506
    },
    {
      "epoch": 0.3832004953016873,
      "grad_norm": 0.0776342898607254,
      "learning_rate": 0.0001719877826393683,
      "loss": 8.682,
      "step": 24448,
      "throughput": 8869.958717232506
    },
    {
      "epoch": 0.38370206663061623,
      "grad_norm": 0.07733438163995743,
      "learning_rate": 0.00017171398272078752,
      "loss": 8.67,
      "step": 24480,
      "throughput": 8870.12149243361
    },
    {
      "epoch": 0.38420363795954515,
      "grad_norm": 0.07164875417947769,
      "learning_rate": 0.00017144015511372208,
      "loss": 8.6772,
      "step": 24512,
      "throughput": 8870.061453331795
    },
    {
      "epoch": 0.38470520928847407,
      "grad_norm": 0.06802382320165634,
      "learning_rate": 0.00017116630094743792,
      "loss": 8.6741,
      "step": 24544,
      "throughput": 8870.136585820119
    },
    {
      "epoch": 0.38520678061740293,
      "grad_norm": 0.08131495863199234,
      "learning_rate": 0.00017089242135131036,
      "loss": 8.6745,
      "step": 24576,
      "throughput": 8870.257355447251
    },
    {
      "epoch": 0.38570835194633185,
      "grad_norm": 0.07555441558361053,
      "learning_rate": 0.0001706185174548197,
      "loss": 8.6685,
      "step": 24608,
      "throughput": 8869.510047014594
    },
    {
      "epoch": 0.38620992327526077,
      "grad_norm": 0.07308689504861832,
      "learning_rate": 0.0001703445903875464,
      "loss": 8.6748,
      "step": 24640,
      "throughput": 8869.508137207475
    },
    {
      "epoch": 0.3867114946041897,
      "grad_norm": 0.07100782543420792,
      "learning_rate": 0.00017007064127916644,
      "loss": 8.6804,
      "step": 24672,
      "throughput": 8869.535923334248
    },
    {
      "epoch": 0.3872130659331186,
      "grad_norm": 0.07744865119457245,
      "learning_rate": 0.0001697966712594469,
      "loss": 8.6914,
      "step": 24704,
      "throughput": 8869.636771237965
    },
    {
      "epoch": 0.3877146372620475,
      "grad_norm": 0.0693967267870903,
      "learning_rate": 0.00016952268145824082,
      "loss": 8.6752,
      "step": 24736,
      "throughput": 8869.735649269436
    },
    {
      "epoch": 0.38821620859097644,
      "grad_norm": 0.07560817897319794,
      "learning_rate": 0.00016924867300548304,
      "loss": 8.6659,
      "step": 24768,
      "throughput": 8869.670423971971
    },
    {
      "epoch": 0.3887177799199053,
      "grad_norm": 0.07274651527404785,
      "learning_rate": 0.00016897464703118515,
      "loss": 8.6905,
      "step": 24800,
      "throughput": 8869.720716639269
    },
    {
      "epoch": 0.3892193512488342,
      "grad_norm": 0.07410851866006851,
      "learning_rate": 0.00016870060466543112,
      "loss": 8.65,
      "step": 24832,
      "throughput": 8869.822123904682
    },
    {
      "epoch": 0.38972092257776314,
      "grad_norm": 0.08210521936416626,
      "learning_rate": 0.0001684265470383725,
      "loss": 8.6903,
      "step": 24864,
      "throughput": 8869.90443907778
    },
    {
      "epoch": 0.39022249390669206,
      "grad_norm": 0.07470977306365967,
      "learning_rate": 0.0001681524752802237,
      "loss": 8.6972,
      "step": 24896,
      "throughput": 8869.853368054952
    },
    {
      "epoch": 0.390724065235621,
      "grad_norm": 0.07381650805473328,
      "learning_rate": 0.00016787839052125758,
      "loss": 8.6787,
      "step": 24928,
      "throughput": 8869.890685049933
    },
    {
      "epoch": 0.3912256365645499,
      "grad_norm": 0.06976639479398727,
      "learning_rate": 0.00016760429389180037,
      "loss": 8.6935,
      "step": 24960,
      "throughput": 8870.016580322115
    },
    {
      "epoch": 0.3917272078934788,
      "grad_norm": 0.07536919414997101,
      "learning_rate": 0.00016733018652222744,
      "loss": 8.6541,
      "step": 24992,
      "throughput": 8870.083976126552
    },
    {
      "epoch": 0.3922287792224077,
      "grad_norm": 0.06791981309652328,
      "learning_rate": 0.0001670560695429584,
      "loss": 8.6712,
      "step": 25024,
      "throughput": 8870.029229798514
    },
    {
      "epoch": 0.3927303505513366,
      "grad_norm": 0.07341574877500534,
      "learning_rate": 0.00016678194408445245,
      "loss": 8.6457,
      "step": 25056,
      "throughput": 8870.067860416464
    },
    {
      "epoch": 0.3932319218802655,
      "grad_norm": 0.07675306499004364,
      "learning_rate": 0.00016650781127720382,
      "loss": 8.6806,
      "step": 25088,
      "throughput": 8870.186841482271
    },
    {
      "epoch": 0.39373349320919443,
      "grad_norm": 0.07608195394277573,
      "learning_rate": 0.00016623367225173703,
      "loss": 8.6829,
      "step": 25120,
      "throughput": 8870.274246204594
    },
    {
      "epoch": 0.39423506453812335,
      "grad_norm": 0.07675729691982269,
      "learning_rate": 0.00016595952813860216,
      "loss": 8.6735,
      "step": 25152,
      "throughput": 8870.254386805169
    },
    {
      "epoch": 0.39473663586705227,
      "grad_norm": 0.07194288820028305,
      "learning_rate": 0.00016568538006837046,
      "loss": 8.6817,
      "step": 25184,
      "throughput": 8870.327053030003
    },
    {
      "epoch": 0.3952382071959812,
      "grad_norm": 0.07365325093269348,
      "learning_rate": 0.00016541122917162934,
      "loss": 8.6625,
      "step": 25216,
      "throughput": 8870.425180152624
    },
    {
      "epoch": 0.39573977852491005,
      "grad_norm": 0.07837095856666565,
      "learning_rate": 0.00016513707657897785,
      "loss": 8.6911,
      "step": 25248,
      "throughput": 8870.557387619592
    },
    {
      "epoch": 0.39624134985383896,
      "grad_norm": 0.07529828697443008,
      "learning_rate": 0.00016486292342102215,
      "loss": 8.6722,
      "step": 25280,
      "throughput": 8870.488229558243
    },
    {
      "epoch": 0.3967429211827679,
      "grad_norm": 0.07008527964353561,
      "learning_rate": 0.0001645887708283707,
      "loss": 8.6661,
      "step": 25312,
      "throughput": 8870.529715633977
    },
    {
      "epoch": 0.3972444925116968,
      "grad_norm": 0.07274020463228226,
      "learning_rate": 0.00016431461993162954,
      "loss": 8.6695,
      "step": 25344,
      "throughput": 8870.610755685926
    },
    {
      "epoch": 0.3977460638406257,
      "grad_norm": 0.07809685915708542,
      "learning_rate": 0.00016404047186139784,
      "loss": 8.6645,
      "step": 25376,
      "throughput": 8870.67675068345
    },
    {
      "epoch": 0.39824763516955464,
      "grad_norm": 0.08027014881372452,
      "learning_rate": 0.00016376632774826297,
      "loss": 8.6604,
      "step": 25408,
      "throughput": 8870.62851229032
    },
    {
      "epoch": 0.39874920649848355,
      "grad_norm": 0.07037220895290375,
      "learning_rate": 0.0001634921887227962,
      "loss": 8.6662,
      "step": 25440,
      "throughput": 8870.68671648454
    },
    {
      "epoch": 0.3992507778274124,
      "grad_norm": 0.07897590100765228,
      "learning_rate": 0.00016321805591554755,
      "loss": 8.6595,
      "step": 25472,
      "throughput": 8870.774777575869
    },
    {
      "epoch": 0.39975234915634134,
      "grad_norm": 0.07503047585487366,
      "learning_rate": 0.00016294393045704163,
      "loss": 8.6695,
      "step": 25504,
      "throughput": 8870.866991168901
    },
    {
      "epoch": 0.40025392048527025,
      "grad_norm": 0.07193060964345932,
      "learning_rate": 0.00016266981347777255,
      "loss": 8.6627,
      "step": 25536,
      "throughput": 8870.829377769813
    },
    {
      "epoch": 0.40075549181419917,
      "grad_norm": 0.07066937536001205,
      "learning_rate": 0.00016239570610819963,
      "loss": 8.6515,
      "step": 25568,
      "throughput": 8870.865608844539
    },
    {
      "epoch": 0.4012570631431281,
      "grad_norm": 0.06965267658233643,
      "learning_rate": 0.00016212160947874242,
      "loss": 8.6543,
      "step": 25600,
      "throughput": 8870.946621920393
    },
    {
      "epoch": 0.401758634472057,
      "grad_norm": 0.07811598479747772,
      "learning_rate": 0.00016184752471977627,
      "loss": 8.6526,
      "step": 25632,
      "throughput": 8871.096432293107
    },
    {
      "epoch": 0.4022602058009859,
      "grad_norm": 0.08140557259321213,
      "learning_rate": 0.0001615734529616275,
      "loss": 8.6447,
      "step": 25664,
      "throughput": 8871.087218631283
    },
    {
      "epoch": 0.4027617771299148,
      "grad_norm": 0.08000985532999039,
      "learning_rate": 0.00016129939533456888,
      "loss": 8.6645,
      "step": 25696,
      "throughput": 8871.150626139613
    },
    {
      "epoch": 0.4032633484588437,
      "grad_norm": 0.07303277403116226,
      "learning_rate": 0.00016102535296881485,
      "loss": 8.6534,
      "step": 25728,
      "throughput": 8871.24360844604
    },
    {
      "epoch": 0.4037649197877726,
      "grad_norm": 0.07288201153278351,
      "learning_rate": 0.00016075132699451701,
      "loss": 8.6671,
      "step": 25760,
      "throughput": 8871.357493273781
    },
    {
      "epoch": 0.40426649111670154,
      "grad_norm": 0.07564268261194229,
      "learning_rate": 0.00016047731854175917,
      "loss": 8.6561,
      "step": 25792,
      "throughput": 8871.299038597108
    },
    {
      "epoch": 0.40476806244563046,
      "grad_norm": 0.07370468974113464,
      "learning_rate": 0.00016020332874055313,
      "loss": 8.6658,
      "step": 25824,
      "throughput": 8871.375135691831
    },
    {
      "epoch": 0.4052696337745594,
      "grad_norm": 0.0826638862490654,
      "learning_rate": 0.00015992935872083356,
      "loss": 8.672,
      "step": 25856,
      "throughput": 8871.475268392245
    },
    {
      "epoch": 0.4057712051034883,
      "grad_norm": 0.07711603492498398,
      "learning_rate": 0.00015965540961245363,
      "loss": 8.6426,
      "step": 25888,
      "throughput": 8871.610726872705
    },
    {
      "epoch": 0.40627277643241716,
      "grad_norm": 0.07239612936973572,
      "learning_rate": 0.0001593814825451803,
      "loss": 8.6563,
      "step": 25920,
      "throughput": 8871.537753317134
    },
    {
      "epoch": 0.4067743477613461,
      "grad_norm": 0.07643844932317734,
      "learning_rate": 0.00015910757864868967,
      "loss": 8.6636,
      "step": 25952,
      "throughput": 8871.602785647328
    },
    {
      "epoch": 0.407275919090275,
      "grad_norm": 0.06919372826814651,
      "learning_rate": 0.0001588336990525621,
      "loss": 8.6657,
      "step": 25984,
      "throughput": 8871.725906269707
    },
    {
      "epoch": 0.4077774904192039,
      "grad_norm": 0.0749737024307251,
      "learning_rate": 0.00015855984488627792,
      "loss": 8.6709,
      "step": 26016,
      "throughput": 8871.885830919566
    },
    {
      "epoch": 0.40827906174813283,
      "grad_norm": 0.07118481397628784,
      "learning_rate": 0.00015828601727921248,
      "loss": 8.6471,
      "step": 26048,
      "throughput": 8871.78394377422
    },
    {
      "epoch": 0.40878063307706175,
      "grad_norm": 0.07816265523433685,
      "learning_rate": 0.0001580122173606317,
      "loss": 8.6644,
      "step": 26080,
      "throughput": 8871.82914920052
    },
    {
      "epoch": 0.40928220440599067,
      "grad_norm": 0.07230495661497116,
      "learning_rate": 0.00015773844625968726,
      "loss": 8.654,
      "step": 26112,
      "throughput": 8871.933818994134
    },
    {
      "epoch": 0.40978377573491953,
      "grad_norm": 0.06881450116634369,
      "learning_rate": 0.00015746470510541197,
      "loss": 8.6284,
      "step": 26144,
      "throughput": 8872.083173340814
    },
    {
      "epoch": 0.41028534706384845,
      "grad_norm": 0.07161400467157364,
      "learning_rate": 0.00015719099502671516,
      "loss": 8.6412,
      "step": 26176,
      "throughput": 8872.000838936652
    },
    {
      "epoch": 0.41078691839277737,
      "grad_norm": 0.07901474833488464,
      "learning_rate": 0.00015691731715237802,
      "loss": 8.633,
      "step": 26208,
      "throughput": 8872.06924059419
    },
    {
      "epoch": 0.4112884897217063,
      "grad_norm": 0.07110141217708588,
      "learning_rate": 0.00015664367261104887,
      "loss": 8.6585,
      "step": 26240,
      "throughput": 8872.149282164934
    },
    {
      "epoch": 0.4117900610506352,
      "grad_norm": 0.07459357380867004,
      "learning_rate": 0.00015637006253123865,
      "loss": 8.6473,
      "step": 26272,
      "throughput": 8872.275020695792
    },
    {
      "epoch": 0.4122916323795641,
      "grad_norm": 0.0729188397526741,
      "learning_rate": 0.00015609648804131612,
      "loss": 8.6276,
      "step": 26304,
      "throughput": 8872.212686359257
    },
    {
      "epoch": 0.41279320370849304,
      "grad_norm": 0.0700279101729393,
      "learning_rate": 0.00015582295026950332,
      "loss": 8.6507,
      "step": 26336,
      "throughput": 8872.283869781426
    },
    {
      "epoch": 0.4132947750374219,
      "grad_norm": 0.07937642931938171,
      "learning_rate": 0.00015554945034387075,
      "loss": 8.6427,
      "step": 26368,
      "throughput": 8872.336145608379
    },
    {
      "epoch": 0.4137963463663508,
      "grad_norm": 0.07519602030515671,
      "learning_rate": 0.00015527598939233303,
      "loss": 8.6468,
      "step": 26400,
      "throughput": 8872.465062210009
    },
    {
      "epoch": 0.41429791769527974,
      "grad_norm": 0.07092050462961197,
      "learning_rate": 0.00015500256854264385,
      "loss": 8.6378,
      "step": 26432,
      "throughput": 8872.406151953877
    },
    {
      "epoch": 0.41479948902420866,
      "grad_norm": 0.07096114754676819,
      "learning_rate": 0.00015472918892239166,
      "loss": 8.6505,
      "step": 26464,
      "throughput": 8872.47318466504
    },
    {
      "epoch": 0.4153010603531376,
      "grad_norm": 0.07255687564611435,
      "learning_rate": 0.00015445585165899475,
      "loss": 8.6462,
      "step": 26496,
      "throughput": 8872.526006080514
    },
    {
      "epoch": 0.4158026316820665,
      "grad_norm": 0.07390446960926056,
      "learning_rate": 0.00015418255787969692,
      "loss": 8.6444,
      "step": 26528,
      "throughput": 8872.666706172875
    },
    {
      "epoch": 0.4163042030109954,
      "grad_norm": 0.07466679066419601,
      "learning_rate": 0.0001539093087115624,
      "loss": 8.6638,
      "step": 26560,
      "throughput": 8872.605439396339
    },
    {
      "epoch": 0.4168057743399243,
      "grad_norm": 0.07198496162891388,
      "learning_rate": 0.00015363610528147163,
      "loss": 8.6529,
      "step": 26592,
      "throughput": 8872.687260092538
    },
    {
      "epoch": 0.4173073456688532,
      "grad_norm": 0.07361474633216858,
      "learning_rate": 0.00015336294871611637,
      "loss": 8.6325,
      "step": 26624,
      "throughput": 8872.760453910214
    },
    {
      "epoch": 0.4178089169977821,
      "grad_norm": 0.07440745830535889,
      "learning_rate": 0.00015308984014199511,
      "loss": 8.6457,
      "step": 26656,
      "throughput": 8872.094436600915
    },
    {
      "epoch": 0.418310488326711,
      "grad_norm": 0.0752161368727684,
      "learning_rate": 0.00015281678068540836,
      "loss": 8.6277,
      "step": 26688,
      "throughput": 8872.026746895188
    },
    {
      "epoch": 0.41881205965563995,
      "grad_norm": 0.07112942636013031,
      "learning_rate": 0.00015254377147245424,
      "loss": 8.6257,
      "step": 26720,
      "throughput": 8872.054171423386
    },
    {
      "epoch": 0.41931363098456886,
      "grad_norm": 0.09089133888483047,
      "learning_rate": 0.00015227081362902343,
      "loss": 8.6544,
      "step": 26752,
      "throughput": 8872.150418236
    },
    {
      "epoch": 0.4198152023134978,
      "grad_norm": 0.07310041040182114,
      "learning_rate": 0.000151997908280795,
      "loss": 8.6263,
      "step": 26784,
      "throughput": 8872.261633703853
    },
    {
      "epoch": 0.42031677364242664,
      "grad_norm": 0.07701604068279266,
      "learning_rate": 0.0001517250565532313,
      "loss": 8.6376,
      "step": 26816,
      "throughput": 8872.21203272543
    },
    {
      "epoch": 0.42081834497135556,
      "grad_norm": 0.07558988779783249,
      "learning_rate": 0.00015145225957157373,
      "loss": 8.6273,
      "step": 26848,
      "throughput": 8872.26083379703
    },
    {
      "epoch": 0.4213199163002845,
      "grad_norm": 0.07615737617015839,
      "learning_rate": 0.00015117951846083786,
      "loss": 8.6408,
      "step": 26880,
      "throughput": 8872.354404114056
    },
    {
      "epoch": 0.4218214876292134,
      "grad_norm": 0.07847557216882706,
      "learning_rate": 0.0001509068343458088,
      "loss": 8.625,
      "step": 26912,
      "throughput": 8872.468278857184
    },
    {
      "epoch": 0.4223230589581423,
      "grad_norm": 0.0712590292096138,
      "learning_rate": 0.00015063420835103667,
      "loss": 8.6498,
      "step": 26944,
      "throughput": 8872.410536215344
    },
    {
      "epoch": 0.42282463028707123,
      "grad_norm": 0.07340807467699051,
      "learning_rate": 0.0001503616416008319,
      "loss": 8.6503,
      "step": 26976,
      "throughput": 8872.482341042714
    },
    {
      "epoch": 0.42332620161600015,
      "grad_norm": 0.07415439188480377,
      "learning_rate": 0.00015008913521926052,
      "loss": 8.6437,
      "step": 27008,
      "throughput": 8872.581708434158
    },
    {
      "epoch": 0.423827772944929,
      "grad_norm": 0.07077255845069885,
      "learning_rate": 0.00014981669033013972,
      "loss": 8.6333,
      "step": 27040,
      "throughput": 8872.676276991508
    },
    {
      "epoch": 0.42432934427385793,
      "grad_norm": 0.07701191306114197,
      "learning_rate": 0.00014954430805703302,
      "loss": 8.6152,
      "step": 27072,
      "throughput": 8872.600067118728
    },
    {
      "epoch": 0.42483091560278685,
      "grad_norm": 0.07171820104122162,
      "learning_rate": 0.00014927198952324568,
      "loss": 8.6111,
      "step": 27104,
      "throughput": 8872.64366520588
    },
    {
      "epoch": 0.42533248693171577,
      "grad_norm": 0.0763024315237999,
      "learning_rate": 0.00014899973585182012,
      "loss": 8.6408,
      "step": 27136,
      "throughput": 8872.751917567166
    },
    {
      "epoch": 0.4258340582606447,
      "grad_norm": 0.07294578105211258,
      "learning_rate": 0.00014872754816553141,
      "loss": 8.6184,
      "step": 27168,
      "throughput": 8872.822795246944
    },
    {
      "epoch": 0.4263356295895736,
      "grad_norm": 0.07896049320697784,
      "learning_rate": 0.00014845542758688222,
      "loss": 8.6557,
      "step": 27200,
      "throughput": 8872.714598463881
    },
    {
      "epoch": 0.42683720091850247,
      "grad_norm": 0.07082242518663406,
      "learning_rate": 0.00014818337523809876,
      "loss": 8.6371,
      "step": 27232,
      "throughput": 8872.772047112663
    },
    {
      "epoch": 0.4273387722474314,
      "grad_norm": 0.07124398648738861,
      "learning_rate": 0.0001479113922411256,
      "loss": 8.6304,
      "step": 27264,
      "throughput": 8872.881022358144
    },
    {
      "epoch": 0.4278403435763603,
      "grad_norm": 0.07090860605239868,
      "learning_rate": 0.00014763947971762153,
      "loss": 8.6245,
      "step": 27296,
      "throughput": 8872.950858100683
    },
    {
      "epoch": 0.4283419149052892,
      "grad_norm": 0.07200966030359268,
      "learning_rate": 0.00014736763878895457,
      "loss": 8.623,
      "step": 27328,
      "throughput": 8872.846323918813
    },
    {
      "epoch": 0.42884348623421814,
      "grad_norm": 0.07737179845571518,
      "learning_rate": 0.00014709587057619748,
      "loss": 8.6463,
      "step": 27360,
      "throughput": 8872.900561966622
    },
    {
      "epoch": 0.42934505756314706,
      "grad_norm": 0.0813121348619461,
      "learning_rate": 0.0001468241762001232,
      "loss": 8.6273,
      "step": 27392,
      "throughput": 8873.021353610933
    },
    {
      "epoch": 0.429846628892076,
      "grad_norm": 0.07603728026151657,
      "learning_rate": 0.00014655255678120015,
      "loss": 8.6264,
      "step": 27424,
      "throughput": 8873.088690250943
    },
    {
      "epoch": 0.43034820022100484,
      "grad_norm": 0.07271303981542587,
      "learning_rate": 0.0001462810134395876,
      "loss": 8.6131,
      "step": 27456,
      "throughput": 8873.018806837712
    },
    {
      "epoch": 0.43084977154993376,
      "grad_norm": 0.07247339189052582,
      "learning_rate": 0.0001460095472951311,
      "loss": 8.6458,
      "step": 27488,
      "throughput": 8873.06833676307
    },
    {
      "epoch": 0.4313513428788627,
      "grad_norm": 0.07254812121391296,
      "learning_rate": 0.0001457381594673579,
      "loss": 8.6332,
      "step": 27520,
      "throughput": 8873.193427807744
    },
    {
      "epoch": 0.4318529142077916,
      "grad_norm": 0.07892932742834091,
      "learning_rate": 0.00014546685107547205,
      "loss": 8.6261,
      "step": 27552,
      "throughput": 8873.262743001163
    },
    {
      "epoch": 0.4323544855367205,
      "grad_norm": 0.07944980263710022,
      "learning_rate": 0.00014519562323835034,
      "loss": 8.6376,
      "step": 27584,
      "throughput": 8873.179062347474
    },
    {
      "epoch": 0.43285605686564943,
      "grad_norm": 0.07589094340801239,
      "learning_rate": 0.000144924477074537,
      "loss": 8.6238,
      "step": 27616,
      "throughput": 8873.226907929838
    },
    {
      "epoch": 0.43335762819457835,
      "grad_norm": 0.06933537870645523,
      "learning_rate": 0.00014465341370223977,
      "loss": 8.6294,
      "step": 27648,
      "throughput": 8873.345501280693
    },
    {
      "epoch": 0.4338591995235072,
      "grad_norm": 0.07746944576501846,
      "learning_rate": 0.00014438243423932476,
      "loss": 8.6122,
      "step": 27680,
      "throughput": 8873.411787429883
    },
    {
      "epoch": 0.43436077085243613,
      "grad_norm": 0.07950767129659653,
      "learning_rate": 0.00014411153980331198,
      "loss": 8.6242,
      "step": 27712,
      "throughput": 8873.34829689749
    },
    {
      "epoch": 0.43486234218136505,
      "grad_norm": 0.0721384733915329,
      "learning_rate": 0.00014384073151137104,
      "loss": 8.6003,
      "step": 27744,
      "throughput": 8873.384867423294
    },
    {
      "epoch": 0.43536391351029397,
      "grad_norm": 0.07308559864759445,
      "learning_rate": 0.00014357001048031603,
      "loss": 8.6236,
      "step": 27776,
      "throughput": 8873.492587486146
    },
    {
      "epoch": 0.4358654848392229,
      "grad_norm": 0.07080280035734177,
      "learning_rate": 0.00014329937782660136,
      "loss": 8.6146,
      "step": 27808,
      "throughput": 8873.573468115323
    },
    {
      "epoch": 0.4363670561681518,
      "grad_norm": 0.07138007879257202,
      "learning_rate": 0.00014302883466631676,
      "loss": 8.6406,
      "step": 27840,
      "throughput": 8873.479798005088
    },
    {
      "epoch": 0.4368686274970807,
      "grad_norm": 0.07479699701070786,
      "learning_rate": 0.0001427583821151832,
      "loss": 8.624,
      "step": 27872,
      "throughput": 8873.554725941001
    },
    {
      "epoch": 0.4373701988260096,
      "grad_norm": 0.07214164733886719,
      "learning_rate": 0.0001424880212885477,
      "loss": 8.6228,
      "step": 27904,
      "throughput": 8873.70765646058
    },
    {
      "epoch": 0.4378717701549385,
      "grad_norm": 0.08458901941776276,
      "learning_rate": 0.0001422177533013791,
      "loss": 8.6466,
      "step": 27936,
      "throughput": 8873.799614476173
    },
    {
      "epoch": 0.4383733414838674,
      "grad_norm": 0.07676039636135101,
      "learning_rate": 0.00014194757926826342,
      "loss": 8.6201,
      "step": 27968,
      "throughput": 8873.73437624315
    },
    {
      "epoch": 0.43887491281279634,
      "grad_norm": 0.07848938554525375,
      "learning_rate": 0.00014167750030339915,
      "loss": 8.6172,
      "step": 28000,
      "throughput": 8873.781156642763
    },
    {
      "epoch": 0.43937648414172525,
      "grad_norm": 0.07305929809808731,
      "learning_rate": 0.00014140751752059278,
      "loss": 8.6037,
      "step": 28032,
      "throughput": 8873.913417033487
    },
    {
      "epoch": 0.4398780554706542,
      "grad_norm": 0.08314230293035507,
      "learning_rate": 0.0001411376320332541,
      "loss": 8.6151,
      "step": 28064,
      "throughput": 8873.997696387973
    },
    {
      "epoch": 0.4403796267995831,
      "grad_norm": 0.07182671874761581,
      "learning_rate": 0.0001408678449543916,
      "loss": 8.6344,
      "step": 28096,
      "throughput": 8873.914407981782
    },
    {
      "epoch": 0.44088119812851195,
      "grad_norm": 0.07496217638254166,
      "learning_rate": 0.00014059815739660806,
      "loss": 8.6093,
      "step": 28128,
      "throughput": 8873.966478015534
    },
    {
      "epoch": 0.44138276945744087,
      "grad_norm": 0.07898411899805069,
      "learning_rate": 0.00014032857047209573,
      "loss": 8.6135,
      "step": 28160,
      "throughput": 8874.110777954276
    },
    {
      "epoch": 0.4418843407863698,
      "grad_norm": 0.07386748492717743,
      "learning_rate": 0.0001400590852926319,
      "loss": 8.6112,
      "step": 28192,
      "throughput": 8874.210112978559
    },
    {
      "epoch": 0.4423859121152987,
      "grad_norm": 0.07431478798389435,
      "learning_rate": 0.00013978970296957423,
      "loss": 8.606,
      "step": 28224,
      "throughput": 8874.171101333972
    },
    {
      "epoch": 0.4428874834442276,
      "grad_norm": 0.06942659616470337,
      "learning_rate": 0.00013952042461385625,
      "loss": 8.6087,
      "step": 28256,
      "throughput": 8874.187968964894
    },
    {
      "epoch": 0.44338905477315654,
      "grad_norm": 0.07364679872989655,
      "learning_rate": 0.00013925125133598266,
      "loss": 8.6124,
      "step": 28288,
      "throughput": 8874.330495103006
    },
    {
      "epoch": 0.44389062610208546,
      "grad_norm": 0.07738090306520462,
      "learning_rate": 0.0001389821842460249,
      "loss": 8.614,
      "step": 28320,
      "throughput": 8874.412446854609
    },
    {
      "epoch": 0.4443921974310143,
      "grad_norm": 0.07922062277793884,
      "learning_rate": 0.00013871322445361642,
      "loss": 8.6208,
      "step": 28352,
      "throughput": 8874.382217319006
    },
    {
      "epoch": 0.44489376875994324,
      "grad_norm": 0.0814685970544815,
      "learning_rate": 0.00013844437306794822,
      "loss": 8.6136,
      "step": 28384,
      "throughput": 8874.428691964775
    },
    {
      "epoch": 0.44539534008887216,
      "grad_norm": 0.07049067318439484,
      "learning_rate": 0.00013817563119776415,
      "loss": 8.5931,
      "step": 28416,
      "throughput": 8874.54071049907
    },
    {
      "epoch": 0.4458969114178011,
      "grad_norm": 0.08059202134609222,
      "learning_rate": 0.00013790699995135658,
      "loss": 8.6005,
      "step": 28448,
      "throughput": 8874.625059568178
    },
    {
      "epoch": 0.44639848274673,
      "grad_norm": 0.0694175511598587,
      "learning_rate": 0.00013763848043656148,
      "loss": 8.6154,
      "step": 28480,
      "throughput": 8874.587918082316
    },
    {
      "epoch": 0.4469000540756589,
      "grad_norm": 0.07339881360530853,
      "learning_rate": 0.00013737007376075414,
      "loss": 8.5956,
      "step": 28512,
      "throughput": 8874.633311785028
    },
    {
      "epoch": 0.44740162540458783,
      "grad_norm": 0.07454710453748703,
      "learning_rate": 0.0001371017810308445,
      "loss": 8.5953,
      "step": 28544,
      "throughput": 8874.732500278362
    },
    {
      "epoch": 0.4479031967335167,
      "grad_norm": 0.07213406264781952,
      "learning_rate": 0.00013683360335327264,
      "loss": 8.6271,
      "step": 28576,
      "throughput": 8874.838207728804
    },
    {
      "epoch": 0.4484047680624456,
      "grad_norm": 0.07962165027856827,
      "learning_rate": 0.000136565541834004,
      "loss": 8.6044,
      "step": 28608,
      "throughput": 8874.793838753636
    },
    {
      "epoch": 0.44890633939137453,
      "grad_norm": 0.07944195717573166,
      "learning_rate": 0.00013629759757852512,
      "loss": 8.61,
      "step": 28640,
      "throughput": 8874.835978417446
    },
    {
      "epoch": 0.44940791072030345,
      "grad_norm": 0.07339708507061005,
      "learning_rate": 0.00013602977169183884,
      "loss": 8.5739,
      "step": 28672,
      "throughput": 8874.939632681488
    },
    {
      "epoch": 0.44990948204923237,
      "grad_norm": 0.07606098800897598,
      "learning_rate": 0.00013576206527846004,
      "loss": 8.596,
      "step": 28704,
      "throughput": 8874.311135577633
    },
    {
      "epoch": 0.4504110533781613,
      "grad_norm": 0.07325044274330139,
      "learning_rate": 0.00013549447944241066,
      "loss": 8.6102,
      "step": 28736,
      "throughput": 8874.26637586415
    },
    {
      "epoch": 0.4509126247070902,
      "grad_norm": 0.07296261191368103,
      "learning_rate": 0.00013522701528721553,
      "loss": 8.6007,
      "step": 28768,
      "throughput": 8874.278863558644
    },
    {
      "epoch": 0.45141419603601907,
      "grad_norm": 0.0751008540391922,
      "learning_rate": 0.00013495967391589757,
      "loss": 8.5945,
      "step": 28800,
      "throughput": 8874.41343208501
    },
    {
      "epoch": 0.451915767364948,
      "grad_norm": 0.0770215392112732,
      "learning_rate": 0.00013469245643097345,
      "loss": 8.5957,
      "step": 28832,
      "throughput": 8874.474691285905
    },
    {
      "epoch": 0.4524173386938769,
      "grad_norm": 0.07606975734233856,
      "learning_rate": 0.0001344253639344488,
      "loss": 8.6028,
      "step": 28864,
      "throughput": 8874.454438965466
    },
    {
      "epoch": 0.4529189100228058,
      "grad_norm": 0.08218943327665329,
      "learning_rate": 0.00013415839752781392,
      "loss": 8.605,
      "step": 28896,
      "throughput": 8874.496925533944
    },
    {
      "epoch": 0.45342048135173474,
      "grad_norm": 0.07142216712236404,
      "learning_rate": 0.00013389155831203904,
      "loss": 8.6072,
      "step": 28928,
      "throughput": 8874.60667201992
    },
    {
      "epoch": 0.45392205268066366,
      "grad_norm": 0.08242444694042206,
      "learning_rate": 0.0001336248473875699,
      "loss": 8.602,
      "step": 28960,
      "throughput": 8874.707204975388
    },
    {
      "epoch": 0.4544236240095926,
      "grad_norm": 0.07306291908025742,
      "learning_rate": 0.00013335826585432313,
      "loss": 8.5926,
      "step": 28992,
      "throughput": 8874.691354885646
    },
    {
      "epoch": 0.45492519533852144,
      "grad_norm": 0.07619068026542664,
      "learning_rate": 0.00013309181481168173,
      "loss": 8.5913,
      "step": 29024,
      "throughput": 8874.700598118392
    },
    {
      "epoch": 0.45542676666745036,
      "grad_norm": 0.07132818549871445,
      "learning_rate": 0.00013282549535849065,
      "loss": 8.5916,
      "step": 29056,
      "throughput": 8874.825296268311
    },
    {
      "epoch": 0.4559283379963793,
      "grad_norm": 0.0796361193060875,
      "learning_rate": 0.00013255930859305205,
      "loss": 8.5836,
      "step": 29088,
      "throughput": 8874.928128288897
    },
    {
      "epoch": 0.4564299093253082,
      "grad_norm": 0.07567333430051804,
      "learning_rate": 0.000132293255613121,
      "loss": 8.6048,
      "step": 29120,
      "throughput": 8874.894230182781
    },
    {
      "epoch": 0.4569314806542371,
      "grad_norm": 0.07186120748519897,
      "learning_rate": 0.00013202733751590067,
      "loss": 8.587,
      "step": 29152,
      "throughput": 8874.943752287812
    },
    {
      "epoch": 0.45743305198316603,
      "grad_norm": 0.07044616341590881,
      "learning_rate": 0.00013176155539803818,
      "loss": 8.5969,
      "step": 29184,
      "throughput": 8875.022302255356
    },
    {
      "epoch": 0.45793462331209495,
      "grad_norm": 0.07153672724962234,
      "learning_rate": 0.00013149591035561977,
      "loss": 8.588,
      "step": 29216,
      "throughput": 8875.121942073476
    },
    {
      "epoch": 0.4584361946410238,
      "grad_norm": 0.0733792632818222,
      "learning_rate": 0.00013123040348416633,
      "loss": 8.5845,
      "step": 29248,
      "throughput": 8875.102025766064
    },
    {
      "epoch": 0.4589377659699527,
      "grad_norm": 0.07488974928855896,
      "learning_rate": 0.00013096503587862906,
      "loss": 8.6104,
      "step": 29280,
      "throughput": 8875.120380254424
    },
    {
      "epoch": 0.45943933729888164,
      "grad_norm": 0.07074250280857086,
      "learning_rate": 0.00013069980863338466,
      "loss": 8.5874,
      "step": 29312,
      "throughput": 8875.230425116066
    },
    {
      "epoch": 0.45994090862781056,
      "grad_norm": 0.07670116424560547,
      "learning_rate": 0.00013043472284223113,
      "loss": 8.5993,
      "step": 29344,
      "throughput": 8875.329479619582
    },
    {
      "epoch": 0.4604424799567395,
      "grad_norm": 0.07585009187459946,
      "learning_rate": 0.00013016977959838305,
      "loss": 8.5996,
      "step": 29376,
      "throughput": 8875.337858363735
    },
    {
      "epoch": 0.4609440512856684,
      "grad_norm": 0.0730140432715416,
      "learning_rate": 0.00012990497999446714,
      "loss": 8.5962,
      "step": 29408,
      "throughput": 8875.394291925266
    },
    {
      "epoch": 0.4614456226145973,
      "grad_norm": 0.0731014758348465,
      "learning_rate": 0.00012964032512251773,
      "loss": 8.595,
      "step": 29440,
      "throughput": 8875.505643899913
    },
    {
      "epoch": 0.4619471939435262,
      "grad_norm": 0.07306238263845444,
      "learning_rate": 0.00012937581607397236,
      "loss": 8.5911,
      "step": 29472,
      "throughput": 8875.564190178333
    },
    {
      "epoch": 0.4624487652724551,
      "grad_norm": 0.07264445722103119,
      "learning_rate": 0.00012911145393966703,
      "loss": 8.6046,
      "step": 29504,
      "throughput": 8875.600704726261
    },
    {
      "epoch": 0.462950336601384,
      "grad_norm": 0.07225610315799713,
      "learning_rate": 0.00012884723980983206,
      "loss": 8.5972,
      "step": 29536,
      "throughput": 8875.625193209586
    },
    {
      "epoch": 0.46345190793031293,
      "grad_norm": 0.07001107931137085,
      "learning_rate": 0.00012858317477408728,
      "loss": 8.6009,
      "step": 29568,
      "throughput": 8875.749078339293
    },
    {
      "epoch": 0.46395347925924185,
      "grad_norm": 0.07203416526317596,
      "learning_rate": 0.00012831925992143765,
      "loss": 8.6036,
      "step": 29600,
      "throughput": 8875.786375865675
    },
    {
      "epoch": 0.46445505058817077,
      "grad_norm": 0.06918448954820633,
      "learning_rate": 0.00012805549634026882,
      "loss": 8.588,
      "step": 29632,
      "throughput": 8875.833505349525
    },
    {
      "epoch": 0.4649566219170997,
      "grad_norm": 0.07581827789545059,
      "learning_rate": 0.00012779188511834256,
      "loss": 8.5998,
      "step": 29664,
      "throughput": 8875.849645993643
    },
    {
      "epoch": 0.46545819324602855,
      "grad_norm": 0.07366356998682022,
      "learning_rate": 0.00012752842734279238,
      "loss": 8.5807,
      "step": 29696,
      "throughput": 8875.961924137613
    },
    {
      "epoch": 0.46595976457495747,
      "grad_norm": 0.09135285019874573,
      "learning_rate": 0.0001272651241001189,
      "loss": 8.5932,
      "step": 29728,
      "throughput": 8876.071536647383
    },
    {
      "epoch": 0.4664613359038864,
      "grad_norm": 0.07081723213195801,
      "learning_rate": 0.00012700197647618549,
      "loss": 8.6031,
      "step": 29760,
      "throughput": 8876.095025167422
    },
    {
      "epoch": 0.4669629072328153,
      "grad_norm": 0.07992235571146011,
      "learning_rate": 0.00012673898555621373,
      "loss": 8.5904,
      "step": 29792,
      "throughput": 8876.10402483122
    },
    {
      "epoch": 0.4674644785617442,
      "grad_norm": 0.0734575167298317,
      "learning_rate": 0.00012647615242477887,
      "loss": 8.5568,
      "step": 29824,
      "throughput": 8876.20873167529
    },
    {
      "epoch": 0.46796604989067314,
      "grad_norm": 0.07131079584360123,
      "learning_rate": 0.0001262134781658056,
      "loss": 8.578,
      "step": 29856,
      "throughput": 8876.300822006067
    },
    {
      "epoch": 0.46846762121960206,
      "grad_norm": 0.07791458070278168,
      "learning_rate": 0.00012595096386256336,
      "loss": 8.5786,
      "step": 29888,
      "throughput": 8876.313810453454
    },
    {
      "epoch": 0.4689691925485309,
      "grad_norm": 0.07824485749006271,
      "learning_rate": 0.0001256886105976619,
      "loss": 8.6037,
      "step": 29920,
      "throughput": 8876.316123304772
    },
    {
      "epoch": 0.46947076387745984,
      "grad_norm": 0.07415210455656052,
      "learning_rate": 0.0001254264194530468,
      "loss": 8.5928,
      "step": 29952,
      "throughput": 8876.423316323264
    },
    {
      "epoch": 0.46997233520638876,
      "grad_norm": 0.07395078986883163,
      "learning_rate": 0.00012516439150999525,
      "loss": 8.5904,
      "step": 29984,
      "throughput": 8876.508654661935
    },
    {
      "epoch": 0.4704739065353177,
      "grad_norm": 0.07381439954042435,
      "learning_rate": 0.00012490252784911113,
      "loss": 8.5586,
      "step": 30016,
      "throughput": 8876.508811455968
    },
    {
      "epoch": 0.4709754778642466,
      "grad_norm": 0.07365623861551285,
      "learning_rate": 0.000124640829550321,
      "loss": 8.578,
      "step": 30048,
      "throughput": 8876.557384132608
    },
    {
      "epoch": 0.4714770491931755,
      "grad_norm": 0.06920890510082245,
      "learning_rate": 0.00012437929769286942,
      "loss": 8.5793,
      "step": 30080,
      "throughput": 8876.677506162016
    },
    {
      "epoch": 0.47197862052210443,
      "grad_norm": 0.07251748442649841,
      "learning_rate": 0.0001241179333553146,
      "loss": 8.5943,
      "step": 30112,
      "throughput": 8876.767853579231
    },
    {
      "epoch": 0.4724801918510333,
      "grad_norm": 0.08146440237760544,
      "learning_rate": 0.00012385673761552374,
      "loss": 8.5751,
      "step": 30144,
      "throughput": 8876.77526841676
    },
    {
      "epoch": 0.4729817631799622,
      "grad_norm": 0.06725624948740005,
      "learning_rate": 0.00012359571155066894,
      "loss": 8.5845,
      "step": 30176,
      "throughput": 8876.796895522772
    },
    {
      "epoch": 0.47348333450889113,
      "grad_norm": 0.07321012765169144,
      "learning_rate": 0.00012333485623722238,
      "loss": 8.5994,
      "step": 30208,
      "throughput": 8876.894242975139
    },
    {
      "epoch": 0.47398490583782005,
      "grad_norm": 0.06948108971118927,
      "learning_rate": 0.00012307417275095222,
      "loss": 8.5785,
      "step": 30240,
      "throughput": 8876.978857239008
    },
    {
      "epoch": 0.47448647716674897,
      "grad_norm": 0.07041703164577484,
      "learning_rate": 0.00012281366216691786,
      "loss": 8.5649,
      "step": 30272,
      "throughput": 8876.976901438517
    },
    {
      "epoch": 0.4749880484956779,
      "grad_norm": 0.07631143927574158,
      "learning_rate": 0.00012255332555946582,
      "loss": 8.5625,
      "step": 30304,
      "throughput": 8877.043764416963
    },
    {
      "epoch": 0.4754896198246068,
      "grad_norm": 0.0704835057258606,
      "learning_rate": 0.00012229316400222493,
      "loss": 8.59,
      "step": 30336,
      "throughput": 8877.134415497327
    },
    {
      "epoch": 0.47599119115353566,
      "grad_norm": 0.06994818150997162,
      "learning_rate": 0.00012203317856810232,
      "loss": 8.5859,
      "step": 30368,
      "throughput": 8877.18800100491
    },
    {
      "epoch": 0.4764927624824646,
      "grad_norm": 0.07705579698085785,
      "learning_rate": 0.0001217733703292786,
      "loss": 8.563,
      "step": 30400,
      "throughput": 8877.194226844798
    },
    {
      "epoch": 0.4769943338113935,
      "grad_norm": 0.07564400136470795,
      "learning_rate": 0.0001215137403572038,
      "loss": 8.5769,
      "step": 30432,
      "throughput": 8877.238367015549
    },
    {
      "epoch": 0.4774959051403224,
      "grad_norm": 0.07389659434556961,
      "learning_rate": 0.00012125428972259264,
      "loss": 8.5711,
      "step": 30464,
      "throughput": 8877.356088149087
    },
    {
      "epoch": 0.47799747646925134,
      "grad_norm": 0.07688334584236145,
      "learning_rate": 0.0001209950194954203,
      "loss": 8.5819,
      "step": 30496,
      "throughput": 8877.41932988071
    },
    {
      "epoch": 0.47849904779818025,
      "grad_norm": 0.07335702329874039,
      "learning_rate": 0.00012073593074491802,
      "loss": 8.5979,
      "step": 30528,
      "throughput": 8877.403826067108
    },
    {
      "epoch": 0.4790006191271092,
      "grad_norm": 0.07970409095287323,
      "learning_rate": 0.0001204770245395685,
      "loss": 8.5868,
      "step": 30560,
      "throughput": 8877.435590379278
    },
    {
      "epoch": 0.47950219045603804,
      "grad_norm": 0.073320172727108,
      "learning_rate": 0.00012021830194710178,
      "loss": 8.5728,
      "step": 30592,
      "throughput": 8877.552825749823
    },
    {
      "epoch": 0.48000376178496695,
      "grad_norm": 0.07845813781023026,
      "learning_rate": 0.00011995976403449054,
      "loss": 8.5728,
      "step": 30624,
      "throughput": 8877.614290495207
    },
    {
      "epoch": 0.48050533311389587,
      "grad_norm": 0.07357929646968842,
      "learning_rate": 0.00011970141186794592,
      "loss": 8.5839,
      "step": 30656,
      "throughput": 8877.611112615054
    },
    {
      "epoch": 0.4810069044428248,
      "grad_norm": 0.08569731563329697,
      "learning_rate": 0.00011944324651291299,
      "loss": 8.5576,
      "step": 30688,
      "throughput": 8877.662194945713
    },
    {
      "epoch": 0.4815084757717537,
      "grad_norm": 0.07027926295995712,
      "learning_rate": 0.00011918526903406647,
      "loss": 8.5569,
      "step": 30720,
      "throughput": 8877.783592572207
    },
    {
      "epoch": 0.4820100471006826,
      "grad_norm": 0.0785306990146637,
      "learning_rate": 0.0001189274804953063,
      "loss": 8.5765,
      "step": 30752,
      "throughput": 8877.205577707166
    },
    {
      "epoch": 0.48251161842961154,
      "grad_norm": 0.08035396784543991,
      "learning_rate": 0.00011866988195975307,
      "loss": 8.5716,
      "step": 30784,
      "throughput": 8877.162571351546
    },
    {
      "epoch": 0.4830131897585404,
      "grad_norm": 0.07607050985097885,
      "learning_rate": 0.00011841247448974398,
      "loss": 8.5749,
      "step": 30816,
      "throughput": 8877.200508662241
    },
    {
      "epoch": 0.4835147610874693,
      "grad_norm": 0.07426037639379501,
      "learning_rate": 0.00011815525914682817,
      "loss": 8.5535,
      "step": 30848,
      "throughput": 8877.338847754632
    },
    {
      "epoch": 0.48401633241639824,
      "grad_norm": 0.07136990875005722,
      "learning_rate": 0.00011789823699176249,
      "loss": 8.5748,
      "step": 30880,
      "throughput": 8877.429074529899
    },
    {
      "epoch": 0.48451790374532716,
      "grad_norm": 0.0776275247335434,
      "learning_rate": 0.00011764140908450703,
      "loss": 8.5631,
      "step": 30912,
      "throughput": 8877.401479776045
    },
    {
      "epoch": 0.4850194750742561,
      "grad_norm": 0.07546462118625641,
      "learning_rate": 0.0001173847764842209,
      "loss": 8.5765,
      "step": 30944,
      "throughput": 8877.458627369684
    },
    {
      "epoch": 0.485521046403185,
      "grad_norm": 0.07272691279649734,
      "learning_rate": 0.00011712834024925766,
      "loss": 8.5726,
      "step": 30976,
      "throughput": 8877.599290194024
    },
    {
      "epoch": 0.4860226177321139,
      "grad_norm": 0.07284701615571976,
      "learning_rate": 0.00011687210143716116,
      "loss": 8.558,
      "step": 31008,
      "throughput": 8877.681189949853
    },
    {
      "epoch": 0.4865241890610428,
      "grad_norm": 0.08194044232368469,
      "learning_rate": 0.00011661606110466095,
      "loss": 8.5643,
      "step": 31040,
      "throughput": 8877.64047607755
    },
    {
      "epoch": 0.4870257603899717,
      "grad_norm": 0.07076172530651093,
      "learning_rate": 0.00011636022030766818,
      "loss": 8.5746,
      "step": 31072,
      "throughput": 8877.68333698756
    },
    {
      "epoch": 0.4875273317189006,
      "grad_norm": 0.07631973922252655,
      "learning_rate": 0.00011610458010127093,
      "loss": 8.5635,
      "step": 31104,
      "throughput": 8877.804494843896
    },
    {
      "epoch": 0.48802890304782953,
      "grad_norm": 0.0787317231297493,
      "learning_rate": 0.00011584914153973036,
      "loss": 8.584,
      "step": 31136,
      "throughput": 8877.88577758543
    },
    {
      "epoch": 0.48853047437675845,
      "grad_norm": 0.07193674147129059,
      "learning_rate": 0.00011559390567647571,
      "loss": 8.5611,
      "step": 31168,
      "throughput": 8877.853278013274
    },
    {
      "epoch": 0.48903204570568737,
      "grad_norm": 0.06967335939407349,
      "learning_rate": 0.00011533887356410052,
      "loss": 8.5708,
      "step": 31200,
      "throughput": 8877.882846063356
    },
    {
      "epoch": 0.4895336170346163,
      "grad_norm": 0.07708664983510971,
      "learning_rate": 0.00011508404625435791,
      "loss": 8.5709,
      "step": 31232,
      "throughput": 8878.004876389365
    },
    {
      "epoch": 0.49003518836354515,
      "grad_norm": 0.07327646017074585,
      "learning_rate": 0.00011482942479815651,
      "loss": 8.5505,
      "step": 31264,
      "throughput": 8878.096059697875
    },
    {
      "epoch": 0.49053675969247407,
      "grad_norm": 0.07120782136917114,
      "learning_rate": 0.00011457501024555593,
      "loss": 8.5701,
      "step": 31296,
      "throughput": 8878.083957310526
    },
    {
      "epoch": 0.491038331021403,
      "grad_norm": 0.06819991022348404,
      "learning_rate": 0.00011432080364576256,
      "loss": 8.5491,
      "step": 31328,
      "throughput": 8878.177801032847
    },
    {
      "epoch": 0.4915399023503319,
      "grad_norm": 0.0801183432340622,
      "learning_rate": 0.00011406680604712517,
      "loss": 8.5686,
      "step": 31360,
      "throughput": 8878.30045967166
    },
    {
      "epoch": 0.4920414736792608,
      "grad_norm": 0.06938584893941879,
      "learning_rate": 0.00011381301849713059,
      "loss": 8.5674,
      "step": 31392,
      "throughput": 8878.38557907876
    },
    {
      "epoch": 0.49254304500818974,
      "grad_norm": 0.1152784451842308,
      "learning_rate": 0.00011355944204239944,
      "loss": 8.5672,
      "step": 31424,
      "throughput": 8878.390782361848
    },
    {
      "epoch": 0.4930446163371186,
      "grad_norm": 0.07254608720541,
      "learning_rate": 0.0001133060777286818,
      "loss": 8.5559,
      "step": 31456,
      "throughput": 8878.445844506625
    },
    {
      "epoch": 0.4935461876660475,
      "grad_norm": 0.07228664308786392,
      "learning_rate": 0.00011305292660085278,
      "loss": 8.5488,
      "step": 31488,
      "throughput": 8878.577357518385
    },
    {
      "epoch": 0.49404775899497644,
      "grad_norm": 0.07752840220928192,
      "learning_rate": 0.00011279998970290844,
      "loss": 8.5768,
      "step": 31520,
      "throughput": 8878.657580171179
    },
    {
      "epoch": 0.49454933032390536,
      "grad_norm": 0.08368710428476334,
      "learning_rate": 0.0001125472680779613,
      "loss": 8.5621,
      "step": 31552,
      "throughput": 8878.657276550583
    },
    {
      "epoch": 0.4950509016528343,
      "grad_norm": 0.07768121361732483,
      "learning_rate": 0.00011229476276823608,
      "loss": 8.5495,
      "step": 31584,
      "throughput": 8878.720184959087
    },
    {
      "epoch": 0.4955524729817632,
      "grad_norm": 0.07697410136461258,
      "learning_rate": 0.00011204247481506535,
      "loss": 8.5502,
      "step": 31616,
      "throughput": 8878.852409219156
    },
    {
      "epoch": 0.4960540443106921,
      "grad_norm": 0.07087410986423492,
      "learning_rate": 0.00011179040525888552,
      "loss": 8.5554,
      "step": 31648,
      "throughput": 8878.956401158952
    },
    {
      "epoch": 0.496555615639621,
      "grad_norm": 0.07429654896259308,
      "learning_rate": 0.00011153855513923207,
      "loss": 8.544,
      "step": 31680,
      "throughput": 8878.967627668655
    },
    {
      "epoch": 0.4970571869685499,
      "grad_norm": 0.09350360184907913,
      "learning_rate": 0.00011128692549473568,
      "loss": 8.5657,
      "step": 31712,
      "throughput": 8879.020572731342
    },
    {
      "epoch": 0.4975587582974788,
      "grad_norm": 0.07406873255968094,
      "learning_rate": 0.00011103551736311777,
      "loss": 8.5473,
      "step": 31744,
      "throughput": 8879.15050775114
    },
    {
      "epoch": 0.4980603296264077,
      "grad_norm": 0.07790439575910568,
      "learning_rate": 0.0001107843317811862,
      "loss": 8.5429,
      "step": 31776,
      "throughput": 8879.233401099962
    },
    {
      "epoch": 0.49856190095533665,
      "grad_norm": 0.07620224356651306,
      "learning_rate": 0.00011053336978483102,
      "loss": 8.5706,
      "step": 31808,
      "throughput": 8879.19222412429
    },
    {
      "epoch": 0.49906347228426556,
      "grad_norm": 0.08028724044561386,
      "learning_rate": 0.00011028263240902033,
      "loss": 8.539,
      "step": 31840,
      "throughput": 8879.253677757002
    },
    {
      "epoch": 0.4995650436131945,
      "grad_norm": 0.07354696840047836,
      "learning_rate": 0.0001100321206877957,
      "loss": 8.5431,
      "step": 31872,
      "throughput": 8879.351637433128
    },
    {
      "epoch": 0.5000666149421233,
      "grad_norm": 0.073768250644207,
      "learning_rate": 0.00010978183565426832,
      "loss": 8.556,
      "step": 31904,
      "throughput": 8879.423566520085
    },
    {
      "epoch": 0.5005681862710523,
      "grad_norm": 0.08214636147022247,
      "learning_rate": 0.00010953177834061435,
      "loss": 8.5693,
      "step": 31936,
      "throughput": 8879.384955887333
    },
    {
      "epoch": 0.5010697575999812,
      "grad_norm": 0.07873082906007767,
      "learning_rate": 0.00010928194977807091,
      "loss": 8.5494,
      "step": 31968,
      "throughput": 8879.47590132547
    },
    {
      "epoch": 0.5015713289289101,
      "grad_norm": 0.0739561915397644,
      "learning_rate": 0.00010903235099693174,
      "loss": 8.5376,
      "step": 32000,
      "throughput": 8879.5924262509
    },
    {
      "epoch": 0.502072900257839,
      "grad_norm": 0.08155156672000885,
      "learning_rate": 0.00010878298302654294,
      "loss": 8.569,
      "step": 32032,
      "throughput": 8879.651295267644
    },
    {
      "epoch": 0.5025744715867679,
      "grad_norm": 0.07305172085762024,
      "learning_rate": 0.00010853384689529873,
      "loss": 8.5545,
      "step": 32064,
      "throughput": 8879.635714989827
    },
    {
      "epoch": 0.5030760429156969,
      "grad_norm": 0.07556242495775223,
      "learning_rate": 0.00010828494363063732,
      "loss": 8.5534,
      "step": 32096,
      "throughput": 8879.706163985868
    },
    {
      "epoch": 0.5035776142446258,
      "grad_norm": 0.07298897951841354,
      "learning_rate": 0.0001080362742590364,
      "loss": 8.5631,
      "step": 32128,
      "throughput": 8879.817239222819
    },
    {
      "epoch": 0.5040791855735547,
      "grad_norm": 0.08162616193294525,
      "learning_rate": 0.00010778783980600939,
      "loss": 8.569,
      "step": 32160,
      "throughput": 8879.905393904175
    },
    {
      "epoch": 0.5045807569024836,
      "grad_norm": 0.07203565537929535,
      "learning_rate": 0.00010753964129610052,
      "loss": 8.5517,
      "step": 32192,
      "throughput": 8879.852917096618
    },
    {
      "epoch": 0.5050823282314125,
      "grad_norm": 0.07308918237686157,
      "learning_rate": 0.00010729167975288122,
      "loss": 8.5551,
      "step": 32224,
      "throughput": 8879.898546137045
    },
    {
      "epoch": 0.5055838995603413,
      "grad_norm": 0.07474292814731598,
      "learning_rate": 0.0001070439561989457,
      "loss": 8.5609,
      "step": 32256,
      "throughput": 8880.034979977114
    },
    {
      "epoch": 0.5060854708892703,
      "grad_norm": 0.07707252353429794,
      "learning_rate": 0.00010679647165590659,
      "loss": 8.5408,
      "step": 32288,
      "throughput": 8880.079002533506
    },
    {
      "epoch": 0.5065870422181992,
      "grad_norm": 0.07359939813613892,
      "learning_rate": 0.00010654922714439083,
      "loss": 8.5409,
      "step": 32320,
      "throughput": 8880.03827720708
    },
    {
      "epoch": 0.5070886135471281,
      "grad_norm": 0.08229360729455948,
      "learning_rate": 0.00010630222368403561,
      "loss": 8.5313,
      "step": 32352,
      "throughput": 8880.097772317216
    },
    {
      "epoch": 0.507590184876057,
      "grad_norm": 0.07442633807659149,
      "learning_rate": 0.00010605546229348396,
      "loss": 8.5595,
      "step": 32384,
      "throughput": 8880.231914183245
    },
    {
      "epoch": 0.5080917562049859,
      "grad_norm": 0.07239026576280594,
      "learning_rate": 0.00010580894399038044,
      "loss": 8.5563,
      "step": 32416,
      "throughput": 8880.331477725678
    },
    {
      "epoch": 0.5085933275339148,
      "grad_norm": 0.07314996421337128,
      "learning_rate": 0.00010556266979136734,
      "loss": 8.5389,
      "step": 32448,
      "throughput": 8880.30725576251
    },
    {
      "epoch": 0.5090948988628438,
      "grad_norm": 0.07866779714822769,
      "learning_rate": 0.00010531664071208019,
      "loss": 8.5422,
      "step": 32480,
      "throughput": 8880.404943660236
    },
    {
      "epoch": 0.5095964701917727,
      "grad_norm": 0.07595469057559967,
      "learning_rate": 0.00010507085776714369,
      "loss": 8.5319,
      "step": 32512,
      "throughput": 8880.517738723465
    },
    {
      "epoch": 0.5100980415207016,
      "grad_norm": 0.07140462100505829,
      "learning_rate": 0.00010482532197016732,
      "loss": 8.5504,
      "step": 32544,
      "throughput": 8880.613775883952
    },
    {
      "epoch": 0.5105996128496305,
      "grad_norm": 0.07571039348840714,
      "learning_rate": 0.00010458003433374152,
      "loss": 8.5415,
      "step": 32576,
      "throughput": 8880.57318962394
    },
    {
      "epoch": 0.5111011841785594,
      "grad_norm": 0.08015038073062897,
      "learning_rate": 0.00010433499586943319,
      "loss": 8.5512,
      "step": 32608,
      "throughput": 8880.661175602947
    },
    {
      "epoch": 0.5116027555074883,
      "grad_norm": 0.0716933086514473,
      "learning_rate": 0.00010409020758778178,
      "loss": 8.5497,
      "step": 32640,
      "throughput": 8880.779949426185
    },
    {
      "epoch": 0.5121043268364173,
      "grad_norm": 0.07379721105098724,
      "learning_rate": 0.00010384567049829474,
      "loss": 8.5389,
      "step": 32672,
      "throughput": 8880.86449327483
    },
    {
      "epoch": 0.5126058981653461,
      "grad_norm": 0.06897587329149246,
      "learning_rate": 0.00010360138560944379,
      "loss": 8.536,
      "step": 32704,
      "throughput": 8880.796608475323
    },
    {
      "epoch": 0.513107469494275,
      "grad_norm": 0.07183413207530975,
      "learning_rate": 0.00010335735392866061,
      "loss": 8.5316,
      "step": 32736,
      "throughput": 8880.86476432713
    },
    {
      "epoch": 0.5136090408232039,
      "grad_norm": 0.07627425342798233,
      "learning_rate": 0.00010311357646233255,
      "loss": 8.5474,
      "step": 32768,
      "throughput": 8880.967565750585
    },
    {
      "epoch": 0.5141106121521328,
      "grad_norm": 0.08256326615810394,
      "learning_rate": 0.00010287005421579854,
      "loss": 8.5603,
      "step": 32800,
      "throughput": 8880.427350610618
    },
    {
      "epoch": 0.5146121834810617,
      "grad_norm": 0.08104515075683594,
      "learning_rate": 0.00010262678819334511,
      "loss": 8.5393,
      "step": 32832,
      "throughput": 8880.38917977172
    },
    {
      "epoch": 0.5151137548099907,
      "grad_norm": 0.08197243511676788,
      "learning_rate": 0.00010238377939820202,
      "loss": 8.5454,
      "step": 32864,
      "throughput": 8880.469958966982
    },
    {
      "epoch": 0.5156153261389196,
      "grad_norm": 0.08547014743089676,
      "learning_rate": 0.00010214102883253832,
      "loss": 8.5422,
      "step": 32896,
      "throughput": 8880.562745151854
    },
    {
      "epoch": 0.5161168974678485,
      "grad_norm": 0.0771087184548378,
      "learning_rate": 0.00010189853749745799,
      "loss": 8.528,
      "step": 32928,
      "throughput": 8880.67678790059
    },
    {
      "epoch": 0.5166184687967774,
      "grad_norm": 0.07997187227010727,
      "learning_rate": 0.00010165630639299606,
      "loss": 8.5308,
      "step": 32960,
      "throughput": 8880.615246084193
    },
    {
      "epoch": 0.5171200401257063,
      "grad_norm": 0.08800628036260605,
      "learning_rate": 0.00010141433651811429,
      "loss": 8.5355,
      "step": 32992,
      "throughput": 8880.689982268765
    },
    {
      "epoch": 0.5176216114546353,
      "grad_norm": 0.09052596986293793,
      "learning_rate": 0.00010117262887069724,
      "loss": 8.5431,
      "step": 33024,
      "throughput": 8880.793214106987
    },
    {
      "epoch": 0.5181231827835642,
      "grad_norm": 0.07647205144166946,
      "learning_rate": 0.00010093118444754784,
      "loss": 8.5479,
      "step": 33056,
      "throughput": 8880.903723461652
    },
    {
      "epoch": 0.5186247541124931,
      "grad_norm": 0.07544301450252533,
      "learning_rate": 0.0001006900042443837,
      "loss": 8.5154,
      "step": 33088,
      "throughput": 8880.808497164393
    },
    {
      "epoch": 0.519126325441422,
      "grad_norm": 0.08619405329227448,
      "learning_rate": 0.00010044908925583264,
      "loss": 8.5522,
      "step": 33120,
      "throughput": 8880.884355850601
    },
    {
      "epoch": 0.5196278967703508,
      "grad_norm": 0.07676394283771515,
      "learning_rate": 0.00010020844047542886,
      "loss": 8.5266,
      "step": 33152,
      "throughput": 8880.995931369265
    },
    {
      "epoch": 0.5201294680992797,
      "grad_norm": 0.07478612661361694,
      "learning_rate": 9.996805889560857e-05,
      "loss": 8.5333,
      "step": 33184,
      "throughput": 8881.09647931044
    },
    {
      "epoch": 0.5206310394282087,
      "grad_norm": 0.07415000349283218,
      "learning_rate": 9.972794550770612e-05,
      "loss": 8.5272,
      "step": 33216,
      "throughput": 8881.035190751842
    },
    {
      "epoch": 0.5211326107571376,
      "grad_norm": 0.07461415231227875,
      "learning_rate": 9.948810130194984e-05,
      "loss": 8.5506,
      "step": 33248,
      "throughput": 8881.106414600572
    },
    {
      "epoch": 0.5216341820860665,
      "grad_norm": 0.07567655295133591,
      "learning_rate": 9.924852726745807e-05,
      "loss": 8.5222,
      "step": 33280,
      "throughput": 8881.204834663215
    },
    {
      "epoch": 0.5221357534149954,
      "grad_norm": 0.07916685938835144,
      "learning_rate": 9.900922439223464e-05,
      "loss": 8.5652,
      "step": 33312,
      "throughput": 8881.3064814247
    },
    {
      "epoch": 0.5226373247439243,
      "grad_norm": 0.06847506016492844,
      "learning_rate": 9.877019366316541e-05,
      "loss": 8.5146,
      "step": 33344,
      "throughput": 8881.272806522671
    },
    {
      "epoch": 0.5231388960728532,
      "grad_norm": 0.07268689572811127,
      "learning_rate": 9.85314360660138e-05,
      "loss": 8.5187,
      "step": 33376,
      "throughput": 8881.33976370783
    },
    {
      "epoch": 0.5236404674017822,
      "grad_norm": 0.0702684298157692,
      "learning_rate": 9.829295258541692e-05,
      "loss": 8.5289,
      "step": 33408,
      "throughput": 8881.454946050295
    },
    {
      "epoch": 0.5241420387307111,
      "grad_norm": 0.07464347779750824,
      "learning_rate": 9.805474420488123e-05,
      "loss": 8.5505,
      "step": 33440,
      "throughput": 8881.55141287771
    },
    {
      "epoch": 0.52464361005964,
      "grad_norm": 0.07443258166313171,
      "learning_rate": 9.78168119067789e-05,
      "loss": 8.5471,
      "step": 33472,
      "throughput": 8881.522813382444
    },
    {
      "epoch": 0.5251451813885689,
      "grad_norm": 0.07743299752473831,
      "learning_rate": 9.757915667234339e-05,
      "loss": 8.5459,
      "step": 33504,
      "throughput": 8881.584108867713
    },
    {
      "epoch": 0.5256467527174978,
      "grad_norm": 0.07928189635276794,
      "learning_rate": 9.734177948166558e-05,
      "loss": 8.5381,
      "step": 33536,
      "throughput": 8881.690984580047
    },
    {
      "epoch": 0.5261483240464266,
      "grad_norm": 0.07928431779146194,
      "learning_rate": 9.710468131368968e-05,
      "loss": 8.5025,
      "step": 33568,
      "throughput": 8881.775171208094
    },
    {
      "epoch": 0.5266498953753556,
      "grad_norm": 0.07774075865745544,
      "learning_rate": 9.68678631462093e-05,
      "loss": 8.5461,
      "step": 33600,
      "throughput": 8881.787401343072
    },
    {
      "epoch": 0.5271514667042845,
      "grad_norm": 0.07617669552564621,
      "learning_rate": 9.66313259558633e-05,
      "loss": 8.5331,
      "step": 33632,
      "throughput": 8881.813049327631
    },
    {
      "epoch": 0.5276530380332134,
      "grad_norm": 0.08349672704935074,
      "learning_rate": 9.639507071813166e-05,
      "loss": 8.5034,
      "step": 33664,
      "throughput": 8881.91225326137
    },
    {
      "epoch": 0.5281546093621423,
      "grad_norm": 0.07760000973939896,
      "learning_rate": 9.615909840733167e-05,
      "loss": 8.5311,
      "step": 33696,
      "throughput": 8882.02504331997
    },
    {
      "epoch": 0.5286561806910712,
      "grad_norm": 0.07814379036426544,
      "learning_rate": 9.592340999661393e-05,
      "loss": 8.5289,
      "step": 33728,
      "throughput": 8882.026454517742
    },
    {
      "epoch": 0.5291577520200001,
      "grad_norm": 0.07366339862346649,
      "learning_rate": 9.568800645795812e-05,
      "loss": 8.5552,
      "step": 33760,
      "throughput": 8882.043253472537
    },
    {
      "epoch": 0.5296593233489291,
      "grad_norm": 0.07660157978534698,
      "learning_rate": 9.545288876216901e-05,
      "loss": 8.5227,
      "step": 33792,
      "throughput": 8882.149626940249
    },
    {
      "epoch": 0.530160894677858,
      "grad_norm": 0.07678160816431046,
      "learning_rate": 9.521805787887285e-05,
      "loss": 8.518,
      "step": 33824,
      "throughput": 8882.220820754601
    },
    {
      "epoch": 0.5306624660067869,
      "grad_norm": 0.07604127377271652,
      "learning_rate": 9.498351477651286e-05,
      "loss": 8.5325,
      "step": 33856,
      "throughput": 8882.205366979448
    },
    {
      "epoch": 0.5311640373357158,
      "grad_norm": 0.07513943314552307,
      "learning_rate": 9.47492604223454e-05,
      "loss": 8.5155,
      "step": 33888,
      "throughput": 8882.214404156059
    },
    {
      "epoch": 0.5316656086646447,
      "grad_norm": 0.08751774579286575,
      "learning_rate": 9.451529578243618e-05,
      "loss": 8.5269,
      "step": 33920,
      "throughput": 8882.309489996225
    },
    {
      "epoch": 0.5321671799935737,
      "grad_norm": 0.07623326778411865,
      "learning_rate": 9.428162182165607e-05,
      "loss": 8.5138,
      "step": 33952,
      "throughput": 8882.40839640058
    },
    {
      "epoch": 0.5326687513225026,
      "grad_norm": 0.06985815614461899,
      "learning_rate": 9.40482395036772e-05,
      "loss": 8.5227,
      "step": 33984,
      "throughput": 8882.401640771297
    },
    {
      "epoch": 0.5331703226514314,
      "grad_norm": 0.07082241773605347,
      "learning_rate": 9.381514979096888e-05,
      "loss": 8.5024,
      "step": 34016,
      "throughput": 8882.42714656863
    },
    {
      "epoch": 0.5336718939803603,
      "grad_norm": 0.07866815477609634,
      "learning_rate": 9.35823536447938e-05,
      "loss": 8.5337,
      "step": 34048,
      "throughput": 8882.52549045798
    },
    {
      "epoch": 0.5341734653092892,
      "grad_norm": 0.07148824632167816,
      "learning_rate": 9.334985202520395e-05,
      "loss": 8.5005,
      "step": 34080,
      "throughput": 8882.623048916446
    },
    {
      "epoch": 0.5346750366382181,
      "grad_norm": 0.06872270256280899,
      "learning_rate": 9.311764589103679e-05,
      "loss": 8.5324,
      "step": 34112,
      "throughput": 8882.655170274242
    },
    {
      "epoch": 0.5351766079671471,
      "grad_norm": 0.07169391959905624,
      "learning_rate": 9.288573619991096e-05,
      "loss": 8.532,
      "step": 34144,
      "throughput": 8882.656679601589
    },
    {
      "epoch": 0.535678179296076,
      "grad_norm": 0.07784446328878403,
      "learning_rate": 9.265412390822278e-05,
      "loss": 8.5363,
      "step": 34176,
      "throughput": 8882.771757879587
    },
    {
      "epoch": 0.5361797506250049,
      "grad_norm": 0.07849624007940292,
      "learning_rate": 9.242280997114204e-05,
      "loss": 8.5078,
      "step": 34208,
      "throughput": 8882.852619223795
    },
    {
      "epoch": 0.5366813219539338,
      "grad_norm": 0.0793418139219284,
      "learning_rate": 9.219179534260811e-05,
      "loss": 8.5131,
      "step": 34240,
      "throughput": 8882.884299489215
    },
    {
      "epoch": 0.5371828932828627,
      "grad_norm": 0.07151144742965698,
      "learning_rate": 9.196108097532597e-05,
      "loss": 8.5116,
      "step": 34272,
      "throughput": 8882.89297451379
    },
    {
      "epoch": 0.5376844646117916,
      "grad_norm": 0.07799839228391647,
      "learning_rate": 9.173066782076236e-05,
      "loss": 8.5191,
      "step": 34304,
      "throughput": 8883.013899560407
    },
    {
      "epoch": 0.5381860359407206,
      "grad_norm": 0.06935375928878784,
      "learning_rate": 9.15005568291418e-05,
      "loss": 8.4986,
      "step": 34336,
      "throughput": 8883.143128934125
    },
    {
      "epoch": 0.5386876072696495,
      "grad_norm": 0.07601752877235413,
      "learning_rate": 9.12707489494428e-05,
      "loss": 8.4912,
      "step": 34368,
      "throughput": 8883.12263471243
    },
    {
      "epoch": 0.5391891785985784,
      "grad_norm": 0.07418935745954514,
      "learning_rate": 9.104124512939357e-05,
      "loss": 8.5373,
      "step": 34400,
      "throughput": 8883.126118539001
    },
    {
      "epoch": 0.5396907499275073,
      "grad_norm": 0.07352650910615921,
      "learning_rate": 9.081204631546867e-05,
      "loss": 8.5107,
      "step": 34432,
      "throughput": 8883.241844806826
    },
    {
      "epoch": 0.5401923212564361,
      "grad_norm": 0.07347220927476883,
      "learning_rate": 9.058315345288465e-05,
      "loss": 8.4956,
      "step": 34464,
      "throughput": 8883.34991937916
    },
    {
      "epoch": 0.540693892585365,
      "grad_norm": 0.07640855014324188,
      "learning_rate": 9.035456748559639e-05,
      "loss": 8.5371,
      "step": 34496,
      "throughput": 8883.324710473114
    },
    {
      "epoch": 0.541195463914294,
      "grad_norm": 0.10376151651144028,
      "learning_rate": 9.012628935629299e-05,
      "loss": 8.5067,
      "step": 34528,
      "throughput": 8883.330183627477
    },
    {
      "epoch": 0.5416970352432229,
      "grad_norm": 0.07584423571825027,
      "learning_rate": 8.989832000639424e-05,
      "loss": 8.4991,
      "step": 34560,
      "throughput": 8883.445808840017
    },
    {
      "epoch": 0.5421986065721518,
      "grad_norm": 0.06925126165151596,
      "learning_rate": 8.967066037604637e-05,
      "loss": 8.5198,
      "step": 34592,
      "throughput": 8883.562907732476
    },
    {
      "epoch": 0.5427001779010807,
      "grad_norm": 0.07632188498973846,
      "learning_rate": 8.944331140411841e-05,
      "loss": 8.525,
      "step": 34624,
      "throughput": 8883.549628252418
    },
    {
      "epoch": 0.5432017492300096,
      "grad_norm": 0.07406525313854218,
      "learning_rate": 8.921627402819813e-05,
      "loss": 8.5115,
      "step": 34656,
      "throughput": 8883.528618607796
    },
    {
      "epoch": 0.5437033205589386,
      "grad_norm": 0.07298589497804642,
      "learning_rate": 8.898954918458835e-05,
      "loss": 8.5207,
      "step": 34688,
      "throughput": 8883.647321597313
    },
    {
      "epoch": 0.5442048918878675,
      "grad_norm": 0.07894790172576904,
      "learning_rate": 8.876313780830305e-05,
      "loss": 8.5316,
      "step": 34720,
      "throughput": 8883.781552914974
    },
    {
      "epoch": 0.5447064632167964,
      "grad_norm": 0.0751766636967659,
      "learning_rate": 8.853704083306341e-05,
      "loss": 8.5235,
      "step": 34752,
      "throughput": 8883.768226796805
    },
    {
      "epoch": 0.5452080345457253,
      "grad_norm": 0.08003483712673187,
      "learning_rate": 8.831125919129397e-05,
      "loss": 8.5187,
      "step": 34784,
      "throughput": 8883.746416303493
    },
    {
      "epoch": 0.5457096058746542,
      "grad_norm": 0.07891687750816345,
      "learning_rate": 8.808579381411892e-05,
      "loss": 8.5174,
      "step": 34816,
      "throughput": 8883.877403875109
    },
    {
      "epoch": 0.5462111772035831,
      "grad_norm": 0.07358083128929138,
      "learning_rate": 8.786064563135815e-05,
      "loss": 8.5205,
      "step": 34848,
      "throughput": 8883.383841534293
    },
    {
      "epoch": 0.5467127485325121,
      "grad_norm": 0.0873100757598877,
      "learning_rate": 8.763581557152348e-05,
      "loss": 8.5185,
      "step": 34880,
      "throughput": 8883.393249342576
    },
    {
      "epoch": 0.5472143198614409,
      "grad_norm": 0.07261183112859726,
      "learning_rate": 8.741130456181463e-05,
      "loss": 8.5077,
      "step": 34912,
      "throughput": 8883.376127779211
    },
    {
      "epoch": 0.5477158911903698,
      "grad_norm": 0.0730830505490303,
      "learning_rate": 8.718711352811573e-05,
      "loss": 8.5139,
      "step": 34944,
      "throughput": 8883.498276316366
    },
    {
      "epoch": 0.5482174625192987,
      "grad_norm": 0.07217606902122498,
      "learning_rate": 8.696324339499135e-05,
      "loss": 8.5159,
      "step": 34976,
      "throughput": 8883.597327656475
    },
    {
      "epoch": 0.5487190338482276,
      "grad_norm": 0.08003465086221695,
      "learning_rate": 8.673969508568242e-05,
      "loss": 8.4899,
      "step": 35008,
      "throughput": 8883.574050139705
    },
    {
      "epoch": 0.5492206051771565,
      "grad_norm": 0.07030902057886124,
      "learning_rate": 8.651646952210293e-05,
      "loss": 8.5107,
      "step": 35040,
      "throughput": 8883.581419458513
    },
    {
      "epoch": 0.5497221765060855,
      "grad_norm": 0.07257349044084549,
      "learning_rate": 8.629356762483573e-05,
      "loss": 8.5134,
      "step": 35072,
      "throughput": 8883.695128678115
    },
    {
      "epoch": 0.5502237478350144,
      "grad_norm": 0.08085116744041443,
      "learning_rate": 8.607099031312901e-05,
      "loss": 8.5062,
      "step": 35104,
      "throughput": 8883.821992382886
    },
    {
      "epoch": 0.5507253191639433,
      "grad_norm": 0.08801715821027756,
      "learning_rate": 8.58487385048921e-05,
      "loss": 8.53,
      "step": 35136,
      "throughput": 8883.797216991694
    },
    {
      "epoch": 0.5512268904928722,
      "grad_norm": 0.07201190292835236,
      "learning_rate": 8.562681311669218e-05,
      "loss": 8.5199,
      "step": 35168,
      "throughput": 8883.765672377716
    },
    {
      "epoch": 0.5517284618218011,
      "grad_norm": 0.07491514086723328,
      "learning_rate": 8.540521506375026e-05,
      "loss": 8.512,
      "step": 35200,
      "throughput": 8883.881929345513
    },
    {
      "epoch": 0.55223003315073,
      "grad_norm": 0.07810447365045547,
      "learning_rate": 8.518394525993734e-05,
      "loss": 8.5059,
      "step": 35232,
      "throughput": 8883.995820434235
    },
    {
      "epoch": 0.552731604479659,
      "grad_norm": 0.07982519268989563,
      "learning_rate": 8.496300461777068e-05,
      "loss": 8.5158,
      "step": 35264,
      "throughput": 8883.975344967028
    },
    {
      "epoch": 0.5532331758085879,
      "grad_norm": 0.08146160840988159,
      "learning_rate": 8.474239404841023e-05,
      "loss": 8.5038,
      "step": 35296,
      "throughput": 8883.938716291977
    },
    {
      "epoch": 0.5537347471375168,
      "grad_norm": 0.0752020925283432,
      "learning_rate": 8.452211446165458e-05,
      "loss": 8.5064,
      "step": 35328,
      "throughput": 8884.057691728514
    },
    {
      "epoch": 0.5542363184664456,
      "grad_norm": 0.0742310956120491,
      "learning_rate": 8.430216676593744e-05,
      "loss": 8.5308,
      "step": 35360,
      "throughput": 8884.159179951805
    },
    {
      "epoch": 0.5547378897953745,
      "grad_norm": 0.07772351056337357,
      "learning_rate": 8.408255186832372e-05,
      "loss": 8.5202,
      "step": 35392,
      "throughput": 8884.131018453878
    },
    {
      "epoch": 0.5552394611243034,
      "grad_norm": 0.07519405335187912,
      "learning_rate": 8.386327067450593e-05,
      "loss": 8.4915,
      "step": 35424,
      "throughput": 8884.115394230439
    },
    {
      "epoch": 0.5557410324532324,
      "grad_norm": 0.07935325801372528,
      "learning_rate": 8.36443240888004e-05,
      "loss": 8.5052,
      "step": 35456,
      "throughput": 8884.23882340746
    },
    {
      "epoch": 0.5562426037821613,
      "grad_norm": 0.07572668790817261,
      "learning_rate": 8.342571301414342e-05,
      "loss": 8.5201,
      "step": 35488,
      "throughput": 8884.340679096225
    },
    {
      "epoch": 0.5567441751110902,
      "grad_norm": 0.08227578550577164,
      "learning_rate": 8.320743835208775e-05,
      "loss": 8.522,
      "step": 35520,
      "throughput": 8884.341695444704
    },
    {
      "epoch": 0.5572457464400191,
      "grad_norm": 0.07324469834566116,
      "learning_rate": 8.298950100279872e-05,
      "loss": 8.5111,
      "step": 35552,
      "throughput": 8884.332478967055
    },
    {
      "epoch": 0.557747317768948,
      "grad_norm": 0.08193643391132355,
      "learning_rate": 8.27719018650507e-05,
      "loss": 8.5238,
      "step": 35584,
      "throughput": 8884.452878142642
    },
    {
      "epoch": 0.558248889097877,
      "grad_norm": 0.10601537674665451,
      "learning_rate": 8.255464183622304e-05,
      "loss": 8.5241,
      "step": 35616,
      "throughput": 8884.556082756277
    },
    {
      "epoch": 0.5587504604268059,
      "grad_norm": 0.0735933780670166,
      "learning_rate": 8.23377218122968e-05,
      "loss": 8.5082,
      "step": 35648,
      "throughput": 8884.528473292614
    },
    {
      "epoch": 0.5592520317557348,
      "grad_norm": 0.06966464966535568,
      "learning_rate": 8.212114268785083e-05,
      "loss": 8.4966,
      "step": 35680,
      "throughput": 8884.53071339156
    },
    {
      "epoch": 0.5597536030846637,
      "grad_norm": 0.0795019343495369,
      "learning_rate": 8.190490535605809e-05,
      "loss": 8.4768,
      "step": 35712,
      "throughput": 8884.643559526203
    },
    {
      "epoch": 0.5602551744135926,
      "grad_norm": 0.07111742347478867,
      "learning_rate": 8.16890107086819e-05,
      "loss": 8.494,
      "step": 35744,
      "throughput": 8884.744796587207
    },
    {
      "epoch": 0.5607567457425215,
      "grad_norm": 0.07843293994665146,
      "learning_rate": 8.14734596360725e-05,
      "loss": 8.5043,
      "step": 35776,
      "throughput": 8884.730667694086
    },
    {
      "epoch": 0.5612583170714504,
      "grad_norm": 0.08138687163591385,
      "learning_rate": 8.12582530271631e-05,
      "loss": 8.5186,
      "step": 35808,
      "throughput": 8884.735111118021
    },
    {
      "epoch": 0.5617598884003793,
      "grad_norm": 0.07690315693616867,
      "learning_rate": 8.104339176946648e-05,
      "loss": 8.477,
      "step": 35840,
      "throughput": 8884.85119216147
    },
    {
      "epoch": 0.5622614597293082,
      "grad_norm": 0.0783621296286583,
      "learning_rate": 8.082887674907099e-05,
      "loss": 8.4963,
      "step": 35872,
      "throughput": 8884.941615967653
    },
    {
      "epoch": 0.5627630310582371,
      "grad_norm": 0.07450364530086517,
      "learning_rate": 8.061470885063726e-05,
      "loss": 8.5131,
      "step": 35904,
      "throughput": 8884.948427501336
    },
    {
      "epoch": 0.563264602387166,
      "grad_norm": 0.08089811354875565,
      "learning_rate": 8.040088895739433e-05,
      "loss": 8.5175,
      "step": 35936,
      "throughput": 8884.96577940189
    },
    {
      "epoch": 0.5637661737160949,
      "grad_norm": 0.07151561975479126,
      "learning_rate": 8.018741795113614e-05,
      "loss": 8.5038,
      "step": 35968,
      "throughput": 8885.047550193507
    },
    {
      "epoch": 0.5642677450450239,
      "grad_norm": 0.07573382556438446,
      "learning_rate": 7.997429671221764e-05,
      "loss": 8.5013,
      "step": 36000,
      "throughput": 8885.163958594867
    },
    {
      "epoch": 0.5647693163739528,
      "grad_norm": 0.08782751113176346,
      "learning_rate": 7.97615261195515e-05,
      "loss": 8.5074,
      "step": 36032,
      "throughput": 8885.143885689009
    },
    {
      "epoch": 0.5652708877028817,
      "grad_norm": 0.07787059992551804,
      "learning_rate": 7.95491070506043e-05,
      "loss": 8.5219,
      "step": 36064,
      "throughput": 8885.151103577797
    },
    {
      "epoch": 0.5657724590318106,
      "grad_norm": 0.07755567133426666,
      "learning_rate": 7.933704038139292e-05,
      "loss": 8.4863,
      "step": 36096,
      "throughput": 8885.227813447
    },
    {
      "epoch": 0.5662740303607395,
      "grad_norm": 0.0700237974524498,
      "learning_rate": 7.912532698648089e-05,
      "loss": 8.4916,
      "step": 36128,
      "throughput": 8885.333697137732
    },
    {
      "epoch": 0.5667756016896685,
      "grad_norm": 0.071931391954422,
      "learning_rate": 7.891396773897487e-05,
      "loss": 8.4837,
      "step": 36160,
      "throughput": 8885.319873634005
    },
    {
      "epoch": 0.5672771730185974,
      "grad_norm": 0.09684596210718155,
      "learning_rate": 7.870296351052104e-05,
      "loss": 8.479,
      "step": 36192,
      "throughput": 8885.345592323565
    },
    {
      "epoch": 0.5677787443475263,
      "grad_norm": 0.07962139695882797,
      "learning_rate": 7.849231517130151e-05,
      "loss": 8.493,
      "step": 36224,
      "throughput": 8885.403833978344
    },
    {
      "epoch": 0.5682803156764551,
      "grad_norm": 0.12747646868228912,
      "learning_rate": 7.828202359003058e-05,
      "loss": 8.4928,
      "step": 36256,
      "throughput": 8885.521391567749
    },
    {
      "epoch": 0.568781887005384,
      "grad_norm": 0.07479926198720932,
      "learning_rate": 7.807208963395139e-05,
      "loss": 8.4844,
      "step": 36288,
      "throughput": 8885.518655107615
    },
    {
      "epoch": 0.5692834583343129,
      "grad_norm": 0.07229211181402206,
      "learning_rate": 7.786251416883218e-05,
      "loss": 8.4924,
      "step": 36320,
      "throughput": 8885.550054180885
    },
    {
      "epoch": 0.5697850296632418,
      "grad_norm": 0.07093744724988937,
      "learning_rate": 7.765329805896287e-05,
      "loss": 8.5047,
      "step": 36352,
      "throughput": 8885.623178166292
    },
    {
      "epoch": 0.5702866009921708,
      "grad_norm": 0.07500205934047699,
      "learning_rate": 7.744444216715117e-05,
      "loss": 8.5033,
      "step": 36384,
      "throughput": 8885.710608474703
    },
    {
      "epoch": 0.5707881723210997,
      "grad_norm": 0.08139976114034653,
      "learning_rate": 7.723594735471952e-05,
      "loss": 8.5042,
      "step": 36416,
      "throughput": 8885.703183113117
    },
    {
      "epoch": 0.5712897436500286,
      "grad_norm": 0.07652562111616135,
      "learning_rate": 7.702781448150109e-05,
      "loss": 8.4971,
      "step": 36448,
      "throughput": 8885.708370268474
    },
    {
      "epoch": 0.5717913149789575,
      "grad_norm": 0.07908913493156433,
      "learning_rate": 7.682004440583654e-05,
      "loss": 8.4896,
      "step": 36480,
      "throughput": 8885.771023556996
    },
    {
      "epoch": 0.5722928863078864,
      "grad_norm": 0.0737721249461174,
      "learning_rate": 7.661263798457014e-05,
      "loss": 8.4904,
      "step": 36512,
      "throughput": 8885.893284596656
    },
    {
      "epoch": 0.5727944576368154,
      "grad_norm": 0.07399065047502518,
      "learning_rate": 7.64055960730467e-05,
      "loss": 8.4715,
      "step": 36544,
      "throughput": 8885.89665086395
    },
    {
      "epoch": 0.5732960289657443,
      "grad_norm": 0.07493384927511215,
      "learning_rate": 7.619891952510763e-05,
      "loss": 8.5003,
      "step": 36576,
      "throughput": 8885.881908155181
    },
    {
      "epoch": 0.5737976002946732,
      "grad_norm": 0.08065405488014221,
      "learning_rate": 7.599260919308764e-05,
      "loss": 8.4902,
      "step": 36608,
      "throughput": 8885.931565297993
    },
    {
      "epoch": 0.5742991716236021,
      "grad_norm": 0.07617328315973282,
      "learning_rate": 7.578666592781114e-05,
      "loss": 8.4989,
      "step": 36640,
      "throughput": 8886.04701726584
    },
    {
      "epoch": 0.574800742952531,
      "grad_norm": 0.07760798931121826,
      "learning_rate": 7.558109057858874e-05,
      "loss": 8.478,
      "step": 36672,
      "throughput": 8886.097106253352
    },
    {
      "epoch": 0.5753023142814598,
      "grad_norm": 0.07901562750339508,
      "learning_rate": 7.53758839932139e-05,
      "loss": 8.4934,
      "step": 36704,
      "throughput": 8886.058945302642
    },
    {
      "epoch": 0.5758038856103888,
      "grad_norm": 0.07319517433643341,
      "learning_rate": 7.517104701795905e-05,
      "loss": 8.4736,
      "step": 36736,
      "throughput": 8886.096675370176
    },
    {
      "epoch": 0.5763054569393177,
      "grad_norm": 0.07419974356889725,
      "learning_rate": 7.496658049757255e-05,
      "loss": 8.4854,
      "step": 36768,
      "throughput": 8886.215806089569
    },
    {
      "epoch": 0.5768070282682466,
      "grad_norm": 0.08911358565092087,
      "learning_rate": 7.476248527527492e-05,
      "loss": 8.4836,
      "step": 36800,
      "throughput": 8886.26092789472
    },
    {
      "epoch": 0.5773085995971755,
      "grad_norm": 0.07458078861236572,
      "learning_rate": 7.455876219275552e-05,
      "loss": 8.4766,
      "step": 36832,
      "throughput": 8886.227500957024
    },
    {
      "epoch": 0.5778101709261044,
      "grad_norm": 0.07872738689184189,
      "learning_rate": 7.435541209016885e-05,
      "loss": 8.5004,
      "step": 36864,
      "throughput": 8886.277084685154
    },
    {
      "epoch": 0.5783117422550333,
      "grad_norm": 0.07543252408504486,
      "learning_rate": 7.415243580613134e-05,
      "loss": 8.4784,
      "step": 36896,
      "throughput": 8885.820176886395
    },
    {
      "epoch": 0.5788133135839623,
      "grad_norm": 0.06923045217990875,
      "learning_rate": 7.394983417771791e-05,
      "loss": 8.4983,
      "step": 36928,
      "throughput": 8885.825564564371
    },
    {
      "epoch": 0.5793148849128912,
      "grad_norm": 0.08396708220243454,
      "learning_rate": 7.374760804045815e-05,
      "loss": 8.5045,
      "step": 36960,
      "throughput": 8885.798962338915
    },
    {
      "epoch": 0.5798164562418201,
      "grad_norm": 0.07503468543291092,
      "learning_rate": 7.354575822833331e-05,
      "loss": 8.4812,
      "step": 36992,
      "throughput": 8885.86703467302
    },
    {
      "epoch": 0.580318027570749,
      "grad_norm": 0.08438971638679504,
      "learning_rate": 7.334428557377258e-05,
      "loss": 8.5011,
      "step": 37024,
      "throughput": 8885.979098155063
    },
    {
      "epoch": 0.5808195988996779,
      "grad_norm": 0.07898327708244324,
      "learning_rate": 7.314319090764985e-05,
      "loss": 8.4963,
      "step": 37056,
      "throughput": 8886.007212733173
    },
    {
      "epoch": 0.5813211702286069,
      "grad_norm": 0.07942887395620346,
      "learning_rate": 7.294247505928003e-05,
      "loss": 8.4917,
      "step": 37088,
      "throughput": 8885.973764510194
    },
    {
      "epoch": 0.5818227415575358,
      "grad_norm": 0.08021794259548187,
      "learning_rate": 7.274213885641592e-05,
      "loss": 8.482,
      "step": 37120,
      "throughput": 8886.027117021331
    },
    {
      "epoch": 0.5823243128864646,
      "grad_norm": 0.07906454056501389,
      "learning_rate": 7.254218312524461e-05,
      "loss": 8.4884,
      "step": 37152,
      "throughput": 8886.159000057156
    },
    {
      "epoch": 0.5828258842153935,
      "grad_norm": 0.071269690990448,
      "learning_rate": 7.234260869038417e-05,
      "loss": 8.5017,
      "step": 37184,
      "throughput": 8886.220635883392
    },
    {
      "epoch": 0.5833274555443224,
      "grad_norm": 0.07759478688240051,
      "learning_rate": 7.214341637488007e-05,
      "loss": 8.5069,
      "step": 37216,
      "throughput": 8886.22728503575
    },
    {
      "epoch": 0.5838290268732513,
      "grad_norm": 0.07550026476383209,
      "learning_rate": 7.194460700020206e-05,
      "loss": 8.4778,
      "step": 37248,
      "throughput": 8886.241364373975
    },
    {
      "epoch": 0.5843305982021803,
      "grad_norm": 0.07170677930116653,
      "learning_rate": 7.174618138624058e-05,
      "loss": 8.4978,
      "step": 37280,
      "throughput": 8886.359489058588
    },
    {
      "epoch": 0.5848321695311092,
      "grad_norm": 0.08358849585056305,
      "learning_rate": 7.154814035130351e-05,
      "loss": 8.4932,
      "step": 37312,
      "throughput": 8886.402678647795
    },
    {
      "epoch": 0.5853337408600381,
      "grad_norm": 0.0761912614107132,
      "learning_rate": 7.135048471211257e-05,
      "loss": 8.4841,
      "step": 37344,
      "throughput": 8886.419195436345
    },
    {
      "epoch": 0.585835312188967,
      "grad_norm": 0.07179554551839828,
      "learning_rate": 7.115321528380024e-05,
      "loss": 8.5062,
      "step": 37376,
      "throughput": 8886.407493711495
    },
    {
      "epoch": 0.5863368835178959,
      "grad_norm": 0.10507599264383316,
      "learning_rate": 7.095633287990622e-05,
      "loss": 8.4877,
      "step": 37408,
      "throughput": 8886.557576944077
    },
    {
      "epoch": 0.5868384548468248,
      "grad_norm": 0.08583709597587585,
      "learning_rate": 7.075983831237421e-05,
      "loss": 8.4672,
      "step": 37440,
      "throughput": 8886.628687314525
    },
    {
      "epoch": 0.5873400261757538,
      "grad_norm": 0.07975829392671585,
      "learning_rate": 7.056373239154826e-05,
      "loss": 8.4819,
      "step": 37472,
      "throughput": 8886.650496913231
    },
    {
      "epoch": 0.5878415975046827,
      "grad_norm": 0.07645593583583832,
      "learning_rate": 7.036801592616982e-05,
      "loss": 8.4568,
      "step": 37504,
      "throughput": 8886.638514566923
    },
    {
      "epoch": 0.5883431688336116,
      "grad_norm": 0.07267199456691742,
      "learning_rate": 7.017268972337419e-05,
      "loss": 8.458,
      "step": 37536,
      "throughput": 8886.790797617663
    },
    {
      "epoch": 0.5888447401625405,
      "grad_norm": 0.07526517659425735,
      "learning_rate": 6.997775458868724e-05,
      "loss": 8.4878,
      "step": 37568,
      "throughput": 8886.852398981955
    },
    {
      "epoch": 0.5893463114914693,
      "grad_norm": 0.07897040247917175,
      "learning_rate": 6.978321132602197e-05,
      "loss": 8.4842,
      "step": 37600,
      "throughput": 8886.843188088478
    },
    {
      "epoch": 0.5898478828203982,
      "grad_norm": 0.08200077712535858,
      "learning_rate": 6.95890607376754e-05,
      "loss": 8.4688,
      "step": 37632,
      "throughput": 8886.852773489969
    },
    {
      "epoch": 0.5903494541493272,
      "grad_norm": 0.07454876601696014,
      "learning_rate": 6.939530362432513e-05,
      "loss": 8.4862,
      "step": 37664,
      "throughput": 8887.001803979198
    },
    {
      "epoch": 0.5908510254782561,
      "grad_norm": 0.08084976673126221,
      "learning_rate": 6.920194078502611e-05,
      "loss": 8.4805,
      "step": 37696,
      "throughput": 8887.072771566227
    },
    {
      "epoch": 0.591352596807185,
      "grad_norm": 0.07187534868717194,
      "learning_rate": 6.900897301720721e-05,
      "loss": 8.4855,
      "step": 37728,
      "throughput": 8887.058230712222
    },
    {
      "epoch": 0.5918541681361139,
      "grad_norm": 0.06797279417514801,
      "learning_rate": 6.881640111666807e-05,
      "loss": 8.5012,
      "step": 37760,
      "throughput": 8887.052989800553
    },
    {
      "epoch": 0.5923557394650428,
      "grad_norm": 0.07955506443977356,
      "learning_rate": 6.862422587757581e-05,
      "loss": 8.4873,
      "step": 37792,
      "throughput": 8887.19590824548
    },
    {
      "epoch": 0.5928573107939717,
      "grad_norm": 0.07935384660959244,
      "learning_rate": 6.843244809246173e-05,
      "loss": 8.5038,
      "step": 37824,
      "throughput": 8887.268535404251
    },
    {
      "epoch": 0.5933588821229007,
      "grad_norm": 0.07026589661836624,
      "learning_rate": 6.824106855221788e-05,
      "loss": 8.485,
      "step": 37856,
      "throughput": 8887.246180262757
    },
    {
      "epoch": 0.5938604534518296,
      "grad_norm": 0.07547298073768616,
      "learning_rate": 6.805008804609411e-05,
      "loss": 8.48,
      "step": 37888,
      "throughput": 8887.253621811145
    },
    {
      "epoch": 0.5943620247807585,
      "grad_norm": 0.08095958828926086,
      "learning_rate": 6.78595073616946e-05,
      "loss": 8.476,
      "step": 37920,
      "throughput": 8887.366914831318
    },
    {
      "epoch": 0.5948635961096874,
      "grad_norm": 0.08343052119016647,
      "learning_rate": 6.766932728497468e-05,
      "loss": 8.501,
      "step": 37952,
      "throughput": 8887.442337101993
    },
    {
      "epoch": 0.5953651674386163,
      "grad_norm": 0.07000665366649628,
      "learning_rate": 6.747954860023746e-05,
      "loss": 8.5003,
      "step": 37984,
      "throughput": 8887.4245648121
    },
    {
      "epoch": 0.5958667387675451,
      "grad_norm": 0.07254104316234589,
      "learning_rate": 6.729017209013086e-05,
      "loss": 8.4894,
      "step": 38016,
      "throughput": 8887.438706603321
    },
    {
      "epoch": 0.5963683100964741,
      "grad_norm": 0.07622817903757095,
      "learning_rate": 6.710119853564422e-05,
      "loss": 8.495,
      "step": 38048,
      "throughput": 8887.558373040401
    },
    {
      "epoch": 0.596869881425403,
      "grad_norm": 0.09238285571336746,
      "learning_rate": 6.69126287161049e-05,
      "loss": 8.5041,
      "step": 38080,
      "throughput": 8887.620834143852
    },
    {
      "epoch": 0.5973714527543319,
      "grad_norm": 0.08314980566501617,
      "learning_rate": 6.672446340917553e-05,
      "loss": 8.4884,
      "step": 38112,
      "throughput": 8887.602805809758
    },
    {
      "epoch": 0.5978730240832608,
      "grad_norm": 0.08117211610078812,
      "learning_rate": 6.653670339085031e-05,
      "loss": 8.4792,
      "step": 38144,
      "throughput": 8887.628870528435
    },
    {
      "epoch": 0.5983745954121897,
      "grad_norm": 0.0752682313323021,
      "learning_rate": 6.634934943545217e-05,
      "loss": 8.4863,
      "step": 38176,
      "throughput": 8887.739052589919
    },
    {
      "epoch": 0.5988761667411187,
      "grad_norm": 0.07365549355745316,
      "learning_rate": 6.616240231562933e-05,
      "loss": 8.4621,
      "step": 38208,
      "throughput": 8887.788441557968
    },
    {
      "epoch": 0.5993777380700476,
      "grad_norm": 0.0774432122707367,
      "learning_rate": 6.597586280235227e-05,
      "loss": 8.491,
      "step": 38240,
      "throughput": 8887.777448927698
    },
    {
      "epoch": 0.5998793093989765,
      "grad_norm": 0.08132058382034302,
      "learning_rate": 6.578973166491053e-05,
      "loss": 8.475,
      "step": 38272,
      "throughput": 8887.796764760067
    },
    {
      "epoch": 0.6003808807279054,
      "grad_norm": 0.07279162108898163,
      "learning_rate": 6.560400967090948e-05,
      "loss": 8.4748,
      "step": 38304,
      "throughput": 8887.91410213565
    },
    {
      "epoch": 0.6008824520568343,
      "grad_norm": 0.07235167175531387,
      "learning_rate": 6.54186975862671e-05,
      "loss": 8.4689,
      "step": 38336,
      "throughput": 8887.977423903028
    },
    {
      "epoch": 0.6013840233857632,
      "grad_norm": 0.07213466614484787,
      "learning_rate": 6.523379617521104e-05,
      "loss": 8.4697,
      "step": 38368,
      "throughput": 8887.960699058545
    },
    {
      "epoch": 0.6018855947146922,
      "grad_norm": 0.0761331170797348,
      "learning_rate": 6.504930620027524e-05,
      "loss": 8.471,
      "step": 38400,
      "throughput": 8887.9758169686
    },
    {
      "epoch": 0.6023871660436211,
      "grad_norm": 0.07972035557031631,
      "learning_rate": 6.486522842229692e-05,
      "loss": 8.4779,
      "step": 38432,
      "throughput": 8888.093491518159
    },
    {
      "epoch": 0.6028887373725499,
      "grad_norm": 0.07006843388080597,
      "learning_rate": 6.468156360041337e-05,
      "loss": 8.4802,
      "step": 38464,
      "throughput": 8888.155714870378
    },
    {
      "epoch": 0.6033903087014788,
      "grad_norm": 0.07934440672397614,
      "learning_rate": 6.449831249205887e-05,
      "loss": 8.4662,
      "step": 38496,
      "throughput": 8888.122112787783
    },
    {
      "epoch": 0.6038918800304077,
      "grad_norm": 0.07978641241788864,
      "learning_rate": 6.431547585296156e-05,
      "loss": 8.4644,
      "step": 38528,
      "throughput": 8888.136088222396
    },
    {
      "epoch": 0.6043934513593366,
      "grad_norm": 0.08016802370548248,
      "learning_rate": 6.413305443714022e-05,
      "loss": 8.4664,
      "step": 38560,
      "throughput": 8888.255943168042
    },
    {
      "epoch": 0.6048950226882656,
      "grad_norm": 0.0753985047340393,
      "learning_rate": 6.395104899690134e-05,
      "loss": 8.4689,
      "step": 38592,
      "throughput": 8888.334236582781
    },
    {
      "epoch": 0.6053965940171945,
      "grad_norm": 0.08145113289356232,
      "learning_rate": 6.37694602828359e-05,
      "loss": 8.4775,
      "step": 38624,
      "throughput": 8888.327950078981
    },
    {
      "epoch": 0.6058981653461234,
      "grad_norm": 0.07525403797626495,
      "learning_rate": 6.358828904381632e-05,
      "loss": 8.4654,
      "step": 38656,
      "throughput": 8888.313105008327
    },
    {
      "epoch": 0.6063997366750523,
      "grad_norm": 0.08340601623058319,
      "learning_rate": 6.340753602699327e-05,
      "loss": 8.4894,
      "step": 38688,
      "throughput": 8888.417408381963
    },
    {
      "epoch": 0.6069013080039812,
      "grad_norm": 0.09112949669361115,
      "learning_rate": 6.322720197779275e-05,
      "loss": 8.4926,
      "step": 38720,
      "throughput": 8888.502338303459
    },
    {
      "epoch": 0.6074028793329102,
      "grad_norm": 0.07264062762260437,
      "learning_rate": 6.304728763991291e-05,
      "loss": 8.4727,
      "step": 38752,
      "throughput": 8888.501994446655
    },
    {
      "epoch": 0.6079044506618391,
      "grad_norm": 0.07016344368457794,
      "learning_rate": 6.286779375532107e-05,
      "loss": 8.4899,
      "step": 38784,
      "throughput": 8888.494497272626
    },
    {
      "epoch": 0.608406021990768,
      "grad_norm": 0.0778326466679573,
      "learning_rate": 6.268872106425044e-05,
      "loss": 8.4491,
      "step": 38816,
      "throughput": 8888.587600647677
    },
    {
      "epoch": 0.6089075933196969,
      "grad_norm": 0.06960074603557587,
      "learning_rate": 6.25100703051974e-05,
      "loss": 8.4596,
      "step": 38848,
      "throughput": 8888.669944429052
    },
    {
      "epoch": 0.6094091646486258,
      "grad_norm": 0.07572120428085327,
      "learning_rate": 6.233184221491818e-05,
      "loss": 8.4787,
      "step": 38880,
      "throughput": 8888.687934410838
    },
    {
      "epoch": 0.6099107359775546,
      "grad_norm": 0.0762973502278328,
      "learning_rate": 6.2154037528426e-05,
      "loss": 8.4591,
      "step": 38912,
      "throughput": 8888.677327994048
    },
    {
      "epoch": 0.6104123073064835,
      "grad_norm": 0.08137981593608856,
      "learning_rate": 6.197665697898784e-05,
      "loss": 8.4705,
      "step": 38944,
      "throughput": 8888.243099346064
    },
    {
      "epoch": 0.6109138786354125,
      "grad_norm": 0.08791758865118027,
      "learning_rate": 6.179970129812166e-05,
      "loss": 8.4666,
      "step": 38976,
      "throughput": 8888.328760115613
    },
    {
      "epoch": 0.6114154499643414,
      "grad_norm": 0.07459740340709686,
      "learning_rate": 6.16231712155932e-05,
      "loss": 8.4809,
      "step": 39008,
      "throughput": 8888.300833774185
    },
    {
      "epoch": 0.6119170212932703,
      "grad_norm": 0.08468503504991531,
      "learning_rate": 6.144706745941308e-05,
      "loss": 8.4617,
      "step": 39040,
      "throughput": 8888.288480514897
    },
    {
      "epoch": 0.6124185926221992,
      "grad_norm": 0.0737047791481018,
      "learning_rate": 6.127139075583363e-05,
      "loss": 8.4671,
      "step": 39072,
      "throughput": 8888.391755677107
    },
    {
      "epoch": 0.6129201639511281,
      "grad_norm": 0.07463378459215164,
      "learning_rate": 6.109614182934616e-05,
      "loss": 8.4668,
      "step": 39104,
      "throughput": 8888.47370055962
    },
    {
      "epoch": 0.6134217352800571,
      "grad_norm": 0.07298613339662552,
      "learning_rate": 6.092132140267775e-05,
      "loss": 8.4549,
      "step": 39136,
      "throughput": 8888.460213971857
    },
    {
      "epoch": 0.613923306608986,
      "grad_norm": 0.09070611000061035,
      "learning_rate": 6.074693019678839e-05,
      "loss": 8.4759,
      "step": 39168,
      "throughput": 8888.464936901131
    },
    {
      "epoch": 0.6144248779379149,
      "grad_norm": 0.07279788702726364,
      "learning_rate": 6.0572968930867827e-05,
      "loss": 8.4575,
      "step": 39200,
      "throughput": 8888.570133881885
    },
    {
      "epoch": 0.6149264492668438,
      "grad_norm": 0.08355917036533356,
      "learning_rate": 6.039943832233293e-05,
      "loss": 8.4659,
      "step": 39232,
      "throughput": 8888.641314332495
    },
    {
      "epoch": 0.6154280205957727,
      "grad_norm": 0.07693200558423996,
      "learning_rate": 6.022633908682442e-05,
      "loss": 8.4661,
      "step": 39264,
      "throughput": 8888.600508933388
    },
    {
      "epoch": 0.6159295919247016,
      "grad_norm": 0.08254023641347885,
      "learning_rate": 6.005367193820408e-05,
      "loss": 8.4609,
      "step": 39296,
      "throughput": 8888.611166639346
    },
    {
      "epoch": 0.6164311632536306,
      "grad_norm": 0.07168231159448624,
      "learning_rate": 5.9881437588551675e-05,
      "loss": 8.4665,
      "step": 39328,
      "throughput": 8888.713888915134
    },
    {
      "epoch": 0.6169327345825594,
      "grad_norm": 0.08645154535770416,
      "learning_rate": 5.970963674816224e-05,
      "loss": 8.4573,
      "step": 39360,
      "throughput": 8888.808883024485
    },
    {
      "epoch": 0.6174343059114883,
      "grad_norm": 0.0723043829202652,
      "learning_rate": 5.953827012554291e-05,
      "loss": 8.4695,
      "step": 39392,
      "throughput": 8888.755871735146
    },
    {
      "epoch": 0.6179358772404172,
      "grad_norm": 0.08425875008106232,
      "learning_rate": 5.9367338427410197e-05,
      "loss": 8.4644,
      "step": 39424,
      "throughput": 8888.740088349667
    },
    {
      "epoch": 0.6184374485693461,
      "grad_norm": 0.10152871906757355,
      "learning_rate": 5.9196842358686866e-05,
      "loss": 8.4703,
      "step": 39456,
      "throughput": 8888.853319781223
    },
    {
      "epoch": 0.618939019898275,
      "grad_norm": 0.07223829627037048,
      "learning_rate": 5.902678262249923e-05,
      "loss": 8.4706,
      "step": 39488,
      "throughput": 8888.957989404042
    },
    {
      "epoch": 0.619440591227204,
      "grad_norm": 0.0781422033905983,
      "learning_rate": 5.885715992017419e-05,
      "loss": 8.4563,
      "step": 39520,
      "throughput": 8888.928558727486
    },
    {
      "epoch": 0.6199421625561329,
      "grad_norm": 0.07951901853084564,
      "learning_rate": 5.86879749512362e-05,
      "loss": 8.4608,
      "step": 39552,
      "throughput": 8888.908106562007
    },
    {
      "epoch": 0.6204437338850618,
      "grad_norm": 0.07194496691226959,
      "learning_rate": 5.851922841340461e-05,
      "loss": 8.4655,
      "step": 39584,
      "throughput": 8889.00558454673
    },
    {
      "epoch": 0.6209453052139907,
      "grad_norm": 0.07164505869150162,
      "learning_rate": 5.835092100259063e-05,
      "loss": 8.4528,
      "step": 39616,
      "throughput": 8889.105516908532
    },
    {
      "epoch": 0.6214468765429196,
      "grad_norm": 0.07270708680152893,
      "learning_rate": 5.818305341289458e-05,
      "loss": 8.4718,
      "step": 39648,
      "throughput": 8889.086944760817
    },
    {
      "epoch": 0.6219484478718486,
      "grad_norm": 0.08275506645441055,
      "learning_rate": 5.8015626336602814e-05,
      "loss": 8.4544,
      "step": 39680,
      "throughput": 8889.103020213392
    },
    {
      "epoch": 0.6224500192007775,
      "grad_norm": 0.07898057997226715,
      "learning_rate": 5.7848640464185124e-05,
      "loss": 8.4759,
      "step": 39712,
      "throughput": 8889.179327476135
    },
    {
      "epoch": 0.6229515905297064,
      "grad_norm": 0.07599367201328278,
      "learning_rate": 5.768209648429174e-05,
      "loss": 8.4706,
      "step": 39744,
      "throughput": 8889.26222452118
    },
    {
      "epoch": 0.6234531618586353,
      "grad_norm": 0.07903989404439926,
      "learning_rate": 5.751599508375059e-05,
      "loss": 8.4711,
      "step": 39776,
      "throughput": 8889.247412514856
    },
    {
      "epoch": 0.6239547331875641,
      "grad_norm": 0.07619236409664154,
      "learning_rate": 5.735033694756423e-05,
      "loss": 8.4552,
      "step": 39808,
      "throughput": 8889.229682914522
    },
    {
      "epoch": 0.624456304516493,
      "grad_norm": 0.07185118645429611,
      "learning_rate": 5.718512275890737e-05,
      "loss": 8.4543,
      "step": 39840,
      "throughput": 8889.34296320931
    },
    {
      "epoch": 0.624957875845422,
      "grad_norm": 0.07389679551124573,
      "learning_rate": 5.70203531991238e-05,
      "loss": 8.4598,
      "step": 39872,
      "throughput": 8889.437729760273
    },
    {
      "epoch": 0.6254594471743509,
      "grad_norm": 0.08193694055080414,
      "learning_rate": 5.6856028947723734e-05,
      "loss": 8.4619,
      "step": 39904,
      "throughput": 8889.412655528009
    },
    {
      "epoch": 0.6259610185032798,
      "grad_norm": 0.0743367001414299,
      "learning_rate": 5.669215068238075e-05,
      "loss": 8.4472,
      "step": 39936,
      "throughput": 8889.431497923688
    },
    {
      "epoch": 0.6264625898322087,
      "grad_norm": 0.07348310202360153,
      "learning_rate": 5.652871907892934e-05,
      "loss": 8.4661,
      "step": 39968,
      "throughput": 8889.521056001866
    },
    {
      "epoch": 0.6269641611611376,
      "grad_norm": 0.07830830663442612,
      "learning_rate": 5.6365734811362026e-05,
      "loss": 8.4629,
      "step": 40000,
      "throughput": 8889.62522923375
    },
    {
      "epoch": 0.6274657324900665,
      "grad_norm": 0.08075550198554993,
      "learning_rate": 5.620319855182629e-05,
      "loss": 8.4526,
      "step": 40032,
      "throughput": 8889.585173448973
    },
    {
      "epoch": 0.6279673038189955,
      "grad_norm": 0.0775223821401596,
      "learning_rate": 5.60411109706222e-05,
      "loss": 8.4293,
      "step": 40064,
      "throughput": 8889.569407062347
    },
    {
      "epoch": 0.6284688751479244,
      "grad_norm": 0.07300705462694168,
      "learning_rate": 5.587947273619938e-05,
      "loss": 8.4419,
      "step": 40096,
      "throughput": 8889.694101400937
    },
    {
      "epoch": 0.6289704464768533,
      "grad_norm": 0.07574678957462311,
      "learning_rate": 5.5718284515154476e-05,
      "loss": 8.4488,
      "step": 40128,
      "throughput": 8889.792966352983
    },
    {
      "epoch": 0.6294720178057822,
      "grad_norm": 0.07535702735185623,
      "learning_rate": 5.5557546972228114e-05,
      "loss": 8.4759,
      "step": 40160,
      "throughput": 8889.779889465628
    },
    {
      "epoch": 0.6299735891347111,
      "grad_norm": 0.08066742867231369,
      "learning_rate": 5.539726077030239e-05,
      "loss": 8.4689,
      "step": 40192,
      "throughput": 8889.748342331195
    },
    {
      "epoch": 0.63047516046364,
      "grad_norm": 0.07345551997423172,
      "learning_rate": 5.523742657039809e-05,
      "loss": 8.4555,
      "step": 40224,
      "throughput": 8889.869626650914
    },
    {
      "epoch": 0.6309767317925689,
      "grad_norm": 0.07321788370609283,
      "learning_rate": 5.5078045031672005e-05,
      "loss": 8.4845,
      "step": 40256,
      "throughput": 8889.95961121878
    },
    {
      "epoch": 0.6314783031214978,
      "grad_norm": 0.06991337984800339,
      "learning_rate": 5.491911681141394e-05,
      "loss": 8.4488,
      "step": 40288,
      "throughput": 8889.946034390981
    },
    {
      "epoch": 0.6319798744504267,
      "grad_norm": 0.07396293431520462,
      "learning_rate": 5.476064256504443e-05,
      "loss": 8.487,
      "step": 40320,
      "throughput": 8889.93125361081
    },
    {
      "epoch": 0.6324814457793556,
      "grad_norm": 0.07787778973579407,
      "learning_rate": 5.460262294611172e-05,
      "loss": 8.471,
      "step": 40352,
      "throughput": 8890.03715925217
    },
    {
      "epoch": 0.6329830171082845,
      "grad_norm": 0.0852857232093811,
      "learning_rate": 5.444505860628923e-05,
      "loss": 8.4354,
      "step": 40384,
      "throughput": 8890.129414989433
    },
    {
      "epoch": 0.6334845884372134,
      "grad_norm": 0.07648869603872299,
      "learning_rate": 5.428795019537268e-05,
      "loss": 8.4452,
      "step": 40416,
      "throughput": 8890.106046954319
    },
    {
      "epoch": 0.6339861597661424,
      "grad_norm": 0.07862301170825958,
      "learning_rate": 5.413129836127766e-05,
      "loss": 8.4576,
      "step": 40448,
      "throughput": 8890.111296393472
    },
    {
      "epoch": 0.6344877310950713,
      "grad_norm": 0.07297796756029129,
      "learning_rate": 5.3975103750036805e-05,
      "loss": 8.4432,
      "step": 40480,
      "throughput": 8890.172831193517
    },
    {
      "epoch": 0.6349893024240002,
      "grad_norm": 0.08166255801916122,
      "learning_rate": 5.3819367005797186e-05,
      "loss": 8.4398,
      "step": 40512,
      "throughput": 8890.25577196834
    },
    {
      "epoch": 0.6354908737529291,
      "grad_norm": 0.07448875904083252,
      "learning_rate": 5.366408877081752e-05,
      "loss": 8.4707,
      "step": 40544,
      "throughput": 8890.241049888195
    },
    {
      "epoch": 0.635992445081858,
      "grad_norm": 0.07577415555715561,
      "learning_rate": 5.3509269685465764e-05,
      "loss": 8.4439,
      "step": 40576,
      "throughput": 8890.287344628367
    },
    {
      "epoch": 0.636494016410787,
      "grad_norm": 0.07667967677116394,
      "learning_rate": 5.3354910388216274e-05,
      "loss": 8.4611,
      "step": 40608,
      "throughput": 8890.303070975244
    },
    {
      "epoch": 0.6369955877397159,
      "grad_norm": 0.07609532028436661,
      "learning_rate": 5.3201011515647276e-05,
      "loss": 8.4667,
      "step": 40640,
      "throughput": 8890.394540219058
    },
    {
      "epoch": 0.6374971590686448,
      "grad_norm": 0.07594712823629379,
      "learning_rate": 5.304757370243811e-05,
      "loss": 8.4443,
      "step": 40672,
      "throughput": 8890.380144757337
    },
    {
      "epoch": 0.6379987303975736,
      "grad_norm": 0.07783151417970657,
      "learning_rate": 5.2894597581366835e-05,
      "loss": 8.4683,
      "step": 40704,
      "throughput": 8890.426500048565
    },
    {
      "epoch": 0.6385003017265025,
      "grad_norm": 0.08565002679824829,
      "learning_rate": 5.274208378330737e-05,
      "loss": 8.467,
      "step": 40736,
      "throughput": 8890.456915042812
    },
    {
      "epoch": 0.6390018730554314,
      "grad_norm": 0.07598944753408432,
      "learning_rate": 5.2590032937227154e-05,
      "loss": 8.4597,
      "step": 40768,
      "throughput": 8890.551799466513
    },
    {
      "epoch": 0.6395034443843604,
      "grad_norm": 0.08324527740478516,
      "learning_rate": 5.2438445670184244e-05,
      "loss": 8.4601,
      "step": 40800,
      "throughput": 8890.510488344746
    },
    {
      "epoch": 0.6400050157132893,
      "grad_norm": 0.0747150406241417,
      "learning_rate": 5.2287322607325e-05,
      "loss": 8.4416,
      "step": 40832,
      "throughput": 8890.561222276989
    },
    {
      "epoch": 0.6405065870422182,
      "grad_norm": 0.07257223129272461,
      "learning_rate": 5.213666437188141e-05,
      "loss": 8.4731,
      "step": 40864,
      "throughput": 8890.597776004659
    },
    {
      "epoch": 0.6410081583711471,
      "grad_norm": 0.07314898818731308,
      "learning_rate": 5.1986471585168485e-05,
      "loss": 8.4614,
      "step": 40896,
      "throughput": 8890.689564843153
    },
    {
      "epoch": 0.641509729700076,
      "grad_norm": 0.08105824142694473,
      "learning_rate": 5.183674486658167e-05,
      "loss": 8.4763,
      "step": 40928,
      "throughput": 8890.654920890887
    },
    {
      "epoch": 0.6420113010290049,
      "grad_norm": 0.07510533183813095,
      "learning_rate": 5.168748483359445e-05,
      "loss": 8.4648,
      "step": 40960,
      "throughput": 8890.6647160896
    },
    {
      "epoch": 0.6425128723579339,
      "grad_norm": 0.07071134448051453,
      "learning_rate": 5.153869210175563e-05,
      "loss": 8.4359,
      "step": 40992,
      "throughput": 8890.230400694345
    },
    {
      "epoch": 0.6430144436868628,
      "grad_norm": 0.07490675896406174,
      "learning_rate": 5.139036728468686e-05,
      "loss": 8.4446,
      "step": 41024,
      "throughput": 8890.323327806726
    },
    {
      "epoch": 0.6435160150157917,
      "grad_norm": 0.07903076708316803,
      "learning_rate": 5.124251099408012e-05,
      "loss": 8.434,
      "step": 41056,
      "throughput": 8890.298497593038
    },
    {
      "epoch": 0.6440175863447206,
      "grad_norm": 0.07559023797512054,
      "learning_rate": 5.1095123839695224e-05,
      "loss": 8.4566,
      "step": 41088,
      "throughput": 8890.331839698267
    },
    {
      "epoch": 0.6445191576736495,
      "grad_norm": 0.0739353820681572,
      "learning_rate": 5.0948206429357224e-05,
      "loss": 8.4676,
      "step": 41120,
      "throughput": 8890.38437114305
    },
    {
      "epoch": 0.6450207290025783,
      "grad_norm": 0.08174904435873032,
      "learning_rate": 5.080175936895392e-05,
      "loss": 8.4735,
      "step": 41152,
      "throughput": 8890.48658112442
    },
    {
      "epoch": 0.6455223003315073,
      "grad_norm": 0.0728902593255043,
      "learning_rate": 5.065578326243348e-05,
      "loss": 8.4228,
      "step": 41184,
      "throughput": 8890.48142922228
    },
    {
      "epoch": 0.6460238716604362,
      "grad_norm": 0.08640056103467941,
      "learning_rate": 5.0510278711801735e-05,
      "loss": 8.4599,
      "step": 41216,
      "throughput": 8890.523424199167
    },
    {
      "epoch": 0.6465254429893651,
      "grad_norm": 0.07627113163471222,
      "learning_rate": 5.036524631711996e-05,
      "loss": 8.4511,
      "step": 41248,
      "throughput": 8890.57459525624
    },
    {
      "epoch": 0.647027014318294,
      "grad_norm": 0.07577764242887497,
      "learning_rate": 5.02206866765021e-05,
      "loss": 8.4497,
      "step": 41280,
      "throughput": 8890.680407253796
    },
    {
      "epoch": 0.6475285856472229,
      "grad_norm": 0.07532326877117157,
      "learning_rate": 5.007660038611259e-05,
      "loss": 8.4625,
      "step": 41312,
      "throughput": 8890.682211363383
    },
    {
      "epoch": 0.6480301569761518,
      "grad_norm": 0.08512426912784576,
      "learning_rate": 4.9932988040163726e-05,
      "loss": 8.4481,
      "step": 41344,
      "throughput": 8890.730346071183
    },
    {
      "epoch": 0.6485317283050808,
      "grad_norm": 0.08049551397562027,
      "learning_rate": 4.978985023091324e-05,
      "loss": 8.4542,
      "step": 41376,
      "throughput": 8890.755837575718
    },
    {
      "epoch": 0.6490332996340097,
      "grad_norm": 0.0796920582652092,
      "learning_rate": 4.964718754866186e-05,
      "loss": 8.445,
      "step": 41408,
      "throughput": 8890.876073098967
    },
    {
      "epoch": 0.6495348709629386,
      "grad_norm": 0.07723706215620041,
      "learning_rate": 4.95050005817509e-05,
      "loss": 8.4584,
      "step": 41440,
      "throughput": 8890.840355837523
    },
    {
      "epoch": 0.6500364422918675,
      "grad_norm": 0.07665670663118362,
      "learning_rate": 4.936328991655988e-05,
      "loss": 8.4199,
      "step": 41472,
      "throughput": 8890.8515222639
    },
    {
      "epoch": 0.6505380136207964,
      "grad_norm": 0.07526352256536484,
      "learning_rate": 4.9222056137504e-05,
      "loss": 8.4412,
      "step": 41504,
      "throughput": 8890.912446640825
    },
    {
      "epoch": 0.6510395849497254,
      "grad_norm": 0.07671099901199341,
      "learning_rate": 4.908129982703169e-05,
      "loss": 8.4451,
      "step": 41536,
      "throughput": 8891.01823466129
    },
    {
      "epoch": 0.6515411562786543,
      "grad_norm": 0.07746397703886032,
      "learning_rate": 4.8941021565622516e-05,
      "loss": 8.4567,
      "step": 41568,
      "throughput": 8891.00199828533
    },
    {
      "epoch": 0.6520427276075831,
      "grad_norm": 0.07340981811285019,
      "learning_rate": 4.880122193178441e-05,
      "loss": 8.4557,
      "step": 41600,
      "throughput": 8890.99166793191
    },
    {
      "epoch": 0.652544298936512,
      "grad_norm": 0.07931915670633316,
      "learning_rate": 4.866190150205143e-05,
      "loss": 8.449,
      "step": 41632,
      "throughput": 8891.04096212233
    },
    {
      "epoch": 0.6530458702654409,
      "grad_norm": 0.07507026195526123,
      "learning_rate": 4.8523060850981476e-05,
      "loss": 8.4285,
      "step": 41664,
      "throughput": 8891.154000433016
    },
    {
      "epoch": 0.6535474415943698,
      "grad_norm": 0.0907602310180664,
      "learning_rate": 4.838470055115379e-05,
      "loss": 8.4494,
      "step": 41696,
      "throughput": 8891.133463958888
    },
    {
      "epoch": 0.6540490129232988,
      "grad_norm": 0.07654330134391785,
      "learning_rate": 4.82468211731667e-05,
      "loss": 8.4551,
      "step": 41728,
      "throughput": 8891.12942557528
    },
    {
      "epoch": 0.6545505842522277,
      "grad_norm": 0.07788608968257904,
      "learning_rate": 4.8109423285635116e-05,
      "loss": 8.453,
      "step": 41760,
      "throughput": 8891.176412367118
    },
    {
      "epoch": 0.6550521555811566,
      "grad_norm": 0.07579880207777023,
      "learning_rate": 4.797250745518833e-05,
      "loss": 8.4164,
      "step": 41792,
      "throughput": 8891.289322325178
    },
    {
      "epoch": 0.6555537269100855,
      "grad_norm": 0.07648367434740067,
      "learning_rate": 4.7836074246467685e-05,
      "loss": 8.4408,
      "step": 41824,
      "throughput": 8891.292242880736
    },
    {
      "epoch": 0.6560552982390144,
      "grad_norm": 0.0801176205277443,
      "learning_rate": 4.770012422212412e-05,
      "loss": 8.4428,
      "step": 41856,
      "throughput": 8891.281171123885
    },
    {
      "epoch": 0.6565568695679433,
      "grad_norm": 0.07864414155483246,
      "learning_rate": 4.756465794281592e-05,
      "loss": 8.4415,
      "step": 41888,
      "throughput": 8891.340641747001
    },
    {
      "epoch": 0.6570584408968723,
      "grad_norm": 0.0740278884768486,
      "learning_rate": 4.742967596720641e-05,
      "loss": 8.4544,
      "step": 41920,
      "throughput": 8891.435292238171
    },
    {
      "epoch": 0.6575600122258012,
      "grad_norm": 0.07727956771850586,
      "learning_rate": 4.729517885196169e-05,
      "loss": 8.4639,
      "step": 41952,
      "throughput": 8891.43753280489
    },
    {
      "epoch": 0.6580615835547301,
      "grad_norm": 0.08166259527206421,
      "learning_rate": 4.716116715174827e-05,
      "loss": 8.4382,
      "step": 41984,
      "throughput": 8891.458781093757
    },
    {
      "epoch": 0.6585631548836589,
      "grad_norm": 0.07395589351654053,
      "learning_rate": 4.702764141923075e-05,
      "loss": 8.4559,
      "step": 42016,
      "throughput": 8891.479298197422
    },
    {
      "epoch": 0.6590647262125878,
      "grad_norm": 0.0727098360657692,
      "learning_rate": 4.6894602205069674e-05,
      "loss": 8.4285,
      "step": 42048,
      "throughput": 8891.577574987778
    },
    {
      "epoch": 0.6595662975415167,
      "grad_norm": 0.08630625158548355,
      "learning_rate": 4.6762050057919165e-05,
      "loss": 8.4324,
      "step": 42080,
      "throughput": 8891.56434519639
    },
    {
      "epoch": 0.6600678688704457,
      "grad_norm": 0.07782085984945297,
      "learning_rate": 4.6629985524424686e-05,
      "loss": 8.4326,
      "step": 42112,
      "throughput": 8891.53842517495
    },
    {
      "epoch": 0.6605694401993746,
      "grad_norm": 0.07772604376077652,
      "learning_rate": 4.649840914922071e-05,
      "loss": 8.4428,
      "step": 42144,
      "throughput": 8891.597460590583
    },
    {
      "epoch": 0.6610710115283035,
      "grad_norm": 0.08418555557727814,
      "learning_rate": 4.636732147492863e-05,
      "loss": 8.4317,
      "step": 42176,
      "throughput": 8891.697936976874
    },
    {
      "epoch": 0.6615725828572324,
      "grad_norm": 0.08168166130781174,
      "learning_rate": 4.6236723042154424e-05,
      "loss": 8.4574,
      "step": 42208,
      "throughput": 8891.667655818983
    },
    {
      "epoch": 0.6620741541861613,
      "grad_norm": 0.07932674884796143,
      "learning_rate": 4.61066143894864e-05,
      "loss": 8.4471,
      "step": 42240,
      "throughput": 8891.666142559854
    },
    {
      "epoch": 0.6625757255150903,
      "grad_norm": 0.07798092812299728,
      "learning_rate": 4.5976996053492996e-05,
      "loss": 8.4351,
      "step": 42272,
      "throughput": 8891.69451428347
    },
    {
      "epoch": 0.6630772968440192,
      "grad_norm": 0.09541884064674377,
      "learning_rate": 4.5847868568720646e-05,
      "loss": 8.4264,
      "step": 42304,
      "throughput": 8891.79013699881
    },
    {
      "epoch": 0.6635788681729481,
      "grad_norm": 0.07872533798217773,
      "learning_rate": 4.571923246769147e-05,
      "loss": 8.45,
      "step": 42336,
      "throughput": 8891.764855345577
    },
    {
      "epoch": 0.664080439501877,
      "grad_norm": 0.07670172303915024,
      "learning_rate": 4.559108828090115e-05,
      "loss": 8.4414,
      "step": 42368,
      "throughput": 8891.80559471926
    },
    {
      "epoch": 0.6645820108308059,
      "grad_norm": 0.0734407901763916,
      "learning_rate": 4.546343653681667e-05,
      "loss": 8.4511,
      "step": 42400,
      "throughput": 8891.832799174283
    },
    {
      "epoch": 0.6650835821597348,
      "grad_norm": 0.10266581922769547,
      "learning_rate": 4.53362777618742e-05,
      "loss": 8.437,
      "step": 42432,
      "throughput": 8891.934778631321
    },
    {
      "epoch": 0.6655851534886637,
      "grad_norm": 0.07219371199607849,
      "learning_rate": 4.52096124804769e-05,
      "loss": 8.4346,
      "step": 42464,
      "throughput": 8891.932544586749
    },
    {
      "epoch": 0.6660867248175926,
      "grad_norm": 0.07945689558982849,
      "learning_rate": 4.508344121499281e-05,
      "loss": 8.4346,
      "step": 42496,
      "throughput": 8891.953683702157
    },
    {
      "epoch": 0.6665882961465215,
      "grad_norm": 0.08158594369888306,
      "learning_rate": 4.495776448575255e-05,
      "loss": 8.4336,
      "step": 42528,
      "throughput": 8891.976572119378
    },
    {
      "epoch": 0.6670898674754504,
      "grad_norm": 0.07030566781759262,
      "learning_rate": 4.483258281104734e-05,
      "loss": 8.4158,
      "step": 42560,
      "throughput": 8892.083303595728
    },
    {
      "epoch": 0.6675914388043793,
      "grad_norm": 0.07251957803964615,
      "learning_rate": 4.470789670712681e-05,
      "loss": 8.4341,
      "step": 42592,
      "throughput": 8892.073672250981
    },
    {
      "epoch": 0.6680930101333082,
      "grad_norm": 0.08236385881900787,
      "learning_rate": 4.458370668819676e-05,
      "loss": 8.4496,
      "step": 42624,
      "throughput": 8892.09051768536
    },
    {
      "epoch": 0.6685945814622372,
      "grad_norm": 0.0733921006321907,
      "learning_rate": 4.4460013266417226e-05,
      "loss": 8.424,
      "step": 42656,
      "throughput": 8892.135135286919
    },
    {
      "epoch": 0.6690961527911661,
      "grad_norm": 0.08383151143789291,
      "learning_rate": 4.433681695190027e-05,
      "loss": 8.4315,
      "step": 42688,
      "throughput": 8892.220307742966
    },
    {
      "epoch": 0.669597724120095,
      "grad_norm": 0.08101122826337814,
      "learning_rate": 4.421411825270785e-05,
      "loss": 8.4227,
      "step": 42720,
      "throughput": 8892.227313835552
    },
    {
      "epoch": 0.6700992954490239,
      "grad_norm": 0.07665781676769257,
      "learning_rate": 4.4091917674849727e-05,
      "loss": 8.4411,
      "step": 42752,
      "throughput": 8892.223930766586
    },
    {
      "epoch": 0.6706008667779528,
      "grad_norm": 0.07866701483726501,
      "learning_rate": 4.397021572228147e-05,
      "loss": 8.4472,
      "step": 42784,
      "throughput": 8892.283116147706
    },
    {
      "epoch": 0.6711024381068817,
      "grad_norm": 0.07769527286291122,
      "learning_rate": 4.38490128969023e-05,
      "loss": 8.4492,
      "step": 42816,
      "throughput": 8892.359819898718
    },
    {
      "epoch": 0.6716040094358107,
      "grad_norm": 0.08288507908582687,
      "learning_rate": 4.3728309698553056e-05,
      "loss": 8.4514,
      "step": 42848,
      "throughput": 8892.378210684165
    },
    {
      "epoch": 0.6721055807647396,
      "grad_norm": 0.07571935653686523,
      "learning_rate": 4.3608106625014014e-05,
      "loss": 8.4338,
      "step": 42880,
      "throughput": 8892.355894409146
    },
    {
      "epoch": 0.6726071520936684,
      "grad_norm": 0.0770522728562355,
      "learning_rate": 4.348840417200306e-05,
      "loss": 8.4417,
      "step": 42912,
      "throughput": 8892.42663160015
    },
    {
      "epoch": 0.6731087234225973,
      "grad_norm": 0.08030234277248383,
      "learning_rate": 4.336920283317343e-05,
      "loss": 8.4531,
      "step": 42944,
      "throughput": 8892.508469753493
    },
    {
      "epoch": 0.6736102947515262,
      "grad_norm": 0.07389728724956512,
      "learning_rate": 4.325050310011183e-05,
      "loss": 8.4575,
      "step": 42976,
      "throughput": 8892.518853790318
    },
    {
      "epoch": 0.6741118660804551,
      "grad_norm": 0.08421860635280609,
      "learning_rate": 4.3132305462336306e-05,
      "loss": 8.4433,
      "step": 43008,
      "throughput": 8892.490140747977
    },
    {
      "epoch": 0.6746134374093841,
      "grad_norm": 0.07845675945281982,
      "learning_rate": 4.301461040729424e-05,
      "loss": 8.4634,
      "step": 43040,
      "throughput": 8892.038946444562
    },
    {
      "epoch": 0.675115008738313,
      "grad_norm": 0.08324997872114182,
      "learning_rate": 4.289741842036042e-05,
      "loss": 8.4343,
      "step": 43072,
      "throughput": 8892.10840247032
    },
    {
      "epoch": 0.6756165800672419,
      "grad_norm": 0.07667215168476105,
      "learning_rate": 4.2780729984834916e-05,
      "loss": 8.4246,
      "step": 43104,
      "throughput": 8892.101825268575
    },
    {
      "epoch": 0.6761181513961708,
      "grad_norm": 0.07493610680103302,
      "learning_rate": 4.266454558194122e-05,
      "loss": 8.4348,
      "step": 43136,
      "throughput": 8892.085081520685
    },
    {
      "epoch": 0.6766197227250997,
      "grad_norm": 0.08308251947164536,
      "learning_rate": 4.254886569082413e-05,
      "loss": 8.4182,
      "step": 43168,
      "throughput": 8892.149902078248
    },
    {
      "epoch": 0.6771212940540287,
      "grad_norm": 0.08159197866916656,
      "learning_rate": 4.243369078854788e-05,
      "loss": 8.4305,
      "step": 43200,
      "throughput": 8892.234243039149
    },
    {
      "epoch": 0.6776228653829576,
      "grad_norm": 0.08277013152837753,
      "learning_rate": 4.231902135009407e-05,
      "loss": 8.4528,
      "step": 43232,
      "throughput": 8892.228431107438
    },
    {
      "epoch": 0.6781244367118865,
      "grad_norm": 0.08104237169027328,
      "learning_rate": 4.220485784835984e-05,
      "loss": 8.4421,
      "step": 43264,
      "throughput": 8892.238283717748
    },
    {
      "epoch": 0.6786260080408154,
      "grad_norm": 0.07368003576993942,
      "learning_rate": 4.209120075415577e-05,
      "loss": 8.4157,
      "step": 43296,
      "throughput": 8892.29604650612
    },
    {
      "epoch": 0.6791275793697443,
      "grad_norm": 0.0873587504029274,
      "learning_rate": 4.197805053620411e-05,
      "loss": 8.4212,
      "step": 43328,
      "throughput": 8892.373143548699
    },
    {
      "epoch": 0.6796291506986731,
      "grad_norm": 0.08577149361371994,
      "learning_rate": 4.186540766113665e-05,
      "loss": 8.426,
      "step": 43360,
      "throughput": 8892.386226062543
    },
    {
      "epoch": 0.680130722027602,
      "grad_norm": 0.08065624535083771,
      "learning_rate": 4.1753272593492956e-05,
      "loss": 8.4365,
      "step": 43392,
      "throughput": 8892.378610413167
    },
    {
      "epoch": 0.680632293356531,
      "grad_norm": 0.07502314448356628,
      "learning_rate": 4.1641645795718364e-05,
      "loss": 8.4459,
      "step": 43424,
      "throughput": 8892.440894507208
    },
    {
      "epoch": 0.6811338646854599,
      "grad_norm": 0.09220045059919357,
      "learning_rate": 4.153052772816217e-05,
      "loss": 8.4139,
      "step": 43456,
      "throughput": 8892.508903647942
    },
    {
      "epoch": 0.6816354360143888,
      "grad_norm": 0.07826574891805649,
      "learning_rate": 4.141991884907555e-05,
      "loss": 8.4193,
      "step": 43488,
      "throughput": 8892.53030710533
    },
    {
      "epoch": 0.6821370073433177,
      "grad_norm": 0.07697897404432297,
      "learning_rate": 4.1309819614609865e-05,
      "loss": 8.4213,
      "step": 43520,
      "throughput": 8892.526230805855
    },
    {
      "epoch": 0.6826385786722466,
      "grad_norm": 0.07445234060287476,
      "learning_rate": 4.1200230478814695e-05,
      "loss": 8.4411,
      "step": 43552,
      "throughput": 8892.590827408416
    },
    {
      "epoch": 0.6831401500011756,
      "grad_norm": 0.07667402923107147,
      "learning_rate": 4.109115189363601e-05,
      "loss": 8.4357,
      "step": 43584,
      "throughput": 8892.659610304425
    },
    {
      "epoch": 0.6836417213301045,
      "grad_norm": 0.07593469321727753,
      "learning_rate": 4.0982584308914114e-05,
      "loss": 8.4079,
      "step": 43616,
      "throughput": 8892.660647639474
    },
    {
      "epoch": 0.6841432926590334,
      "grad_norm": 0.07302851229906082,
      "learning_rate": 4.0874528172382114e-05,
      "loss": 8.4365,
      "step": 43648,
      "throughput": 8892.622449682587
    },
    {
      "epoch": 0.6846448639879623,
      "grad_norm": 0.07740382850170135,
      "learning_rate": 4.0766983929663835e-05,
      "loss": 8.4103,
      "step": 43680,
      "throughput": 8892.69376033167
    },
    {
      "epoch": 0.6851464353168912,
      "grad_norm": 0.07721805572509766,
      "learning_rate": 4.065995202427206e-05,
      "loss": 8.4223,
      "step": 43712,
      "throughput": 8892.755478850933
    },
    {
      "epoch": 0.6856480066458202,
      "grad_norm": 0.07630830258131027,
      "learning_rate": 4.055343289760664e-05,
      "loss": 8.4292,
      "step": 43744,
      "throughput": 8892.764671392262
    },
    {
      "epoch": 0.6861495779747491,
      "grad_norm": 0.07535416632890701,
      "learning_rate": 4.0447426988952816e-05,
      "loss": 8.4037,
      "step": 43776,
      "throughput": 8892.720405218828
    },
    {
      "epoch": 0.6866511493036779,
      "grad_norm": 0.07637656480073929,
      "learning_rate": 4.0341934735479224e-05,
      "loss": 8.4254,
      "step": 43808,
      "throughput": 8892.78404823713
    },
    {
      "epoch": 0.6871527206326068,
      "grad_norm": 0.07621389627456665,
      "learning_rate": 4.02369565722363e-05,
      "loss": 8.4427,
      "step": 43840,
      "throughput": 8892.854481523502
    },
    {
      "epoch": 0.6876542919615357,
      "grad_norm": 0.07737398892641068,
      "learning_rate": 4.013249293215422e-05,
      "loss": 8.4014,
      "step": 43872,
      "throughput": 8892.86718784577
    },
    {
      "epoch": 0.6881558632904646,
      "grad_norm": 0.07027604430913925,
      "learning_rate": 4.0028544246041406e-05,
      "loss": 8.4149,
      "step": 43904,
      "throughput": 8892.83076939561
    },
    {
      "epoch": 0.6886574346193935,
      "grad_norm": 0.09703302383422852,
      "learning_rate": 3.99251109425825e-05,
      "loss": 8.4426,
      "step": 43936,
      "throughput": 8892.896175152631
    },
    {
      "epoch": 0.6891590059483225,
      "grad_norm": 0.07484853267669678,
      "learning_rate": 3.982219344833681e-05,
      "loss": 8.4334,
      "step": 43968,
      "throughput": 8892.970712885226
    },
    {
      "epoch": 0.6896605772772514,
      "grad_norm": 0.07486287504434586,
      "learning_rate": 3.971979218773634e-05,
      "loss": 8.4072,
      "step": 44000,
      "throughput": 8892.976506505798
    },
    {
      "epoch": 0.6901621486061803,
      "grad_norm": 0.08308301866054535,
      "learning_rate": 3.961790758308418e-05,
      "loss": 8.4238,
      "step": 44032,
      "throughput": 8892.957701269286
    },
    {
      "epoch": 0.6906637199351092,
      "grad_norm": 0.07927940040826797,
      "learning_rate": 3.951654005455281e-05,
      "loss": 8.4175,
      "step": 44064,
      "throughput": 8893.056101654616
    },
    {
      "epoch": 0.6911652912640381,
      "grad_norm": 0.07408682256937027,
      "learning_rate": 3.9415690020182154e-05,
      "loss": 8.4284,
      "step": 44096,
      "throughput": 8893.10282842892
    },
    {
      "epoch": 0.6916668625929671,
      "grad_norm": 0.07429683953523636,
      "learning_rate": 3.9315357895878066e-05,
      "loss": 8.42,
      "step": 44128,
      "throughput": 8893.090918747575
    },
    {
      "epoch": 0.692168433921896,
      "grad_norm": 0.07516713440418243,
      "learning_rate": 3.921554409541053e-05,
      "loss": 8.4188,
      "step": 44160,
      "throughput": 8893.090593852326
    },
    {
      "epoch": 0.6926700052508249,
      "grad_norm": 0.08680712431669235,
      "learning_rate": 3.911624903041198e-05,
      "loss": 8.4395,
      "step": 44192,
      "throughput": 8893.163611643442
    },
    {
      "epoch": 0.6931715765797538,
      "grad_norm": 0.07508208602666855,
      "learning_rate": 3.9017473110375525e-05,
      "loss": 8.4294,
      "step": 44224,
      "throughput": 8893.215364050175
    },
    {
      "epoch": 0.6936731479086826,
      "grad_norm": 0.07337779551744461,
      "learning_rate": 3.891921674265336e-05,
      "loss": 8.4095,
      "step": 44256,
      "throughput": 8893.217250426818
    },
    {
      "epoch": 0.6941747192376115,
      "grad_norm": 0.08711987733840942,
      "learning_rate": 3.8821480332455024e-05,
      "loss": 8.4182,
      "step": 44288,
      "throughput": 8893.222381878295
    },
    {
      "epoch": 0.6946762905665405,
      "grad_norm": 0.07527220249176025,
      "learning_rate": 3.87242642828458e-05,
      "loss": 8.4389,
      "step": 44320,
      "throughput": 8893.30610869026
    },
    {
      "epoch": 0.6951778618954694,
      "grad_norm": 0.0782904177904129,
      "learning_rate": 3.862756899474493e-05,
      "loss": 8.4264,
      "step": 44352,
      "throughput": 8893.35663963662
    },
    {
      "epoch": 0.6956794332243983,
      "grad_norm": 0.07315529882907867,
      "learning_rate": 3.853139486692408e-05,
      "loss": 8.3995,
      "step": 44384,
      "throughput": 8893.36900993904
    },
    {
      "epoch": 0.6961810045533272,
      "grad_norm": 0.07299330830574036,
      "learning_rate": 3.843574229600565e-05,
      "loss": 8.4172,
      "step": 44416,
      "throughput": 8893.364377678341
    },
    {
      "epoch": 0.6966825758822561,
      "grad_norm": 0.07816275954246521,
      "learning_rate": 3.834061167646112e-05,
      "loss": 8.4272,
      "step": 44448,
      "throughput": 8893.45437929874
    },
    {
      "epoch": 0.697184147211185,
      "grad_norm": 0.07753538340330124,
      "learning_rate": 3.8246003400609424e-05,
      "loss": 8.4189,
      "step": 44480,
      "throughput": 8893.522440354585
    },
    {
      "epoch": 0.697685718540114,
      "grad_norm": 0.0742424800992012,
      "learning_rate": 3.81519178586154e-05,
      "loss": 8.4166,
      "step": 44512,
      "throughput": 8893.504933907088
    },
    {
      "epoch": 0.6981872898690429,
      "grad_norm": 0.0793553963303566,
      "learning_rate": 3.805835543848809e-05,
      "loss": 8.4453,
      "step": 44544,
      "throughput": 8893.485443527716
    },
    {
      "epoch": 0.6986888611979718,
      "grad_norm": 0.07728642970323563,
      "learning_rate": 3.796531652607919e-05,
      "loss": 8.4371,
      "step": 44576,
      "throughput": 8893.568888989736
    },
    {
      "epoch": 0.6991904325269007,
      "grad_norm": 0.07519408315420151,
      "learning_rate": 3.7872801505081434e-05,
      "loss": 8.4439,
      "step": 44608,
      "throughput": 8893.645694995372
    },
    {
      "epoch": 0.6996920038558296,
      "grad_norm": 0.08204668760299683,
      "learning_rate": 3.778081075702709e-05,
      "loss": 8.414,
      "step": 44640,
      "throughput": 8893.618307605391
    },
    {
      "epoch": 0.7001935751847586,
      "grad_norm": 0.07789931446313858,
      "learning_rate": 3.7689344661286264e-05,
      "loss": 8.4418,
      "step": 44672,
      "throughput": 8893.589302884588
    },
    {
      "epoch": 0.7006951465136874,
      "grad_norm": 0.07528722286224365,
      "learning_rate": 3.759840359506536e-05,
      "loss": 8.4128,
      "step": 44704,
      "throughput": 8893.668691472745
    },
    {
      "epoch": 0.7011967178426163,
      "grad_norm": 0.07807424664497375,
      "learning_rate": 3.750798793340565e-05,
      "loss": 8.4278,
      "step": 44736,
      "throughput": 8893.748497852805
    },
    {
      "epoch": 0.7016982891715452,
      "grad_norm": 0.0780852809548378,
      "learning_rate": 3.7418098049181573e-05,
      "loss": 8.4391,
      "step": 44768,
      "throughput": 8893.740000518525
    },
    {
      "epoch": 0.7021998605004741,
      "grad_norm": 0.0890655443072319,
      "learning_rate": 3.732873431309929e-05,
      "loss": 8.416,
      "step": 44800,
      "throughput": 8893.753083047186
    },
    {
      "epoch": 0.702701431829403,
      "grad_norm": 0.07461878657341003,
      "learning_rate": 3.7239897093695106e-05,
      "loss": 8.4273,
      "step": 44832,
      "throughput": 8893.805946569892
    },
    {
      "epoch": 0.703203003158332,
      "grad_norm": 0.07440055161714554,
      "learning_rate": 3.715158675733396e-05,
      "loss": 8.427,
      "step": 44864,
      "throughput": 8893.89045700433
    },
    {
      "epoch": 0.7037045744872609,
      "grad_norm": 0.0728885680437088,
      "learning_rate": 3.706380366820796e-05,
      "loss": 8.4221,
      "step": 44896,
      "throughput": 8893.882995999113
    },
    {
      "epoch": 0.7042061458161898,
      "grad_norm": 0.09329033643007278,
      "learning_rate": 3.6976548188334834e-05,
      "loss": 8.4037,
      "step": 44928,
      "throughput": 8893.894693010705
    },
    {
      "epoch": 0.7047077171451187,
      "grad_norm": 0.07572057843208313,
      "learning_rate": 3.688982067755642e-05,
      "loss": 8.4042,
      "step": 44960,
      "throughput": 8893.955631621076
    },
    {
      "epoch": 0.7052092884740476,
      "grad_norm": 0.08463934808969498,
      "learning_rate": 3.680362149353724e-05,
      "loss": 8.4324,
      "step": 44992,
      "throughput": 8894.0305370298
    },
    {
      "epoch": 0.7057108598029765,
      "grad_norm": 0.0743819996714592,
      "learning_rate": 3.671795099176297e-05,
      "loss": 8.4116,
      "step": 45024,
      "throughput": 8894.01516265958
    },
    {
      "epoch": 0.7062124311319055,
      "grad_norm": 0.07929202914237976,
      "learning_rate": 3.6632809525539055e-05,
      "loss": 8.4251,
      "step": 45056,
      "throughput": 8894.018330030944
    },
    {
      "epoch": 0.7067140024608344,
      "grad_norm": 0.0749729722738266,
      "learning_rate": 3.6548197445989086e-05,
      "loss": 8.4302,
      "step": 45088,
      "throughput": 8893.599986004909
    },
    {
      "epoch": 0.7072155737897633,
      "grad_norm": 0.07228686660528183,
      "learning_rate": 3.6464115102053596e-05,
      "loss": 8.4133,
      "step": 45120,
      "throughput": 8893.675785171614
    },
    {
      "epoch": 0.7077171451186921,
      "grad_norm": 0.0786343514919281,
      "learning_rate": 3.6380562840488376e-05,
      "loss": 8.4365,
      "step": 45152,
      "throughput": 8893.641740279048
    },
    {
      "epoch": 0.708218716447621,
      "grad_norm": 0.0737488642334938,
      "learning_rate": 3.629754100586323e-05,
      "loss": 8.414,
      "step": 45184,
      "throughput": 8893.642569637646
    },
    {
      "epoch": 0.7087202877765499,
      "grad_norm": 0.08403093367815018,
      "learning_rate": 3.6215049940560433e-05,
      "loss": 8.4351,
      "step": 45216,
      "throughput": 8893.732444783913
    },
    {
      "epoch": 0.7092218591054789,
      "grad_norm": 0.08579878509044647,
      "learning_rate": 3.613308998477339e-05,
      "loss": 8.3874,
      "step": 45248,
      "throughput": 8893.810930803864
    },
    {
      "epoch": 0.7097234304344078,
      "grad_norm": 0.07760308682918549,
      "learning_rate": 3.605166147650517e-05,
      "loss": 8.4148,
      "step": 45280,
      "throughput": 8893.777941937868
    },
    {
      "epoch": 0.7102250017633367,
      "grad_norm": 0.0734131783246994,
      "learning_rate": 3.597076475156726e-05,
      "loss": 8.4353,
      "step": 45312,
      "throughput": 8893.765879205092
    },
    {
      "epoch": 0.7107265730922656,
      "grad_norm": 0.08017542213201523,
      "learning_rate": 3.589040014357791e-05,
      "loss": 8.4379,
      "step": 45344,
      "throughput": 8893.84987470408
    },
    {
      "epoch": 0.7112281444211945,
      "grad_norm": 0.08208679407835007,
      "learning_rate": 3.581056798396105e-05,
      "loss": 8.4242,
      "step": 45376,
      "throughput": 8893.941186582775
    },
    {
      "epoch": 0.7117297157501234,
      "grad_norm": 0.10507772862911224,
      "learning_rate": 3.57312686019447e-05,
      "loss": 8.4098,
      "step": 45408,
      "throughput": 8893.914640901407
    },
    {
      "epoch": 0.7122312870790524,
      "grad_norm": 0.07428514212369919,
      "learning_rate": 3.565250232455983e-05,
      "loss": 8.4264,
      "step": 45440,
      "throughput": 8893.898159260922
    },
    {
      "epoch": 0.7127328584079813,
      "grad_norm": 0.07359248399734497,
      "learning_rate": 3.55742694766387e-05,
      "loss": 8.4139,
      "step": 45472,
      "throughput": 8893.988018737435
    },
    {
      "epoch": 0.7132344297369102,
      "grad_norm": 0.07536429166793823,
      "learning_rate": 3.549657038081386e-05,
      "loss": 8.4189,
      "step": 45504,
      "throughput": 8894.083007954694
    },
    {
      "epoch": 0.7137360010658391,
      "grad_norm": 0.07599938660860062,
      "learning_rate": 3.5419405357516624e-05,
      "loss": 8.4067,
      "step": 45536,
      "throughput": 8894.044541761768
    },
    {
      "epoch": 0.714237572394768,
      "grad_norm": 0.07829944044351578,
      "learning_rate": 3.534277472497574e-05,
      "loss": 8.4119,
      "step": 45568,
      "throughput": 8894.038960063219
    },
    {
      "epoch": 0.7147391437236968,
      "grad_norm": 0.10090679675340652,
      "learning_rate": 3.52666787992162e-05,
      "loss": 8.432,
      "step": 45600,
      "throughput": 8894.131161951538
    },
    {
      "epoch": 0.7152407150526258,
      "grad_norm": 0.07359547913074493,
      "learning_rate": 3.519111789405779e-05,
      "loss": 8.4454,
      "step": 45632,
      "throughput": 8894.225585803943
    },
    {
      "epoch": 0.7157422863815547,
      "grad_norm": 0.07896874845027924,
      "learning_rate": 3.5116092321113936e-05,
      "loss": 8.4216,
      "step": 45664,
      "throughput": 8894.173687919589
    },
    {
      "epoch": 0.7162438577104836,
      "grad_norm": 0.08137936145067215,
      "learning_rate": 3.504160238979032e-05,
      "loss": 8.3998,
      "step": 45696,
      "throughput": 8894.15914533694
    },
    {
      "epoch": 0.7167454290394125,
      "grad_norm": 0.07841339707374573,
      "learning_rate": 3.496764840728361e-05,
      "loss": 8.3995,
      "step": 45728,
      "throughput": 8894.242613880711
    },
    {
      "epoch": 0.7172470003683414,
      "grad_norm": 0.09461364895105362,
      "learning_rate": 3.489423067858027e-05,
      "loss": 8.4117,
      "step": 45760,
      "throughput": 8894.331621507126
    },
    {
      "epoch": 0.7177485716972704,
      "grad_norm": 0.07874492555856705,
      "learning_rate": 3.4821349506455255e-05,
      "loss": 8.4268,
      "step": 45792,
      "throughput": 8894.298459594376
    },
    {
      "epoch": 0.7182501430261993,
      "grad_norm": 0.07716283202171326,
      "learning_rate": 3.47490051914707e-05,
      "loss": 8.3923,
      "step": 45824,
      "throughput": 8894.276196431043
    },
    {
      "epoch": 0.7187517143551282,
      "grad_norm": 0.08003325760364532,
      "learning_rate": 3.4677198031974784e-05,
      "loss": 8.4112,
      "step": 45856,
      "throughput": 8894.377815461386
    },
    {
      "epoch": 0.7192532856840571,
      "grad_norm": 0.08741440623998642,
      "learning_rate": 3.4605928324100444e-05,
      "loss": 8.4322,
      "step": 45888,
      "throughput": 8894.454749304441
    },
    {
      "epoch": 0.719754857012986,
      "grad_norm": 0.07573242485523224,
      "learning_rate": 3.45351963617642e-05,
      "loss": 8.4023,
      "step": 45920,
      "throughput": 8894.404858200647
    },
    {
      "epoch": 0.720256428341915,
      "grad_norm": 0.08988625556230545,
      "learning_rate": 3.446500243666481e-05,
      "loss": 8.432,
      "step": 45952,
      "throughput": 8894.370859969931
    },
    {
      "epoch": 0.7207579996708439,
      "grad_norm": 0.07446542382240295,
      "learning_rate": 3.439534683828228e-05,
      "loss": 8.4157,
      "step": 45984,
      "throughput": 8894.465354263342
    },
    {
      "epoch": 0.7212595709997727,
      "grad_norm": 0.07130029797554016,
      "learning_rate": 3.4326229853876475e-05,
      "loss": 8.4267,
      "step": 46016,
      "throughput": 8894.547165939282
    },
    {
      "epoch": 0.7217611423287016,
      "grad_norm": 0.09455982595682144,
      "learning_rate": 3.425765176848607e-05,
      "loss": 8.4167,
      "step": 46048,
      "throughput": 8894.505506216065
    },
    {
      "epoch": 0.7222627136576305,
      "grad_norm": 0.07977471500635147,
      "learning_rate": 3.418961286492728e-05,
      "loss": 8.4246,
      "step": 46080,
      "throughput": 8894.48630621797
    },
    {
      "epoch": 0.7227642849865594,
      "grad_norm": 0.08261588215827942,
      "learning_rate": 3.412211342379273e-05,
      "loss": 8.4299,
      "step": 46112,
      "throughput": 8894.578353839574
    },
    {
      "epoch": 0.7232658563154883,
      "grad_norm": 0.07853943109512329,
      "learning_rate": 3.405515372345033e-05,
      "loss": 8.415,
      "step": 46144,
      "throughput": 8894.650451220767
    },
    {
      "epoch": 0.7237674276444173,
      "grad_norm": 0.08436351269483566,
      "learning_rate": 3.398873404004209e-05,
      "loss": 8.4269,
      "step": 46176,
      "throughput": 8894.625718834395
    },
    {
      "epoch": 0.7242689989733462,
      "grad_norm": 0.0723554790019989,
      "learning_rate": 3.392285464748298e-05,
      "loss": 8.4267,
      "step": 46208,
      "throughput": 8894.640924683054
    },
    {
      "epoch": 0.7247705703022751,
      "grad_norm": 0.0779571607708931,
      "learning_rate": 3.385751581745979e-05,
      "loss": 8.4206,
      "step": 46240,
      "throughput": 8894.715659554004
    },
    {
      "epoch": 0.725272141631204,
      "grad_norm": 0.07471830397844315,
      "learning_rate": 3.379271781943007e-05,
      "loss": 8.4034,
      "step": 46272,
      "throughput": 8894.789629584317
    },
    {
      "epoch": 0.7257737129601329,
      "grad_norm": 0.08449774235486984,
      "learning_rate": 3.372846092062095e-05,
      "loss": 8.416,
      "step": 46304,
      "throughput": 8894.76510619109
    },
    {
      "epoch": 0.7262752842890619,
      "grad_norm": 0.07693791389465332,
      "learning_rate": 3.366474538602806e-05,
      "loss": 8.4207,
      "step": 46336,
      "throughput": 8894.779872623403
    },
    {
      "epoch": 0.7267768556179908,
      "grad_norm": 0.07645770162343979,
      "learning_rate": 3.3601571478414455e-05,
      "loss": 8.4001,
      "step": 46368,
      "throughput": 8894.855687529682
    },
    {
      "epoch": 0.7272784269469197,
      "grad_norm": 0.07502170652151108,
      "learning_rate": 3.3538939458309556e-05,
      "loss": 8.4142,
      "step": 46400,
      "throughput": 8894.934405652899
    },
    {
      "epoch": 0.7277799982758486,
      "grad_norm": 0.07404825091362,
      "learning_rate": 3.347684958400795e-05,
      "loss": 8.4029,
      "step": 46432,
      "throughput": 8894.906361495016
    },
    {
      "epoch": 0.7282815696047774,
      "grad_norm": 0.07349622249603271,
      "learning_rate": 3.341530211156847e-05,
      "loss": 8.4086,
      "step": 46464,
      "throughput": 8894.905702225637
    },
    {
      "epoch": 0.7287831409337063,
      "grad_norm": 0.07417155802249908,
      "learning_rate": 3.33542972948131e-05,
      "loss": 8.4119,
      "step": 46496,
      "throughput": 8894.986632206708
    },
    {
      "epoch": 0.7292847122626352,
      "grad_norm": 0.07474679499864578,
      "learning_rate": 3.329383538532587e-05,
      "loss": 8.417,
      "step": 46528,
      "throughput": 8895.063273953068
    },
    {
      "epoch": 0.7297862835915642,
      "grad_norm": 0.07456853985786438,
      "learning_rate": 3.323391663245188e-05,
      "loss": 8.4095,
      "step": 46560,
      "throughput": 8895.038880258573
    },
    {
      "epoch": 0.7302878549204931,
      "grad_norm": 0.08269164711236954,
      "learning_rate": 3.3174541283296225e-05,
      "loss": 8.4066,
      "step": 46592,
      "throughput": 8895.007096485651
    },
    {
      "epoch": 0.730789426249422,
      "grad_norm": 0.07989947497844696,
      "learning_rate": 3.311570958272303e-05,
      "loss": 8.3936,
      "step": 46624,
      "throughput": 8895.086686128505
    },
    {
      "epoch": 0.7312909975783509,
      "grad_norm": 0.0955742597579956,
      "learning_rate": 3.305742177335444e-05,
      "loss": 8.4006,
      "step": 46656,
      "throughput": 8895.1647328995
    },
    {
      "epoch": 0.7317925689072798,
      "grad_norm": 0.0774775892496109,
      "learning_rate": 3.29996780955695e-05,
      "loss": 8.3994,
      "step": 46688,
      "throughput": 8895.15862533771
    },
    {
      "epoch": 0.7322941402362088,
      "grad_norm": 0.07914526760578156,
      "learning_rate": 3.294247878750333e-05,
      "loss": 8.4166,
      "step": 46720,
      "throughput": 8895.125977648093
    },
    {
      "epoch": 0.7327957115651377,
      "grad_norm": 0.0831366628408432,
      "learning_rate": 3.288582408504603e-05,
      "loss": 8.4,
      "step": 46752,
      "throughput": 8895.192527081916
    },
    {
      "epoch": 0.7332972828940666,
      "grad_norm": 0.07906366139650345,
      "learning_rate": 3.2829714221841805e-05,
      "loss": 8.4372,
      "step": 46784,
      "throughput": 8895.26399886747
    },
    {
      "epoch": 0.7337988542229955,
      "grad_norm": 0.08686784654855728,
      "learning_rate": 3.2774149429287854e-05,
      "loss": 8.4136,
      "step": 46816,
      "throughput": 8895.276284252073
    },
    {
      "epoch": 0.7343004255519244,
      "grad_norm": 0.07456189393997192,
      "learning_rate": 3.271912993653357e-05,
      "loss": 8.4237,
      "step": 46848,
      "throughput": 8895.240545857285
    },
    {
      "epoch": 0.7348019968808533,
      "grad_norm": 0.0784822478890419,
      "learning_rate": 3.266465597047948e-05,
      "loss": 8.4144,
      "step": 46880,
      "throughput": 8895.311687503458
    },
    {
      "epoch": 0.7353035682097822,
      "grad_norm": 0.07475121319293976,
      "learning_rate": 3.261072775577641e-05,
      "loss": 8.4172,
      "step": 46912,
      "throughput": 8895.390936161253
    },
    {
      "epoch": 0.7358051395387111,
      "grad_norm": 0.07888434082269669,
      "learning_rate": 3.255734551482446e-05,
      "loss": 8.4018,
      "step": 46944,
      "throughput": 8895.40673442008
    },
    {
      "epoch": 0.73630671086764,
      "grad_norm": 0.08462213724851608,
      "learning_rate": 3.2504509467772154e-05,
      "loss": 8.4053,
      "step": 46976,
      "throughput": 8895.382993414712
    },
    {
      "epoch": 0.7368082821965689,
      "grad_norm": 0.08841930329799652,
      "learning_rate": 3.24522198325155e-05,
      "loss": 8.4,
      "step": 47008,
      "throughput": 8895.447764250572
    },
    {
      "epoch": 0.7373098535254978,
      "grad_norm": 0.07977207005023956,
      "learning_rate": 3.2400476824697126e-05,
      "loss": 8.4007,
      "step": 47040,
      "throughput": 8895.516781410292
    },
    {
      "epoch": 0.7378114248544267,
      "grad_norm": 0.0798354521393776,
      "learning_rate": 3.234928065770532e-05,
      "loss": 8.4296,
      "step": 47072,
      "throughput": 8895.507091154892
    },
    {
      "epoch": 0.7383129961833557,
      "grad_norm": 0.08863840252161026,
      "learning_rate": 3.2298631542673254e-05,
      "loss": 8.4139,
      "step": 47104,
      "throughput": 8895.49570712789
    },
    {
      "epoch": 0.7388145675122846,
      "grad_norm": 0.08590971678495407,
      "learning_rate": 3.2248529688478036e-05,
      "loss": 8.4385,
      "step": 47136,
      "throughput": 8895.124791778228
    },
    {
      "epoch": 0.7393161388412135,
      "grad_norm": 0.07869445532560349,
      "learning_rate": 3.2198975301739834e-05,
      "loss": 8.4068,
      "step": 47168,
      "throughput": 8895.203100537423
    },
    {
      "epoch": 0.7398177101701424,
      "grad_norm": 0.07871249318122864,
      "learning_rate": 3.214996858682109e-05,
      "loss": 8.4091,
      "step": 47200,
      "throughput": 8895.19382455048
    },
    {
      "epoch": 0.7403192814990713,
      "grad_norm": 0.08750054985284805,
      "learning_rate": 3.210150974582565e-05,
      "loss": 8.4275,
      "step": 47232,
      "throughput": 8895.161067308942
    },
    {
      "epoch": 0.7408208528280003,
      "grad_norm": 0.08193980902433395,
      "learning_rate": 3.205359897859793e-05,
      "loss": 8.4034,
      "step": 47264,
      "throughput": 8895.249539504743
    },
    {
      "epoch": 0.7413224241569292,
      "grad_norm": 0.0736575797200203,
      "learning_rate": 3.2006236482722034e-05,
      "loss": 8.3818,
      "step": 47296,
      "throughput": 8895.33768076271
    },
    {
      "epoch": 0.7418239954858581,
      "grad_norm": 0.0773068219423294,
      "learning_rate": 3.195942245352108e-05,
      "loss": 8.4243,
      "step": 47328,
      "throughput": 8895.324131426094
    },
    {
      "epoch": 0.7423255668147869,
      "grad_norm": 0.0826229453086853,
      "learning_rate": 3.191315708405626e-05,
      "loss": 8.4079,
      "step": 47360,
      "throughput": 8895.304989026181
    },
    {
      "epoch": 0.7428271381437158,
      "grad_norm": 0.07899387180805206,
      "learning_rate": 3.1867440565126066e-05,
      "loss": 8.4325,
      "step": 47392,
      "throughput": 8895.374665493979
    },
    {
      "epoch": 0.7433287094726447,
      "grad_norm": 0.16860762238502502,
      "learning_rate": 3.182227308526557e-05,
      "loss": 8.4022,
      "step": 47424,
      "throughput": 8895.458982203403
    },
    {
      "epoch": 0.7438302808015737,
      "grad_norm": 0.07298004627227783,
      "learning_rate": 3.17776548307456e-05,
      "loss": 8.4252,
      "step": 47456,
      "throughput": 8895.449998748658
    },
    {
      "epoch": 0.7443318521305026,
      "grad_norm": 0.07910618185997009,
      "learning_rate": 3.173358598557196e-05,
      "loss": 8.4017,
      "step": 47488,
      "throughput": 8895.42143796254
    },
    {
      "epoch": 0.7448334234594315,
      "grad_norm": 0.07871539145708084,
      "learning_rate": 3.169006673148473e-05,
      "loss": 8.3927,
      "step": 47520,
      "throughput": 8895.500163381023
    },
    {
      "epoch": 0.7453349947883604,
      "grad_norm": 0.09069735556840897,
      "learning_rate": 3.1647097247957385e-05,
      "loss": 8.4048,
      "step": 47552,
      "throughput": 8895.603389232176
    },
    {
      "epoch": 0.7458365661172893,
      "grad_norm": 0.08377867192029953,
      "learning_rate": 3.160467771219624e-05,
      "loss": 8.4123,
      "step": 47584,
      "throughput": 8895.581162530754
    },
    {
      "epoch": 0.7463381374462182,
      "grad_norm": 0.07731027156114578,
      "learning_rate": 3.1562808299139596e-05,
      "loss": 8.4229,
      "step": 47616,
      "throughput": 8895.540017225196
    },
    {
      "epoch": 0.7468397087751472,
      "grad_norm": 0.0828324630856514,
      "learning_rate": 3.1521489181457005e-05,
      "loss": 8.4149,
      "step": 47648,
      "throughput": 8895.616793753607
    },
    {
      "epoch": 0.7473412801040761,
      "grad_norm": 0.10275556892156601,
      "learning_rate": 3.1480720529548654e-05,
      "loss": 8.4065,
      "step": 47680,
      "throughput": 8895.722701961871
    },
    {
      "epoch": 0.747842851433005,
      "grad_norm": 0.09654593467712402,
      "learning_rate": 3.1440502511544566e-05,
      "loss": 8.4057,
      "step": 47712,
      "throughput": 8895.714717567691
    },
    {
      "epoch": 0.7483444227619339,
      "grad_norm": 0.08919548988342285,
      "learning_rate": 3.1400835293303984e-05,
      "loss": 8.4201,
      "step": 47744,
      "throughput": 8895.67539681683
    },
    {
      "epoch": 0.7488459940908628,
      "grad_norm": 0.08011159300804138,
      "learning_rate": 3.136171903841463e-05,
      "loss": 8.4234,
      "step": 47776,
      "throughput": 8895.754041534876
    },
    {
      "epoch": 0.7493475654197916,
      "grad_norm": 0.08086296170949936,
      "learning_rate": 3.1323153908192057e-05,
      "loss": 8.4123,
      "step": 47808,
      "throughput": 8895.867884774138
    },
    {
      "epoch": 0.7498491367487206,
      "grad_norm": 0.07986344397068024,
      "learning_rate": 3.128514006167897e-05,
      "loss": 8.4253,
      "step": 47840,
      "throughput": 8895.852017272211
    },
    {
      "epoch": 0.7503507080776495,
      "grad_norm": 0.07865214347839355,
      "learning_rate": 3.124767765564459e-05,
      "loss": 8.404,
      "step": 47872,
      "throughput": 8895.824903835235
    },
    {
      "epoch": 0.7508522794065784,
      "grad_norm": 0.07204648107290268,
      "learning_rate": 3.121076684458398e-05,
      "loss": 8.4139,
      "step": 47904,
      "throughput": 8895.893625485049
    },
    {
      "epoch": 0.7513538507355073,
      "grad_norm": 0.08589160442352295,
      "learning_rate": 3.1174407780717433e-05,
      "loss": 8.4103,
      "step": 47936,
      "throughput": 8895.9942421036
    },
    {
      "epoch": 0.7518554220644362,
      "grad_norm": 0.09325224161148071,
      "learning_rate": 3.113860061398985e-05,
      "loss": 8.3908,
      "step": 47968,
      "throughput": 8895.961989508723
    },
    {
      "epoch": 0.7523569933933651,
      "grad_norm": 0.08093949407339096,
      "learning_rate": 3.110334549207009e-05,
      "loss": 8.4049,
      "step": 48000,
      "throughput": 8895.953046044224
    },
    {
      "epoch": 0.7528585647222941,
      "grad_norm": 0.09794154763221741,
      "learning_rate": 3.1068642560350375e-05,
      "loss": 8.3908,
      "step": 48032,
      "throughput": 8896.009533534052
    },
    {
      "epoch": 0.753360136051223,
      "grad_norm": 0.07886497676372528,
      "learning_rate": 3.103449196194569e-05,
      "loss": 8.4077,
      "step": 48064,
      "throughput": 8896.106538320832
    },
    {
      "epoch": 0.7538617073801519,
      "grad_norm": 0.08433572947978973,
      "learning_rate": 3.1000893837693234e-05,
      "loss": 8.4283,
      "step": 48096,
      "throughput": 8896.095852382228
    },
    {
      "epoch": 0.7543632787090808,
      "grad_norm": 0.07943733781576157,
      "learning_rate": 3.096784832615175e-05,
      "loss": 8.3774,
      "step": 48128,
      "throughput": 8896.080020295462
    },
    {
      "epoch": 0.7548648500380097,
      "grad_norm": 0.07670002430677414,
      "learning_rate": 3.093535556360101e-05,
      "loss": 8.4291,
      "step": 48160,
      "throughput": 8896.140985773469
    },
    {
      "epoch": 0.7553664213669387,
      "grad_norm": 0.08377315104007721,
      "learning_rate": 3.0903415684041285e-05,
      "loss": 8.4105,
      "step": 48192,
      "throughput": 8896.234860974488
    },
    {
      "epoch": 0.7558679926958676,
      "grad_norm": 0.08657362312078476,
      "learning_rate": 3.087202881919273e-05,
      "loss": 8.4106,
      "step": 48224,
      "throughput": 8896.232594710647
    },
    {
      "epoch": 0.7563695640247964,
      "grad_norm": 0.08452863991260529,
      "learning_rate": 3.084119509849488e-05,
      "loss": 8.4097,
      "step": 48256,
      "throughput": 8896.233640640303
    },
    {
      "epoch": 0.7568711353537253,
      "grad_norm": 0.08716598898172379,
      "learning_rate": 3.081091464910606e-05,
      "loss": 8.4183,
      "step": 48288,
      "throughput": 8896.302885903542
    },
    {
      "epoch": 0.7573727066826542,
      "grad_norm": 0.0744166448712349,
      "learning_rate": 3.078118759590295e-05,
      "loss": 8.4048,
      "step": 48320,
      "throughput": 8896.408020396842
    },
    {
      "epoch": 0.7578742780115831,
      "grad_norm": 0.08003176748752594,
      "learning_rate": 3.075201406148001e-05,
      "loss": 8.4005,
      "step": 48352,
      "throughput": 8896.377259493374
    },
    {
      "epoch": 0.758375849340512,
      "grad_norm": 0.07641202211380005,
      "learning_rate": 3.072339416614899e-05,
      "loss": 8.4061,
      "step": 48384,
      "throughput": 8896.355147113938
    },
    {
      "epoch": 0.758877420669441,
      "grad_norm": 0.08283281326293945,
      "learning_rate": 3.069532802793839e-05,
      "loss": 8.3926,
      "step": 48416,
      "throughput": 8896.417785882708
    },
    {
      "epoch": 0.7593789919983699,
      "grad_norm": 0.08166106045246124,
      "learning_rate": 3.066781576259309e-05,
      "loss": 8.4236,
      "step": 48448,
      "throughput": 8896.52052368501
    },
    {
      "epoch": 0.7598805633272988,
      "grad_norm": 0.08659474551677704,
      "learning_rate": 3.0640857483573714e-05,
      "loss": 8.4095,
      "step": 48480,
      "throughput": 8896.489488514619
    },
    {
      "epoch": 0.7603821346562277,
      "grad_norm": 0.07853197306394577,
      "learning_rate": 3.061445330205631e-05,
      "loss": 8.3986,
      "step": 48512,
      "throughput": 8896.482951834703
    },
    {
      "epoch": 0.7608837059851566,
      "grad_norm": 0.0776255801320076,
      "learning_rate": 3.0588603326931796e-05,
      "loss": 8.4111,
      "step": 48544,
      "throughput": 8896.515826418297
    },
    {
      "epoch": 0.7613852773140856,
      "grad_norm": 0.07786612212657928,
      "learning_rate": 3.056330766480554e-05,
      "loss": 8.4013,
      "step": 48576,
      "throughput": 8896.621289813445
    },
    {
      "epoch": 0.7618868486430145,
      "grad_norm": 0.07415422052145004,
      "learning_rate": 3.053856641999694e-05,
      "loss": 8.4016,
      "step": 48608,
      "throughput": 8896.604208429937
    },
    {
      "epoch": 0.7623884199719434,
      "grad_norm": 0.07848580926656723,
      "learning_rate": 3.0514379694538932e-05,
      "loss": 8.3971,
      "step": 48640,
      "throughput": 8896.604218805502
    },
    {
      "epoch": 0.7628899913008723,
      "grad_norm": 0.07402683794498444,
      "learning_rate": 3.0490747588177684e-05,
      "loss": 8.4195,
      "step": 48672,
      "throughput": 8896.648336095484
    },
    {
      "epoch": 0.7633915626298011,
      "grad_norm": 0.07746639102697372,
      "learning_rate": 3.0467670198372044e-05,
      "loss": 8.4097,
      "step": 48704,
      "throughput": 8896.755910382974
    },
    {
      "epoch": 0.76389313395873,
      "grad_norm": 0.07922644168138504,
      "learning_rate": 3.044514762029326e-05,
      "loss": 8.3952,
      "step": 48736,
      "throughput": 8896.714345888688
    },
    {
      "epoch": 0.764394705287659,
      "grad_norm": 0.0855683907866478,
      "learning_rate": 3.0423179946824494e-05,
      "loss": 8.4142,
      "step": 48768,
      "throughput": 8896.72705439139
    },
    {
      "epoch": 0.7648962766165879,
      "grad_norm": 0.07957347482442856,
      "learning_rate": 3.040176726856049e-05,
      "loss": 8.4067,
      "step": 48800,
      "throughput": 8896.773556908885
    },
    {
      "epoch": 0.7653978479455168,
      "grad_norm": 0.07848001271486282,
      "learning_rate": 3.0380909673807205e-05,
      "loss": 8.3885,
      "step": 48832,
      "throughput": 8896.879164687118
    },
    {
      "epoch": 0.7658994192744457,
      "grad_norm": 0.08187400549650192,
      "learning_rate": 3.0360607248581437e-05,
      "loss": 8.4094,
      "step": 48864,
      "throughput": 8896.820305927478
    },
    {
      "epoch": 0.7664009906033746,
      "grad_norm": 0.08541760593652725,
      "learning_rate": 3.0340860076610427e-05,
      "loss": 8.3964,
      "step": 48896,
      "throughput": 8896.814791286453
    },
    {
      "epoch": 0.7669025619323036,
      "grad_norm": 0.0846327468752861,
      "learning_rate": 3.0321668239331582e-05,
      "loss": 8.4064,
      "step": 48928,
      "throughput": 8896.849370458765
    },
    {
      "epoch": 0.7674041332612325,
      "grad_norm": 0.11470185965299606,
      "learning_rate": 3.030303181589207e-05,
      "loss": 8.3863,
      "step": 48960,
      "throughput": 8896.954688708582
    },
    {
      "epoch": 0.7679057045901614,
      "grad_norm": 0.08249527961015701,
      "learning_rate": 3.0284950883148598e-05,
      "loss": 8.3919,
      "step": 48992,
      "throughput": 8896.907999028363
    },
    {
      "epoch": 0.7684072759190903,
      "grad_norm": 0.08515627682209015,
      "learning_rate": 3.026742551566696e-05,
      "loss": 8.3981,
      "step": 49024,
      "throughput": 8896.916454662103
    },
    {
      "epoch": 0.7689088472480192,
      "grad_norm": 0.07765959948301315,
      "learning_rate": 3.0250455785721827e-05,
      "loss": 8.4031,
      "step": 49056,
      "throughput": 8896.958168425917
    },
    {
      "epoch": 0.7694104185769481,
      "grad_norm": 0.08357566595077515,
      "learning_rate": 3.023404176329643e-05,
      "loss": 8.4077,
      "step": 49088,
      "throughput": 8897.05878428246
    },
    {
      "epoch": 0.7699119899058771,
      "grad_norm": 0.0747281089425087,
      "learning_rate": 3.021818351608223e-05,
      "loss": 8.4002,
      "step": 49120,
      "throughput": 8897.018858526288
    },
    {
      "epoch": 0.7704135612348059,
      "grad_norm": 0.07242199033498764,
      "learning_rate": 3.0202881109478676e-05,
      "loss": 8.4119,
      "step": 49152,
      "throughput": 8897.003698841101
    },
    {
      "epoch": 0.7709151325637348,
      "grad_norm": 0.09169508516788483,
      "learning_rate": 3.0188134606592958e-05,
      "loss": 8.4123,
      "step": 49184,
      "throughput": 8896.6229254484
    },
    {
      "epoch": 0.7714167038926637,
      "grad_norm": 0.07344987988471985,
      "learning_rate": 3.017394406823969e-05,
      "loss": 8.4033,
      "step": 49216,
      "throughput": 8896.726633691835
    },
    {
      "epoch": 0.7719182752215926,
      "grad_norm": 0.0744016170501709,
      "learning_rate": 3.0160309552940704e-05,
      "loss": 8.427,
      "step": 49248,
      "throughput": 8896.704982883732
    },
    {
      "epoch": 0.7724198465505215,
      "grad_norm": 0.07636785507202148,
      "learning_rate": 3.014723111692476e-05,
      "loss": 8.4045,
      "step": 49280,
      "throughput": 8896.70186019132
    },
    {
      "epoch": 0.7729214178794505,
      "grad_norm": 0.07975538820028305,
      "learning_rate": 3.013470881412739e-05,
      "loss": 8.3832,
      "step": 49312,
      "throughput": 8896.750642487581
    },
    {
      "epoch": 0.7734229892083794,
      "grad_norm": 0.07941026985645294,
      "learning_rate": 3.0122742696190606e-05,
      "loss": 8.4185,
      "step": 49344,
      "throughput": 8896.850750910937
    },
    {
      "epoch": 0.7739245605373083,
      "grad_norm": 0.07521425932645798,
      "learning_rate": 3.0111332812462692e-05,
      "loss": 8.4036,
      "step": 49376,
      "throughput": 8896.813329451572
    },
    {
      "epoch": 0.7744261318662372,
      "grad_norm": 0.07578073441982269,
      "learning_rate": 3.0100479209998055e-05,
      "loss": 8.3934,
      "step": 49408,
      "throughput": 8896.831284182328
    },
    {
      "epoch": 0.7749277031951661,
      "grad_norm": 0.09989731013774872,
      "learning_rate": 3.0090181933556994e-05,
      "loss": 8.4085,
      "step": 49440,
      "throughput": 8896.879084852144
    },
    {
      "epoch": 0.775429274524095,
      "grad_norm": 0.08798599988222122,
      "learning_rate": 3.0080441025605494e-05,
      "loss": 8.3887,
      "step": 49472,
      "throughput": 8896.984651790684
    },
    {
      "epoch": 0.775930845853024,
      "grad_norm": 0.08368100225925446,
      "learning_rate": 3.007125652631508e-05,
      "loss": 8.3882,
      "step": 49504,
      "throughput": 8896.924194215266
    },
    {
      "epoch": 0.7764324171819529,
      "grad_norm": 0.07710754871368408,
      "learning_rate": 3.006262847356269e-05,
      "loss": 8.3931,
      "step": 49536,
      "throughput": 8896.92686921276
    },
    {
      "epoch": 0.7769339885108818,
      "grad_norm": 0.08023960143327713,
      "learning_rate": 3.0054556902930394e-05,
      "loss": 8.4064,
      "step": 49568,
      "throughput": 8896.966642033312
    },
    {
      "epoch": 0.7774355598398106,
      "grad_norm": 0.07401903718709946,
      "learning_rate": 3.0047041847705404e-05,
      "loss": 8.41,
      "step": 49600,
      "throughput": 8897.071490538188
    },
    {
      "epoch": 0.7779371311687395,
      "grad_norm": 0.08108972012996674,
      "learning_rate": 3.0040083338879834e-05,
      "loss": 8.3832,
      "step": 49632,
      "throughput": 8897.013010592347
    },
    {
      "epoch": 0.7784387024976684,
      "grad_norm": 0.08791600167751312,
      "learning_rate": 3.0033681405150554e-05,
      "loss": 8.4159,
      "step": 49664,
      "throughput": 8897.0419185872
    },
    {
      "epoch": 0.7789402738265974,
      "grad_norm": 0.07926676422357559,
      "learning_rate": 3.0027836072919202e-05,
      "loss": 8.3874,
      "step": 49696,
      "throughput": 8897.077031031937
    },
    {
      "epoch": 0.7794418451555263,
      "grad_norm": 0.14427538216114044,
      "learning_rate": 3.002254736629194e-05,
      "loss": 8.4154,
      "step": 49728,
      "throughput": 8897.182181287917
    },
    {
      "epoch": 0.7799434164844552,
      "grad_norm": 0.09725314378738403,
      "learning_rate": 3.001781530707938e-05,
      "loss": 8.3869,
      "step": 49760,
      "throughput": 8897.149475721768
    },
    {
      "epoch": 0.7804449878133841,
      "grad_norm": 0.07859361916780472,
      "learning_rate": 3.0013639914796586e-05,
      "loss": 8.404,
      "step": 49792,
      "throughput": 8897.178265734985
    },
    {
      "epoch": 0.780946559142313,
      "grad_norm": 0.07424914091825485,
      "learning_rate": 3.001002120666285e-05,
      "loss": 8.3968,
      "step": 49824,
      "throughput": 8897.209883269028
    },
    {
      "epoch": 0.781448130471242,
      "grad_norm": 0.07746188342571259,
      "learning_rate": 3.0006959197601765e-05,
      "loss": 8.4069,
      "step": 49856,
      "throughput": 8897.308706740672
    },
    {
      "epoch": 0.7819497018001709,
      "grad_norm": 0.09235438704490662,
      "learning_rate": 3.000445390024106e-05,
      "loss": 8.4015,
      "step": 49888,
      "throughput": 8897.259296904602
    },
    {
      "epoch": 0.7824512731290998,
      "grad_norm": 0.07753193378448486,
      "learning_rate": 3.0002505324912582e-05,
      "loss": 8.3881,
      "step": 49920,
      "throughput": 8897.287677847626
    },
    {
      "epoch": 0.7829528444580287,
      "grad_norm": 0.09510205686092377,
      "learning_rate": 3.0001113479652246e-05,
      "loss": 8.398,
      "step": 49952,
      "throughput": 8897.326425338933
    },
    {
      "epoch": 0.7834544157869576,
      "grad_norm": 0.07465380430221558,
      "learning_rate": 3.0000278370200057e-05,
      "loss": 8.4051,
      "step": 49984,
      "throughput": 8897.427443018712
    },
    {
      "epoch": 0.7839559871158865,
      "grad_norm": 0.0882812961935997,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 8.3954,
      "step": 50016,
      "throughput": 8897.361097831126
    },
    {
      "epoch": 0.7839559871158865,
      "step": 50016,
      "throughput": 8896.939390152644,
      "total_flos": 1.1998395573363655e+21,
      "train_loss": 8.993753637019747,
      "train_runtime": 368424.292,
      "train_samples_per_second": 139.015,
      "train_steps_per_second": 0.136
    }
  ],
  "logging_steps": 32,
  "max_steps": 50016,
  "num_input_tokens_seen": 104891154432,
  "num_train_epochs": 1,
  "save_steps": 2048,
  "stateful_callbacks": {
    "LogCallback": {
      "elapsed_time": 368424.2897763252,
      "start_time": 1766739748.8421957
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1998395573363655e+21,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}