{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7839559871158865,
  "eval_steps": 500,
  "global_step": 50016,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005015713289289101,
      "grad_norm": 5.1814727783203125,
      "learning_rate": 1.875e-05,
      "loss": 39.1474,
      "step": 32,
      "throughput": 3995.130054373345
    },
    {
      "epoch": 0.0010031426578578201,
      "grad_norm": 2.1393489837646484,
      "learning_rate": 3.75e-05,
      "loss": 30.6991,
      "step": 64,
      "throughput": 5926.865921123086
    },
    {
      "epoch": 0.0015047139867867302,
      "grad_norm": 1.213215947151184,
      "learning_rate": 5.625e-05,
      "loss": 27.1674,
      "step": 96,
      "throughput": 7137.788461518295
    },
    {
      "epoch": 0.0020062853157156403,
      "grad_norm": 1.3887317180633545,
      "learning_rate": 7.5e-05,
      "loss": 24.7095,
      "step": 128,
      "throughput": 8012.59199968389
    },
    {
      "epoch": 0.0025078566446445506,
      "grad_norm": 1.5313411951065063,
      "learning_rate": 9.374999999999999e-05,
      "loss": 22.9573,
      "step": 160,
      "throughput": 8640.775590769676
    },
    {
      "epoch": 0.0030094279735734604,
      "grad_norm": 1.0794322490692139,
      "learning_rate": 0.0001125,
      "loss": 21.5803,
      "step": 192,
      "throughput": 9124.247227654996
    },
    {
      "epoch": 0.0035109993025023707,
      "grad_norm": 1.7150249481201172,
      "learning_rate": 0.00013125,
      "loss": 20.2367,
      "step": 224,
      "throughput": 9503.876590751574
    },
    {
      "epoch": 0.0040125706314312806,
      "grad_norm": 1.1530752182006836,
      "learning_rate": 0.00015,
      "loss": 19.0211,
      "step": 256,
      "throughput": 9810.215026698792
    },
    {
      "epoch": 0.004514141960360191,
      "grad_norm": 1.1234049797058105,
      "learning_rate": 0.00016874999999999998,
      "loss": 17.9283,
      "step": 288,
      "throughput": 10062.289764004558
    },
    {
      "epoch": 0.005015713289289101,
      "grad_norm": 0.913296639919281,
      "learning_rate": 0.00018749999999999998,
      "loss": 16.9365,
      "step": 320,
      "throughput": 10273.36417506043
    },
    {
      "epoch": 0.005517284618218011,
      "grad_norm": 1.1062116622924805,
      "learning_rate": 0.00020624999999999997,
      "loss": 16.1751,
      "step": 352,
      "throughput": 10452.745437624211
    },
    {
      "epoch": 0.006018855947146921,
      "grad_norm": 0.9756858944892883,
      "learning_rate": 0.000225,
      "loss": 15.4471,
      "step": 384,
      "throughput": 10565.312459821444
    },
    {
      "epoch": 0.006520427276075831,
      "grad_norm": 0.9211740493774414,
      "learning_rate": 0.00024375,
      "loss": 14.8975,
      "step": 416,
      "throughput": 10647.274434110246
    },
    {
      "epoch": 0.007021998605004741,
      "grad_norm": 0.9290841221809387,
      "learning_rate": 0.0002625,
      "loss": 14.4338,
      "step": 448,
      "throughput": 10769.698005822003
    },
    {
      "epoch": 0.007523569933933652,
      "grad_norm": 0.8251187205314636,
      "learning_rate": 0.00028125,
      "loss": 14.0338,
      "step": 480,
      "throughput": 10878.049557563976
    },
    {
      "epoch": 0.008025141262862561,
      "grad_norm": 0.6234081387519836,
      "learning_rate": 0.0003,
      "loss": 13.7662,
      "step": 512,
      "throughput": 10971.220222130216
    },
    {
      "epoch": 0.008526712591791472,
      "grad_norm": 0.5208855867385864,
      "learning_rate": 0.00029999972162979993,
      "loss": 13.4863,
      "step": 544,
      "throughput": 11058.017884661573
    },
    {
      "epoch": 0.009028283920720382,
      "grad_norm": 0.4786897897720337,
      "learning_rate": 0.00029999888652034774,
      "loss": 13.2042,
      "step": 576,
      "throughput": 11136.350438298447
    },
    {
      "epoch": 0.009529855249649291,
      "grad_norm": 0.42444518208503723,
      "learning_rate": 0.00029999749467508744,
      "loss": 13.014,
      "step": 608,
      "throughput": 11207.51047082044
    },
    {
      "epoch": 0.010031426578578202,
      "grad_norm": 0.4553331732749939,
      "learning_rate": 0.0002999955460997589,
      "loss": 12.7939,
      "step": 640,
      "throughput": 11272.208055366227
    },
    {
      "epoch": 0.010532997907507112,
      "grad_norm": 0.4054524004459381,
      "learning_rate": 0.0002999930408023982,
      "loss": 12.6612,
      "step": 672,
      "throughput": 11331.413096739732
    },
    {
      "epoch": 0.011034569236436023,
      "grad_norm": 0.4195540249347687,
      "learning_rate": 0.00029998997879333714,
      "loss": 12.501,
      "step": 704,
      "throughput": 11361.610717738524
    },
    {
      "epoch": 0.011536140565364932,
      "grad_norm": 0.42652466893196106,
      "learning_rate": 0.0002999863600852034,
      "loss": 12.355,
      "step": 736,
      "throughput": 11378.28571357199
    },
    {
      "epoch": 0.012037711894293842,
      "grad_norm": 0.31916525959968567,
      "learning_rate": 0.0002999821846929206,
      "loss": 12.2631,
      "step": 768,
      "throughput": 11420.460494303246
    },
    {
      "epoch": 0.012539283223222753,
      "grad_norm": 0.3535449802875519,
      "learning_rate": 0.000299977452633708,
      "loss": 12.1538,
      "step": 800,
      "throughput": 11465.309609724914
    },
    {
      "epoch": 0.013040854552151662,
      "grad_norm": 0.2800523638725281,
      "learning_rate": 0.00029997216392708075,
      "loss": 12.0527,
      "step": 832,
      "throughput": 11505.28822949725
    },
    {
      "epoch": 0.013542425881080573,
      "grad_norm": 0.33092939853668213,
      "learning_rate": 0.00029996631859484943,
      "loss": 11.9626,
      "step": 864,
      "throughput": 11544.232063949898
    },
    {
      "epoch": 0.014043997210009483,
      "grad_norm": 0.25572431087493896,
      "learning_rate": 0.00029995991666112014,
      "loss": 11.876,
      "step": 896,
      "throughput": 11580.621967456918
    },
    {
      "epoch": 0.014545568538938392,
      "grad_norm": 0.3377090394496918,
      "learning_rate": 0.0002999529581522946,
      "loss": 11.8027,
      "step": 928,
      "throughput": 11614.753535897125
    },
    {
      "epoch": 0.015047139867867303,
      "grad_norm": 0.26273658871650696,
      "learning_rate": 0.0002999454430970696,
      "loss": 11.763,
      "step": 960,
      "throughput": 11646.571087012338
    },
    {
      "epoch": 0.015548711196796213,
      "grad_norm": 0.2514078915119171,
      "learning_rate": 0.0002999373715264373,
      "loss": 11.6852,
      "step": 992,
      "throughput": 11676.406141226686
    },
    {
      "epoch": 0.016050282525725122,
      "grad_norm": 0.21636246144771576,
      "learning_rate": 0.0002999287434736849,
      "loss": 11.6263,
      "step": 1024,
      "throughput": 11690.870213371038
    },
    {
      "epoch": 0.016551853854654033,
      "grad_norm": 0.21994490921497345,
      "learning_rate": 0.0002999195589743945,
      "loss": 11.5639,
      "step": 1056,
      "throughput": 11691.441399084995
    },
    {
      "epoch": 0.017053425183582945,
      "grad_norm": 0.2077898383140564,
      "learning_rate": 0.000299909818066443,
      "loss": 11.5014,
      "step": 1088,
      "throughput": 11710.072498228305
    },
    {
      "epoch": 0.017554996512511852,
      "grad_norm": 0.20699281990528107,
      "learning_rate": 0.00029989952079000195,
      "loss": 11.4781,
      "step": 1120,
      "throughput": 11734.86066531971
    },
    {
      "epoch": 0.018056567841440763,
      "grad_norm": 0.20005486905574799,
      "learning_rate": 0.0002998886671875373,
      "loss": 11.4124,
      "step": 1152,
      "throughput": 11758.296021447659
    },
    {
      "epoch": 0.018558139170369675,
      "grad_norm": 0.18954195082187653,
      "learning_rate": 0.0002998772573038094,
      "loss": 11.3601,
      "step": 1184,
      "throughput": 11779.562423849526
    },
    {
      "epoch": 0.019059710499298582,
      "grad_norm": 0.21691370010375977,
      "learning_rate": 0.0002998652911858726,
      "loss": 11.3089,
      "step": 1216,
      "throughput": 11800.96803685039
    },
    {
      "epoch": 0.019561281828227493,
      "grad_norm": 0.24996884167194366,
      "learning_rate": 0.00029985276888307524,
      "loss": 11.2703,
      "step": 1248,
      "throughput": 11821.228998747654
    },
    {
      "epoch": 0.020062853157156404,
      "grad_norm": 0.18788766860961914,
      "learning_rate": 0.00029983969044705927,
      "loss": 11.2489,
      "step": 1280,
      "throughput": 11840.652754110957
    },
    {
      "epoch": 0.020564424486085316,
      "grad_norm": 0.20493867993354797,
      "learning_rate": 0.0002998260559317603,
      "loss": 11.2038,
      "step": 1312,
      "throughput": 11859.215230540382
    },
    {
      "epoch": 0.021065995815014223,
      "grad_norm": 0.20930466055870056,
      "learning_rate": 0.00029981186539340703,
      "loss": 11.1555,
      "step": 1344,
      "throughput": 11870.635377978188
    },
    {
      "epoch": 0.021567567143943134,
      "grad_norm": 0.1783915013074875,
      "learning_rate": 0.0002997971188905213,
      "loss": 11.1366,
      "step": 1376,
      "throughput": 11874.738822938076
    },
    {
      "epoch": 0.022069138472872046,
      "grad_norm": 0.16567964851856232,
      "learning_rate": 0.0002997818164839178,
      "loss": 11.0964,
      "step": 1408,
      "throughput": 11875.943651931068
    },
    {
      "epoch": 0.022570709801800953,
      "grad_norm": 0.19126038253307343,
      "learning_rate": 0.00029976595823670354,
      "loss": 11.0634,
      "step": 1440,
      "throughput": 11890.967243214012
    },
    {
      "epoch": 0.023072281130729864,
      "grad_norm": 0.15882526338100433,
      "learning_rate": 0.0002997495442142781,
      "loss": 11.0499,
      "step": 1472,
      "throughput": 11906.388345043066
    },
    {
      "epoch": 0.023573852459658776,
      "grad_norm": 0.17650361359119415,
      "learning_rate": 0.000299732574484333,
      "loss": 11.0088,
      "step": 1504,
      "throughput": 11920.006024618477
    },
    {
      "epoch": 0.024075423788587683,
      "grad_norm": 0.19148582220077515,
      "learning_rate": 0.0002997150491168514,
      "loss": 10.9806,
      "step": 1536,
      "throughput": 11934.273864161154
    },
    {
      "epoch": 0.024576995117516594,
      "grad_norm": 0.18846440315246582,
      "learning_rate": 0.0002996969681841079,
      "loss": 10.9449,
      "step": 1568,
      "throughput": 11947.993589384481
    },
    {
      "epoch": 0.025078566446445506,
      "grad_norm": 0.18076573312282562,
      "learning_rate": 0.0002996783317606684,
      "loss": 10.9236,
      "step": 1600,
      "throughput": 11961.20785667367
    },
    {
      "epoch": 0.025580137775374417,
      "grad_norm": 0.164947047829628,
      "learning_rate": 0.0002996591399233895,
      "loss": 10.8896,
      "step": 1632,
      "throughput": 11973.88601819077
    },
    {
      "epoch": 0.026081709104303324,
      "grad_norm": 0.18726250529289246,
      "learning_rate": 0.00029963939275141855,
      "loss": 10.8525,
      "step": 1664,
      "throughput": 11985.116203079922
    },
    {
      "epoch": 0.026583280433232236,
      "grad_norm": 0.18523889780044556,
      "learning_rate": 0.00029961909032619275,
      "loss": 10.8433,
      "step": 1696,
      "throughput": 11986.272076412846
    },
    {
      "epoch": 0.027084851762161147,
      "grad_norm": 0.18160128593444824,
      "learning_rate": 0.00029959823273143947,
      "loss": 10.8182,
      "step": 1728,
      "throughput": 11983.895196652385
    },
    {
      "epoch": 0.027586423091090054,
      "grad_norm": 0.15962938964366913,
      "learning_rate": 0.0002995768200531755,
      "loss": 10.8248,
      "step": 1760,
      "throughput": 11992.332506370003
    },
    {
      "epoch": 0.028087994420018966,
      "grad_norm": 0.15896254777908325,
      "learning_rate": 0.00029955485237970675,
      "loss": 10.7814,
      "step": 1792,
      "throughput": 12003.474001535129
    },
    {
      "epoch": 0.028589565748947877,
      "grad_norm": 0.15553218126296997,
      "learning_rate": 0.00029953232980162793,
      "loss": 10.76,
      "step": 1824,
      "throughput": 12013.247989756059
    },
    {
      "epoch": 0.029091137077876784,
      "grad_norm": 0.16225172579288483,
      "learning_rate": 0.0002995092524118223,
      "loss": 10.7184,
      "step": 1856,
      "throughput": 12023.777304766878
    },
    {
      "epoch": 0.029592708406805696,
      "grad_norm": 0.1489713191986084,
      "learning_rate": 0.00029948562030546107,
      "loss": 10.7283,
      "step": 1888,
      "throughput": 12033.905583821244
    },
    {
      "epoch": 0.030094279735734607,
      "grad_norm": 0.15911641716957092,
      "learning_rate": 0.00029946143358000306,
      "loss": 10.6952,
      "step": 1920,
      "throughput": 12043.77236211472
    },
    {
      "epoch": 0.030595851064663518,
      "grad_norm": 0.16882722079753876,
      "learning_rate": 0.0002994366923351945,
      "loss": 10.6803,
      "step": 1952,
      "throughput": 12053.312899128261
    },
    {
      "epoch": 0.031097422393592426,
      "grad_norm": 0.1465579718351364,
      "learning_rate": 0.00029941139667306817,
      "loss": 10.6555,
      "step": 1984,
      "throughput": 12061.72702376433
    },
    {
      "epoch": 0.03159899372252133,
      "grad_norm": 0.14933708310127258,
      "learning_rate": 0.00029938554669794364,
      "loss": 10.629,
      "step": 2016,
      "throughput": 12064.317775333797
    },
    {
      "epoch": 0.032100565051450244,
      "grad_norm": 0.16598300635814667,
      "learning_rate": 0.00029935914251642625,
      "loss": 10.6152,
      "step": 2048,
      "throughput": 12061.086543803585
    },
    {
      "epoch": 0.032602136380379156,
      "grad_norm": 0.15937751531600952,
      "learning_rate": 0.0002993321842374069,
      "loss": 10.5994,
      "step": 2080,
      "throughput": 12055.457800713268
    },
    {
      "epoch": 0.03310370770930807,
      "grad_norm": 0.15959997475147247,
      "learning_rate": 0.00029930467197206156,
      "loss": 10.5698,
      "step": 2112,
      "throughput": 12064.1609508185
    },
    {
      "epoch": 0.03360527903823698,
      "grad_norm": 0.15936905145645142,
      "learning_rate": 0.000299276605833851,
      "loss": 10.5483,
      "step": 2144,
      "throughput": 12072.650798888526
    },
    {
      "epoch": 0.03410685036716589,
      "grad_norm": 0.15118514001369476,
      "learning_rate": 0.00029924798593851994,
      "loss": 10.5501,
      "step": 2176,
      "throughput": 12080.019658762709
    },
    {
      "epoch": 0.0346084216960948,
      "grad_norm": 0.14804407954216003,
      "learning_rate": 0.00029921881240409703,
      "loss": 10.5372,
      "step": 2208,
      "throughput": 12087.98405323524
    },
    {
      "epoch": 0.035109993025023704,
      "grad_norm": 0.142776757478714,
      "learning_rate": 0.00029918908535089394,
      "loss": 10.5293,
      "step": 2240,
      "throughput": 12095.798193861325
    },
    {
      "epoch": 0.035611564353952616,
      "grad_norm": 0.149314746260643,
      "learning_rate": 0.00029915880490150515,
      "loss": 10.5013,
      "step": 2272,
      "throughput": 12103.443761699426
    },
    {
      "epoch": 0.03611313568288153,
      "grad_norm": 0.14946982264518738,
      "learning_rate": 0.0002991279711808072,
      "loss": 10.5007,
      "step": 2304,
      "throughput": 12110.91765243157
    },
    {
      "epoch": 0.03661470701181044,
      "grad_norm": 0.15681925415992737,
      "learning_rate": 0.0002990965843159587,
      "loss": 10.462,
      "step": 2336,
      "throughput": 12113.172526776905
    },
    {
      "epoch": 0.03711627834073935,
      "grad_norm": 0.14676795899868011,
      "learning_rate": 0.000299064644436399,
      "loss": 10.4549,
      "step": 2368,
      "throughput": 12111.914287763128
    },
    {
      "epoch": 0.03761784966966826,
      "grad_norm": 0.1510101556777954,
      "learning_rate": 0.0002990321516738482,
      "loss": 10.4197,
      "step": 2400,
      "throughput": 12111.473673161498
    },
    {
      "epoch": 0.038119420998597164,
      "grad_norm": 0.1597863733768463,
      "learning_rate": 0.00029899910616230674,
      "loss": 10.4311,
      "step": 2432,
      "throughput": 12117.028182195081
    },
    {
      "epoch": 0.038620992327526076,
      "grad_norm": 0.14865833520889282,
      "learning_rate": 0.0002989655080380543,
      "loss": 10.4316,
      "step": 2464,
      "throughput": 12123.821883309733
    },
    {
      "epoch": 0.03912256365645499,
      "grad_norm": 0.1432724893093109,
      "learning_rate": 0.0002989313574396496,
      "loss": 10.3984,
      "step": 2496,
      "throughput": 12129.646773074963
    },
    {
      "epoch": 0.0396241349853839,
      "grad_norm": 0.1372014582157135,
      "learning_rate": 0.00029889665450792983,
      "loss": 10.3833,
      "step": 2528,
      "throughput": 12136.138293021668
    },
    {
      "epoch": 0.04012570631431281,
      "grad_norm": 0.13886210322380066,
      "learning_rate": 0.0002988613993860101,
      "loss": 10.3763,
      "step": 2560,
      "throughput": 12142.472546726538
    },
    {
      "epoch": 0.04062727764324172,
      "grad_norm": 0.13816037774085999,
      "learning_rate": 0.0002988255922192825,
      "loss": 10.3653,
      "step": 2592,
      "throughput": 12148.65141120839
    },
    {
      "epoch": 0.04112884897217063,
      "grad_norm": 0.14726093411445618,
      "learning_rate": 0.000298789233155416,
      "loss": 10.3521,
      "step": 2624,
      "throughput": 12154.738846922763
    },
    {
      "epoch": 0.041630420301099536,
      "grad_norm": 0.14088797569274902,
      "learning_rate": 0.0002987523223443554,
      "loss": 10.342,
      "step": 2656,
      "throughput": 12157.94437275133
    },
    {
      "epoch": 0.04213199163002845,
      "grad_norm": 0.12842944264411926,
      "learning_rate": 0.000298714859938321,
      "loss": 10.3272,
      "step": 2688,
      "throughput": 12156.68430068634
    },
    {
      "epoch": 0.04263356295895736,
      "grad_norm": 0.13655942678451538,
      "learning_rate": 0.0002986768460918079,
      "loss": 10.3139,
      "step": 2720,
      "throughput": 12154.777180138237
    },
    {
      "epoch": 0.04313513428788627,
      "grad_norm": 0.1346423178911209,
      "learning_rate": 0.0002986382809615853,
      "loss": 10.3062,
      "step": 2752,
      "throughput": 12158.481768431573
    },
    {
      "epoch": 0.04363670561681518,
      "grad_norm": 0.15266847610473633,
      "learning_rate": 0.00029859916470669596,
      "loss": 10.3068,
      "step": 2784,
      "throughput": 12164.032464673319
    },
    {
      "epoch": 0.04413827694574409,
      "grad_norm": 0.13238975405693054,
      "learning_rate": 0.0002985594974884554,
      "loss": 10.2559,
      "step": 2816,
      "throughput": 12168.953498849784
    },
    {
      "epoch": 0.044639848274673,
      "grad_norm": 0.13468751311302185,
      "learning_rate": 0.00029851927947045136,
      "loss": 10.271,
      "step": 2848,
      "throughput": 12174.27847236647
    },
    {
      "epoch": 0.04514141960360191,
      "grad_norm": 0.13259458541870117,
      "learning_rate": 0.000298478510818543,
      "loss": 10.2444,
      "step": 2880,
      "throughput": 12179.546502271302
    },
    {
      "epoch": 0.04564299093253082,
      "grad_norm": 0.14665883779525757,
      "learning_rate": 0.0002984371917008604,
      "loss": 10.2316,
      "step": 2912,
      "throughput": 12184.691956190614
    },
    {
      "epoch": 0.04614456226145973,
      "grad_norm": 0.13770803809165955,
      "learning_rate": 0.0002983953222878037,
      "loss": 10.2499,
      "step": 2944,
      "throughput": 12189.781748950709
    },
    {
      "epoch": 0.04664613359038864,
      "grad_norm": 0.15984688699245453,
      "learning_rate": 0.0002983529027520426,
      "loss": 10.208,
      "step": 2976,
      "throughput": 12193.11276934592
    },
    {
      "epoch": 0.04714770491931755,
      "grad_norm": 0.14149409532546997,
      "learning_rate": 0.0002983099332685153,
      "loss": 10.2187,
      "step": 3008,
      "throughput": 12192.766548304595
    },
    {
      "epoch": 0.04764927624824646,
      "grad_norm": 0.14984308183193207,
      "learning_rate": 0.000298266414014428,
      "loss": 10.2101,
      "step": 3040,
      "throughput": 12189.453681022953
    },
    {
      "epoch": 0.04815084757717537,
      "grad_norm": 0.13683457672595978,
      "learning_rate": 0.0002982223451692544,
      "loss": 10.2017,
      "step": 3072,
      "throughput": 12191.962757263142
    },
    {
      "epoch": 0.04865241890610428,
      "grad_norm": 0.14434507489204407,
      "learning_rate": 0.0002981777269147344,
      "loss": 10.1901,
      "step": 3104,
      "throughput": 12196.147790334488
    },
    {
      "epoch": 0.04915399023503319,
      "grad_norm": 0.13738803565502167,
      "learning_rate": 0.0002981325594348739,
      "loss": 10.2011,
      "step": 3136,
      "throughput": 12200.803712412984
    },
    {
      "epoch": 0.0496555615639621,
      "grad_norm": 0.1290079653263092,
      "learning_rate": 0.00029808684291594373,
      "loss": 10.1577,
      "step": 3168,
      "throughput": 12204.732814989668
    },
    {
      "epoch": 0.05015713289289101,
      "grad_norm": 0.12609000504016876,
      "learning_rate": 0.0002980405775464789,
      "loss": 10.153,
      "step": 3200,
      "throughput": 12209.032284004508
    },
    {
      "epoch": 0.05065870422181992,
      "grad_norm": 0.1282884180545807,
      "learning_rate": 0.00029799376351727797,
      "loss": 10.1375,
      "step": 3232,
      "throughput": 12213.400158356682
    },
    {
      "epoch": 0.051160275550748834,
      "grad_norm": 0.13750484585762024,
      "learning_rate": 0.00029794640102140206,
      "loss": 10.1275,
      "step": 3264,
      "throughput": 12217.685151283038
    },
    {
      "epoch": 0.05166184687967774,
      "grad_norm": 0.17931711673736572,
      "learning_rate": 0.00029789849025417433,
      "loss": 10.1251,
      "step": 3296,
      "throughput": 12221.40489775536
    },
    {
      "epoch": 0.05216341820860665,
      "grad_norm": 0.11613932251930237,
      "learning_rate": 0.0002978500314131789,
      "loss": 10.1578,
      "step": 3328,
      "throughput": 12221.815611125585
    },
    {
      "epoch": 0.05266498953753556,
      "grad_norm": 0.13067440688610077,
      "learning_rate": 0.00029780102469826014,
      "loss": 10.1099,
      "step": 3360,
      "throughput": 12221.108428713243
    },
    {
      "epoch": 0.05316656086646447,
      "grad_norm": 0.13646602630615234,
      "learning_rate": 0.00029775147031152195,
      "loss": 10.098,
      "step": 3392,
      "throughput": 12220.73803888541
    },
    {
      "epoch": 0.05366813219539338,
      "grad_norm": 0.1439736783504486,
      "learning_rate": 0.0002977013684573267,
      "loss": 10.101,
      "step": 3424,
      "throughput": 12223.880725167104
    },
    {
      "epoch": 0.054169703524322294,
      "grad_norm": 0.1305106282234192,
      "learning_rate": 0.0002976507193422946,
      "loss": 10.0933,
      "step": 3456,
      "throughput": 12227.953070782476
    },
    {
      "epoch": 0.0546712748532512,
      "grad_norm": 0.15496422350406647,
      "learning_rate": 0.00029759952317530284,
      "loss": 10.1026,
      "step": 3488,
      "throughput": 12231.44852716162
    },
    {
      "epoch": 0.05517284618218011,
      "grad_norm": 0.1231626644730568,
      "learning_rate": 0.0002975477801674845,
      "loss": 10.0508,
      "step": 3520,
      "throughput": 12235.337631141307
    },
    {
      "epoch": 0.05567441751110902,
      "grad_norm": 0.12398959696292877,
      "learning_rate": 0.00029749549053222784,
      "loss": 10.0712,
      "step": 3552,
      "throughput": 12239.127986608266
    },
    {
      "epoch": 0.05617598884003793,
      "grad_norm": 0.11847177147865295,
      "learning_rate": 0.0002974426544851755,
      "loss": 10.0456,
      "step": 3584,
      "throughput": 12242.863384381568
    },
    {
      "epoch": 0.05667756016896684,
      "grad_norm": 0.12773899734020233,
      "learning_rate": 0.00029738927224422354,
      "loss": 10.0489,
      "step": 3616,
      "throughput": 12246.014967120955
    },
    {
      "epoch": 0.057179131497895753,
      "grad_norm": 0.13713018596172333,
      "learning_rate": 0.0002973353440295205,
      "loss": 10.0223,
      "step": 3648,
      "throughput": 12247.24311621234
    },
    {
      "epoch": 0.057680702826824665,
      "grad_norm": 0.14463427662849426,
      "learning_rate": 0.0002972808700634664,
      "loss": 10.0269,
      "step": 3680,
      "throughput": 12245.601304229167
    },
    {
      "epoch": 0.05818227415575357,
      "grad_norm": 0.12428227812051773,
      "learning_rate": 0.0002972258505707121,
      "loss": 10.0188,
      "step": 3712,
      "throughput": 12244.117691271802
    },
    {
      "epoch": 0.05868384548468248,
      "grad_norm": 0.1311129778623581,
      "learning_rate": 0.00029717028577815817,
      "loss": 10.0069,
      "step": 3744,
      "throughput": 12246.616999806483
    },
    {
      "epoch": 0.05918541681361139,
      "grad_norm": 0.1432640105485916,
      "learning_rate": 0.0002971141759149539,
      "loss": 10.0253,
      "step": 3776,
      "throughput": 12249.62555966762
    },
    {
      "epoch": 0.0596869881425403,
      "grad_norm": 0.12176132202148438,
      "learning_rate": 0.00029705752121249665,
      "loss": 10.0013,
      "step": 3808,
      "throughput": 12252.551883866128
    },
    {
      "epoch": 0.060188559471469213,
      "grad_norm": 0.1390705704689026,
      "learning_rate": 0.0002970003219044305,
      "loss": 9.9886,
      "step": 3840,
      "throughput": 12255.913981330372
    },
    {
      "epoch": 0.060690130800398125,
      "grad_norm": 0.13727904856204987,
      "learning_rate": 0.0002969425782266455,
      "loss": 10.0022,
      "step": 3872,
      "throughput": 12259.206427835396
    },
    {
      "epoch": 0.061191702129327036,
      "grad_norm": 0.1457907259464264,
      "learning_rate": 0.0002968842904172769,
      "loss": 9.9932,
      "step": 3904,
      "throughput": 12262.448975674195
    },
    {
      "epoch": 0.06169327345825594,
      "grad_norm": 0.13661052286624908,
      "learning_rate": 0.00029682545871670375,
      "loss": 9.9852,
      "step": 3936,
      "throughput": 12265.135223847203
    },
    {
      "epoch": 0.06219484478718485,
      "grad_norm": 0.11689677834510803,
      "learning_rate": 0.0002967660833675481,
      "loss": 9.9688,
      "step": 3968,
      "throughput": 12266.224417757796
    },
    {
      "epoch": 0.06269641611611376,
      "grad_norm": 0.1388208568096161,
      "learning_rate": 0.0002967061646146741,
      "loss": 9.956,
      "step": 4000,
      "throughput": 12265.161713743151
    },
    {
      "epoch": 0.06319798744504267,
      "grad_norm": 0.14242246747016907,
      "learning_rate": 0.00029664570270518685,
      "loss": 9.9404,
      "step": 4032,
      "throughput": 12265.547154570646
    },
    {
      "epoch": 0.06369955877397158,
      "grad_norm": 0.12950783967971802,
      "learning_rate": 0.00029658469788843147,
      "loss": 9.9449,
      "step": 4064,
      "throughput": 12265.70607379335
    },
    {
      "epoch": 0.06420113010290049,
      "grad_norm": 0.12653489410877228,
      "learning_rate": 0.00029652315041599203,
      "loss": 9.9341,
      "step": 4096,
      "throughput": 12268.422166740711
    },
    {
      "epoch": 0.0647027014318294,
      "grad_norm": 0.13766422867774963,
      "learning_rate": 0.00029646106054169046,
      "loss": 9.9369,
      "step": 4128,
      "throughput": 12266.532288400096
    },
    {
      "epoch": 0.06520427276075831,
      "grad_norm": 0.125003382563591,
      "learning_rate": 0.00029639842852158553,
      "loss": 9.9329,
      "step": 4160,
      "throughput": 12269.080286215773
    },
    {
      "epoch": 0.06570584408968723,
      "grad_norm": 0.12812422215938568,
      "learning_rate": 0.00029633525461397194,
      "loss": 9.927,
      "step": 4192,
      "throughput": 12272.078468674545
    },
    {
      "epoch": 0.06620741541861613,
      "grad_norm": 0.13568617403507233,
      "learning_rate": 0.00029627153907937903,
      "loss": 9.9293,
      "step": 4224,
      "throughput": 12275.038225707407
    },
    {
      "epoch": 0.06670898674754504,
      "grad_norm": 0.12992961704730988,
      "learning_rate": 0.0002962072821805699,
      "loss": 9.8933,
      "step": 4256,
      "throughput": 12277.575637351083
    },
    {
      "epoch": 0.06721055807647396,
      "grad_norm": 0.11827738583087921,
      "learning_rate": 0.0002961424841825402,
      "loss": 9.9004,
      "step": 4288,
      "throughput": 12279.553752485568
    },
    {
      "epoch": 0.06771212940540286,
      "grad_norm": 0.12144280970096588,
      "learning_rate": 0.00029607714535251703,
      "loss": 9.8951,
      "step": 4320,
      "throughput": 12279.402994481276
    },
    {
      "epoch": 0.06821370073433178,
      "grad_norm": 0.11789822578430176,
      "learning_rate": 0.00029601126595995794,
      "loss": 9.8987,
      "step": 4352,
      "throughput": 12278.677511112539
    },
    {
      "epoch": 0.06871527206326068,
      "grad_norm": 0.1158544048666954,
      "learning_rate": 0.0002959448462765497,
      "loss": 9.8789,
      "step": 4384,
      "throughput": 12278.85046684112
    },
    {
      "epoch": 0.0692168433921896,
      "grad_norm": 0.12830589711666107,
      "learning_rate": 0.0002958778865762072,
      "loss": 9.8952,
      "step": 4416,
      "throughput": 12280.41777725107
    },
    {
      "epoch": 0.0697184147211185,
      "grad_norm": 0.118076391518116,
      "learning_rate": 0.0002958103871350727,
      "loss": 9.8743,
      "step": 4448,
      "throughput": 12283.159086085681
    },
    {
      "epoch": 0.07021998605004741,
      "grad_norm": 0.13101021945476532,
      "learning_rate": 0.0002957423482315139,
      "loss": 9.8762,
      "step": 4480,
      "throughput": 12285.418379854667
    },
    {
      "epoch": 0.07072155737897633,
      "grad_norm": 0.14345066249370575,
      "learning_rate": 0.0002956737701461235,
      "loss": 9.8613,
      "step": 4512,
      "throughput": 12288.06607647167
    },
    {
      "epoch": 0.07122312870790523,
      "grad_norm": 0.13554368913173676,
      "learning_rate": 0.00029560465316171773,
      "loss": 9.8495,
      "step": 4544,
      "throughput": 12290.674364258046
    },
    {
      "epoch": 0.07172470003683415,
      "grad_norm": 0.12480172514915466,
      "learning_rate": 0.0002955349975633352,
      "loss": 9.8633,
      "step": 4576,
      "throughput": 12293.266006329883
    },
    {
      "epoch": 0.07222627136576305,
      "grad_norm": 0.12317401170730591,
      "learning_rate": 0.00029546480363823577,
      "loss": 9.859,
      "step": 4608,
      "throughput": 12295.056858274298
    },
    {
      "epoch": 0.07272784269469197,
      "grad_norm": 0.12118421494960785,
      "learning_rate": 0.0002953940716758995,
      "loss": 9.8337,
      "step": 4640,
      "throughput": 12295.407806257885
    },
    {
      "epoch": 0.07322941402362088,
      "grad_norm": 0.12546321749687195,
      "learning_rate": 0.0002953228019680252,
      "loss": 9.8429,
      "step": 4672,
      "throughput": 12295.151649105255
    },
    {
      "epoch": 0.07373098535254978,
      "grad_norm": 0.1200215220451355,
      "learning_rate": 0.0002952509948085293,
      "loss": 9.82,
      "step": 4704,
      "throughput": 12294.887429480248
    },
    {
      "epoch": 0.0742325566814787,
      "grad_norm": 0.1412278413772583,
      "learning_rate": 0.00029517865049354477,
      "loss": 9.8439,
      "step": 4736,
      "throughput": 12296.221471223143
    },
    {
      "epoch": 0.0747341280104076,
      "grad_norm": 0.13449692726135254,
      "learning_rate": 0.0002951057693214197,
      "loss": 9.8178,
      "step": 4768,
      "throughput": 12298.279352568583
    },
    {
      "epoch": 0.07523569933933652,
      "grad_norm": 0.11419707536697388,
      "learning_rate": 0.0002950323515927164,
      "loss": 9.7989,
      "step": 4800,
      "throughput": 12300.697676788242
    },
    {
      "epoch": 0.07573727066826542,
      "grad_norm": 0.11441710591316223,
      "learning_rate": 0.0002949583976102097,
      "loss": 9.8139,
      "step": 4832,
      "throughput": 12302.73785434334
    },
    {
      "epoch": 0.07623884199719433,
      "grad_norm": 0.12667381763458252,
      "learning_rate": 0.00029488390767888606,
      "loss": 9.8028,
      "step": 4864,
      "throughput": 12305.123450605599
    },
    {
      "epoch": 0.07674041332612325,
      "grad_norm": 0.1141253337264061,
      "learning_rate": 0.0002948088821059422,
      "loss": 9.7975,
      "step": 4896,
      "throughput": 12307.419387340185
    },
    {
      "epoch": 0.07724198465505215,
      "grad_norm": 0.11875823140144348,
      "learning_rate": 0.0002947333212007838,
      "loss": 9.8001,
      "step": 4928,
      "throughput": 12309.448497408639
    },
    {
      "epoch": 0.07774355598398107,
      "grad_norm": 0.12149334698915482,
      "learning_rate": 0.0002946572252750242,
      "loss": 9.8045,
      "step": 4960,
      "throughput": 12309.602888274778
    },
    {
      "epoch": 0.07824512731290997,
      "grad_norm": 0.11435220390558243,
      "learning_rate": 0.0002945805946424834,
      "loss": 9.7856,
      "step": 4992,
      "throughput": 12309.117653614778
    },
    {
      "epoch": 0.07874669864183889,
      "grad_norm": 0.12228445708751678,
      "learning_rate": 0.0002945034296191861,
      "loss": 9.7938,
      "step": 5024,
      "throughput": 12308.370488061273
    },
    {
      "epoch": 0.0792482699707678,
      "grad_norm": 0.11748974025249481,
      "learning_rate": 0.00029442573052336127,
      "loss": 9.7885,
      "step": 5056,
      "throughput": 12309.201100353435
    },
    {
      "epoch": 0.0797498412996967,
      "grad_norm": 0.11757509410381317,
      "learning_rate": 0.0002943474976754401,
      "loss": 9.7451,
      "step": 5088,
      "throughput": 12311.036028542083
    },
    {
      "epoch": 0.08025141262862562,
      "grad_norm": 0.1277211606502533,
      "learning_rate": 0.0002942687313980552,
      "loss": 9.7726,
      "step": 5120,
      "throughput": 12313.223751076262
    },
    {
      "epoch": 0.08075298395755452,
      "grad_norm": 0.12524756789207458,
      "learning_rate": 0.0002941894320160389,
      "loss": 9.7733,
      "step": 5152,
      "throughput": 12314.977454209267
    },
    {
      "epoch": 0.08125455528648344,
      "grad_norm": 0.12436572462320328,
      "learning_rate": 0.00029410959985642205,
      "loss": 9.7558,
      "step": 5184,
      "throughput": 12317.09806861488
    },
    {
      "epoch": 0.08175612661541234,
      "grad_norm": 0.11354023963212967,
      "learning_rate": 0.0002940292352484327,
      "loss": 9.7416,
      "step": 5216,
      "throughput": 12319.223291715813
    },
    {
      "epoch": 0.08225769794434126,
      "grad_norm": 0.11431898921728134,
      "learning_rate": 0.0002939483385234948,
      "loss": 9.7403,
      "step": 5248,
      "throughput": 12321.033988877938
    },
    {
      "epoch": 0.08275926927327017,
      "grad_norm": 0.12201514840126038,
      "learning_rate": 0.0002938669100152266,
      "loss": 9.7622,
      "step": 5280,
      "throughput": 12321.809804324736
    },
    {
      "epoch": 0.08326084060219907,
      "grad_norm": 0.12244311720132828,
      "learning_rate": 0.00029378495005943954,
      "loss": 9.7318,
      "step": 5312,
      "throughput": 12321.709958468915
    },
    {
      "epoch": 0.08376241193112799,
      "grad_norm": 0.12442632764577866,
      "learning_rate": 0.00029370245899413677,
      "loss": 9.7324,
      "step": 5344,
      "throughput": 12320.893035853058
    },
    {
      "epoch": 0.0842639832600569,
      "grad_norm": 0.10813046246767044,
      "learning_rate": 0.0002936194371595116,
      "loss": 9.7355,
      "step": 5376,
      "throughput": 12321.800544082063
    },
    {
      "epoch": 0.08476555458898581,
      "grad_norm": 0.1120183914899826,
      "learning_rate": 0.00029353588489794636,
      "loss": 9.7248,
      "step": 5408,
      "throughput": 12322.747158697055
    },
    {
      "epoch": 0.08526712591791472,
      "grad_norm": 0.1144866794347763,
      "learning_rate": 0.0002934518025540109,
      "loss": 9.723,
      "step": 5440,
      "throughput": 12324.437603602319
    },
    {
      "epoch": 0.08576869724684363,
      "grad_norm": 0.10953057557344437,
      "learning_rate": 0.00029336719047446096,
      "loss": 9.7424,
      "step": 5472,
      "throughput": 12326.037578732192
    },
    {
      "epoch": 0.08627026857577254,
      "grad_norm": 0.12415914237499237,
      "learning_rate": 0.000293282049008237,
      "loss": 9.7132,
      "step": 5504,
      "throughput": 12327.988044033029
    },
    {
      "epoch": 0.08677183990470144,
      "grad_norm": 0.12535545229911804,
      "learning_rate": 0.00029319637850646273,
      "loss": 9.7169,
      "step": 5536,
      "throughput": 12329.945895602787
    },
    {
      "epoch": 0.08727341123363036,
      "grad_norm": 0.11870857328176498,
      "learning_rate": 0.0002931101793224435,
      "loss": 9.7188,
      "step": 5568,
      "throughput": 12331.603366242149
    },
    {
      "epoch": 0.08777498256255926,
      "grad_norm": 0.11292309314012527,
      "learning_rate": 0.0002930234518116651,
      "loss": 9.7178,
      "step": 5600,
      "throughput": 12332.284273719622
    },
    {
      "epoch": 0.08827655389148818,
      "grad_norm": 0.12400523573160172,
      "learning_rate": 0.000292936196331792,
      "loss": 9.6843,
      "step": 5632,
      "throughput": 12332.697010653936
    },
    {
      "epoch": 0.08877812522041709,
      "grad_norm": 0.11735209077596664,
      "learning_rate": 0.000292848413242666,
      "loss": 9.7094,
      "step": 5664,
      "throughput": 12332.795802288674
    },
    {
      "epoch": 0.089279696549346,
      "grad_norm": 0.11407672613859177,
      "learning_rate": 0.0002927601029063049,
      "loss": 9.6873,
      "step": 5696,
      "throughput": 12332.697222211196
    },
    {
      "epoch": 0.08978126787827491,
      "grad_norm": 0.12642164528369904,
      "learning_rate": 0.0002926712656869007,
      "loss": 9.6826,
      "step": 5728,
      "throughput": 12333.568590858733
    },
    {
      "epoch": 0.09028283920720381,
      "grad_norm": 0.11882464587688446,
      "learning_rate": 0.0002925819019508184,
      "loss": 9.6877,
      "step": 5760,
      "throughput": 12335.226550369427
    },
    {
      "epoch": 0.09078441053613273,
      "grad_norm": 0.12189045548439026,
      "learning_rate": 0.0002924920120665943,
      "loss": 9.6935,
      "step": 5792,
      "throughput": 12337.029247432713
    },
    {
      "epoch": 0.09128598186506164,
      "grad_norm": 0.12018447369337082,
      "learning_rate": 0.00029240159640493463,
      "loss": 9.689,
      "step": 5824,
      "throughput": 12338.4540721314
    },
    {
      "epoch": 0.09178755319399055,
      "grad_norm": 0.11342897266149521,
      "learning_rate": 0.00029231065533871374,
      "loss": 9.6768,
      "step": 5856,
      "throughput": 12340.23535636152
    },
    {
      "epoch": 0.09228912452291946,
      "grad_norm": 0.11006417125463486,
      "learning_rate": 0.0002922191892429729,
      "loss": 9.657,
      "step": 5888,
      "throughput": 12341.714272101834
    },
    {
      "epoch": 0.09279069585184836,
      "grad_norm": 0.1147543340921402,
      "learning_rate": 0.0002921271984949185,
      "loss": 9.6669,
      "step": 5920,
      "throughput": 12343.211380920462
    },
    {
      "epoch": 0.09329226718077728,
      "grad_norm": 0.10900643467903137,
      "learning_rate": 0.0002920346834739208,
      "loss": 9.6532,
      "step": 5952,
      "throughput": 12343.042496781856
    },
    {
      "epoch": 0.09379383850970618,
      "grad_norm": 0.11826995015144348,
      "learning_rate": 0.0002919416445615119,
      "loss": 9.6561,
      "step": 5984,
      "throughput": 12342.599329399904
    },
    {
      "epoch": 0.0942954098386351,
      "grad_norm": 0.12266913801431656,
      "learning_rate": 0.0002918480821413846,
      "loss": 9.6378,
      "step": 6016,
      "throughput": 12342.16939111886
    },
    {
      "epoch": 0.094796981167564,
      "grad_norm": 0.10905318707227707,
      "learning_rate": 0.0002917539965993906,
      "loss": 9.6405,
      "step": 6048,
      "throughput": 12342.86339634901
    },
    {
      "epoch": 0.09529855249649292,
      "grad_norm": 0.11646851897239685,
      "learning_rate": 0.00029165938832353885,
      "loss": 9.6457,
      "step": 6080,
      "throughput": 12344.365800153453
    },
    {
      "epoch": 0.09580012382542183,
      "grad_norm": 0.11943021416664124,
      "learning_rate": 0.00029156425770399434,
      "loss": 9.6318,
      "step": 6112,
      "throughput": 12346.042912433375
    },
    {
      "epoch": 0.09630169515435073,
      "grad_norm": 0.12691287696361542,
      "learning_rate": 0.0002914686051330759,
      "loss": 9.6346,
      "step": 6144,
      "throughput": 12347.387013004074
    },
    {
      "epoch": 0.09680326648327965,
      "grad_norm": 0.1268269121646881,
      "learning_rate": 0.00029137243100525506,
      "loss": 9.6586,
      "step": 6176,
      "throughput": 12345.744944764449
    },
    {
      "epoch": 0.09730483781220856,
      "grad_norm": 0.11386391520500183,
      "learning_rate": 0.00029127573571715416,
      "loss": 9.6269,
      "step": 6208,
      "throughput": 12347.386245501879
    },
    {
      "epoch": 0.09780640914113747,
      "grad_norm": 0.11313401907682419,
      "learning_rate": 0.00029117851966754495,
      "loss": 9.6245,
      "step": 6240,
      "throughput": 12348.8221084515
    },
    {
      "epoch": 0.09830798047006638,
      "grad_norm": 0.11410092562437057,
      "learning_rate": 0.00029108078325734666,
      "loss": 9.6307,
      "step": 6272,
      "throughput": 12348.797703159744
    },
    {
      "epoch": 0.0988095517989953,
      "grad_norm": 0.12609340250492096,
      "learning_rate": 0.0002909825268896245,
      "loss": 9.6201,
      "step": 6304,
      "throughput": 12348.257753633938
    },
    {
      "epoch": 0.0993111231279242,
      "grad_norm": 0.10618647187948227,
      "learning_rate": 0.000290883750969588,
      "loss": 9.6123,
      "step": 6336,
      "throughput": 12348.513898569872
    },
    {
      "epoch": 0.0998126944568531,
      "grad_norm": 0.12335828691720963,
      "learning_rate": 0.00029078445590458946,
      "loss": 9.6015,
      "step": 6368,
      "throughput": 12349.050939061655
    },
    {
      "epoch": 0.10031426578578202,
      "grad_norm": 0.10779345035552979,
      "learning_rate": 0.0002906846421041219,
      "loss": 9.6393,
      "step": 6400,
      "throughput": 12349.737200284571
    },
    {
      "epoch": 0.10081583711471093,
      "grad_norm": 0.11098971217870712,
      "learning_rate": 0.00029058430997981784,
      "loss": 9.5854,
      "step": 6432,
      "throughput": 12351.193305270057
    },
    {
      "epoch": 0.10131740844363984,
      "grad_norm": 0.11716262996196747,
      "learning_rate": 0.0002904834599454472,
      "loss": 9.5901,
      "step": 6464,
      "throughput": 12352.517470523717
    },
    {
      "epoch": 0.10181897977256875,
      "grad_norm": 0.10677265375852585,
      "learning_rate": 0.00029038209241691575,
      "loss": 9.617,
      "step": 6496,
      "throughput": 12354.060061073262
    },
    {
      "epoch": 0.10232055110149767,
      "grad_norm": 0.1280500292778015,
      "learning_rate": 0.0002902802078122636,
      "loss": 9.5724,
      "step": 6528,
      "throughput": 12355.586840332062
    },
    {
      "epoch": 0.10282212243042657,
      "grad_norm": 0.11151953786611557,
      "learning_rate": 0.00029017780655166315,
      "loss": 9.5919,
      "step": 6560,
      "throughput": 12356.84589819978
    },
    {
      "epoch": 0.10332369375935548,
      "grad_norm": 0.11358866095542908,
      "learning_rate": 0.0002900748890574175,
      "loss": 9.599,
      "step": 6592,
      "throughput": 12357.114017955366
    },
    {
      "epoch": 0.1038252650882844,
      "grad_norm": 0.10364415496587753,
      "learning_rate": 0.0002899714557539586,
      "loss": 9.5957,
      "step": 6624,
      "throughput": 12356.952324675427
    },
    {
      "epoch": 0.1043268364172133,
      "grad_norm": 0.1232060045003891,
      "learning_rate": 0.00028986750706784574,
      "loss": 9.5985,
      "step": 6656,
      "throughput": 12357.1067247594
    },
    {
      "epoch": 0.10482840774614222,
      "grad_norm": 0.102699875831604,
      "learning_rate": 0.0002897630434277637,
      "loss": 9.5758,
      "step": 6688,
      "throughput": 12356.907550118318
    },
    {
      "epoch": 0.10532997907507112,
      "grad_norm": 0.10401725023984909,
      "learning_rate": 0.0002896580652645207,
      "loss": 9.5691,
      "step": 6720,
      "throughput": 12357.6009277936
    },
    {
      "epoch": 0.10583155040400004,
      "grad_norm": 0.11600401997566223,
      "learning_rate": 0.00028955257301104714,
      "loss": 9.5597,
      "step": 6752,
      "throughput": 12358.876152325516
    },
    {
      "epoch": 0.10633312173292894,
      "grad_norm": 0.1357276439666748,
      "learning_rate": 0.00028944656710239337,
      "loss": 9.5518,
      "step": 6784,
      "throughput": 12360.335183790394
    },
    {
      "epoch": 0.10683469306185785,
      "grad_norm": 0.1068183034658432,
      "learning_rate": 0.00028934004797572795,
      "loss": 9.5883,
      "step": 6816,
      "throughput": 12361.487846370224
    },
    {
      "epoch": 0.10733626439078676,
      "grad_norm": 0.10323406755924225,
      "learning_rate": 0.00028923301607033616,
      "loss": 9.5374,
      "step": 6848,
      "throughput": 12362.91980993743
    },
    {
      "epoch": 0.10783783571971567,
      "grad_norm": 0.12117818742990494,
      "learning_rate": 0.0002891254718276178,
      "loss": 9.5873,
      "step": 6880,
      "throughput": 12364.151782712226
    },
    {
      "epoch": 0.10833940704864459,
      "grad_norm": 0.11063683032989502,
      "learning_rate": 0.00028901741569108586,
      "loss": 9.5572,
      "step": 6912,
      "throughput": 12364.86991250105
    },
    {
      "epoch": 0.10884097837757349,
      "grad_norm": 0.12235350161790848,
      "learning_rate": 0.00028890884810636394,
      "loss": 9.5629,
      "step": 6944,
      "throughput": 12364.687522607777
    },
    {
      "epoch": 0.1093425497065024,
      "grad_norm": 0.10094378143548965,
      "learning_rate": 0.00028879976952118523,
      "loss": 9.5556,
      "step": 6976,
      "throughput": 12364.93814754636
    },
    {
      "epoch": 0.10984412103543131,
      "grad_norm": 0.1071799248456955,
      "learning_rate": 0.0002886901803853901,
      "loss": 9.5773,
      "step": 7008,
      "throughput": 12364.533697610548
    },
    {
      "epoch": 0.11034569236436022,
      "grad_norm": 0.10255227237939835,
      "learning_rate": 0.00028858008115092445,
      "loss": 9.5436,
      "step": 7040,
      "throughput": 12365.488571719223
    },
    {
      "epoch": 0.11084726369328914,
      "grad_norm": 0.11478185653686523,
      "learning_rate": 0.0002884694722718378,
      "loss": 9.5448,
      "step": 7072,
      "throughput": 12366.375113572562
    },
    {
      "epoch": 0.11134883502221804,
      "grad_norm": 0.11082364618778229,
      "learning_rate": 0.00028835835420428163,
      "loss": 9.5343,
      "step": 7104,
      "throughput": 12367.574394631327
    },
    {
      "epoch": 0.11185040635114696,
      "grad_norm": 0.10560789704322815,
      "learning_rate": 0.000288246727406507,
      "loss": 9.5368,
      "step": 7136,
      "throughput": 12368.715921217981
    },
    {
      "epoch": 0.11235197768007586,
      "grad_norm": 0.1051524356007576,
      "learning_rate": 0.00028813459233886335,
      "loss": 9.5262,
      "step": 7168,
      "throughput": 12370.051759183418
    },
    {
      "epoch": 0.11285354900900477,
      "grad_norm": 0.12176744639873505,
      "learning_rate": 0.00028802194946379585,
      "loss": 9.5051,
      "step": 7200,
      "throughput": 12371.17860459098
    },
    {
      "epoch": 0.11335512033793368,
      "grad_norm": 0.10709969699382782,
      "learning_rate": 0.0002879087992458442,
      "loss": 9.5216,
      "step": 7232,
      "throughput": 12372.029559352753
    },
    {
      "epoch": 0.11385669166686259,
      "grad_norm": 0.11609054356813431,
      "learning_rate": 0.00028779514215164015,
      "loss": 9.5081,
      "step": 7264,
      "throughput": 12372.039048884106
    },
    {
      "epoch": 0.11435826299579151,
      "grad_norm": 0.10819747298955917,
      "learning_rate": 0.0002876809786499059,
      "loss": 9.5228,
      "step": 7296,
      "throughput": 12371.93998091193
    },
    {
      "epoch": 0.11485983432472041,
      "grad_norm": 0.10557551681995392,
      "learning_rate": 0.0002875663092114521,
      "loss": 9.5303,
      "step": 7328,
      "throughput": 12372.227380949147
    },
    {
      "epoch": 0.11536140565364933,
      "grad_norm": 0.11606358736753464,
      "learning_rate": 0.0002874511343091758,
      "loss": 9.517,
      "step": 7360,
      "throughput": 12372.649695724014
    },
    {
      "epoch": 0.11586297698257823,
      "grad_norm": 0.12774042785167694,
      "learning_rate": 0.00028733545441805874,
      "loss": 9.5215,
      "step": 7392,
      "throughput": 12373.206108875034
    },
    {
      "epoch": 0.11636454831150714,
      "grad_norm": 0.10692732781171799,
      "learning_rate": 0.00028721927001516503,
      "loss": 9.5291,
      "step": 7424,
      "throughput": 12374.371738841517
    },
    {
      "epoch": 0.11686611964043606,
      "grad_norm": 0.12107761949300766,
      "learning_rate": 0.00028710258157963955,
      "loss": 9.5179,
      "step": 7456,
      "throughput": 12375.376720026492
    },
    {
      "epoch": 0.11736769096936496,
      "grad_norm": 0.11835253238677979,
      "learning_rate": 0.00028698538959270577,
      "loss": 9.5159,
      "step": 7488,
      "throughput": 12376.636426577035
    },
    {
      "epoch": 0.11786926229829388,
      "grad_norm": 0.12197201699018478,
      "learning_rate": 0.00028686769453766366,
      "loss": 9.5129,
      "step": 7520,
      "throughput": 12377.680118744986
    },
    {
      "epoch": 0.11837083362722278,
      "grad_norm": 0.11214063316583633,
      "learning_rate": 0.00028674949689988814,
      "loss": 9.4886,
      "step": 7552,
      "throughput": 12378.917310693925
    },
    {
      "epoch": 0.1188724049561517,
      "grad_norm": 0.10614926367998123,
      "learning_rate": 0.00028663079716682654,
      "loss": 9.4835,
      "step": 7584,
      "throughput": 12378.823140998962
    },
    {
      "epoch": 0.1193739762850806,
      "grad_norm": 0.10913848131895065,
      "learning_rate": 0.00028651159582799695,
      "loss": 9.4918,
      "step": 7616,
      "throughput": 12378.437097872087
    },
    {
      "epoch": 0.11987554761400951,
      "grad_norm": 0.11134051531553268,
      "learning_rate": 0.000286391893374986,
      "loss": 9.4976,
      "step": 7648,
      "throughput": 12379.085729381744
    },
    {
      "epoch": 0.12037711894293843,
      "grad_norm": 0.10430318862199783,
      "learning_rate": 0.0002862716903014469,
      "loss": 9.4942,
      "step": 7680,
      "throughput": 12378.795603573722
    },
    {
      "epoch": 0.12087869027186733,
      "grad_norm": 0.10720957070589066,
      "learning_rate": 0.0002861509871030977,
      "loss": 9.479,
      "step": 7712,
      "throughput": 12379.607401723628
    },
    {
      "epoch": 0.12138026160079625,
      "grad_norm": 0.11613073199987411,
      "learning_rate": 0.0002860297842777185,
      "loss": 9.4616,
      "step": 7744,
      "throughput": 12380.6370253053
    },
    {
      "epoch": 0.12188183292972515,
      "grad_norm": 0.10742553323507309,
      "learning_rate": 0.00028590808232515025,
      "loss": 9.475,
      "step": 7776,
      "throughput": 12381.74917855792
    },
    {
      "epoch": 0.12238340425865407,
      "grad_norm": 0.10911645740270615,
      "learning_rate": 0.00028578588174729214,
      "loss": 9.4682,
      "step": 7808,
      "throughput": 12382.676613473282
    },
    {
      "epoch": 0.12288497558758298,
      "grad_norm": 0.11030824482440948,
      "learning_rate": 0.0002856631830480997,
      "loss": 9.472,
      "step": 7840,
      "throughput": 12383.85872889057
    },
    {
      "epoch": 0.12338654691651188,
      "grad_norm": 0.10081150382757187,
      "learning_rate": 0.0002855399867335827,
      "loss": 9.4654,
      "step": 7872,
      "throughput": 12384.80405612001
    },
    {
      "epoch": 0.1238881182454408,
      "grad_norm": 0.11015335470438004,
      "learning_rate": 0.0002854162933118032,
      "loss": 9.4729,
      "step": 7904,
      "throughput": 12384.971868652405
    },
    {
      "epoch": 0.1243896895743697,
      "grad_norm": 0.1104336678981781,
      "learning_rate": 0.0002852921032928732,
      "loss": 9.4595,
      "step": 7936,
      "throughput": 12384.722784807582
    },
    {
      "epoch": 0.12489126090329862,
      "grad_norm": 0.10051671415567398,
      "learning_rate": 0.0002851674171889526,
      "loss": 9.4595,
      "step": 7968,
      "throughput": 12384.931877529722
    },
    {
      "epoch": 0.12539283223222752,
      "grad_norm": 0.10377933830022812,
      "learning_rate": 0.0002850422355142474,
      "loss": 9.4561,
      "step": 8000,
      "throughput": 12384.742319617793
    },
    {
      "epoch": 0.12589440356115644,
      "grad_norm": 0.0999143123626709,
      "learning_rate": 0.00028491655878500716,
      "loss": 9.4604,
      "step": 8032,
      "throughput": 12385.513044172185
    },
    {
      "epoch": 0.12639597489008533,
      "grad_norm": 0.10762206465005875,
      "learning_rate": 0.0002847903875195231,
      "loss": 9.4473,
      "step": 8064,
      "throughput": 12386.24801555529
    },
    {
      "epoch": 0.12689754621901425,
      "grad_norm": 0.10370098799467087,
      "learning_rate": 0.00028466372223812575,
      "loss": 9.4184,
      "step": 8096,
      "throughput": 12387.234833254606
    },
    {
      "epoch": 0.12739911754794317,
      "grad_norm": 0.10739105939865112,
      "learning_rate": 0.0002845365634631833,
      "loss": 9.4831,
      "step": 8128,
      "throughput": 12388.129755041251
    },
    {
      "epoch": 0.1279006888768721,
      "grad_norm": 0.12630467116832733,
      "learning_rate": 0.0002844089117190988,
      "loss": 9.444,
      "step": 8160,
      "throughput": 12389.256771455293
    },
    {
      "epoch": 0.12840226020580098,
      "grad_norm": 0.10570238530635834,
      "learning_rate": 0.0002842807675323085,
      "loss": 9.4557,
      "step": 8192,
      "throughput": 12390.185039932749
    },
    {
      "epoch": 0.1289038315347299,
      "grad_norm": 0.0973723828792572,
      "learning_rate": 0.00028415213143127935,
      "loss": 9.4579,
      "step": 8224,
      "throughput": 12388.011068468239
    },
    {
      "epoch": 0.1294054028636588,
      "grad_norm": 0.09969617426395416,
      "learning_rate": 0.00028402300394650697,
      "loss": 9.4488,
      "step": 8256,
      "throughput": 12387.788565521383
    },
    {
      "epoch": 0.1299069741925877,
      "grad_norm": 0.106363944709301,
      "learning_rate": 0.0002838933856105136,
      "loss": 9.4312,
      "step": 8288,
      "throughput": 12388.006313569616
    },
    {
      "epoch": 0.13040854552151662,
      "grad_norm": 0.11680560559034348,
      "learning_rate": 0.0002837632769578455,
      "loss": 9.4467,
      "step": 8320,
      "throughput": 12388.584432786123
    },
    {
      "epoch": 0.13091011685044554,
      "grad_norm": 0.10573258250951767,
      "learning_rate": 0.00028363267852507133,
      "loss": 9.43,
      "step": 8352,
      "throughput": 12388.676328642412
    },
    {
      "epoch": 0.13141168817937446,
      "grad_norm": 0.10454431176185608,
      "learning_rate": 0.0002835015908507793,
      "loss": 9.4489,
      "step": 8384,
      "throughput": 12389.374522278958
    },
    {
      "epoch": 0.13191325950830335,
      "grad_norm": 0.1074761152267456,
      "learning_rate": 0.0002833700144755753,
      "loss": 9.4229,
      "step": 8416,
      "throughput": 12390.318722076217
    },
    {
      "epoch": 0.13241483083723227,
      "grad_norm": 0.10638105869293213,
      "learning_rate": 0.0002832379499420808,
      "loss": 9.4412,
      "step": 8448,
      "throughput": 12391.3988338955
    },
    {
      "epoch": 0.13291640216616118,
      "grad_norm": 0.11379750072956085,
      "learning_rate": 0.0002831053977949303,
      "loss": 9.4209,
      "step": 8480,
      "throughput": 12392.285440841459
    },
    {
      "epoch": 0.13341797349509008,
      "grad_norm": 0.09992935508489609,
      "learning_rate": 0.00028297235858076923,
      "loss": 9.4292,
      "step": 8512,
      "throughput": 12393.139650485358
    },
    {
      "epoch": 0.133919544824019,
      "grad_norm": 0.09835680574178696,
      "learning_rate": 0.0002828388328482517,
      "loss": 9.4154,
      "step": 8544,
      "throughput": 12393.640883769935
    },
    {
      "epoch": 0.1344211161529479,
      "grad_norm": 0.10513360053300858,
      "learning_rate": 0.0002827048211480383,
      "loss": 9.4142,
      "step": 8576,
      "throughput": 12393.44078783823
    },
    {
      "epoch": 0.13492268748187683,
      "grad_norm": 0.10509216040372849,
      "learning_rate": 0.00028257032403279354,
      "loss": 9.4211,
      "step": 8608,
      "throughput": 12393.843707408567
    },
    {
      "epoch": 0.13542425881080572,
      "grad_norm": 0.10336083173751831,
      "learning_rate": 0.00028243534205718405,
      "loss": 9.4149,
      "step": 8640,
      "throughput": 12394.43407922468
    },
    {
      "epoch": 0.13592583013973464,
      "grad_norm": 0.11077357828617096,
      "learning_rate": 0.00028229987577787585,
      "loss": 9.4099,
      "step": 8672,
      "throughput": 12394.714826627447
    },
    {
      "epoch": 0.13642740146866356,
      "grad_norm": 0.12169834226369858,
      "learning_rate": 0.00028216392575353225,
      "loss": 9.3903,
      "step": 8704,
      "throughput": 12395.355541862467
    },
    {
      "epoch": 0.13692897279759245,
      "grad_norm": 0.11460896581411362,
      "learning_rate": 0.00028202749254481165,
      "loss": 9.3963,
      "step": 8736,
      "throughput": 12396.24861068647
    },
    {
      "epoch": 0.13743054412652136,
      "grad_norm": 0.10884397476911545,
      "learning_rate": 0.0002818905767143649,
      "loss": 9.4084,
      "step": 8768,
      "throughput": 12397.254228749884
    },
    {
      "epoch": 0.13793211545545028,
      "grad_norm": 0.1012769564986229,
      "learning_rate": 0.0002817531788268333,
      "loss": 9.4047,
      "step": 8800,
      "throughput": 12398.007134221343
    },
    {
      "epoch": 0.1384336867843792,
      "grad_norm": 0.09802801162004471,
      "learning_rate": 0.0002816152994488462,
      "loss": 9.4027,
      "step": 8832,
      "throughput": 12398.821127462916
    },
    {
      "epoch": 0.1389352581133081,
      "grad_norm": 0.09760963916778564,
      "learning_rate": 0.0002814769391490185,
      "loss": 9.4263,
      "step": 8864,
      "throughput": 12399.59773979323
    },
    {
      "epoch": 0.139436829442237,
      "grad_norm": 0.10140734910964966,
      "learning_rate": 0.0002813380984979486,
      "loss": 9.3938,
      "step": 8896,
      "throughput": 12399.861320612416
    },
    {
      "epoch": 0.13993840077116593,
      "grad_norm": 0.10481631755828857,
      "learning_rate": 0.00028119877806821557,
      "loss": 9.3936,
      "step": 8928,
      "throughput": 12399.387348726712
    },
    {
      "epoch": 0.14043997210009482,
      "grad_norm": 0.09842522442340851,
      "learning_rate": 0.00028105897843437746,
      "loss": 9.4058,
      "step": 8960,
      "throughput": 12399.916140995505
    },
    {
      "epoch": 0.14094154342902374,
      "grad_norm": 0.10943567752838135,
      "learning_rate": 0.0002809187001729683,
      "loss": 9.4116,
      "step": 8992,
      "throughput": 12399.814656388015
    },
    {
      "epoch": 0.14144311475795265,
      "grad_norm": 0.10182485729455948,
      "learning_rate": 0.00028077794386249604,
      "loss": 9.3873,
      "step": 9024,
      "throughput": 12400.67111512373
    },
    {
      "epoch": 0.14194468608688157,
      "grad_norm": 0.11439745873212814,
      "learning_rate": 0.0002806367100834401,
      "loss": 9.3823,
      "step": 9056,
      "throughput": 12401.250114755268
    },
    {
      "epoch": 0.14244625741581046,
      "grad_norm": 0.10296665132045746,
      "learning_rate": 0.00028049499941824906,
      "loss": 9.3898,
      "step": 9088,
      "throughput": 12402.11882297528
    },
    {
      "epoch": 0.14294782874473938,
      "grad_norm": 0.10108328610658646,
      "learning_rate": 0.0002803528124513382,
      "loss": 9.3674,
      "step": 9120,
      "throughput": 12402.854530836656
    },
    {
      "epoch": 0.1434494000736683,
      "grad_norm": 0.10175160318613052,
      "learning_rate": 0.00028021014976908676,
      "loss": 9.3948,
      "step": 9152,
      "throughput": 12403.821189917853
    },
    {
      "epoch": 0.1439509714025972,
      "grad_norm": 0.09991069883108139,
      "learning_rate": 0.0002800670119598363,
      "loss": 9.3585,
      "step": 9184,
      "throughput": 12404.386932082074
    },
    {
      "epoch": 0.1444525427315261,
      "grad_norm": 0.10071180760860443,
      "learning_rate": 0.0002799233996138874,
      "loss": 9.3922,
      "step": 9216,
      "throughput": 12404.620936018224
    },
    {
      "epoch": 0.14495411406045502,
      "grad_norm": 0.09633713215589523,
      "learning_rate": 0.00027977931332349786,
      "loss": 9.3715,
      "step": 9248,
      "throughput": 12404.222577241006
    },
    {
      "epoch": 0.14545568538938394,
      "grad_norm": 0.09819021821022034,
      "learning_rate": 0.00027963475368288006,
      "loss": 9.3892,
      "step": 9280,
      "throughput": 12404.729388610622
    },
    {
      "epoch": 0.14595725671831283,
      "grad_norm": 0.10015096515417099,
      "learning_rate": 0.00027948972128819823,
      "loss": 9.367,
      "step": 9312,
      "throughput": 12404.946783085732
    },
    {
      "epoch": 0.14645882804724175,
      "grad_norm": 0.0994173064827919,
      "learning_rate": 0.0002793442167375665,
      "loss": 9.3612,
      "step": 9344,
      "throughput": 12405.370142775204
    },
    {
      "epoch": 0.14696039937617067,
      "grad_norm": 0.10099942237138748,
      "learning_rate": 0.0002791982406310461,
      "loss": 9.3731,
      "step": 9376,
      "throughput": 12405.942822251614
    },
    {
      "epoch": 0.14746197070509956,
      "grad_norm": 0.10506460070610046,
      "learning_rate": 0.0002790517935706428,
      "loss": 9.3701,
      "step": 9408,
      "throughput": 12406.739091407724
    },
    {
      "epoch": 0.14796354203402848,
      "grad_norm": 0.10151456296443939,
      "learning_rate": 0.00027890487616030475,
      "loss": 9.3723,
      "step": 9440,
      "throughput": 12407.6516964536
    },
    {
      "epoch": 0.1484651133629574,
      "grad_norm": 0.10469347983598709,
      "learning_rate": 0.0002787574890059199,
      "loss": 9.3405,
      "step": 9472,
      "throughput": 12408.330275954717
    },
    {
      "epoch": 0.1489666846918863,
      "grad_norm": 0.09777528792619705,
      "learning_rate": 0.0002786096327153131,
      "loss": 9.3857,
      "step": 9504,
      "throughput": 12409.100735501179
    },
    {
      "epoch": 0.1494682560208152,
      "grad_norm": 0.10799805074930191,
      "learning_rate": 0.00027846130789824437,
      "loss": 9.3455,
      "step": 9536,
      "throughput": 12409.336452258403
    },
    {
      "epoch": 0.14996982734974412,
      "grad_norm": 0.10517607629299164,
      "learning_rate": 0.00027831251516640553,
      "loss": 9.3661,
      "step": 9568,
      "throughput": 12409.25003662909
    },
    {
      "epoch": 0.15047139867867304,
      "grad_norm": 0.10305366665124893,
      "learning_rate": 0.00027816325513341835,
      "loss": 9.3567,
      "step": 9600,
      "throughput": 12409.543587883387
    },
    {
      "epoch": 0.15097297000760193,
      "grad_norm": 0.1012721136212349,
      "learning_rate": 0.0002780135284148315,
      "loss": 9.3639,
      "step": 9632,
      "throughput": 12409.81051181453
    },
    {
      "epoch": 0.15147454133653085,
      "grad_norm": 0.10102220624685287,
      "learning_rate": 0.00027786333562811855,
      "loss": 9.3481,
      "step": 9664,
      "throughput": 12409.927072305443
    },
    {
      "epoch": 0.15197611266545977,
      "grad_norm": 0.10052553564310074,
      "learning_rate": 0.00027771267739267494,
      "loss": 9.3439,
      "step": 9696,
      "throughput": 12410.603156199979
    },
    {
      "epoch": 0.15247768399438866,
      "grad_norm": 0.09986281394958496,
      "learning_rate": 0.0002775615543298157,
      "loss": 9.3327,
      "step": 9728,
      "throughput": 12411.311376912594
    },
    {
      "epoch": 0.15297925532331758,
      "grad_norm": 0.09924038499593735,
      "learning_rate": 0.0002774099670627728,
      "loss": 9.3511,
      "step": 9760,
      "throughput": 12412.067980367869
    },
    {
      "epoch": 0.1534808266522465,
      "grad_norm": 0.1034877747297287,
      "learning_rate": 0.00027725791621669257,
      "loss": 9.3601,
      "step": 9792,
      "throughput": 12412.759386683938
    },
    {
      "epoch": 0.1539823979811754,
      "grad_norm": 0.09982676804065704,
      "learning_rate": 0.0002771054024186331,
      "loss": 9.3615,
      "step": 9824,
      "throughput": 12413.450320974767
    },
    {
      "epoch": 0.1544839693101043,
      "grad_norm": 0.09577617049217224,
      "learning_rate": 0.0002769524262975618,
      "loss": 9.3295,
      "step": 9856,
      "throughput": 12413.697121302801
    },
    {
      "epoch": 0.15498554063903322,
      "grad_norm": 0.10734862834215164,
      "learning_rate": 0.0002767989884843527,
      "loss": 9.3307,
      "step": 9888,
      "throughput": 12413.508577803374
    },
    {
      "epoch": 0.15548711196796214,
      "grad_norm": 0.09929708391427994,
      "learning_rate": 0.0002766450896117837,
      "loss": 9.3416,
      "step": 9920,
      "throughput": 12413.928230848545
    },
    {
      "epoch": 0.15598868329689103,
      "grad_norm": 0.10370606929063797,
      "learning_rate": 0.0002764907303145342,
      "loss": 9.3548,
      "step": 9952,
      "throughput": 12414.38389159902
    },
    {
      "epoch": 0.15649025462581995,
      "grad_norm": 0.098115473985672,
      "learning_rate": 0.00027633591122918244,
      "loss": 9.3249,
      "step": 9984,
      "throughput": 12414.269640854896
    },
    {
      "epoch": 0.15699182595474886,
      "grad_norm": 0.11088874191045761,
      "learning_rate": 0.0002761806329942028,
      "loss": 9.3433,
      "step": 10016,
      "throughput": 12415.007216940608
    },
    {
      "epoch": 0.15749339728367778,
      "grad_norm": 0.09496035426855087,
      "learning_rate": 0.0002760248962499632,
      "loss": 9.3195,
      "step": 10048,
      "throughput": 12415.475000469298
    },
    {
      "epoch": 0.15799496861260667,
      "grad_norm": 0.1069183349609375,
      "learning_rate": 0.0002758687016387223,
      "loss": 9.3469,
      "step": 10080,
      "throughput": 12416.20782001634
    },
    {
      "epoch": 0.1584965399415356,
      "grad_norm": 0.10577341914176941,
      "learning_rate": 0.0002757120498046273,
      "loss": 9.3427,
      "step": 10112,
      "throughput": 12416.860885508157
    },
    {
      "epoch": 0.1589981112704645,
      "grad_norm": 0.11294721812009811,
      "learning_rate": 0.00027555494139371077,
      "loss": 9.3316,
      "step": 10144,
      "throughput": 12417.545013450359
    },
    {
      "epoch": 0.1594996825993934,
      "grad_norm": 0.10189155489206314,
      "learning_rate": 0.0002753973770538882,
      "loss": 9.3031,
      "step": 10176,
      "throughput": 12417.850483599095
    },
    {
      "epoch": 0.16000125392832232,
      "grad_norm": 0.10261227190494537,
      "learning_rate": 0.00027523935743495553,
      "loss": 9.2924,
      "step": 10208,
      "throughput": 12418.2050685847
    },
    {
      "epoch": 0.16050282525725124,
      "grad_norm": 0.11461346596479416,
      "learning_rate": 0.00027508088318858604,
      "loss": 9.3316,
      "step": 10240,
      "throughput": 12418.174694582778
    },
    {
      "epoch": 0.16100439658618015,
      "grad_norm": 0.10292674601078033,
      "learning_rate": 0.000274921954968328,
      "loss": 9.3192,
      "step": 10272,
      "throughput": 12416.573063562828
    },
    {
      "epoch": 0.16150596791510904,
      "grad_norm": 0.09348347783088684,
      "learning_rate": 0.0002747625734296019,
      "loss": 9.3187,
      "step": 10304,
      "throughput": 12416.975740478432
    },
    {
      "epoch": 0.16200753924403796,
      "grad_norm": 0.09840043634176254,
      "learning_rate": 0.00027460273922969757,
      "loss": 9.3455,
      "step": 10336,
      "throughput": 12417.054217394312
    },
    {
      "epoch": 0.16250911057296688,
      "grad_norm": 0.10484471917152405,
      "learning_rate": 0.0002744424530277719,
      "loss": 9.314,
      "step": 10368,
      "throughput": 12417.481807473956
    },
    {
      "epoch": 0.16301068190189577,
      "grad_norm": 0.09991072863340378,
      "learning_rate": 0.0002742817154848455,
      "loss": 9.3004,
      "step": 10400,
      "throughput": 12418.165294747014
    },
    {
      "epoch": 0.1635122532308247,
      "grad_norm": 0.10125566273927689,
      "learning_rate": 0.00027412052726380053,
      "loss": 9.3233,
      "step": 10432,
      "throughput": 12418.964350230903
    },
    {
      "epoch": 0.1640138245597536,
      "grad_norm": 0.09845706075429916,
      "learning_rate": 0.00027395888902937777,
      "loss": 9.3054,
      "step": 10464,
      "throughput": 12419.410394767716
    },
    {
      "epoch": 0.16451539588868253,
      "grad_norm": 0.09788595885038376,
      "learning_rate": 0.0002737968014481737,
      "loss": 9.3053,
      "step": 10496,
      "throughput": 12420.121766830405
    },
    {
      "epoch": 0.16501696721761142,
      "grad_norm": 0.10306710004806519,
      "learning_rate": 0.000273634265188638,
      "loss": 9.2992,
      "step": 10528,
      "throughput": 12420.205403133641
    },
    {
      "epoch": 0.16551853854654033,
      "grad_norm": 0.09703079611063004,
      "learning_rate": 0.0002734712809210706,
      "loss": 9.305,
      "step": 10560,
      "throughput": 12419.9598115143
    },
    {
      "epoch": 0.16602010987546925,
      "grad_norm": 0.09484507143497467,
      "learning_rate": 0.00027330784931761925,
      "loss": 9.2816,
      "step": 10592,
      "throughput": 12420.453727918748
    },
    {
      "epoch": 0.16652168120439814,
      "grad_norm": 0.10056675970554352,
      "learning_rate": 0.0002731439710522763,
      "loss": 9.2778,
      "step": 10624,
      "throughput": 12420.888924283547
    },
    {
      "epoch": 0.16702325253332706,
      "grad_norm": 0.11179500818252563,
      "learning_rate": 0.00027297964680087617,
      "loss": 9.2987,
      "step": 10656,
      "throughput": 12420.94204471036
    },
    {
      "epoch": 0.16752482386225598,
      "grad_norm": 0.0983198955655098,
      "learning_rate": 0.0002728148772410926,
      "loss": 9.306,
      "step": 10688,
      "throughput": 12421.60389960054
    },
    {
      "epoch": 0.1680263951911849,
      "grad_norm": 0.09311419725418091,
      "learning_rate": 0.0002726496630524358,
      "loss": 9.3227,
      "step": 10720,
      "throughput": 12422.161743529983
    },
    {
      "epoch": 0.1685279665201138,
      "grad_norm": 0.09853453934192657,
      "learning_rate": 0.00027248400491624946,
      "loss": 9.2681,
      "step": 10752,
      "throughput": 12422.810732317097
    },
    {
      "epoch": 0.1690295378490427,
      "grad_norm": 0.0933060273528099,
      "learning_rate": 0.00027231790351570827,
      "loss": 9.2915,
      "step": 10784,
      "throughput": 12423.395082393106
    },
    {
      "epoch": 0.16953110917797162,
      "grad_norm": 0.10200263559818268,
      "learning_rate": 0.00027215135953581485,
      "loss": 9.2958,
      "step": 10816,
      "throughput": 12423.998726700142
    },
    {
      "epoch": 0.1700326805069005,
      "grad_norm": 0.10002099722623825,
      "learning_rate": 0.00027198437366339717,
      "loss": 9.2777,
      "step": 10848,
      "throughput": 12424.287990361512
    },
    {
      "epoch": 0.17053425183582943,
      "grad_norm": 0.0953054279088974,
      "learning_rate": 0.00027181694658710544,
      "loss": 9.2702,
      "step": 10880,
      "throughput": 12423.905610047961
    },
    {
      "epoch": 0.17103582316475835,
      "grad_norm": 0.10247793793678284,
      "learning_rate": 0.00027164907899740936,
      "loss": 9.2791,
      "step": 10912,
      "throughput": 12424.41885749256
    },
    {
      "epoch": 0.17153739449368727,
      "grad_norm": 0.10717090964317322,
      "learning_rate": 0.0002714807715865954,
      "loss": 9.2969,
      "step": 10944,
      "throughput": 12424.8069628263
    },
    {
      "epoch": 0.17203896582261616,
      "grad_norm": 0.09374012798070908,
      "learning_rate": 0.0002713120250487638,
      "loss": 9.2734,
      "step": 10976,
      "throughput": 12424.6868270757
    },
    {
      "epoch": 0.17254053715154508,
      "grad_norm": 0.10755176097154617,
      "learning_rate": 0.0002711428400798258,
      "loss": 9.2656,
      "step": 11008,
      "throughput": 12425.323752751452
    },
    {
      "epoch": 0.173042108480474,
      "grad_norm": 0.10529007017612457,
      "learning_rate": 0.00027097321737750075,
      "loss": 9.2736,
      "step": 11040,
      "throughput": 12425.749055984017
    },
    {
      "epoch": 0.17354367980940288,
      "grad_norm": 0.09935403615236282,
      "learning_rate": 0.00027080315764131316,
      "loss": 9.2595,
      "step": 11072,
      "throughput": 12426.397228197524
    },
    {
      "epoch": 0.1740452511383318,
      "grad_norm": 0.09364161640405655,
      "learning_rate": 0.0002706326615725898,
      "loss": 9.2718,
      "step": 11104,
      "throughput": 12427.134187369495
    },
    {
      "epoch": 0.17454682246726072,
      "grad_norm": 0.10694800317287445,
      "learning_rate": 0.0002704617298744571,
      "loss": 9.2548,
      "step": 11136,
      "throughput": 12427.526446402135
    },
    {
      "epoch": 0.17504839379618964,
      "grad_norm": 0.09899157285690308,
      "learning_rate": 0.00027029036325183775,
      "loss": 9.2609,
      "step": 11168,
      "throughput": 12427.766299430596
    },
    {
      "epoch": 0.17554996512511853,
      "grad_norm": 0.09556271135807037,
      "learning_rate": 0.0002701185624114483,
      "loss": 9.2855,
      "step": 11200,
      "throughput": 12427.589754597111
    },
    {
      "epoch": 0.17605153645404745,
      "grad_norm": 0.09693208336830139,
      "learning_rate": 0.0002699463280617959,
      "loss": 9.2851,
      "step": 11232,
      "throughput": 12427.899615353563
    },
    {
      "epoch": 0.17655310778297637,
      "grad_norm": 0.10468527674674988,
      "learning_rate": 0.00026977366091317554,
      "loss": 9.251,
      "step": 11264,
      "throughput": 12428.405877491856
    },
    {
      "epoch": 0.17705467911190526,
      "grad_norm": 0.09517171233892441,
      "learning_rate": 0.00026960056167766704,
      "loss": 9.261,
      "step": 11296,
      "throughput": 12428.646470904001
    },
    {
      "epoch": 0.17755625044083417,
      "grad_norm": 0.10003001242876053,
      "learning_rate": 0.0002694270310691321,
      "loss": 9.2499,
      "step": 11328,
      "throughput": 12428.8394563157
    },
    {
      "epoch": 0.1780578217697631,
      "grad_norm": 0.09825020283460617,
      "learning_rate": 0.0002692530698032116,
      "loss": 9.2489,
      "step": 11360,
      "throughput": 12429.263224231341
    },
    {
      "epoch": 0.178559393098692,
      "grad_norm": 0.09543386101722717,
      "learning_rate": 0.00026907867859732223,
      "loss": 9.2502,
      "step": 11392,
      "throughput": 12429.864680876386
    },
    {
      "epoch": 0.1790609644276209,
      "grad_norm": 0.08976162225008011,
      "learning_rate": 0.0002689038581706538,
      "loss": 9.2548,
      "step": 11424,
      "throughput": 12430.568483071242
    },
    {
      "epoch": 0.17956253575654982,
      "grad_norm": 0.09635276347398758,
      "learning_rate": 0.0002687286092441664,
      "loss": 9.2324,
      "step": 11456,
      "throughput": 12430.97783219846
    },
    {
      "epoch": 0.18006410708547874,
      "grad_norm": 0.09948880970478058,
      "learning_rate": 0.00026855293254058693,
      "loss": 9.247,
      "step": 11488,
      "throughput": 12431.419926216979
    },
    {
      "epoch": 0.18056567841440763,
      "grad_norm": 0.09892695397138596,
      "learning_rate": 0.0002683768287844068,
      "loss": 9.2428,
      "step": 11520,
      "throughput": 12431.406197621318
    },
    {
      "epoch": 0.18106724974333654,
      "grad_norm": 0.08800537884235382,
      "learning_rate": 0.0002682002987018783,
      "loss": 9.2631,
      "step": 11552,
      "throughput": 12431.331997975556
    },
    {
      "epoch": 0.18156882107226546,
      "grad_norm": 0.09760425984859467,
      "learning_rate": 0.00026802334302101214,
      "loss": 9.2529,
      "step": 11584,
      "throughput": 12431.785048037085
    },
    {
      "epoch": 0.18207039240119435,
      "grad_norm": 0.09639564901590347,
      "learning_rate": 0.000267845962471574,
      "loss": 9.2467,
      "step": 11616,
      "throughput": 12432.096365632751
    },
    {
      "epoch": 0.18257196373012327,
      "grad_norm": 0.09691153466701508,
      "learning_rate": 0.0002676681577850818,
      "loss": 9.2362,
      "step": 11648,
      "throughput": 12432.214956023765
    },
    {
      "epoch": 0.1830735350590522,
      "grad_norm": 0.10566407442092896,
      "learning_rate": 0.0002674899296948026,
      "loss": 9.2356,
      "step": 11680,
      "throughput": 12432.750941160994
    },
    {
      "epoch": 0.1835751063879811,
      "grad_norm": 0.10178868472576141,
      "learning_rate": 0.00026731127893574955,
      "loss": 9.2464,
      "step": 11712,
      "throughput": 12433.210388907002
    },
    {
      "epoch": 0.18407667771691,
      "grad_norm": 0.09865662455558777,
      "learning_rate": 0.00026713220624467894,
      "loss": 9.258,
      "step": 11744,
      "throughput": 12433.810090653798
    },
    {
      "epoch": 0.18457824904583892,
      "grad_norm": 0.09568797051906586,
      "learning_rate": 0.00026695271236008703,
      "loss": 9.2356,
      "step": 11776,
      "throughput": 12434.359978227534
    },
    {
      "epoch": 0.18507982037476783,
      "grad_norm": 0.10666729509830475,
      "learning_rate": 0.00026677279802220726,
      "loss": 9.2451,
      "step": 11808,
      "throughput": 12434.619542280407
    },
    {
      "epoch": 0.18558139170369672,
      "grad_norm": 0.1080276370048523,
      "learning_rate": 0.00026659246397300673,
      "loss": 9.2341,
      "step": 11840,
      "throughput": 12434.90427034286
    },
    {
      "epoch": 0.18608296303262564,
      "grad_norm": 0.08994440734386444,
      "learning_rate": 0.00026641171095618366,
      "loss": 9.2243,
      "step": 11872,
      "throughput": 12434.770224584045
    },
    {
      "epoch": 0.18658453436155456,
      "grad_norm": 0.0883726105093956,
      "learning_rate": 0.0002662305397171641,
      "loss": 9.2367,
      "step": 11904,
      "throughput": 12435.450539027144
    },
    {
      "epoch": 0.18708610569048348,
      "grad_norm": 0.10415124893188477,
      "learning_rate": 0.0002660489510030986,
      "loss": 9.2432,
      "step": 11936,
      "throughput": 12435.823666373835
    },
    {
      "epoch": 0.18758767701941237,
      "grad_norm": 0.08748330175876617,
      "learning_rate": 0.00026586694556285975,
      "loss": 9.2405,
      "step": 11968,
      "throughput": 12435.880263397072
    },
    {
      "epoch": 0.1880892483483413,
      "grad_norm": 0.09719226509332657,
      "learning_rate": 0.0002656845241470384,
      "loss": 9.2218,
      "step": 12000,
      "throughput": 12436.268853279063
    },
    {
      "epoch": 0.1885908196772702,
      "grad_norm": 0.09952506422996521,
      "learning_rate": 0.0002655016875079411,
      "loss": 9.2282,
      "step": 12032,
      "throughput": 12436.61441153604
    },
    {
      "epoch": 0.1890923910061991,
      "grad_norm": 0.09944985061883926,
      "learning_rate": 0.00026531843639958656,
      "loss": 9.2178,
      "step": 12064,
      "throughput": 12437.222547862855
    },
    {
      "epoch": 0.189593962335128,
      "grad_norm": 0.09388846158981323,
      "learning_rate": 0.00026513477157770303,
      "loss": 9.2241,
      "step": 12096,
      "throughput": 12437.750289101668
    },
    {
      "epoch": 0.19009553366405693,
      "grad_norm": 0.09762909263372421,
      "learning_rate": 0.0002649506937997248,
      "loss": 9.2254,
      "step": 12128,
      "throughput": 12438.002763734487
    },
    {
      "epoch": 0.19059710499298585,
      "grad_norm": 0.09256457537412643,
      "learning_rate": 0.00026476620382478896,
      "loss": 9.2364,
      "step": 12160,
      "throughput": 12438.39101625529
    },
    {
      "epoch": 0.19109867632191474,
      "grad_norm": 0.09618276357650757,
      "learning_rate": 0.0002645813024137329,
      "loss": 9.2358,
      "step": 12192,
      "throughput": 12438.071607881528
    },
    {
      "epoch": 0.19160024765084366,
      "grad_norm": 0.09417402744293213,
      "learning_rate": 0.00026439599032909055,
      "loss": 9.235,
      "step": 12224,
      "throughput": 12438.644950872664
    },
    {
      "epoch": 0.19210181897977258,
      "grad_norm": 0.09499766677618027,
      "learning_rate": 0.0002642102683350894,
      "loss": 9.2321,
      "step": 12256,
      "throughput": 12439.038275554216
    },
    {
      "epoch": 0.19260339030870147,
      "grad_norm": 0.1137000098824501,
      "learning_rate": 0.00026402413719764774,
      "loss": 9.2159,
      "step": 12288,
      "throughput": 12439.351334000727
    },
    {
      "epoch": 0.19310496163763038,
      "grad_norm": 0.11453639715909958,
      "learning_rate": 0.0002638375976843707,
      "loss": 9.2178,
      "step": 12320,
      "throughput": 12437.731017940176
    },
    {
      "epoch": 0.1936065329665593,
      "grad_norm": 0.09753817319869995,
      "learning_rate": 0.0002636506505645478,
      "loss": 9.2044,
      "step": 12352,
      "throughput": 12438.039993313554
    },
    {
      "epoch": 0.19410810429548822,
      "grad_norm": 0.09776368737220764,
      "learning_rate": 0.00026346329660914964,
      "loss": 9.2191,
      "step": 12384,
      "throughput": 12438.682273817161
    },
    {
      "epoch": 0.1946096756244171,
      "grad_norm": 0.08489309996366501,
      "learning_rate": 0.00026327553659082444,
      "loss": 9.2244,
      "step": 12416,
      "throughput": 12439.226287196929
    },
    {
      "epoch": 0.19511124695334603,
      "grad_norm": 0.08857341855764389,
      "learning_rate": 0.00026308737128389513,
      "loss": 9.196,
      "step": 12448,
      "throughput": 12439.724146748655
    },
    {
      "epoch": 0.19561281828227495,
      "grad_norm": 0.08907376229763031,
      "learning_rate": 0.0002628988014643558,
      "loss": 9.2243,
      "step": 12480,
      "throughput": 12439.667975570137
    },
    {
      "epoch": 0.19611438961120384,
      "grad_norm": 0.09848618507385254,
      "learning_rate": 0.00026270982790986916,
      "loss": 9.2228,
      "step": 12512,
      "throughput": 12439.53419868086
    },
    {
      "epoch": 0.19661596094013276,
      "grad_norm": 0.09671392291784286,
      "learning_rate": 0.00026252045139976254,
      "loss": 9.2039,
      "step": 12544,
      "throughput": 12439.947746294905
    },
    {
      "epoch": 0.19711753226906167,
      "grad_norm": 0.10359849035739899,
      "learning_rate": 0.00026233067271502536,
      "loss": 9.1828,
      "step": 12576,
      "throughput": 12440.319779950294
    },
    {
      "epoch": 0.1976191035979906,
      "grad_norm": 0.09061454981565475,
      "learning_rate": 0.0002621404926383054,
      "loss": 9.2126,
      "step": 12608,
      "throughput": 12440.6969953609
    },
    {
      "epoch": 0.19812067492691948,
      "grad_norm": 0.09022842347621918,
      "learning_rate": 0.0002619499119539059,
      "loss": 9.1902,
      "step": 12640,
      "throughput": 12440.700680783257
    },
    {
      "epoch": 0.1986222462558484,
      "grad_norm": 0.0907374769449234,
      "learning_rate": 0.0002617589314477821,
      "loss": 9.1781,
      "step": 12672,
      "throughput": 12441.124311331203
    },
    {
      "epoch": 0.19912381758477732,
      "grad_norm": 0.09435463696718216,
      "learning_rate": 0.0002615675519075383,
      "loss": 9.1994,
      "step": 12704,
      "throughput": 12441.538612785944
    },
    {
      "epoch": 0.1996253889137062,
      "grad_norm": 0.09547092020511627,
      "learning_rate": 0.00026137577412242415,
      "loss": 9.1838,
      "step": 12736,
      "throughput": 12442.053947258997
    },
    {
      "epoch": 0.20012696024263513,
      "grad_norm": 0.09545017033815384,
      "learning_rate": 0.00026118359888333193,
      "loss": 9.1638,
      "step": 12768,
      "throughput": 12442.52808605685
    },
    {
      "epoch": 0.20062853157156404,
      "grad_norm": 0.10029665380716324,
      "learning_rate": 0.00026099102698279276,
      "loss": 9.1894,
      "step": 12800,
      "throughput": 12442.81989626998
    },
    {
      "epoch": 0.20113010290049296,
      "grad_norm": 0.0926876813173294,
      "learning_rate": 0.0002607980592149739,
      "loss": 9.1888,
      "step": 12832,
      "throughput": 12442.680258798211
    },
    {
      "epoch": 0.20163167422942185,
      "grad_norm": 0.08556065708398819,
      "learning_rate": 0.00026060469637567484,
      "loss": 9.1988,
      "step": 12864,
      "throughput": 12442.735682073666
    },
    {
      "epoch": 0.20213324555835077,
      "grad_norm": 0.10560698062181473,
      "learning_rate": 0.0002604109392623246,
      "loss": 9.2117,
      "step": 12896,
      "throughput": 12443.310352249038
    },
    {
      "epoch": 0.2026348168872797,
      "grad_norm": 0.09636469930410385,
      "learning_rate": 0.00026021678867397803,
      "loss": 9.1932,
      "step": 12928,
      "throughput": 12443.6837281359
    },
    {
      "epoch": 0.20313638821620858,
      "grad_norm": 0.0874796211719513,
      "learning_rate": 0.00026002224541131274,
      "loss": 9.1755,
      "step": 12960,
      "throughput": 12443.673899619098
    },
    {
      "epoch": 0.2036379595451375,
      "grad_norm": 0.09777245670557022,
      "learning_rate": 0.00025982731027662575,
      "loss": 9.1858,
      "step": 12992,
      "throughput": 12444.038423715197
    },
    {
      "epoch": 0.20413953087406642,
      "grad_norm": 0.08904954791069031,
      "learning_rate": 0.00025963198407383015,
      "loss": 9.1974,
      "step": 13024,
      "throughput": 12444.35839014486
    },
    {
      "epoch": 0.20464110220299533,
      "grad_norm": 0.09526413679122925,
      "learning_rate": 0.0002594362676084517,
      "loss": 9.1799,
      "step": 13056,
      "throughput": 12444.857251219728
    },
    {
      "epoch": 0.20514267353192422,
      "grad_norm": 0.08287783712148666,
      "learning_rate": 0.0002592401616876258,
      "loss": 9.183,
      "step": 13088,
      "throughput": 12445.341489344219
    },
    {
      "epoch": 0.20564424486085314,
      "grad_norm": 0.09923765808343887,
      "learning_rate": 0.00025904366712009374,
      "loss": 9.1939,
      "step": 13120,
      "throughput": 12445.612606772658
    },
    {
      "epoch": 0.20614581618978206,
      "grad_norm": 0.08840049058198929,
      "learning_rate": 0.00025884678471619976,
      "loss": 9.1789,
      "step": 13152,
      "throughput": 12445.759194472117
    },
    {
      "epoch": 0.20664738751871095,
      "grad_norm": 0.08951904624700546,
      "learning_rate": 0.0002586495152878874,
      "loss": 9.172,
      "step": 13184,
      "throughput": 12445.513086078448
    },
    {
      "epoch": 0.20714895884763987,
      "grad_norm": 0.09838932752609253,
      "learning_rate": 0.0002584518596486965,
      "loss": 9.1754,
      "step": 13216,
      "throughput": 12446.079319557672
    },
    {
      "epoch": 0.2076505301765688,
      "grad_norm": 0.09289150685071945,
      "learning_rate": 0.00025825381861375936,
      "loss": 9.1846,
      "step": 13248,
      "throughput": 12446.36823530712
    },
    {
      "epoch": 0.2081521015054977,
      "grad_norm": 0.09429288655519485,
      "learning_rate": 0.00025805539299979794,
      "loss": 9.1853,
      "step": 13280,
      "throughput": 12446.710311829484
    },
    {
      "epoch": 0.2086536728344266,
      "grad_norm": 0.09022123366594315,
      "learning_rate": 0.0002578565836251199,
      "loss": 9.176,
      "step": 13312,
      "throughput": 12446.692568537916
    },
    {
      "epoch": 0.2091552441633555,
      "grad_norm": 0.09260479360818863,
      "learning_rate": 0.0002576573913096158,
      "loss": 9.18,
      "step": 13344,
      "throughput": 12447.103754047945
    },
    {
      "epoch": 0.20965681549228443,
      "grad_norm": 0.09113609790802002,
      "learning_rate": 0.00025745781687475534,
      "loss": 9.1733,
      "step": 13376,
      "throughput": 12447.567055655258
    },
    {
      "epoch": 0.21015838682121332,
      "grad_norm": 0.10043779760599136,
      "learning_rate": 0.000257257861143584,
      "loss": 9.1684,
      "step": 13408,
      "throughput": 12447.914304533839
    },
    {
      "epoch": 0.21065995815014224,
      "grad_norm": 0.09665901213884354,
      "learning_rate": 0.00025705752494071995,
      "loss": 9.1682,
      "step": 13440,
      "throughput": 12448.32566896259
    },
    {
      "epoch": 0.21116152947907116,
      "grad_norm": 0.09273158758878708,
      "learning_rate": 0.0002568568090923501,
      "loss": 9.1623,
      "step": 13472,
      "throughput": 12448.45948722694
    },
    {
      "epoch": 0.21166310080800008,
      "grad_norm": 0.08880336582660675,
      "learning_rate": 0.0002566557144262273,
      "loss": 9.1719,
      "step": 13504,
      "throughput": 12448.16848694662
    },
    {
      "epoch": 0.21216467213692897,
      "grad_norm": 0.106996551156044,
      "learning_rate": 0.00025645424177166663,
      "loss": 9.1783,
      "step": 13536,
      "throughput": 12448.623930365138
    },
    {
      "epoch": 0.21266624346585788,
      "grad_norm": 0.08988650143146515,
      "learning_rate": 0.0002562523919595418,
      "loss": 9.1667,
      "step": 13568,
      "throughput": 12448.980666446922
    },
    {
      "epoch": 0.2131678147947868,
      "grad_norm": 0.08989129215478897,
      "learning_rate": 0.0002560501658222821,
      "loss": 9.1502,
      "step": 13600,
      "throughput": 12449.314149661397
    },
    {
      "epoch": 0.2136693861237157,
      "grad_norm": 0.09585653990507126,
      "learning_rate": 0.0002558475641938686,
      "loss": 9.1551,
      "step": 13632,
      "throughput": 12449.311532938784
    },
    {
      "epoch": 0.2141709574526446,
      "grad_norm": 0.09390981495380402,
      "learning_rate": 0.00025564458790983114,
      "loss": 9.1736,
      "step": 13664,
      "throughput": 12449.744463766221
    },
    {
      "epoch": 0.21467252878157353,
      "grad_norm": 0.09790906310081482,
      "learning_rate": 0.0002554412378072445,
      "loss": 9.1576,
      "step": 13696,
      "throughput": 12450.113208718572
    },
    {
      "epoch": 0.21517410011050242,
      "grad_norm": 0.09180324524641037,
      "learning_rate": 0.0002552375147247251,
      "loss": 9.1495,
      "step": 13728,
      "throughput": 12450.45926139864
    },
    {
      "epoch": 0.21567567143943134,
      "grad_norm": 0.09764540195465088,
      "learning_rate": 0.0002550334195024275,
      "loss": 9.1521,
      "step": 13760,
      "throughput": 12450.848037094125
    },
    {
      "epoch": 0.21617724276836026,
      "grad_norm": 0.09107381105422974,
      "learning_rate": 0.00025482895298204096,
      "loss": 9.1481,
      "step": 13792,
      "throughput": 12451.089328643548
    },
    {
      "epoch": 0.21667881409728917,
      "grad_norm": 0.09703461825847626,
      "learning_rate": 0.0002546241160067861,
      "loss": 9.1497,
      "step": 13824,
      "throughput": 12450.94018235877
    },
    {
      "epoch": 0.21718038542621806,
      "grad_norm": 0.08792301267385483,
      "learning_rate": 0.00025441890942141124,
      "loss": 9.1662,
      "step": 13856,
      "throughput": 12451.16430652669
    },
    {
      "epoch": 0.21768195675514698,
      "grad_norm": 0.09113834798336029,
      "learning_rate": 0.00025421333407218884,
      "loss": 9.1619,
      "step": 13888,
      "throughput": 12451.63478506473
    },
    {
      "epoch": 0.2181835280840759,
      "grad_norm": 0.08990354835987091,
      "learning_rate": 0.0002540073908069124,
      "loss": 9.1508,
      "step": 13920,
      "throughput": 12451.925579345352
    },
    {
      "epoch": 0.2186850994130048,
      "grad_norm": 0.09492233395576477,
      "learning_rate": 0.0002538010804748924,
      "loss": 9.1105,
      "step": 13952,
      "throughput": 12451.864245608442
    },
    {
      "epoch": 0.2191866707419337,
      "grad_norm": 0.08453084528446198,
      "learning_rate": 0.0002535944039269533,
      "loss": 9.1589,
      "step": 13984,
      "throughput": 12452.173408506707
    },
    {
      "epoch": 0.21968824207086263,
      "grad_norm": 0.08925742655992508,
      "learning_rate": 0.0002533873620154299,
      "loss": 9.1472,
      "step": 14016,
      "throughput": 12452.429342118676
    },
    {
      "epoch": 0.22018981339979155,
      "grad_norm": 0.09351540356874466,
      "learning_rate": 0.0002531799555941635,
      "loss": 9.1445,
      "step": 14048,
      "throughput": 12452.87988335345
    },
    {
      "epoch": 0.22069138472872044,
      "grad_norm": 0.09525007754564285,
      "learning_rate": 0.00025297218551849885,
      "loss": 9.122,
      "step": 14080,
      "throughput": 12453.234261015668
    },
    {
      "epoch": 0.22119295605764935,
      "grad_norm": 0.0959862768650055,
      "learning_rate": 0.00025276405264528044,
      "loss": 9.1476,
      "step": 14112,
      "throughput": 12453.426942170903
    },
    {
      "epoch": 0.22169452738657827,
      "grad_norm": 0.09031637012958527,
      "learning_rate": 0.00025255555783284877,
      "loss": 9.143,
      "step": 14144,
      "throughput": 12453.448582132878
    },
    {
      "epoch": 0.22219609871550716,
      "grad_norm": 0.09333668649196625,
      "learning_rate": 0.0002523467019410371,
      "loss": 9.1443,
      "step": 14176,
      "throughput": 12453.532394747865
    },
    {
      "epoch": 0.22269767004443608,
      "grad_norm": 0.09974166750907898,
      "learning_rate": 0.00025213748583116776,
      "loss": 9.1576,
      "step": 14208,
      "throughput": 12454.061305626105
    },
    {
      "epoch": 0.223199241373365,
      "grad_norm": 0.08883793652057648,
      "learning_rate": 0.0002519279103660486,
      "loss": 9.1129,
      "step": 14240,
      "throughput": 12454.288113899367
    },
    {
      "epoch": 0.22370081270229392,
      "grad_norm": 0.09600594639778137,
      "learning_rate": 0.0002517179764099694,
      "loss": 9.1099,
      "step": 14272,
      "throughput": 12454.587461563679
    },
    {
      "epoch": 0.2242023840312228,
      "grad_norm": 0.09509039670228958,
      "learning_rate": 0.00025150768482869846,
      "loss": 9.1359,
      "step": 14304,
      "throughput": 12454.547506360477
    },
    {
      "epoch": 0.22470395536015172,
      "grad_norm": 0.09394937753677368,
      "learning_rate": 0.0002512970364894789,
      "loss": 9.1322,
      "step": 14336,
      "throughput": 12454.915646449184
    },
    {
      "epoch": 0.22520552668908064,
      "grad_norm": 0.09316191077232361,
      "learning_rate": 0.00025108603226102515,
      "loss": 9.1384,
      "step": 14368,
      "throughput": 12453.864038449128
    },
    {
      "epoch": 0.22570709801800953,
      "grad_norm": 0.08423268049955368,
      "learning_rate": 0.0002508746730135191,
      "loss": 9.1312,
      "step": 14400,
      "throughput": 12454.192454638365
    },
    {
      "epoch": 0.22620866934693845,
      "grad_norm": 0.09922663122415543,
      "learning_rate": 0.00025066295961860704,
      "loss": 9.1184,
      "step": 14432,
      "throughput": 12454.558520463599
    },
    {
      "epoch": 0.22671024067586737,
      "grad_norm": 0.09510110318660736,
      "learning_rate": 0.0002504508929493957,
      "loss": 9.126,
      "step": 14464,
      "throughput": 12454.530903811134
    },
    {
      "epoch": 0.2272118120047963,
      "grad_norm": 0.09779904037714005,
      "learning_rate": 0.00025023847388044846,
      "loss": 9.1127,
      "step": 14496,
      "throughput": 12454.323381332644
    },
    {
      "epoch": 0.22771338333372518,
      "grad_norm": 0.09605950117111206,
      "learning_rate": 0.0002500257032877823,
      "loss": 9.1277,
      "step": 14528,
      "throughput": 12454.842419857887
    },
    {
      "epoch": 0.2282149546626541,
      "grad_norm": 0.09250783175230026,
      "learning_rate": 0.0002498125820488639,
      "loss": 9.1286,
      "step": 14560,
      "throughput": 12455.171467399261
    },
    {
      "epoch": 0.22871652599158301,
      "grad_norm": 0.08579394966363907,
      "learning_rate": 0.00024959911104260565,
      "loss": 9.1233,
      "step": 14592,
      "throughput": 12455.492864059066
    },
    {
      "epoch": 0.2292180973205119,
      "grad_norm": 0.0896739736199379,
      "learning_rate": 0.00024938529114936273,
      "loss": 9.1357,
      "step": 14624,
      "throughput": 12455.516121629098
    },
    {
      "epoch": 0.22971966864944082,
      "grad_norm": 0.09005829691886902,
      "learning_rate": 0.000249171123250929,
      "loss": 9.1294,
      "step": 14656,
      "throughput": 12455.94223097607
    },
    {
      "epoch": 0.23022123997836974,
      "grad_norm": 0.09095371514558792,
      "learning_rate": 0.00024895660823053353,
      "loss": 9.1181,
      "step": 14688,
      "throughput": 12456.204985354474
    },
    {
      "epoch": 0.23072281130729866,
      "grad_norm": 0.08464957028627396,
      "learning_rate": 0.00024874174697283685,
      "loss": 9.1398,
      "step": 14720,
      "throughput": 12456.506742235348
    },
    {
      "epoch": 0.23122438263622755,
      "grad_norm": 0.08647891134023666,
      "learning_rate": 0.0002485265403639275,
      "loss": 9.1181,
      "step": 14752,
      "throughput": 12456.886134556755
    },
    {
      "epoch": 0.23172595396515647,
      "grad_norm": 0.0868907943367958,
      "learning_rate": 0.0002483109892913181,
      "loss": 9.1373,
      "step": 14784,
      "throughput": 12456.926867132648
    },
    {
      "epoch": 0.23222752529408539,
      "grad_norm": 0.09411929547786713,
      "learning_rate": 0.0002480950946439419,
      "loss": 9.1187,
      "step": 14816,
      "throughput": 12456.728607751143
    },
    {
      "epoch": 0.23272909662301428,
      "grad_norm": 0.0950883999466896,
      "learning_rate": 0.0002478788573121491,
      "loss": 9.0984,
      "step": 14848,
      "throughput": 12457.22976486714
    },
    {
      "epoch": 0.2332306679519432,
      "grad_norm": 0.09563660621643066,
      "learning_rate": 0.0002476622781877031,
      "loss": 9.0984,
      "step": 14880,
      "throughput": 12457.659338375488
    },
    {
      "epoch": 0.2337322392808721,
      "grad_norm": 0.08693066984415054,
      "learning_rate": 0.0002474453581637769,
      "loss": 9.114,
      "step": 14912,
      "throughput": 12457.949642433958
    },
    {
      "epoch": 0.23423381060980103,
      "grad_norm": 0.08431732654571533,
      "learning_rate": 0.00024722809813494933,
      "loss": 9.116,
      "step": 14944,
      "throughput": 12458.053641101553
    },
    {
      "epoch": 0.23473538193872992,
      "grad_norm": 0.09093215316534042,
      "learning_rate": 0.00024701049899720123,
      "loss": 9.1089,
      "step": 14976,
      "throughput": 12458.21994265974
    },
    {
      "epoch": 0.23523695326765884,
      "grad_norm": 0.0873025506734848,
      "learning_rate": 0.0002467925616479122,
      "loss": 9.1335,
      "step": 15008,
      "throughput": 12458.469333257306
    },
    {
      "epoch": 0.23573852459658776,
      "grad_norm": 0.09324125945568085,
      "learning_rate": 0.0002465742869858566,
      "loss": 9.1183,
      "step": 15040,
      "throughput": 12458.901220612812
    },
    {
      "epoch": 0.23624009592551665,
      "grad_norm": 0.08744019269943237,
      "learning_rate": 0.0002463556759111996,
      "loss": 9.0946,
      "step": 15072,
      "throughput": 12459.200893969739
    },
    {
      "epoch": 0.23674166725444556,
      "grad_norm": 0.09340982139110565,
      "learning_rate": 0.00024613672932549403,
      "loss": 9.1079,
      "step": 15104,
      "throughput": 12459.360936978843
    },
    {
      "epoch": 0.23724323858337448,
      "grad_norm": 0.10181102901697159,
      "learning_rate": 0.00024591744813167625,
      "loss": 9.0957,
      "step": 15136,
      "throughput": 12458.99247469336
    },
    {
      "epoch": 0.2377448099123034,
      "grad_norm": 0.09072301536798477,
      "learning_rate": 0.00024569783323406255,
      "loss": 9.0942,
      "step": 15168,
      "throughput": 12459.357491939969
    },
    {
      "epoch": 0.2382463812412323,
      "grad_norm": 0.09112021327018738,
      "learning_rate": 0.00024547788553834536,
      "loss": 9.1048,
      "step": 15200,
      "throughput": 12459.768172440561
    },
    {
      "epoch": 0.2387479525701612,
      "grad_norm": 0.10005882382392883,
      "learning_rate": 0.00024525760595158977,
      "loss": 9.1169,
      "step": 15232,
      "throughput": 12460.056387703413
    },
    {
      "epoch": 0.23924952389909013,
      "grad_norm": 0.08876467496156693,
      "learning_rate": 0.0002450369953822293,
      "loss": 9.1045,
      "step": 15264,
      "throughput": 12460.229257340188
    },
    {
      "epoch": 0.23975109522801902,
      "grad_norm": 0.08295506238937378,
      "learning_rate": 0.0002448160547400627,
      "loss": 9.0954,
      "step": 15296,
      "throughput": 12460.40222242608
    },
    {
      "epoch": 0.24025266655694794,
      "grad_norm": 0.0876840129494667,
      "learning_rate": 0.00024459478493624973,
      "loss": 9.0972,
      "step": 15328,
      "throughput": 12460.757852817978
    },
    {
      "epoch": 0.24075423788587685,
      "grad_norm": 0.08689826726913452,
      "learning_rate": 0.0002443731868833078,
      "loss": 9.0995,
      "step": 15360,
      "throughput": 12461.081935768496
    },
    {
      "epoch": 0.24125580921480577,
      "grad_norm": 0.09742100536823273,
      "learning_rate": 0.0002441512614951079,
      "loss": 9.0689,
      "step": 15392,
      "throughput": 12461.290076223115
    },
    {
      "epoch": 0.24175738054373466,
      "grad_norm": 0.0885290801525116,
      "learning_rate": 0.00024392900968687103,
      "loss": 9.1068,
      "step": 15424,
      "throughput": 12461.528644102316
    },
    {
      "epoch": 0.24225895187266358,
      "grad_norm": 0.08928472548723221,
      "learning_rate": 0.00024370643237516426,
      "loss": 9.0912,
      "step": 15456,
      "throughput": 12461.374216765722
    },
    {
      "epoch": 0.2427605232015925,
      "grad_norm": 0.08863835781812668,
      "learning_rate": 0.00024348353047789708,
      "loss": 9.1112,
      "step": 15488,
      "throughput": 12461.407715970065
    },
    {
      "epoch": 0.2432620945305214,
      "grad_norm": 0.09474772959947586,
      "learning_rate": 0.0002432603049143176,
      "loss": 9.1056,
      "step": 15520,
      "throughput": 12461.8778387018
    },
    {
      "epoch": 0.2437636658594503,
      "grad_norm": 0.09183931350708008,
      "learning_rate": 0.0002430367566050087,
      "loss": 9.0976,
      "step": 15552,
      "throughput": 12462.158419903451
    },
    {
      "epoch": 0.24426523718837923,
      "grad_norm": 0.08852574229240417,
      "learning_rate": 0.00024281288647188425,
      "loss": 9.083,
      "step": 15584,
      "throughput": 12462.339616265517
    },
    {
      "epoch": 0.24476680851730814,
      "grad_norm": 0.08312542736530304,
      "learning_rate": 0.00024258869543818535,
      "loss": 9.0648,
      "step": 15616,
      "throughput": 12462.38442635059
    },
    {
      "epoch": 0.24526837984623703,
      "grad_norm": 0.08816740661859512,
      "learning_rate": 0.00024236418442847652,
      "loss": 9.1116,
      "step": 15648,
      "throughput": 12462.802347087081
    },
    {
      "epoch": 0.24576995117516595,
      "grad_norm": 0.08942883461713791,
      "learning_rate": 0.0002421393543686418,
      "loss": 9.1015,
      "step": 15680,
      "throughput": 12463.01906254443
    },
    {
      "epoch": 0.24627152250409487,
      "grad_norm": 0.09535627067089081,
      "learning_rate": 0.00024191420618588103,
      "loss": 9.0871,
      "step": 15712,
      "throughput": 12463.308121069998
    },
    {
      "epoch": 0.24677309383302376,
      "grad_norm": 0.08258014917373657,
      "learning_rate": 0.000241688740808706,
      "loss": 9.0858,
      "step": 15744,
      "throughput": 12463.654375533093
    },
    {
      "epoch": 0.24727466516195268,
      "grad_norm": 0.09317824989557266,
      "learning_rate": 0.0002414629591669366,
      "loss": 9.0855,
      "step": 15776,
      "throughput": 12463.483273297135
    },
    {
      "epoch": 0.2477762364908816,
      "grad_norm": 0.09186193346977234,
      "learning_rate": 0.0002412368621916969,
      "loss": 9.0853,
      "step": 15808,
      "throughput": 12463.471119369466
    },
    {
      "epoch": 0.2482778078198105,
      "grad_norm": 0.08628609776496887,
      "learning_rate": 0.0002410104508154116,
      "loss": 9.0883,
      "step": 15840,
      "throughput": 12463.925664207842
    },
    {
      "epoch": 0.2487793791487394,
      "grad_norm": 0.08932186663150787,
      "learning_rate": 0.00024078372597180183,
      "loss": 9.0832,
      "step": 15872,
      "throughput": 12464.291830861448
    },
    {
      "epoch": 0.24928095047766832,
      "grad_norm": 0.09460990875959396,
      "learning_rate": 0.00024055668859588157,
      "loss": 9.0645,
      "step": 15904,
      "throughput": 12464.580911922092
    },
    {
      "epoch": 0.24978252180659724,
      "grad_norm": 0.0907793715596199,
      "learning_rate": 0.0002403293396239536,
      "loss": 9.076,
      "step": 15936,
      "throughput": 12464.736545075013
    },
    {
      "epoch": 0.25028409313552613,
      "grad_norm": 0.09719564020633698,
      "learning_rate": 0.00024010167999360575,
      "loss": 9.0965,
      "step": 15968,
      "throughput": 12464.852240415541
    },
    {
      "epoch": 0.25078566446445505,
      "grad_norm": 0.0805700495839119,
      "learning_rate": 0.00023987371064370698,
      "loss": 9.0711,
      "step": 16000,
      "throughput": 12465.15117822591
    },
    {
      "epoch": 0.25128723579338397,
      "grad_norm": 0.09074220061302185,
      "learning_rate": 0.00023964543251440363,
      "loss": 9.0491,
      "step": 16032,
      "throughput": 12465.322025815385
    },
    {
      "epoch": 0.2517888071223129,
      "grad_norm": 0.09387928247451782,
      "learning_rate": 0.00023941684654711534,
      "loss": 9.0807,
      "step": 16064,
      "throughput": 12465.66815792879
    },
    {
      "epoch": 0.2522903784512418,
      "grad_norm": 0.08193206787109375,
      "learning_rate": 0.0002391879536845313,
      "loss": 9.0775,
      "step": 16096,
      "throughput": 12465.936712337047
    },
    {
      "epoch": 0.25279194978017067,
      "grad_norm": 0.08214768022298813,
      "learning_rate": 0.0002389587548706064,
      "loss": 9.062,
      "step": 16128,
      "throughput": 12465.487703177709
    },
    {
      "epoch": 0.2532935211090996,
      "grad_norm": 0.08649898320436478,
      "learning_rate": 0.0002387292510505572,
      "loss": 9.0575,
      "step": 16160,
      "throughput": 12465.917880215575
    },
    {
      "epoch": 0.2537950924380285,
      "grad_norm": 0.09687966853380203,
      "learning_rate": 0.00023849944317085812,
      "loss": 9.0867,
      "step": 16192,
      "throughput": 12466.294707813418
    },
    {
      "epoch": 0.2542966637669574,
      "grad_norm": 0.08462590724229813,
      "learning_rate": 0.0002382693321792376,
      "loss": 9.0744,
      "step": 16224,
      "throughput": 12466.617902711769
    },
    {
      "epoch": 0.25479823509588634,
      "grad_norm": 0.08881812542676926,
      "learning_rate": 0.00023803891902467406,
      "loss": 9.0858,
      "step": 16256,
      "throughput": 12466.848251406545
    },
    {
      "epoch": 0.25529980642481526,
      "grad_norm": 0.08228327333927155,
      "learning_rate": 0.0002378082046573919,
      "loss": 9.057,
      "step": 16288,
      "throughput": 12466.895905251296
    },
    {
      "epoch": 0.2558013777537442,
      "grad_norm": 0.0878542885184288,
      "learning_rate": 0.00023757719002885793,
      "loss": 9.0545,
      "step": 16320,
      "throughput": 12467.12757205178
    },
    {
      "epoch": 0.25630294908267304,
      "grad_norm": 0.0913226455450058,
      "learning_rate": 0.00023734587609177725,
      "loss": 9.0711,
      "step": 16352,
      "throughput": 12467.356317805159
    },
    {
      "epoch": 0.25680452041160196,
      "grad_norm": 0.09229837357997894,
      "learning_rate": 0.000237114263800089,
      "loss": 9.0747,
      "step": 16384,
      "throughput": 12467.666892843608
    },
    {
      "epoch": 0.2573060917405309,
      "grad_norm": 0.08693083375692368,
      "learning_rate": 0.0002368823541089632,
      "loss": 9.0735,
      "step": 16416,
      "throughput": 12466.558353855355
    },
    {
      "epoch": 0.2578076630694598,
      "grad_norm": 0.08696688711643219,
      "learning_rate": 0.00023665014797479602,
      "loss": 9.0649,
      "step": 16448,
      "throughput": 12466.40559245942
    },
    {
      "epoch": 0.2583092343983887,
      "grad_norm": 0.08479771018028259,
      "learning_rate": 0.00023641764635520617,
      "loss": 9.0544,
      "step": 16480,
      "throughput": 12466.56688086586
    },
    {
      "epoch": 0.2588108057273176,
      "grad_norm": 0.08818965405225754,
      "learning_rate": 0.0002361848502090311,
      "loss": 9.0554,
      "step": 16512,
      "throughput": 12466.926689103695
    },
    {
      "epoch": 0.25931237705624655,
      "grad_norm": 0.08338773995637894,
      "learning_rate": 0.0002359517604963228,
      "loss": 9.0725,
      "step": 16544,
      "throughput": 12467.23486422751
    },
    {
      "epoch": 0.2598139483851754,
      "grad_norm": 0.0901360809803009,
      "learning_rate": 0.0002357183781783439,
      "loss": 9.0556,
      "step": 16576,
      "throughput": 12467.402090050127
    },
    {
      "epoch": 0.2603155197141043,
      "grad_norm": 0.085529625415802,
      "learning_rate": 0.0002354847042175638,
      "loss": 9.0426,
      "step": 16608,
      "throughput": 12467.666467634639
    },
    {
      "epoch": 0.26081709104303324,
      "grad_norm": 0.08443993330001831,
      "learning_rate": 0.0002352507395776546,
      "loss": 9.0501,
      "step": 16640,
      "throughput": 12467.807254167117
    },
    {
      "epoch": 0.26131866237196216,
      "grad_norm": 0.08707763254642487,
      "learning_rate": 0.00023501648522348715,
      "loss": 9.0651,
      "step": 16672,
      "throughput": 12468.114144258147
    },
    {
      "epoch": 0.2618202337008911,
      "grad_norm": 0.09138821065425873,
      "learning_rate": 0.0002347819421211271,
      "loss": 9.0681,
      "step": 16704,
      "throughput": 12468.226462686382
    },
    {
      "epoch": 0.26232180502982,
      "grad_norm": 0.0871325135231018,
      "learning_rate": 0.00023454711123783092,
      "loss": 9.0517,
      "step": 16736,
      "throughput": 12468.42955905754
    },
    {
      "epoch": 0.2628233763587489,
      "grad_norm": 0.08815670013427734,
      "learning_rate": 0.00023431199354204192,
      "loss": 9.0533,
      "step": 16768,
      "throughput": 12468.204008980289
    },
    {
      "epoch": 0.2633249476876778,
      "grad_norm": 0.09001166373491287,
      "learning_rate": 0.00023407659000338607,
      "loss": 9.0623,
      "step": 16800,
      "throughput": 12468.376115452253
    },
    {
      "epoch": 0.2638265190166067,
      "grad_norm": 0.09031942486763,
      "learning_rate": 0.00023384090159266833,
      "loss": 9.0574,
      "step": 16832,
      "throughput": 12468.806218505566
    },
    {
      "epoch": 0.2643280903455356,
      "grad_norm": 0.08624611794948578,
      "learning_rate": 0.00023360492928186838,
      "loss": 9.0482,
      "step": 16864,
      "throughput": 12469.183607749183
    },
    {
      "epoch": 0.26482966167446453,
      "grad_norm": 0.09370267391204834,
      "learning_rate": 0.00023336867404413674,
      "loss": 9.0566,
      "step": 16896,
      "throughput": 12469.430245980588
    },
    {
      "epoch": 0.26533123300339345,
      "grad_norm": 0.08853529393672943,
      "learning_rate": 0.0002331321368537907,
      "loss": 9.0371,
      "step": 16928,
      "throughput": 12469.541779301038
    },
    {
      "epoch": 0.26583280433232237,
      "grad_norm": 0.08989576250314713,
      "learning_rate": 0.0002328953186863103,
      "loss": 9.0451,
      "step": 16960,
      "throughput": 12469.598915485374
    },
    {
      "epoch": 0.2663343756612513,
      "grad_norm": 0.08611343055963516,
      "learning_rate": 0.00023265822051833442,
      "loss": 9.056,
      "step": 16992,
      "throughput": 12469.884718462825
    },
    {
      "epoch": 0.26683594699018015,
      "grad_norm": 0.0905052199959755,
      "learning_rate": 0.00023242084332765662,
      "loss": 9.0236,
      "step": 17024,
      "throughput": 12469.978079412462
    },
    {
      "epoch": 0.26733751831910907,
      "grad_norm": 0.08522782474756241,
      "learning_rate": 0.0002321831880932211,
      "loss": 9.0232,
      "step": 17056,
      "throughput": 12470.208961711229
    },
    {
      "epoch": 0.267839089648038,
      "grad_norm": 0.08784380555152893,
      "learning_rate": 0.00023194525579511876,
      "loss": 9.0504,
      "step": 17088,
      "throughput": 12470.301773323268
    },
    {
      "epoch": 0.2683406609769669,
      "grad_norm": 0.08720999956130981,
      "learning_rate": 0.00023170704741458308,
      "loss": 9.0575,
      "step": 17120,
      "throughput": 12470.064606079304
    },
    {
      "epoch": 0.2688422323058958,
      "grad_norm": 0.09138575941324234,
      "learning_rate": 0.00023146856393398615,
      "loss": 9.0648,
      "step": 17152,
      "throughput": 12470.468089389984
    },
    {
      "epoch": 0.26934380363482474,
      "grad_norm": 0.0843966007232666,
      "learning_rate": 0.0002312298063368346,
      "loss": 9.0507,
      "step": 17184,
      "throughput": 12470.801132714467
    },
    {
      "epoch": 0.26984537496375366,
      "grad_norm": 0.08482661098241806,
      "learning_rate": 0.00023099077560776536,
      "loss": 9.045,
      "step": 17216,
      "throughput": 12471.096914700734
    },
    {
      "epoch": 0.2703469462926825,
      "grad_norm": 0.08337967842817307,
      "learning_rate": 0.00023075147273254195,
      "loss": 9.0286,
      "step": 17248,
      "throughput": 12471.145123746845
    },
    {
      "epoch": 0.27084851762161144,
      "grad_norm": 0.09060463309288025,
      "learning_rate": 0.0002305118986980501,
      "loss": 9.0553,
      "step": 17280,
      "throughput": 12471.393467544609
    },
    {
      "epoch": 0.27135008895054036,
      "grad_norm": 0.08552798628807068,
      "learning_rate": 0.00023027205449229388,
      "loss": 9.0473,
      "step": 17312,
      "throughput": 12471.505573657802
    },
    {
      "epoch": 0.2718516602794693,
      "grad_norm": 0.09061522781848907,
      "learning_rate": 0.00023003194110439145,
      "loss": 9.0404,
      "step": 17344,
      "throughput": 12471.799272716305
    },
    {
      "epoch": 0.2723532316083982,
      "grad_norm": 0.08229053765535355,
      "learning_rate": 0.00022979155952457118,
      "loss": 9.0418,
      "step": 17376,
      "throughput": 12471.917751952064
    },
    {
      "epoch": 0.2728548029373271,
      "grad_norm": 0.08962202817201614,
      "learning_rate": 0.00022955091074416733,
      "loss": 9.0345,
      "step": 17408,
      "throughput": 12472.004623410867
    },
    {
      "epoch": 0.27335637426625603,
      "grad_norm": 0.07952834665775299,
      "learning_rate": 0.0002293099957556163,
      "loss": 9.0516,
      "step": 17440,
      "throughput": 12471.662820281383
    },
    {
      "epoch": 0.2738579455951849,
      "grad_norm": 0.08492957800626755,
      "learning_rate": 0.00022906881555245212,
      "loss": 9.0019,
      "step": 17472,
      "throughput": 12472.05627389257
    },
    {
      "epoch": 0.2743595169241138,
      "grad_norm": 0.08579554408788681,
      "learning_rate": 0.0002288273711293028,
      "loss": 9.013,
      "step": 17504,
      "throughput": 12472.39775437576
    },
    {
      "epoch": 0.27486108825304273,
      "grad_norm": 0.0875297486782074,
      "learning_rate": 0.00022858566348188568,
      "loss": 9.055,
      "step": 17536,
      "throughput": 12472.693638964387
    },
    {
      "epoch": 0.27536265958197165,
      "grad_norm": 0.08446948975324631,
      "learning_rate": 0.00022834369360700394,
      "loss": 9.0329,
      "step": 17568,
      "throughput": 12472.90183148617
    },
    {
      "epoch": 0.27586423091090057,
      "grad_norm": 0.08951190859079361,
      "learning_rate": 0.00022810146250254196,
      "loss": 9.0438,
      "step": 17600,
      "throughput": 12473.13702330547
    },
    {
      "epoch": 0.2763658022398295,
      "grad_norm": 0.08349533379077911,
      "learning_rate": 0.00022785897116746166,
      "loss": 9.014,
      "step": 17632,
      "throughput": 12473.249106840682
    },
    {
      "epoch": 0.2768673735687584,
      "grad_norm": 0.08524182438850403,
      "learning_rate": 0.00022761622060179793,
      "loss": 9.0231,
      "step": 17664,
      "throughput": 12473.447528872795
    },
    {
      "epoch": 0.27736894489768726,
      "grad_norm": 0.0913608968257904,
      "learning_rate": 0.00022737321180665488,
      "loss": 9.0312,
      "step": 17696,
      "throughput": 12473.66558932742
    },
    {
      "epoch": 0.2778705162266162,
      "grad_norm": 0.09343055635690689,
      "learning_rate": 0.00022712994578420143,
      "loss": 9.0572,
      "step": 17728,
      "throughput": 12473.783745762661
    },
    {
      "epoch": 0.2783720875555451,
      "grad_norm": 0.09041980654001236,
      "learning_rate": 0.00022688642353766746,
      "loss": 9.0042,
      "step": 17760,
      "throughput": 12473.63779914607
    },
    {
      "epoch": 0.278873658884474,
      "grad_norm": 0.08400727808475494,
      "learning_rate": 0.00022664264607133937,
      "loss": 9.0289,
      "step": 17792,
      "throughput": 12473.776377954498
    },
    {
      "epoch": 0.27937523021340294,
      "grad_norm": 0.08442512899637222,
      "learning_rate": 0.00022639861439055617,
      "loss": 9.0285,
      "step": 17824,
      "throughput": 12474.12696957615
    },
    {
      "epoch": 0.27987680154233185,
      "grad_norm": 0.08340003341436386,
      "learning_rate": 0.00022615432950170528,
      "loss": 9.0122,
      "step": 17856,
      "throughput": 12474.5182505348
    },
    {
      "epoch": 0.2803783728712608,
      "grad_norm": 0.0845126211643219,
      "learning_rate": 0.00022590979241221825,
      "loss": 9.0138,
      "step": 17888,
      "throughput": 12474.690420465082
    },
    {
      "epoch": 0.28087994420018964,
      "grad_norm": 0.09434281289577484,
      "learning_rate": 0.00022566500413056677,
      "loss": 9.0015,
      "step": 17920,
      "throughput": 12474.861393165009
    },
    {
      "epoch": 0.28138151552911855,
      "grad_norm": 0.08486516028642654,
      "learning_rate": 0.00022541996566625841,
      "loss": 9.0289,
      "step": 17952,
      "throughput": 12474.946996187711
    },
    {
      "epoch": 0.28188308685804747,
      "grad_norm": 0.08841805905103683,
      "learning_rate": 0.00022517467802983266,
      "loss": 9.0236,
      "step": 17984,
      "throughput": 12475.122543622343
    },
    {
      "epoch": 0.2823846581869764,
      "grad_norm": 0.08873917162418365,
      "learning_rate": 0.0002249291422328563,
      "loss": 9.0198,
      "step": 18016,
      "throughput": 12475.351462062104
    },
    {
      "epoch": 0.2828862295159053,
      "grad_norm": 0.08457177132368088,
      "learning_rate": 0.00022468335928791977,
      "loss": 9.0135,
      "step": 18048,
      "throughput": 12475.472738775841
    },
    {
      "epoch": 0.2833878008448342,
      "grad_norm": 0.08301544189453125,
      "learning_rate": 0.00022443733020863262,
      "loss": 9.0039,
      "step": 18080,
      "throughput": 12475.459476439604
    },
    {
      "epoch": 0.28388937217376314,
      "grad_norm": 0.08134116977453232,
      "learning_rate": 0.00022419105600961955,
      "loss": 9.009,
      "step": 18112,
      "throughput": 12475.444797157037
    },
    {
      "epoch": 0.284390943502692,
      "grad_norm": 0.091349758207798,
      "learning_rate": 0.00022394453770651607,
      "loss": 9.0186,
      "step": 18144,
      "throughput": 12475.755194761135
    },
    {
      "epoch": 0.2848925148316209,
      "grad_norm": 0.08693146705627441,
      "learning_rate": 0.00022369777631596436,
      "loss": 8.9856,
      "step": 18176,
      "throughput": 12476.1312937397
    },
    {
      "epoch": 0.28539408616054984,
      "grad_norm": 0.09362078458070755,
      "learning_rate": 0.00022345077285560914,
      "loss": 9.0205,
      "step": 18208,
      "throughput": 12476.409711464714
    },
    {
      "epoch": 0.28589565748947876,
      "grad_norm": 0.0814775675535202,
      "learning_rate": 0.00022320352834409343,
      "loss": 9.0158,
      "step": 18240,
      "throughput": 12476.478521129118
    },
    {
      "epoch": 0.2863972288184077,
      "grad_norm": 0.09202456474304199,
      "learning_rate": 0.0002229560438010543,
      "loss": 8.986,
      "step": 18272,
      "throughput": 12476.62019737783
    },
    {
      "epoch": 0.2868988001473366,
      "grad_norm": 0.08562493324279785,
      "learning_rate": 0.00022270832024711882,
      "loss": 9.027,
      "step": 18304,
      "throughput": 12476.907886201338
    },
    {
      "epoch": 0.2874003714762655,
      "grad_norm": 0.08332978188991547,
      "learning_rate": 0.00022246035870389952,
      "loss": 8.997,
      "step": 18336,
      "throughput": 12477.075444601718
    },
    {
      "epoch": 0.2879019428051944,
      "grad_norm": 0.09485689550638199,
      "learning_rate": 0.00022221216019399067,
      "loss": 9.0143,
      "step": 18368,
      "throughput": 12477.11013677757
    },
    {
      "epoch": 0.2884035141341233,
      "grad_norm": 0.08859565109014511,
      "learning_rate": 0.00022196372574096357,
      "loss": 9.0016,
      "step": 18400,
      "throughput": 12477.177361621938
    },
    {
      "epoch": 0.2889050854630522,
      "grad_norm": 0.08996104449033737,
      "learning_rate": 0.00022171505636936272,
      "loss": 9.0183,
      "step": 18432,
      "throughput": 12476.974357767254
    },
    {
      "epoch": 0.28940665679198113,
      "grad_norm": 0.08240412175655365,
      "learning_rate": 0.00022146615310470125,
      "loss": 9.0214,
      "step": 18464,
      "throughput": 12476.223451799962
    },
    {
      "epoch": 0.28990822812091005,
      "grad_norm": 0.08608371764421463,
      "learning_rate": 0.0002212170169734571,
      "loss": 9.0067,
      "step": 18496,
      "throughput": 12476.53300311954
    },
    {
      "epoch": 0.29040979944983897,
      "grad_norm": 0.0910928025841713,
      "learning_rate": 0.0002209676490030683,
      "loss": 8.9914,
      "step": 18528,
      "throughput": 12476.796098408216
    },
    {
      "epoch": 0.2909113707787679,
      "grad_norm": 0.08564506471157074,
      "learning_rate": 0.0002207180502219291,
      "loss": 9.0139,
      "step": 18560,
      "throughput": 12476.923207970885
    },
    {
      "epoch": 0.29141294210769675,
      "grad_norm": 0.09284878522157669,
      "learning_rate": 0.00022046822165938565,
      "loss": 8.9929,
      "step": 18592,
      "throughput": 12477.036495695165
    },
    {
      "epoch": 0.29191451343662567,
      "grad_norm": 0.08678165823221207,
      "learning_rate": 0.00022021816434573168,
      "loss": 8.9992,
      "step": 18624,
      "throughput": 12477.242414439332
    },
    {
      "epoch": 0.2924160847655546,
      "grad_norm": 0.08513201773166656,
      "learning_rate": 0.0002199678793122043,
      "loss": 9.0067,
      "step": 18656,
      "throughput": 12477.385674047086
    },
    {
      "epoch": 0.2929176560944835,
      "grad_norm": 0.08333751559257507,
      "learning_rate": 0.0002197173675909797,
      "loss": 8.9823,
      "step": 18688,
      "throughput": 12477.468328011044
    },
    {
      "epoch": 0.2934192274234124,
      "grad_norm": 0.08798499405384064,
      "learning_rate": 0.00021946663021516895,
      "loss": 9.0078,
      "step": 18720,
      "throughput": 12477.499937727878
    },
    {
      "epoch": 0.29392079875234134,
      "grad_norm": 0.0856068879365921,
      "learning_rate": 0.0002192156682188138,
      "loss": 9.0173,
      "step": 18752,
      "throughput": 12477.421765774649
    },
    {
      "epoch": 0.29442237008127026,
      "grad_norm": 0.08265390992164612,
      "learning_rate": 0.00021896448263688224,
      "loss": 8.9792,
      "step": 18784,
      "throughput": 12477.603668330741
    },
    {
      "epoch": 0.2949239414101991,
      "grad_norm": 0.08359767496585846,
      "learning_rate": 0.00021871307450526428,
      "loss": 8.9992,
      "step": 18816,
      "throughput": 12477.905816630331
    },
    {
      "epoch": 0.29542551273912804,
      "grad_norm": 0.08933964371681213,
      "learning_rate": 0.00021846144486076794,
      "loss": 8.9771,
      "step": 18848,
      "throughput": 12478.262695014355
    },
    {
      "epoch": 0.29592708406805696,
      "grad_norm": 0.08512786775827408,
      "learning_rate": 0.00021820959474111448,
      "loss": 8.9957,
      "step": 18880,
      "throughput": 12478.297900233913
    },
    {
      "epoch": 0.2964286553969859,
      "grad_norm": 0.08374758064746857,
      "learning_rate": 0.00021795752518493462,
      "loss": 8.9974,
      "step": 18912,
      "throughput": 12478.560628901585
    },
    {
      "epoch": 0.2969302267259148,
      "grad_norm": 0.09408605843782425,
      "learning_rate": 0.0002177052372317639,
      "loss": 8.9884,
      "step": 18944,
      "throughput": 12478.682912323291
    },
    {
      "epoch": 0.2974317980548437,
      "grad_norm": 0.08023850619792938,
      "learning_rate": 0.00021745273192203871,
      "loss": 8.9801,
      "step": 18976,
      "throughput": 12478.841125057868
    },
    {
      "epoch": 0.2979333693837726,
      "grad_norm": 0.08267463743686676,
      "learning_rate": 0.00021720001029709152,
      "loss": 8.9955,
      "step": 19008,
      "throughput": 12479.080966465785
    },
    {
      "epoch": 0.2984349407127015,
      "grad_norm": 0.08654811233282089,
      "learning_rate": 0.00021694707339914722,
      "loss": 8.9867,
      "step": 19040,
      "throughput": 12478.997052386876
    },
    {
      "epoch": 0.2989365120416304,
      "grad_norm": 0.08444288372993469,
      "learning_rate": 0.00021669392227131816,
      "loss": 8.9951,
      "step": 19072,
      "throughput": 12478.872801380136
    },
    {
      "epoch": 0.2994380833705593,
      "grad_norm": 0.08606360852718353,
      "learning_rate": 0.0002164405579576005,
      "loss": 9.0,
      "step": 19104,
      "throughput": 12479.01897110598
    },
    {
      "epoch": 0.29993965469948825,
      "grad_norm": 0.08675479143857956,
      "learning_rate": 0.0002161869815028694,
      "loss": 8.985,
      "step": 19136,
      "throughput": 12479.319002499655
    },
    {
      "epoch": 0.30044122602841716,
      "grad_norm": 0.08233457058668137,
      "learning_rate": 0.00021593319395287483,
      "loss": 8.9862,
      "step": 19168,
      "throughput": 12479.665124742653
    },
    {
      "epoch": 0.3009427973573461,
      "grad_norm": 0.0809185728430748,
      "learning_rate": 0.0002156791963542374,
      "loss": 8.9818,
      "step": 19200,
      "throughput": 12479.90954284929
    },
    {
      "epoch": 0.30144436868627494,
      "grad_norm": 0.08249987661838531,
      "learning_rate": 0.00021542498975444404,
      "loss": 8.97,
      "step": 19232,
      "throughput": 12479.959052127691
    },
    {
      "epoch": 0.30194594001520386,
      "grad_norm": 0.0827348455786705,
      "learning_rate": 0.0002151705752018435,
      "loss": 8.9766,
      "step": 19264,
      "throughput": 12480.163904790157
    },
    {
      "epoch": 0.3024475113441328,
      "grad_norm": 0.08294457942247391,
      "learning_rate": 0.0002149159537456421,
      "loss": 8.9846,
      "step": 19296,
      "throughput": 12480.258854661215
    },
    {
      "epoch": 0.3029490826730617,
      "grad_norm": 0.08031655102968216,
      "learning_rate": 0.00021466112643589948,
      "loss": 8.9328,
      "step": 19328,
      "throughput": 12480.488916400263
    },
    {
      "epoch": 0.3034506540019906,
      "grad_norm": 0.08047141134738922,
      "learning_rate": 0.00021440609432352427,
      "loss": 8.997,
      "step": 19360,
      "throughput": 12480.4834522581
    },
    {
      "epoch": 0.30395222533091953,
      "grad_norm": 0.08190543204545975,
      "learning_rate": 0.00021415085846026961,
      "loss": 8.9921,
      "step": 19392,
      "throughput": 12480.460116763796
    },
    {
      "epoch": 0.30445379665984845,
      "grad_norm": 0.08133073151111603,
      "learning_rate": 0.00021389541989872904,
      "loss": 8.9605,
      "step": 19424,
      "throughput": 12480.449168736688
    },
    {
      "epoch": 0.3049553679887773,
      "grad_norm": 0.08820736408233643,
      "learning_rate": 0.00021363977969233186,
      "loss": 8.9873,
      "step": 19456,
      "throughput": 12480.73539843842
    },
    {
      "epoch": 0.30545693931770623,
      "grad_norm": 0.08675903081893921,
      "learning_rate": 0.000213383938895339,
      "loss": 8.9809,
      "step": 19488,
      "throughput": 12481.072723554666
    },
    {
      "epoch": 0.30595851064663515,
      "grad_norm": 0.08426385372877121,
      "learning_rate": 0.00021312789856283885,
      "loss": 8.9639,
      "step": 19520,
      "throughput": 12481.321765084207
    },
    {
      "epoch": 0.30646008197556407,
      "grad_norm": 0.08889193087816238,
      "learning_rate": 0.0002128716597507423,
      "loss": 8.9951,
      "step": 19552,
      "throughput": 12481.474794690199
    },
    {
      "epoch": 0.306961653304493,
      "grad_norm": 0.08428184688091278,
      "learning_rate": 0.00021261522351577906,
      "loss": 8.9918,
      "step": 19584,
      "throughput": 12481.585242922429
    },
    {
      "epoch": 0.3074632246334219,
      "grad_norm": 0.08475416898727417,
      "learning_rate": 0.00021235859091549294,
      "loss": 8.987,
      "step": 19616,
      "throughput": 12481.706000617602
    },
    {
      "epoch": 0.3079647959623508,
      "grad_norm": 0.07965004444122314,
      "learning_rate": 0.0002121017630082375,
      "loss": 8.9772,
      "step": 19648,
      "throughput": 12481.899747104279
    },
    {
      "epoch": 0.3084663672912797,
      "grad_norm": 0.07988788187503815,
      "learning_rate": 0.0002118447408531718,
      "loss": 8.9472,
      "step": 19680,
      "throughput": 12481.973442574455
    },
    {
      "epoch": 0.3089679386202086,
      "grad_norm": 0.08512098342180252,
      "learning_rate": 0.00021158752551025603,
      "loss": 8.9638,
      "step": 19712,
      "throughput": 12481.8821989218
    },
    {
      "epoch": 0.3094695099491375,
      "grad_norm": 0.08206585049629211,
      "learning_rate": 0.0002113301180402469,
      "loss": 8.9916,
      "step": 19744,
      "throughput": 12481.923126716347
    },
    {
      "epoch": 0.30997108127806644,
      "grad_norm": 0.08557426184415817,
      "learning_rate": 0.0002110725195046937,
      "loss": 8.9807,
      "step": 19776,
      "throughput": 12482.157295879151
    },
    {
      "epoch": 0.31047265260699536,
      "grad_norm": 0.08954965323209763,
      "learning_rate": 0.00021081473096593348,
      "loss": 8.976,
      "step": 19808,
      "throughput": 12482.426387632684
    },
    {
      "epoch": 0.3109742239359243,
      "grad_norm": 0.08191045373678207,
      "learning_rate": 0.000210556753487087,
      "loss": 8.968,
      "step": 19840,
      "throughput": 12482.767802868539
    },
    {
      "epoch": 0.3114757952648532,
      "grad_norm": 0.083841472864151,
      "learning_rate": 0.00021029858813205408,
      "loss": 8.9452,
      "step": 19872,
      "throughput": 12482.79312026922
    },
    {
      "epoch": 0.31197736659378206,
      "grad_norm": 0.08564392477273941,
      "learning_rate": 0.00021004023596550946,
      "loss": 8.9712,
      "step": 19904,
      "throughput": 12482.98858331792
    },
    {
      "epoch": 0.312478937922711,
      "grad_norm": 0.08598575741052628,
      "learning_rate": 0.00020978169805289823,
      "loss": 8.9772,
      "step": 19936,
      "throughput": 12483.168994339207
    },
    {
      "epoch": 0.3129805092516399,
      "grad_norm": 0.08651523292064667,
      "learning_rate": 0.0002095229754604315,
      "loss": 8.9647,
      "step": 19968,
      "throughput": 12483.301932936576
    },
    {
      "epoch": 0.3134820805805688,
      "grad_norm": 0.08066987246274948,
      "learning_rate": 0.00020926406925508202,
      "loss": 8.9706,
      "step": 20000,
      "throughput": 12483.468640673438
    },
    {
      "epoch": 0.31398365190949773,
      "grad_norm": 0.08455055952072144,
      "learning_rate": 0.00020900498050457973,
      "loss": 8.9811,
      "step": 20032,
      "throughput": 12483.389454275892
    },
    {
      "epoch": 0.31448522323842665,
      "grad_norm": 0.07816529273986816,
      "learning_rate": 0.0002087457102774074,
      "loss": 8.9669,
      "step": 20064,
      "throughput": 12483.280268234008
    },
    {
      "epoch": 0.31498679456735557,
      "grad_norm": 0.0842684879899025,
      "learning_rate": 0.00020848625964279622,
      "loss": 8.9567,
      "step": 20096,
      "throughput": 12483.441512925816
    },
    {
      "epoch": 0.31548836589628443,
      "grad_norm": 0.0792151615023613,
      "learning_rate": 0.0002082266296707214,
      "loss": 8.9627,
      "step": 20128,
      "throughput": 12483.70604001735
    },
    {
      "epoch": 0.31598993722521335,
      "grad_norm": 0.08477991074323654,
      "learning_rate": 0.0002079668214318977,
      "loss": 8.9572,
      "step": 20160,
      "throughput": 12484.02449720299
    },
    {
      "epoch": 0.31649150855414226,
      "grad_norm": 0.08497337996959686,
      "learning_rate": 0.00020770683599777507,
      "loss": 8.9547,
      "step": 20192,
      "throughput": 12484.258667934406
    },
    {
      "epoch": 0.3169930798830712,
      "grad_norm": 0.07823009788990021,
      "learning_rate": 0.0002074466744405342,
      "loss": 8.9537,
      "step": 20224,
      "throughput": 12484.284754838114
    },
    {
      "epoch": 0.3174946512120001,
      "grad_norm": 0.08207400888204575,
      "learning_rate": 0.00020718633783308214,
      "loss": 8.945,
      "step": 20256,
      "throughput": 12484.460604663742
    },
    {
      "epoch": 0.317996222540929,
      "grad_norm": 0.08717039227485657,
      "learning_rate": 0.00020692582724904778,
      "loss": 8.957,
      "step": 20288,
      "throughput": 12484.521166834636
    },
    {
      "epoch": 0.31849779386985794,
      "grad_norm": 0.08661855012178421,
      "learning_rate": 0.00020666514376277762,
      "loss": 8.9567,
      "step": 20320,
      "throughput": 12484.745023419275
    },
    {
      "epoch": 0.3189993651987868,
      "grad_norm": 0.08689385652542114,
      "learning_rate": 0.00020640428844933108,
      "loss": 8.9604,
      "step": 20352,
      "throughput": 12484.6409939241
    },
    {
      "epoch": 0.3195009365277157,
      "grad_norm": 0.08753615617752075,
      "learning_rate": 0.00020614326238447623,
      "loss": 8.9588,
      "step": 20384,
      "throughput": 12484.499387366826
    },
    {
      "epoch": 0.32000250785664464,
      "grad_norm": 0.08454905450344086,
      "learning_rate": 0.0002058820666446854,
      "loss": 8.9554,
      "step": 20416,
      "throughput": 12484.73143159062
    },
    {
      "epoch": 0.32050407918557355,
      "grad_norm": 0.08608614653348923,
      "learning_rate": 0.00020562070230713058,
      "loss": 8.9639,
      "step": 20448,
      "throughput": 12484.917610846744
    },
    {
      "epoch": 0.32100565051450247,
      "grad_norm": 0.08359532803297043,
      "learning_rate": 0.00020535917044967899,
      "loss": 8.9463,
      "step": 20480,
      "throughput": 12485.221669401204
    },
    {
      "epoch": 0.3215072218434314,
      "grad_norm": 0.09456279873847961,
      "learning_rate": 0.00020509747215088887,
      "loss": 8.952,
      "step": 20512,
      "throughput": 12484.477850075247
    },
    {
      "epoch": 0.3220087931723603,
      "grad_norm": 0.08295111358165741,
      "learning_rate": 0.00020483560849000475,
      "loss": 8.9512,
      "step": 20544,
      "throughput": 12484.502256651174
    },
    {
      "epoch": 0.32251036450128917,
      "grad_norm": 0.08061188459396362,
      "learning_rate": 0.00020457358054695317,
      "loss": 8.9436,
      "step": 20576,
      "throughput": 12484.75864322967
    },
    {
      "epoch": 0.3230119358302181,
      "grad_norm": 0.08765893429517746,
      "learning_rate": 0.00020431138940233808,
      "loss": 8.9409,
      "step": 20608,
      "throughput": 12484.795386174996
    },
    {
      "epoch": 0.323513507159147,
      "grad_norm": 0.08479173481464386,
      "learning_rate": 0.00020404903613743664,
      "loss": 8.9421,
      "step": 20640,
      "throughput": 12484.957643859123
    },
    {
      "epoch": 0.3240150784880759,
      "grad_norm": 0.08192974328994751,
      "learning_rate": 0.0002037865218341944,
      "loss": 8.951,
      "step": 20672,
      "throughput": 12485.044667605318
    },
    {
      "epoch": 0.32451664981700484,
      "grad_norm": 0.08531540632247925,
      "learning_rate": 0.00020352384757522113,
      "loss": 8.9339,
      "step": 20704,
      "throughput": 12484.871636248427
    },
    {
      "epoch": 0.32501822114593376,
      "grad_norm": 0.0843503326177597,
      "learning_rate": 0.00020326101444378633,
      "loss": 8.9596,
      "step": 20736,
      "throughput": 12484.93287006955
    },
    {
      "epoch": 0.3255197924748627,
      "grad_norm": 0.08475443720817566,
      "learning_rate": 0.0002029980235238145,
      "loss": 8.9572,
      "step": 20768,
      "throughput": 12485.100293172296
    },
    {
      "epoch": 0.32602136380379154,
      "grad_norm": 0.09080865979194641,
      "learning_rate": 0.0002027348758998811,
      "loss": 8.9502,
      "step": 20800,
      "throughput": 12485.411867627947
    },
    {
      "epoch": 0.32652293513272046,
      "grad_norm": 0.08043165504932404,
      "learning_rate": 0.0002024715726572076,
      "loss": 8.9619,
      "step": 20832,
      "throughput": 12485.646951030745
    },
    {
      "epoch": 0.3270245064616494,
      "grad_norm": 0.0829484760761261,
      "learning_rate": 0.0002022081148816574,
      "loss": 8.9353,
      "step": 20864,
      "throughput": 12485.772787112774
    },
    {
      "epoch": 0.3275260777905783,
      "grad_norm": 0.09577605128288269,
      "learning_rate": 0.0002019445036597312,
      "loss": 8.9345,
      "step": 20896,
      "throughput": 12485.953043466774
    },
    {
      "epoch": 0.3280276491195072,
      "grad_norm": 0.08556250482797623,
      "learning_rate": 0.00020168074007856232,
      "loss": 8.9435,
      "step": 20928,
      "throughput": 12486.017639406224
    },
    {
      "epoch": 0.32852922044843613,
      "grad_norm": 0.08058907091617584,
      "learning_rate": 0.00020141682522591272,
      "loss": 8.9289,
      "step": 20960,
      "throughput": 12486.170992647689
    },
    {
      "epoch": 0.32903079177736505,
      "grad_norm": 0.08227542787790298,
      "learning_rate": 0.0002011527601901679,
      "loss": 8.9489,
      "step": 20992,
      "throughput": 12486.233883077815
    },
    {
      "epoch": 0.3295323631062939,
      "grad_norm": 0.08085721731185913,
      "learning_rate": 0.00020088854606033292,
      "loss": 8.9545,
      "step": 21024,
      "throughput": 12486.051737341617
    },
    {
      "epoch": 0.33003393443522283,
      "grad_norm": 0.09055308252573013,
      "learning_rate": 0.00020062418392602767,
      "loss": 8.9369,
      "step": 21056,
      "throughput": 12486.145621966194
    },
    {
      "epoch": 0.33053550576415175,
      "grad_norm": 0.07927481830120087,
      "learning_rate": 0.00020035967487748226,
      "loss": 8.9486,
      "step": 21088,
      "throughput": 12486.34457992798
    },
    {
      "epoch": 0.33103707709308067,
      "grad_norm": 0.0844564139842987,
      "learning_rate": 0.00020009502000553286,
      "loss": 8.9383,
      "step": 21120,
      "throughput": 12486.611307376023
    },
    {
      "epoch": 0.3315386484220096,
      "grad_norm": 0.07819940149784088,
      "learning_rate": 0.00019983022040161692,
      "loss": 8.9262,
      "step": 21152,
      "throughput": 12486.924114265516
    },
    {
      "epoch": 0.3320402197509385,
      "grad_norm": 0.08591725677251816,
      "learning_rate": 0.00019956527715776887,
      "loss": 8.9463,
      "step": 21184,
      "throughput": 12487.161038107502
    },
    {
      "epoch": 0.3325417910798674,
      "grad_norm": 0.07994264364242554,
      "learning_rate": 0.0001993001913666153,
      "loss": 8.9448,
      "step": 21216,
      "throughput": 12487.150532210248
    },
    {
      "epoch": 0.3330433624087963,
      "grad_norm": 0.08380083739757538,
      "learning_rate": 0.00019903496412137093,
      "loss": 8.9406,
      "step": 21248,
      "throughput": 12487.258940781852
    },
    {
      "epoch": 0.3335449337377252,
      "grad_norm": 0.0842265859246254,
      "learning_rate": 0.00019876959651583362,
      "loss": 8.9426,
      "step": 21280,
      "throughput": 12487.332584155203
    },
    {
      "epoch": 0.3340465050666541,
      "grad_norm": 0.08240363746881485,
      "learning_rate": 0.00019850408964438023,
      "loss": 8.9497,
      "step": 21312,
      "throughput": 12487.40224732748
    },
    {
      "epoch": 0.33454807639558304,
      "grad_norm": 0.09238409996032715,
      "learning_rate": 0.00019823844460196177,
      "loss": 8.944,
      "step": 21344,
      "throughput": 12487.584956361457
    },
    {
      "epoch": 0.33504964772451196,
      "grad_norm": 0.08725325018167496,
      "learning_rate": 0.00019797266248409932,
      "loss": 8.918,
      "step": 21376,
      "throughput": 12487.304178496872
    },
    {
      "epoch": 0.3355512190534409,
      "grad_norm": 0.08461218327283859,
      "learning_rate": 0.000197706744386879,
      "loss": 8.9337,
      "step": 21408,
      "throughput": 12487.563503986132
    },
    {
      "epoch": 0.3360527903823698,
      "grad_norm": 0.08346909284591675,
      "learning_rate": 0.00019744069140694795,
      "loss": 8.9519,
      "step": 21440,
      "throughput": 12487.72316076954
    },
    {
      "epoch": 0.33655436171129866,
      "grad_norm": 0.08190145343542099,
      "learning_rate": 0.00019717450464150935,
      "loss": 8.9081,
      "step": 21472,
      "throughput": 12488.030838551273
    },
    {
      "epoch": 0.3370559330402276,
      "grad_norm": 0.07997757941484451,
      "learning_rate": 0.00019690818518831827,
      "loss": 8.9402,
      "step": 21504,
      "throughput": 12488.24650412556
    },
    {
      "epoch": 0.3375575043691565,
      "grad_norm": 0.08381337672472,
      "learning_rate": 0.0001966417341456769,
      "loss": 8.9227,
      "step": 21536,
      "throughput": 12488.279142058242
    },
    {
      "epoch": 0.3380590756980854,
      "grad_norm": 0.08766081184148788,
      "learning_rate": 0.0001963751526124301,
      "loss": 8.9325,
      "step": 21568,
      "throughput": 12488.457524807016
    },
    {
      "epoch": 0.3385606470270143,
      "grad_norm": 0.08199552446603775,
      "learning_rate": 0.00019610844168796096,
      "loss": 8.9479,
      "step": 21600,
      "throughput": 12488.495104920967
    },
    {
      "epoch": 0.33906221835594325,
      "grad_norm": 0.08472704142332077,
      "learning_rate": 0.0001958416024721861,
      "loss": 8.9206,
      "step": 21632,
      "throughput": 12488.592639697072
    },
    {
      "epoch": 0.33956378968487216,
      "grad_norm": 0.09208200871944427,
      "learning_rate": 0.00019557463606555118,
      "loss": 8.9175,
      "step": 21664,
      "throughput": 12488.699079971992
    },
    {
      "epoch": 0.340065361013801,
      "grad_norm": 0.08901580423116684,
      "learning_rate": 0.0001953075435690266,
      "loss": 8.9482,
      "step": 21696,
      "throughput": 12488.521563231709
    },
    {
      "epoch": 0.34056693234272994,
      "grad_norm": 0.08557577431201935,
      "learning_rate": 0.0001950403260841024,
      "loss": 8.9027,
      "step": 21728,
      "throughput": 12488.646320404407
    },
    {
      "epoch": 0.34106850367165886,
      "grad_norm": 0.08255859464406967,
      "learning_rate": 0.0001947729847127845,
      "loss": 8.9463,
      "step": 21760,
      "throughput": 12488.833395405463
    },
    {
      "epoch": 0.3415700750005878,
      "grad_norm": 0.08207535743713379,
      "learning_rate": 0.00019450552055758934,
      "loss": 8.91,
      "step": 21792,
      "throughput": 12489.129996873391
    },
    {
      "epoch": 0.3420716463295167,
      "grad_norm": 0.0815618485212326,
      "learning_rate": 0.00019423793472153996,
      "loss": 8.9103,
      "step": 21824,
      "throughput": 12489.336254605087
    },
    {
      "epoch": 0.3425732176584456,
      "grad_norm": 0.08508460223674774,
      "learning_rate": 0.0001939702283081611,
      "loss": 8.9172,
      "step": 21856,
      "throughput": 12489.435337129702
    },
    {
      "epoch": 0.34307478898737453,
      "grad_norm": 0.07797224074602127,
      "learning_rate": 0.00019370240242147488,
      "loss": 8.93,
      "step": 21888,
      "throughput": 12489.610602421475
    },
    {
      "epoch": 0.3435763603163034,
      "grad_norm": 0.08201367408037186,
      "learning_rate": 0.000193434458165996,
      "loss": 8.937,
      "step": 21920,
      "throughput": 12489.67687894913
    },
    {
      "epoch": 0.3440779316452323,
      "grad_norm": 0.07757963240146637,
      "learning_rate": 0.00019316639664672733,
      "loss": 8.9071,
      "step": 21952,
      "throughput": 12489.828498519091
    },
    {
      "epoch": 0.34457950297416123,
      "grad_norm": 0.08153794705867767,
      "learning_rate": 0.00019289821896915544,
      "loss": 8.9108,
      "step": 21984,
      "throughput": 12489.855986021807
    },
    {
      "epoch": 0.34508107430309015,
      "grad_norm": 0.08086151629686356,
      "learning_rate": 0.00019262992623924585,
      "loss": 8.92,
      "step": 22016,
      "throughput": 12489.738904543394
    },
    {
      "epoch": 0.34558264563201907,
      "grad_norm": 0.08799968659877777,
      "learning_rate": 0.00019236151956343852,
      "loss": 8.8953,
      "step": 22048,
      "throughput": 12489.764779282503
    },
    {
      "epoch": 0.346084216960948,
      "grad_norm": 0.08785464614629745,
      "learning_rate": 0.00019209300004864341,
      "loss": 8.9214,
      "step": 22080,
      "throughput": 12489.914629776307
    },
    {
      "epoch": 0.3465857882898769,
      "grad_norm": 0.08560092747211456,
      "learning_rate": 0.00019182436880223585,
      "loss": 8.9188,
      "step": 22112,
      "throughput": 12490.212450694407
    },
    {
      "epoch": 0.34708735961880577,
      "grad_norm": 0.08484335988759995,
      "learning_rate": 0.00019155562693205178,
      "loss": 8.9132,
      "step": 22144,
      "throughput": 12490.511833539385
    },
    {
      "epoch": 0.3475889309477347,
      "grad_norm": 0.08568539470434189,
      "learning_rate": 0.00019128677554638355,
      "loss": 8.8883,
      "step": 22176,
      "throughput": 12490.672564495928
    },
    {
      "epoch": 0.3480905022766636,
      "grad_norm": 0.08342450857162476,
      "learning_rate": 0.0001910178157539751,
      "loss": 8.9109,
      "step": 22208,
      "throughput": 12490.765383820944
    },
    {
      "epoch": 0.3485920736055925,
      "grad_norm": 0.08115052431821823,
      "learning_rate": 0.00019074874866401733,
      "loss": 8.9371,
      "step": 22240,
      "throughput": 12490.853232384366
    },
    {
      "epoch": 0.34909364493452144,
      "grad_norm": 0.08548019826412201,
      "learning_rate": 0.00019047957538614375,
      "loss": 8.9111,
      "step": 22272,
      "throughput": 12490.9313045578
    },
    {
      "epoch": 0.34959521626345036,
      "grad_norm": 0.08596168458461761,
      "learning_rate": 0.00019021029703042576,
      "loss": 8.9106,
      "step": 22304,
      "throughput": 12490.990787166018
    },
    {
      "epoch": 0.3500967875923793,
      "grad_norm": 0.0818779468536377,
      "learning_rate": 0.0001899409147073681,
      "loss": 8.9162,
      "step": 22336,
      "throughput": 12490.918567562465
    },
    {
      "epoch": 0.35059835892130814,
      "grad_norm": 0.09088233858346939,
      "learning_rate": 0.0001896714295279043,
      "loss": 8.9048,
      "step": 22368,
      "throughput": 12490.87804323234
    },
    {
      "epoch": 0.35109993025023706,
      "grad_norm": 0.08182457089424133,
      "learning_rate": 0.00018940184260339194,
      "loss": 8.9223,
      "step": 22400,
      "throughput": 12491.16283867804
    },
    {
      "epoch": 0.351601501579166,
      "grad_norm": 0.0823916345834732,
      "learning_rate": 0.00018913215504560838,
      "loss": 8.9362,
      "step": 22432,
      "throughput": 12491.302279886577
    },
    {
      "epoch": 0.3521030729080949,
      "grad_norm": 0.0841815397143364,
      "learning_rate": 0.0001888623679667459,
      "loss": 8.9064,
      "step": 22464,
      "throughput": 12491.589846482098
    },
    {
      "epoch": 0.3526046442370238,
      "grad_norm": 0.08144458383321762,
      "learning_rate": 0.00018859248247940722,
      "loss": 8.9139,
      "step": 22496,
      "throughput": 12491.790941636938
    },
    {
      "epoch": 0.35310621556595273,
      "grad_norm": 0.08848226815462112,
      "learning_rate": 0.0001883224996966008,
      "loss": 8.9261,
      "step": 22528,
      "throughput": 12491.801249014014
    },
    {
      "epoch": 0.35360778689488165,
      "grad_norm": 0.08245450258255005,
      "learning_rate": 0.00018805242073173653,
      "loss": 8.9141,
      "step": 22560,
      "throughput": 12491.01421133125
    },
    {
      "epoch": 0.3541093582238105,
      "grad_norm": 0.0856962725520134,
      "learning_rate": 0.00018778224669862087,
      "loss": 8.9172,
      "step": 22592,
      "throughput": 12491.096014213206
    },
    {
      "epoch": 0.35461092955273943,
      "grad_norm": 0.08765954524278641,
      "learning_rate": 0.0001875119787114523,
      "loss": 8.8969,
      "step": 22624,
      "throughput": 12491.141670027499
    },
    {
      "epoch": 0.35511250088166835,
      "grad_norm": 0.08133430778980255,
      "learning_rate": 0.00018724161788481676,
      "loss": 8.909,
      "step": 22656,
      "throughput": 12491.235690121868
    },
    {
      "epoch": 0.35561407221059727,
      "grad_norm": 0.07738807797431946,
      "learning_rate": 0.00018697116533368316,
      "loss": 8.8928,
      "step": 22688,
      "throughput": 12491.025826665942
    },
    {
      "epoch": 0.3561156435395262,
      "grad_norm": 0.08607741445302963,
      "learning_rate": 0.00018670062217339867,
      "loss": 8.9262,
      "step": 22720,
      "throughput": 12491.256184899139
    },
    {
      "epoch": 0.3566172148684551,
      "grad_norm": 0.08493805676698685,
      "learning_rate": 0.0001864299895196839,
      "loss": 8.9354,
      "step": 22752,
      "throughput": 12491.3963119095
    },
    {
      "epoch": 0.357118786197384,
      "grad_norm": 0.08784986287355423,
      "learning_rate": 0.00018615926848862893,
      "loss": 8.9295,
      "step": 22784,
      "throughput": 12491.676705107877
    },
    {
      "epoch": 0.3576203575263129,
      "grad_norm": 0.08774597197771072,
      "learning_rate": 0.00018588846019668793,
      "loss": 8.8907,
      "step": 22816,
      "throughput": 12491.886952645285
    },
    {
      "epoch": 0.3581219288552418,
      "grad_norm": 0.08723000437021255,
      "learning_rate": 0.00018561756576067524,
      "loss": 8.8931,
      "step": 22848,
      "throughput": 12491.986990443083
    },
    {
      "epoch": 0.3586235001841707,
      "grad_norm": 0.0842309296131134,
      "learning_rate": 0.0001853465862977602,
      "loss": 8.8862,
      "step": 22880,
      "throughput": 12492.07097622102
    },
    {
      "epoch": 0.35912507151309964,
      "grad_norm": 0.08362879604101181,
      "learning_rate": 0.00018507552292546295,
      "loss": 8.8787,
      "step": 22912,
      "throughput": 12492.275117711019
    },
    {
      "epoch": 0.35962664284202855,
      "grad_norm": 0.08330900967121124,
      "learning_rate": 0.00018480437676164968,
      "loss": 8.8744,
      "step": 22944,
      "throughput": 12492.206883242661
    },
    {
      "epoch": 0.3601282141709575,
      "grad_norm": 0.08307932317256927,
      "learning_rate": 0.00018453314892452795,
      "loss": 8.9092,
      "step": 22976,
      "throughput": 12492.343599424185
    },
    {
      "epoch": 0.36062978549988634,
      "grad_norm": 0.07551781088113785,
      "learning_rate": 0.00018426184053264215,
      "loss": 8.9058,
      "step": 23008,
      "throughput": 12492.164113570247
    },
    {
      "epoch": 0.36113135682881525,
      "grad_norm": 0.0835670456290245,
      "learning_rate": 0.0001839904527048689,
      "loss": 8.9104,
      "step": 23040,
      "throughput": 12492.282754142729
    },
    {
      "epoch": 0.36163292815774417,
      "grad_norm": 0.07876232266426086,
      "learning_rate": 0.0001837189865604124,
      "loss": 8.8814,
      "step": 23072,
      "throughput": 12492.42812864148
    },
    {
      "epoch": 0.3621344994866731,
      "grad_norm": 0.08294995129108429,
      "learning_rate": 0.00018344744321879987,
      "loss": 8.9042,
      "step": 23104,
      "throughput": 12492.70628042669
    },
    {
      "epoch": 0.362636070815602,
      "grad_norm": 0.08761091530323029,
      "learning_rate": 0.0001831758237998768,
      "loss": 8.8812,
      "step": 23136,
      "throughput": 12492.979705813483
    },
    {
      "epoch": 0.3631376421445309,
      "grad_norm": 0.08298665285110474,
      "learning_rate": 0.00018290412942380252,
      "loss": 8.8642,
      "step": 23168,
      "throughput": 12493.119589836599
    },
    {
      "epoch": 0.36363921347345984,
      "grad_norm": 0.08473809063434601,
      "learning_rate": 0.00018263236121104543,
      "loss": 8.8885,
      "step": 23200,
      "throughput": 12493.119772385908
    },
    {
      "epoch": 0.3641407848023887,
      "grad_norm": 0.0838628038764,
      "learning_rate": 0.00018236052028237847,
      "loss": 8.8879,
      "step": 23232,
      "throughput": 12493.347540635039
    },
    {
      "epoch": 0.3646423561313176,
      "grad_norm": 0.08237037807703018,
      "learning_rate": 0.0001820886077588744,
      "loss": 8.891,
      "step": 23264,
      "throughput": 12493.378674750831
    },
    {
      "epoch": 0.36514392746024654,
      "grad_norm": 0.08084205538034439,
      "learning_rate": 0.00018181662476190127,
      "loss": 8.906,
      "step": 23296,
      "throughput": 12493.29387889249
    },
    {
      "epoch": 0.36564549878917546,
      "grad_norm": 0.07574167102575302,
      "learning_rate": 0.00018154457241311773,
      "loss": 8.8886,
      "step": 23328,
      "throughput": 12493.272845143263
    },
    {
      "epoch": 0.3661470701181044,
      "grad_norm": 0.07789778709411621,
      "learning_rate": 0.00018127245183446858,
      "loss": 8.8813,
      "step": 23360,
      "throughput": 12493.230559544294
    },
    {
      "epoch": 0.3666486414470333,
      "grad_norm": 0.08263697475194931,
      "learning_rate": 0.00018100026414817987,
      "loss": 8.8993,
      "step": 23392,
      "throughput": 12493.44952418477
    },
    {
      "epoch": 0.3671502127759622,
      "grad_norm": 0.08247614651918411,
      "learning_rate": 0.00018072801047675432,
      "loss": 8.8895,
      "step": 23424,
      "throughput": 12493.643180969875
    },
    {
      "epoch": 0.3676517841048911,
      "grad_norm": 0.08045326173305511,
      "learning_rate": 0.00018045569194296697,
      "loss": 8.8869,
      "step": 23456,
      "throughput": 12493.910386986438
    },
    {
      "epoch": 0.36815335543382,
      "grad_norm": 0.08219939470291138,
      "learning_rate": 0.00018018330966986022,
      "loss": 8.8781,
      "step": 23488,
      "throughput": 12494.108872504368
    },
    {
      "epoch": 0.3686549267627489,
      "grad_norm": 0.0800880491733551,
      "learning_rate": 0.00017991086478073943,
      "loss": 8.9001,
      "step": 23520,
      "throughput": 12494.105943629875
    },
    {
      "epoch": 0.36915649809167783,
      "grad_norm": 0.08422353863716125,
      "learning_rate": 0.0001796383583991681,
      "loss": 8.8848,
      "step": 23552,
      "throughput": 12494.258574568401
    },
    {
      "epoch": 0.36965806942060675,
      "grad_norm": 0.07969654351472855,
      "learning_rate": 0.00017936579164896333,
      "loss": 8.886,
      "step": 23584,
      "throughput": 12494.279822975659
    },
    {
      "epoch": 0.37015964074953567,
      "grad_norm": 0.08507449179887772,
      "learning_rate": 0.0001790931656541912,
      "loss": 8.897,
      "step": 23616,
      "throughput": 12494.323654439937
    },
    {
      "epoch": 0.3706612120784646,
      "grad_norm": 0.08387662470340729,
      "learning_rate": 0.00017882048153916214,
      "loss": 8.9107,
      "step": 23648,
      "throughput": 12494.32082459582
    },
    {
      "epoch": 0.37116278340739345,
      "grad_norm": 0.08334920555353165,
      "learning_rate": 0.00017854774042842626,
      "loss": 8.9006,
      "step": 23680,
      "throughput": 12494.24012124739
    },
    {
      "epoch": 0.37166435473632237,
      "grad_norm": 0.08700676262378693,
      "learning_rate": 0.00017827494344676873,
      "loss": 8.8973,
      "step": 23712,
      "throughput": 12494.50655693926
    },
    {
      "epoch": 0.3721659260652513,
      "grad_norm": 0.07877275347709656,
      "learning_rate": 0.000178002091719205,
      "loss": 8.8749,
      "step": 23744,
      "throughput": 12494.635084936986
    },
    {
      "epoch": 0.3726674973941802,
      "grad_norm": 0.08263049274682999,
      "learning_rate": 0.00017772918637097657,
      "loss": 8.8834,
      "step": 23776,
      "throughput": 12494.894484086817
    },
    {
      "epoch": 0.3731690687231091,
      "grad_norm": 0.08499334007501602,
      "learning_rate": 0.00017745622852754575,
      "loss": 8.8862,
      "step": 23808,
      "throughput": 12495.071060301598
    },
    {
      "epoch": 0.37367064005203804,
      "grad_norm": 0.08328446745872498,
      "learning_rate": 0.00017718321931459163,
      "loss": 8.8783,
      "step": 23840,
      "throughput": 12495.137190776335
    },
    {
      "epoch": 0.37417221138096696,
      "grad_norm": 0.08706315606832504,
      "learning_rate": 0.00017691015985800488,
      "loss": 8.8969,
      "step": 23872,
      "throughput": 12495.214628327643
    },
    {
      "epoch": 0.3746737827098958,
      "grad_norm": 0.08637236803770065,
      "learning_rate": 0.0001766370512838836,
      "loss": 8.8764,
      "step": 23904,
      "throughput": 12495.361833829736
    },
    {
      "epoch": 0.37517535403882474,
      "grad_norm": 0.08245234936475754,
      "learning_rate": 0.00017636389471852834,
      "loss": 8.8771,
      "step": 23936,
      "throughput": 12495.349511779505
    },
    {
      "epoch": 0.37567692536775366,
      "grad_norm": 0.07800939679145813,
      "learning_rate": 0.0001760906912884376,
      "loss": 8.8775,
      "step": 23968,
      "throughput": 12495.340517367329
    },
    {
      "epoch": 0.3761784966966826,
      "grad_norm": 0.08624035865068436,
      "learning_rate": 0.00017581744212030308,
      "loss": 8.8739,
      "step": 24000,
      "throughput": 12495.316485319103
    },
    {
      "epoch": 0.3766800680256115,
      "grad_norm": 0.08235324919223785,
      "learning_rate": 0.00017554414834100525,
      "loss": 8.8649,
      "step": 24032,
      "throughput": 12495.474988710415
    },
    {
      "epoch": 0.3771816393545404,
      "grad_norm": 0.08272941410541534,
      "learning_rate": 0.00017527081107760834,
      "loss": 8.8668,
      "step": 24064,
      "throughput": 12495.685198313075
    },
    {
      "epoch": 0.37768321068346933,
      "grad_norm": 0.0816485658288002,
      "learning_rate": 0.00017499743145735615,
      "loss": 8.8801,
      "step": 24096,
      "throughput": 12495.873403249487
    },
    {
      "epoch": 0.3781847820123982,
      "grad_norm": 0.08586040884256363,
      "learning_rate": 0.00017472401060766697,
      "loss": 8.8779,
      "step": 24128,
      "throughput": 12496.129806396117
    },
    {
      "epoch": 0.3786863533413271,
      "grad_norm": 0.07926472276449203,
      "learning_rate": 0.0001744505496561292,
      "loss": 8.882,
      "step": 24160,
      "throughput": 12496.258584111014
    },
    {
      "epoch": 0.379187924670256,
      "grad_norm": 0.08123722672462463,
      "learning_rate": 0.00017417704973049668,
      "loss": 8.8913,
      "step": 24192,
      "throughput": 12496.246278915552
    },
    {
      "epoch": 0.37968949599918494,
      "grad_norm": 0.08329039812088013,
      "learning_rate": 0.00017390351195868385,
      "loss": 8.8809,
      "step": 24224,
      "throughput": 12496.423361994388
    },
    {
      "epoch": 0.38019106732811386,
      "grad_norm": 0.08363816887140274,
      "learning_rate": 0.00017362993746876135,
      "loss": 8.879,
      "step": 24256,
      "throughput": 12496.36571410399
    },
    {
      "epoch": 0.3806926386570428,
      "grad_norm": 0.07615621387958527,
      "learning_rate": 0.00017335632738895113,
      "loss": 8.8774,
      "step": 24288,
      "throughput": 12496.385541333166
    },
    {
      "epoch": 0.3811942099859717,
      "grad_norm": 0.08446671068668365,
      "learning_rate": 0.000173082682847622,
      "loss": 8.875,
      "step": 24320,
      "throughput": 12496.301883161885
    },
    {
      "epoch": 0.38169578131490056,
      "grad_norm": 0.07616794854402542,
      "learning_rate": 0.0001728090049732848,
      "loss": 8.8687,
      "step": 24352,
      "throughput": 12496.402973511722
    },
    {
      "epoch": 0.3821973526438295,
      "grad_norm": 0.08101163804531097,
      "learning_rate": 0.00017253529489458802,
      "loss": 8.8822,
      "step": 24384,
      "throughput": 12496.606250274554
    },
    {
      "epoch": 0.3826989239727584,
      "grad_norm": 0.08165282756090164,
      "learning_rate": 0.00017226155374031271,
      "loss": 8.8651,
      "step": 24416,
      "throughput": 12496.778618678321
    },
    {
      "epoch": 0.3832004953016873,
      "grad_norm": 0.08300849050283432,
      "learning_rate": 0.0001719877826393683,
      "loss": 8.8663,
      "step": 24448,
      "throughput": 12497.034343131501
    },
    {
      "epoch": 0.38370206663061623,
      "grad_norm": 0.08287367969751358,
      "learning_rate": 0.00017171398272078752,
      "loss": 8.8555,
      "step": 24480,
      "throughput": 12497.208343750086
    },
    {
      "epoch": 0.38420363795954515,
      "grad_norm": 0.07704202830791473,
      "learning_rate": 0.00017144015511372208,
      "loss": 8.8612,
      "step": 24512,
      "throughput": 12497.156610468535
    },
    {
      "epoch": 0.38470520928847407,
      "grad_norm": 0.08153821527957916,
      "learning_rate": 0.00017116630094743792,
      "loss": 8.8589,
      "step": 24544,
      "throughput": 12497.372581395
    },
    {
      "epoch": 0.38520678061740293,
      "grad_norm": 0.0911712720990181,
      "learning_rate": 0.00017089242135131036,
      "loss": 8.86,
      "step": 24576,
      "throughput": 12497.384800573247
    },
    {
      "epoch": 0.38570835194633185,
      "grad_norm": 0.08098107576370239,
      "learning_rate": 0.0001706185174548197,
      "loss": 8.8532,
      "step": 24608,
      "throughput": 12496.52204489201
    },
    {
      "epoch": 0.38620992327526077,
      "grad_norm": 0.08704471588134766,
      "learning_rate": 0.0001703445903875464,
      "loss": 8.8599,
      "step": 24640,
      "throughput": 12496.435891320403
    },
    {
      "epoch": 0.3867114946041897,
      "grad_norm": 0.07287541031837463,
      "learning_rate": 0.00017007064127916644,
      "loss": 8.8649,
      "step": 24672,
      "throughput": 12496.530306289622
    },
    {
      "epoch": 0.3872130659331186,
      "grad_norm": 0.08605916053056717,
      "learning_rate": 0.0001697966712594469,
      "loss": 8.8758,
      "step": 24704,
      "throughput": 12496.624926668494
    },
    {
      "epoch": 0.3877146372620475,
      "grad_norm": 0.08018253743648529,
      "learning_rate": 0.00016952268145824082,
      "loss": 8.8595,
      "step": 24736,
      "throughput": 12496.806147490599
    },
    {
      "epoch": 0.38821620859097644,
      "grad_norm": 0.08926723152399063,
      "learning_rate": 0.00016924867300548304,
      "loss": 8.8501,
      "step": 24768,
      "throughput": 12497.061754200398
    },
    {
      "epoch": 0.3887177799199053,
      "grad_norm": 0.08146440982818604,
      "learning_rate": 0.00016897464703118515,
      "loss": 8.8759,
      "step": 24800,
      "throughput": 12497.260149261956
    },
    {
      "epoch": 0.3892193512488342,
      "grad_norm": 0.08535002171993256,
      "learning_rate": 0.00016870060466543112,
      "loss": 8.8354,
      "step": 24832,
      "throughput": 12497.26649049281
    },
    {
      "epoch": 0.38972092257776314,
      "grad_norm": 0.08504383265972137,
      "learning_rate": 0.0001684265470383725,
      "loss": 8.8764,
      "step": 24864,
      "throughput": 12497.399025265984
    },
    {
      "epoch": 0.39022249390669206,
      "grad_norm": 0.07920678704977036,
      "learning_rate": 0.0001681524752802237,
      "loss": 8.8827,
      "step": 24896,
      "throughput": 12497.535226442347
    },
    {
      "epoch": 0.390724065235621,
      "grad_norm": 0.07792805880308151,
      "learning_rate": 0.00016787839052125758,
      "loss": 8.8628,
      "step": 24928,
      "throughput": 12497.435883965982
    },
    {
      "epoch": 0.3912256365645499,
      "grad_norm": 0.07983972132205963,
      "learning_rate": 0.00016760429389180037,
      "loss": 8.879,
      "step": 24960,
      "throughput": 12497.395890011567
    },
    {
      "epoch": 0.3917272078934788,
      "grad_norm": 0.07780390232801437,
      "learning_rate": 0.00016733018652222744,
      "loss": 8.8389,
      "step": 24992,
      "throughput": 12497.41402842221
    },
    {
      "epoch": 0.3922287792224077,
      "grad_norm": 0.07822234183549881,
      "learning_rate": 0.0001670560695429584,
      "loss": 8.8569,
      "step": 25024,
      "throughput": 12497.555066487259
    },
    {
      "epoch": 0.3927303505513366,
      "grad_norm": 0.08859014511108398,
      "learning_rate": 0.00016678194408445245,
      "loss": 8.8311,
      "step": 25056,
      "throughput": 12497.761692108925
    },
    {
      "epoch": 0.3932319218802655,
      "grad_norm": 0.08568881452083588,
      "learning_rate": 0.00016650781127720382,
      "loss": 8.8668,
      "step": 25088,
      "throughput": 12497.92260971887
    },
    {
      "epoch": 0.39373349320919443,
      "grad_norm": 0.08156470954418182,
      "learning_rate": 0.00016623367225173703,
      "loss": 8.8681,
      "step": 25120,
      "throughput": 12498.166384791184
    },
    {
      "epoch": 0.39423506453812335,
      "grad_norm": 0.0885007381439209,
      "learning_rate": 0.00016595952813860216,
      "loss": 8.8592,
      "step": 25152,
      "throughput": 12498.292358025039
    },
    {
      "epoch": 0.39473663586705227,
      "grad_norm": 0.08698877692222595,
      "learning_rate": 0.00016568538006837046,
      "loss": 8.8666,
      "step": 25184,
      "throughput": 12498.292639086796
    },
    {
      "epoch": 0.3952382071959812,
      "grad_norm": 0.0833880603313446,
      "learning_rate": 0.00016541122917162934,
      "loss": 8.8482,
      "step": 25216,
      "throughput": 12498.369864553024
    },
    {
      "epoch": 0.39573977852491005,
      "grad_norm": 0.07714637368917465,
      "learning_rate": 0.00016513707657897785,
      "loss": 8.877,
      "step": 25248,
      "throughput": 12498.344570077656
    },
    {
      "epoch": 0.39624134985383896,
      "grad_norm": 0.07872116565704346,
      "learning_rate": 0.00016486292342102215,
      "loss": 8.857,
      "step": 25280,
      "throughput": 12498.259394281939
    },
    {
      "epoch": 0.3967429211827679,
      "grad_norm": 0.0763251781463623,
      "learning_rate": 0.0001645887708283707,
      "loss": 8.8524,
      "step": 25312,
      "throughput": 12498.192551632414
    },
    {
      "epoch": 0.3972444925116968,
      "grad_norm": 0.0815349817276001,
      "learning_rate": 0.00016431461993162954,
      "loss": 8.8548,
      "step": 25344,
      "throughput": 12498.342362005955
    },
    {
      "epoch": 0.3977460638406257,
      "grad_norm": 0.08260805159807205,
      "learning_rate": 0.00016404047186139784,
      "loss": 8.8503,
      "step": 25376,
      "throughput": 12498.53850460718
    },
    {
      "epoch": 0.39824763516955464,
      "grad_norm": 0.08485168218612671,
      "learning_rate": 0.00016376632774826297,
      "loss": 8.8444,
      "step": 25408,
      "throughput": 12498.708249727906
    },
    {
      "epoch": 0.39874920649848355,
      "grad_norm": 0.07725406438112259,
      "learning_rate": 0.0001634921887227962,
      "loss": 8.8515,
      "step": 25440,
      "throughput": 12498.95239798028
    },
    {
      "epoch": 0.3992507778274124,
      "grad_norm": 0.08141039311885834,
      "learning_rate": 0.00016321805591554755,
      "loss": 8.845,
      "step": 25472,
      "throughput": 12499.127438050664
    },
    {
      "epoch": 0.39975234915634134,
      "grad_norm": 0.0798976719379425,
      "learning_rate": 0.00016294393045704163,
      "loss": 8.8547,
      "step": 25504,
      "throughput": 12499.066997003883
    },
    {
      "epoch": 0.40025392048527025,
      "grad_norm": 0.08230122923851013,
      "learning_rate": 0.00016266981347777255,
      "loss": 8.8483,
      "step": 25536,
      "throughput": 12499.249046093964
    },
    {
      "epoch": 0.40075549181419917,
      "grad_norm": 0.07611089199781418,
      "learning_rate": 0.00016239570610819963,
      "loss": 8.8369,
      "step": 25568,
      "throughput": 12499.307959676233
    },
    {
      "epoch": 0.4012570631431281,
      "grad_norm": 0.08280283212661743,
      "learning_rate": 0.00016212160947874242,
      "loss": 8.8398,
      "step": 25600,
      "throughput": 12499.148777309101
    },
    {
      "epoch": 0.401758634472057,
      "grad_norm": 0.09161941707134247,
      "learning_rate": 0.00016184752471977627,
      "loss": 8.839,
      "step": 25632,
      "throughput": 12499.14721527316
    },
    {
      "epoch": 0.4022602058009859,
      "grad_norm": 0.0951690599322319,
      "learning_rate": 0.0001615734529616275,
      "loss": 8.8305,
      "step": 25664,
      "throughput": 12499.27000679289
    },
    {
      "epoch": 0.4027617771299148,
      "grad_norm": 0.0823458731174469,
      "learning_rate": 0.00016129939533456888,
      "loss": 8.8506,
      "step": 25696,
      "throughput": 12499.367547398746
    },
    {
      "epoch": 0.4032633484588437,
      "grad_norm": 0.08224660158157349,
      "learning_rate": 0.00016102535296881485,
      "loss": 8.8392,
      "step": 25728,
      "throughput": 12499.606853458625
    },
    {
      "epoch": 0.4037649197877726,
      "grad_norm": 0.07632069289684296,
      "learning_rate": 0.00016075132699451701,
      "loss": 8.8528,
      "step": 25760,
      "throughput": 12499.774179612356
    },
    {
      "epoch": 0.40426649111670154,
      "grad_norm": 0.08515363931655884,
      "learning_rate": 0.00016047731854175917,
      "loss": 8.8424,
      "step": 25792,
      "throughput": 12499.938706886793
    },
    {
      "epoch": 0.40476806244563046,
      "grad_norm": 0.0868089571595192,
      "learning_rate": 0.00016020332874055313,
      "loss": 8.8523,
      "step": 25824,
      "throughput": 12499.876721209184
    },
    {
      "epoch": 0.4052696337745594,
      "grad_norm": 0.08290804177522659,
      "learning_rate": 0.00015992935872083356,
      "loss": 8.8574,
      "step": 25856,
      "throughput": 12500.070181450272
    },
    {
      "epoch": 0.4057712051034883,
      "grad_norm": 0.10937785357236862,
      "learning_rate": 0.00015965540961245363,
      "loss": 8.8286,
      "step": 25888,
      "throughput": 12500.06492258528
    },
    {
      "epoch": 0.40627277643241716,
      "grad_norm": 0.08178723603487015,
      "learning_rate": 0.0001593814825451803,
      "loss": 8.8433,
      "step": 25920,
      "throughput": 12500.081162426208
    },
    {
      "epoch": 0.4067743477613461,
      "grad_norm": 0.08957145363092422,
      "learning_rate": 0.00015910757864868967,
      "loss": 8.8489,
      "step": 25952,
      "throughput": 12499.956948132949
    },
    {
      "epoch": 0.407275919090275,
      "grad_norm": 0.07651602476835251,
      "learning_rate": 0.0001588336990525621,
      "loss": 8.8528,
      "step": 25984,
      "throughput": 12500.06247363665
    },
    {
      "epoch": 0.4077774904192039,
      "grad_norm": 0.08387041836977005,
      "learning_rate": 0.00015855984488627792,
      "loss": 8.8567,
      "step": 26016,
      "throughput": 12500.208528946365
    },
    {
      "epoch": 0.40827906174813283,
      "grad_norm": 0.07692224532365799,
      "learning_rate": 0.00015828601727921248,
      "loss": 8.8327,
      "step": 26048,
      "throughput": 12500.392543525437
    },
    {
      "epoch": 0.40878063307706175,
      "grad_norm": 0.08937270939350128,
      "learning_rate": 0.0001580122173606317,
      "loss": 8.8506,
      "step": 26080,
      "throughput": 12500.54759119154
    },
    {
      "epoch": 0.40928220440599067,
      "grad_norm": 0.0803786963224411,
      "learning_rate": 0.00015773844625968726,
      "loss": 8.8404,
      "step": 26112,
      "throughput": 12500.779817177776
    },
    {
      "epoch": 0.40978377573491953,
      "grad_norm": 0.07738781720399857,
      "learning_rate": 0.00015746470510541197,
      "loss": 8.815,
      "step": 26144,
      "throughput": 12500.808397361501
    },
    {
      "epoch": 0.41028534706384845,
      "grad_norm": 0.09304353594779968,
      "learning_rate": 0.00015719099502671516,
      "loss": 8.8271,
      "step": 26176,
      "throughput": 12500.886080461782
    },
    {
      "epoch": 0.41078691839277737,
      "grad_norm": 0.08904910832643509,
      "learning_rate": 0.00015691731715237802,
      "loss": 8.8203,
      "step": 26208,
      "throughput": 12500.956352095312
    },
    {
      "epoch": 0.4112884897217063,
      "grad_norm": 0.07861921936273575,
      "learning_rate": 0.00015664367261104887,
      "loss": 8.8447,
      "step": 26240,
      "throughput": 12500.905992308413
    },
    {
      "epoch": 0.4117900610506352,
      "grad_norm": 0.08831379562616348,
      "learning_rate": 0.00015637006253123865,
      "loss": 8.834,
      "step": 26272,
      "throughput": 12500.824906221855
    },
    {
      "epoch": 0.4122916323795641,
      "grad_norm": 0.08581741154193878,
      "learning_rate": 0.00015609648804131612,
      "loss": 8.8145,
      "step": 26304,
      "throughput": 12500.845637212557
    },
    {
      "epoch": 0.41279320370849304,
      "grad_norm": 0.08051755279302597,
      "learning_rate": 0.00015582295026950332,
      "loss": 8.8362,
      "step": 26336,
      "throughput": 12500.988015783307
    },
    {
      "epoch": 0.4132947750374219,
      "grad_norm": 0.08957970142364502,
      "learning_rate": 0.00015554945034387075,
      "loss": 8.8285,
      "step": 26368,
      "throughput": 12501.173553645169
    },
    {
      "epoch": 0.4137963463663508,
      "grad_norm": 0.07820248603820801,
      "learning_rate": 0.00015527598939233303,
      "loss": 8.8344,
      "step": 26400,
      "throughput": 12501.333332567217
    },
    {
      "epoch": 0.41429791769527974,
      "grad_norm": 0.07684613764286041,
      "learning_rate": 0.00015500256854264385,
      "loss": 8.8242,
      "step": 26432,
      "throughput": 12501.561419958342
    },
    {
      "epoch": 0.41479948902420866,
      "grad_norm": 0.08777330070734024,
      "learning_rate": 0.00015472918892239166,
      "loss": 8.8374,
      "step": 26464,
      "throughput": 12501.681715732624
    },
    {
      "epoch": 0.4153010603531376,
      "grad_norm": 0.08308760821819305,
      "learning_rate": 0.00015445585165899475,
      "loss": 8.8335,
      "step": 26496,
      "throughput": 12501.681390072552
    },
    {
      "epoch": 0.4158026316820665,
      "grad_norm": 0.0822068527340889,
      "learning_rate": 0.00015418255787969692,
      "loss": 8.8303,
      "step": 26528,
      "throughput": 12501.842470612857
    },
    {
      "epoch": 0.4163042030109954,
      "grad_norm": 0.07649709284305573,
      "learning_rate": 0.0001539093087115624,
      "loss": 8.8497,
      "step": 26560,
      "throughput": 12501.80819770379
    },
    {
      "epoch": 0.4168057743399243,
      "grad_norm": 0.08079321682453156,
      "learning_rate": 0.00015363610528147163,
      "loss": 8.8393,
      "step": 26592,
      "throughput": 12501.674214507679
    },
    {
      "epoch": 0.4173073456688532,
      "grad_norm": 0.08345367014408112,
      "learning_rate": 0.00015336294871611637,
      "loss": 8.8202,
      "step": 26624,
      "throughput": 12501.62953861138
    },
    {
      "epoch": 0.4178089169977821,
      "grad_norm": 0.07904289662837982,
      "learning_rate": 0.00015308984014199511,
      "loss": 8.8326,
      "step": 26656,
      "throughput": 12501.080528407167
    },
    {
      "epoch": 0.418310488326711,
      "grad_norm": 0.08403212577104568,
      "learning_rate": 0.00015281678068540836,
      "loss": 8.8156,
      "step": 26688,
      "throughput": 12501.172306996934
    },
    {
      "epoch": 0.41881205965563995,
      "grad_norm": 0.08107221871614456,
      "learning_rate": 0.00015254377147245424,
      "loss": 8.8123,
      "step": 26720,
      "throughput": 12501.392173340688
    },
    {
      "epoch": 0.41931363098456886,
      "grad_norm": 0.08362643420696259,
      "learning_rate": 0.00015227081362902343,
      "loss": 8.8411,
      "step": 26752,
      "throughput": 12501.543097902952
    },
    {
      "epoch": 0.4198152023134978,
      "grad_norm": 0.08034635335206985,
      "learning_rate": 0.000151997908280795,
      "loss": 8.8134,
      "step": 26784,
      "throughput": 12501.628132213427
    },
    {
      "epoch": 0.42031677364242664,
      "grad_norm": 0.0822720155119896,
      "learning_rate": 0.0001517250565532313,
      "loss": 8.8242,
      "step": 26816,
      "throughput": 12501.645734944539
    },
    {
      "epoch": 0.42081834497135556,
      "grad_norm": 0.08424656093120575,
      "learning_rate": 0.00015145225957157373,
      "loss": 8.8146,
      "step": 26848,
      "throughput": 12501.785904689688
    },
    {
      "epoch": 0.4213199163002845,
      "grad_norm": 0.07683435082435608,
      "learning_rate": 0.00015117951846083786,
      "loss": 8.8272,
      "step": 26880,
      "throughput": 12501.788421695752
    },
    {
      "epoch": 0.4218214876292134,
      "grad_norm": 0.08346127718687057,
      "learning_rate": 0.0001509068343458088,
      "loss": 8.8126,
      "step": 26912,
      "throughput": 12501.779378240033
    },
    {
      "epoch": 0.4223230589581423,
      "grad_norm": 0.07858198881149292,
      "learning_rate": 0.00015063420835103667,
      "loss": 8.8373,
      "step": 26944,
      "throughput": 12501.67446416126
    },
    {
      "epoch": 0.42282463028707123,
      "grad_norm": 0.07885950058698654,
      "learning_rate": 0.0001503616416008319,
      "loss": 8.8379,
      "step": 26976,
      "throughput": 12501.74871067608
    },
    {
      "epoch": 0.42332620161600015,
      "grad_norm": 0.08099333196878433,
      "learning_rate": 0.00015008913521926052,
      "loss": 8.8305,
      "step": 27008,
      "throughput": 12501.839545957073
    },
    {
      "epoch": 0.423827772944929,
      "grad_norm": 0.08029799163341522,
      "learning_rate": 0.00014981669033013972,
      "loss": 8.8205,
      "step": 27040,
      "throughput": 12502.066045863343
    },
    {
      "epoch": 0.42432934427385793,
      "grad_norm": 0.08457601815462112,
      "learning_rate": 0.00014954430805703302,
      "loss": 8.8005,
      "step": 27072,
      "throughput": 12502.214232006745
    },
    {
      "epoch": 0.42483091560278685,
      "grad_norm": 0.08561043441295624,
      "learning_rate": 0.00014927198952324568,
      "loss": 8.7984,
      "step": 27104,
      "throughput": 12502.378735296224
    },
    {
      "epoch": 0.42533248693171577,
      "grad_norm": 0.08229973167181015,
      "learning_rate": 0.00014899973585182012,
      "loss": 8.828,
      "step": 27136,
      "throughput": 12502.534067644634
    },
    {
      "epoch": 0.4258340582606447,
      "grad_norm": 0.08372906595468521,
      "learning_rate": 0.00014872754816553141,
      "loss": 8.805,
      "step": 27168,
      "throughput": 12502.498069887373
    },
    {
      "epoch": 0.4263356295895736,
      "grad_norm": 0.08462665230035782,
      "learning_rate": 0.00014845542758688222,
      "loss": 8.8426,
      "step": 27200,
      "throughput": 12502.522762260758
    },
    {
      "epoch": 0.42683720091850247,
      "grad_norm": 0.07806643843650818,
      "learning_rate": 0.00014818337523809876,
      "loss": 8.824,
      "step": 27232,
      "throughput": 12502.55702013303
    },
    {
      "epoch": 0.4273387722474314,
      "grad_norm": 0.08268404752016068,
      "learning_rate": 0.0001479113922411256,
      "loss": 8.8181,
      "step": 27264,
      "throughput": 12502.444745720913
    },
    {
      "epoch": 0.4278403435763603,
      "grad_norm": 0.07644987851381302,
      "learning_rate": 0.00014763947971762153,
      "loss": 8.8116,
      "step": 27296,
      "throughput": 12502.521799841566
    },
    {
      "epoch": 0.4283419149052892,
      "grad_norm": 0.07754526287317276,
      "learning_rate": 0.00014736763878895457,
      "loss": 8.8104,
      "step": 27328,
      "throughput": 12502.750098933497
    },
    {
      "epoch": 0.42884348623421814,
      "grad_norm": 0.08302488178014755,
      "learning_rate": 0.00014709587057619748,
      "loss": 8.8339,
      "step": 27360,
      "throughput": 12502.846138213567
    },
    {
      "epoch": 0.42934505756314706,
      "grad_norm": 0.0887962132692337,
      "learning_rate": 0.0001468241762001232,
      "loss": 8.8151,
      "step": 27392,
      "throughput": 12502.98997233486
    },
    {
      "epoch": 0.429846628892076,
      "grad_norm": 0.07846581190824509,
      "learning_rate": 0.00014655255678120015,
      "loss": 8.8135,
      "step": 27424,
      "throughput": 12503.21332257332
    },
    {
      "epoch": 0.43034820022100484,
      "grad_norm": 0.07606692612171173,
      "learning_rate": 0.0001462810134395876,
      "loss": 8.8011,
      "step": 27456,
      "throughput": 12503.302442524873
    },
    {
      "epoch": 0.43084977154993376,
      "grad_norm": 0.08284489810466766,
      "learning_rate": 0.0001460095472951311,
      "loss": 8.834,
      "step": 27488,
      "throughput": 12503.297877481697
    },
    {
      "epoch": 0.4313513428788627,
      "grad_norm": 0.08123686164617538,
      "learning_rate": 0.0001457381594673579,
      "loss": 8.8202,
      "step": 27520,
      "throughput": 12503.346816106572
    },
    {
      "epoch": 0.4318529142077916,
      "grad_norm": 0.08661215752363205,
      "learning_rate": 0.00014546685107547205,
      "loss": 8.813,
      "step": 27552,
      "throughput": 12503.391456162526
    },
    {
      "epoch": 0.4323544855367205,
      "grad_norm": 0.07993298023939133,
      "learning_rate": 0.00014519562323835034,
      "loss": 8.8252,
      "step": 27584,
      "throughput": 12503.144877970894
    },
    {
      "epoch": 0.43285605686564943,
      "grad_norm": 0.08379069715738297,
      "learning_rate": 0.000144924477074537,
      "loss": 8.8113,
      "step": 27616,
      "throughput": 12503.21132642366
    },
    {
      "epoch": 0.43335762819457835,
      "grad_norm": 0.07715869694948196,
      "learning_rate": 0.00014465341370223977,
      "loss": 8.8169,
      "step": 27648,
      "throughput": 12503.427370544727
    },
    {
      "epoch": 0.4338591995235072,
      "grad_norm": 0.07746557891368866,
      "learning_rate": 0.00014438243423932476,
      "loss": 8.7993,
      "step": 27680,
      "throughput": 12503.528196923007
    },
    {
      "epoch": 0.43436077085243613,
      "grad_norm": 0.08105004578828812,
      "learning_rate": 0.00014411153980331198,
      "loss": 8.8118,
      "step": 27712,
      "throughput": 12503.745351961994
    },
    {
      "epoch": 0.43486234218136505,
      "grad_norm": 0.07802123576402664,
      "learning_rate": 0.00014384073151137104,
      "loss": 8.7873,
      "step": 27744,
      "throughput": 12503.894860879096
    },
    {
      "epoch": 0.43536391351029397,
      "grad_norm": 0.0854947492480278,
      "learning_rate": 0.00014357001048031603,
      "loss": 8.8103,
      "step": 27776,
      "throughput": 12503.983286527413
    },
    {
      "epoch": 0.4358654848392229,
      "grad_norm": 0.08572660386562347,
      "learning_rate": 0.00014329937782660136,
      "loss": 8.8024,
      "step": 27808,
      "throughput": 12503.977311837294
    },
    {
      "epoch": 0.4363670561681518,
      "grad_norm": 0.08063625544309616,
      "learning_rate": 0.00014302883466631676,
      "loss": 8.8278,
      "step": 27840,
      "throughput": 12504.113899698337
    },
    {
      "epoch": 0.4368686274970807,
      "grad_norm": 0.08082283288240433,
      "learning_rate": 0.0001427583821151832,
      "loss": 8.8121,
      "step": 27872,
      "throughput": 12504.080774410691
    },
    {
      "epoch": 0.4373701988260096,
      "grad_norm": 0.07952408492565155,
      "learning_rate": 0.0001424880212885477,
      "loss": 8.8099,
      "step": 27904,
      "throughput": 12503.973240062027
    },
    {
      "epoch": 0.4378717701549385,
      "grad_norm": 0.08380618691444397,
      "learning_rate": 0.0001422177533013791,
      "loss": 8.8345,
      "step": 27936,
      "throughput": 12504.020545306383
    },
    {
      "epoch": 0.4383733414838674,
      "grad_norm": 0.0780053436756134,
      "learning_rate": 0.00014194757926826342,
      "loss": 8.8071,
      "step": 27968,
      "throughput": 12504.18453474137
    },
    {
      "epoch": 0.43887491281279634,
      "grad_norm": 0.0810895636677742,
      "learning_rate": 0.00014167750030339915,
      "loss": 8.8046,
      "step": 28000,
      "throughput": 12504.34025809574
    },
    {
      "epoch": 0.43937648414172525,
      "grad_norm": 0.07910797744989395,
      "learning_rate": 0.00014140751752059278,
      "loss": 8.7905,
      "step": 28032,
      "throughput": 12504.500546568357
    },
    {
      "epoch": 0.4398780554706542,
      "grad_norm": 0.07732567191123962,
      "learning_rate": 0.0001411376320332541,
      "loss": 8.8035,
      "step": 28064,
      "throughput": 12504.638004289523
    },
    {
      "epoch": 0.4403796267995831,
      "grad_norm": 0.0854324921965599,
      "learning_rate": 0.0001408678449543916,
      "loss": 8.8217,
      "step": 28096,
      "throughput": 12504.787997647045
    },
    {
      "epoch": 0.44088119812851195,
      "grad_norm": 0.08483397215604782,
      "learning_rate": 0.00014059815739660806,
      "loss": 8.7962,
      "step": 28128,
      "throughput": 12504.872527000663
    },
    {
      "epoch": 0.44138276945744087,
      "grad_norm": 0.08548751473426819,
      "learning_rate": 0.00014032857047209573,
      "loss": 8.8012,
      "step": 28160,
      "throughput": 12504.923092080882
    },
    {
      "epoch": 0.4418843407863698,
      "grad_norm": 0.08244451135396957,
      "learning_rate": 0.0001400590852926319,
      "loss": 8.799,
      "step": 28192,
      "throughput": 12504.924302578516
    },
    {
      "epoch": 0.4423859121152987,
      "grad_norm": 0.08533225208520889,
      "learning_rate": 0.00013978970296957423,
      "loss": 8.7928,
      "step": 28224,
      "throughput": 12504.876186012274
    },
    {
      "epoch": 0.4428874834442276,
      "grad_norm": 0.07828489691019058,
      "learning_rate": 0.00013952042461385625,
      "loss": 8.7974,
      "step": 28256,
      "throughput": 12504.820262170877
    },
    {
      "epoch": 0.44338905477315654,
      "grad_norm": 0.08020341396331787,
      "learning_rate": 0.00013925125133598266,
      "loss": 8.8,
      "step": 28288,
      "throughput": 12504.884078631061
    },
    {
      "epoch": 0.44389062610208546,
      "grad_norm": 0.08242405205965042,
      "learning_rate": 0.0001389821842460249,
      "loss": 8.8036,
      "step": 28320,
      "throughput": 12505.056763796627
    },
    {
      "epoch": 0.4443921974310143,
      "grad_norm": 0.08134979009628296,
      "learning_rate": 0.00013871322445361642,
      "loss": 8.8092,
      "step": 28352,
      "throughput": 12505.178490608681
    },
    {
      "epoch": 0.44489376875994324,
      "grad_norm": 0.0873064249753952,
      "learning_rate": 0.00013844437306794822,
      "loss": 8.8024,
      "step": 28384,
      "throughput": 12505.394171807027
    },
    {
      "epoch": 0.44539534008887216,
      "grad_norm": 0.07534735649824142,
      "learning_rate": 0.00013817563119776415,
      "loss": 8.781,
      "step": 28416,
      "throughput": 12505.479880378469
    },
    {
      "epoch": 0.4458969114178011,
      "grad_norm": 0.08811984956264496,
      "learning_rate": 0.00013790699995135658,
      "loss": 8.7893,
      "step": 28448,
      "throughput": 12505.621916456059
    },
    {
      "epoch": 0.44639848274673,
      "grad_norm": 0.08254732936620712,
      "learning_rate": 0.00013763848043656148,
      "loss": 8.8034,
      "step": 28480,
      "throughput": 12505.628225728338
    },
    {
      "epoch": 0.4469000540756589,
      "grad_norm": 0.08350877463817596,
      "learning_rate": 0.00013737007376075414,
      "loss": 8.7836,
      "step": 28512,
      "throughput": 12505.672086429726
    },
    {
      "epoch": 0.44740162540458783,
      "grad_norm": 0.08083383738994598,
      "learning_rate": 0.0001371017810308445,
      "loss": 8.7824,
      "step": 28544,
      "throughput": 12505.701112095701
    },
    {
      "epoch": 0.4479031967335167,
      "grad_norm": 0.08054768294095993,
      "learning_rate": 0.00013683360335327264,
      "loss": 8.8157,
      "step": 28576,
      "throughput": 12505.449680561696
    },
    {
      "epoch": 0.4484047680624456,
      "grad_norm": 0.08552040904760361,
      "learning_rate": 0.000136565541834004,
      "loss": 8.792,
      "step": 28608,
      "throughput": 12505.508398606178
    },
    {
      "epoch": 0.44890633939137453,
      "grad_norm": 0.08794131875038147,
      "learning_rate": 0.00013629759757852512,
      "loss": 8.7987,
      "step": 28640,
      "throughput": 12505.669945796693
    },
    {
      "epoch": 0.44940791072030345,
      "grad_norm": 0.08450362831354141,
      "learning_rate": 0.00013602977169183884,
      "loss": 8.7623,
      "step": 28672,
      "throughput": 12505.776475800698
    },
    {
      "epoch": 0.44990948204923237,
      "grad_norm": 0.08908046782016754,
      "learning_rate": 0.00013576206527846004,
      "loss": 8.7837,
      "step": 28704,
      "throughput": 12505.181770324087
    },
    {
      "epoch": 0.4504110533781613,
      "grad_norm": 0.08419577032327652,
      "learning_rate": 0.00013549447944241066,
      "loss": 8.7983,
      "step": 28736,
      "throughput": 12505.265880752857
    },
    {
      "epoch": 0.4509126247070902,
      "grad_norm": 0.07569599896669388,
      "learning_rate": 0.00013522701528721553,
      "loss": 8.7892,
      "step": 28768,
      "throughput": 12505.406476236294
    },
    {
      "epoch": 0.45141419603601907,
      "grad_norm": 0.07981006801128387,
      "learning_rate": 0.00013495967391589757,
      "loss": 8.7834,
      "step": 28800,
      "throughput": 12505.43962817237
    },
    {
      "epoch": 0.451915767364948,
      "grad_norm": 0.08539936691522598,
      "learning_rate": 0.00013469245643097345,
      "loss": 8.785,
      "step": 28832,
      "throughput": 12505.481130315899
    },
    {
      "epoch": 0.4524173386938769,
      "grad_norm": 0.09472446888685226,
      "learning_rate": 0.0001344253639344488,
      "loss": 8.7916,
      "step": 28864,
      "throughput": 12505.544241528234
    },
    {
      "epoch": 0.4529189100228058,
      "grad_norm": 0.08921679109334946,
      "learning_rate": 0.00013415839752781392,
      "loss": 8.7943,
      "step": 28896,
      "throughput": 12505.45157406719
    },
    {
      "epoch": 0.45342048135173474,
      "grad_norm": 0.07996971905231476,
      "learning_rate": 0.00013389155831203904,
      "loss": 8.7959,
      "step": 28928,
      "throughput": 12505.422222633157
    },
    {
      "epoch": 0.45392205268066366,
      "grad_norm": 0.08380214124917984,
      "learning_rate": 0.0001336248473875699,
      "loss": 8.7908,
      "step": 28960,
      "throughput": 12505.626056080559
    },
    {
      "epoch": 0.4544236240095926,
      "grad_norm": 0.08199769258499146,
      "learning_rate": 0.00013335826585432313,
      "loss": 8.781,
      "step": 28992,
      "throughput": 12505.795266845515
    },
    {
      "epoch": 0.45492519533852144,
      "grad_norm": 0.0836104303598404,
      "learning_rate": 0.00013309181481168173,
      "loss": 8.7806,
      "step": 29024,
      "throughput": 12505.904108385801
    },
    {
      "epoch": 0.45542676666745036,
      "grad_norm": 0.08294175565242767,
      "learning_rate": 0.00013282549535849065,
      "loss": 8.7792,
      "step": 29056,
      "throughput": 12506.039652361887
    },
    {
      "epoch": 0.4559283379963793,
      "grad_norm": 0.08239220827817917,
      "learning_rate": 0.00013255930859305205,
      "loss": 8.7722,
      "step": 29088,
      "throughput": 12506.181981745243
    },
    {
      "epoch": 0.4564299093253082,
      "grad_norm": 0.08480555564165115,
      "learning_rate": 0.000132293255613121,
      "loss": 8.7934,
      "step": 29120,
      "throughput": 12506.28065800319
    },
    {
      "epoch": 0.4569314806542371,
      "grad_norm": 0.07948991656303406,
      "learning_rate": 0.00013202733751590067,
      "loss": 8.7755,
      "step": 29152,
      "throughput": 12506.191430123663
    },
    {
      "epoch": 0.45743305198316603,
      "grad_norm": 0.07923830300569534,
      "learning_rate": 0.00013176155539803818,
      "loss": 8.7846,
      "step": 29184,
      "throughput": 12506.31910845052
    },
    {
      "epoch": 0.45793462331209495,
      "grad_norm": 0.08175349235534668,
      "learning_rate": 0.00013149591035561977,
      "loss": 8.7768,
      "step": 29216,
      "throughput": 12506.16222401864
    },
    {
      "epoch": 0.4584361946410238,
      "grad_norm": 0.07527513056993484,
      "learning_rate": 0.00013123040348416633,
      "loss": 8.7737,
      "step": 29248,
      "throughput": 12506.083389719804
    },
    {
      "epoch": 0.4589377659699527,
      "grad_norm": 0.08060269057750702,
      "learning_rate": 0.00013096503587862906,
      "loss": 8.7995,
      "step": 29280,
      "throughput": 12506.285669988638
    },
    {
      "epoch": 0.45943933729888164,
      "grad_norm": 0.08061324805021286,
      "learning_rate": 0.00013069980863338466,
      "loss": 8.7762,
      "step": 29312,
      "throughput": 12506.460119233994
    },
    {
      "epoch": 0.45994090862781056,
      "grad_norm": 0.07880888134241104,
      "learning_rate": 0.00013043472284223113,
      "loss": 8.788,
      "step": 29344,
      "throughput": 12506.569670524006
    },
    {
      "epoch": 0.4604424799567395,
      "grad_norm": 0.08075280487537384,
      "learning_rate": 0.00013016977959838305,
      "loss": 8.7876,
      "step": 29376,
      "throughput": 12506.773916148955
    },
    {
      "epoch": 0.4609440512856684,
      "grad_norm": 0.08283714205026627,
      "learning_rate": 0.00012990497999446714,
      "loss": 8.7854,
      "step": 29408,
      "throughput": 12506.853457567413
    },
    {
      "epoch": 0.4614456226145973,
      "grad_norm": 0.07776404172182083,
      "learning_rate": 0.00012964032512251773,
      "loss": 8.7846,
      "step": 29440,
      "throughput": 12506.998261842957
    },
    {
      "epoch": 0.4619471939435262,
      "grad_norm": 0.0844292938709259,
      "learning_rate": 0.00012937581607397236,
      "loss": 8.7803,
      "step": 29472,
      "throughput": 12506.968712999418
    },
    {
      "epoch": 0.4624487652724551,
      "grad_norm": 0.07433100044727325,
      "learning_rate": 0.00012911145393966703,
      "loss": 8.7934,
      "step": 29504,
      "throughput": 12507.010283981752
    },
    {
      "epoch": 0.462950336601384,
      "grad_norm": 0.08090733736753464,
      "learning_rate": 0.00012884723980983206,
      "loss": 8.7857,
      "step": 29536,
      "throughput": 12506.893742771588
    },
    {
      "epoch": 0.46345190793031293,
      "grad_norm": 0.07989344745874405,
      "learning_rate": 0.00012858317477408728,
      "loss": 8.79,
      "step": 29568,
      "throughput": 12506.84533351148
    },
    {
      "epoch": 0.46395347925924185,
      "grad_norm": 0.07905034720897675,
      "learning_rate": 0.00012831925992143765,
      "loss": 8.7932,
      "step": 29600,
      "throughput": 12506.91298107222
    },
    {
      "epoch": 0.46445505058817077,
      "grad_norm": 0.07559997588396072,
      "learning_rate": 0.00012805549634026882,
      "loss": 8.7792,
      "step": 29632,
      "throughput": 12507.071866990049
    },
    {
      "epoch": 0.4649566219170997,
      "grad_norm": 0.08120205253362656,
      "learning_rate": 0.00012779188511834256,
      "loss": 8.7904,
      "step": 29664,
      "throughput": 12507.233106886928
    },
    {
      "epoch": 0.46545819324602855,
      "grad_norm": 0.08174072206020355,
      "learning_rate": 0.00012752842734279238,
      "loss": 8.7693,
      "step": 29696,
      "throughput": 12507.373090242085
    },
    {
      "epoch": 0.46595976457495747,
      "grad_norm": 0.08241820335388184,
      "learning_rate": 0.0001272651241001189,
      "loss": 8.7826,
      "step": 29728,
      "throughput": 12507.438613262173
    },
    {
      "epoch": 0.4664613359038864,
      "grad_norm": 0.0764114111661911,
      "learning_rate": 0.00012700197647618549,
      "loss": 8.7917,
      "step": 29760,
      "throughput": 12507.574595308435
    },
    {
      "epoch": 0.4669629072328153,
      "grad_norm": 0.0877339169383049,
      "learning_rate": 0.00012673898555621373,
      "loss": 8.7797,
      "step": 29792,
      "throughput": 12507.562236454418
    },
    {
      "epoch": 0.4674644785617442,
      "grad_norm": 0.08090822398662567,
      "learning_rate": 0.00012647615242477887,
      "loss": 8.745,
      "step": 29824,
      "throughput": 12507.588567247858
    },
    {
      "epoch": 0.46796604989067314,
      "grad_norm": 0.07626835256814957,
      "learning_rate": 0.0001262134781658056,
      "loss": 8.7662,
      "step": 29856,
      "throughput": 12507.644819327088
    },
    {
      "epoch": 0.46846762121960206,
      "grad_norm": 0.0867987796664238,
      "learning_rate": 0.00012595096386256336,
      "loss": 8.7677,
      "step": 29888,
      "throughput": 12507.488479913989
    },
    {
      "epoch": 0.4689691925485309,
      "grad_norm": 0.08022241294384003,
      "learning_rate": 0.0001256886105976619,
      "loss": 8.7928,
      "step": 29920,
      "throughput": 12507.475755007914
    },
    {
      "epoch": 0.46947076387745984,
      "grad_norm": 0.0800698921084404,
      "learning_rate": 0.0001254264194530468,
      "loss": 8.7815,
      "step": 29952,
      "throughput": 12507.627904220677
    },
    {
      "epoch": 0.46997233520638876,
      "grad_norm": 0.08612282574176788,
      "learning_rate": 0.00012516439150999525,
      "loss": 8.779,
      "step": 29984,
      "throughput": 12507.827197284922
    },
    {
      "epoch": 0.4704739065353177,
      "grad_norm": 0.07774586230516434,
      "learning_rate": 0.00012490252784911113,
      "loss": 8.7475,
      "step": 30016,
      "throughput": 12507.941832818991
    },
    {
      "epoch": 0.4709754778642466,
      "grad_norm": 0.07533223181962967,
      "learning_rate": 0.000124640829550321,
      "loss": 8.7656,
      "step": 30048,
      "throughput": 12508.084077309297
    },
    {
      "epoch": 0.4714770491931755,
      "grad_norm": 0.08107419312000275,
      "learning_rate": 0.00012437929769286942,
      "loss": 8.7687,
      "step": 30080,
      "throughput": 12508.211097889753
    },
    {
      "epoch": 0.47197862052210443,
      "grad_norm": 0.07990825921297073,
      "learning_rate": 0.0001241179333553146,
      "loss": 8.7837,
      "step": 30112,
      "throughput": 12508.302930498237
    },
    {
      "epoch": 0.4724801918510333,
      "grad_norm": 0.08068916946649551,
      "learning_rate": 0.00012385673761552374,
      "loss": 8.7643,
      "step": 30144,
      "throughput": 12508.208036804326
    },
    {
      "epoch": 0.4729817631799622,
      "grad_norm": 0.08091454207897186,
      "learning_rate": 0.00012359571155066894,
      "loss": 8.7732,
      "step": 30176,
      "throughput": 12508.296185243455
    },
    {
      "epoch": 0.47348333450889113,
      "grad_norm": 0.17354527115821838,
      "learning_rate": 0.00012333485623722238,
      "loss": 8.7897,
      "step": 30208,
      "throughput": 12508.14258921723
    },
    {
      "epoch": 0.47398490583782005,
      "grad_norm": 0.08511584997177124,
      "learning_rate": 0.00012307417275095222,
      "loss": 8.7677,
      "step": 30240,
      "throughput": 12508.115507616394
    },
    {
      "epoch": 0.47448647716674897,
      "grad_norm": 0.07827948778867722,
      "learning_rate": 0.00012281366216691786,
      "loss": 8.7539,
      "step": 30272,
      "throughput": 12508.305558338443
    },
    {
      "epoch": 0.4749880484956779,
      "grad_norm": 0.07761247456073761,
      "learning_rate": 0.00012255332555946582,
      "loss": 8.7526,
      "step": 30304,
      "throughput": 12508.455505946777
    },
    {
      "epoch": 0.4754896198246068,
      "grad_norm": 0.08157996088266373,
      "learning_rate": 0.00012229316400222493,
      "loss": 8.7795,
      "step": 30336,
      "throughput": 12508.555800396765
    },
    {
      "epoch": 0.47599119115353566,
      "grad_norm": 0.07903724908828735,
      "learning_rate": 0.00012203317856810232,
      "loss": 8.7757,
      "step": 30368,
      "throughput": 12508.690458329196
    },
    {
      "epoch": 0.4764927624824646,
      "grad_norm": 0.08179880678653717,
      "learning_rate": 0.0001217733703292786,
      "loss": 8.7522,
      "step": 30400,
      "throughput": 12508.814542608767
    },
    {
      "epoch": 0.4769943338113935,
      "grad_norm": 0.07757299393415451,
      "learning_rate": 0.0001215137403572038,
      "loss": 8.7659,
      "step": 30432,
      "throughput": 12508.907694365285
    },
    {
      "epoch": 0.4774959051403224,
      "grad_norm": 0.0800146833062172,
      "learning_rate": 0.00012125428972259264,
      "loss": 8.7598,
      "step": 30464,
      "throughput": 12508.851198702956
    },
    {
      "epoch": 0.47799747646925134,
      "grad_norm": 0.07752782106399536,
      "learning_rate": 0.0001209950194954203,
      "loss": 8.7707,
      "step": 30496,
      "throughput": 12508.925468249761
    },
    {
      "epoch": 0.47849904779818025,
      "grad_norm": 0.08032878488302231,
      "learning_rate": 0.00012073593074491802,
      "loss": 8.7868,
      "step": 30528,
      "throughput": 12508.849551927575
    },
    {
      "epoch": 0.4790006191271092,
      "grad_norm": 0.0815482884645462,
      "learning_rate": 0.0001204770245395685,
      "loss": 8.7762,
      "step": 30560,
      "throughput": 12508.749956563777
    },
    {
      "epoch": 0.47950219045603804,
      "grad_norm": 0.07393264025449753,
      "learning_rate": 0.00012021830194710178,
      "loss": 8.7618,
      "step": 30592,
      "throughput": 12508.921022993149
    },
    {
      "epoch": 0.48000376178496695,
      "grad_norm": 0.09324093163013458,
      "learning_rate": 0.00011995976403449054,
      "loss": 8.7621,
      "step": 30624,
      "throughput": 12509.087942919288
    },
    {
      "epoch": 0.48050533311389587,
      "grad_norm": 0.08318941295146942,
      "learning_rate": 0.00011970141186794592,
      "loss": 8.7732,
      "step": 30656,
      "throughput": 12509.245774551542
    },
    {
      "epoch": 0.4810069044428248,
      "grad_norm": 0.08678317815065384,
      "learning_rate": 0.00011944324651291299,
      "loss": 8.7464,
      "step": 30688,
      "throughput": 12509.332430369472
    },
    {
      "epoch": 0.4815084757717537,
      "grad_norm": 0.07876382023096085,
      "learning_rate": 0.00011918526903406647,
      "loss": 8.7479,
      "step": 30720,
      "throughput": 12509.453968965496
    },
    {
      "epoch": 0.4820100471006826,
      "grad_norm": 0.08849357813596725,
      "learning_rate": 0.0001189274804953063,
      "loss": 8.7674,
      "step": 30752,
      "throughput": 12508.958433858608
    },
    {
      "epoch": 0.48251161842961154,
      "grad_norm": 0.07928457856178284,
      "learning_rate": 0.00011866988195975307,
      "loss": 8.7614,
      "step": 30784,
      "throughput": 12508.846489232832
    },
    {
      "epoch": 0.4830131897585404,
      "grad_norm": 0.12978075444698334,
      "learning_rate": 0.00011841247448974398,
      "loss": 8.7638,
      "step": 30816,
      "throughput": 12508.88239726129
    },
    {
      "epoch": 0.4835147610874693,
      "grad_norm": 0.07514094561338425,
      "learning_rate": 0.00011815525914682817,
      "loss": 8.7428,
      "step": 30848,
      "throughput": 12508.814152962192
    },
    {
      "epoch": 0.48401633241639824,
      "grad_norm": 0.08580697327852249,
      "learning_rate": 0.00011789823699176249,
      "loss": 8.7646,
      "step": 30880,
      "throughput": 12508.78627396142
    },
    {
      "epoch": 0.48451790374532716,
      "grad_norm": 0.08245430886745453,
      "learning_rate": 0.00011764140908450703,
      "loss": 8.753,
      "step": 30912,
      "throughput": 12508.808191222613
    },
    {
      "epoch": 0.4850194750742561,
      "grad_norm": 0.07886088639497757,
      "learning_rate": 0.0001173847764842209,
      "loss": 8.7662,
      "step": 30944,
      "throughput": 12508.962621729643
    },
    {
      "epoch": 0.485521046403185,
      "grad_norm": 0.07890153676271439,
      "learning_rate": 0.00011712834024925766,
      "loss": 8.7624,
      "step": 30976,
      "throughput": 12509.141366691623
    },
    {
      "epoch": 0.4860226177321139,
      "grad_norm": 0.08036927133798599,
      "learning_rate": 0.00011687210143716116,
      "loss": 8.7479,
      "step": 31008,
      "throughput": 12509.238001841131
    },
    {
      "epoch": 0.4865241890610428,
      "grad_norm": 0.09068219363689423,
      "learning_rate": 0.00011661606110466095,
      "loss": 8.7535,
      "step": 31040,
      "throughput": 12509.368151742632
    },
    {
      "epoch": 0.4870257603899717,
      "grad_norm": 0.0785689428448677,
      "learning_rate": 0.00011636022030766818,
      "loss": 8.7651,
      "step": 31072,
      "throughput": 12509.487875607168
    },
    {
      "epoch": 0.4875273317189006,
      "grad_norm": 0.08110499382019043,
      "learning_rate": 0.00011610458010127093,
      "loss": 8.7536,
      "step": 31104,
      "throughput": 12509.537437094328
    },
    {
      "epoch": 0.48802890304782953,
      "grad_norm": 0.07937299460172653,
      "learning_rate": 0.00011584914153973036,
      "loss": 8.7736,
      "step": 31136,
      "throughput": 12509.438853906258
    },
    {
      "epoch": 0.48853047437675845,
      "grad_norm": 0.08056695759296417,
      "learning_rate": 0.00011559390567647571,
      "loss": 8.7492,
      "step": 31168,
      "throughput": 12509.480462998552
    },
    {
      "epoch": 0.48903204570568737,
      "grad_norm": 0.08015215396881104,
      "learning_rate": 0.00011533887356410052,
      "loss": 8.7586,
      "step": 31200,
      "throughput": 12509.36621670406
    },
    {
      "epoch": 0.4895336170346163,
      "grad_norm": 0.08530990779399872,
      "learning_rate": 0.00011508404625435791,
      "loss": 8.7608,
      "step": 31232,
      "throughput": 12509.389685983637
    },
    {
      "epoch": 0.49003518836354515,
      "grad_norm": 0.08087247610092163,
      "learning_rate": 0.00011482942479815651,
      "loss": 8.7399,
      "step": 31264,
      "throughput": 12509.533223065642
    },
    {
      "epoch": 0.49053675969247407,
      "grad_norm": 0.08212879300117493,
      "learning_rate": 0.00011457501024555593,
      "loss": 8.7591,
      "step": 31296,
      "throughput": 12509.71621584718
    },
    {
      "epoch": 0.491038331021403,
      "grad_norm": 0.07677069306373596,
      "learning_rate": 0.00011432080364576256,
      "loss": 8.7386,
      "step": 31328,
      "throughput": 12509.86786846776
    },
    {
      "epoch": 0.4915399023503319,
      "grad_norm": 0.08998126536607742,
      "learning_rate": 0.00011406680604712517,
      "loss": 8.7581,
      "step": 31360,
      "throughput": 12509.947022465163
    },
    {
      "epoch": 0.4920414736792608,
      "grad_norm": 0.07648464292287827,
      "learning_rate": 0.00011381301849713059,
      "loss": 8.7573,
      "step": 31392,
      "throughput": 12510.063359933165
    },
    {
      "epoch": 0.49254304500818974,
      "grad_norm": 0.080910325050354,
      "learning_rate": 0.00011355944204239944,
      "loss": 8.7566,
      "step": 31424,
      "throughput": 12510.18732631582
    },
    {
      "epoch": 0.4930446163371186,
      "grad_norm": 0.0793800875544548,
      "learning_rate": 0.0001133060777286818,
      "loss": 8.7457,
      "step": 31456,
      "throughput": 12510.05788704578
    },
    {
      "epoch": 0.4935461876660475,
      "grad_norm": 0.08065943419933319,
      "learning_rate": 0.00011305292660085278,
      "loss": 8.739,
      "step": 31488,
      "throughput": 12510.145206585308
    },
    {
      "epoch": 0.49404775899497644,
      "grad_norm": 0.08295351266860962,
      "learning_rate": 0.00011279998970290844,
      "loss": 8.7673,
      "step": 31520,
      "throughput": 12510.0537774648
    },
    {
      "epoch": 0.49454933032390536,
      "grad_norm": 0.08329456299543381,
      "learning_rate": 0.0001125472680779613,
      "loss": 8.7515,
      "step": 31552,
      "throughput": 12510.008718802392
    },
    {
      "epoch": 0.4950509016528343,
      "grad_norm": 0.08850586414337158,
      "learning_rate": 0.00011229476276823608,
      "loss": 8.7395,
      "step": 31584,
      "throughput": 12510.16297776667
    },
    {
      "epoch": 0.4955524729817632,
      "grad_norm": 0.08080103248357773,
      "learning_rate": 0.00011204247481506535,
      "loss": 8.7392,
      "step": 31616,
      "throughput": 12510.3125035182
    },
    {
      "epoch": 0.4960540443106921,
      "grad_norm": 0.08016198873519897,
      "learning_rate": 0.00011179040525888552,
      "loss": 8.7455,
      "step": 31648,
      "throughput": 12510.463262300123
    },
    {
      "epoch": 0.496555615639621,
      "grad_norm": 0.0868767723441124,
      "learning_rate": 0.00011153855513923207,
      "loss": 8.7339,
      "step": 31680,
      "throughput": 12510.564056947635
    },
    {
      "epoch": 0.4970571869685499,
      "grad_norm": 0.08294384926557541,
      "learning_rate": 0.00011128692549473568,
      "loss": 8.7557,
      "step": 31712,
      "throughput": 12510.745317389034
    },
    {
      "epoch": 0.4975587582974788,
      "grad_norm": 0.10098681598901749,
      "learning_rate": 0.00011103551736311777,
      "loss": 8.7379,
      "step": 31744,
      "throughput": 12510.86022010489
    },
    {
      "epoch": 0.4980603296264077,
      "grad_norm": 0.08104648441076279,
      "learning_rate": 0.0001107843317811862,
      "loss": 8.7329,
      "step": 31776,
      "throughput": 12510.681606577396
    },
    {
      "epoch": 0.49856190095533665,
      "grad_norm": 0.08391673862934113,
      "learning_rate": 0.00011053336978483102,
      "loss": 8.7606,
      "step": 31808,
      "throughput": 12510.734798121777
    },
    {
      "epoch": 0.49906347228426556,
      "grad_norm": 0.08090377599000931,
      "learning_rate": 0.00011028263240902033,
      "loss": 8.7285,
      "step": 31840,
      "throughput": 12510.676006699603
    },
    {
      "epoch": 0.4995650436131945,
      "grad_norm": 0.08488581329584122,
      "learning_rate": 0.0001100321206877957,
      "loss": 8.7327,
      "step": 31872,
      "throughput": 12510.641308355624
    },
    {
      "epoch": 0.5000666149421233,
      "grad_norm": 0.07887663692235947,
      "learning_rate": 0.00010978183565426832,
      "loss": 8.7461,
      "step": 31904,
      "throughput": 12510.75024742261
    },
    {
      "epoch": 0.5005681862710523,
      "grad_norm": 0.0869184285402298,
      "learning_rate": 0.00010953177834061435,
      "loss": 8.7597,
      "step": 31936,
      "throughput": 12510.895384025353
    },
    {
      "epoch": 0.5010697575999812,
      "grad_norm": 0.08527307957410812,
      "learning_rate": 0.00010928194977807091,
      "loss": 8.7394,
      "step": 31968,
      "throughput": 12511.070523100629
    },
    {
      "epoch": 0.5015713289289101,
      "grad_norm": 0.08460355550050735,
      "learning_rate": 0.00010903235099693174,
      "loss": 8.7285,
      "step": 32000,
      "throughput": 12511.11540023717
    },
    {
      "epoch": 0.502072900257839,
      "grad_norm": 0.09160695970058441,
      "learning_rate": 0.00010878298302654294,
      "loss": 8.7587,
      "step": 32032,
      "throughput": 12511.295692044801
    },
    {
      "epoch": 0.5025744715867679,
      "grad_norm": 0.081563301384449,
      "learning_rate": 0.00010853384689529873,
      "loss": 8.7453,
      "step": 32064,
      "throughput": 12511.40980242315
    },
    {
      "epoch": 0.5030760429156969,
      "grad_norm": 0.08575332164764404,
      "learning_rate": 0.00010828494363063732,
      "loss": 8.7427,
      "step": 32096,
      "throughput": 12511.381388249676
    },
    {
      "epoch": 0.5035776142446258,
      "grad_norm": 0.08162245899438858,
      "learning_rate": 0.0001080362742590364,
      "loss": 8.7537,
      "step": 32128,
      "throughput": 12511.317804481178
    },
    {
      "epoch": 0.5040791855735547,
      "grad_norm": 0.0902937799692154,
      "learning_rate": 0.00010778783980600939,
      "loss": 8.7594,
      "step": 32160,
      "throughput": 12511.351734559135
    },
    {
      "epoch": 0.5045807569024836,
      "grad_norm": 0.08423754572868347,
      "learning_rate": 0.00010753964129610052,
      "loss": 8.743,
      "step": 32192,
      "throughput": 12511.245678204701
    },
    {
      "epoch": 0.5050823282314125,
      "grad_norm": 0.08069079369306564,
      "learning_rate": 0.00010729167975288122,
      "loss": 8.7456,
      "step": 32224,
      "throughput": 12511.309004588747
    },
    {
      "epoch": 0.5055838995603413,
      "grad_norm": 0.07970630377531052,
      "learning_rate": 0.0001070439561989457,
      "loss": 8.7515,
      "step": 32256,
      "throughput": 12511.464278292919
    },
    {
      "epoch": 0.5060854708892703,
      "grad_norm": 0.08481299132108688,
      "learning_rate": 0.00010679647165590659,
      "loss": 8.7306,
      "step": 32288,
      "throughput": 12511.645110625795
    },
    {
      "epoch": 0.5065870422181992,
      "grad_norm": 0.08320998400449753,
      "learning_rate": 0.00010654922714439083,
      "loss": 8.7316,
      "step": 32320,
      "throughput": 12511.732213907615
    },
    {
      "epoch": 0.5070886135471281,
      "grad_norm": 0.08566644042730331,
      "learning_rate": 0.00010630222368403561,
      "loss": 8.7223,
      "step": 32352,
      "throughput": 12511.857384326633
    },
    {
      "epoch": 0.507590184876057,
      "grad_norm": 0.0807805135846138,
      "learning_rate": 0.00010605546229348396,
      "loss": 8.7497,
      "step": 32384,
      "throughput": 12511.968185963713
    },
    {
      "epoch": 0.5080917562049859,
      "grad_norm": 0.08053547143936157,
      "learning_rate": 0.00010580894399038044,
      "loss": 8.7466,
      "step": 32416,
      "throughput": 12512.034265205712
    },
    {
      "epoch": 0.5085933275339148,
      "grad_norm": 0.12142524868249893,
      "learning_rate": 0.00010556266979136734,
      "loss": 8.73,
      "step": 32448,
      "throughput": 12511.94393793066
    },
    {
      "epoch": 0.5090948988628438,
      "grad_norm": 0.0837826207280159,
      "learning_rate": 0.00010531664071208019,
      "loss": 8.733,
      "step": 32480,
      "throughput": 12512.009493397778
    },
    {
      "epoch": 0.5095964701917727,
      "grad_norm": 0.08382360637187958,
      "learning_rate": 0.00010507085776714369,
      "loss": 8.7224,
      "step": 32512,
      "throughput": 12511.999355929676
    },
    {
      "epoch": 0.5100980415207016,
      "grad_norm": 0.0817563384771347,
      "learning_rate": 0.00010482532197016732,
      "loss": 8.7403,
      "step": 32544,
      "throughput": 12511.856327004498
    },
    {
      "epoch": 0.5105996128496305,
      "grad_norm": 0.08128924667835236,
      "learning_rate": 0.00010458003433374152,
      "loss": 8.7322,
      "step": 32576,
      "throughput": 12511.991218821682
    },
    {
      "epoch": 0.5111011841785594,
      "grad_norm": 0.08386674523353577,
      "learning_rate": 0.00010433499586943319,
      "loss": 8.7419,
      "step": 32608,
      "throughput": 12512.170004084111
    },
    {
      "epoch": 0.5116027555074883,
      "grad_norm": 0.07870669662952423,
      "learning_rate": 0.00010409020758778178,
      "loss": 8.7404,
      "step": 32640,
      "throughput": 12512.34686175348
    },
    {
      "epoch": 0.5121043268364173,
      "grad_norm": 0.0825226828455925,
      "learning_rate": 0.00010384567049829474,
      "loss": 8.7298,
      "step": 32672,
      "throughput": 12512.38983596489
    },
    {
      "epoch": 0.5126058981653461,
      "grad_norm": 0.08137047290802002,
      "learning_rate": 0.00010360138560944379,
      "loss": 8.7255,
      "step": 32704,
      "throughput": 12512.564533810659
    },
    {
      "epoch": 0.513107469494275,
      "grad_norm": 0.08141603320837021,
      "learning_rate": 0.00010335735392866061,
      "loss": 8.7222,
      "step": 32736,
      "throughput": 12512.62880948394
    },
    {
      "epoch": 0.5136090408232039,
      "grad_norm": 0.08862655609846115,
      "learning_rate": 0.00010311357646233255,
      "loss": 8.7387,
      "step": 32768,
      "throughput": 12512.50488243399
    },
    {
      "epoch": 0.5141106121521328,
      "grad_norm": 0.08207987248897552,
      "learning_rate": 0.00010287005421579854,
      "loss": 8.7515,
      "step": 32800,
      "throughput": 12511.983187127891
    },
    {
      "epoch": 0.5146121834810617,
      "grad_norm": 0.07818058878183365,
      "learning_rate": 0.00010262678819334511,
      "loss": 8.7303,
      "step": 32832,
      "throughput": 12511.901100086807
    },
    {
      "epoch": 0.5151137548099907,
      "grad_norm": 0.08296819031238556,
      "learning_rate": 0.00010238377939820202,
      "loss": 8.7355,
      "step": 32864,
      "throughput": 12511.843185395246
    },
    {
      "epoch": 0.5156153261389196,
      "grad_norm": 0.09073803573846817,
      "learning_rate": 0.00010214102883253832,
      "loss": 8.7332,
      "step": 32896,
      "throughput": 12511.953352864462
    },
    {
      "epoch": 0.5161168974678485,
      "grad_norm": 0.32719534635543823,
      "learning_rate": 0.00010189853749745799,
      "loss": 8.7188,
      "step": 32928,
      "throughput": 12512.104071246153
    },
    {
      "epoch": 0.5166184687967774,
      "grad_norm": 0.08849098533391953,
      "learning_rate": 0.00010165630639299606,
      "loss": 8.7225,
      "step": 32960,
      "throughput": 12512.283954151511
    },
    {
      "epoch": 0.5171200401257063,
      "grad_norm": 0.08463660627603531,
      "learning_rate": 0.00010141433651811429,
      "loss": 8.7263,
      "step": 32992,
      "throughput": 12512.360463779769
    },
    {
      "epoch": 0.5176216114546353,
      "grad_norm": 0.09417407959699631,
      "learning_rate": 0.00010117262887069724,
      "loss": 8.7347,
      "step": 33024,
      "throughput": 12512.487421895641
    },
    {
      "epoch": 0.5181231827835642,
      "grad_norm": 0.07990865409374237,
      "learning_rate": 0.00010093118444754784,
      "loss": 8.7374,
      "step": 33056,
      "throughput": 12512.604844924646
    },
    {
      "epoch": 0.5186247541124931,
      "grad_norm": 0.1536962240934372,
      "learning_rate": 0.0001006900042443837,
      "loss": 8.7055,
      "step": 33088,
      "throughput": 12512.54815890842
    },
    {
      "epoch": 0.519126325441422,
      "grad_norm": 0.09293182939291,
      "learning_rate": 0.00010044908925583264,
      "loss": 8.7433,
      "step": 33120,
      "throughput": 12512.486599010308
    },
    {
      "epoch": 0.5196278967703508,
      "grad_norm": 0.08393841981887817,
      "learning_rate": 0.00010020844047542886,
      "loss": 8.7175,
      "step": 33152,
      "throughput": 12512.50869110717
    },
    {
      "epoch": 0.5201294680992797,
      "grad_norm": 0.0840592086315155,
      "learning_rate": 9.996805889560857e-05,
      "loss": 8.7241,
      "step": 33184,
      "throughput": 12512.387015564022
    },
    {
      "epoch": 0.5206310394282087,
      "grad_norm": 0.08570297062397003,
      "learning_rate": 9.972794550770612e-05,
      "loss": 8.7176,
      "step": 33216,
      "throughput": 12512.459075590425
    },
    {
      "epoch": 0.5211326107571376,
      "grad_norm": 0.08551673591136932,
      "learning_rate": 9.948810130194984e-05,
      "loss": 8.7408,
      "step": 33248,
      "throughput": 12512.593138858181
    },
    {
      "epoch": 0.5216341820860665,
      "grad_norm": 0.08799657970666885,
      "learning_rate": 9.924852726745807e-05,
      "loss": 8.7136,
      "step": 33280,
      "throughput": 12512.76581620304
    },
    {
      "epoch": 0.5221357534149954,
      "grad_norm": 0.08384101092815399,
      "learning_rate": 9.900922439223464e-05,
      "loss": 8.7567,
      "step": 33312,
      "throughput": 12512.863226335372
    },
    {
      "epoch": 0.5226373247439243,
      "grad_norm": 0.08380109071731567,
      "learning_rate": 9.877019366316541e-05,
      "loss": 8.7051,
      "step": 33344,
      "throughput": 12512.982174536366
    },
    {
      "epoch": 0.5231388960728532,
      "grad_norm": 0.08169150352478027,
      "learning_rate": 9.85314360660138e-05,
      "loss": 8.7099,
      "step": 33376,
      "throughput": 12513.085254147247
    },
    {
      "epoch": 0.5236404674017822,
      "grad_norm": 0.08903324604034424,
      "learning_rate": 9.829295258541692e-05,
      "loss": 8.7207,
      "step": 33408,
      "throughput": 12513.046439956352
    },
    {
      "epoch": 0.5241420387307111,
      "grad_norm": 0.08354644477367401,
      "learning_rate": 9.805474420488123e-05,
      "loss": 8.742,
      "step": 33440,
      "throughput": 12512.953627898272
    },
    {
      "epoch": 0.52464361005964,
      "grad_norm": 0.07761716097593307,
      "learning_rate": 9.78168119067789e-05,
      "loss": 8.7388,
      "step": 33472,
      "throughput": 12512.971486634364
    },
    {
      "epoch": 0.5251451813885689,
      "grad_norm": 0.08411026000976562,
      "learning_rate": 9.757915667234339e-05,
      "loss": 8.7373,
      "step": 33504,
      "throughput": 12512.91443184655
    },
    {
      "epoch": 0.5256467527174978,
      "grad_norm": 0.08170992136001587,
      "learning_rate": 9.734177948166558e-05,
      "loss": 8.7299,
      "step": 33536,
      "throughput": 12512.913718463229
    },
    {
      "epoch": 0.5261483240464266,
      "grad_norm": 0.11816666275262833,
      "learning_rate": 9.710468131368968e-05,
      "loss": 8.6938,
      "step": 33568,
      "throughput": 12513.025537345375
    },
    {
      "epoch": 0.5266498953753556,
      "grad_norm": 0.08636017143726349,
      "learning_rate": 9.68678631462093e-05,
      "loss": 8.7373,
      "step": 33600,
      "throughput": 12513.19843227255
    },
    {
      "epoch": 0.5271514667042845,
      "grad_norm": 0.07713182270526886,
      "learning_rate": 9.66313259558633e-05,
      "loss": 8.7255,
      "step": 33632,
      "throughput": 12513.317219699162
    },
    {
      "epoch": 0.5276530380332134,
      "grad_norm": 0.09151088446378708,
      "learning_rate": 9.639507071813166e-05,
      "loss": 8.6943,
      "step": 33664,
      "throughput": 12513.411913962553
    },
    {
      "epoch": 0.5281546093621423,
      "grad_norm": 0.08120939135551453,
      "learning_rate": 9.615909840733167e-05,
      "loss": 8.7223,
      "step": 33696,
      "throughput": 12513.582126916053
    },
    {
      "epoch": 0.5286561806910712,
      "grad_norm": 0.08518539369106293,
      "learning_rate": 9.592340999661393e-05,
      "loss": 8.7208,
      "step": 33728,
      "throughput": 12513.6103713382
    },
    {
      "epoch": 0.5291577520200001,
      "grad_norm": 0.07572302967309952,
      "learning_rate": 9.568800645795812e-05,
      "loss": 8.7463,
      "step": 33760,
      "throughput": 12513.480475179858
    },
    {
      "epoch": 0.5296593233489291,
      "grad_norm": 0.08459645509719849,
      "learning_rate": 9.545288876216901e-05,
      "loss": 8.714,
      "step": 33792,
      "throughput": 12513.512840943906
    },
    {
      "epoch": 0.530160894677858,
      "grad_norm": 0.08286039531230927,
      "learning_rate": 9.521805787887285e-05,
      "loss": 8.7092,
      "step": 33824,
      "throughput": 12513.469277726148
    },
    {
      "epoch": 0.5306624660067869,
      "grad_norm": 0.08557367324829102,
      "learning_rate": 9.498351477651286e-05,
      "loss": 8.7235,
      "step": 33856,
      "throughput": 12513.451216054991
    },
    {
      "epoch": 0.5311640373357158,
      "grad_norm": 0.0787692740559578,
      "learning_rate": 9.47492604223454e-05,
      "loss": 8.7063,
      "step": 33888,
      "throughput": 12513.513260723728
    },
    {
      "epoch": 0.5316656086646447,
      "grad_norm": 0.08846087753772736,
      "learning_rate": 9.451529578243618e-05,
      "loss": 8.7176,
      "step": 33920,
      "throughput": 12513.670563059231
    },
    {
      "epoch": 0.5321671799935737,
      "grad_norm": 0.08972983062267303,
      "learning_rate": 9.428162182165607e-05,
      "loss": 8.7047,
      "step": 33952,
      "throughput": 12513.786490143648
    },
    {
      "epoch": 0.5326687513225026,
      "grad_norm": 0.07745558023452759,
      "learning_rate": 9.40482395036772e-05,
      "loss": 8.7144,
      "step": 33984,
      "throughput": 12513.919700520197
    },
    {
      "epoch": 0.5331703226514314,
      "grad_norm": 0.07799308747053146,
      "learning_rate": 9.381514979096888e-05,
      "loss": 8.6938,
      "step": 34016,
      "throughput": 12514.037489764116
    },
    {
      "epoch": 0.5336718939803603,
      "grad_norm": 0.08440552651882172,
      "learning_rate": 9.35823536447938e-05,
      "loss": 8.7255,
      "step": 34048,
      "throughput": 12514.114761239924
    },
    {
      "epoch": 0.5341734653092892,
      "grad_norm": 0.07518921047449112,
      "learning_rate": 9.334985202520395e-05,
      "loss": 8.6927,
      "step": 34080,
      "throughput": 12514.078383131604
    },
    {
      "epoch": 0.5346750366382181,
      "grad_norm": 0.0790555477142334,
      "learning_rate": 9.311764589103679e-05,
      "loss": 8.7241,
      "step": 34112,
      "throughput": 12513.982049437827
    },
    {
      "epoch": 0.5351766079671471,
      "grad_norm": 0.08127565681934357,
      "learning_rate": 9.288573619991096e-05,
      "loss": 8.7226,
      "step": 34144,
      "throughput": 12514.01556900709
    },
    {
      "epoch": 0.535678179296076,
      "grad_norm": 0.07951905578374863,
      "learning_rate": 9.265412390822278e-05,
      "loss": 8.7273,
      "step": 34176,
      "throughput": 12513.949064899492
    },
    {
      "epoch": 0.5361797506250049,
      "grad_norm": 0.08190485835075378,
      "learning_rate": 9.242280997114204e-05,
      "loss": 8.6994,
      "step": 34208,
      "throughput": 12514.017668664614
    },
    {
      "epoch": 0.5366813219539338,
      "grad_norm": 0.0807257890701294,
      "learning_rate": 9.219179534260811e-05,
      "loss": 8.705,
      "step": 34240,
      "throughput": 12514.14673158072
    },
    {
      "epoch": 0.5371828932828627,
      "grad_norm": 0.08508472889661789,
      "learning_rate": 9.196108097532597e-05,
      "loss": 8.7023,
      "step": 34272,
      "throughput": 12514.308283370869
    },
    {
      "epoch": 0.5376844646117916,
      "grad_norm": 0.08585110306739807,
      "learning_rate": 9.173066782076236e-05,
      "loss": 8.7118,
      "step": 34304,
      "throughput": 12514.387570121344
    },
    {
      "epoch": 0.5381860359407206,
      "grad_norm": 0.0836232453584671,
      "learning_rate": 9.15005568291418e-05,
      "loss": 8.6902,
      "step": 34336,
      "throughput": 12514.502622322905
    },
    {
      "epoch": 0.5386876072696495,
      "grad_norm": 0.08131017535924911,
      "learning_rate": 9.12707489494428e-05,
      "loss": 8.682,
      "step": 34368,
      "throughput": 12514.60707528309
    },
    {
      "epoch": 0.5391891785985784,
      "grad_norm": 0.08063245564699173,
      "learning_rate": 9.104124512939357e-05,
      "loss": 8.7298,
      "step": 34400,
      "throughput": 12514.569249700598
    },
    {
      "epoch": 0.5396907499275073,
      "grad_norm": 0.08101888746023178,
      "learning_rate": 9.081204631546867e-05,
      "loss": 8.7014,
      "step": 34432,
      "throughput": 12514.477715518917
    },
    {
      "epoch": 0.5401923212564361,
      "grad_norm": 0.08040083944797516,
      "learning_rate": 9.058315345288465e-05,
      "loss": 8.687,
      "step": 34464,
      "throughput": 12514.475338292768
    },
    {
      "epoch": 0.540693892585365,
      "grad_norm": 0.08651674538850784,
      "learning_rate": 9.035456748559639e-05,
      "loss": 8.729,
      "step": 34496,
      "throughput": 12514.422458791822
    },
    {
      "epoch": 0.541195463914294,
      "grad_norm": 0.08819051086902618,
      "learning_rate": 9.012628935629299e-05,
      "loss": 8.6979,
      "step": 34528,
      "throughput": 12514.461501678546
    },
    {
      "epoch": 0.5416970352432229,
      "grad_norm": 0.0806623175740242,
      "learning_rate": 8.989832000639424e-05,
      "loss": 8.6905,
      "step": 34560,
      "throughput": 12514.552034181705
    },
    {
      "epoch": 0.5421986065721518,
      "grad_norm": 0.07820701599121094,
      "learning_rate": 8.967066037604637e-05,
      "loss": 8.7115,
      "step": 34592,
      "throughput": 12514.716440195212
    },
    {
      "epoch": 0.5427001779010807,
      "grad_norm": 0.08373693376779556,
      "learning_rate": 8.944331140411841e-05,
      "loss": 8.7161,
      "step": 34624,
      "throughput": 12514.836142421645
    },
    {
      "epoch": 0.5432017492300096,
      "grad_norm": 0.08561219274997711,
      "learning_rate": 8.921627402819813e-05,
      "loss": 8.7024,
      "step": 34656,
      "throughput": 12514.960254163809
    },
    {
      "epoch": 0.5437033205589386,
      "grad_norm": 0.08857572823762894,
      "learning_rate": 8.898954918458835e-05,
      "loss": 8.7119,
      "step": 34688,
      "throughput": 12515.078179186417
    },
    {
      "epoch": 0.5442048918878675,
      "grad_norm": 0.08995655179023743,
      "learning_rate": 8.876313780830305e-05,
      "loss": 8.723,
      "step": 34720,
      "throughput": 12515.100646360597
    },
    {
      "epoch": 0.5447064632167964,
      "grad_norm": 0.08579035848379135,
      "learning_rate": 8.853704083306341e-05,
      "loss": 8.7166,
      "step": 34752,
      "throughput": 12514.98232968835
    },
    {
      "epoch": 0.5452080345457253,
      "grad_norm": 0.08334320038557053,
      "learning_rate": 8.831125919129397e-05,
      "loss": 8.71,
      "step": 34784,
      "throughput": 12515.002708526768
    },
    {
      "epoch": 0.5457096058746542,
      "grad_norm": 0.08186853677034378,
      "learning_rate": 8.808579381411892e-05,
      "loss": 8.7085,
      "step": 34816,
      "throughput": 12515.016993182575
    },
    {
      "epoch": 0.5462111772035831,
      "grad_norm": 0.08742789179086685,
      "learning_rate": 8.786064563135815e-05,
      "loss": 8.7129,
      "step": 34848,
      "throughput": 12514.411341164006
    },
    {
      "epoch": 0.5467127485325121,
      "grad_norm": 0.09223673492670059,
      "learning_rate": 8.763581557152348e-05,
      "loss": 8.7098,
      "step": 34880,
      "throughput": 12514.476674435435
    },
    {
      "epoch": 0.5472143198614409,
      "grad_norm": 0.08679956942796707,
      "learning_rate": 8.741130456181463e-05,
      "loss": 8.7003,
      "step": 34912,
      "throughput": 12514.636751601753
    },
    {
      "epoch": 0.5477158911903698,
      "grad_norm": 0.0794166699051857,
      "learning_rate": 8.718711352811573e-05,
      "loss": 8.7058,
      "step": 34944,
      "throughput": 12514.743625683801
    },
    {
      "epoch": 0.5482174625192987,
      "grad_norm": 0.07620302587747574,
      "learning_rate": 8.696324339499135e-05,
      "loss": 8.7075,
      "step": 34976,
      "throughput": 12514.867235682797
    },
    {
      "epoch": 0.5487190338482276,
      "grad_norm": 0.09295224398374557,
      "learning_rate": 8.673969508568242e-05,
      "loss": 8.6821,
      "step": 35008,
      "throughput": 12514.977725954972
    },
    {
      "epoch": 0.5492206051771565,
      "grad_norm": 0.07569506764411926,
      "learning_rate": 8.651646952210293e-05,
      "loss": 8.7031,
      "step": 35040,
      "throughput": 12514.95672239073
    },
    {
      "epoch": 0.5497221765060855,
      "grad_norm": 0.08172563463449478,
      "learning_rate": 8.629356762483573e-05,
      "loss": 8.7053,
      "step": 35072,
      "throughput": 12514.972298338476
    },
    {
      "epoch": 0.5502237478350144,
      "grad_norm": 0.08027073740959167,
      "learning_rate": 8.607099031312901e-05,
      "loss": 8.6982,
      "step": 35104,
      "throughput": 12514.91865195136
    },
    {
      "epoch": 0.5507253191639433,
      "grad_norm": 0.08717502653598785,
      "learning_rate": 8.58487385048921e-05,
      "loss": 8.7226,
      "step": 35136,
      "throughput": 12514.937656725599
    },
    {
      "epoch": 0.5512268904928722,
      "grad_norm": 0.10650716722011566,
      "learning_rate": 8.562681311669218e-05,
      "loss": 8.7122,
      "step": 35168,
      "throughput": 12514.89458864502
    },
    {
      "epoch": 0.5517284618218011,
      "grad_norm": 0.07669194042682648,
      "learning_rate": 8.540521506375026e-05,
      "loss": 8.7039,
      "step": 35200,
      "throughput": 12514.952425684478
    },
    {
      "epoch": 0.55223003315073,
      "grad_norm": 0.08644834160804749,
      "learning_rate": 8.518394525993734e-05,
      "loss": 8.6973,
      "step": 35232,
      "throughput": 12515.091657244611
    },
    {
      "epoch": 0.552731604479659,
      "grad_norm": 0.08324025571346283,
      "learning_rate": 8.496300461777068e-05,
      "loss": 8.7074,
      "step": 35264,
      "throughput": 12515.20257350424
    },
    {
      "epoch": 0.5532331758085879,
      "grad_norm": 0.08295135200023651,
      "learning_rate": 8.474239404841023e-05,
      "loss": 8.6957,
      "step": 35296,
      "throughput": 12515.360529727743
    },
    {
      "epoch": 0.5537347471375168,
      "grad_norm": 0.0789424255490303,
      "learning_rate": 8.452211446165458e-05,
      "loss": 8.6975,
      "step": 35328,
      "throughput": 12515.486852153686
    },
    {
      "epoch": 0.5542363184664456,
      "grad_norm": 0.08173195272684097,
      "learning_rate": 8.430216676593744e-05,
      "loss": 8.7228,
      "step": 35360,
      "throughput": 12515.514720827478
    },
    {
      "epoch": 0.5547378897953745,
      "grad_norm": 0.08786769211292267,
      "learning_rate": 8.408255186832372e-05,
      "loss": 8.7126,
      "step": 35392,
      "throughput": 12515.545096190892
    },
    {
      "epoch": 0.5552394611243034,
      "grad_norm": 0.09330630302429199,
      "learning_rate": 8.386327067450593e-05,
      "loss": 8.6834,
      "step": 35424,
      "throughput": 12515.437722737353
    },
    {
      "epoch": 0.5557410324532324,
      "grad_norm": 0.08301142603158951,
      "learning_rate": 8.36443240888004e-05,
      "loss": 8.6979,
      "step": 35456,
      "throughput": 12515.457683968867
    },
    {
      "epoch": 0.5562426037821613,
      "grad_norm": 0.08266784995794296,
      "learning_rate": 8.342571301414342e-05,
      "loss": 8.7118,
      "step": 35488,
      "throughput": 12515.426200839343
    },
    {
      "epoch": 0.5567441751110902,
      "grad_norm": 0.09009415656328201,
      "learning_rate": 8.320743835208775e-05,
      "loss": 8.715,
      "step": 35520,
      "throughput": 12515.464315842382
    },
    {
      "epoch": 0.5572457464400191,
      "grad_norm": 0.08386842161417007,
      "learning_rate": 8.298950100279872e-05,
      "loss": 8.7033,
      "step": 35552,
      "throughput": 12515.580516730502
    },
    {
      "epoch": 0.557747317768948,
      "grad_norm": 0.08837930858135223,
      "learning_rate": 8.27719018650507e-05,
      "loss": 8.7159,
      "step": 35584,
      "throughput": 12515.690077420006
    },
    {
      "epoch": 0.558248889097877,
      "grad_norm": 0.09241513162851334,
      "learning_rate": 8.255464183622304e-05,
      "loss": 8.7163,
      "step": 35616,
      "throughput": 12515.849451780045
    },
    {
      "epoch": 0.5587504604268059,
      "grad_norm": 0.08907388150691986,
      "learning_rate": 8.23377218122968e-05,
      "loss": 8.6996,
      "step": 35648,
      "throughput": 12515.982321961046
    },
    {
      "epoch": 0.5592520317557348,
      "grad_norm": 0.08048530668020248,
      "learning_rate": 8.212114268785083e-05,
      "loss": 8.6889,
      "step": 35680,
      "throughput": 12516.09922219064
    },
    {
      "epoch": 0.5597536030846637,
      "grad_norm": 0.08962694555521011,
      "learning_rate": 8.190490535605809e-05,
      "loss": 8.6696,
      "step": 35712,
      "throughput": 12516.10359288709
    },
    {
      "epoch": 0.5602551744135926,
      "grad_norm": 0.07987916469573975,
      "learning_rate": 8.16890107086819e-05,
      "loss": 8.6858,
      "step": 35744,
      "throughput": 12515.92148666699
    },
    {
      "epoch": 0.5607567457425215,
      "grad_norm": 0.08420739322900772,
      "learning_rate": 8.14734596360725e-05,
      "loss": 8.6962,
      "step": 35776,
      "throughput": 12515.934187064237
    },
    {
      "epoch": 0.5612583170714504,
      "grad_norm": 0.08826903998851776,
      "learning_rate": 8.12582530271631e-05,
      "loss": 8.7113,
      "step": 35808,
      "throughput": 12515.952704779598
    },
    {
      "epoch": 0.5617598884003793,
      "grad_norm": 0.10297546535730362,
      "learning_rate": 8.104339176946648e-05,
      "loss": 8.6696,
      "step": 35840,
      "throughput": 12515.947157806651
    },
    {
      "epoch": 0.5622614597293082,
      "grad_norm": 0.07826440036296844,
      "learning_rate": 8.082887674907099e-05,
      "loss": 8.6888,
      "step": 35872,
      "throughput": 12516.01639726759
    },
    {
      "epoch": 0.5627630310582371,
      "grad_norm": 0.08734133839607239,
      "learning_rate": 8.061470885063726e-05,
      "loss": 8.7059,
      "step": 35904,
      "throughput": 12516.124572831535
    },
    {
      "epoch": 0.563264602387166,
      "grad_norm": 0.09758967161178589,
      "learning_rate": 8.040088895739433e-05,
      "loss": 8.7093,
      "step": 35936,
      "throughput": 12516.278365852624
    },
    {
      "epoch": 0.5637661737160949,
      "grad_norm": 0.0903221070766449,
      "learning_rate": 8.018741795113614e-05,
      "loss": 8.6963,
      "step": 35968,
      "throughput": 12516.40036791016
    },
    {
      "epoch": 0.5642677450450239,
      "grad_norm": 0.0803726464509964,
      "learning_rate": 7.997429671221764e-05,
      "loss": 8.6938,
      "step": 36000,
      "throughput": 12516.506289442148
    },
    {
      "epoch": 0.5647693163739528,
      "grad_norm": 0.08807411044836044,
      "learning_rate": 7.97615261195515e-05,
      "loss": 8.7012,
      "step": 36032,
      "throughput": 12516.495471268532
    },
    {
      "epoch": 0.5652708877028817,
      "grad_norm": 0.08333015441894531,
      "learning_rate": 7.95491070506043e-05,
      "loss": 8.715,
      "step": 36064,
      "throughput": 12516.464464403765
    },
    {
      "epoch": 0.5657724590318106,
      "grad_norm": 0.0851825550198555,
      "learning_rate": 7.933704038139292e-05,
      "loss": 8.6781,
      "step": 36096,
      "throughput": 12516.371951049214
    },
    {
      "epoch": 0.5662740303607395,
      "grad_norm": 0.07540696859359741,
      "learning_rate": 7.912532698648089e-05,
      "loss": 8.684,
      "step": 36128,
      "throughput": 12516.47121087276
    },
    {
      "epoch": 0.5667756016896685,
      "grad_norm": 0.0787714496254921,
      "learning_rate": 7.891396773897487e-05,
      "loss": 8.6759,
      "step": 36160,
      "throughput": 12516.426290858957
    },
    {
      "epoch": 0.5672771730185974,
      "grad_norm": 0.08228734880685806,
      "learning_rate": 7.870296351052104e-05,
      "loss": 8.6708,
      "step": 36192,
      "throughput": 12516.486618277379
    },
    {
      "epoch": 0.5677787443475263,
      "grad_norm": 0.08103416115045547,
      "learning_rate": 7.849231517130151e-05,
      "loss": 8.6864,
      "step": 36224,
      "throughput": 12516.617793702857
    },
    {
      "epoch": 0.5682803156764551,
      "grad_norm": 0.08393923193216324,
      "learning_rate": 7.828202359003058e-05,
      "loss": 8.6839,
      "step": 36256,
      "throughput": 12516.722469381535
    },
    {
      "epoch": 0.568781887005384,
      "grad_norm": 0.07748636603355408,
      "learning_rate": 7.807208963395139e-05,
      "loss": 8.6778,
      "step": 36288,
      "throughput": 12516.875094866176
    },
    {
      "epoch": 0.5692834583343129,
      "grad_norm": 0.08194294571876526,
      "learning_rate": 7.786251416883218e-05,
      "loss": 8.6849,
      "step": 36320,
      "throughput": 12516.998174008146
    },
    {
      "epoch": 0.5697850296632418,
      "grad_norm": 0.08278539031744003,
      "learning_rate": 7.765329805896287e-05,
      "loss": 8.6976,
      "step": 36352,
      "throughput": 12517.033518057026
    },
    {
      "epoch": 0.5702866009921708,
      "grad_norm": 0.07897041738033295,
      "learning_rate": 7.744444216715117e-05,
      "loss": 8.6953,
      "step": 36384,
      "throughput": 12516.974273839218
    },
    {
      "epoch": 0.5707881723210997,
      "grad_norm": 0.08542395383119583,
      "learning_rate": 7.723594735471952e-05,
      "loss": 8.6978,
      "step": 36416,
      "throughput": 12516.858572450183
    },
    {
      "epoch": 0.5712897436500286,
      "grad_norm": 0.07970910519361496,
      "learning_rate": 7.702781448150109e-05,
      "loss": 8.6899,
      "step": 36448,
      "throughput": 12516.874160403917
    },
    {
      "epoch": 0.5717913149789575,
      "grad_norm": 0.0827026516199112,
      "learning_rate": 7.682004440583654e-05,
      "loss": 8.6825,
      "step": 36480,
      "throughput": 12516.862928531891
    },
    {
      "epoch": 0.5722928863078864,
      "grad_norm": 0.07867981493473053,
      "learning_rate": 7.661263798457014e-05,
      "loss": 8.6836,
      "step": 36512,
      "throughput": 12516.91203717116
    },
    {
      "epoch": 0.5727944576368154,
      "grad_norm": 0.08377469331026077,
      "learning_rate": 7.64055960730467e-05,
      "loss": 8.6646,
      "step": 36544,
      "throughput": 12517.001455253561
    },
    {
      "epoch": 0.5732960289657443,
      "grad_norm": 0.09216863662004471,
      "learning_rate": 7.619891952510763e-05,
      "loss": 8.6938,
      "step": 36576,
      "throughput": 12517.107661462249
    },
    {
      "epoch": 0.5737976002946732,
      "grad_norm": 0.08290153741836548,
      "learning_rate": 7.599260919308764e-05,
      "loss": 8.6821,
      "step": 36608,
      "throughput": 12517.260289134058
    },
    {
      "epoch": 0.5742991716236021,
      "grad_norm": 0.09356728941202164,
      "learning_rate": 7.578666592781114e-05,
      "loss": 8.6922,
      "step": 36640,
      "throughput": 12517.389574354975
    },
    {
      "epoch": 0.574800742952531,
      "grad_norm": 0.08148008584976196,
      "learning_rate": 7.558109057858874e-05,
      "loss": 8.6712,
      "step": 36672,
      "throughput": 12517.430869809043
    },
    {
      "epoch": 0.5753023142814598,
      "grad_norm": 0.08250202238559723,
      "learning_rate": 7.53758839932139e-05,
      "loss": 8.6867,
      "step": 36704,
      "throughput": 12517.466052972384
    },
    {
      "epoch": 0.5758038856103888,
      "grad_norm": 0.08435425907373428,
      "learning_rate": 7.517104701795905e-05,
      "loss": 8.6657,
      "step": 36736,
      "throughput": 12517.283483662877
    },
    {
      "epoch": 0.5763054569393177,
      "grad_norm": 0.09276581555604935,
      "learning_rate": 7.496658049757255e-05,
      "loss": 8.6782,
      "step": 36768,
      "throughput": 12517.328290633728
    },
    {
      "epoch": 0.5768070282682466,
      "grad_norm": 0.08053594082593918,
      "learning_rate": 7.476248527527492e-05,
      "loss": 8.6766,
      "step": 36800,
      "throughput": 12517.335458268808
    },
    {
      "epoch": 0.5773085995971755,
      "grad_norm": 0.08317738771438599,
      "learning_rate": 7.455876219275552e-05,
      "loss": 8.6696,
      "step": 36832,
      "throughput": 12517.356605369205
    },
    {
      "epoch": 0.5778101709261044,
      "grad_norm": 0.08413007855415344,
      "learning_rate": 7.435541209016885e-05,
      "loss": 8.694,
      "step": 36864,
      "throughput": 12517.419176649293
    },
    {
      "epoch": 0.5783117422550333,
      "grad_norm": 0.08062509447336197,
      "learning_rate": 7.415243580613134e-05,
      "loss": 8.6712,
      "step": 36896,
      "throughput": 12516.973503237807
    },
    {
      "epoch": 0.5788133135839623,
      "grad_norm": 0.08116944134235382,
      "learning_rate": 7.394983417771791e-05,
      "loss": 8.6912,
      "step": 36928,
      "throughput": 12517.128302324305
    },
    {
      "epoch": 0.5793148849128912,
      "grad_norm": 0.08938824385404587,
      "learning_rate": 7.374760804045815e-05,
      "loss": 8.6986,
      "step": 36960,
      "throughput": 12517.282038382906
    },
    {
      "epoch": 0.5798164562418201,
      "grad_norm": 0.07905585318803787,
      "learning_rate": 7.354575822833331e-05,
      "loss": 8.6741,
      "step": 36992,
      "throughput": 12517.382771112063
    },
    {
      "epoch": 0.580318027570749,
      "grad_norm": 0.08489411324262619,
      "learning_rate": 7.334428557377258e-05,
      "loss": 8.6935,
      "step": 37024,
      "throughput": 12517.346292375569
    },
    {
      "epoch": 0.5808195988996779,
      "grad_norm": 0.09119182080030441,
      "learning_rate": 7.314319090764985e-05,
      "loss": 8.6883,
      "step": 37056,
      "throughput": 12517.23137805927
    },
    {
      "epoch": 0.5813211702286069,
      "grad_norm": 0.08179455995559692,
      "learning_rate": 7.294247505928003e-05,
      "loss": 8.6844,
      "step": 37088,
      "throughput": 12517.178960965015
    },
    {
      "epoch": 0.5818227415575358,
      "grad_norm": 0.08026742935180664,
      "learning_rate": 7.274213885641592e-05,
      "loss": 8.6751,
      "step": 37120,
      "throughput": 12517.181465984202
    },
    {
      "epoch": 0.5823243128864646,
      "grad_norm": 0.08509814739227295,
      "learning_rate": 7.254218312524461e-05,
      "loss": 8.6828,
      "step": 37152,
      "throughput": 12517.27502240571
    },
    {
      "epoch": 0.5828258842153935,
      "grad_norm": 0.08018773049116135,
      "learning_rate": 7.234260869038417e-05,
      "loss": 8.695,
      "step": 37184,
      "throughput": 12517.289733393207
    },
    {
      "epoch": 0.5833274555443224,
      "grad_norm": 0.08433583378791809,
      "learning_rate": 7.214341637488007e-05,
      "loss": 8.7007,
      "step": 37216,
      "throughput": 12517.361447458245
    },
    {
      "epoch": 0.5838290268732513,
      "grad_norm": 0.08063721656799316,
      "learning_rate": 7.194460700020206e-05,
      "loss": 8.6719,
      "step": 37248,
      "throughput": 12517.512031241993
    },
    {
      "epoch": 0.5843305982021803,
      "grad_norm": 0.08141735196113586,
      "learning_rate": 7.174618138624058e-05,
      "loss": 8.6913,
      "step": 37280,
      "throughput": 12517.664695356068
    },
    {
      "epoch": 0.5848321695311092,
      "grad_norm": 0.08511857688426971,
      "learning_rate": 7.154814035130351e-05,
      "loss": 8.6866,
      "step": 37312,
      "throughput": 12517.791148806165
    },
    {
      "epoch": 0.5853337408600381,
      "grad_norm": 0.08427421748638153,
      "learning_rate": 7.135048471211257e-05,
      "loss": 8.6776,
      "step": 37344,
      "throughput": 12517.793106161102
    },
    {
      "epoch": 0.585835312188967,
      "grad_norm": 0.08205582201480865,
      "learning_rate": 7.115321528380024e-05,
      "loss": 8.7002,
      "step": 37376,
      "throughput": 12517.730083261857
    },
    {
      "epoch": 0.5863368835178959,
      "grad_norm": 0.08410238474607468,
      "learning_rate": 7.095633287990622e-05,
      "loss": 8.6803,
      "step": 37408,
      "throughput": 12517.57726651346
    },
    {
      "epoch": 0.5868384548468248,
      "grad_norm": 0.08508728444576263,
      "learning_rate": 7.075983831237421e-05,
      "loss": 8.6598,
      "step": 37440,
      "throughput": 12517.667385193266
    },
    {
      "epoch": 0.5873400261757538,
      "grad_norm": 0.2568950355052948,
      "learning_rate": 7.056373239154826e-05,
      "loss": 8.6748,
      "step": 37472,
      "throughput": 12517.671686146488
    },
    {
      "epoch": 0.5878415975046827,
      "grad_norm": 0.08162539452314377,
      "learning_rate": 7.036801592616982e-05,
      "loss": 8.6504,
      "step": 37504,
      "throughput": 12517.690999905139
    },
    {
      "epoch": 0.5883431688336116,
      "grad_norm": 0.07849689573049545,
      "learning_rate": 7.017268972337419e-05,
      "loss": 8.6504,
      "step": 37536,
      "throughput": 12517.768847195408
    },
    {
      "epoch": 0.5888447401625405,
      "grad_norm": 0.08418171852827072,
      "learning_rate": 6.997775458868724e-05,
      "loss": 8.6812,
      "step": 37568,
      "throughput": 12517.915109902939
    },
    {
      "epoch": 0.5893463114914693,
      "grad_norm": 0.08415034413337708,
      "learning_rate": 6.978321132602197e-05,
      "loss": 8.6772,
      "step": 37600,
      "throughput": 12518.061122467996
    },
    {
      "epoch": 0.5898478828203982,
      "grad_norm": 0.08779383450746536,
      "learning_rate": 6.95890607376754e-05,
      "loss": 8.6624,
      "step": 37632,
      "throughput": 12518.181187880014
    },
    {
      "epoch": 0.5903494541493272,
      "grad_norm": 0.07764028757810593,
      "learning_rate": 6.939530362432513e-05,
      "loss": 8.6793,
      "step": 37664,
      "throughput": 12518.19822260176
    },
    {
      "epoch": 0.5908510254782561,
      "grad_norm": 0.0823327898979187,
      "learning_rate": 6.920194078502611e-05,
      "loss": 8.6744,
      "step": 37696,
      "throughput": 12518.172950977636
    },
    {
      "epoch": 0.591352596807185,
      "grad_norm": 0.07632856070995331,
      "learning_rate": 6.900897301720721e-05,
      "loss": 8.6787,
      "step": 37728,
      "throughput": 12518.087089234079
    },
    {
      "epoch": 0.5918541681361139,
      "grad_norm": 0.07819085568189621,
      "learning_rate": 6.881640111666807e-05,
      "loss": 8.6944,
      "step": 37760,
      "throughput": 12518.08412581868
    },
    {
      "epoch": 0.5923557394650428,
      "grad_norm": 0.08589126914739609,
      "learning_rate": 6.862422587757581e-05,
      "loss": 8.6819,
      "step": 37792,
      "throughput": 12518.12154185379
    },
    {
      "epoch": 0.5928573107939717,
      "grad_norm": 0.08168213069438934,
      "learning_rate": 6.843244809246173e-05,
      "loss": 8.6967,
      "step": 37824,
      "throughput": 12518.118068216625
    },
    {
      "epoch": 0.5933588821229007,
      "grad_norm": 0.07826963812112808,
      "learning_rate": 6.824106855221788e-05,
      "loss": 8.6777,
      "step": 37856,
      "throughput": 12518.203417005496
    },
    {
      "epoch": 0.5938604534518296,
      "grad_norm": 0.08327604085206985,
      "learning_rate": 6.805008804609411e-05,
      "loss": 8.6729,
      "step": 37888,
      "throughput": 12518.299888113675
    },
    {
      "epoch": 0.5943620247807585,
      "grad_norm": 0.08167058974504471,
      "learning_rate": 6.78595073616946e-05,
      "loss": 8.6702,
      "step": 37920,
      "throughput": 12518.440363110465
    },
    {
      "epoch": 0.5948635961096874,
      "grad_norm": 0.08159983903169632,
      "learning_rate": 6.766932728497468e-05,
      "loss": 8.6946,
      "step": 37952,
      "throughput": 12518.577594035549
    },
    {
      "epoch": 0.5953651674386163,
      "grad_norm": 0.08012655377388,
      "learning_rate": 6.747954860023746e-05,
      "loss": 8.6939,
      "step": 37984,
      "throughput": 12518.626317917498
    },
    {
      "epoch": 0.5958667387675451,
      "grad_norm": 0.0828748419880867,
      "learning_rate": 6.729017209013086e-05,
      "loss": 8.6817,
      "step": 38016,
      "throughput": 12518.667987562065
    },
    {
      "epoch": 0.5963683100964741,
      "grad_norm": 0.08672600984573364,
      "learning_rate": 6.710119853564422e-05,
      "loss": 8.689,
      "step": 38048,
      "throughput": 12518.465836159847
    },
    {
      "epoch": 0.596869881425403,
      "grad_norm": 0.09564584493637085,
      "learning_rate": 6.69126287161049e-05,
      "loss": 8.6977,
      "step": 38080,
      "throughput": 12518.43985834833
    },
    {
      "epoch": 0.5973714527543319,
      "grad_norm": 0.08064235001802444,
      "learning_rate": 6.672446340917553e-05,
      "loss": 8.6815,
      "step": 38112,
      "throughput": 12518.4491105091
    },
    {
      "epoch": 0.5978730240832608,
      "grad_norm": 0.08223454654216766,
      "learning_rate": 6.653670339085031e-05,
      "loss": 8.6738,
      "step": 38144,
      "throughput": 12518.495327793264
    },
    {
      "epoch": 0.5983745954121897,
      "grad_norm": 0.08172369003295898,
      "learning_rate": 6.634934943545217e-05,
      "loss": 8.6804,
      "step": 38176,
      "throughput": 12518.589927415283
    },
    {
      "epoch": 0.5988761667411187,
      "grad_norm": 0.08844827860593796,
      "learning_rate": 6.616240231562933e-05,
      "loss": 8.6558,
      "step": 38208,
      "throughput": 12518.669975625291
    },
    {
      "epoch": 0.5993777380700476,
      "grad_norm": 0.08515715599060059,
      "learning_rate": 6.597586280235227e-05,
      "loss": 8.6843,
      "step": 38240,
      "throughput": 12518.811041374041
    },
    {
      "epoch": 0.5998793093989765,
      "grad_norm": 0.0898033082485199,
      "learning_rate": 6.578973166491053e-05,
      "loss": 8.6682,
      "step": 38272,
      "throughput": 12518.948975647792
    },
    {
      "epoch": 0.6003808807279054,
      "grad_norm": 0.0798042044043541,
      "learning_rate": 6.560400967090948e-05,
      "loss": 8.6688,
      "step": 38304,
      "throughput": 12519.03731635014
    },
    {
      "epoch": 0.6008824520568343,
      "grad_norm": 0.07899042218923569,
      "learning_rate": 6.54186975862671e-05,
      "loss": 8.6622,
      "step": 38336,
      "throughput": 12519.056239877606
    },
    {
      "epoch": 0.6013840233857632,
      "grad_norm": 0.0829099491238594,
      "learning_rate": 6.523379617521104e-05,
      "loss": 8.6642,
      "step": 38368,
      "throughput": 12518.961806386887
    },
    {
      "epoch": 0.6018855947146922,
      "grad_norm": 0.07706566154956818,
      "learning_rate": 6.504930620027524e-05,
      "loss": 8.6645,
      "step": 38400,
      "throughput": 12518.839803727704
    },
    {
      "epoch": 0.6023871660436211,
      "grad_norm": 0.08346829563379288,
      "learning_rate": 6.486522842229692e-05,
      "loss": 8.6709,
      "step": 38432,
      "throughput": 12518.82358632836
    },
    {
      "epoch": 0.6028887373725499,
      "grad_norm": 0.08457069098949432,
      "learning_rate": 6.468156360041337e-05,
      "loss": 8.6736,
      "step": 38464,
      "throughput": 12518.91697685828
    },
    {
      "epoch": 0.6033903087014788,
      "grad_norm": 0.08379726111888885,
      "learning_rate": 6.449831249205887e-05,
      "loss": 8.6604,
      "step": 38496,
      "throughput": 12518.920394830555
    },
    {
      "epoch": 0.6038918800304077,
      "grad_norm": 0.08050908893346786,
      "learning_rate": 6.431547585296156e-05,
      "loss": 8.6586,
      "step": 38528,
      "throughput": 12519.000297488355
    },
    {
      "epoch": 0.6043934513593366,
      "grad_norm": 0.08401334285736084,
      "learning_rate": 6.413305443714022e-05,
      "loss": 8.6613,
      "step": 38560,
      "throughput": 12519.14071583752
    },
    {
      "epoch": 0.6048950226882656,
      "grad_norm": 0.08327256143093109,
      "learning_rate": 6.395104899690134e-05,
      "loss": 8.6622,
      "step": 38592,
      "throughput": 12519.282051726044
    },
    {
      "epoch": 0.6053965940171945,
      "grad_norm": 0.08408129215240479,
      "learning_rate": 6.37694602828359e-05,
      "loss": 8.6708,
      "step": 38624,
      "throughput": 12519.367297466042
    },
    {
      "epoch": 0.6058981653461234,
      "grad_norm": 0.07974757254123688,
      "learning_rate": 6.358828904381632e-05,
      "loss": 8.6589,
      "step": 38656,
      "throughput": 12519.429108838523
    },
    {
      "epoch": 0.6063997366750523,
      "grad_norm": 0.0898723155260086,
      "learning_rate": 6.340753602699327e-05,
      "loss": 8.6824,
      "step": 38688,
      "throughput": 12519.31796076407
    },
    {
      "epoch": 0.6069013080039812,
      "grad_norm": 0.09549321979284286,
      "learning_rate": 6.322720197779275e-05,
      "loss": 8.6868,
      "step": 38720,
      "throughput": 12519.202567652002
    },
    {
      "epoch": 0.6074028793329102,
      "grad_norm": 0.08469787240028381,
      "learning_rate": 6.304728763991291e-05,
      "loss": 8.6661,
      "step": 38752,
      "throughput": 12519.26707210921
    },
    {
      "epoch": 0.6079044506618391,
      "grad_norm": 0.0787658542394638,
      "learning_rate": 6.286779375532107e-05,
      "loss": 8.6826,
      "step": 38784,
      "throughput": 12519.306401204129
    },
    {
      "epoch": 0.608406021990768,
      "grad_norm": 0.07940182834863663,
      "learning_rate": 6.268872106425044e-05,
      "loss": 8.643,
      "step": 38816,
      "throughput": 12519.314657029476
    },
    {
      "epoch": 0.6089075933196969,
      "grad_norm": 0.07980058342218399,
      "learning_rate": 6.25100703051974e-05,
      "loss": 8.6536,
      "step": 38848,
      "throughput": 12519.35759876071
    },
    {
      "epoch": 0.6094091646486258,
      "grad_norm": 0.08572038263082504,
      "learning_rate": 6.233184221491818e-05,
      "loss": 8.6725,
      "step": 38880,
      "throughput": 12519.494620818527
    },
    {
      "epoch": 0.6099107359775546,
      "grad_norm": 0.09471891075372696,
      "learning_rate": 6.2154037528426e-05,
      "loss": 8.6527,
      "step": 38912,
      "throughput": 12519.632116222083
    },
    {
      "epoch": 0.6104123073064835,
      "grad_norm": 0.0854351744055748,
      "learning_rate": 6.197665697898784e-05,
      "loss": 8.6649,
      "step": 38944,
      "throughput": 12519.236822148852
    },
    {
      "epoch": 0.6109138786354125,
      "grad_norm": 0.10343901813030243,
      "learning_rate": 6.179970129812166e-05,
      "loss": 8.6608,
      "step": 38976,
      "throughput": 12519.257951071279
    },
    {
      "epoch": 0.6114154499643414,
      "grad_norm": 0.0799938291311264,
      "learning_rate": 6.16231712155932e-05,
      "loss": 8.676,
      "step": 39008,
      "throughput": 12519.278111050882
    },
    {
      "epoch": 0.6119170212932703,
      "grad_norm": 0.08410289883613586,
      "learning_rate": 6.144706745941308e-05,
      "loss": 8.6546,
      "step": 39040,
      "throughput": 12519.151087954797
    },
    {
      "epoch": 0.6124185926221992,
      "grad_norm": 0.08022049069404602,
      "learning_rate": 6.127139075583363e-05,
      "loss": 8.661,
      "step": 39072,
      "throughput": 12519.139449349837
    },
    {
      "epoch": 0.6129201639511281,
      "grad_norm": 0.08267541974782944,
      "learning_rate": 6.109614182934616e-05,
      "loss": 8.6619,
      "step": 39104,
      "throughput": 12519.196024051356
    },
    {
      "epoch": 0.6134217352800571,
      "grad_norm": 0.08225016295909882,
      "learning_rate": 6.092132140267775e-05,
      "loss": 8.649,
      "step": 39136,
      "throughput": 12519.243129954537
    },
    {
      "epoch": 0.613923306608986,
      "grad_norm": 0.11314070969820023,
      "learning_rate": 6.074693019678839e-05,
      "loss": 8.6704,
      "step": 39168,
      "throughput": 12519.263339232122
    },
    {
      "epoch": 0.6144248779379149,
      "grad_norm": 0.08187414705753326,
      "learning_rate": 6.0572968930867827e-05,
      "loss": 8.6522,
      "step": 39200,
      "throughput": 12519.38006793021
    },
    {
      "epoch": 0.6149264492668438,
      "grad_norm": 0.08883952349424362,
      "learning_rate": 6.039943832233293e-05,
      "loss": 8.6594,
      "step": 39232,
      "throughput": 12519.518153664507
    },
    {
      "epoch": 0.6154280205957727,
      "grad_norm": 0.08027220517396927,
      "learning_rate": 6.022633908682442e-05,
      "loss": 8.6596,
      "step": 39264,
      "throughput": 12519.657498292741
    },
    {
      "epoch": 0.6159295919247016,
      "grad_norm": 0.07722441852092743,
      "learning_rate": 6.005367193820408e-05,
      "loss": 8.6551,
      "step": 39296,
      "throughput": 12519.71106762286
    },
    {
      "epoch": 0.6164311632536306,
      "grad_norm": 0.08117758482694626,
      "learning_rate": 5.9881437588551675e-05,
      "loss": 8.66,
      "step": 39328,
      "throughput": 12519.773714208372
    },
    {
      "epoch": 0.6169327345825594,
      "grad_norm": 0.08332060277462006,
      "learning_rate": 5.970963674816224e-05,
      "loss": 8.6514,
      "step": 39360,
      "throughput": 12519.722993629892
    },
    {
      "epoch": 0.6174343059114883,
      "grad_norm": 0.08834696561098099,
      "learning_rate": 5.953827012554291e-05,
      "loss": 8.6643,
      "step": 39392,
      "throughput": 12519.569812218686
    },
    {
      "epoch": 0.6179358772404172,
      "grad_norm": 0.08121950924396515,
      "learning_rate": 5.9367338427410197e-05,
      "loss": 8.6577,
      "step": 39424,
      "throughput": 12519.582037618115
    },
    {
      "epoch": 0.6184374485693461,
      "grad_norm": 0.08764735609292984,
      "learning_rate": 5.9196842358686866e-05,
      "loss": 8.6642,
      "step": 39456,
      "throughput": 12519.679513734258
    },
    {
      "epoch": 0.618939019898275,
      "grad_norm": 0.08493324369192123,
      "learning_rate": 5.902678262249923e-05,
      "loss": 8.6652,
      "step": 39488,
      "throughput": 12519.700233712612
    },
    {
      "epoch": 0.619440591227204,
      "grad_norm": 0.08276000618934631,
      "learning_rate": 5.885715992017419e-05,
      "loss": 8.6494,
      "step": 39520,
      "throughput": 12519.772592101563
    },
    {
      "epoch": 0.6199421625561329,
      "grad_norm": 0.0843217521905899,
      "learning_rate": 5.86879749512362e-05,
      "loss": 8.6553,
      "step": 39552,
      "throughput": 12519.908684142914
    },
    {
      "epoch": 0.6204437338850618,
      "grad_norm": 0.0830702856183052,
      "learning_rate": 5.851922841340461e-05,
      "loss": 8.659,
      "step": 39584,
      "throughput": 12520.048560578924
    },
    {
      "epoch": 0.6209453052139907,
      "grad_norm": 0.09729384630918503,
      "learning_rate": 5.835092100259063e-05,
      "loss": 8.6474,
      "step": 39616,
      "throughput": 12520.16554718025
    },
    {
      "epoch": 0.6214468765429196,
      "grad_norm": 0.07610440254211426,
      "learning_rate": 5.818305341289458e-05,
      "loss": 8.6655,
      "step": 39648,
      "throughput": 12520.177816100835
    },
    {
      "epoch": 0.6219484478718486,
      "grad_norm": 0.07896667718887329,
      "learning_rate": 5.8015626336602814e-05,
      "loss": 8.6495,
      "step": 39680,
      "throughput": 12520.13873298695
    },
    {
      "epoch": 0.6224500192007775,
      "grad_norm": 0.08647879958152771,
      "learning_rate": 5.7848640464185124e-05,
      "loss": 8.6696,
      "step": 39712,
      "throughput": 12519.979631811184
    },
    {
      "epoch": 0.6229515905297064,
      "grad_norm": 0.07803455740213394,
      "learning_rate": 5.768209648429174e-05,
      "loss": 8.6655,
      "step": 39744,
      "throughput": 12519.932013667134
    },
    {
      "epoch": 0.6234531618586353,
      "grad_norm": 0.08229484409093857,
      "learning_rate": 5.751599508375059e-05,
      "loss": 8.664,
      "step": 39776,
      "throughput": 12520.069857426954
    },
    {
      "epoch": 0.6239547331875641,
      "grad_norm": 0.07795999199151993,
      "learning_rate": 5.735033694756423e-05,
      "loss": 8.649,
      "step": 39808,
      "throughput": 12520.027184068515
    },
    {
      "epoch": 0.624456304516493,
      "grad_norm": 0.07954669743776321,
      "learning_rate": 5.718512275890737e-05,
      "loss": 8.6491,
      "step": 39840,
      "throughput": 12520.12953119863
    },
    {
      "epoch": 0.624957875845422,
      "grad_norm": 0.08642455190420151,
      "learning_rate": 5.70203531991238e-05,
      "loss": 8.6537,
      "step": 39872,
      "throughput": 12520.23675201523
    },
    {
      "epoch": 0.6254594471743509,
      "grad_norm": 0.08800368756055832,
      "learning_rate": 5.6856028947723734e-05,
      "loss": 8.6564,
      "step": 39904,
      "throughput": 12520.373320639146
    },
    {
      "epoch": 0.6259610185032798,
      "grad_norm": 0.08899801969528198,
      "learning_rate": 5.669215068238075e-05,
      "loss": 8.6403,
      "step": 39936,
      "throughput": 12520.487837263145
    },
    {
      "epoch": 0.6264625898322087,
      "grad_norm": 0.07982466369867325,
      "learning_rate": 5.652871907892934e-05,
      "loss": 8.6608,
      "step": 39968,
      "throughput": 12520.534128921343
    },
    {
      "epoch": 0.6269641611611376,
      "grad_norm": 0.08430910110473633,
      "learning_rate": 5.6365734811362026e-05,
      "loss": 8.6578,
      "step": 40000,
      "throughput": 12520.507540367866
    },
    {
      "epoch": 0.6274657324900665,
      "grad_norm": 0.08782041817903519,
      "learning_rate": 5.620319855182629e-05,
      "loss": 8.6465,
      "step": 40032,
      "throughput": 12520.373597182608
    },
    {
      "epoch": 0.6279673038189955,
      "grad_norm": 0.08348139375448227,
      "learning_rate": 5.60411109706222e-05,
      "loss": 8.6222,
      "step": 40064,
      "throughput": 12520.378249662394
    },
    {
      "epoch": 0.6284688751479244,
      "grad_norm": 0.07767164707183838,
      "learning_rate": 5.587947273619938e-05,
      "loss": 8.6362,
      "step": 40096,
      "throughput": 12520.447447399005
    },
    {
      "epoch": 0.6289704464768533,
      "grad_norm": 0.08124253898859024,
      "learning_rate": 5.5718284515154476e-05,
      "loss": 8.6423,
      "step": 40128,
      "throughput": 12520.503797659545
    },
    {
      "epoch": 0.6294720178057822,
      "grad_norm": 0.08289259672164917,
      "learning_rate": 5.5557546972228114e-05,
      "loss": 8.6708,
      "step": 40160,
      "throughput": 12520.511064661736
    },
    {
      "epoch": 0.6299735891347111,
      "grad_norm": 0.11718080937862396,
      "learning_rate": 5.539726077030239e-05,
      "loss": 8.6634,
      "step": 40192,
      "throughput": 12520.621801043319
    },
    {
      "epoch": 0.63047516046364,
      "grad_norm": 0.08058638125658035,
      "learning_rate": 5.523742657039809e-05,
      "loss": 8.6504,
      "step": 40224,
      "throughput": 12520.757605927323
    },
    {
      "epoch": 0.6309767317925689,
      "grad_norm": 0.08185072988271713,
      "learning_rate": 5.5078045031672005e-05,
      "loss": 8.6793,
      "step": 40256,
      "throughput": 12520.866675222109
    },
    {
      "epoch": 0.6314783031214978,
      "grad_norm": 0.07993034273386002,
      "learning_rate": 5.491911681141394e-05,
      "loss": 8.6437,
      "step": 40288,
      "throughput": 12520.907599693744
    },
    {
      "epoch": 0.6319798744504267,
      "grad_norm": 0.08810719847679138,
      "learning_rate": 5.476064256504443e-05,
      "loss": 8.6815,
      "step": 40320,
      "throughput": 12520.931947301131
    },
    {
      "epoch": 0.6324814457793556,
      "grad_norm": 0.08399613946676254,
      "learning_rate": 5.460262294611172e-05,
      "loss": 8.6655,
      "step": 40352,
      "throughput": 12520.840774088949
    },
    {
      "epoch": 0.6329830171082845,
      "grad_norm": 0.081191286444664,
      "learning_rate": 5.444505860628923e-05,
      "loss": 8.629,
      "step": 40384,
      "throughput": 12520.802830375966
    },
    {
      "epoch": 0.6334845884372134,
      "grad_norm": 0.084128238260746,
      "learning_rate": 5.428795019537268e-05,
      "loss": 8.6391,
      "step": 40416,
      "throughput": 12520.839767156504
    },
    {
      "epoch": 0.6339861597661424,
      "grad_norm": 0.08739369362592697,
      "learning_rate": 5.413129836127766e-05,
      "loss": 8.6524,
      "step": 40448,
      "throughput": 12520.90426857039
    },
    {
      "epoch": 0.6344877310950713,
      "grad_norm": 0.07963095605373383,
      "learning_rate": 5.3975103750036805e-05,
      "loss": 8.6379,
      "step": 40480,
      "throughput": 12520.897375271004
    },
    {
      "epoch": 0.6349893024240002,
      "grad_norm": 0.08396175503730774,
      "learning_rate": 5.3819367005797186e-05,
      "loss": 8.6349,
      "step": 40512,
      "throughput": 12521.006784652312
    },
    {
      "epoch": 0.6354908737529291,
      "grad_norm": 0.08460961282253265,
      "learning_rate": 5.366408877081752e-05,
      "loss": 8.6645,
      "step": 40544,
      "throughput": 12521.141832276591
    },
    {
      "epoch": 0.635992445081858,
      "grad_norm": 0.08145654946565628,
      "learning_rate": 5.3509269685465764e-05,
      "loss": 8.6382,
      "step": 40576,
      "throughput": 12521.276430339789
    },
    {
      "epoch": 0.636494016410787,
      "grad_norm": 0.11674635857343674,
      "learning_rate": 5.3354910388216274e-05,
      "loss": 8.6558,
      "step": 40608,
      "throughput": 12521.312809689112
    },
    {
      "epoch": 0.6369955877397159,
      "grad_norm": 0.09571953862905502,
      "learning_rate": 5.3201011515647276e-05,
      "loss": 8.6613,
      "step": 40640,
      "throughput": 12521.388907581995
    },
    {
      "epoch": 0.6374971590686448,
      "grad_norm": 0.08281499892473221,
      "learning_rate": 5.304757370243811e-05,
      "loss": 8.6383,
      "step": 40672,
      "throughput": 12521.366481892124
    },
    {
      "epoch": 0.6379987303975736,
      "grad_norm": 0.08709795773029327,
      "learning_rate": 5.2894597581366835e-05,
      "loss": 8.663,
      "step": 40704,
      "throughput": 12521.240769983162
    },
    {
      "epoch": 0.6385003017265025,
      "grad_norm": 0.08456841111183167,
      "learning_rate": 5.274208378330737e-05,
      "loss": 8.6629,
      "step": 40736,
      "throughput": 12521.203232916065
    },
    {
      "epoch": 0.6390018730554314,
      "grad_norm": 0.09472116082906723,
      "learning_rate": 5.2590032937227154e-05,
      "loss": 8.6541,
      "step": 40768,
      "throughput": 12521.336472196534
    },
    {
      "epoch": 0.6395034443843604,
      "grad_norm": 0.08780498802661896,
      "learning_rate": 5.2438445670184244e-05,
      "loss": 8.654,
      "step": 40800,
      "throughput": 12521.3264815158
    },
    {
      "epoch": 0.6400050157132893,
      "grad_norm": 0.08326949179172516,
      "learning_rate": 5.2287322607325e-05,
      "loss": 8.6358,
      "step": 40832,
      "throughput": 12521.392052429343
    },
    {
      "epoch": 0.6405065870422182,
      "grad_norm": 0.08162552863359451,
      "learning_rate": 5.213666437188141e-05,
      "loss": 8.6674,
      "step": 40864,
      "throughput": 12521.494973148838
    },
    {
      "epoch": 0.6410081583711471,
      "grad_norm": 0.08285222202539444,
      "learning_rate": 5.1986471585168485e-05,
      "loss": 8.656,
      "step": 40896,
      "throughput": 12521.629309490367
    },
    {
      "epoch": 0.641509729700076,
      "grad_norm": 0.09120822697877884,
      "learning_rate": 5.183674486658167e-05,
      "loss": 8.6716,
      "step": 40928,
      "throughput": 12521.699504095955
    },
    {
      "epoch": 0.6420113010290049,
      "grad_norm": 0.08420810103416443,
      "learning_rate": 5.168748483359445e-05,
      "loss": 8.6607,
      "step": 40960,
      "throughput": 12521.75876137181
    },
    {
      "epoch": 0.6425128723579339,
      "grad_norm": 0.07815767079591751,
      "learning_rate": 5.153869210175563e-05,
      "loss": 8.6309,
      "step": 40992,
      "throughput": 12521.244443250478
    },
    {
      "epoch": 0.6430144436868628,
      "grad_norm": 0.08167236298322678,
      "learning_rate": 5.139036728468686e-05,
      "loss": 8.6386,
      "step": 41024,
      "throughput": 12521.098366250244
    },
    {
      "epoch": 0.6435160150157917,
      "grad_norm": 0.07946034520864487,
      "learning_rate": 5.124251099408012e-05,
      "loss": 8.6276,
      "step": 41056,
      "throughput": 12521.02820184281
    },
    {
      "epoch": 0.6440175863447206,
      "grad_norm": 0.08526898920536041,
      "learning_rate": 5.1095123839695224e-05,
      "loss": 8.6517,
      "step": 41088,
      "throughput": 12521.155406940761
    },
    {
      "epoch": 0.6445191576736495,
      "grad_norm": 0.08295472711324692,
      "learning_rate": 5.0948206429357224e-05,
      "loss": 8.6621,
      "step": 41120,
      "throughput": 12521.166390281849
    },
    {
      "epoch": 0.6450207290025783,
      "grad_norm": 0.09882532060146332,
      "learning_rate": 5.080175936895392e-05,
      "loss": 8.6701,
      "step": 41152,
      "throughput": 12521.21875397715
    },
    {
      "epoch": 0.6455223003315073,
      "grad_norm": 0.07845364511013031,
      "learning_rate": 5.065578326243348e-05,
      "loss": 8.6176,
      "step": 41184,
      "throughput": 12521.31994753445
    },
    {
      "epoch": 0.6460238716604362,
      "grad_norm": 0.08402087539434433,
      "learning_rate": 5.0510278711801735e-05,
      "loss": 8.6548,
      "step": 41216,
      "throughput": 12521.449296348317
    },
    {
      "epoch": 0.6465254429893651,
      "grad_norm": 0.08985952287912369,
      "learning_rate": 5.036524631711996e-05,
      "loss": 8.6473,
      "step": 41248,
      "throughput": 12521.563405601884
    },
    {
      "epoch": 0.647027014318294,
      "grad_norm": 0.07821807265281677,
      "learning_rate": 5.02206866765021e-05,
      "loss": 8.6453,
      "step": 41280,
      "throughput": 12521.576374610004
    },
    {
      "epoch": 0.6475285856472229,
      "grad_norm": 0.08190401643514633,
      "learning_rate": 5.007660038611259e-05,
      "loss": 8.6582,
      "step": 41312,
      "throughput": 12521.573527242897
    },
    {
      "epoch": 0.6480301569761518,
      "grad_norm": 0.0907343253493309,
      "learning_rate": 4.9932988040163726e-05,
      "loss": 8.6428,
      "step": 41344,
      "throughput": 12521.475130040359
    },
    {
      "epoch": 0.6485317283050808,
      "grad_norm": 0.08307422697544098,
      "learning_rate": 4.978985023091324e-05,
      "loss": 8.6496,
      "step": 41376,
      "throughput": 12521.449891877653
    },
    {
      "epoch": 0.6490332996340097,
      "grad_norm": 0.0786016508936882,
      "learning_rate": 4.964718754866186e-05,
      "loss": 8.6396,
      "step": 41408,
      "throughput": 12521.499320841976
    },
    {
      "epoch": 0.6495348709629386,
      "grad_norm": 0.08367376029491425,
      "learning_rate": 4.95050005817509e-05,
      "loss": 8.6546,
      "step": 41440,
      "throughput": 12521.56282824873
    },
    {
      "epoch": 0.6500364422918675,
      "grad_norm": 0.08345983922481537,
      "learning_rate": 4.936328991655988e-05,
      "loss": 8.6153,
      "step": 41472,
      "throughput": 12521.561639231697
    },
    {
      "epoch": 0.6505380136207964,
      "grad_norm": 0.08260690420866013,
      "learning_rate": 4.9222056137504e-05,
      "loss": 8.6362,
      "step": 41504,
      "throughput": 12521.667206337985
    },
    {
      "epoch": 0.6510395849497254,
      "grad_norm": 0.08155972510576248,
      "learning_rate": 4.908129982703169e-05,
      "loss": 8.6391,
      "step": 41536,
      "throughput": 12521.796105573369
    },
    {
      "epoch": 0.6515411562786543,
      "grad_norm": 0.08556462824344635,
      "learning_rate": 4.8941021565622516e-05,
      "loss": 8.6523,
      "step": 41568,
      "throughput": 12521.898259358102
    },
    {
      "epoch": 0.6520427276075831,
      "grad_norm": 0.0759713277220726,
      "learning_rate": 4.880122193178441e-05,
      "loss": 8.651,
      "step": 41600,
      "throughput": 12521.960176028337
    },
    {
      "epoch": 0.652544298936512,
      "grad_norm": 0.08541751652956009,
      "learning_rate": 4.866190150205143e-05,
      "loss": 8.6432,
      "step": 41632,
      "throughput": 12521.945643081093
    },
    {
      "epoch": 0.6530458702654409,
      "grad_norm": 0.08364102244377136,
      "learning_rate": 4.8523060850981476e-05,
      "loss": 8.6236,
      "step": 41664,
      "throughput": 12521.91633035345
    },
    {
      "epoch": 0.6535474415943698,
      "grad_norm": 0.09959270060062408,
      "learning_rate": 4.838470055115379e-05,
      "loss": 8.6442,
      "step": 41696,
      "throughput": 12521.843651647958
    },
    {
      "epoch": 0.6540490129232988,
      "grad_norm": 0.08332763612270355,
      "learning_rate": 4.82468211731667e-05,
      "loss": 8.6504,
      "step": 41728,
      "throughput": 12521.829790599762
    },
    {
      "epoch": 0.6545505842522277,
      "grad_norm": 0.08586803823709488,
      "learning_rate": 4.8109423285635116e-05,
      "loss": 8.6482,
      "step": 41760,
      "throughput": 12521.95742416806
    },
    {
      "epoch": 0.6550521555811566,
      "grad_norm": 0.08022906631231308,
      "learning_rate": 4.797250745518833e-05,
      "loss": 8.6108,
      "step": 41792,
      "throughput": 12521.939657731617
    },
    {
      "epoch": 0.6555537269100855,
      "grad_norm": 0.08036471158266068,
      "learning_rate": 4.7836074246467685e-05,
      "loss": 8.6355,
      "step": 41824,
      "throughput": 12522.020873759328
    },
    {
      "epoch": 0.6560552982390144,
      "grad_norm": 0.08128458261489868,
      "learning_rate": 4.770012422212412e-05,
      "loss": 8.6377,
      "step": 41856,
      "throughput": 12522.124525838915
    },
    {
      "epoch": 0.6565568695679433,
      "grad_norm": 0.08254926651716232,
      "learning_rate": 4.756465794281592e-05,
      "loss": 8.6364,
      "step": 41888,
      "throughput": 12522.22535205011
    },
    {
      "epoch": 0.6570584408968723,
      "grad_norm": 0.08438771218061447,
      "learning_rate": 4.742967596720641e-05,
      "loss": 8.6498,
      "step": 41920,
      "throughput": 12522.306833724853
    },
    {
      "epoch": 0.6575600122258012,
      "grad_norm": 0.08102980256080627,
      "learning_rate": 4.729517885196169e-05,
      "loss": 8.6601,
      "step": 41952,
      "throughput": 12522.312067211904
    },
    {
      "epoch": 0.6580615835547301,
      "grad_norm": 0.0832732692360878,
      "learning_rate": 4.716116715174827e-05,
      "loss": 8.6327,
      "step": 41984,
      "throughput": 12522.290298437994
    },
    {
      "epoch": 0.6585631548836589,
      "grad_norm": 0.08147306740283966,
      "learning_rate": 4.702764141923075e-05,
      "loss": 8.6518,
      "step": 42016,
      "throughput": 12522.187377477721
    },
    {
      "epoch": 0.6590647262125878,
      "grad_norm": 0.08547472208738327,
      "learning_rate": 4.6894602205069674e-05,
      "loss": 8.6239,
      "step": 42048,
      "throughput": 12522.12719798037
    },
    {
      "epoch": 0.6595662975415167,
      "grad_norm": 0.08419306576251984,
      "learning_rate": 4.6762050057919165e-05,
      "loss": 8.6273,
      "step": 42080,
      "throughput": 12522.252802743767
    },
    {
      "epoch": 0.6600678688704457,
      "grad_norm": 0.08612877875566483,
      "learning_rate": 4.6629985524424686e-05,
      "loss": 8.6288,
      "step": 42112,
      "throughput": 12522.27762845165
    },
    {
      "epoch": 0.6605694401993746,
      "grad_norm": 0.08975645899772644,
      "learning_rate": 4.649840914922071e-05,
      "loss": 8.6381,
      "step": 42144,
      "throughput": 12522.307565885858
    },
    {
      "epoch": 0.6610710115283035,
      "grad_norm": 0.08304554969072342,
      "learning_rate": 4.636732147492863e-05,
      "loss": 8.6272,
      "step": 42176,
      "throughput": 12522.40601976463
    },
    {
      "epoch": 0.6615725828572324,
      "grad_norm": 0.08024667203426361,
      "learning_rate": 4.6236723042154424e-05,
      "loss": 8.6528,
      "step": 42208,
      "throughput": 12522.5278263286
    },
    {
      "epoch": 0.6620741541861613,
      "grad_norm": 0.08220332860946655,
      "learning_rate": 4.61066143894864e-05,
      "loss": 8.6424,
      "step": 42240,
      "throughput": 12522.587369184974
    },
    {
      "epoch": 0.6625757255150903,
      "grad_norm": 0.08129006624221802,
      "learning_rate": 4.5976996053492996e-05,
      "loss": 8.6298,
      "step": 42272,
      "throughput": 12522.670632374928
    },
    {
      "epoch": 0.6630772968440192,
      "grad_norm": 0.08768677711486816,
      "learning_rate": 4.5847868568720646e-05,
      "loss": 8.6229,
      "step": 42304,
      "throughput": 12522.68761818525
    },
    {
      "epoch": 0.6635788681729481,
      "grad_norm": 0.07980692386627197,
      "learning_rate": 4.571923246769147e-05,
      "loss": 8.6451,
      "step": 42336,
      "throughput": 12522.582664506732
    },
    {
      "epoch": 0.664080439501877,
      "grad_norm": 0.08362894505262375,
      "learning_rate": 4.559108828090115e-05,
      "loss": 8.6364,
      "step": 42368,
      "throughput": 12522.497043430209
    },
    {
      "epoch": 0.6645820108308059,
      "grad_norm": 0.08338017016649246,
      "learning_rate": 4.546343653681667e-05,
      "loss": 8.6472,
      "step": 42400,
      "throughput": 12522.586423737033
    },
    {
      "epoch": 0.6650835821597348,
      "grad_norm": 0.09748286753892899,
      "learning_rate": 4.53362777618742e-05,
      "loss": 8.6326,
      "step": 42432,
      "throughput": 12522.64904954824
    },
    {
      "epoch": 0.6655851534886637,
      "grad_norm": 0.08786331117153168,
      "learning_rate": 4.52096124804769e-05,
      "loss": 8.6305,
      "step": 42464,
      "throughput": 12522.634563136266
    },
    {
      "epoch": 0.6660867248175926,
      "grad_norm": 0.08766663819551468,
      "learning_rate": 4.508344121499281e-05,
      "loss": 8.6313,
      "step": 42496,
      "throughput": 12522.761445075928
    },
    {
      "epoch": 0.6665882961465215,
      "grad_norm": 0.08180690556764603,
      "learning_rate": 4.495776448575255e-05,
      "loss": 8.6289,
      "step": 42528,
      "throughput": 12522.863547315234
    },
    {
      "epoch": 0.6670898674754504,
      "grad_norm": 0.07657379657030106,
      "learning_rate": 4.483258281104734e-05,
      "loss": 8.6117,
      "step": 42560,
      "throughput": 12522.965550374338
    },
    {
      "epoch": 0.6675914388043793,
      "grad_norm": 0.0844731479883194,
      "learning_rate": 4.470789670712681e-05,
      "loss": 8.6299,
      "step": 42592,
      "throughput": 12522.990678540467
    },
    {
      "epoch": 0.6680930101333082,
      "grad_norm": 0.08531223982572556,
      "learning_rate": 4.458370668819676e-05,
      "loss": 8.6462,
      "step": 42624,
      "throughput": 12523.031978335513
    },
    {
      "epoch": 0.6685945814622372,
      "grad_norm": 0.08948558568954468,
      "learning_rate": 4.4460013266417226e-05,
      "loss": 8.6194,
      "step": 42656,
      "throughput": 12522.988029400258
    },
    {
      "epoch": 0.6690961527911661,
      "grad_norm": 0.08160790055990219,
      "learning_rate": 4.433681695190027e-05,
      "loss": 8.6268,
      "step": 42688,
      "throughput": 12522.897319792524
    },
    {
      "epoch": 0.669597724120095,
      "grad_norm": 0.08070877939462662,
      "learning_rate": 4.421411825270785e-05,
      "loss": 8.6173,
      "step": 42720,
      "throughput": 12522.890851115651
    },
    {
      "epoch": 0.6700992954490239,
      "grad_norm": 0.0807216539978981,
      "learning_rate": 4.4091917674849727e-05,
      "loss": 8.6364,
      "step": 42752,
      "throughput": 12522.989144430921
    },
    {
      "epoch": 0.6706008667779528,
      "grad_norm": 0.07999275624752045,
      "learning_rate": 4.397021572228147e-05,
      "loss": 8.6432,
      "step": 42784,
      "throughput": 12522.988455472507
    },
    {
      "epoch": 0.6711024381068817,
      "grad_norm": 0.08301256597042084,
      "learning_rate": 4.38490128969023e-05,
      "loss": 8.6446,
      "step": 42816,
      "throughput": 12523.010419309921
    },
    {
      "epoch": 0.6716040094358107,
      "grad_norm": 0.093504898250103,
      "learning_rate": 4.3728309698553056e-05,
      "loss": 8.6468,
      "step": 42848,
      "throughput": 12522.775666640384
    },
    {
      "epoch": 0.6721055807647396,
      "grad_norm": 0.07958458364009857,
      "learning_rate": 4.3608106625014014e-05,
      "loss": 8.6301,
      "step": 42880,
      "throughput": 12522.867752852817
    },
    {
      "epoch": 0.6726071520936684,
      "grad_norm": 0.0886707678437233,
      "learning_rate": 4.348840417200306e-05,
      "loss": 8.6371,
      "step": 42912,
      "throughput": 12522.919741376949
    },
    {
      "epoch": 0.6731087234225973,
      "grad_norm": 0.08578263223171234,
      "learning_rate": 4.336920283317343e-05,
      "loss": 8.6488,
      "step": 42944,
      "throughput": 12522.922350456145
    },
    {
      "epoch": 0.6736102947515262,
      "grad_norm": 0.08376625925302505,
      "learning_rate": 4.325050310011183e-05,
      "loss": 8.6529,
      "step": 42976,
      "throughput": 12522.931695930905
    },
    {
      "epoch": 0.6741118660804551,
      "grad_norm": 0.11295043677091599,
      "learning_rate": 4.3132305462336306e-05,
      "loss": 8.6394,
      "step": 43008,
      "throughput": 12522.788272499705
    },
    {
      "epoch": 0.6746134374093841,
      "grad_norm": 0.08392587304115295,
      "learning_rate": 4.301461040729424e-05,
      "loss": 8.6591,
      "step": 43040,
      "throughput": 12522.27830297955
    },
    {
      "epoch": 0.675115008738313,
      "grad_norm": 0.08914206176996231,
      "learning_rate": 4.289741842036042e-05,
      "loss": 8.6304,
      "step": 43072,
      "throughput": 12522.394654787475
    },
    {
      "epoch": 0.6756165800672419,
      "grad_norm": 0.08153049647808075,
      "learning_rate": 4.2780729984834916e-05,
      "loss": 8.6209,
      "step": 43104,
      "throughput": 12522.398481053842
    },
    {
      "epoch": 0.6761181513961708,
      "grad_norm": 0.08689522743225098,
      "learning_rate": 4.266454558194122e-05,
      "loss": 8.6312,
      "step": 43136,
      "throughput": 12522.42026822721
    },
    {
      "epoch": 0.6766197227250997,
      "grad_norm": 0.08201098442077637,
      "learning_rate": 4.254886569082413e-05,
      "loss": 8.6142,
      "step": 43168,
      "throughput": 12522.51619543128
    },
    {
      "epoch": 0.6771212940540287,
      "grad_norm": 0.09345916658639908,
      "learning_rate": 4.243369078854788e-05,
      "loss": 8.6268,
      "step": 43200,
      "throughput": 12522.60465925701
    },
    {
      "epoch": 0.6776228653829576,
      "grad_norm": 0.07916395366191864,
      "learning_rate": 4.231902135009407e-05,
      "loss": 8.648,
      "step": 43232,
      "throughput": 12522.652433449026
    },
    {
      "epoch": 0.6781244367118865,
      "grad_norm": 0.08550101518630981,
      "learning_rate": 4.220485784835984e-05,
      "loss": 8.638,
      "step": 43264,
      "throughput": 12522.654022435197
    },
    {
      "epoch": 0.6786260080408154,
      "grad_norm": 0.08053749054670334,
      "learning_rate": 4.209120075415577e-05,
      "loss": 8.6118,
      "step": 43296,
      "throughput": 12522.67075424937
    },
    {
      "epoch": 0.6791275793697443,
      "grad_norm": 0.09553582966327667,
      "learning_rate": 4.197805053620411e-05,
      "loss": 8.6166,
      "step": 43328,
      "throughput": 12522.60568477544
    },
    {
      "epoch": 0.6796291506986731,
      "grad_norm": 0.08054535835981369,
      "learning_rate": 4.186540766113665e-05,
      "loss": 8.6222,
      "step": 43360,
      "throughput": 12522.514819398422
    },
    {
      "epoch": 0.680130722027602,
      "grad_norm": 0.08594219386577606,
      "learning_rate": 4.1753272593492956e-05,
      "loss": 8.6323,
      "step": 43392,
      "throughput": 12522.628488500615
    },
    {
      "epoch": 0.680632293356531,
      "grad_norm": 0.09543893486261368,
      "learning_rate": 4.1641645795718364e-05,
      "loss": 8.6415,
      "step": 43424,
      "throughput": 12522.637533434876
    },
    {
      "epoch": 0.6811338646854599,
      "grad_norm": 0.08152885735034943,
      "learning_rate": 4.153052772816217e-05,
      "loss": 8.6092,
      "step": 43456,
      "throughput": 12522.662222482037
    },
    {
      "epoch": 0.6816354360143888,
      "grad_norm": 0.0825868621468544,
      "learning_rate": 4.141991884907555e-05,
      "loss": 8.6168,
      "step": 43488,
      "throughput": 12522.777858646657
    },
    {
      "epoch": 0.6821370073433177,
      "grad_norm": 0.08323723077774048,
      "learning_rate": 4.1309819614609865e-05,
      "loss": 8.618,
      "step": 43520,
      "throughput": 12522.84369218315
    },
    {
      "epoch": 0.6826385786722466,
      "grad_norm": 0.09601942449808121,
      "learning_rate": 4.1200230478814695e-05,
      "loss": 8.6368,
      "step": 43552,
      "throughput": 12522.914836625052
    },
    {
      "epoch": 0.6831401500011756,
      "grad_norm": 0.08802493661642075,
      "learning_rate": 4.109115189363601e-05,
      "loss": 8.6315,
      "step": 43584,
      "throughput": 12522.984965508142
    },
    {
      "epoch": 0.6836417213301045,
      "grad_norm": 0.08340758085250854,
      "learning_rate": 4.0982584308914114e-05,
      "loss": 8.6037,
      "step": 43616,
      "throughput": 12522.988422248942
    },
    {
      "epoch": 0.6841432926590334,
      "grad_norm": 0.08395954966545105,
      "learning_rate": 4.0874528172382114e-05,
      "loss": 8.632,
      "step": 43648,
      "throughput": 12522.980385093575
    },
    {
      "epoch": 0.6846448639879623,
      "grad_norm": 0.08390320092439651,
      "learning_rate": 4.0766983929663835e-05,
      "loss": 8.6055,
      "step": 43680,
      "throughput": 12522.821441200436
    },
    {
      "epoch": 0.6851464353168912,
      "grad_norm": 0.08433941751718521,
      "learning_rate": 4.065995202427206e-05,
      "loss": 8.6176,
      "step": 43712,
      "throughput": 12522.842022596295
    },
    {
      "epoch": 0.6856480066458202,
      "grad_norm": 0.07991538941860199,
      "learning_rate": 4.055343289760664e-05,
      "loss": 8.6254,
      "step": 43744,
      "throughput": 12522.886808340942
    },
    {
      "epoch": 0.6861495779747491,
      "grad_norm": 0.08180645108222961,
      "learning_rate": 4.0447426988952816e-05,
      "loss": 8.5996,
      "step": 43776,
      "throughput": 12522.927032292622
    },
    {
      "epoch": 0.6866511493036779,
      "grad_norm": 0.08377785980701447,
      "learning_rate": 4.0341934735479224e-05,
      "loss": 8.6212,
      "step": 43808,
      "throughput": 12522.991394982113
    },
    {
      "epoch": 0.6871527206326068,
      "grad_norm": 0.09080639481544495,
      "learning_rate": 4.02369565722363e-05,
      "loss": 8.6394,
      "step": 43840,
      "throughput": 12523.084577149331
    },
    {
      "epoch": 0.6876542919615357,
      "grad_norm": 0.08551277220249176,
      "learning_rate": 4.013249293215422e-05,
      "loss": 8.597,
      "step": 43872,
      "throughput": 12523.135716651132
    },
    {
      "epoch": 0.6881558632904646,
      "grad_norm": 0.07742547988891602,
      "learning_rate": 4.0028544246041406e-05,
      "loss": 8.6112,
      "step": 43904,
      "throughput": 12523.209614895739
    },
    {
      "epoch": 0.6886574346193935,
      "grad_norm": 0.0808200091123581,
      "learning_rate": 3.99251109425825e-05,
      "loss": 8.6392,
      "step": 43936,
      "throughput": 12523.23536883359
    },
    {
      "epoch": 0.6891590059483225,
      "grad_norm": 0.08219944685697556,
      "learning_rate": 3.982219344833681e-05,
      "loss": 8.6295,
      "step": 43968,
      "throughput": 12523.249453557088
    },
    {
      "epoch": 0.6896605772772514,
      "grad_norm": 0.07823171466588974,
      "learning_rate": 3.971979218773634e-05,
      "loss": 8.604,
      "step": 44000,
      "throughput": 12523.173674158134
    },
    {
      "epoch": 0.6901621486061803,
      "grad_norm": 0.083199642598629,
      "learning_rate": 3.961790758308418e-05,
      "loss": 8.619,
      "step": 44032,
      "throughput": 12523.113882214477
    },
    {
      "epoch": 0.6906637199351092,
      "grad_norm": 0.08003423362970352,
      "learning_rate": 3.951654005455281e-05,
      "loss": 8.6139,
      "step": 44064,
      "throughput": 12523.2006829187
    },
    {
      "epoch": 0.6911652912640381,
      "grad_norm": 0.08449212461709976,
      "learning_rate": 3.9415690020182154e-05,
      "loss": 8.6242,
      "step": 44096,
      "throughput": 12523.240405619921
    },
    {
      "epoch": 0.6916668625929671,
      "grad_norm": 0.08138830214738846,
      "learning_rate": 3.9315357895878066e-05,
      "loss": 8.6153,
      "step": 44128,
      "throughput": 12523.267847555973
    },
    {
      "epoch": 0.692168433921896,
      "grad_norm": 0.08120720088481903,
      "learning_rate": 3.921554409541053e-05,
      "loss": 8.6142,
      "step": 44160,
      "throughput": 12523.359212932843
    },
    {
      "epoch": 0.6926700052508249,
      "grad_norm": 0.08653061091899872,
      "learning_rate": 3.911624903041198e-05,
      "loss": 8.6334,
      "step": 44192,
      "throughput": 12523.44384977313
    },
    {
      "epoch": 0.6931715765797538,
      "grad_norm": 0.08123726397752762,
      "learning_rate": 3.9017473110375525e-05,
      "loss": 8.6243,
      "step": 44224,
      "throughput": 12523.48548232217
    },
    {
      "epoch": 0.6936731479086826,
      "grad_norm": 0.0898829996585846,
      "learning_rate": 3.891921674265336e-05,
      "loss": 8.6052,
      "step": 44256,
      "throughput": 12523.506511880501
    },
    {
      "epoch": 0.6941747192376115,
      "grad_norm": 0.08948096632957458,
      "learning_rate": 3.8821480332455024e-05,
      "loss": 8.6146,
      "step": 44288,
      "throughput": 12523.524708510986
    },
    {
      "epoch": 0.6946762905665405,
      "grad_norm": 0.0792609453201294,
      "learning_rate": 3.87242642828458e-05,
      "loss": 8.6346,
      "step": 44320,
      "throughput": 12523.469758766978
    },
    {
      "epoch": 0.6951778618954694,
      "grad_norm": 0.08997055888175964,
      "learning_rate": 3.862756899474493e-05,
      "loss": 8.6231,
      "step": 44352,
      "throughput": 12523.368936179342
    },
    {
      "epoch": 0.6956794332243983,
      "grad_norm": 0.08132538944482803,
      "learning_rate": 3.853139486692408e-05,
      "loss": 8.5962,
      "step": 44384,
      "throughput": 12523.454513690915
    },
    {
      "epoch": 0.6961810045533272,
      "grad_norm": 0.08895617723464966,
      "learning_rate": 3.843574229600565e-05,
      "loss": 8.6124,
      "step": 44416,
      "throughput": 12523.481205386239
    },
    {
      "epoch": 0.6966825758822561,
      "grad_norm": 0.07943489402532578,
      "learning_rate": 3.834061167646112e-05,
      "loss": 8.6234,
      "step": 44448,
      "throughput": 12523.50126168083
    },
    {
      "epoch": 0.697184147211185,
      "grad_norm": 0.0908041000366211,
      "learning_rate": 3.8246003400609424e-05,
      "loss": 8.6159,
      "step": 44480,
      "throughput": 12523.608974555584
    },
    {
      "epoch": 0.697685718540114,
      "grad_norm": 0.08159472048282623,
      "learning_rate": 3.81519178586154e-05,
      "loss": 8.6132,
      "step": 44512,
      "throughput": 12523.664315294256
    },
    {
      "epoch": 0.6981872898690429,
      "grad_norm": 0.08212780207395554,
      "learning_rate": 3.805835543848809e-05,
      "loss": 8.6418,
      "step": 44544,
      "throughput": 12523.696854249029
    },
    {
      "epoch": 0.6986888611979718,
      "grad_norm": 0.08898486942052841,
      "learning_rate": 3.796531652607919e-05,
      "loss": 8.6337,
      "step": 44576,
      "throughput": 12523.767501252678
    },
    {
      "epoch": 0.6991904325269007,
      "grad_norm": 0.07757981866598129,
      "learning_rate": 3.7872801505081434e-05,
      "loss": 8.64,
      "step": 44608,
      "throughput": 12523.763177558329
    },
    {
      "epoch": 0.6996920038558296,
      "grad_norm": 0.09902704507112503,
      "learning_rate": 3.778081075702709e-05,
      "loss": 8.6096,
      "step": 44640,
      "throughput": 12523.724903622915
    },
    {
      "epoch": 0.7001935751847586,
      "grad_norm": 0.09403155744075775,
      "learning_rate": 3.7689344661286264e-05,
      "loss": 8.6372,
      "step": 44672,
      "throughput": 12523.62412386298
    },
    {
      "epoch": 0.7006951465136874,
      "grad_norm": 0.07636962831020355,
      "learning_rate": 3.759840359506536e-05,
      "loss": 8.6089,
      "step": 44704,
      "throughput": 12523.68024082709
    },
    {
      "epoch": 0.7011967178426163,
      "grad_norm": 0.07870710641145706,
      "learning_rate": 3.750798793340565e-05,
      "loss": 8.6243,
      "step": 44736,
      "throughput": 12523.730161145108
    },
    {
      "epoch": 0.7016982891715452,
      "grad_norm": 0.09148543328046799,
      "learning_rate": 3.7418098049181573e-05,
      "loss": 8.635,
      "step": 44768,
      "throughput": 12523.798281632704
    },
    {
      "epoch": 0.7021998605004741,
      "grad_norm": 0.10121971368789673,
      "learning_rate": 3.732873431309929e-05,
      "loss": 8.6121,
      "step": 44800,
      "throughput": 12523.82858597023
    },
    {
      "epoch": 0.702701431829403,
      "grad_norm": 0.09453009814023972,
      "learning_rate": 3.7239897093695106e-05,
      "loss": 8.623,
      "step": 44832,
      "throughput": 12523.891706175975
    },
    {
      "epoch": 0.703203003158332,
      "grad_norm": 0.08002530783414841,
      "learning_rate": 3.715158675733396e-05,
      "loss": 8.6228,
      "step": 44864,
      "throughput": 12523.936516927986
    },
    {
      "epoch": 0.7037045744872609,
      "grad_norm": 0.08112312108278275,
      "learning_rate": 3.706380366820796e-05,
      "loss": 8.6189,
      "step": 44896,
      "throughput": 12523.96820859386
    },
    {
      "epoch": 0.7042061458161898,
      "grad_norm": 0.08807221800088882,
      "learning_rate": 3.6976548188334834e-05,
      "loss": 8.6011,
      "step": 44928,
      "throughput": 12524.00821230705
    },
    {
      "epoch": 0.7047077171451187,
      "grad_norm": 0.07938039302825928,
      "learning_rate": 3.688982067755642e-05,
      "loss": 8.5999,
      "step": 44960,
      "throughput": 12524.02571071888
    },
    {
      "epoch": 0.7052092884740476,
      "grad_norm": 0.08693437278270721,
      "learning_rate": 3.680362149353724e-05,
      "loss": 8.6299,
      "step": 44992,
      "throughput": 12523.882489605088
    },
    {
      "epoch": 0.7057108598029765,
      "grad_norm": 0.08155670017004013,
      "learning_rate": 3.671795099176297e-05,
      "loss": 8.6078,
      "step": 45024,
      "throughput": 12523.846569313075
    },
    {
      "epoch": 0.7062124311319055,
      "grad_norm": 0.08850400149822235,
      "learning_rate": 3.6632809525539055e-05,
      "loss": 8.6215,
      "step": 45056,
      "throughput": 12523.903709696035
    },
    {
      "epoch": 0.7067140024608344,
      "grad_norm": 0.08342622220516205,
      "learning_rate": 3.6548197445989086e-05,
      "loss": 8.6262,
      "step": 45088,
      "throughput": 12523.479201287208
    },
    {
      "epoch": 0.7072155737897633,
      "grad_norm": 0.07904572039842606,
      "learning_rate": 3.6464115102053596e-05,
      "loss": 8.6112,
      "step": 45120,
      "throughput": 12523.491504430669
    },
    {
      "epoch": 0.7077171451186921,
      "grad_norm": 0.08593115955591202,
      "learning_rate": 3.6380562840488376e-05,
      "loss": 8.6333,
      "step": 45152,
      "throughput": 12523.580804834894
    },
    {
      "epoch": 0.708218716447621,
      "grad_norm": 0.07867827266454697,
      "learning_rate": 3.629754100586323e-05,
      "loss": 8.6095,
      "step": 45184,
      "throughput": 12523.619096618833
    },
    {
      "epoch": 0.7087202877765499,
      "grad_norm": 0.0866331160068512,
      "learning_rate": 3.6215049940560433e-05,
      "loss": 8.6319,
      "step": 45216,
      "throughput": 12523.682169342635
    },
    {
      "epoch": 0.7092218591054789,
      "grad_norm": 0.089105524122715,
      "learning_rate": 3.613308998477339e-05,
      "loss": 8.5836,
      "step": 45248,
      "throughput": 12523.703755312761
    },
    {
      "epoch": 0.7097234304344078,
      "grad_norm": 0.07944278419017792,
      "learning_rate": 3.605166147650517e-05,
      "loss": 8.611,
      "step": 45280,
      "throughput": 12523.690922392816
    },
    {
      "epoch": 0.7102250017633367,
      "grad_norm": 0.07689522206783295,
      "learning_rate": 3.597076475156726e-05,
      "loss": 8.633,
      "step": 45312,
      "throughput": 12523.659284638932
    },
    {
      "epoch": 0.7107265730922656,
      "grad_norm": 0.08848965167999268,
      "learning_rate": 3.589040014357791e-05,
      "loss": 8.6344,
      "step": 45344,
      "throughput": 12523.5316571319
    },
    {
      "epoch": 0.7112281444211945,
      "grad_norm": 0.0853632241487503,
      "learning_rate": 3.581056798396105e-05,
      "loss": 8.6217,
      "step": 45376,
      "throughput": 12523.58453266287
    },
    {
      "epoch": 0.7117297157501234,
      "grad_norm": 0.09939948469400406,
      "learning_rate": 3.57312686019447e-05,
      "loss": 8.6062,
      "step": 45408,
      "throughput": 12523.692480312704
    },
    {
      "epoch": 0.7122312870790524,
      "grad_norm": 0.08067552000284195,
      "learning_rate": 3.565250232455983e-05,
      "loss": 8.6223,
      "step": 45440,
      "throughput": 12523.672272983184
    },
    {
      "epoch": 0.7127328584079813,
      "grad_norm": 0.08435127884149551,
      "learning_rate": 3.55742694766387e-05,
      "loss": 8.61,
      "step": 45472,
      "throughput": 12523.780598951442
    },
    {
      "epoch": 0.7132344297369102,
      "grad_norm": 0.100026935338974,
      "learning_rate": 3.549657038081386e-05,
      "loss": 8.616,
      "step": 45504,
      "throughput": 12523.806392368822
    },
    {
      "epoch": 0.7137360010658391,
      "grad_norm": 0.08343475311994553,
      "learning_rate": 3.5419405357516624e-05,
      "loss": 8.6031,
      "step": 45536,
      "throughput": 12523.875363752408
    },
    {
      "epoch": 0.714237572394768,
      "grad_norm": 0.08745314925909042,
      "learning_rate": 3.534277472497574e-05,
      "loss": 8.6085,
      "step": 45568,
      "throughput": 12523.903771054958
    },
    {
      "epoch": 0.7147391437236968,
      "grad_norm": 0.0844145342707634,
      "learning_rate": 3.52666787992162e-05,
      "loss": 8.6286,
      "step": 45600,
      "throughput": 12523.90038549103
    },
    {
      "epoch": 0.7152407150526258,
      "grad_norm": 0.08013051003217697,
      "learning_rate": 3.519111789405779e-05,
      "loss": 8.6428,
      "step": 45632,
      "throughput": 12523.89713389975
    },
    {
      "epoch": 0.7157422863815547,
      "grad_norm": 0.08158287405967712,
      "learning_rate": 3.5116092321113936e-05,
      "loss": 8.6173,
      "step": 45664,
      "throughput": 12523.80169957661
    },
    {
      "epoch": 0.7162438577104836,
      "grad_norm": 0.08240120112895966,
      "learning_rate": 3.504160238979032e-05,
      "loss": 8.5966,
      "step": 45696,
      "throughput": 12523.83175255946
    },
    {
      "epoch": 0.7167454290394125,
      "grad_norm": 0.08658699691295624,
      "learning_rate": 3.496764840728361e-05,
      "loss": 8.5956,
      "step": 45728,
      "throughput": 12523.899073897346
    },
    {
      "epoch": 0.7172470003683414,
      "grad_norm": 0.12535245716571808,
      "learning_rate": 3.489423067858027e-05,
      "loss": 8.6085,
      "step": 45760,
      "throughput": 12523.970433581719
    },
    {
      "epoch": 0.7177485716972704,
      "grad_norm": 0.09029490500688553,
      "learning_rate": 3.4821349506455255e-05,
      "loss": 8.6238,
      "step": 45792,
      "throughput": 12523.995812741854
    },
    {
      "epoch": 0.7182501430261993,
      "grad_norm": 0.08282382786273956,
      "learning_rate": 3.47490051914707e-05,
      "loss": 8.5885,
      "step": 45824,
      "throughput": 12524.055137789073
    },
    {
      "epoch": 0.7187517143551282,
      "grad_norm": 0.08247379213571548,
      "learning_rate": 3.4677198031974784e-05,
      "loss": 8.6082,
      "step": 45856,
      "throughput": 12524.080343281556
    },
    {
      "epoch": 0.7192532856840571,
      "grad_norm": 0.08638311922550201,
      "learning_rate": 3.4605928324100444e-05,
      "loss": 8.6297,
      "step": 45888,
      "throughput": 12524.10711505808
    },
    {
      "epoch": 0.719754857012986,
      "grad_norm": 0.09089305996894836,
      "learning_rate": 3.45351963617642e-05,
      "loss": 8.5998,
      "step": 45920,
      "throughput": 12524.126242425193
    },
    {
      "epoch": 0.720256428341915,
      "grad_norm": 0.11494186520576477,
      "learning_rate": 3.446500243666481e-05,
      "loss": 8.6295,
      "step": 45952,
      "throughput": 12524.163220188617
    },
    {
      "epoch": 0.7207579996708439,
      "grad_norm": 0.0827856957912445,
      "learning_rate": 3.439534683828228e-05,
      "loss": 8.6133,
      "step": 45984,
      "throughput": 12524.0293047988
    },
    {
      "epoch": 0.7212595709997727,
      "grad_norm": 0.08527000993490219,
      "learning_rate": 3.4326229853876475e-05,
      "loss": 8.6231,
      "step": 46016,
      "throughput": 12524.019004494541
    },
    {
      "epoch": 0.7217611423287016,
      "grad_norm": 0.09022502601146698,
      "learning_rate": 3.425765176848607e-05,
      "loss": 8.6127,
      "step": 46048,
      "throughput": 12524.058892874886
    },
    {
      "epoch": 0.7222627136576305,
      "grad_norm": 0.08133542537689209,
      "learning_rate": 3.418961286492728e-05,
      "loss": 8.6219,
      "step": 46080,
      "throughput": 12524.13604167911
    },
    {
      "epoch": 0.7227642849865594,
      "grad_norm": 0.08808305859565735,
      "learning_rate": 3.412211342379273e-05,
      "loss": 8.6263,
      "step": 46112,
      "throughput": 12524.171982740596
    },
    {
      "epoch": 0.7232658563154883,
      "grad_norm": 0.0857241079211235,
      "learning_rate": 3.405515372345033e-05,
      "loss": 8.6126,
      "step": 46144,
      "throughput": 12524.256170090268
    },
    {
      "epoch": 0.7237674276444173,
      "grad_norm": 0.08425363153219223,
      "learning_rate": 3.398873404004209e-05,
      "loss": 8.6241,
      "step": 46176,
      "throughput": 12524.295988496866
    },
    {
      "epoch": 0.7242689989733462,
      "grad_norm": 0.10848300904035568,
      "learning_rate": 3.392285464748298e-05,
      "loss": 8.6233,
      "step": 46208,
      "throughput": 12524.335729369192
    },
    {
      "epoch": 0.7247705703022751,
      "grad_norm": 0.10150907933712006,
      "learning_rate": 3.385751581745979e-05,
      "loss": 8.6177,
      "step": 46240,
      "throughput": 12524.332371447043
    },
    {
      "epoch": 0.725272141631204,
      "grad_norm": 0.0821448266506195,
      "learning_rate": 3.379271781943007e-05,
      "loss": 8.5999,
      "step": 46272,
      "throughput": 12524.365300158324
    },
    {
      "epoch": 0.7257737129601329,
      "grad_norm": 0.0838143527507782,
      "learning_rate": 3.372846092062095e-05,
      "loss": 8.6133,
      "step": 46304,
      "throughput": 12524.275482011071
    },
    {
      "epoch": 0.7262752842890619,
      "grad_norm": 0.07988490909337997,
      "learning_rate": 3.366474538602806e-05,
      "loss": 8.6177,
      "step": 46336,
      "throughput": 12524.206904659482
    },
    {
      "epoch": 0.7267768556179908,
      "grad_norm": 0.0871606096625328,
      "learning_rate": 3.3601571478414455e-05,
      "loss": 8.5967,
      "step": 46368,
      "throughput": 12524.257923440766
    },
    {
      "epoch": 0.7272784269469197,
      "grad_norm": 0.08344225585460663,
      "learning_rate": 3.3538939458309556e-05,
      "loss": 8.6105,
      "step": 46400,
      "throughput": 12524.36591856638
    },
    {
      "epoch": 0.7277799982758486,
      "grad_norm": 0.08372768014669418,
      "learning_rate": 3.347684958400795e-05,
      "loss": 8.5999,
      "step": 46432,
      "throughput": 12524.353684013731
    },
    {
      "epoch": 0.7282815696047774,
      "grad_norm": 0.07885193079710007,
      "learning_rate": 3.341530211156847e-05,
      "loss": 8.6069,
      "step": 46464,
      "throughput": 12524.435881493126
    },
    {
      "epoch": 0.7287831409337063,
      "grad_norm": 0.07780560851097107,
      "learning_rate": 3.33542972948131e-05,
      "loss": 8.6087,
      "step": 46496,
      "throughput": 12524.477125348127
    },
    {
      "epoch": 0.7292847122626352,
      "grad_norm": 0.07899513840675354,
      "learning_rate": 3.329383538532587e-05,
      "loss": 8.6141,
      "step": 46528,
      "throughput": 12524.51234135451
    },
    {
      "epoch": 0.7297862835915642,
      "grad_norm": 0.07994463294744492,
      "learning_rate": 3.323391663245188e-05,
      "loss": 8.607,
      "step": 46560,
      "throughput": 12524.558991803693
    },
    {
      "epoch": 0.7302878549204931,
      "grad_norm": 0.08503729104995728,
      "learning_rate": 3.3174541283296225e-05,
      "loss": 8.6031,
      "step": 46592,
      "throughput": 12524.564242509407
    },
    {
      "epoch": 0.730789426249422,
      "grad_norm": 0.07991162687540054,
      "learning_rate": 3.311570958272303e-05,
      "loss": 8.5908,
      "step": 46624,
      "throughput": 12524.528518215346
    },
    {
      "epoch": 0.7312909975783509,
      "grad_norm": 0.10354507714509964,
      "learning_rate": 3.305742177335444e-05,
      "loss": 8.5972,
      "step": 46656,
      "throughput": 12524.490738426928
    },
    {
      "epoch": 0.7317925689072798,
      "grad_norm": 0.08643455803394318,
      "learning_rate": 3.29996780955695e-05,
      "loss": 8.5962,
      "step": 46688,
      "throughput": 12524.459445706503
    },
    {
      "epoch": 0.7322941402362088,
      "grad_norm": 0.08863314241170883,
      "learning_rate": 3.294247878750333e-05,
      "loss": 8.6137,
      "step": 46720,
      "throughput": 12524.56493807568
    },
    {
      "epoch": 0.7327957115651377,
      "grad_norm": 0.0950147733092308,
      "learning_rate": 3.288582408504603e-05,
      "loss": 8.5964,
      "step": 46752,
      "throughput": 12524.634684763916
    },
    {
      "epoch": 0.7332972828940666,
      "grad_norm": 0.08327952027320862,
      "learning_rate": 3.2829714221841805e-05,
      "loss": 8.6353,
      "step": 46784,
      "throughput": 12524.63104910303
    },
    {
      "epoch": 0.7337988542229955,
      "grad_norm": 0.0803689956665039,
      "learning_rate": 3.2774149429287854e-05,
      "loss": 8.6109,
      "step": 46816,
      "throughput": 12524.673779338787
    },
    {
      "epoch": 0.7343004255519244,
      "grad_norm": 0.09594424813985825,
      "learning_rate": 3.271912993653357e-05,
      "loss": 8.6212,
      "step": 46848,
      "throughput": 12524.734358163496
    },
    {
      "epoch": 0.7348019968808533,
      "grad_norm": 0.0844351202249527,
      "learning_rate": 3.266465597047948e-05,
      "loss": 8.6105,
      "step": 46880,
      "throughput": 12524.771159082025
    },
    {
      "epoch": 0.7353035682097822,
      "grad_norm": 0.08568018674850464,
      "learning_rate": 3.261072775577641e-05,
      "loss": 8.614,
      "step": 46912,
      "throughput": 12524.792275334812
    },
    {
      "epoch": 0.7358051395387111,
      "grad_norm": 0.0839168131351471,
      "learning_rate": 3.255734551482446e-05,
      "loss": 8.598,
      "step": 46944,
      "throughput": 12524.83965946456
    },
    {
      "epoch": 0.73630671086764,
      "grad_norm": 0.08393090963363647,
      "learning_rate": 3.2504509467772154e-05,
      "loss": 8.604,
      "step": 46976,
      "throughput": 12524.73058877774
    },
    {
      "epoch": 0.7368082821965689,
      "grad_norm": 0.08217764645814896,
      "learning_rate": 3.24522198325155e-05,
      "loss": 8.597,
      "step": 47008,
      "throughput": 12524.642241833213
    },
    {
      "epoch": 0.7373098535254978,
      "grad_norm": 0.0889383926987648,
      "learning_rate": 3.2400476824697126e-05,
      "loss": 8.5989,
      "step": 47040,
      "throughput": 12524.745536910536
    },
    {
      "epoch": 0.7378114248544267,
      "grad_norm": 0.09606282413005829,
      "learning_rate": 3.234928065770532e-05,
      "loss": 8.6268,
      "step": 47072,
      "throughput": 12524.822726977116
    },
    {
      "epoch": 0.7383129961833557,
      "grad_norm": 0.08575651049613953,
      "learning_rate": 3.2298631542673254e-05,
      "loss": 8.6118,
      "step": 47104,
      "throughput": 12524.841677926874
    },
    {
      "epoch": 0.7388145675122846,
      "grad_norm": 0.08529612421989441,
      "learning_rate": 3.2248529688478036e-05,
      "loss": 8.6369,
      "step": 47136,
      "throughput": 12524.46294525117
    },
    {
      "epoch": 0.7393161388412135,
      "grad_norm": 0.09575065970420837,
      "learning_rate": 3.2198975301739834e-05,
      "loss": 8.6034,
      "step": 47168,
      "throughput": 12524.506369958186
    },
    {
      "epoch": 0.7398177101701424,
      "grad_norm": 0.07998523861169815,
      "learning_rate": 3.214996858682109e-05,
      "loss": 8.6062,
      "step": 47200,
      "throughput": 12524.528555009028
    },
    {
      "epoch": 0.7403192814990713,
      "grad_norm": 0.08077394962310791,
      "learning_rate": 3.210150974582565e-05,
      "loss": 8.6244,
      "step": 47232,
      "throughput": 12524.578044882532
    },
    {
      "epoch": 0.7408208528280003,
      "grad_norm": 0.08935201168060303,
      "learning_rate": 3.205359897859793e-05,
      "loss": 8.6,
      "step": 47264,
      "throughput": 12524.58983487304
    },
    {
      "epoch": 0.7413224241569292,
      "grad_norm": 0.08008461445569992,
      "learning_rate": 3.2006236482722034e-05,
      "loss": 8.5788,
      "step": 47296,
      "throughput": 12524.508451575897
    },
    {
      "epoch": 0.7418239954858581,
      "grad_norm": 0.08044737577438354,
      "learning_rate": 3.195942245352108e-05,
      "loss": 8.6208,
      "step": 47328,
      "throughput": 12524.453576738908
    },
    {
      "epoch": 0.7423255668147869,
      "grad_norm": 0.08467201143503189,
      "learning_rate": 3.191315708405626e-05,
      "loss": 8.6039,
      "step": 47360,
      "throughput": 12524.503867721372
    },
    {
      "epoch": 0.7428271381437158,
      "grad_norm": 0.09131542593240738,
      "learning_rate": 3.1867440565126066e-05,
      "loss": 8.6309,
      "step": 47392,
      "throughput": 12524.608496881645
    },
    {
      "epoch": 0.7433287094726447,
      "grad_norm": 0.08028768748044968,
      "learning_rate": 3.182227308526557e-05,
      "loss": 8.5986,
      "step": 47424,
      "throughput": 12524.634275872028
    },
    {
      "epoch": 0.7438302808015737,
      "grad_norm": 0.07837190479040146,
      "learning_rate": 3.17776548307456e-05,
      "loss": 8.6226,
      "step": 47456,
      "throughput": 12524.676327812673
    },
    {
      "epoch": 0.7443318521305026,
      "grad_norm": 0.08891215920448303,
      "learning_rate": 3.173358598557196e-05,
      "loss": 8.6002,
      "step": 47488,
      "throughput": 12524.69703560755
    },
    {
      "epoch": 0.7448334234594315,
      "grad_norm": 0.08777833729982376,
      "learning_rate": 3.169006673148473e-05,
      "loss": 8.5901,
      "step": 47520,
      "throughput": 12524.737627397319
    },
    {
      "epoch": 0.7453349947883604,
      "grad_norm": 0.08668994158506393,
      "learning_rate": 3.1647097247957385e-05,
      "loss": 8.6023,
      "step": 47552,
      "throughput": 12524.751668629939
    },
    {
      "epoch": 0.7458365661172893,
      "grad_norm": 0.09065524488687515,
      "learning_rate": 3.160467771219624e-05,
      "loss": 8.6088,
      "step": 47584,
      "throughput": 12524.784184177812
    },
    {
      "epoch": 0.7463381374462182,
      "grad_norm": 0.09433916956186295,
      "learning_rate": 3.1562808299139596e-05,
      "loss": 8.6203,
      "step": 47616,
      "throughput": 12524.793844792443
    },
    {
      "epoch": 0.7468397087751472,
      "grad_norm": 0.0789889246225357,
      "learning_rate": 3.1521489181457005e-05,
      "loss": 8.6115,
      "step": 47648,
      "throughput": 12524.728785991274
    },
    {
      "epoch": 0.7473412801040761,
      "grad_norm": 0.08276744186878204,
      "learning_rate": 3.1480720529548654e-05,
      "loss": 8.6034,
      "step": 47680,
      "throughput": 12524.688017450091
    },
    {
      "epoch": 0.747842851433005,
      "grad_norm": 0.07992296665906906,
      "learning_rate": 3.1440502511544566e-05,
      "loss": 8.6031,
      "step": 47712,
      "throughput": 12524.793787131397
    },
    {
      "epoch": 0.7483444227619339,
      "grad_norm": 0.08115531504154205,
      "learning_rate": 3.1400835293303984e-05,
      "loss": 8.6163,
      "step": 47744,
      "throughput": 12524.866111468855
    },
    {
      "epoch": 0.7488459940908628,
      "grad_norm": 0.08725763857364655,
      "learning_rate": 3.136171903841463e-05,
      "loss": 8.6209,
      "step": 47776,
      "throughput": 12524.861843279761
    },
    {
      "epoch": 0.7493475654197916,
      "grad_norm": 0.08313852548599243,
      "learning_rate": 3.1323153908192057e-05,
      "loss": 8.6085,
      "step": 47808,
      "throughput": 12524.903077268247
    },
    {
      "epoch": 0.7498491367487206,
      "grad_norm": 0.0964965894818306,
      "learning_rate": 3.128514006167897e-05,
      "loss": 8.6233,
      "step": 47840,
      "throughput": 12524.910717496554
    },
    {
      "epoch": 0.7503507080776495,
      "grad_norm": 0.08035736531019211,
      "learning_rate": 3.124767765564459e-05,
      "loss": 8.6016,
      "step": 47872,
      "throughput": 12524.97226222231
    },
    {
      "epoch": 0.7508522794065784,
      "grad_norm": 0.07964587211608887,
      "learning_rate": 3.121076684458398e-05,
      "loss": 8.6102,
      "step": 47904,
      "throughput": 12525.004078127577
    },
    {
      "epoch": 0.7513538507355073,
      "grad_norm": 0.08621484786272049,
      "learning_rate": 3.1174407780717433e-05,
      "loss": 8.6079,
      "step": 47936,
      "throughput": 12525.004016543815
    },
    {
      "epoch": 0.7518554220644362,
      "grad_norm": 0.08485983312129974,
      "learning_rate": 3.113860061398985e-05,
      "loss": 8.5892,
      "step": 47968,
      "throughput": 12524.956430873583
    },
    {
      "epoch": 0.7523569933933651,
      "grad_norm": 0.09468486905097961,
      "learning_rate": 3.110334549207009e-05,
      "loss": 8.6019,
      "step": 48000,
      "throughput": 12524.856626194747
    },
    {
      "epoch": 0.7528585647222941,
      "grad_norm": 0.09153393656015396,
      "learning_rate": 3.1068642560350375e-05,
      "loss": 8.588,
      "step": 48032,
      "throughput": 12524.957839340766
    },
    {
      "epoch": 0.753360136051223,
      "grad_norm": 0.08684483170509338,
      "learning_rate": 3.103449196194569e-05,
      "loss": 8.6055,
      "step": 48064,
      "throughput": 12525.02156365344
    },
    {
      "epoch": 0.7538617073801519,
      "grad_norm": 0.08171502500772476,
      "learning_rate": 3.1000893837693234e-05,
      "loss": 8.6261,
      "step": 48096,
      "throughput": 12525.017951077596
    },
    {
      "epoch": 0.7543632787090808,
      "grad_norm": 0.08191504329442978,
      "learning_rate": 3.096784832615175e-05,
      "loss": 8.5741,
      "step": 48128,
      "throughput": 12525.075499919187
    },
    {
      "epoch": 0.7548648500380097,
      "grad_norm": 0.08089284598827362,
      "learning_rate": 3.093535556360101e-05,
      "loss": 8.6271,
      "step": 48160,
      "throughput": 12525.120487941642
    },
    {
      "epoch": 0.7553664213669387,
      "grad_norm": 0.08522171527147293,
      "learning_rate": 3.0903415684041285e-05,
      "loss": 8.6076,
      "step": 48192,
      "throughput": 12525.145324905377
    },
    {
      "epoch": 0.7558679926958676,
      "grad_norm": 0.08962146192789078,
      "learning_rate": 3.087202881919273e-05,
      "loss": 8.6072,
      "step": 48224,
      "throughput": 12525.163169395968
    },
    {
      "epoch": 0.7563695640247964,
      "grad_norm": 0.08281126618385315,
      "learning_rate": 3.084119509849488e-05,
      "loss": 8.6067,
      "step": 48256,
      "throughput": 12525.192516711173
    },
    {
      "epoch": 0.7568711353537253,
      "grad_norm": 0.08468001335859299,
      "learning_rate": 3.081091464910606e-05,
      "loss": 8.6145,
      "step": 48288,
      "throughput": 12525.122473990757
    },
    {
      "epoch": 0.7573727066826542,
      "grad_norm": 0.08133754879236221,
      "learning_rate": 3.078118759590295e-05,
      "loss": 8.6025,
      "step": 48320,
      "throughput": 12525.068061495733
    },
    {
      "epoch": 0.7578742780115831,
      "grad_norm": 0.08273835480213165,
      "learning_rate": 3.075201406148001e-05,
      "loss": 8.5968,
      "step": 48352,
      "throughput": 12525.133415826404
    },
    {
      "epoch": 0.758375849340512,
      "grad_norm": 0.0818416029214859,
      "learning_rate": 3.072339416614899e-05,
      "loss": 8.6016,
      "step": 48384,
      "throughput": 12525.235286059295
    },
    {
      "epoch": 0.758877420669441,
      "grad_norm": 0.09383910149335861,
      "learning_rate": 3.069532802793839e-05,
      "loss": 8.5897,
      "step": 48416,
      "throughput": 12525.264290396648
    },
    {
      "epoch": 0.7593789919983699,
      "grad_norm": 0.09501391649246216,
      "learning_rate": 3.066781576259309e-05,
      "loss": 8.6211,
      "step": 48448,
      "throughput": 12525.27034147465
    },
    {
      "epoch": 0.7598805633272988,
      "grad_norm": 0.08860146254301071,
      "learning_rate": 3.0640857483573714e-05,
      "loss": 8.6066,
      "step": 48480,
      "throughput": 12525.306394704627
    },
    {
      "epoch": 0.7603821346562277,
      "grad_norm": 0.08503951877355576,
      "learning_rate": 3.061445330205631e-05,
      "loss": 8.5952,
      "step": 48512,
      "throughput": 12525.334264765412
    },
    {
      "epoch": 0.7608837059851566,
      "grad_norm": 0.08074294030666351,
      "learning_rate": 3.0588603326931796e-05,
      "loss": 8.6088,
      "step": 48544,
      "throughput": 12525.34869462047
    },
    {
      "epoch": 0.7613852773140856,
      "grad_norm": 0.08348783105611801,
      "learning_rate": 3.056330766480554e-05,
      "loss": 8.5983,
      "step": 48576,
      "throughput": 12525.405007836334
    },
    {
      "epoch": 0.7618868486430145,
      "grad_norm": 0.08301907032728195,
      "learning_rate": 3.053856641999694e-05,
      "loss": 8.5994,
      "step": 48608,
      "throughput": 12525.346566915949
    },
    {
      "epoch": 0.7623884199719434,
      "grad_norm": 0.08006524294614792,
      "learning_rate": 3.0514379694538932e-05,
      "loss": 8.5948,
      "step": 48640,
      "throughput": 12525.320557107438
    },
    {
      "epoch": 0.7628899913008723,
      "grad_norm": 0.08431556075811386,
      "learning_rate": 3.0490747588177684e-05,
      "loss": 8.6174,
      "step": 48672,
      "throughput": 12525.308854529463
    },
    {
      "epoch": 0.7633915626298011,
      "grad_norm": 0.08287610858678818,
      "learning_rate": 3.0467670198372044e-05,
      "loss": 8.6072,
      "step": 48704,
      "throughput": 12525.408934162428
    },
    {
      "epoch": 0.76389313395873,
      "grad_norm": 0.16203758120536804,
      "learning_rate": 3.044514762029326e-05,
      "loss": 8.5923,
      "step": 48736,
      "throughput": 12525.47215639991
    },
    {
      "epoch": 0.764394705287659,
      "grad_norm": 0.08479800820350647,
      "learning_rate": 3.0423179946824494e-05,
      "loss": 8.6132,
      "step": 48768,
      "throughput": 12525.474404700622
    },
    {
      "epoch": 0.7648962766165879,
      "grad_norm": 0.08963710814714432,
      "learning_rate": 3.040176726856049e-05,
      "loss": 8.6053,
      "step": 48800,
      "throughput": 12525.493884954105
    },
    {
      "epoch": 0.7653978479455168,
      "grad_norm": 0.09195411205291748,
      "learning_rate": 3.0380909673807205e-05,
      "loss": 8.5854,
      "step": 48832,
      "throughput": 12525.534557200805
    },
    {
      "epoch": 0.7658994192744457,
      "grad_norm": 0.08387543261051178,
      "learning_rate": 3.0360607248581437e-05,
      "loss": 8.6059,
      "step": 48864,
      "throughput": 12525.53278414358
    },
    {
      "epoch": 0.7664009906033746,
      "grad_norm": 0.08522969484329224,
      "learning_rate": 3.0340860076610427e-05,
      "loss": 8.5936,
      "step": 48896,
      "throughput": 12525.592779349634
    },
    {
      "epoch": 0.7669025619323036,
      "grad_norm": 0.08993417024612427,
      "learning_rate": 3.0321668239331582e-05,
      "loss": 8.6044,
      "step": 48928,
      "throughput": 12525.60480003467
    },
    {
      "epoch": 0.7674041332612325,
      "grad_norm": 0.08827044814825058,
      "learning_rate": 3.030303181589207e-05,
      "loss": 8.5837,
      "step": 48960,
      "throughput": 12525.509729326744
    },
    {
      "epoch": 0.7679057045901614,
      "grad_norm": 0.08753187954425812,
      "learning_rate": 3.0284950883148598e-05,
      "loss": 8.5909,
      "step": 48992,
      "throughput": 12525.453731483165
    },
    {
      "epoch": 0.7684072759190903,
      "grad_norm": 0.0935630276799202,
      "learning_rate": 3.026742551566696e-05,
      "loss": 8.5953,
      "step": 49024,
      "throughput": 12525.55475574127
    },
    {
      "epoch": 0.7689088472480192,
      "grad_norm": 0.09898614138364792,
      "learning_rate": 3.0250455785721827e-05,
      "loss": 8.6012,
      "step": 49056,
      "throughput": 12525.620507141155
    },
    {
      "epoch": 0.7694104185769481,
      "grad_norm": 0.08655832707881927,
      "learning_rate": 3.023404176329643e-05,
      "loss": 8.6057,
      "step": 49088,
      "throughput": 12525.62817059881
    },
    {
      "epoch": 0.7699119899058771,
      "grad_norm": 0.07959497720003128,
      "learning_rate": 3.021818351608223e-05,
      "loss": 8.5975,
      "step": 49120,
      "throughput": 12525.676969174108
    },
    {
      "epoch": 0.7704135612348059,
      "grad_norm": 0.09171836823225021,
      "learning_rate": 3.0202881109478676e-05,
      "loss": 8.6115,
      "step": 49152,
      "throughput": 12525.69393438
    },
    {
      "epoch": 0.7709151325637348,
      "grad_norm": 0.07986843585968018,
      "learning_rate": 3.0188134606592958e-05,
      "loss": 8.6097,
      "step": 49184,
      "throughput": 12525.331320803678
    },
    {
      "epoch": 0.7714167038926637,
      "grad_norm": 0.0822770819067955,
      "learning_rate": 3.017394406823969e-05,
      "loss": 8.5995,
      "step": 49216,
      "throughput": 12525.360102708491
    },
    {
      "epoch": 0.7719182752215926,
      "grad_norm": 0.08194506913423538,
      "learning_rate": 3.0160309552940704e-05,
      "loss": 8.6245,
      "step": 49248,
      "throughput": 12525.395683174307
    },
    {
      "epoch": 0.7724198465505215,
      "grad_norm": 0.08263807743787766,
      "learning_rate": 3.014723111692476e-05,
      "loss": 8.6017,
      "step": 49280,
      "throughput": 12525.34133444018
    },
    {
      "epoch": 0.7729214178794505,
      "grad_norm": 0.08559015393257141,
      "learning_rate": 3.013470881412739e-05,
      "loss": 8.5813,
      "step": 49312,
      "throughput": 12525.303717545508
    },
    {
      "epoch": 0.7734229892083794,
      "grad_norm": 0.08753157407045364,
      "learning_rate": 3.0122742696190606e-05,
      "loss": 8.6161,
      "step": 49344,
      "throughput": 12525.361815283388
    },
    {
      "epoch": 0.7739245605373083,
      "grad_norm": 0.0791105255484581,
      "learning_rate": 3.0111332812462692e-05,
      "loss": 8.6013,
      "step": 49376,
      "throughput": 12525.459584093609
    },
    {
      "epoch": 0.7744261318662372,
      "grad_norm": 0.08456587046384811,
      "learning_rate": 3.0100479209998055e-05,
      "loss": 8.5916,
      "step": 49408,
      "throughput": 12525.448924109218
    },
    {
      "epoch": 0.7749277031951661,
      "grad_norm": 0.09241674095392227,
      "learning_rate": 3.0090181933556994e-05,
      "loss": 8.6064,
      "step": 49440,
      "throughput": 12525.465303008194
    },
    {
      "epoch": 0.775429274524095,
      "grad_norm": 0.09265810251235962,
      "learning_rate": 3.0080441025605494e-05,
      "loss": 8.5875,
      "step": 49472,
      "throughput": 12525.49486172476
    },
    {
      "epoch": 0.775930845853024,
      "grad_norm": 0.0823616310954094,
      "learning_rate": 3.007125652631508e-05,
      "loss": 8.5854,
      "step": 49504,
      "throughput": 12525.532190424396
    },
    {
      "epoch": 0.7764324171819529,
      "grad_norm": 0.08635231107473373,
      "learning_rate": 3.006262847356269e-05,
      "loss": 8.5906,
      "step": 49536,
      "throughput": 12525.569918707946
    },
    {
      "epoch": 0.7769339885108818,
      "grad_norm": 0.09322790056467056,
      "learning_rate": 3.0054556902930394e-05,
      "loss": 8.6039,
      "step": 49568,
      "throughput": 12525.62596869029
    },
    {
      "epoch": 0.7774355598398106,
      "grad_norm": 0.08192238211631775,
      "learning_rate": 3.0047041847705404e-05,
      "loss": 8.6082,
      "step": 49600,
      "throughput": 12525.607799600375
    },
    {
      "epoch": 0.7779371311687395,
      "grad_norm": 0.08833806216716766,
      "learning_rate": 3.0040083338879834e-05,
      "loss": 8.58,
      "step": 49632,
      "throughput": 12525.533183024592
    },
    {
      "epoch": 0.7784387024976684,
      "grad_norm": 0.0832701176404953,
      "learning_rate": 3.0033681405150554e-05,
      "loss": 8.6138,
      "step": 49664,
      "throughput": 12525.52730959789
    },
    {
      "epoch": 0.7789402738265974,
      "grad_norm": 0.09455437958240509,
      "learning_rate": 3.0027836072919202e-05,
      "loss": 8.5852,
      "step": 49696,
      "throughput": 12525.62178306482
    },
    {
      "epoch": 0.7794418451555263,
      "grad_norm": 0.10499484091997147,
      "learning_rate": 3.002254736629194e-05,
      "loss": 8.6143,
      "step": 49728,
      "throughput": 12525.680177305796
    },
    {
      "epoch": 0.7799434164844552,
      "grad_norm": 0.08849858492612839,
      "learning_rate": 3.001781530707938e-05,
      "loss": 8.585,
      "step": 49760,
      "throughput": 12525.640270537673
    },
    {
      "epoch": 0.7804449878133841,
      "grad_norm": 0.08110593259334564,
      "learning_rate": 3.0013639914796586e-05,
      "loss": 8.6013,
      "step": 49792,
      "throughput": 12525.696330785242
    },
    {
      "epoch": 0.780946559142313,
      "grad_norm": 0.08891758322715759,
      "learning_rate": 3.001002120666285e-05,
      "loss": 8.5945,
      "step": 49824,
      "throughput": 12525.695608125934
    },
    {
      "epoch": 0.781448130471242,
      "grad_norm": 0.10169275850057602,
      "learning_rate": 3.0006959197601765e-05,
      "loss": 8.6049,
      "step": 49856,
      "throughput": 12525.73024644998
    },
    {
      "epoch": 0.7819497018001709,
      "grad_norm": 0.08562269061803818,
      "learning_rate": 3.000445390024106e-05,
      "loss": 8.5981,
      "step": 49888,
      "throughput": 12525.791092855176
    },
    {
      "epoch": 0.7824512731290998,
      "grad_norm": 0.08182035386562347,
      "learning_rate": 3.0002505324912582e-05,
      "loss": 8.586,
      "step": 49920,
      "throughput": 12525.789284073133
    },
    {
      "epoch": 0.7829528444580287,
      "grad_norm": 0.08575500547885895,
      "learning_rate": 3.0001113479652246e-05,
      "loss": 8.596,
      "step": 49952,
      "throughput": 12525.74085708137
    },
    {
      "epoch": 0.7834544157869576,
      "grad_norm": 0.09045220166444778,
      "learning_rate": 3.0000278370200057e-05,
      "loss": 8.6039,
      "step": 49984,
      "throughput": 12525.726369494236
    },
    {
      "epoch": 0.7839559871158865,
      "grad_norm": 0.09492523223161697,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 8.5925,
      "step": 50016,
      "throughput": 12525.78270587197
    },
    {
      "epoch": 0.7839559871158865,
      "step": 50016,
      "throughput": 12525.363357673923,
      "total_flos": 8.619164947133655e+20,
      "train_loss": 9.173326368447839,
      "train_runtime": 261696.889,
      "train_samples_per_second": 195.709,
      "train_steps_per_second": 0.191
    }
  ],
  "logging_steps": 32,
  "max_steps": 50016,
  "num_input_tokens_seen": 104891154432,
  "num_train_epochs": 1,
  "save_steps": 2048,
  "stateful_callbacks": {
    "LogCallback": {
      "elapsed_time": 261696.88514399529,
      "start_time": 1766740210.8614042
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.619164947133655e+20,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}