diff --git "a/checkpoint-46000/trainer_state.json" "b/checkpoint-46000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-46000/trainer_state.json"
@@ -0,0 +1,6841 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.275788787619302,
+  "eval_steps": 1000,
+  "global_step": 46000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0013867269430644586,
+      "grad_norm": 1.8933687210083008,
+      "learning_rate": 2.957486136783734e-06,
+      "loss": 1.2241,
+      "step": 50
+    },
+    {
+      "epoch": 0.002773453886128917,
+      "grad_norm": 0.7502820491790771,
+      "learning_rate": 6.038200862600124e-06,
+      "loss": 1.0267,
+      "step": 100
+    },
+    {
+      "epoch": 0.004160180829193376,
+      "grad_norm": 0.5821689963340759,
+      "learning_rate": 9.118915588416513e-06,
+      "loss": 0.8167,
+      "step": 150
+    },
+    {
+      "epoch": 0.005546907772257834,
+      "grad_norm": 0.5138927698135376,
+      "learning_rate": 1.2199630314232902e-05,
+      "loss": 0.6408,
+      "step": 200
+    },
+    {
+      "epoch": 0.006933634715322293,
+      "grad_norm": 0.619263768196106,
+      "learning_rate": 1.5280345040049293e-05,
+      "loss": 0.5468,
+      "step": 250
+    },
+    {
+      "epoch": 0.008320361658386751,
+      "grad_norm": 0.5078439712524414,
+      "learning_rate": 1.836105976586568e-05,
+      "loss": 0.4952,
+      "step": 300
+    },
+    {
+      "epoch": 0.00970708860145121,
+      "grad_norm": 0.5653749108314514,
+      "learning_rate": 2.144177449168207e-05,
+      "loss": 0.4388,
+      "step": 350
+    },
+    {
+      "epoch": 0.011093815544515669,
+      "grad_norm": 0.6189213991165161,
+      "learning_rate": 2.452248921749846e-05,
+      "loss": 0.4232,
+      "step": 400
+    },
+    {
+      "epoch": 0.012480542487580126,
+      "grad_norm": 0.6082913875579834,
+      "learning_rate": 2.760320394331485e-05,
+      "loss": 0.401,
+      "step": 450
+    },
+    {
+      "epoch": 0.013867269430644586,
+      "grad_norm": 0.6956301331520081,
+      "learning_rate": 3.068391866913124e-05,
+      "loss": 0.3895,
+      "step": 500
+    },
+    {
+      "epoch": 0.015253996373709043,
+      "grad_norm": 0.7030412554740906,
+      "learning_rate": 3.3764633394947633e-05,
+      "loss": 0.3676,
+      "step": 550
+    },
+    {
+      "epoch": 0.016640723316773503,
+      "grad_norm": 0.6779190897941589,
+      "learning_rate": 3.684534812076402e-05,
+      "loss": 0.3653,
+      "step": 600
+    },
+    {
+      "epoch": 0.01802745025983796,
+      "grad_norm": 0.8930213451385498,
+      "learning_rate": 3.992606284658041e-05,
+      "loss": 0.3645,
+      "step": 650
+    },
+    {
+      "epoch": 0.01941417720290242,
+      "grad_norm": 0.6423994302749634,
+      "learning_rate": 4.30067775723968e-05,
+      "loss": 0.3514,
+      "step": 700
+    },
+    {
+      "epoch": 0.02080090414596688,
+      "grad_norm": 0.7728660106658936,
+      "learning_rate": 4.608749229821319e-05,
+      "loss": 0.3468,
+      "step": 750
+    },
+    {
+      "epoch": 0.022187631089031337,
+      "grad_norm": 0.7561061978340149,
+      "learning_rate": 4.916820702402958e-05,
+      "loss": 0.3499,
+      "step": 800
+    },
+    {
+      "epoch": 0.023574358032095795,
+      "grad_norm": 0.6163890957832336,
+      "learning_rate": 5.224892174984597e-05,
+      "loss": 0.3417,
+      "step": 850
+    },
+    {
+      "epoch": 0.024961084975160253,
+      "grad_norm": 0.7334563732147217,
+      "learning_rate": 5.532963647566236e-05,
+      "loss": 0.3299,
+      "step": 900
+    },
+    {
+      "epoch": 0.026347811918224714,
+      "grad_norm": 0.655237078666687,
+      "learning_rate": 5.841035120147874e-05,
+      "loss": 0.3306,
+      "step": 950
+    },
+    {
+      "epoch": 0.02773453886128917,
+      "grad_norm": 0.8147113919258118,
+      "learning_rate": 6.149106592729513e-05,
+      "loss": 0.3281,
+      "step": 1000
+    },
+    {
+      "epoch": 0.02773453886128917,
+      "eval_loss": 0.32194069027900696,
+      "eval_runtime": 501.2457,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 1000
+    },
+    {
+      "epoch": 0.02912126580435363,
+      "grad_norm": 0.6397083401679993,
+      "learning_rate": 6.457178065311152e-05,
+      "loss": 0.3204,
+      "step": 1050
+    },
+    {
+      "epoch": 0.030507992747418087,
+      "grad_norm": 0.5808627009391785,
+      "learning_rate": 6.765249537892791e-05,
+      "loss": 0.3229,
+      "step": 1100
+    },
+    {
+      "epoch": 0.03189471969048255,
+      "grad_norm": 0.6929567456245422,
+      "learning_rate": 7.073321010474431e-05,
+      "loss": 0.3148,
+      "step": 1150
+    },
+    {
+      "epoch": 0.033281446633547006,
+      "grad_norm": 0.620298445224762,
+      "learning_rate": 7.38139248305607e-05,
+      "loss": 0.32,
+      "step": 1200
+    },
+    {
+      "epoch": 0.034668173576611463,
+      "grad_norm": 0.5947968363761902,
+      "learning_rate": 7.689463955637708e-05,
+      "loss": 0.306,
+      "step": 1250
+    },
+    {
+      "epoch": 0.03605490051967592,
+      "grad_norm": 0.6097683906555176,
+      "learning_rate": 7.997535428219347e-05,
+      "loss": 0.3179,
+      "step": 1300
+    },
+    {
+      "epoch": 0.03744162746274038,
+      "grad_norm": 0.6339348554611206,
+      "learning_rate": 8.305606900800986e-05,
+      "loss": 0.3161,
+      "step": 1350
+    },
+    {
+      "epoch": 0.03882835440580484,
+      "grad_norm": 0.5278933644294739,
+      "learning_rate": 8.613678373382625e-05,
+      "loss": 0.3153,
+      "step": 1400
+    },
+    {
+      "epoch": 0.040215081348869294,
+      "grad_norm": 0.4927423894405365,
+      "learning_rate": 8.921749845964264e-05,
+      "loss": 0.3111,
+      "step": 1450
+    },
+    {
+      "epoch": 0.04160180829193376,
+      "grad_norm": 0.4745596945285797,
+      "learning_rate": 9.229821318545902e-05,
+      "loss": 0.304,
+      "step": 1500
+    },
+    {
+      "epoch": 0.04298853523499822,
+      "grad_norm": 0.6532231569290161,
+      "learning_rate": 9.537892791127541e-05,
+      "loss": 0.3084,
+      "step": 1550
+    },
+    {
+      "epoch": 0.044375262178062674,
+      "grad_norm": 0.5528659820556641,
+      "learning_rate": 9.84596426370918e-05,
+      "loss": 0.3084,
+      "step": 1600
+    },
+    {
+      "epoch": 0.04576198912112713,
+      "grad_norm": 0.45793089270591736,
+      "learning_rate": 0.0001015403573629082,
+      "loss": 0.2964,
+      "step": 1650
+    },
+    {
+      "epoch": 0.04714871606419159,
+      "grad_norm": 0.5063529014587402,
+      "learning_rate": 0.00010462107208872458,
+      "loss": 0.2924,
+      "step": 1700
+    },
+    {
+      "epoch": 0.04853544300725605,
+      "grad_norm": 0.48600247502326965,
+      "learning_rate": 0.00010770178681454097,
+      "loss": 0.2947,
+      "step": 1750
+    },
+    {
+      "epoch": 0.049922169950320505,
+      "grad_norm": 0.4872143268585205,
+      "learning_rate": 0.00011078250154035737,
+      "loss": 0.297,
+      "step": 1800
+    },
+    {
+      "epoch": 0.05130889689338496,
+      "grad_norm": 0.5091805458068848,
+      "learning_rate": 0.00011386321626617376,
+      "loss": 0.2888,
+      "step": 1850
+    },
+    {
+      "epoch": 0.05269562383644943,
+      "grad_norm": 0.41649994254112244,
+      "learning_rate": 0.00011694393099199015,
+      "loss": 0.2871,
+      "step": 1900
+    },
+    {
+      "epoch": 0.054082350779513885,
+      "grad_norm": 0.5174862146377563,
+      "learning_rate": 0.00012002464571780654,
+      "loss": 0.2922,
+      "step": 1950
+    },
+    {
+      "epoch": 0.05546907772257834,
+      "grad_norm": 0.45786553621292114,
+      "learning_rate": 0.00012310536044362293,
+      "loss": 0.2883,
+      "step": 2000
+    },
+    {
+      "epoch": 0.05546907772257834,
+      "eval_loss": 0.28488224744796753,
+      "eval_runtime": 500.9558,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0568558046656428,
+      "grad_norm": 0.4992533326148987,
+      "learning_rate": 0.00012606284658040666,
+      "loss": 0.3033,
+      "step": 2050
+    },
+    {
+      "epoch": 0.05824253160870726,
+      "grad_norm": 0.4205988049507141,
+      "learning_rate": 0.00012914356130622304,
+      "loss": 0.2867,
+      "step": 2100
+    },
+    {
+      "epoch": 0.059629258551771716,
+      "grad_norm": 0.4288152754306793,
+      "learning_rate": 0.00013222427603203944,
+      "loss": 0.2795,
+      "step": 2150
+    },
+    {
+      "epoch": 0.061015985494836174,
+      "grad_norm": 0.4856145977973938,
+      "learning_rate": 0.00013530499075785582,
+      "loss": 0.2833,
+      "step": 2200
+    },
+    {
+      "epoch": 0.06240271243790063,
+      "grad_norm": 0.4891654849052429,
+      "learning_rate": 0.00013838570548367222,
+      "loss": 0.2797,
+      "step": 2250
+    },
+    {
+      "epoch": 0.0637894393809651,
+      "grad_norm": 0.39899352192878723,
+      "learning_rate": 0.00014146642020948863,
+      "loss": 0.2785,
+      "step": 2300
+    },
+    {
+      "epoch": 0.06517616632402955,
+      "grad_norm": 0.3616255819797516,
+      "learning_rate": 0.000144547134935305,
+      "loss": 0.2798,
+      "step": 2350
+    },
+    {
+      "epoch": 0.06656289326709401,
+      "grad_norm": 0.3556617498397827,
+      "learning_rate": 0.0001476278496611214,
+      "loss": 0.2811,
+      "step": 2400
+    },
+    {
+      "epoch": 0.06794962021015846,
+      "grad_norm": 0.39639297127723694,
+      "learning_rate": 0.00015070856438693776,
+      "loss": 0.2813,
+      "step": 2450
+    },
+    {
+      "epoch": 0.06933634715322293,
+      "grad_norm": 0.35177573561668396,
+      "learning_rate": 0.00015378927911275416,
+      "loss": 0.2797,
+      "step": 2500
+    },
+    {
+      "epoch": 0.07072307409628739,
+      "grad_norm": 0.38610222935676575,
+      "learning_rate": 0.00015686999383857054,
+      "loss": 0.2747,
+      "step": 2550
+    },
+    {
+      "epoch": 0.07210980103935184,
+      "grad_norm": 0.36727309226989746,
+      "learning_rate": 0.00015995070856438694,
+      "loss": 0.2776,
+      "step": 2600
+    },
+    {
+      "epoch": 0.07349652798241631,
+      "grad_norm": 0.3905107378959656,
+      "learning_rate": 0.00016303142329020332,
+      "loss": 0.2772,
+      "step": 2650
+    },
+    {
+      "epoch": 0.07488325492548076,
+      "grad_norm": 0.3958912193775177,
+      "learning_rate": 0.00016611213801601973,
+      "loss": 0.2707,
+      "step": 2700
+    },
+    {
+      "epoch": 0.07626998186854522,
+      "grad_norm": 0.4029497504234314,
+      "learning_rate": 0.0001691928527418361,
+      "loss": 0.2692,
+      "step": 2750
+    },
+    {
+      "epoch": 0.07765670881160967,
+      "grad_norm": 0.3514055907726288,
+      "learning_rate": 0.0001722735674676525,
+      "loss": 0.2759,
+      "step": 2800
+    },
+    {
+      "epoch": 0.07904343575467414,
+      "grad_norm": 0.34912553429603577,
+      "learning_rate": 0.00017529266789895255,
+      "loss": 0.2793,
+      "step": 2850
+    },
+    {
+      "epoch": 0.08043016269773859,
+      "grad_norm": 0.3493233621120453,
+      "learning_rate": 0.00017831176833025262,
+      "loss": 0.2845,
+      "step": 2900
+    },
+    {
+      "epoch": 0.08181688964080305,
+      "grad_norm": 0.30080145597457886,
+      "learning_rate": 0.00018139248305606902,
+      "loss": 0.2686,
+      "step": 2950
+    },
+    {
+      "epoch": 0.08320361658386752,
+      "grad_norm": 0.3265998959541321,
+      "learning_rate": 0.0001844731977818854,
+      "loss": 0.2695,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08320361658386752,
+      "eval_loss": 0.26523345708847046,
+      "eval_runtime": 500.4565,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08459034352693197,
+      "grad_norm": 0.29866209626197815,
+      "learning_rate": 0.0001875539125077018,
+      "loss": 0.2679,
+      "step": 3050
+    },
+    {
+      "epoch": 0.08597707046999643,
+      "grad_norm": 0.3191625475883484,
+      "learning_rate": 0.00019063462723351818,
+      "loss": 0.267,
+      "step": 3100
+    },
+    {
+      "epoch": 0.08736379741306088,
+      "grad_norm": 0.3110339939594269,
+      "learning_rate": 0.00019371534195933459,
+      "loss": 0.2658,
+      "step": 3150
+    },
+    {
+      "epoch": 0.08875052435612535,
+      "grad_norm": 0.32120850682258606,
+      "learning_rate": 0.00019679605668515096,
+      "loss": 0.2724,
+      "step": 3200
+    },
+    {
+      "epoch": 0.0901372512991898,
+      "grad_norm": 0.28446418046951294,
+      "learning_rate": 0.00019987677141096734,
+      "loss": 0.268,
+      "step": 3250
+    },
+    {
+      "epoch": 0.09152397824225426,
+      "grad_norm": 0.2722443640232086,
+      "learning_rate": 0.00019999989671933422,
+      "loss": 0.2716,
+      "step": 3300
+    },
+    {
+      "epoch": 0.09291070518531871,
+      "grad_norm": 0.31304416060447693,
+      "learning_rate": 0.00019999956948482068,
+      "loss": 0.2631,
+      "step": 3350
+    },
+    {
+      "epoch": 0.09429743212838318,
+      "grad_norm": 0.2516928017139435,
+      "learning_rate": 0.00019999901811788604,
+      "loss": 0.2647,
+      "step": 3400
+    },
+    {
+      "epoch": 0.09568415907144764,
+      "grad_norm": 0.288006067276001,
+      "learning_rate": 0.00019999824261976613,
+      "loss": 0.263,
+      "step": 3450
+    },
+    {
+      "epoch": 0.0970708860145121,
+      "grad_norm": 0.2745107114315033,
+      "learning_rate": 0.00019999724299219913,
+      "loss": 0.2642,
+      "step": 3500
+    },
+    {
+      "epoch": 0.09845761295757656,
+      "grad_norm": 2.800987720489502,
+      "learning_rate": 0.00019999601923742548,
+      "loss": 0.7176,
+      "step": 3550
+    },
+    {
+      "epoch": 0.09984433990064101,
+      "grad_norm": 0.3590925931930542,
+      "learning_rate": 0.00019999457135818805,
+      "loss": 0.3146,
+      "step": 3600
+    },
+    {
+      "epoch": 0.10123106684370548,
+      "grad_norm": 0.32617494463920593,
+      "learning_rate": 0.00019999289935773202,
+      "loss": 0.2786,
+      "step": 3650
+    },
+    {
+      "epoch": 0.10261779378676993,
+      "grad_norm": 0.3239264488220215,
+      "learning_rate": 0.0001999910032398049,
+      "loss": 0.2807,
+      "step": 3700
+    },
+    {
+      "epoch": 0.10400452072983439,
+      "grad_norm": 0.3022274076938629,
+      "learning_rate": 0.00019998888300865652,
+      "loss": 0.2758,
+      "step": 3750
+    },
+    {
+      "epoch": 0.10539124767289886,
+      "grad_norm": 0.33024862408638,
+      "learning_rate": 0.000199986538669039,
+      "loss": 0.2687,
+      "step": 3800
+    },
+    {
+      "epoch": 0.1067779746159633,
+      "grad_norm": 0.6899451017379761,
+      "learning_rate": 0.00019998397022620687,
+      "loss": 0.2699,
+      "step": 3850
+    },
+    {
+      "epoch": 0.10816470155902777,
+      "grad_norm": 0.2794604003429413,
+      "learning_rate": 0.0001999811776859168,
+      "loss": 0.2667,
+      "step": 3900
+    },
+    {
+      "epoch": 0.10955142850209222,
+      "grad_norm": 0.2764255106449127,
+      "learning_rate": 0.00019997816105442778,
+      "loss": 0.2658,
+      "step": 3950
+    },
+    {
+      "epoch": 0.11093815544515669,
+      "grad_norm": 0.43574222922325134,
+      "learning_rate": 0.0001999749203385012,
+      "loss": 0.2664,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11093815544515669,
+      "eval_loss": 0.26065966486930847,
+      "eval_runtime": 500.842,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11232488238822114,
+      "grad_norm": 0.5340762734413147,
+      "learning_rate": 0.00019997145554540046,
+      "loss": 0.272,
+      "step": 4050
+    },
+    {
+      "epoch": 0.1137116093312856,
+      "grad_norm": 0.32403895258903503,
+      "learning_rate": 0.00019996776668289136,
+      "loss": 0.2679,
+      "step": 4100
+    },
+    {
+      "epoch": 0.11509833627435005,
+      "grad_norm": 0.2928290367126465,
+      "learning_rate": 0.0001999638537592419,
+      "loss": 0.2624,
+      "step": 4150
+    },
+    {
+      "epoch": 0.11648506321741452,
+      "grad_norm": 0.23226021230220795,
+      "learning_rate": 0.00019995971678322228,
+      "loss": 0.2557,
+      "step": 4200
+    },
+    {
+      "epoch": 0.11787179016047898,
+      "grad_norm": 0.2748055160045624,
+      "learning_rate": 0.00019995535576410476,
+      "loss": 0.2625,
+      "step": 4250
+    },
+    {
+      "epoch": 0.11925851710354343,
+      "grad_norm": 0.2713299095630646,
+      "learning_rate": 0.00019995077071166385,
+      "loss": 0.2611,
+      "step": 4300
+    },
+    {
+      "epoch": 0.1206452440466079,
+      "grad_norm": 0.24674977362155914,
+      "learning_rate": 0.00019994596163617624,
+      "loss": 0.2647,
+      "step": 4350
+    },
+    {
+      "epoch": 0.12203197098967235,
+      "grad_norm": 0.359017014503479,
+      "learning_rate": 0.00019994092854842065,
+      "loss": 0.2601,
+      "step": 4400
+    },
+    {
+      "epoch": 0.12341869793273681,
+      "grad_norm": 0.38051414489746094,
+      "learning_rate": 0.00019993567145967791,
+      "loss": 0.253,
+      "step": 4450
+    },
+    {
+      "epoch": 0.12480542487580126,
+      "grad_norm": 0.26227161288261414,
+      "learning_rate": 0.0001999301903817309,
+      "loss": 0.2584,
+      "step": 4500
+    },
+    {
+      "epoch": 0.12619215181886573,
+      "grad_norm": 0.21259668469429016,
+      "learning_rate": 0.00019992448532686453,
+      "loss": 0.2618,
+      "step": 4550
+    },
+    {
+      "epoch": 0.1275788787619302,
+      "grad_norm": 0.23226451873779297,
+      "learning_rate": 0.0001999185563078658,
+      "loss": 0.2526,
+      "step": 4600
+    },
+    {
+      "epoch": 0.12896560570499466,
+      "grad_norm": 0.24459871649742126,
+      "learning_rate": 0.00019991240333802352,
+      "loss": 0.2523,
+      "step": 4650
+    },
+    {
+      "epoch": 0.1303523326480591,
+      "grad_norm": 0.29185208678245544,
+      "learning_rate": 0.00019990602643112863,
+      "loss": 0.2546,
+      "step": 4700
+    },
+    {
+      "epoch": 0.13173905959112356,
+      "grad_norm": 0.23443324863910675,
+      "learning_rate": 0.00019989942560147387,
+      "loss": 0.2557,
+      "step": 4750
+    },
+    {
+      "epoch": 0.13312578653418802,
+      "grad_norm": 0.22915039956569672,
+      "learning_rate": 0.00019989260086385394,
+      "loss": 0.2546,
+      "step": 4800
+    },
+    {
+      "epoch": 0.1345125134772525,
+      "grad_norm": 0.2710748016834259,
+      "learning_rate": 0.00019988555223356531,
+      "loss": 0.2619,
+      "step": 4850
+    },
+    {
+      "epoch": 0.13589924042031692,
+      "grad_norm": 0.24671098589897156,
+      "learning_rate": 0.00019987827972640633,
+      "loss": 0.2594,
+      "step": 4900
+    },
+    {
+      "epoch": 0.1372859673633814,
+      "grad_norm": 0.2359282672405243,
+      "learning_rate": 0.00019987078335867713,
+      "loss": 0.2616,
+      "step": 4950
+    },
+    {
+      "epoch": 0.13867269430644585,
+      "grad_norm": 0.2197064608335495,
+      "learning_rate": 0.00019986306314717956,
+      "loss": 0.2507,
+      "step": 5000
+    },
+    {
+      "epoch": 0.13867269430644585,
+      "eval_loss": 0.25083017349243164,
+      "eval_runtime": 500.7995,
+      "eval_samples_per_second": 5.705,
+      "eval_steps_per_second": 5.705,
+      "step": 5000
+    },
+    {
+      "epoch": 0.14005942124951032,
+      "grad_norm": 0.2249370515346527,
+      "learning_rate": 0.0001998551191092172,
+      "loss": 0.2574,
+      "step": 5050
+    },
+    {
+      "epoch": 0.14144614819257478,
+      "grad_norm": 0.36345556378364563,
+      "learning_rate": 0.0001998469512625953,
+      "loss": 0.2493,
+      "step": 5100
+    },
+    {
+      "epoch": 0.14283287513563922,
+      "grad_norm": 0.24807791411876678,
+      "learning_rate": 0.00019983855962562067,
+      "loss": 0.2542,
+      "step": 5150
+    },
+    {
+      "epoch": 0.14421960207870368,
+      "grad_norm": 3.6125738620758057,
+      "learning_rate": 0.00019982994421710186,
+      "loss": 0.2595,
+      "step": 5200
+    },
+    {
+      "epoch": 0.14560632902176815,
+      "grad_norm": 0.4985048472881317,
+      "learning_rate": 0.0001998211050563488,
+      "loss": 0.2558,
+      "step": 5250
+    },
+    {
+      "epoch": 0.14699305596483261,
+      "grad_norm": 0.3320443332195282,
+      "learning_rate": 0.00019981204216317308,
+      "loss": 0.2545,
+      "step": 5300
+    },
+    {
+      "epoch": 0.14837978290789705,
+      "grad_norm": 0.2081877887248993,
+      "learning_rate": 0.00019980275555788759,
+      "loss": 0.2536,
+      "step": 5350
+    },
+    {
+      "epoch": 0.14976650985096152,
+      "grad_norm": 0.27258801460266113,
+      "learning_rate": 0.00019979324526130676,
+      "loss": 0.2505,
+      "step": 5400
+    },
+    {
+      "epoch": 0.15115323679402598,
+      "grad_norm": 0.23199999332427979,
+      "learning_rate": 0.00019978351129474632,
+      "loss": 0.2556,
+      "step": 5450
+    },
+    {
+      "epoch": 0.15253996373709044,
+      "grad_norm": 0.20929445326328278,
+      "learning_rate": 0.00019977355368002334,
+      "loss": 0.2486,
+      "step": 5500
+    },
+    {
+      "epoch": 0.1539266906801549,
+      "grad_norm": 0.23551955819129944,
+      "learning_rate": 0.00019976337243945617,
+      "loss": 0.2517,
+      "step": 5550
+    },
+    {
+      "epoch": 0.15531341762321935,
+      "grad_norm": 0.30231812596321106,
+      "learning_rate": 0.0001997529675958644,
+      "loss": 0.2498,
+      "step": 5600
+    },
+    {
+      "epoch": 0.1567001445662838,
+      "grad_norm": 0.24430635571479797,
+      "learning_rate": 0.00019974233917256865,
+      "loss": 0.2523,
+      "step": 5650
+    },
+    {
+      "epoch": 0.15808687150934828,
+      "grad_norm": 6.362756252288818,
+      "learning_rate": 0.0001997314871933909,
+      "loss": 0.2529,
+      "step": 5700
+    },
+    {
+      "epoch": 0.15947359845241274,
+      "grad_norm": 0.2339017242193222,
+      "learning_rate": 0.00019972041168265397,
+      "loss": 0.2524,
+      "step": 5750
+    },
+    {
+      "epoch": 0.16086032539547718,
+      "grad_norm": 0.22503100335597992,
+      "learning_rate": 0.0001997091126651818,
+      "loss": 0.251,
+      "step": 5800
+    },
+    {
+      "epoch": 0.16224705233854164,
+      "grad_norm": 0.26495125889778137,
+      "learning_rate": 0.00019969759016629928,
+      "loss": 0.2517,
+      "step": 5850
+    },
+    {
+      "epoch": 0.1636337792816061,
+      "grad_norm": 0.25339657068252563,
+      "learning_rate": 0.00019968584421183212,
+      "loss": 0.2505,
+      "step": 5900
+    },
+    {
+      "epoch": 0.16502050622467057,
+      "grad_norm": 0.20266841351985931,
+      "learning_rate": 0.000199673874828107,
+      "loss": 0.2501,
+      "step": 5950
+    },
+    {
+      "epoch": 0.16640723316773504,
+      "grad_norm": 0.19285647571086884,
+      "learning_rate": 0.00019966168204195125,
+      "loss": 0.2445,
+      "step": 6000
+    },
+    {
+      "epoch": 0.16640723316773504,
+      "eval_loss": 0.24731825292110443,
+      "eval_runtime": 500.9495,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 6000
+    },
+    {
+      "epoch": 0.16779396011079947,
+      "grad_norm": 0.2121065855026245,
+      "learning_rate": 0.000199649265880693,
+      "loss": 0.2466,
+      "step": 6050
+    },
+    {
+      "epoch": 0.16918068705386394,
+      "grad_norm": 0.2560518980026245,
+      "learning_rate": 0.000199636626372161,
+      "loss": 0.2572,
+      "step": 6100
+    },
+    {
+      "epoch": 0.1705674139969284,
+      "grad_norm": 0.22927352786064148,
+      "learning_rate": 0.00019962376354468466,
+      "loss": 0.2509,
+      "step": 6150
+    },
+    {
+      "epoch": 0.17195414093999287,
+      "grad_norm": 0.2201690673828125,
+      "learning_rate": 0.00019961067742709377,
+      "loss": 0.2501,
+      "step": 6200
+    },
+    {
+      "epoch": 0.1733408678830573,
+      "grad_norm": 0.23233374953269958,
+      "learning_rate": 0.0001995973680487188,
+      "loss": 0.2525,
+      "step": 6250
+    },
+    {
+      "epoch": 0.17472759482612177,
+      "grad_norm": 0.254256933927536,
+      "learning_rate": 0.00019958383543939041,
+      "loss": 0.2499,
+      "step": 6300
+    },
+    {
+      "epoch": 0.17611432176918623,
+      "grad_norm": 0.1754632294178009,
+      "learning_rate": 0.00019957007962943975,
+      "loss": 0.251,
+      "step": 6350
+    },
+    {
+      "epoch": 0.1775010487122507,
+      "grad_norm": 0.23628771305084229,
+      "learning_rate": 0.00019955610064969817,
+      "loss": 0.256,
+      "step": 6400
+    },
+    {
+      "epoch": 0.17888777565531516,
+      "grad_norm": 0.23698653280735016,
+      "learning_rate": 0.00019954189853149725,
+      "loss": 0.2474,
+      "step": 6450
+    },
+    {
+      "epoch": 0.1802745025983796,
+      "grad_norm": 0.27713823318481445,
+      "learning_rate": 0.00019952747330666867,
+      "loss": 0.2481,
+      "step": 6500
+    },
+    {
+      "epoch": 0.18166122954144406,
+      "grad_norm": 0.1710810512304306,
+      "learning_rate": 0.00019951282500754413,
+      "loss": 0.2564,
+      "step": 6550
+    },
+    {
+      "epoch": 0.18304795648450853,
+      "grad_norm": 0.21406157314777374,
+      "learning_rate": 0.00019949795366695544,
+      "loss": 0.2517,
+      "step": 6600
+    },
+    {
+      "epoch": 0.184434683427573,
+      "grad_norm": 0.20108449459075928,
+      "learning_rate": 0.00019948285931823415,
+      "loss": 0.2518,
+      "step": 6650
+    },
+    {
+      "epoch": 0.18582141037063743,
+      "grad_norm": 5.1352715492248535,
+      "learning_rate": 0.0001994675419952118,
+      "loss": 0.2546,
+      "step": 6700
+    },
+    {
+      "epoch": 0.1872081373137019,
+      "grad_norm": 0.22743810713291168,
+      "learning_rate": 0.00019945200173221962,
+      "loss": 0.2457,
+      "step": 6750
+    },
+    {
+      "epoch": 0.18859486425676636,
+      "grad_norm": 0.20475907623767853,
+      "learning_rate": 0.0001994362385640885,
+      "loss": 0.2529,
+      "step": 6800
+    },
+    {
+      "epoch": 0.18998159119983082,
+      "grad_norm": 0.22172316908836365,
+      "learning_rate": 0.000199420252526149,
+      "loss": 0.2554,
+      "step": 6850
+    },
+    {
+      "epoch": 0.1913683181428953,
+      "grad_norm": 2.967470407485962,
+      "learning_rate": 0.0001994040436542311,
+      "loss": 0.2555,
+      "step": 6900
+    },
+    {
+      "epoch": 0.19275504508595973,
+      "grad_norm": 0.23698735237121582,
+      "learning_rate": 0.00019938761198466437,
+      "loss": 0.2619,
+      "step": 6950
+    },
+    {
+      "epoch": 0.1941417720290242,
+      "grad_norm": 0.17891797423362732,
+      "learning_rate": 0.0001993709575542776,
+      "loss": 0.2464,
+      "step": 7000
+    },
+    {
+      "epoch": 0.1941417720290242,
+      "eval_loss": 0.24410127103328705,
+      "eval_runtime": 500.8833,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 7000
+    },
+    {
+      "epoch": 0.19552849897208865,
+      "grad_norm": 0.21030811965465546,
+      "learning_rate": 0.00019935408040039901,
+      "loss": 0.2517,
+      "step": 7050
+    },
+    {
+      "epoch": 0.19691522591515312,
+      "grad_norm": 0.1913098245859146,
+      "learning_rate": 0.00019933698056085586,
+      "loss": 0.249,
+      "step": 7100
+    },
+    {
+      "epoch": 0.19830195285821758,
+      "grad_norm": 0.2044433057308197,
+      "learning_rate": 0.00019931965807397465,
+      "loss": 0.2496,
+      "step": 7150
+    },
+    {
+      "epoch": 0.19968867980128202,
+      "grad_norm": 0.18698015809059143,
+      "learning_rate": 0.00019930211297858078,
+      "loss": 0.2537,
+      "step": 7200
+    },
+    {
+      "epoch": 0.20107540674434649,
+      "grad_norm": 0.22580522298812866,
+      "learning_rate": 0.00019928434531399876,
+      "loss": 0.2456,
+      "step": 7250
+    },
+    {
+      "epoch": 0.20246213368741095,
+      "grad_norm": 0.1749202162027359,
+      "learning_rate": 0.00019926635512005183,
+      "loss": 0.2504,
+      "step": 7300
+    },
+    {
+      "epoch": 0.20384886063047541,
+      "grad_norm": 0.2123364359140396,
+      "learning_rate": 0.00019924814243706197,
+      "loss": 0.2477,
+      "step": 7350
+    },
+    {
+      "epoch": 0.20523558757353985,
+      "grad_norm": 0.2234705090522766,
+      "learning_rate": 0.00019922970730584997,
+      "loss": 0.2457,
+      "step": 7400
+    },
+    {
+      "epoch": 0.20662231451660432,
+      "grad_norm": 0.20742256939411163,
+      "learning_rate": 0.00019921104976773505,
+      "loss": 0.249,
+      "step": 7450
+    },
+    {
+      "epoch": 0.20800904145966878,
+      "grad_norm": 0.18315458297729492,
+      "learning_rate": 0.000199192169864535,
+      "loss": 0.2459,
+      "step": 7500
+    },
+    {
+      "epoch": 0.20939576840273325,
+      "grad_norm": 0.19357183575630188,
+      "learning_rate": 0.000199173067638566,
+      "loss": 0.2439,
+      "step": 7550
+    },
+    {
+      "epoch": 0.2107824953457977,
+      "grad_norm": 0.2398926168680191,
+      "learning_rate": 0.00019915374313264248,
+      "loss": 0.2497,
+      "step": 7600
+    },
+    {
+      "epoch": 0.21216922228886215,
+      "grad_norm": 0.20313721895217896,
+      "learning_rate": 0.00019913419639007714,
+      "loss": 0.2447,
+      "step": 7650
+    },
+    {
+      "epoch": 0.2135559492319266,
+      "grad_norm": 0.17255066335201263,
+      "learning_rate": 0.00019911442745468075,
+      "loss": 0.2447,
+      "step": 7700
+    },
+    {
+      "epoch": 0.21494267617499108,
+      "grad_norm": 0.19140756130218506,
+      "learning_rate": 0.0001990944363707621,
+      "loss": 0.2383,
+      "step": 7750
+    },
+    {
+      "epoch": 0.21632940311805554,
+      "grad_norm": 0.15212053060531616,
+      "learning_rate": 0.00019907422318312783,
+      "loss": 0.2485,
+      "step": 7800
+    },
+    {
+      "epoch": 0.21771613006111998,
+      "grad_norm": 0.1841588169336319,
+      "learning_rate": 0.0001990537879370825,
+      "loss": 0.2432,
+      "step": 7850
+    },
+    {
+      "epoch": 0.21910285700418444,
+      "grad_norm": 0.2013355791568756,
+      "learning_rate": 0.00019903313067842833,
+      "loss": 0.2431,
+      "step": 7900
+    },
+    {
+      "epoch": 0.2204895839472489,
+      "grad_norm": 0.17149454355239868,
+      "learning_rate": 0.0001990122514534651,
+      "loss": 0.247,
+      "step": 7950
+    },
+    {
+      "epoch": 0.22187631089031337,
+      "grad_norm": 0.24272453784942627,
+      "learning_rate": 0.00019899115030899014,
+      "loss": 0.2468,
+      "step": 8000
+    },
+    {
+      "epoch": 0.22187631089031337,
+      "eval_loss": 0.24099861085414886,
+      "eval_runtime": 501.2129,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 8000
+    },
+    {
+      "epoch": 0.22326303783337784,
+      "grad_norm": 0.2419915497303009,
+      "learning_rate": 0.00019896982729229813,
+      "loss": 0.2454,
+      "step": 8050
+    },
+    {
+      "epoch": 0.22464976477644227,
+      "grad_norm": 0.16482336819171906,
+      "learning_rate": 0.0001989482824511811,
+      "loss": 0.2423,
+      "step": 8100
+    },
+    {
+      "epoch": 0.22603649171950674,
+      "grad_norm": 0.22351431846618652,
+      "learning_rate": 0.00019892651583392824,
+      "loss": 0.2501,
+      "step": 8150
+    },
+    {
+      "epoch": 0.2274232186625712,
+      "grad_norm": 0.19319549202919006,
+      "learning_rate": 0.0001989045274893258,
+      "loss": 0.2452,
+      "step": 8200
+    },
+    {
+      "epoch": 0.22880994560563567,
+      "grad_norm": 0.15613292157649994,
+      "learning_rate": 0.00019888231746665696,
+      "loss": 0.2428,
+      "step": 8250
+    },
+    {
+      "epoch": 0.2301966725487001,
+      "grad_norm": 0.18092665076255798,
+      "learning_rate": 0.00019885988581570184,
+      "loss": 0.2448,
+      "step": 8300
+    },
+    {
+      "epoch": 0.23158339949176457,
+      "grad_norm": 0.18928927183151245,
+      "learning_rate": 0.00019883723258673724,
+      "loss": 0.2493,
+      "step": 8350
+    },
+    {
+      "epoch": 0.23297012643482903,
+      "grad_norm": 0.19816988706588745,
+      "learning_rate": 0.0001988143578305366,
+      "loss": 0.2465,
+      "step": 8400
+    },
+    {
+      "epoch": 0.2343568533778935,
+      "grad_norm": 0.19853706657886505,
+      "learning_rate": 0.00019879126159836992,
+      "loss": 0.2443,
+      "step": 8450
+    },
+    {
+      "epoch": 0.23574358032095796,
+      "grad_norm": 0.17544203996658325,
+      "learning_rate": 0.00019876794394200353,
+      "loss": 0.2429,
+      "step": 8500
+    },
+    {
+      "epoch": 0.2371303072640224,
+      "grad_norm": 0.16583149135112762,
+      "learning_rate": 0.0001987444049137001,
+      "loss": 0.244,
+      "step": 8550
+    },
+    {
+      "epoch": 0.23851703420708686,
+      "grad_norm": 0.18239592015743256,
+      "learning_rate": 0.00019872064456621848,
+      "loss": 0.2447,
+      "step": 8600
+    },
+    {
+      "epoch": 0.23990376115015133,
+      "grad_norm": 0.15820704400539398,
+      "learning_rate": 0.0001986966629528135,
+      "loss": 0.2469,
+      "step": 8650
+    },
+    {
+      "epoch": 0.2412904880932158,
+      "grad_norm": 0.18477188050746918,
+      "learning_rate": 0.00019867246012723598,
+      "loss": 0.2407,
+      "step": 8700
+    },
+    {
+      "epoch": 0.24267721503628023,
+      "grad_norm": 0.1676979809999466,
+      "learning_rate": 0.0001986480361437325,
+      "loss": 0.2448,
+      "step": 8750
+    },
+    {
+      "epoch": 0.2440639419793447,
+      "grad_norm": 0.2173600196838379,
+      "learning_rate": 0.00019862339105704543,
+      "loss": 0.2409,
+      "step": 8800
+    },
+    {
+      "epoch": 0.24545066892240916,
+      "grad_norm": 0.17326687276363373,
+      "learning_rate": 0.00019859852492241256,
+      "loss": 0.2387,
+      "step": 8850
+    },
+    {
+      "epoch": 0.24683739586547362,
+      "grad_norm": 0.16229301691055298,
+      "learning_rate": 0.00019857343779556725,
+      "loss": 0.2467,
+      "step": 8900
+    },
+    {
+      "epoch": 0.2482241228085381,
+      "grad_norm": 0.21166543662548065,
+      "learning_rate": 0.0001985481297327381,
+      "loss": 0.2507,
+      "step": 8950
+    },
+    {
+      "epoch": 0.24961084975160253,
+      "grad_norm": 0.17892777919769287,
+      "learning_rate": 0.00019852260079064894,
+      "loss": 0.2416,
+      "step": 9000
+    },
+    {
+      "epoch": 0.24961084975160253,
+      "eval_loss": 0.23973840475082397,
+      "eval_runtime": 500.5349,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 9000
+    },
+    {
+      "epoch": 0.250997576694667,
+      "grad_norm": 0.20435132086277008,
+      "learning_rate": 0.00019849685102651867,
+      "loss": 0.2385,
+      "step": 9050
+    },
+    {
+      "epoch": 0.25238430363773146,
+      "grad_norm": 0.1890842318534851,
+      "learning_rate": 0.0001984708804980611,
+      "loss": 0.2416,
+      "step": 9100
+    },
+    {
+      "epoch": 0.2537710305807959,
+      "grad_norm": 0.18390174210071564,
+      "learning_rate": 0.00019844468926348482,
+      "loss": 0.2469,
+      "step": 9150
+    },
+    {
+      "epoch": 0.2551577575238604,
+      "grad_norm": 0.23599492013454437,
+      "learning_rate": 0.00019841827738149314,
+      "loss": 0.2417,
+      "step": 9200
+    },
+    {
+      "epoch": 0.25654448446692485,
+      "grad_norm": 0.1522965133190155,
+      "learning_rate": 0.00019839164491128398,
+      "loss": 0.2427,
+      "step": 9250
+    },
+    {
+      "epoch": 0.2579312114099893,
+      "grad_norm": 0.206534281373024,
+      "learning_rate": 0.00019836479191254948,
+      "loss": 0.2452,
+      "step": 9300
+    },
+    {
+      "epoch": 0.2593179383530537,
+      "grad_norm": 0.18928374350070953,
+      "learning_rate": 0.00019833771844547627,
+      "loss": 0.244,
+      "step": 9350
+    },
+    {
+      "epoch": 0.2607046652961182,
+      "grad_norm": 0.17130087316036224,
+      "learning_rate": 0.00019831042457074498,
+      "loss": 0.2488,
+      "step": 9400
+    },
+    {
+      "epoch": 0.26209139223918265,
+      "grad_norm": 0.17631781101226807,
+      "learning_rate": 0.00019828291034953033,
+      "loss": 0.2441,
+      "step": 9450
+    },
+    {
+      "epoch": 0.2634781191822471,
+      "grad_norm": 0.1852494180202484,
+      "learning_rate": 0.00019825517584350083,
+      "loss": 0.2414,
+      "step": 9500
+    },
+    {
+      "epoch": 0.2648648461253116,
+      "grad_norm": 0.21513506770133972,
+      "learning_rate": 0.0001982272211148188,
+      "loss": 0.2412,
+      "step": 9550
+    },
+    {
+      "epoch": 0.26625157306837605,
+      "grad_norm": 0.18172813951969147,
+      "learning_rate": 0.0001981990462261401,
+      "loss": 0.2435,
+      "step": 9600
+    },
+    {
+      "epoch": 0.2676383000114405,
+      "grad_norm": 0.1561124324798584,
+      "learning_rate": 0.00019817065124061407,
+      "loss": 0.238,
+      "step": 9650
+    },
+    {
+      "epoch": 0.269025026954505,
+      "grad_norm": 0.16663338243961334,
+      "learning_rate": 0.00019814203622188338,
+      "loss": 0.2383,
+      "step": 9700
+    },
+    {
+      "epoch": 0.27041175389756944,
+      "grad_norm": 0.17735238373279572,
+      "learning_rate": 0.0001981132012340838,
+      "loss": 0.2459,
+      "step": 9750
+    },
+    {
+      "epoch": 0.27179848084063385,
+      "grad_norm": 0.21334126591682434,
+      "learning_rate": 0.00019808414634184417,
+      "loss": 0.2425,
+      "step": 9800
+    },
+    {
+      "epoch": 0.2731852077836983,
+      "grad_norm": 0.16817434132099152,
+      "learning_rate": 0.00019805487161028625,
+      "loss": 0.2361,
+      "step": 9850
+    },
+    {
+      "epoch": 0.2745719347267628,
+      "grad_norm": 0.17149919271469116,
+      "learning_rate": 0.00019802537710502443,
+      "loss": 0.2431,
+      "step": 9900
+    },
+    {
+      "epoch": 0.27595866166982724,
+      "grad_norm": 0.1521356999874115,
+      "learning_rate": 0.00019799566289216576,
+      "loss": 0.2411,
+      "step": 9950
+    },
+    {
+      "epoch": 0.2773453886128917,
+      "grad_norm": 0.15583455562591553,
+      "learning_rate": 0.00019796572903830974,
+      "loss": 0.2388,
+      "step": 10000
+    },
+    {
+      "epoch": 0.2773453886128917,
+      "eval_loss": 0.23783154785633087,
+      "eval_runtime": 501.3932,
+      "eval_samples_per_second": 5.698,
+      "eval_steps_per_second": 5.698,
+      "step": 10000
+    },
+    {
+      "epoch": 0.2787321155559562,
+      "grad_norm": 0.15069644153118134,
+      "learning_rate": 0.00019793557561054807,
+      "loss": 0.245,
+      "step": 10050
+    },
+    {
+      "epoch": 0.28011884249902064,
+      "grad_norm": 0.16481320559978485,
+      "learning_rate": 0.0001979052026764647,
+      "loss": 0.2403,
+      "step": 10100
+    },
+    {
+      "epoch": 0.2815055694420851,
+      "grad_norm": 0.16549484431743622,
+      "learning_rate": 0.00019787461030413553,
+      "loss": 0.2404,
+      "step": 10150
+    },
+    {
+      "epoch": 0.28289229638514957,
+      "grad_norm": 0.1722942292690277,
+      "learning_rate": 0.0001978437985621282,
+      "loss": 0.2407,
+      "step": 10200
+    },
+    {
+      "epoch": 0.284279023328214,
+      "grad_norm": 1.554700255393982,
+      "learning_rate": 0.0001978127675195022,
+      "loss": 0.2423,
+      "step": 10250
+    },
+    {
+      "epoch": 0.28566575027127844,
+      "grad_norm": 0.18697640299797058,
+      "learning_rate": 0.0001977815172458084,
+      "loss": 0.2458,
+      "step": 10300
+    },
+    {
+      "epoch": 0.2870524772143429,
+      "grad_norm": 0.19721738994121552,
+      "learning_rate": 0.00019775004781108914,
+      "loss": 0.2423,
+      "step": 10350
+    },
+    {
+      "epoch": 0.28843920415740737,
+      "grad_norm": 0.13843601942062378,
+      "learning_rate": 0.00019771835928587787,
+      "loss": 0.249,
+      "step": 10400
+    },
+    {
+      "epoch": 0.28982593110047183,
+      "grad_norm": 0.19530989229679108,
+      "learning_rate": 0.0001976864517411992,
+      "loss": 0.2438,
+      "step": 10450
+    },
+    {
+      "epoch": 0.2912126580435363,
+      "grad_norm": 0.14896182715892792,
+      "learning_rate": 0.0001976543252485686,
+      "loss": 0.2392,
+      "step": 10500
+    },
+    {
+      "epoch": 0.29259938498660076,
+      "grad_norm": 0.1485060602426529,
+      "learning_rate": 0.00019762197987999223,
+      "loss": 0.2371,
+      "step": 10550
+    },
+    {
+      "epoch": 0.29398611192966523,
+      "grad_norm": 0.20084735751152039,
+      "learning_rate": 0.00019758941570796688,
+      "loss": 0.2461,
+      "step": 10600
+    },
+    {
+      "epoch": 0.2953728388727297,
+      "grad_norm": 0.1450163722038269,
+      "learning_rate": 0.0001975566328054797,
+      "loss": 0.2379,
+      "step": 10650
+    },
+    {
+      "epoch": 0.2967595658157941,
+      "grad_norm": 0.14225760102272034,
+      "learning_rate": 0.00019752363124600817,
+      "loss": 0.2465,
+      "step": 10700
+    },
+    {
+      "epoch": 0.29814629275885857,
+      "grad_norm": 0.182630255818367,
+      "learning_rate": 0.00019749041110351975,
+      "loss": 0.2382,
+      "step": 10750
+    },
+    {
+      "epoch": 0.29953301970192303,
+      "grad_norm": 0.18140457570552826,
+      "learning_rate": 0.00019745697245247194,
+      "loss": 0.2394,
+      "step": 10800
+    },
+    {
+      "epoch": 0.3009197466449875,
+      "grad_norm": 0.1756162941455841,
+      "learning_rate": 0.00019742331536781187,
+      "loss": 0.2377,
+      "step": 10850
+    },
+    {
+      "epoch": 0.30230647358805196,
+      "grad_norm": 0.14414621889591217,
+      "learning_rate": 0.0001973894399249763,
+      "loss": 0.2408,
+      "step": 10900
+    },
+    {
+      "epoch": 0.3036932005311164,
+      "grad_norm": 0.1697167605161667,
+      "learning_rate": 0.00019735534619989142,
+      "loss": 0.2442,
+      "step": 10950
+    },
+    {
+      "epoch": 0.3050799274741809,
+      "grad_norm": 0.15641078352928162,
+      "learning_rate": 0.00019732103426897265,
+      "loss": 0.2421,
+      "step": 11000
+    },
+    {
+      "epoch": 0.3050799274741809,
+      "eval_loss": 0.23684217035770416,
+      "eval_runtime": 500.474,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 11000
+    },
+    {
+      "epoch": 0.30646665441724535,
+      "grad_norm": 0.190172016620636,
+      "learning_rate": 0.00019728650420912448,
+      "loss": 0.2475,
+      "step": 11050
+    },
+    {
+      "epoch": 0.3078533813603098,
+      "grad_norm": 0.16632623970508575,
+      "learning_rate": 0.0001972517560977403,
+      "loss": 0.2426,
+      "step": 11100
+    },
+    {
+      "epoch": 0.30924010830337423,
+      "grad_norm": 0.16913548111915588,
+      "learning_rate": 0.00019721679001270226,
+      "loss": 0.2386,
+      "step": 11150
+    },
+    {
+      "epoch": 0.3106268352464387,
+      "grad_norm": 0.16081750392913818,
+      "learning_rate": 0.00019718160603238096,
+      "loss": 0.2358,
+      "step": 11200
+    },
+    {
+      "epoch": 0.31201356218950316,
+      "grad_norm": 0.19061852991580963,
+      "learning_rate": 0.00019714620423563552,
+      "loss": 0.238,
+      "step": 11250
+    },
+    {
+      "epoch": 0.3134002891325676,
+      "grad_norm": 0.16220314800739288,
+      "learning_rate": 0.00019711058470181316,
+      "loss": 0.2428,
+      "step": 11300
+    },
+    {
+      "epoch": 0.3147870160756321,
+      "grad_norm": 0.20064842700958252,
+      "learning_rate": 0.00019707474751074915,
+      "loss": 0.2393,
+      "step": 11350
+    },
+    {
+      "epoch": 0.31617374301869655,
+      "grad_norm": 0.14250491559505463,
+      "learning_rate": 0.00019703869274276657,
+      "loss": 0.2376,
+      "step": 11400
+    },
+    {
+      "epoch": 0.317560469961761,
+      "grad_norm": 0.18501660227775574,
+      "learning_rate": 0.00019700242047867623,
+      "loss": 0.2405,
+      "step": 11450
+    },
+    {
+      "epoch": 0.3189471969048255,
+      "grad_norm": 0.1680876910686493,
+      "learning_rate": 0.00019696593079977635,
+      "loss": 0.241,
+      "step": 11500
+    },
+    {
+      "epoch": 0.32033392384788995,
+      "grad_norm": 0.15119992196559906,
+      "learning_rate": 0.00019692922378785252,
+      "loss": 0.2371,
+      "step": 11550
+    },
+    {
+      "epoch": 0.32172065079095435,
+      "grad_norm": 0.15388673543930054,
+      "learning_rate": 0.0001968922995251774,
+      "loss": 0.2425,
+      "step": 11600
+    },
+    {
+      "epoch": 0.3231073777340188,
+      "grad_norm": 0.19946704804897308,
+      "learning_rate": 0.00019685515809451056,
+      "loss": 0.2476,
+      "step": 11650
+    },
+    {
+      "epoch": 0.3244941046770833,
+      "grad_norm": 0.17677927017211914,
+      "learning_rate": 0.0001968177995790984,
+      "loss": 0.2432,
+      "step": 11700
+    },
+    {
+      "epoch": 0.32588083162014775,
+      "grad_norm": 0.18418142199516296,
+      "learning_rate": 0.00019678022406267374,
+      "loss": 0.2387,
+      "step": 11750
+    },
+    {
+      "epoch": 0.3272675585632122,
+      "grad_norm": 0.1462264358997345,
+      "learning_rate": 0.00019674243162945594,
+      "loss": 0.2377,
+      "step": 11800
+    },
+    {
+      "epoch": 0.3286542855062767,
+      "grad_norm": 0.14166492223739624,
+      "learning_rate": 0.0001967044223641504,
+      "loss": 0.238,
+      "step": 11850
+    },
+    {
+      "epoch": 0.33004101244934114,
+      "grad_norm": 0.17436008155345917,
+      "learning_rate": 0.00019666619635194866,
+      "loss": 0.2429,
+      "step": 11900
+    },
+    {
+      "epoch": 0.3314277393924056,
+      "grad_norm": 0.15779553353786469,
+      "learning_rate": 0.00019662775367852787,
+      "loss": 0.2404,
+      "step": 11950
+    },
+    {
+      "epoch": 0.33281446633547007,
+      "grad_norm": 0.17796078324317932,
+      "learning_rate": 0.000196589094430051,
+      "loss": 0.235,
+      "step": 12000
+    },
+    {
+      "epoch": 0.33281446633547007,
+      "eval_loss": 0.235828697681427,
+      "eval_runtime": 500.6046,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 12000
+    },
+    {
+      "epoch": 0.3342011932785345,
+      "grad_norm": 0.14978894591331482,
+      "learning_rate": 0.0001965502186931662,
+      "loss": 0.2419,
+      "step": 12050
+    },
+    {
+      "epoch": 0.33558792022159895,
+      "grad_norm": 0.17456893622875214,
+      "learning_rate": 0.00019651112655500713,
+      "loss": 0.2389,
+      "step": 12100
+    },
+    {
+      "epoch": 0.3369746471646634,
+      "grad_norm": 0.1462843269109726,
+      "learning_rate": 0.0001964718181031922,
+      "loss": 0.2363,
+      "step": 12150
+    },
+    {
+      "epoch": 0.3383613741077279,
+      "grad_norm": 0.16996078193187714,
+      "learning_rate": 0.0001964322934258248,
+      "loss": 0.2404,
+      "step": 12200
+    },
+    {
+      "epoch": 0.33974810105079234,
+      "grad_norm": 0.1906641721725464,
+      "learning_rate": 0.00019639255261149298,
+      "loss": 0.2394,
+      "step": 12250
+    },
+    {
+      "epoch": 0.3411348279938568,
+      "grad_norm": 0.15007531642913818,
+      "learning_rate": 0.00019635259574926912,
+      "loss": 0.2371,
+      "step": 12300
+    },
+    {
+      "epoch": 0.34252155493692127,
+      "grad_norm": 0.18667016923427582,
+      "learning_rate": 0.00019631242292870993,
+      "loss": 0.24,
+      "step": 12350
+    },
+    {
+      "epoch": 0.34390828187998573,
+      "grad_norm": 0.1689510941505432,
+      "learning_rate": 0.0001962720342398561,
+      "loss": 0.2359,
+      "step": 12400
+    },
+    {
+      "epoch": 0.3452950088230502,
+      "grad_norm": 0.1622210294008255,
+      "learning_rate": 0.0001962314297732321,
+      "loss": 0.2405,
+      "step": 12450
+    },
+    {
+      "epoch": 0.3466817357661146,
+      "grad_norm": 0.20153377950191498,
+      "learning_rate": 0.0001961906096198462,
+      "loss": 0.2368,
+      "step": 12500
+    },
+    {
+      "epoch": 0.34806846270917907,
+      "grad_norm": 0.1634126603603363,
+      "learning_rate": 0.00019614957387118994,
+      "loss": 0.236,
+      "step": 12550
+    },
+    {
+      "epoch": 0.34945518965224354,
+      "grad_norm": 0.21276158094406128,
+      "learning_rate": 0.00019610832261923817,
+      "loss": 0.2397,
+      "step": 12600
+    },
+    {
+      "epoch": 0.350841916595308,
+      "grad_norm": 0.16108940541744232,
+      "learning_rate": 0.00019606685595644865,
+      "loss": 0.2424,
+      "step": 12650
+    },
+    {
+      "epoch": 0.35222864353837247,
+      "grad_norm": 0.20505978167057037,
+      "learning_rate": 0.00019602517397576205,
+      "loss": 0.2423,
+      "step": 12700
+    },
+    {
+      "epoch": 0.35361537048143693,
+      "grad_norm": 0.1431368589401245,
+      "learning_rate": 0.0001959832767706016,
+      "loss": 0.2353,
+      "step": 12750
+    },
+    {
+      "epoch": 0.3550020974245014,
+      "grad_norm": 0.1670791357755661,
+      "learning_rate": 0.00019594116443487293,
+      "loss": 0.2366,
+      "step": 12800
+    },
+    {
+      "epoch": 0.35638882436756586,
+      "grad_norm": 0.1353309154510498,
+      "learning_rate": 0.00019589883706296385,
+      "loss": 0.2387,
+      "step": 12850
+    },
+    {
+      "epoch": 0.3577755513106303,
+      "grad_norm": 0.16561363637447357,
+      "learning_rate": 0.00019585629474974415,
+      "loss": 0.2373,
+      "step": 12900
+    },
+    {
+      "epoch": 0.35916227825369473,
+      "grad_norm": 0.16978101432323456,
+      "learning_rate": 0.00019581353759056528,
+      "loss": 0.2383,
+      "step": 12950
+    },
+    {
+      "epoch": 0.3605490051967592,
+      "grad_norm": 0.13398033380508423,
+      "learning_rate": 0.0001957705656812604,
+      "loss": 0.2389,
+      "step": 13000
+    },
+    {
+      "epoch": 0.3605490051967592,
+      "eval_loss": 0.2349192500114441,
+      "eval_runtime": 500.9767,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 13000
+    },
+    {
+      "epoch": 0.36193573213982366,
+      "grad_norm": 0.17141664028167725,
+      "learning_rate": 0.00019572737911814387,
+      "loss": 0.2379,
+      "step": 13050
+    },
+    {
+      "epoch": 0.3633224590828881,
+      "grad_norm": 0.25635290145874023,
+      "learning_rate": 0.00019568397799801118,
+      "loss": 0.2354,
+      "step": 13100
+    },
+    {
+      "epoch": 0.3647091860259526,
+      "grad_norm": 0.19244590401649475,
+      "learning_rate": 0.00019564036241813876,
+      "loss": 0.2372,
+      "step": 13150
+    },
+    {
+      "epoch": 0.36609591296901706,
+      "grad_norm": 0.1587456613779068,
+      "learning_rate": 0.00019559653247628364,
+      "loss": 0.2399,
+      "step": 13200
+    },
+    {
+      "epoch": 0.3674826399120815,
+      "grad_norm": 0.22146746516227722,
+      "learning_rate": 0.0001955524882706834,
+      "loss": 0.2356,
+      "step": 13250
+    },
+    {
+      "epoch": 0.368869366855146,
+      "grad_norm": 0.21101641654968262,
+      "learning_rate": 0.0001955082299000558,
+      "loss": 0.2425,
+      "step": 13300
+    },
+    {
+      "epoch": 0.37025609379821045,
+      "grad_norm": 0.16459371149539948,
+      "learning_rate": 0.0001954637574635986,
+      "loss": 0.239,
+      "step": 13350
+    },
+    {
+      "epoch": 0.37164282074127486,
+      "grad_norm": 0.15547959506511688,
+      "learning_rate": 0.0001954190710609894,
+      "loss": 0.2358,
+      "step": 13400
+    },
+    {
+      "epoch": 0.3730295476843393,
+      "grad_norm": 0.1342894285917282,
+      "learning_rate": 0.00019537417079238534,
+      "loss": 0.2363,
+      "step": 13450
+    },
+    {
+      "epoch": 0.3744162746274038,
+      "grad_norm": 0.14169098436832428,
+      "learning_rate": 0.0001953290567584229,
+      "loss": 0.2355,
+      "step": 13500
+    },
+    {
+      "epoch": 0.37580300157046825,
+      "grad_norm": 0.17943793535232544,
+      "learning_rate": 0.00019528372906021772,
+      "loss": 0.2354,
+      "step": 13550
+    },
+    {
+      "epoch": 0.3771897285135327,
+      "grad_norm": 0.20254671573638916,
+      "learning_rate": 0.0001952381877993643,
+      "loss": 0.2411,
+      "step": 13600
+    },
+    {
+      "epoch": 0.3785764554565972,
+      "grad_norm": 0.1362125426530838,
+      "learning_rate": 0.0001951924330779358,
+      "loss": 0.2383,
+      "step": 13650
+    },
+    {
+      "epoch": 0.37996318239966165,
+      "grad_norm": 0.19201667606830597,
+      "learning_rate": 0.0001951464649984838,
+      "loss": 0.2398,
+      "step": 13700
+    },
+    {
+      "epoch": 0.3813499093427261,
+      "grad_norm": 0.15204668045043945,
+      "learning_rate": 0.0001951002836640382,
+      "loss": 0.2347,
+      "step": 13750
+    },
+    {
+      "epoch": 0.3827366362857906,
+      "grad_norm": 0.14426596462726593,
+      "learning_rate": 0.00019505388917810665,
+      "loss": 0.2399,
+      "step": 13800
+    },
+    {
+      "epoch": 0.38412336322885504,
+      "grad_norm": 0.1463170200586319,
+      "learning_rate": 0.0001950072816446748,
+      "loss": 0.2316,
+      "step": 13850
+    },
+    {
+      "epoch": 0.38551009017191945,
+      "grad_norm": 0.15552669763565063,
+      "learning_rate": 0.00019496046116820566,
+      "loss": 0.2354,
+      "step": 13900
+    },
+    {
+      "epoch": 0.3868968171149839,
+      "grad_norm": 0.16742919385433197,
+      "learning_rate": 0.00019491342785363952,
+      "loss": 0.2388,
+      "step": 13950
+    },
+    {
+      "epoch": 0.3882835440580484,
+      "grad_norm": 0.16111566126346588,
+      "learning_rate": 0.00019486618180639375,
+      "loss": 0.2385,
+      "step": 14000
+    },
+    {
+      "epoch": 0.3882835440580484,
+      "eval_loss": 0.23382489383220673,
+      "eval_runtime": 500.6533,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 14000
+    },
+    {
+      "epoch": 0.38967027100111284,
+      "grad_norm": 0.15741662681102753,
+      "learning_rate": 0.00019481872313236256,
+      "loss": 0.2374,
+      "step": 14050
+    },
+    {
+      "epoch": 0.3910569979441773,
+      "grad_norm": 0.15046770870685577,
+      "learning_rate": 0.00019477105193791664,
+      "loss": 0.2379,
+      "step": 14100
+    },
+    {
+      "epoch": 0.3924437248872418,
+      "grad_norm": 0.14219743013381958,
+      "learning_rate": 0.00019472316832990308,
+      "loss": 0.2434,
+      "step": 14150
+    },
+    {
+      "epoch": 0.39383045183030624,
+      "grad_norm": 0.15226851403713226,
+      "learning_rate": 0.000194675072415645,
+      "loss": 0.2427,
+      "step": 14200
+    },
+    {
+      "epoch": 0.3952171787733707,
+      "grad_norm": 0.19782114028930664,
+      "learning_rate": 0.00019462676430294143,
+      "loss": 0.2357,
+      "step": 14250
+    },
+    {
+      "epoch": 0.39660390571643517,
+      "grad_norm": 0.14243118464946747,
+      "learning_rate": 0.00019457824410006692,
+      "loss": 0.2343,
+      "step": 14300
+    },
+    {
+      "epoch": 0.3979906326594996,
+      "grad_norm": 0.22301803529262543,
+      "learning_rate": 0.00019452951191577155,
+      "loss": 0.2406,
+      "step": 14350
+    },
+    {
+      "epoch": 0.39937735960256404,
+      "grad_norm": 0.13103021681308746,
+      "learning_rate": 0.00019448056785928032,
+      "loss": 0.2398,
+      "step": 14400
+    },
+    {
+      "epoch": 0.4007640865456285,
+      "grad_norm": 0.16922806203365326,
+      "learning_rate": 0.00019443141204029325,
+      "loss": 0.2363,
+      "step": 14450
+    },
+    {
+      "epoch": 0.40215081348869297,
+      "grad_norm": 0.17801126837730408,
+      "learning_rate": 0.00019438204456898492,
+      "loss": 0.2377,
+      "step": 14500
+    },
+    {
+      "epoch": 0.40353754043175744,
+      "grad_norm": 0.14513610303401947,
+      "learning_rate": 0.0001943324655560043,
+      "loss": 0.241,
+      "step": 14550
+    },
+    {
+      "epoch": 0.4049242673748219,
+      "grad_norm": 0.14587055146694183,
+      "learning_rate": 0.00019428267511247457,
+      "loss": 0.2345,
+      "step": 14600
+    },
+    {
+      "epoch": 0.40631099431788636,
+      "grad_norm": 0.17200471460819244,
+      "learning_rate": 0.00019423267334999267,
+      "loss": 0.2345,
+      "step": 14650
+    },
+    {
+      "epoch": 0.40769772126095083,
+      "grad_norm": 0.16612234711647034,
+      "learning_rate": 0.00019418246038062928,
+      "loss": 0.235,
+      "step": 14700
+    },
+    {
+      "epoch": 0.4090844482040153,
+      "grad_norm": 0.14822156727313995,
+      "learning_rate": 0.00019413203631692843,
+      "loss": 0.2384,
+      "step": 14750
+    },
+    {
+      "epoch": 0.4104711751470797,
+      "grad_norm": 0.15960198640823364,
+      "learning_rate": 0.00019408140127190725,
+      "loss": 0.2375,
+      "step": 14800
+    },
+    {
+      "epoch": 0.41185790209014417,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019403157434308126,
+      "loss": 0.233,
+      "step": 14850
+    },
+    {
+      "epoch": 0.41324462903320863,
+      "grad_norm": 0.15910230576992035,
+      "learning_rate": 0.00019398154500404588,
+      "loss": 0.2728,
+      "step": 14900
+    },
+    {
+      "epoch": 0.4146313559762731,
+      "grad_norm": 0.16004903614521027,
+      "learning_rate": 0.0001939302861212685,
+      "loss": 0.2359,
+      "step": 14950
+    },
+    {
+      "epoch": 0.41601808291933756,
+      "grad_norm": 0.1622370034456253,
+      "learning_rate": 0.00019387881670936035,
+      "loss": 0.2413,
+      "step": 15000
+    },
+    {
+      "epoch": 0.41601808291933756,
+      "eval_loss": 0.23365913331508636,
+      "eval_runtime": 500.916,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 15000
+    },
+    {
+      "epoch": 0.417404809862402,
+      "grad_norm": 0.1744803488254547,
+      "learning_rate": 0.00019382713688368162,
+      "loss": 0.2406,
+      "step": 15050
+    },
+    {
+      "epoch": 0.4187915368054665,
+      "grad_norm": 0.19140714406967163,
+      "learning_rate": 0.00019377524676006397,
+      "loss": 0.2385,
+      "step": 15100
+    },
+    {
+      "epoch": 0.42017826374853096,
+      "grad_norm": 0.14320451021194458,
+      "learning_rate": 0.00019372314645481052,
+      "loss": 0.2384,
+      "step": 15150
+    },
+    {
+      "epoch": 0.4215649906915954,
+      "grad_norm": 0.18620997667312622,
+      "learning_rate": 0.00019367083608469546,
+      "loss": 0.2343,
+      "step": 15200
+    },
+    {
+      "epoch": 0.42295171763465983,
+      "grad_norm": 0.13473859429359436,
+      "learning_rate": 0.00019361831576696382,
+      "loss": 0.2399,
+      "step": 15250
+    },
+    {
+      "epoch": 0.4243384445777243,
+      "grad_norm": 0.15213748812675476,
+      "learning_rate": 0.00019356558561933108,
+      "loss": 0.2358,
+      "step": 15300
+    },
+    {
+      "epoch": 0.42572517152078876,
+      "grad_norm": 0.16841459274291992,
+      "learning_rate": 0.0001935126457599832,
+      "loss": 0.2332,
+      "step": 15350
+    },
+    {
+      "epoch": 0.4271118984638532,
+      "grad_norm": 0.14978626370429993,
+      "learning_rate": 0.00019345949630757603,
+      "loss": 0.2382,
+      "step": 15400
+    },
+    {
+      "epoch": 0.4284986254069177,
+      "grad_norm": 0.18397267162799835,
+      "learning_rate": 0.00019340613738123526,
+      "loss": 0.2328,
+      "step": 15450
+    },
+    {
+      "epoch": 0.42988535234998215,
+      "grad_norm": 0.13535378873348236,
+      "learning_rate": 0.000193352569100556,
+      "loss": 0.2278,
+      "step": 15500
+    },
+    {
+      "epoch": 0.4312720792930466,
+      "grad_norm": 0.1288972645998001,
+      "learning_rate": 0.00019329879158560274,
+      "loss": 0.2385,
+      "step": 15550
+    },
+    {
+      "epoch": 0.4326588062361111,
+      "grad_norm": 0.1488959789276123,
+      "learning_rate": 0.0001932448049569088,
+      "loss": 0.2352,
+      "step": 15600
+    },
+    {
+      "epoch": 0.43404553317917555,
+      "grad_norm": 0.16358473896980286,
+      "learning_rate": 0.00019319060933547624,
+      "loss": 0.2362,
+      "step": 15650
+    },
+    {
+      "epoch": 0.43543226012223996,
+      "grad_norm": 0.13347339630126953,
+      "learning_rate": 0.00019313620484277553,
+      "loss": 0.2376,
+      "step": 15700
+    },
+    {
+      "epoch": 0.4368189870653044,
+      "grad_norm": 0.13555756211280823,
+      "learning_rate": 0.0001930815916007453,
+      "loss": 0.2308,
+      "step": 15750
+    },
+    {
+      "epoch": 0.4382057140083689,
+      "grad_norm": 0.13955436646938324,
+      "learning_rate": 0.0001930267697317921,
+      "loss": 0.2329,
+      "step": 15800
+    },
+    {
+      "epoch": 0.43959244095143335,
+      "grad_norm": 0.1596931517124176,
+      "learning_rate": 0.00019297173935879,
+      "loss": 0.2322,
+      "step": 15850
+    },
+    {
+      "epoch": 0.4409791678944978,
+      "grad_norm": 0.14860297739505768,
+      "learning_rate": 0.00019291650060508045,
+      "loss": 0.234,
+      "step": 15900
+    },
+    {
+      "epoch": 0.4423658948375623,
+      "grad_norm": 0.14575625956058502,
+      "learning_rate": 0.00019286105359447194,
+      "loss": 0.2362,
+      "step": 15950
+    },
+    {
+      "epoch": 0.44375262178062674,
+      "grad_norm": 0.1400967240333557,
+      "learning_rate": 0.00019280539845123974,
+      "loss": 0.2358,
+      "step": 16000
+    },
+    {
+      "epoch": 0.44375262178062674,
+      "eval_loss": 0.23256094753742218,
+      "eval_runtime": 500.6637,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 16000
+    },
+    {
+      "epoch": 0.4451393487236912,
+      "grad_norm": 0.2537101209163666,
+      "learning_rate": 0.00019274953530012563,
+      "loss": 0.2363,
+      "step": 16050
+    },
+    {
+      "epoch": 0.4465260756667557,
+      "grad_norm": 0.192925825715065,
+      "learning_rate": 0.0001926934642663375,
+      "loss": 0.2343,
+      "step": 16100
+    },
+    {
+      "epoch": 0.4479128026098201,
+      "grad_norm": 0.17011120915412903,
+      "learning_rate": 0.0001926371854755493,
+      "loss": 0.2362,
+      "step": 16150
+    },
+    {
+      "epoch": 0.44929952955288455,
+      "grad_norm": 0.1474524289369583,
+      "learning_rate": 0.00019258069905390065,
+      "loss": 0.2359,
+      "step": 16200
+    },
+    {
+      "epoch": 0.450686256495949,
+      "grad_norm": 0.15591026842594147,
+      "learning_rate": 0.00019252400512799643,
+      "loss": 0.2338,
+      "step": 16250
+    },
+    {
+      "epoch": 0.4520729834390135,
+      "grad_norm": 0.14443908631801605,
+      "learning_rate": 0.00019246710382490664,
+      "loss": 0.2421,
+      "step": 16300
+    },
+    {
+      "epoch": 0.45345971038207794,
+      "grad_norm": 0.12614597380161285,
+      "learning_rate": 0.00019240999527216608,
+      "loss": 0.2373,
+      "step": 16350
+    },
+    {
+      "epoch": 0.4548464373251424,
+      "grad_norm": 0.1438266485929489,
+      "learning_rate": 0.00019235267959777415,
+      "loss": 0.2443,
+      "step": 16400
+    },
+    {
+      "epoch": 0.45623316426820687,
+      "grad_norm": 0.14473649859428406,
+      "learning_rate": 0.00019229515693019436,
+      "loss": 0.241,
+      "step": 16450
+    },
+    {
+      "epoch": 0.45761989121127133,
+      "grad_norm": 0.13498128950595856,
+      "learning_rate": 0.00019223742739835423,
+      "loss": 0.2393,
+      "step": 16500
+    },
+    {
+      "epoch": 0.4590066181543358,
+      "grad_norm": 0.14498169720172882,
+      "learning_rate": 0.0001921794911316449,
+      "loss": 0.2363,
+      "step": 16550
+    },
+    {
+      "epoch": 0.4603933450974002,
+      "grad_norm": 0.14319288730621338,
+      "learning_rate": 0.00019212134825992091,
+      "loss": 0.2359,
+      "step": 16600
+    },
+    {
+      "epoch": 0.4617800720404647,
+      "grad_norm": 0.12314629554748535,
+      "learning_rate": 0.00019206299891349983,
+      "loss": 0.23,
+      "step": 16650
+    },
+    {
+      "epoch": 0.46316679898352914,
+      "grad_norm": 0.14780518412590027,
+      "learning_rate": 0.00019200444322316207,
+      "loss": 0.2381,
+      "step": 16700
+    },
+    {
+      "epoch": 0.4645535259265936,
+      "grad_norm": 0.1493334025144577,
+      "learning_rate": 0.0001919456813201504,
+      "loss": 0.2345,
+      "step": 16750
+    },
+    {
+      "epoch": 0.46594025286965807,
+      "grad_norm": 0.11972863227128983,
+      "learning_rate": 0.00019188671333616992,
+      "loss": 0.235,
+      "step": 16800
+    },
+    {
+      "epoch": 0.46732697981272253,
+      "grad_norm": 0.13366112112998962,
+      "learning_rate": 0.00019182753940338753,
+      "loss": 0.2306,
+      "step": 16850
+    },
+    {
+      "epoch": 0.468713706755787,
+      "grad_norm": 0.13790684938430786,
+      "learning_rate": 0.00019176815965443186,
+      "loss": 0.2366,
+      "step": 16900
+    },
+    {
+      "epoch": 0.47010043369885146,
+      "grad_norm": 0.14081595838069916,
+      "learning_rate": 0.0001917085742223926,
+      "loss": 0.2368,
+      "step": 16950
+    },
+    {
+      "epoch": 0.4714871606419159,
+      "grad_norm": 0.13987073302268982,
+      "learning_rate": 0.00019164878324082074,
+      "loss": 0.2337,
+      "step": 17000
+    },
+    {
+      "epoch": 0.4714871606419159,
+      "eval_loss": 0.2317454218864441,
+      "eval_runtime": 500.9301,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 17000
+    },
+    {
+      "epoch": 0.47287388758498033,
+      "grad_norm": 0.1430695503950119,
+      "learning_rate": 0.00019158878684372778,
+      "loss": 0.2346,
+      "step": 17050
+    },
+    {
+      "epoch": 0.4742606145280448,
+      "grad_norm": 0.14264121651649475,
+      "learning_rate": 0.00019152858516558564,
+      "loss": 0.2339,
+      "step": 17100
+    },
+    {
+      "epoch": 0.47564734147110926,
+      "grad_norm": 0.15278013050556183,
+      "learning_rate": 0.00019146817834132644,
+      "loss": 0.2333,
+      "step": 17150
+    },
+    {
+      "epoch": 0.47703406841417373,
+      "grad_norm": 0.15283286571502686,
+      "learning_rate": 0.000191407566506342,
+      "loss": 0.2323,
+      "step": 17200
+    },
+    {
+      "epoch": 0.4784207953572382,
+      "grad_norm": 0.13433212041854858,
+      "learning_rate": 0.00019134674979648367,
+      "loss": 0.2406,
+      "step": 17250
+    },
+    {
+      "epoch": 0.47980752230030266,
+      "grad_norm": 0.14129064977169037,
+      "learning_rate": 0.00019128572834806203,
+      "loss": 0.2353,
+      "step": 17300
+    },
+    {
+      "epoch": 0.4811942492433671,
+      "grad_norm": 0.14736846089363098,
+      "learning_rate": 0.00019122450229784653,
+      "loss": 0.2312,
+      "step": 17350
+    },
+    {
+      "epoch": 0.4825809761864316,
+      "grad_norm": 0.14513076841831207,
+      "learning_rate": 0.00019116307178306514,
+      "loss": 0.2358,
+      "step": 17400
+    },
+    {
+      "epoch": 0.48396770312949605,
+      "grad_norm": 0.14358818531036377,
+      "learning_rate": 0.0001911014369414042,
+      "loss": 0.2376,
+      "step": 17450
+    },
+    {
+      "epoch": 0.48535443007256046,
+      "grad_norm": 0.14574295282363892,
+      "learning_rate": 0.00019103959791100792,
+      "loss": 0.2306,
+      "step": 17500
+    },
+    {
+      "epoch": 0.4867411570156249,
+      "grad_norm": 0.1347060352563858,
+      "learning_rate": 0.00019097755483047827,
+      "loss": 0.2341,
+      "step": 17550
+    },
+    {
+      "epoch": 0.4881278839586894,
+      "grad_norm": 0.1792859435081482,
+      "learning_rate": 0.00019091530783887448,
+      "loss": 0.2392,
+      "step": 17600
+    },
+    {
+      "epoch": 0.48951461090175385,
+      "grad_norm": 0.11206398904323578,
+      "learning_rate": 0.00019085285707571282,
+      "loss": 0.236,
+      "step": 17650
+    },
+    {
+      "epoch": 0.4909013378448183,
+      "grad_norm": 0.16337329149246216,
+      "learning_rate": 0.0001907902026809663,
+      "loss": 0.239,
+      "step": 17700
+    },
+    {
+      "epoch": 0.4922880647878828,
+      "grad_norm": 0.14579764008522034,
+      "learning_rate": 0.0001907273447950644,
+      "loss": 0.2258,
+      "step": 17750
+    },
+    {
+      "epoch": 0.49367479173094725,
+      "grad_norm": 0.1381896585226059,
+      "learning_rate": 0.00019066428355889257,
+      "loss": 0.2366,
+      "step": 17800
+    },
+    {
+      "epoch": 0.4950615186740117,
+      "grad_norm": 0.13557949662208557,
+      "learning_rate": 0.00019060101911379208,
+      "loss": 0.236,
+      "step": 17850
+    },
+    {
+      "epoch": 0.4964482456170762,
+      "grad_norm": 0.13205058872699738,
+      "learning_rate": 0.00019053755160155974,
+      "loss": 0.237,
+      "step": 17900
+    },
+    {
+      "epoch": 0.4978349725601406,
+      "grad_norm": 0.1766868382692337,
+      "learning_rate": 0.00019047388116444735,
+      "loss": 0.241,
+      "step": 17950
+    },
+    {
+      "epoch": 0.49922169950320505,
+      "grad_norm": 0.1567864567041397,
+      "learning_rate": 0.00019041000794516171,
+      "loss": 0.2269,
+      "step": 18000
+    },
+    {
+      "epoch": 0.49922169950320505,
+      "eval_loss": 0.23145872354507446,
+      "eval_runtime": 500.5681,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 18000
+    },
+    {
+      "epoch": 0.5006084264462696,
+      "grad_norm": 0.13615478575229645,
+      "learning_rate": 0.00019034593208686396,
+      "loss": 0.2347,
+      "step": 18050
+    },
+    {
+      "epoch": 0.501995153389334,
+      "grad_norm": 0.13786327838897705,
+      "learning_rate": 0.00019028165373316948,
+      "loss": 0.2335,
+      "step": 18100
+    },
+    {
+      "epoch": 0.5033818803323985,
+      "grad_norm": 0.14584092795848846,
+      "learning_rate": 0.0001902171730281476,
+      "loss": 0.2392,
+      "step": 18150
+    },
+    {
+      "epoch": 0.5047686072754629,
+      "grad_norm": 0.18500222265720367,
+      "learning_rate": 0.000190152490116321,
+      "loss": 0.2336,
+      "step": 18200
+    },
+    {
+      "epoch": 0.5061553342185273,
+      "grad_norm": 0.14118489623069763,
+      "learning_rate": 0.0001900876051426658,
+      "loss": 0.2362,
+      "step": 18250
+    },
+    {
+      "epoch": 0.5075420611615918,
+      "grad_norm": 0.18030238151550293,
+      "learning_rate": 0.00019002251825261078,
+      "loss": 0.2363,
+      "step": 18300
+    },
+    {
+      "epoch": 0.5089287881046562,
+      "grad_norm": 0.1916930228471756,
+      "learning_rate": 0.00018995722959203745,
+      "loss": 0.2342,
+      "step": 18350
+    },
+    {
+      "epoch": 0.5103155150477208,
+      "grad_norm": 0.1503581702709198,
+      "learning_rate": 0.00018989173930727951,
+      "loss": 0.2365,
+      "step": 18400
+    },
+    {
+      "epoch": 0.5117022419907852,
+      "grad_norm": 0.14816977083683014,
+      "learning_rate": 0.0001898260475451225,
+      "loss": 0.2387,
+      "step": 18450
+    },
+    {
+      "epoch": 0.5130889689338497,
+      "grad_norm": 0.13476118445396423,
+      "learning_rate": 0.00018976015445280363,
+      "loss": 0.2343,
+      "step": 18500
+    },
+    {
+      "epoch": 0.5144756958769141,
+      "grad_norm": 0.17522576451301575,
+      "learning_rate": 0.00018969406017801127,
+      "loss": 0.2299,
+      "step": 18550
+    },
+    {
+      "epoch": 0.5158624228199786,
+      "grad_norm": 0.13437584042549133,
+      "learning_rate": 0.00018962776486888485,
+      "loss": 0.2342,
+      "step": 18600
+    },
+    {
+      "epoch": 0.517249149763043,
+      "grad_norm": 0.14156264066696167,
+      "learning_rate": 0.0001895612686740142,
+      "loss": 0.2363,
+      "step": 18650
+    },
+    {
+      "epoch": 0.5186358767061074,
+      "grad_norm": 0.11037924140691757,
+      "learning_rate": 0.00018949457174243954,
+      "loss": 0.2343,
+      "step": 18700
+    },
+    {
+      "epoch": 0.520022603649172,
+      "grad_norm": 0.1362009048461914,
+      "learning_rate": 0.00018942767422365094,
+      "loss": 0.2363,
+      "step": 18750
+    },
+    {
+      "epoch": 0.5214093305922364,
+      "grad_norm": 0.1261095106601715,
+      "learning_rate": 0.00018936057626758808,
+      "loss": 0.2341,
+      "step": 18800
+    },
+    {
+      "epoch": 0.5227960575353009,
+      "grad_norm": 0.13382628560066223,
+      "learning_rate": 0.00018929327802463987,
+      "loss": 0.2309,
+      "step": 18850
+    },
+    {
+      "epoch": 0.5241827844783653,
+      "grad_norm": 0.15190520882606506,
+      "learning_rate": 0.00018922577964564417,
+      "loss": 0.2338,
+      "step": 18900
+    },
+    {
+      "epoch": 0.5255695114214298,
+      "grad_norm": 0.13708838820457458,
+      "learning_rate": 0.00018915808128188734,
+      "loss": 0.2338,
+      "step": 18950
+    },
+    {
+      "epoch": 0.5269562383644942,
+      "grad_norm": 0.20378737151622772,
+      "learning_rate": 0.0001890901830851041,
+      "loss": 0.2341,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5269562383644942,
+      "eval_loss": 0.23116359114646912,
+      "eval_runtime": 500.7638,
+      "eval_samples_per_second": 5.705,
+      "eval_steps_per_second": 5.705,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5283429653075588,
+      "grad_norm": 0.17179715633392334,
+      "learning_rate": 0.00018902208520747685,
+      "loss": 0.2363,
+      "step": 19050
+    },
+    {
+      "epoch": 0.5297296922506232,
+      "grad_norm": 0.13991795480251312,
+      "learning_rate": 0.00018895378780163578,
+      "loss": 0.2308,
+      "step": 19100
+    },
+    {
+      "epoch": 0.5311164191936876,
+      "grad_norm": 0.11662200093269348,
+      "learning_rate": 0.0001888852910206581,
+      "loss": 0.2354,
+      "step": 19150
+    },
+    {
+      "epoch": 0.5325031461367521,
+      "grad_norm": 0.1577063351869583,
+      "learning_rate": 0.00018881659501806804,
+      "loss": 0.2331,
+      "step": 19200
+    },
+    {
+      "epoch": 0.5338898730798165,
+      "grad_norm": 0.14893421530723572,
+      "learning_rate": 0.0001887476999478362,
+      "loss": 0.2345,
+      "step": 19250
+    },
+    {
+      "epoch": 0.535276600022881,
+      "grad_norm": 0.14458926022052765,
+      "learning_rate": 0.00018867860596437946,
+      "loss": 0.2364,
+      "step": 19300
+    },
+    {
+      "epoch": 0.5366633269659454,
+      "grad_norm": 0.18197046220302582,
+      "learning_rate": 0.00018860931322256056,
+      "loss": 0.2316,
+      "step": 19350
+    },
+    {
+      "epoch": 0.53805005390901,
+      "grad_norm": 0.12696345150470734,
+      "learning_rate": 0.0001885398218776876,
+      "loss": 0.2288,
+      "step": 19400
+    },
+    {
+      "epoch": 0.5394367808520744,
+      "grad_norm": 0.14459608495235443,
+      "learning_rate": 0.00018847013208551393,
+      "loss": 0.2342,
+      "step": 19450
+    },
+    {
+      "epoch": 0.5408235077951389,
+      "grad_norm": 0.13681089878082275,
+      "learning_rate": 0.00018840024400223758,
+      "loss": 0.2341,
+      "step": 19500
+    },
+    {
+      "epoch": 0.5422102347382033,
+      "grad_norm": 0.1358567178249359,
+      "learning_rate": 0.00018833015778450113,
+      "loss": 0.239,
+      "step": 19550
+    },
+    {
+      "epoch": 0.5435969616812677,
+      "grad_norm": 0.1429983228445053,
+      "learning_rate": 0.0001882598735893912,
+      "loss": 0.234,
+      "step": 19600
+    },
+    {
+      "epoch": 0.5449836886243322,
+      "grad_norm": 0.15259206295013428,
+      "learning_rate": 0.00018818939157443806,
+      "loss": 0.2333,
+      "step": 19650
+    },
+    {
+      "epoch": 0.5463704155673966,
+      "grad_norm": 0.1499055027961731,
+      "learning_rate": 0.00018811871189761554,
+      "loss": 0.2335,
+      "step": 19700
+    },
+    {
+      "epoch": 0.5477571425104611,
+      "grad_norm": 0.15547756850719452,
+      "learning_rate": 0.0001880478347173403,
+      "loss": 0.2331,
+      "step": 19750
+    },
+    {
+      "epoch": 0.5491438694535256,
+      "grad_norm": 0.13615499436855316,
+      "learning_rate": 0.00018797676019247187,
+      "loss": 0.2327,
+      "step": 19800
+    },
+    {
+      "epoch": 0.5505305963965901,
+      "grad_norm": 0.15891136229038239,
+      "learning_rate": 0.00018790548848231188,
+      "loss": 0.2293,
+      "step": 19850
+    },
+    {
+      "epoch": 0.5519173233396545,
+      "grad_norm": 0.1028260812163353,
+      "learning_rate": 0.0001878340197466041,
+      "loss": 0.2337,
+      "step": 19900
+    },
+    {
+      "epoch": 0.553304050282719,
+      "grad_norm": 0.15393692255020142,
+      "learning_rate": 0.0001877623541455338,
+      "loss": 0.2332,
+      "step": 19950
+    },
+    {
+      "epoch": 0.5546907772257834,
+      "grad_norm": 0.11807084083557129,
+      "learning_rate": 0.0001876904918397275,
+      "loss": 0.2352,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5546907772257834,
+      "eval_loss": 0.2310873419046402,
+      "eval_runtime": 501.0545,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5560775041688478,
+      "grad_norm": 0.1603621393442154,
+      "learning_rate": 0.00018761843299025267,
+      "loss": 0.2347,
+      "step": 20050
+    },
+    {
+      "epoch": 0.5574642311119123,
+      "grad_norm": 0.14295394718647003,
+      "learning_rate": 0.00018754617775861718,
+      "loss": 0.2335,
+      "step": 20100
+    },
+    {
+      "epoch": 0.5588509580549768,
+      "grad_norm": 0.1290232539176941,
+      "learning_rate": 0.0001874737263067692,
+      "loss": 0.2337,
+      "step": 20150
+    },
+    {
+      "epoch": 0.5602376849980413,
+      "grad_norm": 0.16112935543060303,
+      "learning_rate": 0.00018740107879709655,
+      "loss": 0.2354,
+      "step": 20200
+    },
+    {
+      "epoch": 0.5616244119411057,
+      "grad_norm": 0.13674217462539673,
+      "learning_rate": 0.00018732823539242664,
+      "loss": 0.23,
+      "step": 20250
+    },
+    {
+      "epoch": 0.5630111388841702,
+      "grad_norm": 0.18549004197120667,
+      "learning_rate": 0.00018725519625602578,
+      "loss": 0.2353,
+      "step": 20300
+    },
+    {
+      "epoch": 0.5643978658272346,
+      "grad_norm": 0.13107050955295563,
+      "learning_rate": 0.0001871819615515991,
+      "loss": 0.2392,
+      "step": 20350
+    },
+    {
+      "epoch": 0.5657845927702991,
+      "grad_norm": 0.13590605556964874,
+      "learning_rate": 0.00018710853144329002,
+      "loss": 0.2347,
+      "step": 20400
+    },
+    {
+      "epoch": 0.5671713197133635,
+      "grad_norm": 0.13591018319129944,
+      "learning_rate": 0.0001870349060956799,
+      "loss": 0.229,
+      "step": 20450
+    },
+    {
+      "epoch": 0.568558046656428,
+      "grad_norm": 0.11401943862438202,
+      "learning_rate": 0.00018696108567378773,
+      "loss": 0.2326,
+      "step": 20500
+    },
+    {
+      "epoch": 0.5699447735994925,
+      "grad_norm": 0.18518146872520447,
+      "learning_rate": 0.00018688707034306978,
+      "loss": 0.2351,
+      "step": 20550
+    },
+    {
+      "epoch": 0.5713315005425569,
+      "grad_norm": 0.1642865538597107,
+      "learning_rate": 0.00018681286026941905,
+      "loss": 0.2384,
+      "step": 20600
+    },
+    {
+      "epoch": 0.5727182274856214,
+      "grad_norm": 0.133639395236969,
+      "learning_rate": 0.00018673845561916513,
+      "loss": 0.2324,
+      "step": 20650
+    },
+    {
+      "epoch": 0.5741049544286858,
+      "grad_norm": 0.120590940117836,
+      "learning_rate": 0.00018666385655907367,
+      "loss": 0.2315,
+      "step": 20700
+    },
+    {
+      "epoch": 0.5754916813717503,
+      "grad_norm": 0.15754735469818115,
+      "learning_rate": 0.00018658906325634604,
+      "loss": 0.2388,
+      "step": 20750
+    },
+    {
+      "epoch": 0.5768784083148147,
+      "grad_norm": 0.15975181758403778,
+      "learning_rate": 0.00018651407587861905,
+      "loss": 0.2376,
+      "step": 20800
+    },
+    {
+      "epoch": 0.5782651352578793,
+      "grad_norm": 0.13276700675487518,
+      "learning_rate": 0.0001864388945939644,
+      "loss": 0.2379,
+      "step": 20850
+    },
+    {
+      "epoch": 0.5796518622009437,
+      "grad_norm": 0.16388626396656036,
+      "learning_rate": 0.0001863635195708885,
+      "loss": 0.2332,
+      "step": 20900
+    },
+    {
+      "epoch": 0.5810385891440081,
+      "grad_norm": 0.18847975134849548,
+      "learning_rate": 0.0001862879509783319,
+      "loss": 0.2381,
+      "step": 20950
+    },
+    {
+      "epoch": 0.5824253160870726,
+      "grad_norm": 0.24493199586868286,
+      "learning_rate": 0.00018621218898566907,
+      "loss": 0.2328,
+      "step": 21000
+    },
+    {
+      "epoch": 0.5824253160870726,
+      "eval_loss": 0.23020677268505096,
+      "eval_runtime": 499.9502,
+      "eval_samples_per_second": 5.715,
+      "eval_steps_per_second": 5.715,
+      "step": 21000
+    },
+    {
+      "epoch": 0.583812043030137,
+      "grad_norm": 0.16316668689250946,
+      "learning_rate": 0.00018613623376270794,
+      "loss": 0.2429,
+      "step": 21050
+    },
+    {
+      "epoch": 0.5851987699732015,
+      "grad_norm": 0.13449080288410187,
+      "learning_rate": 0.0001860600854796895,
+      "loss": 0.2298,
+      "step": 21100
+    },
+    {
+      "epoch": 0.5865854969162659,
+      "grad_norm": 0.11589767783880234,
+      "learning_rate": 0.00018598374430728746,
+      "loss": 0.2344,
+      "step": 21150
+    },
+    {
+      "epoch": 0.5879722238593305,
+      "grad_norm": 0.11659828573465347,
+      "learning_rate": 0.0001859072104166079,
+      "loss": 0.2333,
+      "step": 21200
+    },
+    {
+      "epoch": 0.5893589508023949,
+      "grad_norm": 0.155133455991745,
+      "learning_rate": 0.00018583048397918884,
+      "loss": 0.2362,
+      "step": 21250
+    },
+    {
+      "epoch": 0.5907456777454594,
+      "grad_norm": 0.16488181054592133,
+      "learning_rate": 0.00018575356516699977,
+      "loss": 0.2334,
+      "step": 21300
+    },
+    {
+      "epoch": 0.5921324046885238,
+      "grad_norm": 0.18307441473007202,
+      "learning_rate": 0.0001856764541524415,
+      "loss": 0.2272,
+      "step": 21350
+    },
+    {
+      "epoch": 0.5935191316315882,
+      "grad_norm": 0.1316101998090744,
+      "learning_rate": 0.00018559915110834553,
+      "loss": 0.2342,
+      "step": 21400
+    },
+    {
+      "epoch": 0.5949058585746527,
+      "grad_norm": 0.1548035889863968,
+      "learning_rate": 0.00018552165620797382,
+      "loss": 0.2323,
+      "step": 21450
+    },
+    {
+      "epoch": 0.5962925855177171,
+      "grad_norm": 0.13214810192584991,
+      "learning_rate": 0.00018544396962501828,
+      "loss": 0.2319,
+      "step": 21500
+    },
+    {
+      "epoch": 0.5976793124607817,
+      "grad_norm": 0.14733006060123444,
+      "learning_rate": 0.00018536609153360046,
+      "loss": 0.237,
+      "step": 21550
+    },
+    {
+      "epoch": 0.5990660394038461,
+      "grad_norm": 0.14465801417827606,
+      "learning_rate": 0.0001852880221082712,
+      "loss": 0.2318,
+      "step": 21600
+    },
+    {
+      "epoch": 0.6004527663469106,
+      "grad_norm": 0.14646270871162415,
+      "learning_rate": 0.00018520976152401012,
+      "loss": 0.2368,
+      "step": 21650
+    },
+    {
+      "epoch": 0.601839493289975,
+      "grad_norm": 0.14174975454807281,
+      "learning_rate": 0.00018513130995622535,
+      "loss": 0.2349,
+      "step": 21700
+    },
+    {
+      "epoch": 0.6032262202330395,
+      "grad_norm": 0.12805262207984924,
+      "learning_rate": 0.00018505266758075302,
+      "loss": 0.2315,
+      "step": 21750
+    },
+    {
+      "epoch": 0.6046129471761039,
+      "grad_norm": 0.1598140299320221,
+      "learning_rate": 0.00018497383457385697,
+      "loss": 0.2332,
+      "step": 21800
+    },
+    {
+      "epoch": 0.6059996741191683,
+      "grad_norm": 0.13651584088802338,
+      "learning_rate": 0.00018489481111222828,
+      "loss": 0.2348,
+      "step": 21850
+    },
+    {
+      "epoch": 0.6073864010622329,
+      "grad_norm": 0.13091818988323212,
+      "learning_rate": 0.0001848155973729849,
+      "loss": 0.2287,
+      "step": 21900
+    },
+    {
+      "epoch": 0.6087731280052973,
+      "grad_norm": 0.17191646993160248,
+      "learning_rate": 0.00018473619353367128,
+      "loss": 0.2342,
+      "step": 21950
+    },
+    {
+      "epoch": 0.6101598549483618,
+      "grad_norm": 0.10674546658992767,
+      "learning_rate": 0.0001846565997722579,
+      "loss": 0.2309,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6101598549483618,
+      "eval_loss": 0.22999995946884155,
+      "eval_runtime": 499.7816,
+      "eval_samples_per_second": 5.716,
+      "eval_steps_per_second": 5.716,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6115465818914262,
+      "grad_norm": 0.1321185827255249,
+      "learning_rate": 0.000184576816267141,
+      "loss": 0.2347,
+      "step": 22050
+    },
+    {
+      "epoch": 0.6129333088344907,
+      "grad_norm": 0.12945061922073364,
+      "learning_rate": 0.00018449684319714202,
+      "loss": 0.2298,
+      "step": 22100
+    },
+    {
+      "epoch": 0.6143200357775551,
+      "grad_norm": 0.16403023898601532,
+      "learning_rate": 0.00018441668074150732,
+      "loss": 0.2276,
+      "step": 22150
+    },
+    {
+      "epoch": 0.6157067627206196,
+      "grad_norm": 0.14253240823745728,
+      "learning_rate": 0.00018433632907990775,
+      "loss": 0.2315,
+      "step": 22200
+    },
+    {
+      "epoch": 0.617093489663684,
+      "grad_norm": 0.1752641350030899,
+      "learning_rate": 0.00018425578839243814,
+      "loss": 0.2327,
+      "step": 22250
+    },
+    {
+      "epoch": 0.6184802166067485,
+      "grad_norm": 0.11023511737585068,
+      "learning_rate": 0.00018417505885961712,
+      "loss": 0.2341,
+      "step": 22300
+    },
+    {
+      "epoch": 0.619866943549813,
+      "grad_norm": 0.1494046449661255,
+      "learning_rate": 0.00018409414066238654,
+      "loss": 0.2307,
+      "step": 22350
+    },
+    {
+      "epoch": 0.6212536704928774,
+      "grad_norm": 0.13288947939872742,
+      "learning_rate": 0.00018401303398211103,
+      "loss": 0.2307,
+      "step": 22400
+    },
+    {
+      "epoch": 0.6226403974359419,
+      "grad_norm": 0.13972090184688568,
+      "learning_rate": 0.0001839317390005778,
+      "loss": 0.231,
+      "step": 22450
+    },
+    {
+      "epoch": 0.6240271243790063,
+      "grad_norm": 0.16141022741794586,
+      "learning_rate": 0.000183850255899996,
+      "loss": 0.2395,
+      "step": 22500
+    },
+    {
+      "epoch": 0.6254138513220708,
+      "grad_norm": 0.17160941660404205,
+      "learning_rate": 0.00018376858486299647,
+      "loss": 0.2371,
+      "step": 22550
+    },
+    {
+      "epoch": 0.6268005782651352,
+      "grad_norm": 0.13852784037590027,
+      "learning_rate": 0.00018368672607263132,
+      "loss": 0.2286,
+      "step": 22600
+    },
+    {
+      "epoch": 0.6281873052081998,
+      "grad_norm": 0.16050252318382263,
+      "learning_rate": 0.00018360467971237338,
+      "loss": 0.2345,
+      "step": 22650
+    },
+    {
+      "epoch": 0.6295740321512642,
+      "grad_norm": 0.12499688565731049,
+      "learning_rate": 0.0001835224459661159,
+      "loss": 0.232,
+      "step": 22700
+    },
+    {
+      "epoch": 0.6309607590943286,
+      "grad_norm": 0.16804257035255432,
+      "learning_rate": 0.00018344002501817226,
+      "loss": 0.2336,
+      "step": 22750
+    },
+    {
+      "epoch": 0.6323474860373931,
+      "grad_norm": 0.15330076217651367,
+      "learning_rate": 0.00018335741705327526,
+      "loss": 0.2314,
+      "step": 22800
+    },
+    {
+      "epoch": 0.6337342129804575,
+      "grad_norm": 0.12613581120967865,
+      "learning_rate": 0.00018327462225657692,
+      "loss": 0.235,
+      "step": 22850
+    },
+    {
+      "epoch": 0.635120939923522,
+      "grad_norm": 0.16671714186668396,
+      "learning_rate": 0.00018319164081364802,
+      "loss": 0.2319,
+      "step": 22900
+    },
+    {
+      "epoch": 0.6365076668665864,
+      "grad_norm": 0.11536330729722977,
+      "learning_rate": 0.00018310847291047776,
+      "loss": 0.2296,
+      "step": 22950
+    },
+    {
+      "epoch": 0.637894393809651,
+      "grad_norm": 0.1565777063369751,
+      "learning_rate": 0.00018302511873347305,
+      "loss": 0.23,
+      "step": 23000
+    },
+    {
+      "epoch": 0.637894393809651,
+      "eval_loss": 0.22944478690624237,
+      "eval_runtime": 500.3715,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 23000
+    },
+    {
+      "epoch": 0.6392811207527154,
+      "grad_norm": 0.18740278482437134,
+      "learning_rate": 0.00018294157846945853,
+      "loss": 0.2315,
+      "step": 23050
+    },
+    {
+      "epoch": 0.6406678476957799,
+      "grad_norm": 0.14261969923973083,
+      "learning_rate": 0.00018285785230567577,
+      "loss": 0.2291,
+      "step": 23100
+    },
+    {
+      "epoch": 0.6420545746388443,
+      "grad_norm": 0.16137824952602386,
+      "learning_rate": 0.00018277394042978307,
+      "loss": 0.2325,
+      "step": 23150
+    },
+    {
+      "epoch": 0.6434413015819087,
+      "grad_norm": 0.1337035894393921,
+      "learning_rate": 0.00018268984302985495,
+      "loss": 0.2322,
+      "step": 23200
+    },
+    {
+      "epoch": 0.6448280285249732,
+      "grad_norm": 0.11618442833423615,
+      "learning_rate": 0.0001826055602943818,
+      "loss": 0.2349,
+      "step": 23250
+    },
+    {
+      "epoch": 0.6462147554680376,
+      "grad_norm": 0.12656192481517792,
+      "learning_rate": 0.0001825210924122693,
+      "loss": 0.234,
+      "step": 23300
+    },
+    {
+      "epoch": 0.6476014824111022,
+      "grad_norm": 0.11272765696048737,
+      "learning_rate": 0.0001824364395728382,
+      "loss": 0.2313,
+      "step": 23350
+    },
+    {
+      "epoch": 0.6489882093541666,
+      "grad_norm": 0.13132552802562714,
+      "learning_rate": 0.00018235160196582384,
+      "loss": 0.2289,
+      "step": 23400
+    },
+    {
+      "epoch": 0.6503749362972311,
+      "grad_norm": 0.11405663937330246,
+      "learning_rate": 0.00018226657978137554,
+      "loss": 0.2356,
+      "step": 23450
+    },
+    {
+      "epoch": 0.6517616632402955,
+      "grad_norm": 0.15040431916713715,
+      "learning_rate": 0.00018218137321005643,
+      "loss": 0.2303,
+      "step": 23500
+    },
+    {
+      "epoch": 0.65314839018336,
+      "grad_norm": 0.13074640929698944,
+      "learning_rate": 0.00018209598244284288,
+      "loss": 0.2319,
+      "step": 23550
+    },
+    {
+      "epoch": 0.6545351171264244,
+      "grad_norm": 0.14512640237808228,
+      "learning_rate": 0.00018201040767112413,
+      "loss": 0.2393,
+      "step": 23600
+    },
+    {
+      "epoch": 0.6559218440694888,
+      "grad_norm": 0.10800650715827942,
+      "learning_rate": 0.00018192464908670176,
+      "loss": 0.2318,
+      "step": 23650
+    },
+    {
+      "epoch": 0.6573085710125534,
+      "grad_norm": 0.12321613729000092,
+      "learning_rate": 0.00018183870688178946,
+      "loss": 0.2331,
+      "step": 23700
+    },
+    {
+      "epoch": 0.6586952979556178,
+      "grad_norm": 0.1868344396352768,
+      "learning_rate": 0.00018175258124901236,
+      "loss": 0.2317,
+      "step": 23750
+    },
+    {
+      "epoch": 0.6600820248986823,
+      "grad_norm": 0.11993540078401566,
+      "learning_rate": 0.00018166627238140674,
+      "loss": 0.2309,
+      "step": 23800
+    },
+    {
+      "epoch": 0.6614687518417467,
+      "grad_norm": 0.11594246327877045,
+      "learning_rate": 0.00018157978047241962,
+      "loss": 0.2322,
+      "step": 23850
+    },
+    {
+      "epoch": 0.6628554787848112,
+      "grad_norm": 0.18056848645210266,
+      "learning_rate": 0.00018149310571590824,
+      "loss": 0.2335,
+      "step": 23900
+    },
+    {
+      "epoch": 0.6642422057278756,
+      "grad_norm": 0.14387637376785278,
+      "learning_rate": 0.00018140624830613965,
+      "loss": 0.2366,
+      "step": 23950
+    },
+    {
+      "epoch": 0.6656289326709401,
+      "grad_norm": 0.16983430087566376,
+      "learning_rate": 0.00018131920843779035,
+      "loss": 0.2361,
+      "step": 24000
+    },
+    {
+      "epoch": 0.6656289326709401,
+      "eval_loss": 0.22958332300186157,
+      "eval_runtime": 500.0504,
+      "eval_samples_per_second": 5.713,
+      "eval_steps_per_second": 5.713,
+      "step": 24000
+    },
+    {
+      "epoch": 0.6670156596140046,
+      "grad_norm": 0.13279864192008972,
+      "learning_rate": 0.0001812319863059457,
+      "loss": 0.2359,
+      "step": 24050
+    },
+    {
+      "epoch": 0.668402386557069,
+      "grad_norm": 0.11594101786613464,
+      "learning_rate": 0.00018114458210609962,
+      "loss": 0.2358,
+      "step": 24100
+    },
+    {
+      "epoch": 0.6697891135001335,
+      "grad_norm": 0.13613513112068176,
+      "learning_rate": 0.0001810569960341541,
+      "loss": 0.2278,
+      "step": 24150
+    },
+    {
+      "epoch": 0.6711758404431979,
+      "grad_norm": 0.12295212596654892,
+      "learning_rate": 0.00018096922828641878,
+      "loss": 0.2315,
+      "step": 24200
+    },
+    {
+      "epoch": 0.6725625673862624,
+      "grad_norm": 0.17889654636383057,
+      "learning_rate": 0.00018088127905961047,
+      "loss": 0.2305,
+      "step": 24250
+    },
+    {
+      "epoch": 0.6739492943293268,
+      "grad_norm": 0.16525234282016754,
+      "learning_rate": 0.0001807931485508528,
+      "loss": 0.2304,
+      "step": 24300
+    },
+    {
+      "epoch": 0.6753360212723913,
+      "grad_norm": 0.11446121335029602,
+      "learning_rate": 0.0001807048369576756,
+      "loss": 0.2333,
+      "step": 24350
+    },
+    {
+      "epoch": 0.6767227482154557,
+      "grad_norm": 0.14533396065235138,
+      "learning_rate": 0.00018061634447801467,
+      "loss": 0.2354,
+      "step": 24400
+    },
+    {
+      "epoch": 0.6781094751585203,
+      "grad_norm": 0.14825408160686493,
+      "learning_rate": 0.0001805276713102112,
+      "loss": 0.2316,
+      "step": 24450
+    },
+    {
+      "epoch": 0.6794962021015847,
+      "grad_norm": 0.148117333650589,
+      "learning_rate": 0.00018043881765301135,
+      "loss": 0.2338,
+      "step": 24500
+    },
+    {
+      "epoch": 0.6808829290446491,
+      "grad_norm": 0.10264230519533157,
+      "learning_rate": 0.00018034978370556583,
+      "loss": 0.2298,
+      "step": 24550
+    },
+    {
+      "epoch": 0.6822696559877136,
+      "grad_norm": 0.12200962007045746,
+      "learning_rate": 0.00018026056966742945,
+      "loss": 0.2284,
+      "step": 24600
+    },
+    {
+      "epoch": 0.683656382930778,
+      "grad_norm": 0.14096751809120178,
+      "learning_rate": 0.00018017117573856063,
+      "loss": 0.2333,
+      "step": 24650
+    },
+    {
+      "epoch": 0.6850431098738425,
+      "grad_norm": 0.16554249823093414,
+      "learning_rate": 0.00018008160211932108,
+      "loss": 0.2316,
+      "step": 24700
+    },
+    {
+      "epoch": 0.686429836816907,
+      "grad_norm": 0.11679153889417648,
+      "learning_rate": 0.0001799918490104751,
+      "loss": 0.2287,
+      "step": 24750
+    },
+    {
+      "epoch": 0.6878165637599715,
+      "grad_norm": 0.1387365758419037,
+      "learning_rate": 0.00017990191661318943,
+      "loss": 0.2356,
+      "step": 24800
+    },
+    {
+      "epoch": 0.6892032907030359,
+      "grad_norm": 0.1255553960800171,
+      "learning_rate": 0.00017981180512903255,
+      "loss": 0.2342,
+      "step": 24850
+    },
+    {
+      "epoch": 0.6905900176461004,
+      "grad_norm": 0.17247521877288818,
+      "learning_rate": 0.00017972151475997443,
+      "loss": 0.2303,
+      "step": 24900
+    },
+    {
+      "epoch": 0.6919767445891648,
+      "grad_norm": 0.20023292303085327,
+      "learning_rate": 0.0001796310457083859,
+      "loss": 0.2346,
+      "step": 24950
+    },
+    {
+      "epoch": 0.6933634715322292,
+      "grad_norm": 0.11909276992082596,
+      "learning_rate": 0.0001795403981770383,
+      "loss": 0.2264,
+      "step": 25000
+    },
+    {
+      "epoch": 0.6933634715322292,
+      "eval_loss": 0.2287738025188446,
+      "eval_runtime": 500.5021,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 25000
+    },
+    {
+      "epoch": 0.6947501984752937,
+      "grad_norm": 0.13509905338287354,
+      "learning_rate": 0.00017944957236910308,
+      "loss": 0.2318,
+      "step": 25050
+    },
+    {
+      "epoch": 0.6961369254183581,
+      "grad_norm": 0.15455523133277893,
+      "learning_rate": 0.0001793585684881511,
+      "loss": 0.2325,
+      "step": 25100
+    },
+    {
+      "epoch": 0.6975236523614227,
+      "grad_norm": 0.1231105625629425,
+      "learning_rate": 0.00017926738673815248,
+      "loss": 0.2303,
+      "step": 25150
+    },
+    {
+      "epoch": 0.6989103793044871,
+      "grad_norm": 0.19073975086212158,
+      "learning_rate": 0.00017917602732347597,
+      "loss": 0.2309,
+      "step": 25200
+    },
+    {
+      "epoch": 0.7002971062475516,
+      "grad_norm": 0.16656789183616638,
+      "learning_rate": 0.00017908449044888854,
+      "loss": 0.2334,
+      "step": 25250
+    },
+    {
+      "epoch": 0.701683833190616,
+      "grad_norm": 0.12732850015163422,
+      "learning_rate": 0.00017899277631955486,
+      "loss": 0.2348,
+      "step": 25300
+    },
+    {
+      "epoch": 0.7030705601336805,
+      "grad_norm": 0.20655155181884766,
+      "learning_rate": 0.00017890088514103692,
+      "loss": 0.2355,
+      "step": 25350
+    },
+    {
+      "epoch": 0.7044572870767449,
+      "grad_norm": 0.10959596931934357,
+      "learning_rate": 0.00017880881711929353,
+      "loss": 0.2304,
+      "step": 25400
+    },
+    {
+      "epoch": 0.7058440140198093,
+      "grad_norm": 0.15412519872188568,
+      "learning_rate": 0.00017871657246067987,
+      "loss": 0.2336,
+      "step": 25450
+    },
+    {
+      "epoch": 0.7072307409628739,
+      "grad_norm": 0.16455277800559998,
+      "learning_rate": 0.00017862415137194702,
+      "loss": 0.2319,
+      "step": 25500
+    },
+    {
+      "epoch": 0.7086174679059383,
+      "grad_norm": 0.1389029622077942,
+      "learning_rate": 0.00017853340773211896,
+      "loss": 0.2294,
+      "step": 25550
+    },
+    {
+      "epoch": 0.7100041948490028,
+      "grad_norm": 0.14564301073551178,
+      "learning_rate": 0.0001784424950430794,
+      "loss": 0.2326,
+      "step": 25600
+    },
+    {
+      "epoch": 0.7113909217920672,
+      "grad_norm": 0.1606937199831009,
+      "learning_rate": 0.00017834955293674994,
+      "loss": 0.23,
+      "step": 25650
+    },
+    {
+      "epoch": 0.7127776487351317,
+      "grad_norm": 0.13401974737644196,
+      "learning_rate": 0.00017825643522291457,
+      "loss": 0.2361,
+      "step": 25700
+    },
+    {
+      "epoch": 0.7141643756781961,
+      "grad_norm": 0.12457278370857239,
+      "learning_rate": 0.0001781631421102812,
+      "loss": 0.232,
+      "step": 25750
+    },
+    {
+      "epoch": 0.7155511026212606,
+      "grad_norm": 0.13395826518535614,
+      "learning_rate": 0.0001780696738079508,
+      "loss": 0.2294,
+      "step": 25800
+    },
+    {
+      "epoch": 0.7169378295643251,
+      "grad_norm": 0.13083291053771973,
+      "learning_rate": 0.00017797603052541704,
+      "loss": 0.2328,
+      "step": 25850
+    },
+    {
+      "epoch": 0.7183245565073895,
+      "grad_norm": 0.14696165919303894,
+      "learning_rate": 0.00017788221247256583,
+      "loss": 0.233,
+      "step": 25900
+    },
+    {
+      "epoch": 0.719711283450454,
+      "grad_norm": 0.1512746810913086,
+      "learning_rate": 0.00017778821985967467,
+      "loss": 0.2319,
+      "step": 25950
+    },
+    {
+      "epoch": 0.7210980103935184,
+      "grad_norm": 0.1260426789522171,
+      "learning_rate": 0.00017769405289741247,
+      "loss": 0.2341,
+      "step": 26000
+    },
+    {
+      "epoch": 0.7210980103935184,
+      "eval_loss": 0.22873948514461517,
+      "eval_runtime": 500.274,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 26000
+    },
+    {
+      "epoch": 0.7224847373365829,
+      "grad_norm": 0.1653342843055725,
+      "learning_rate": 0.00017759971179683875,
+      "loss": 0.2316,
+      "step": 26050
+    },
+    {
+      "epoch": 0.7238714642796473,
+      "grad_norm": 0.13507039844989777,
+      "learning_rate": 0.00017750519676940348,
+      "loss": 0.2357,
+      "step": 26100
+    },
+    {
+      "epoch": 0.7252581912227118,
+      "grad_norm": 0.128819540143013,
+      "learning_rate": 0.00017741050802694635,
+      "loss": 0.231,
+      "step": 26150
+    },
+    {
+      "epoch": 0.7266449181657763,
+      "grad_norm": 0.13130728900432587,
+      "learning_rate": 0.00017731564578169647,
+      "loss": 0.2305,
+      "step": 26200
+    },
+    {
+      "epoch": 0.7280316451088408,
+      "grad_norm": 0.12267379462718964,
+      "learning_rate": 0.0001772206102462718,
+      "loss": 0.2345,
+      "step": 26250
+    },
+    {
+      "epoch": 0.7294183720519052,
+      "grad_norm": 0.14595343172550201,
+      "learning_rate": 0.0001771254016336787,
+      "loss": 0.2294,
+      "step": 26300
+    },
+    {
+      "epoch": 0.7308050989949696,
+      "grad_norm": 0.13935647904872894,
+      "learning_rate": 0.0001770300201573114,
+      "loss": 0.2358,
+      "step": 26350
+    },
+    {
+      "epoch": 0.7321918259380341,
+      "grad_norm": 0.11328408867120743,
+      "learning_rate": 0.00017693446603095174,
+      "loss": 0.2339,
+      "step": 26400
+    },
+    {
+      "epoch": 0.7335785528810985,
+      "grad_norm": 0.19857367873191833,
+      "learning_rate": 0.00017683873946876835,
+      "loss": 0.2269,
+      "step": 26450
+    },
+    {
+      "epoch": 0.734965279824163,
+      "grad_norm": 0.16225670278072357,
+      "learning_rate": 0.00017674284068531641,
+      "loss": 0.2307,
+      "step": 26500
+    },
+    {
+      "epoch": 0.7363520067672275,
+      "grad_norm": 0.1412588506937027,
+      "learning_rate": 0.00017664676989553714,
+      "loss": 0.229,
+      "step": 26550
+    },
+    {
+      "epoch": 0.737738733710292,
+      "grad_norm": 0.14530161023139954,
+      "learning_rate": 0.00017655052731475724,
+      "loss": 0.2308,
+      "step": 26600
+    },
+    {
+      "epoch": 0.7391254606533564,
+      "grad_norm": 0.12190265953540802,
+      "learning_rate": 0.0001764541131586885,
+      "loss": 0.2294,
+      "step": 26650
+    },
+    {
+      "epoch": 0.7405121875964209,
+      "grad_norm": 0.13169080018997192,
+      "learning_rate": 0.00017635752764342717,
+      "loss": 0.2275,
+      "step": 26700
+    },
+    {
+      "epoch": 0.7418989145394853,
+      "grad_norm": 0.12346599251031876,
+      "learning_rate": 0.00017626077098545367,
+      "loss": 0.2326,
+      "step": 26750
+    },
+    {
+      "epoch": 0.7432856414825497,
+      "grad_norm": 0.12645727396011353,
+      "learning_rate": 0.00017616384340163197,
+      "loss": 0.2369,
+      "step": 26800
+    },
+    {
+      "epoch": 0.7446723684256142,
+      "grad_norm": 0.12523086369037628,
+      "learning_rate": 0.00017606674510920915,
+      "loss": 0.2291,
+      "step": 26850
+    },
+    {
+      "epoch": 0.7460590953686786,
+      "grad_norm": 0.14181695878505707,
+      "learning_rate": 0.0001759694763258149,
+      "loss": 0.2266,
+      "step": 26900
+    },
+    {
+      "epoch": 0.7474458223117432,
+      "grad_norm": 0.13824765384197235,
+      "learning_rate": 0.00017587203726946102,
+      "loss": 0.2281,
+      "step": 26950
+    },
+    {
+      "epoch": 0.7488325492548076,
+      "grad_norm": 0.1162494495511055,
+      "learning_rate": 0.000175774428158541,
+      "loss": 0.2326,
+      "step": 27000
+    },
+    {
+      "epoch": 0.7488325492548076,
+      "eval_loss": 0.22845527529716492,
+      "eval_runtime": 500.3687,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 27000
+    },
+    {
+      "epoch": 0.7502192761978721,
+      "grad_norm": 0.1494184285402298,
+      "learning_rate": 0.0001756766492118294,
+      "loss": 0.2335,
+      "step": 27050
+    },
+    {
+      "epoch": 0.7516060031409365,
+      "grad_norm": 0.14270345866680145,
+      "learning_rate": 0.00017557870064848153,
+      "loss": 0.2378,
+      "step": 27100
+    },
+    {
+      "epoch": 0.752992730084001,
+      "grad_norm": 0.17542113363742828,
+      "learning_rate": 0.0001754805826880328,
+      "loss": 0.2344,
+      "step": 27150
+    },
+    {
+      "epoch": 0.7543794570270654,
+      "grad_norm": 0.14542442560195923,
+      "learning_rate": 0.0001753822955503983,
+      "loss": 0.2413,
+      "step": 27200
+    },
+    {
+      "epoch": 0.75576618397013,
+      "grad_norm": 0.13541916012763977,
+      "learning_rate": 0.00017528383945587236,
+      "loss": 0.2331,
+      "step": 27250
+    },
+    {
+      "epoch": 0.7571529109131944,
+      "grad_norm": 0.1555178165435791,
+      "learning_rate": 0.00017518521462512796,
+      "loss": 0.2314,
+      "step": 27300
+    },
+    {
+      "epoch": 0.7585396378562588,
+      "grad_norm": 0.10956469923257828,
+      "learning_rate": 0.0001750864212792162,
+      "loss": 0.2312,
+      "step": 27350
+    },
+    {
+      "epoch": 0.7599263647993233,
+      "grad_norm": 0.15572619438171387,
+      "learning_rate": 0.00017498745963956603,
+      "loss": 0.2334,
+      "step": 27400
+    },
+    {
+      "epoch": 0.7613130917423877,
+      "grad_norm": 0.1467774659395218,
+      "learning_rate": 0.0001748883299279835,
+      "loss": 0.231,
+      "step": 27450
+    },
+    {
+      "epoch": 0.7626998186854522,
+      "grad_norm": 0.12245896458625793,
+      "learning_rate": 0.00017478903236665136,
+      "loss": 0.2374,
+      "step": 27500
+    },
+    {
+      "epoch": 0.7640865456285166,
+      "grad_norm": 0.10392642766237259,
+      "learning_rate": 0.00017468956717812864,
+      "loss": 0.2313,
+      "step": 27550
+    },
+    {
+      "epoch": 0.7654732725715812,
+      "grad_norm": 0.1239921823143959,
+      "learning_rate": 0.00017458993458534998,
+      "loss": 0.2349,
+      "step": 27600
+    },
+    {
+      "epoch": 0.7668599995146456,
+      "grad_norm": 0.13776883482933044,
+      "learning_rate": 0.00017449013481162534,
+      "loss": 0.2362,
+      "step": 27650
+    },
+    {
+      "epoch": 0.7682467264577101,
+      "grad_norm": 0.1389874666929245,
+      "learning_rate": 0.00017439016808063932,
+      "loss": 0.2304,
+      "step": 27700
+    },
+    {
+      "epoch": 0.7696334534007745,
+      "grad_norm": 0.11973544955253601,
+      "learning_rate": 0.00017429003461645072,
+      "loss": 0.2352,
+      "step": 27750
+    },
+    {
+      "epoch": 0.7710201803438389,
+      "grad_norm": 0.13108691573143005,
+      "learning_rate": 0.00017418973464349209,
+      "loss": 0.2311,
+      "step": 27800
+    },
+    {
+      "epoch": 0.7724069072869034,
+      "grad_norm": 0.12594327330589294,
+      "learning_rate": 0.00017408926838656912,
+      "loss": 0.2332,
+      "step": 27850
+    },
+    {
+      "epoch": 0.7737936342299678,
+      "grad_norm": 0.14845065772533417,
+      "learning_rate": 0.00017398863607086024,
+      "loss": 0.2307,
+      "step": 27900
+    },
+    {
+      "epoch": 0.7751803611730324,
+      "grad_norm": 0.11298257112503052,
+      "learning_rate": 0.0001738878379219161,
+      "loss": 0.2331,
+      "step": 27950
+    },
+    {
+      "epoch": 0.7765670881160968,
+      "grad_norm": 0.11864858120679855,
+      "learning_rate": 0.000173786874165659,
+      "loss": 0.231,
+      "step": 28000
+    },
+    {
+      "epoch": 0.7765670881160968,
+      "eval_loss": 0.22779151797294617,
+      "eval_runtime": 501.235,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 28000
+    },
+    {
+      "epoch": 0.7779538150591613,
+      "grad_norm": 0.11632022261619568,
+      "learning_rate": 0.00017368574502838239,
+      "loss": 0.229,
+      "step": 28050
+    },
+    {
+      "epoch": 0.7793405420022257,
+      "grad_norm": 0.1431494504213333,
+      "learning_rate": 0.00017358445073675042,
+      "loss": 0.2318,
+      "step": 28100
+    },
+    {
+      "epoch": 0.7807272689452902,
+      "grad_norm": 0.12157493084669113,
+      "learning_rate": 0.00017348299151779748,
+      "loss": 0.2343,
+      "step": 28150
+    },
+    {
+      "epoch": 0.7821139958883546,
+      "grad_norm": 0.11989067494869232,
+      "learning_rate": 0.00017338136759892752,
+      "loss": 0.2347,
+      "step": 28200
+    },
+    {
+      "epoch": 0.783500722831419,
+      "grad_norm": 0.12739787995815277,
+      "learning_rate": 0.00017327957920791365,
+      "loss": 0.2328,
+      "step": 28250
+    },
+    {
+      "epoch": 0.7848874497744835,
+      "grad_norm": 0.15567833185195923,
+      "learning_rate": 0.00017317762657289768,
+      "loss": 0.2297,
+      "step": 28300
+    },
+    {
+      "epoch": 0.786274176717548,
+      "grad_norm": 0.12073542922735214,
+      "learning_rate": 0.00017307550992238943,
+      "loss": 0.2296,
+      "step": 28350
+    },
+    {
+      "epoch": 0.7876609036606125,
+      "grad_norm": 0.1477758288383484,
+      "learning_rate": 0.0001729732294852665,
+      "loss": 0.2328,
+      "step": 28400
+    },
+    {
+      "epoch": 0.7890476306036769,
+      "grad_norm": 0.1612139195203781,
+      "learning_rate": 0.00017287078549077343,
+      "loss": 0.2314,
+      "step": 28450
+    },
+    {
+      "epoch": 0.7904343575467414,
+      "grad_norm": 0.15718688070774078,
+      "learning_rate": 0.00017276817816852145,
+      "loss": 0.2289,
+      "step": 28500
+    },
+    {
+      "epoch": 0.7918210844898058,
+      "grad_norm": 0.1242058202624321,
+      "learning_rate": 0.0001726654077484878,
+      "loss": 0.2301,
+      "step": 28550
+    },
+    {
+      "epoch": 0.7932078114328703,
+      "grad_norm": 0.13269132375717163,
+      "learning_rate": 0.0001725624744610153,
+      "loss": 0.2303,
+      "step": 28600
+    },
+    {
+      "epoch": 0.7945945383759347,
+      "grad_norm": 0.12394677847623825,
+      "learning_rate": 0.0001724593785368118,
+      "loss": 0.2362,
+      "step": 28650
+    },
+    {
+      "epoch": 0.7959812653189992,
+      "grad_norm": 0.1323787420988083,
+      "learning_rate": 0.00017235612020694978,
+      "loss": 0.2281,
+      "step": 28700
+    },
+    {
+      "epoch": 0.7973679922620637,
+      "grad_norm": 0.1532479077577591,
+      "learning_rate": 0.00017225269970286552,
+      "loss": 0.2321,
+      "step": 28750
+    },
+    {
+      "epoch": 0.7987547192051281,
+      "grad_norm": 0.14882826805114746,
+      "learning_rate": 0.00017214911725635897,
+      "loss": 0.2316,
+      "step": 28800
+    },
+    {
+      "epoch": 0.8001414461481926,
+      "grad_norm": 0.11855613440275192,
+      "learning_rate": 0.00017204537309959292,
+      "loss": 0.2271,
+      "step": 28850
+    },
+    {
+      "epoch": 0.801528173091257,
+      "grad_norm": 0.15302914381027222,
+      "learning_rate": 0.00017194146746509268,
+      "loss": 0.2296,
+      "step": 28900
+    },
+    {
+      "epoch": 0.8029149000343215,
+      "grad_norm": 0.11822402477264404,
+      "learning_rate": 0.00017183740058574547,
+      "loss": 0.2301,
+      "step": 28950
+    },
+    {
+      "epoch": 0.8043016269773859,
+      "grad_norm": 0.1369016021490097,
+      "learning_rate": 0.00017173317269479992,
+      "loss": 0.2291,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8043016269773859,
+      "eval_loss": 0.2273886650800705,
+      "eval_runtime": 501.6607,
+      "eval_samples_per_second": 5.695,
+      "eval_steps_per_second": 5.695,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8056883539204505,
+      "grad_norm": 0.12872962653636932,
+      "learning_rate": 0.00017162878402586553,
+      "loss": 0.2344,
+      "step": 29050
+    },
+    {
+      "epoch": 0.8070750808635149,
+      "grad_norm": 0.13491351902484894,
+      "learning_rate": 0.00017152423481291216,
+      "loss": 0.2357,
+      "step": 29100
+    },
+    {
+      "epoch": 0.8084618078065793,
+      "grad_norm": 0.12680833041667938,
+      "learning_rate": 0.00017141952529026945,
+      "loss": 0.2333,
+      "step": 29150
+    },
+    {
+      "epoch": 0.8098485347496438,
+      "grad_norm": 0.12384926527738571,
+      "learning_rate": 0.0001713146556926265,
+      "loss": 0.2421,
+      "step": 29200
+    },
+    {
+      "epoch": 0.8112352616927082,
+      "grad_norm": 0.13864979147911072,
+      "learning_rate": 0.00017120962625503098,
+      "loss": 0.2262,
+      "step": 29250
+    },
+    {
+      "epoch": 0.8126219886357727,
+      "grad_norm": 0.12703485786914825,
+      "learning_rate": 0.00017110443721288901,
+      "loss": 0.2295,
+      "step": 29300
+    },
+    {
+      "epoch": 0.8140087155788371,
+      "grad_norm": 0.12121795862913132,
+      "learning_rate": 0.0001709990888019643,
+      "loss": 0.2286,
+      "step": 29350
+    },
+    {
+      "epoch": 0.8153954425219017,
+      "grad_norm": 0.11982162296772003,
+      "learning_rate": 0.00017089358125837783,
+      "loss": 0.2286,
+      "step": 29400
+    },
+    {
+      "epoch": 0.8167821694649661,
+      "grad_norm": 0.1372060328722,
+      "learning_rate": 0.00017078791481860725,
+      "loss": 0.2244,
+      "step": 29450
+    },
+    {
+      "epoch": 0.8181688964080306,
+      "grad_norm": 0.12731321156024933,
+      "learning_rate": 0.0001706820897194863,
+      "loss": 0.2259,
+      "step": 29500
+    },
+    {
+      "epoch": 0.819555623351095,
+      "grad_norm": 0.14031195640563965,
+      "learning_rate": 0.00017057610619820437,
+      "loss": 0.2297,
+      "step": 29550
+    },
+    {
+      "epoch": 0.8209423502941594,
+      "grad_norm": 0.13404880464076996,
+      "learning_rate": 0.0001704699644923059,
+      "loss": 0.2293,
+      "step": 29600
+    },
+    {
+      "epoch": 0.8223290772372239,
+      "grad_norm": 0.12400925159454346,
+      "learning_rate": 0.00017036366483968987,
+      "loss": 0.2263,
+      "step": 29650
+    },
+    {
+      "epoch": 0.8237158041802883,
+      "grad_norm": 0.14439739286899567,
+      "learning_rate": 0.00017025720747860937,
+      "loss": 0.2272,
+      "step": 29700
+    },
+    {
+      "epoch": 0.8251025311233529,
+      "grad_norm": 0.12196583300828934,
+      "learning_rate": 0.00017015059264767084,
+      "loss": 0.2337,
+      "step": 29750
+    },
+    {
+      "epoch": 0.8264892580664173,
+      "grad_norm": 0.13919509947299957,
+      "learning_rate": 0.00017004382058583367,
+      "loss": 0.2337,
+      "step": 29800
+    },
+    {
+      "epoch": 0.8278759850094818,
+      "grad_norm": 0.11371088027954102,
+      "learning_rate": 0.00016993689153240978,
+      "loss": 0.2252,
+      "step": 29850
+    },
+    {
+      "epoch": 0.8292627119525462,
+      "grad_norm": 0.1316608041524887,
+      "learning_rate": 0.00016982980572706282,
+      "loss": 0.2281,
+      "step": 29900
+    },
+    {
+      "epoch": 0.8306494388956107,
+      "grad_norm": 0.18003039062023163,
+      "learning_rate": 0.00016972256340980785,
+      "loss": 0.2296,
+      "step": 29950
+    },
+    {
+      "epoch": 0.8320361658386751,
+      "grad_norm": 0.16534283757209778,
+      "learning_rate": 0.0001696151648210107,
+      "loss": 0.2267,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8320361658386751,
+      "eval_loss": 0.22761212289333344,
+      "eval_runtime": 501.069,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8334228927817395,
+      "grad_norm": 0.11093872785568237,
+      "learning_rate": 0.00016950761020138747,
+      "loss": 0.234,
+      "step": 30050
+    },
+    {
+      "epoch": 0.834809619724804,
+      "grad_norm": 0.14647316932678223,
+      "learning_rate": 0.00016939989979200394,
+      "loss": 0.232,
+      "step": 30100
+    },
+    {
+      "epoch": 0.8361963466678685,
+      "grad_norm": 0.14312680065631866,
+      "learning_rate": 0.00016929203383427515,
+      "loss": 0.2299,
+      "step": 30150
+    },
+    {
+      "epoch": 0.837583073610933,
+      "grad_norm": 0.11662258952856064,
+      "learning_rate": 0.00016918401256996467,
+      "loss": 0.2298,
+      "step": 30200
+    },
+    {
+      "epoch": 0.8389698005539974,
+      "grad_norm": 0.11783650517463684,
+      "learning_rate": 0.0001690758362411843,
+      "loss": 0.2345,
+      "step": 30250
+    },
+    {
+      "epoch": 0.8403565274970619,
+      "grad_norm": 0.12562035024166107,
+      "learning_rate": 0.0001689675050903932,
+      "loss": 0.2341,
+      "step": 30300
+    },
+    {
+      "epoch": 0.8417432544401263,
+      "grad_norm": 0.1082848459482193,
+      "learning_rate": 0.00016885901936039774,
+      "loss": 0.2298,
+      "step": 30350
+    },
+    {
+      "epoch": 0.8431299813831908,
+      "grad_norm": 0.14080305397510529,
+      "learning_rate": 0.0001687503792943506,
+      "loss": 0.2364,
+      "step": 30400
+    },
+    {
+      "epoch": 0.8445167083262552,
+      "grad_norm": 0.133138969540596,
+      "learning_rate": 0.00016864158513575048,
+      "loss": 0.2293,
+      "step": 30450
+    },
+    {
+      "epoch": 0.8459034352693197,
+      "grad_norm": 0.13258026540279388,
+      "learning_rate": 0.00016853263712844136,
+      "loss": 0.2269,
+      "step": 30500
+    },
+    {
+      "epoch": 0.8472901622123842,
+      "grad_norm": 0.12311206012964249,
+      "learning_rate": 0.00016842353551661216,
+      "loss": 0.2297,
+      "step": 30550
+    },
+    {
+      "epoch": 0.8486768891554486,
+      "grad_norm": 0.12220294028520584,
+      "learning_rate": 0.00016831428054479597,
+      "loss": 0.2301,
+      "step": 30600
+    },
+    {
+      "epoch": 0.8500636160985131,
+      "grad_norm": 0.112845279276371,
+      "learning_rate": 0.00016820487245786968,
+      "loss": 0.2295,
+      "step": 30650
+    },
+    {
+      "epoch": 0.8514503430415775,
+      "grad_norm": 0.17439040541648865,
+      "learning_rate": 0.0001680953115010533,
+      "loss": 0.2299,
+      "step": 30700
+    },
+    {
+      "epoch": 0.852837069984642,
+      "grad_norm": 0.14124707877635956,
+      "learning_rate": 0.0001679855979199096,
+      "loss": 0.228,
+      "step": 30750
+    },
+    {
+      "epoch": 0.8542237969277064,
+      "grad_norm": 0.12298920005559921,
+      "learning_rate": 0.00016787573196034328,
+      "loss": 0.2293,
+      "step": 30800
+    },
+    {
+      "epoch": 0.855610523870771,
+      "grad_norm": 0.15425720810890198,
+      "learning_rate": 0.0001677657138686006,
+      "loss": 0.2263,
+      "step": 30850
+    },
+    {
+      "epoch": 0.8569972508138354,
+      "grad_norm": 0.13903729617595673,
+      "learning_rate": 0.0001676555438912689,
+      "loss": 0.2315,
+      "step": 30900
+    },
+    {
+      "epoch": 0.8583839777568998,
+      "grad_norm": 0.1249585896730423,
+      "learning_rate": 0.00016754522227527589,
+      "loss": 0.2289,
+      "step": 30950
+    },
+    {
+      "epoch": 0.8597707046999643,
+      "grad_norm": 0.13223236799240112,
+      "learning_rate": 0.00016743474926788908,
+      "loss": 0.2303,
+      "step": 31000
+    },
+    {
+      "epoch": 0.8597707046999643,
+      "eval_loss": 0.22721892595291138,
+      "eval_runtime": 500.5938,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 31000
+    },
+    {
+      "epoch": 0.8611574316430287,
+      "grad_norm": 0.15615518391132355,
+      "learning_rate": 0.00016732412511671544,
+      "loss": 0.2306,
+      "step": 31050
+    },
+    {
+      "epoch": 0.8625441585860932,
+      "grad_norm": 0.14526858925819397,
+      "learning_rate": 0.0001672133500697005,
+      "loss": 0.2307,
+      "step": 31100
+    },
+    {
+      "epoch": 0.8639308855291576,
+      "grad_norm": 0.11307808756828308,
+      "learning_rate": 0.00016710242437512825,
+      "loss": 0.237,
+      "step": 31150
+    },
+    {
+      "epoch": 0.8653176124722222,
+      "grad_norm": 0.1289224922657013,
+      "learning_rate": 0.00016699134828162017,
+      "loss": 0.2344,
+      "step": 31200
+    },
+    {
+      "epoch": 0.8667043394152866,
+      "grad_norm": 0.1631319522857666,
+      "learning_rate": 0.00016688012203813486,
+      "loss": 0.2305,
+      "step": 31250
+    },
+    {
+      "epoch": 0.8680910663583511,
+      "grad_norm": 0.1249733492732048,
+      "learning_rate": 0.00016676874589396744,
+      "loss": 0.2301,
+      "step": 31300
+    },
+    {
+      "epoch": 0.8694777933014155,
+      "grad_norm": 0.11502408981323242,
+      "learning_rate": 0.00016665722009874905,
+      "loss": 0.2319,
+      "step": 31350
+    },
+    {
+      "epoch": 0.8708645202444799,
+      "grad_norm": 0.13455846905708313,
+      "learning_rate": 0.00016654554490244628,
+      "loss": 0.228,
+      "step": 31400
+    },
+    {
+      "epoch": 0.8722512471875444,
+      "grad_norm": 0.1758633404970169,
+      "learning_rate": 0.00016643372055536048,
+      "loss": 0.2309,
+      "step": 31450
+    },
+    {
+      "epoch": 0.8736379741306088,
+      "grad_norm": 0.11880768090486526,
+      "learning_rate": 0.00016632174730812734,
+      "loss": 0.23,
+      "step": 31500
+    },
+    {
+      "epoch": 0.8750247010736734,
+      "grad_norm": 0.13718900084495544,
+      "learning_rate": 0.0001662096254117163,
+      "loss": 0.2279,
+      "step": 31550
+    },
+    {
+      "epoch": 0.8764114280167378,
+      "grad_norm": 0.1170978993177414,
+      "learning_rate": 0.00016609735511743,
+      "loss": 0.2306,
+      "step": 31600
+    },
+    {
+      "epoch": 0.8777981549598023,
+      "grad_norm": 0.15582193434238434,
+      "learning_rate": 0.0001659849366769036,
+      "loss": 0.2312,
+      "step": 31650
+    },
+    {
+      "epoch": 0.8791848819028667,
+      "grad_norm": 0.12351904064416885,
+      "learning_rate": 0.00016587237034210435,
+      "loss": 0.2292,
+      "step": 31700
+    },
+    {
+      "epoch": 0.8805716088459312,
+      "grad_norm": 0.18479709327220917,
+      "learning_rate": 0.000165759656365331,
+      "loss": 0.2274,
+      "step": 31750
+    },
+    {
+      "epoch": 0.8819583357889956,
+      "grad_norm": 0.14211027324199677,
+      "learning_rate": 0.00016564679499921328,
+      "loss": 0.2298,
+      "step": 31800
+    },
+    {
+      "epoch": 0.88334506273206,
+      "grad_norm": 0.1540357619524002,
+      "learning_rate": 0.00016553378649671112,
+      "loss": 0.2304,
+      "step": 31850
+    },
+    {
+      "epoch": 0.8847317896751246,
+      "grad_norm": 0.12503454089164734,
+      "learning_rate": 0.00016542063111111427,
+      "loss": 0.2294,
+      "step": 31900
+    },
+    {
+      "epoch": 0.886118516618189,
+      "grad_norm": 0.13658925890922546,
+      "learning_rate": 0.00016530732909604177,
+      "loss": 0.2291,
+      "step": 31950
+    },
+    {
+      "epoch": 0.8875052435612535,
+      "grad_norm": 0.15731070935726166,
+      "learning_rate": 0.00016519388070544128,
+      "loss": 0.2322,
+      "step": 32000
+    },
+    {
+      "epoch": 0.8875052435612535,
+      "eval_loss": 0.22673186659812927,
+      "eval_runtime": 500.5013,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 32000
+    },
+    {
+      "epoch": 0.8888919705043179,
+      "grad_norm": 0.11884371191263199,
+      "learning_rate": 0.0001650802861935885,
+      "loss": 0.2312,
+      "step": 32050
+    },
+    {
+      "epoch": 0.8902786974473824,
+      "grad_norm": 0.168379008769989,
+      "learning_rate": 0.00016496654581508663,
+      "loss": 0.2312,
+      "step": 32100
+    },
+    {
+      "epoch": 0.8916654243904468,
+      "grad_norm": 0.11641304939985275,
+      "learning_rate": 0.00016485265982486591,
+      "loss": 0.2271,
+      "step": 32150
+    },
+    {
+      "epoch": 0.8930521513335113,
+      "grad_norm": 0.12015505880117416,
+      "learning_rate": 0.00016473862847818277,
+      "loss": 0.2308,
+      "step": 32200
+    },
+    {
+      "epoch": 0.8944388782765758,
+      "grad_norm": 0.17053671181201935,
+      "learning_rate": 0.00016462445203061957,
+      "loss": 0.2324,
+      "step": 32250
+    },
+    {
+      "epoch": 0.8958256052196402,
+      "grad_norm": 0.12947635352611542,
+      "learning_rate": 0.0001645101307380839,
+      "loss": 0.2318,
+      "step": 32300
+    },
+    {
+      "epoch": 0.8972123321627047,
+      "grad_norm": 0.11198735982179642,
+      "learning_rate": 0.00016439566485680783,
+      "loss": 0.23,
+      "step": 32350
+    },
+    {
+      "epoch": 0.8985990591057691,
+      "grad_norm": 0.1204909086227417,
+      "learning_rate": 0.00016428105464334772,
+      "loss": 0.23,
+      "step": 32400
+    },
+    {
+      "epoch": 0.8999857860488336,
+      "grad_norm": 0.11191330850124359,
+      "learning_rate": 0.00016416630035458326,
+      "loss": 0.2295,
+      "step": 32450
+    },
+    {
+      "epoch": 0.901372512991898,
+      "grad_norm": 0.10705868154764175,
+      "learning_rate": 0.00016405140224771717,
+      "loss": 0.2246,
+      "step": 32500
+    },
+    {
+      "epoch": 0.9027592399349625,
+      "grad_norm": 0.11882634460926056,
+      "learning_rate": 0.0001639363605802744,
+      "loss": 0.2345,
+      "step": 32550
+    },
+    {
+      "epoch": 0.904145966878027,
+      "grad_norm": 0.1181696355342865,
+      "learning_rate": 0.0001638211756101018,
+      "loss": 0.2306,
+      "step": 32600
+    },
+    {
+      "epoch": 0.9055326938210915,
+      "grad_norm": 0.1270473152399063,
+      "learning_rate": 0.00016370584759536734,
+      "loss": 0.2297,
+      "step": 32650
+    },
+    {
+      "epoch": 0.9069194207641559,
+      "grad_norm": 0.11503591388463974,
+      "learning_rate": 0.00016359037679455955,
+      "loss": 0.2292,
+      "step": 32700
+    },
+    {
+      "epoch": 0.9083061477072203,
+      "grad_norm": 0.11596430093050003,
+      "learning_rate": 0.0001634747634664871,
+      "loss": 0.2324,
+      "step": 32750
+    },
+    {
+      "epoch": 0.9096928746502848,
+      "grad_norm": 0.16631336510181427,
+      "learning_rate": 0.00016335900787027802,
+      "loss": 0.23,
+      "step": 32800
+    },
+    {
+      "epoch": 0.9110796015933492,
+      "grad_norm": 0.12083205580711365,
+      "learning_rate": 0.0001632431102653793,
+      "loss": 0.2295,
+      "step": 32850
+    },
+    {
+      "epoch": 0.9124663285364137,
+      "grad_norm": 0.1268964558839798,
+      "learning_rate": 0.00016312707091155609,
+      "loss": 0.2299,
+      "step": 32900
+    },
+    {
+      "epoch": 0.9138530554794781,
+      "grad_norm": 0.1737286001443863,
+      "learning_rate": 0.00016301089006889137,
+      "loss": 0.2291,
+      "step": 32950
+    },
+    {
+      "epoch": 0.9152397824225427,
+      "grad_norm": 0.12454930692911148,
+      "learning_rate": 0.00016289456799778522,
+      "loss": 0.2289,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9152397824225427,
+      "eval_loss": 0.22642949223518372,
+      "eval_runtime": 500.8866,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9166265093656071,
+      "grad_norm": 0.12109609693288803,
+      "learning_rate": 0.00016277810495895419,
+      "loss": 0.2289,
+      "step": 33050
+    },
+    {
+      "epoch": 0.9180132363086716,
+      "grad_norm": 0.16857489943504333,
+      "learning_rate": 0.00016266150121343085,
+      "loss": 0.2265,
+      "step": 33100
+    },
+    {
+      "epoch": 0.919399963251736,
+      "grad_norm": 0.13193485140800476,
+      "learning_rate": 0.00016254475702256308,
+      "loss": 0.2277,
+      "step": 33150
+    },
+    {
+      "epoch": 0.9207866901948004,
+      "grad_norm": 0.13189518451690674,
+      "learning_rate": 0.0001624278726480137,
+      "loss": 0.2346,
+      "step": 33200
+    },
+    {
+      "epoch": 0.9221734171378649,
+      "grad_norm": 0.16021443903446198,
+      "learning_rate": 0.00016231084835175948,
+      "loss": 0.2273,
+      "step": 33250
+    },
+    {
+      "epoch": 0.9235601440809293,
+      "grad_norm": 0.14241939783096313,
+      "learning_rate": 0.00016219368439609103,
+      "loss": 0.236,
+      "step": 33300
+    },
+    {
+      "epoch": 0.9249468710239939,
+      "grad_norm": 0.18355390429496765,
+      "learning_rate": 0.0001620763810436119,
+      "loss": 0.2281,
+      "step": 33350
+    },
+    {
+      "epoch": 0.9263335979670583,
+      "grad_norm": 0.1321648508310318,
+      "learning_rate": 0.0001619612887687756,
+      "loss": 0.241,
+      "step": 33400
+    },
+    {
+      "epoch": 0.9277203249101228,
+      "grad_norm": 0.16118654608726501,
+      "learning_rate": 0.00016184371018656649,
+      "loss": 0.233,
+      "step": 33450
+    },
+    {
+      "epoch": 0.9291070518531872,
+      "grad_norm": 0.11974034458398819,
+      "learning_rate": 0.00016172599299195568,
+      "loss": 0.219,
+      "step": 33500
+    },
+    {
+      "epoch": 0.9304937787962517,
+      "grad_norm": 0.14652998745441437,
+      "learning_rate": 0.00016160813744878674,
+      "loss": 0.2316,
+      "step": 33550
+    },
+    {
+      "epoch": 0.9318805057393161,
+      "grad_norm": 0.09738484770059586,
+      "learning_rate": 0.0001614901438212133,
+      "loss": 0.2351,
+      "step": 33600
+    },
+    {
+      "epoch": 0.9332672326823805,
+      "grad_norm": 0.15131749212741852,
+      "learning_rate": 0.00016137201237369846,
+      "loss": 0.2281,
+      "step": 33650
+    },
+    {
+      "epoch": 0.9346539596254451,
+      "grad_norm": 0.16536715626716614,
+      "learning_rate": 0.00016125374337101422,
+      "loss": 0.2317,
+      "step": 33700
+    },
+    {
+      "epoch": 0.9360406865685095,
+      "grad_norm": 0.15788187086582184,
+      "learning_rate": 0.0001611353370782409,
+      "loss": 0.2261,
+      "step": 33750
+    },
+    {
+      "epoch": 0.937427413511574,
+      "grad_norm": 0.11554282158613205,
+      "learning_rate": 0.00016101679376076655,
+      "loss": 0.2288,
+      "step": 33800
+    },
+    {
+      "epoch": 0.9388141404546384,
+      "grad_norm": 0.1376064121723175,
+      "learning_rate": 0.00016089811368428633,
+      "loss": 0.2287,
+      "step": 33850
+    },
+    {
+      "epoch": 0.9402008673977029,
+      "grad_norm": 0.1270899623632431,
+      "learning_rate": 0.0001607792971148019,
+      "loss": 0.2232,
+      "step": 33900
+    },
+    {
+      "epoch": 0.9415875943407673,
+      "grad_norm": 0.1187126636505127,
+      "learning_rate": 0.00016066034431862084,
+      "loss": 0.2321,
+      "step": 33950
+    },
+    {
+      "epoch": 0.9429743212838319,
+      "grad_norm": 0.14895334839820862,
+      "learning_rate": 0.00016054125556235613,
+      "loss": 0.2306,
+      "step": 34000
+    },
+    {
+      "epoch": 0.9429743212838319,
+      "eval_loss": 0.22613388299942017,
+      "eval_runtime": 500.7207,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 34000
+    },
+    {
+      "epoch": 0.9443610482268963,
+      "grad_norm": 0.12219640612602234,
+      "learning_rate": 0.00016042203111292538,
+      "loss": 0.2315,
+      "step": 34050
+    },
+    {
+      "epoch": 0.9457477751699607,
+      "grad_norm": 0.1677113175392151,
+      "learning_rate": 0.00016030267123755038,
+      "loss": 0.2327,
+      "step": 34100
+    },
+    {
+      "epoch": 0.9471345021130252,
+      "grad_norm": 0.12030269205570221,
+      "learning_rate": 0.00016018317620375652,
+      "loss": 0.2282,
+      "step": 34150
+    },
+    {
+      "epoch": 0.9485212290560896,
+      "grad_norm": 0.13181360065937042,
+      "learning_rate": 0.00016006354627937203,
+      "loss": 0.2287,
+      "step": 34200
+    },
+    {
+      "epoch": 0.9499079559991541,
+      "grad_norm": 0.13087068498134613,
+      "learning_rate": 0.00015994378173252752,
+      "loss": 0.2282,
+      "step": 34250
+    },
+    {
+      "epoch": 0.9512946829422185,
+      "grad_norm": 0.14467494189739227,
+      "learning_rate": 0.0001598238828316553,
+      "loss": 0.2254,
+      "step": 34300
+    },
+    {
+      "epoch": 0.952681409885283,
+      "grad_norm": 0.14921946823596954,
+      "learning_rate": 0.00015970384984548885,
+      "loss": 0.2324,
+      "step": 34350
+    },
+    {
+      "epoch": 0.9540681368283475,
+      "grad_norm": 0.19342415034770966,
+      "learning_rate": 0.0001595836830430622,
+      "loss": 0.2342,
+      "step": 34400
+    },
+    {
+      "epoch": 0.955454863771412,
+      "grad_norm": 0.12381652742624283,
+      "learning_rate": 0.00015946338269370923,
+      "loss": 0.2262,
+      "step": 34450
+    },
+    {
+      "epoch": 0.9568415907144764,
+      "grad_norm": 0.1456434279680252,
+      "learning_rate": 0.00015934294906706315,
+      "loss": 0.2277,
+      "step": 34500
+    },
+    {
+      "epoch": 0.9582283176575408,
+      "grad_norm": 0.11485321074724197,
+      "learning_rate": 0.000159222382433056,
+      "loss": 0.2355,
+      "step": 34550
+    },
+    {
+      "epoch": 0.9596150446006053,
+      "grad_norm": 0.10027427971363068,
+      "learning_rate": 0.00015910168306191785,
+      "loss": 0.2269,
+      "step": 34600
+    },
+    {
+      "epoch": 0.9610017715436697,
+      "grad_norm": 0.16801820695400238,
+      "learning_rate": 0.0001589808512241763,
+      "loss": 0.2282,
+      "step": 34650
+    },
+    {
+      "epoch": 0.9623884984867342,
+      "grad_norm": 0.11840588599443436,
+      "learning_rate": 0.00015885988719065573,
+      "loss": 0.2304,
+      "step": 34700
+    },
+    {
+      "epoch": 0.9637752254297987,
+      "grad_norm": 0.16810324788093567,
+      "learning_rate": 0.00015873879123247706,
+      "loss": 0.231,
+      "step": 34750
+    },
+    {
+      "epoch": 0.9651619523728632,
+      "grad_norm": 0.1277480274438858,
+      "learning_rate": 0.0001586175636210567,
+      "loss": 0.2292,
+      "step": 34800
+    },
+    {
+      "epoch": 0.9665486793159276,
+      "grad_norm": 0.13225620985031128,
+      "learning_rate": 0.0001584962046281062,
+      "loss": 0.2255,
+      "step": 34850
+    },
+    {
+      "epoch": 0.9679354062589921,
+      "grad_norm": 0.14994849264621735,
+      "learning_rate": 0.00015837471452563159,
+      "loss": 0.2306,
+      "step": 34900
+    },
+    {
+      "epoch": 0.9693221332020565,
+      "grad_norm": 0.11426250636577606,
+      "learning_rate": 0.00015825309358593272,
+      "loss": 0.2311,
+      "step": 34950
+    },
+    {
+      "epoch": 0.9707088601451209,
+      "grad_norm": 0.1453811228275299,
+      "learning_rate": 0.00015813134208160276,
+      "loss": 0.2276,
+      "step": 35000
+    },
+    {
+      "epoch": 0.9707088601451209,
+      "eval_loss": 0.22605940699577332,
+      "eval_runtime": 500.6317,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 35000
+    },
+    {
+      "epoch": 0.9720955870881854,
+      "grad_norm": 0.14036044478416443,
+      "learning_rate": 0.0001580094602855275,
+      "loss": 0.2241,
+      "step": 35050
+    },
+    {
+      "epoch": 0.9734823140312499,
+      "grad_norm": 0.1456310898065567,
+      "learning_rate": 0.00015788744847088464,
+      "loss": 0.2352,
+      "step": 35100
+    },
+    {
+      "epoch": 0.9748690409743144,
+      "grad_norm": 0.1325587034225464,
+      "learning_rate": 0.0001577653069111435,
+      "loss": 0.2267,
+      "step": 35150
+    },
+    {
+      "epoch": 0.9762557679173788,
+      "grad_norm": 0.13475272059440613,
+      "learning_rate": 0.000157643035880064,
+      "loss": 0.232,
+      "step": 35200
+    },
+    {
+      "epoch": 0.9776424948604433,
+      "grad_norm": 0.13557064533233643,
+      "learning_rate": 0.00015752063565169645,
+      "loss": 0.2342,
+      "step": 35250
+    },
+    {
+      "epoch": 0.9790292218035077,
+      "grad_norm": 0.149173304438591,
+      "learning_rate": 0.00015739810650038054,
+      "loss": 0.2284,
+      "step": 35300
+    },
+    {
+      "epoch": 0.9804159487465722,
+      "grad_norm": 0.11646503955125809,
+      "learning_rate": 0.00015727544870074503,
+      "loss": 0.2259,
+      "step": 35350
+    },
+    {
+      "epoch": 0.9818026756896366,
+      "grad_norm": 0.126033216714859,
+      "learning_rate": 0.000157152662527707,
+      "loss": 0.2289,
+      "step": 35400
+    },
+    {
+      "epoch": 0.983189402632701,
+      "grad_norm": 0.17162640392780304,
+      "learning_rate": 0.00015702974825647123,
+      "loss": 0.2293,
+      "step": 35450
+    },
+    {
+      "epoch": 0.9845761295757656,
+      "grad_norm": 0.12047728151082993,
+      "learning_rate": 0.0001569067061625297,
+      "loss": 0.2265,
+      "step": 35500
+    },
+    {
+      "epoch": 0.98596285651883,
+      "grad_norm": 0.1183520033955574,
+      "learning_rate": 0.00015678353652166078,
+      "loss": 0.2272,
+      "step": 35550
+    },
+    {
+      "epoch": 0.9873495834618945,
+      "grad_norm": 0.13919849693775177,
+      "learning_rate": 0.00015666023960992878,
+      "loss": 0.2295,
+      "step": 35600
+    },
+    {
+      "epoch": 0.9887363104049589,
+      "grad_norm": 0.14626280963420868,
+      "learning_rate": 0.00015653681570368318,
+      "loss": 0.2293,
+      "step": 35650
+    },
+    {
+      "epoch": 0.9901230373480234,
+      "grad_norm": 0.11618024855852127,
+      "learning_rate": 0.00015641326507955823,
+      "loss": 0.2264,
+      "step": 35700
+    },
+    {
+      "epoch": 0.9915097642910878,
+      "grad_norm": 0.12280390411615372,
+      "learning_rate": 0.0001562895880144721,
+      "loss": 0.233,
+      "step": 35750
+    },
+    {
+      "epoch": 0.9928964912341524,
+      "grad_norm": 0.11896737664937973,
+      "learning_rate": 0.0001561657847856264,
+      "loss": 0.2276,
+      "step": 35800
+    },
+    {
+      "epoch": 0.9942832181772168,
+      "grad_norm": 0.1226055920124054,
+      "learning_rate": 0.0001560418556705055,
+      "loss": 0.2364,
+      "step": 35850
+    },
+    {
+      "epoch": 0.9956699451202812,
+      "grad_norm": 0.1566486656665802,
+      "learning_rate": 0.00015591780094687587,
+      "loss": 0.2315,
+      "step": 35900
+    },
+    {
+      "epoch": 0.9970566720633457,
+      "grad_norm": 0.12156879901885986,
+      "learning_rate": 0.0001557936208927856,
+      "loss": 0.2284,
+      "step": 35950
+    },
+    {
+      "epoch": 0.9984433990064101,
+      "grad_norm": 0.12765392661094666,
+      "learning_rate": 0.00015566931578656366,
+      "loss": 0.2319,
+      "step": 36000
+    },
+    {
+      "epoch": 0.9984433990064101,
+      "eval_loss": 0.22568126022815704,
+      "eval_runtime": 500.5568,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 36000
+    },
+    {
+      "epoch": 0.9998301259494746,
+      "grad_norm": 0.11263388395309448,
+      "learning_rate": 0.00015554488590681934,
+      "loss": 0.2249,
+      "step": 36050
+    },
+    {
+      "epoch": 1.0012168528925391,
+      "grad_norm": 0.12134028226137161,
+      "learning_rate": 0.00015542033153244142,
+      "loss": 0.2296,
+      "step": 36100
+    },
+    {
+      "epoch": 1.0026035798356034,
+      "grad_norm": 0.12478175759315491,
+      "learning_rate": 0.00015529565294259795,
+      "loss": 0.2295,
+      "step": 36150
+    },
+    {
+      "epoch": 1.003990306778668,
+      "grad_norm": 0.1091291755437851,
+      "learning_rate": 0.0001551708504167352,
+      "loss": 0.2285,
+      "step": 36200
+    },
+    {
+      "epoch": 1.0053770337217325,
+      "grad_norm": 0.11158731579780579,
+      "learning_rate": 0.00015504592423457733,
+      "loss": 0.2267,
+      "step": 36250
+    },
+    {
+      "epoch": 1.006763760664797,
+      "grad_norm": 0.17226600646972656,
+      "learning_rate": 0.00015492087467612562,
+      "loss": 0.2369,
+      "step": 36300
+    },
+    {
+      "epoch": 1.0081504876078613,
+      "grad_norm": 0.10548936575651169,
+      "learning_rate": 0.00015479570202165784,
+      "loss": 0.2257,
+      "step": 36350
+    },
+    {
+      "epoch": 1.0095372145509258,
+      "grad_norm": 0.12710842490196228,
+      "learning_rate": 0.0001546704065517278,
+      "loss": 0.2283,
+      "step": 36400
+    },
+    {
+      "epoch": 1.0109239414939903,
+      "grad_norm": 0.13734006881713867,
+      "learning_rate": 0.0001545449885471644,
+      "loss": 0.2266,
+      "step": 36450
+    },
+    {
+      "epoch": 1.0123106684370546,
+      "grad_norm": 0.14669275283813477,
+      "learning_rate": 0.00015441944828907124,
+      "loss": 0.2265,
+      "step": 36500
+    },
+    {
+      "epoch": 1.0136973953801192,
+      "grad_norm": 0.10941125452518463,
+      "learning_rate": 0.000154293786058826,
+      "loss": 0.231,
+      "step": 36550
+    },
+    {
+      "epoch": 1.0150841223231837,
+      "grad_norm": 0.12528035044670105,
+      "learning_rate": 0.00015416800213807972,
+      "loss": 0.2286,
+      "step": 36600
+    },
+    {
+      "epoch": 1.0164708492662482,
+      "grad_norm": 0.1242556944489479,
+      "learning_rate": 0.00015404209680875607,
+      "loss": 0.2277,
+      "step": 36650
+    },
+    {
+      "epoch": 1.0178575762093125,
+      "grad_norm": 0.09937360137701035,
+      "learning_rate": 0.000153916070353051,
+      "loss": 0.2247,
+      "step": 36700
+    },
+    {
+      "epoch": 1.019244303152377,
+      "grad_norm": 0.11109854280948639,
+      "learning_rate": 0.00015378992305343183,
+      "loss": 0.2248,
+      "step": 36750
+    },
+    {
+      "epoch": 1.0206310300954415,
+      "grad_norm": 0.14019356667995453,
+      "learning_rate": 0.00015366365519263683,
+      "loss": 0.2252,
+      "step": 36800
+    },
+    {
+      "epoch": 1.0220177570385058,
+      "grad_norm": 0.11496023088693619,
+      "learning_rate": 0.00015353979599334788,
+      "loss": 0.2228,
+      "step": 36850
+    },
+    {
+      "epoch": 1.0234044839815704,
+      "grad_norm": 0.15292219817638397,
+      "learning_rate": 0.0001534132902566159,
+      "loss": 0.2307,
+      "step": 36900
+    },
+    {
+      "epoch": 1.0247912109246349,
+      "grad_norm": 0.12410300970077515,
+      "learning_rate": 0.00015328666480286793,
+      "loss": 0.2263,
+      "step": 36950
+    },
+    {
+      "epoch": 1.0261779378676994,
+      "grad_norm": 0.14905387163162231,
+      "learning_rate": 0.00015315991991591386,
+      "loss": 0.2228,
+      "step": 37000
+    },
+    {
+      "epoch": 1.0261779378676994,
+      "eval_loss": 0.22574713826179504,
+      "eval_runtime": 500.6484,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 37000
+    },
+    {
+      "epoch": 1.0275646648107637,
+      "grad_norm": 0.12682612240314484,
+      "learning_rate": 0.0001530330558798313,
+      "loss": 0.2257,
+      "step": 37050
+    },
+    {
+      "epoch": 1.0289513917538282,
+      "grad_norm": 0.15558844804763794,
+      "learning_rate": 0.00015290607297896482,
+      "loss": 0.2259,
+      "step": 37100
+    },
+    {
+      "epoch": 1.0303381186968927,
+      "grad_norm": 0.16526414453983307,
+      "learning_rate": 0.00015277897149792562,
+      "loss": 0.2301,
+      "step": 37150
+    },
+    {
+      "epoch": 1.0317248456399573,
+      "grad_norm": 0.1130262240767479,
+      "learning_rate": 0.0001526517517215905,
+      "loss": 0.2244,
+      "step": 37200
+    },
+    {
+      "epoch": 1.0331115725830216,
+      "grad_norm": 0.12639841437339783,
+      "learning_rate": 0.00015252441393510146,
+      "loss": 0.2269,
+      "step": 37250
+    },
+    {
+      "epoch": 1.034498299526086,
+      "grad_norm": 0.12753638625144958,
+      "learning_rate": 0.000152396958423865,
+      "loss": 0.2277,
+      "step": 37300
+    },
+    {
+      "epoch": 1.0358850264691506,
+      "grad_norm": 0.1574636995792389,
+      "learning_rate": 0.00015226938547355145,
+      "loss": 0.2302,
+      "step": 37350
+    },
+    {
+      "epoch": 1.037271753412215,
+      "grad_norm": 0.1075245812535286,
+      "learning_rate": 0.0001521416953700944,
+      "loss": 0.2318,
+      "step": 37400
+    },
+    {
+      "epoch": 1.0386584803552794,
+      "grad_norm": 0.15765556693077087,
+      "learning_rate": 0.00015201388839969005,
+      "loss": 0.2271,
+      "step": 37450
+    },
+    {
+      "epoch": 1.040045207298344,
+      "grad_norm": 0.14305494725704193,
+      "learning_rate": 0.00015188596484879636,
+      "loss": 0.2268,
+      "step": 37500
+    },
+    {
+      "epoch": 1.0414319342414085,
+      "grad_norm": 0.14217057824134827,
+      "learning_rate": 0.0001517579250041328,
+      "loss": 0.2302,
+      "step": 37550
+    },
+    {
+      "epoch": 1.0428186611844728,
+      "grad_norm": 0.12122397124767303,
+      "learning_rate": 0.00015162976915267948,
+      "loss": 0.2264,
+      "step": 37600
+    },
+    {
+      "epoch": 1.0442053881275373,
+      "grad_norm": 0.1215621680021286,
+      "learning_rate": 0.00015150149758167634,
+      "loss": 0.2239,
+      "step": 37650
+    },
+    {
+      "epoch": 1.0455921150706018,
+      "grad_norm": 0.1759423315525055,
+      "learning_rate": 0.00015137311057862279,
+      "loss": 0.2244,
+      "step": 37700
+    },
+    {
+      "epoch": 1.046978842013666,
+      "grad_norm": 0.11546457558870316,
+      "learning_rate": 0.00015124460843127704,
+      "loss": 0.226,
+      "step": 37750
+    },
+    {
+      "epoch": 1.0483655689567306,
+      "grad_norm": 0.16507115960121155,
+      "learning_rate": 0.00015111599142765526,
+      "loss": 0.2267,
+      "step": 37800
+    },
+    {
+      "epoch": 1.0497522958997951,
+      "grad_norm": 0.15918377041816711,
+      "learning_rate": 0.0001509872598560311,
+      "loss": 0.2265,
+      "step": 37850
+    },
+    {
+      "epoch": 1.0511390228428596,
+      "grad_norm": 0.12590187788009644,
+      "learning_rate": 0.000150858414004935,
+      "loss": 0.2285,
+      "step": 37900
+    },
+    {
+      "epoch": 1.052525749785924,
+      "grad_norm": 0.11883638054132462,
+      "learning_rate": 0.0001507294541631535,
+      "loss": 0.2233,
+      "step": 37950
+    },
+    {
+      "epoch": 1.0539124767289885,
+      "grad_norm": 0.11353275179862976,
+      "learning_rate": 0.00015060038061972874,
+      "loss": 0.2238,
+      "step": 38000
+    },
+    {
+      "epoch": 1.0539124767289885,
+      "eval_loss": 0.22568707168102264,
+      "eval_runtime": 500.8783,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 38000
+    },
+    {
+      "epoch": 1.055299203672053,
+      "grad_norm": 0.1161685511469841,
+      "learning_rate": 0.00015047119366395757,
+      "loss": 0.2292,
+      "step": 38050
+    },
+    {
+      "epoch": 1.0566859306151175,
+      "grad_norm": 0.13814447820186615,
+      "learning_rate": 0.00015034189358539103,
+      "loss": 0.2251,
+      "step": 38100
+    },
+    {
+      "epoch": 1.0580726575581818,
+      "grad_norm": 0.15208768844604492,
+      "learning_rate": 0.00015021248067383387,
+      "loss": 0.2286,
+      "step": 38150
+    },
+    {
+      "epoch": 1.0594593845012463,
+      "grad_norm": 0.12832270562648773,
+      "learning_rate": 0.00015008295521934354,
+      "loss": 0.229,
+      "step": 38200
+    },
+    {
+      "epoch": 1.0608461114443108,
+      "grad_norm": 0.12442856281995773,
+      "learning_rate": 0.00014995331751222992,
+      "loss": 0.2286,
+      "step": 38250
+    },
+    {
+      "epoch": 1.0622328383873751,
+      "grad_norm": 0.14005307853221893,
+      "learning_rate": 0.00014982356784305428,
+      "loss": 0.2293,
+      "step": 38300
+    },
+    {
+      "epoch": 1.0636195653304397,
+      "grad_norm": 0.14418749511241913,
+      "learning_rate": 0.00014969370650262903,
+      "loss": 0.2328,
+      "step": 38350
+    },
+    {
+      "epoch": 1.0650062922735042,
+      "grad_norm": 0.11833231151103973,
+      "learning_rate": 0.00014956373378201677,
+      "loss": 0.2273,
+      "step": 38400
+    },
+    {
+      "epoch": 1.0663930192165687,
+      "grad_norm": 0.12782081961631775,
+      "learning_rate": 0.00014943364997252977,
+      "loss": 0.2224,
+      "step": 38450
+    },
+    {
+      "epoch": 1.067779746159633,
+      "grad_norm": 0.11903475224971771,
+      "learning_rate": 0.00014930345536572924,
+      "loss": 0.2256,
+      "step": 38500
+    },
+    {
+      "epoch": 1.0691664731026975,
+      "grad_norm": 0.17546679079532623,
+      "learning_rate": 0.00014917315025342483,
+      "loss": 0.2306,
+      "step": 38550
+    },
+    {
+      "epoch": 1.070553200045762,
+      "grad_norm": 0.16552455723285675,
+      "learning_rate": 0.0001490427349276737,
+      "loss": 0.2242,
+      "step": 38600
+    },
+    {
+      "epoch": 1.0719399269888266,
+      "grad_norm": 0.11756553500890732,
+      "learning_rate": 0.00014891220968078024,
+      "loss": 0.223,
+      "step": 38650
+    },
+    {
+      "epoch": 1.0733266539318909,
+      "grad_norm": 0.13542614877223969,
+      "learning_rate": 0.000148781574805295,
+      "loss": 0.2293,
+      "step": 38700
+    },
+    {
+      "epoch": 1.0747133808749554,
+      "grad_norm": 0.1370215266942978,
+      "learning_rate": 0.00014865083059401445,
+      "loss": 0.2291,
+      "step": 38750
+    },
+    {
+      "epoch": 1.07610010781802,
+      "grad_norm": 0.1472005844116211,
+      "learning_rate": 0.00014851997733997992,
+      "loss": 0.2272,
+      "step": 38800
+    },
+    {
+      "epoch": 1.0774868347610842,
+      "grad_norm": 0.1240694522857666,
+      "learning_rate": 0.00014838901533647733,
+      "loss": 0.2237,
+      "step": 38850
+    },
+    {
+      "epoch": 1.0788735617041487,
+      "grad_norm": 0.11901194602251053,
+      "learning_rate": 0.0001482579448770362,
+      "loss": 0.2285,
+      "step": 38900
+    },
+    {
+      "epoch": 1.0802602886472132,
+      "grad_norm": 0.2202654331922531,
+      "learning_rate": 0.0001481267662554292,
+      "loss": 0.2321,
+      "step": 38950
+    },
+    {
+      "epoch": 1.0816470155902778,
+      "grad_norm": 0.11475471407175064,
+      "learning_rate": 0.00014799547976567144,
+      "loss": 0.2296,
+      "step": 39000
+    },
+    {
+      "epoch": 1.0816470155902778,
+      "eval_loss": 0.2248746156692505,
+      "eval_runtime": 500.4656,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 39000
+    },
+    {
+      "epoch": 1.083033742533342,
+      "grad_norm": 0.1217503771185875,
+      "learning_rate": 0.00014786408570201975,
+      "loss": 0.2223,
+      "step": 39050
+    },
+    {
+      "epoch": 1.0844204694764066,
+      "grad_norm": 0.14427083730697632,
+      "learning_rate": 0.00014773258435897207,
+      "loss": 0.2279,
+      "step": 39100
+    },
+    {
+      "epoch": 1.085807196419471,
+      "grad_norm": 0.11865708976984024,
+      "learning_rate": 0.00014760097603126689,
+      "loss": 0.2295,
+      "step": 39150
+    },
+    {
+      "epoch": 1.0871939233625354,
+      "grad_norm": 0.14178717136383057,
+      "learning_rate": 0.0001474718963578798,
+      "loss": 0.2261,
+      "step": 39200
+    },
+    {
+      "epoch": 1.0885806503056,
+      "grad_norm": 0.15393276512622833,
+      "learning_rate": 0.0001473400770710278,
+      "loss": 0.2308,
+      "step": 39250
+    },
+    {
+      "epoch": 1.0899673772486644,
+      "grad_norm": 0.11602922528982162,
+      "learning_rate": 0.00014720815167925812,
+      "loss": 0.2283,
+      "step": 39300
+    },
+    {
+      "epoch": 1.091354104191729,
+      "grad_norm": 0.16645793616771698,
+      "learning_rate": 0.00014707612047825964,
+      "loss": 0.233,
+      "step": 39350
+    },
+    {
+      "epoch": 1.0927408311347933,
+      "grad_norm": 0.10213354974985123,
+      "learning_rate": 0.00014694398376395825,
+      "loss": 0.2277,
+      "step": 39400
+    },
+    {
+      "epoch": 1.0941275580778578,
+      "grad_norm": 0.11264722794294357,
+      "learning_rate": 0.0001468117418325166,
+      "loss": 0.2267,
+      "step": 39450
+    },
+    {
+      "epoch": 1.0955142850209223,
+      "grad_norm": 0.12596255540847778,
+      "learning_rate": 0.00014667939498033293,
+      "loss": 0.2226,
+      "step": 39500
+    },
+    {
+      "epoch": 1.0969010119639866,
+      "grad_norm": 0.10382383316755295,
+      "learning_rate": 0.0001465469435040407,
+      "loss": 0.2297,
+      "step": 39550
+    },
+    {
+      "epoch": 1.0982877389070511,
+      "grad_norm": 0.12972958385944366,
+      "learning_rate": 0.00014641438770050794,
+      "loss": 0.2256,
+      "step": 39600
+    },
+    {
+      "epoch": 1.0996744658501156,
+      "grad_norm": 0.13036096096038818,
+      "learning_rate": 0.00014628172786683641,
+      "loss": 0.2235,
+      "step": 39650
+    },
+    {
+      "epoch": 1.1010611927931802,
+      "grad_norm": 0.1233506128191948,
+      "learning_rate": 0.00014614896430036113,
+      "loss": 0.2243,
+      "step": 39700
+    },
+    {
+      "epoch": 1.1024479197362445,
+      "grad_norm": 0.11503315716981888,
+      "learning_rate": 0.00014601609729864956,
+      "loss": 0.2285,
+      "step": 39750
+    },
+    {
+      "epoch": 1.103834646679309,
+      "grad_norm": 0.12343501299619675,
+      "learning_rate": 0.000145883127159501,
+      "loss": 0.2272,
+      "step": 39800
+    },
+    {
+      "epoch": 1.1052213736223735,
+      "grad_norm": 0.1226864606142044,
+      "learning_rate": 0.00014575005418094594,
+      "loss": 0.2332,
+      "step": 39850
+    },
+    {
+      "epoch": 1.106608100565438,
+      "grad_norm": 0.1333167850971222,
+      "learning_rate": 0.00014561687866124535,
+      "loss": 0.2304,
+      "step": 39900
+    },
+    {
+      "epoch": 1.1079948275085023,
+      "grad_norm": 0.1088777631521225,
+      "learning_rate": 0.00014548360089889002,
+      "loss": 0.2296,
+      "step": 39950
+    },
+    {
+      "epoch": 1.1093815544515668,
+      "grad_norm": 0.11975093185901642,
+      "learning_rate": 0.00014535022119259994,
+      "loss": 0.2255,
+      "step": 40000
+    },
+    {
+      "epoch": 1.1093815544515668,
+      "eval_loss": 0.22516606748104095,
+      "eval_runtime": 500.4411,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 40000
+    },
+    {
+      "epoch": 1.1107682813946314,
+      "grad_norm": 0.19725576043128967,
+      "learning_rate": 0.0001452167398413235,
+      "loss": 0.2317,
+      "step": 40050
+    },
+    {
+      "epoch": 1.1121550083376956,
+      "grad_norm": 0.12385617196559906,
+      "learning_rate": 0.00014508315714423706,
+      "loss": 0.2269,
+      "step": 40100
+    },
+    {
+      "epoch": 1.1135417352807602,
+      "grad_norm": 0.12559738755226135,
+      "learning_rate": 0.000144949473400744,
+      "loss": 0.2295,
+      "step": 40150
+    },
+    {
+      "epoch": 1.1149284622238247,
+      "grad_norm": 0.1279434859752655,
+      "learning_rate": 0.0001448156889104742,
+      "loss": 0.2283,
+      "step": 40200
+    },
+    {
+      "epoch": 1.1163151891668892,
+      "grad_norm": 0.14756010472774506,
+      "learning_rate": 0.0001446818039732834,
+      "loss": 0.2267,
+      "step": 40250
+    },
+    {
+      "epoch": 1.1177019161099535,
+      "grad_norm": 0.11476084589958191,
+      "learning_rate": 0.00014454781888925238,
+      "loss": 0.2265,
+      "step": 40300
+    },
+    {
+      "epoch": 1.119088643053018,
+      "grad_norm": 0.12701088190078735,
+      "learning_rate": 0.00014441373395868653,
+      "loss": 0.2255,
+      "step": 40350
+    },
+    {
+      "epoch": 1.1204753699960825,
+      "grad_norm": 0.14300104975700378,
+      "learning_rate": 0.00014427954948211493,
+      "loss": 0.227,
+      "step": 40400
+    },
+    {
+      "epoch": 1.121862096939147,
+      "grad_norm": 0.11292553693056107,
+      "learning_rate": 0.00014414526576028973,
+      "loss": 0.2239,
+      "step": 40450
+    },
+    {
+      "epoch": 1.1232488238822114,
+      "grad_norm": 0.1404883861541748,
+      "learning_rate": 0.00014401088309418564,
+      "loss": 0.2234,
+      "step": 40500
+    },
+    {
+      "epoch": 1.1246355508252759,
+      "grad_norm": 0.15262041985988617,
+      "learning_rate": 0.00014387640178499905,
+      "loss": 0.2319,
+      "step": 40550
+    },
+    {
+      "epoch": 1.1260222777683404,
+      "grad_norm": 0.16456229984760284,
+      "learning_rate": 0.0001437418221341475,
+      "loss": 0.2264,
+      "step": 40600
+    },
+    {
+      "epoch": 1.1274090047114047,
+      "grad_norm": 0.12468329817056656,
+      "learning_rate": 0.0001436071444432689,
+      "loss": 0.2273,
+      "step": 40650
+    },
+    {
+      "epoch": 1.1287957316544692,
+      "grad_norm": 0.12449460476636887,
+      "learning_rate": 0.0001434723690142209,
+      "loss": 0.2333,
+      "step": 40700
+    },
+    {
+      "epoch": 1.1301824585975337,
+      "grad_norm": 0.12426210194826126,
+      "learning_rate": 0.0001433374961490803,
+      "loss": 0.2328,
+      "step": 40750
+    },
+    {
+      "epoch": 1.1315691855405983,
+      "grad_norm": 0.1501815766096115,
+      "learning_rate": 0.00014320252615014216,
+      "loss": 0.2214,
+      "step": 40800
+    },
+    {
+      "epoch": 1.1329559124836626,
+      "grad_norm": 0.15881818532943726,
+      "learning_rate": 0.00014306745931991932,
+      "loss": 0.2292,
+      "step": 40850
+    },
+    {
+      "epoch": 1.134342639426727,
+      "grad_norm": 0.12299991399049759,
+      "learning_rate": 0.00014293229596114163,
+      "loss": 0.2238,
+      "step": 40900
+    },
+    {
+      "epoch": 1.1357293663697916,
+      "grad_norm": 0.14259304106235504,
+      "learning_rate": 0.0001427970363767553,
+      "loss": 0.2291,
+      "step": 40950
+    },
+    {
+      "epoch": 1.137116093312856,
+      "grad_norm": 0.12536148726940155,
+      "learning_rate": 0.00014266168086992225,
+      "loss": 0.2252,
+      "step": 41000
+    },
+    {
+      "epoch": 1.137116093312856,
+      "eval_loss": 0.2245665341615677,
+      "eval_runtime": 501.2828,
+      "eval_samples_per_second": 5.699,
+      "eval_steps_per_second": 5.699,
+      "step": 41000
+    },
+    {
+      "epoch": 1.1385028202559204,
+      "grad_norm": 0.12410587817430496,
+      "learning_rate": 0.00014252622974401932,
+      "loss": 0.2268,
+      "step": 41050
+    },
+    {
+      "epoch": 1.139889547198985,
+      "grad_norm": 0.12877434492111206,
+      "learning_rate": 0.00014239068330263775,
+      "loss": 0.2258,
+      "step": 41100
+    },
+    {
+      "epoch": 1.1412762741420495,
+      "grad_norm": 0.1299249529838562,
+      "learning_rate": 0.00014225504184958232,
+      "loss": 0.2301,
+      "step": 41150
+    },
+    {
+      "epoch": 1.1426630010851138,
+      "grad_norm": 0.15234452486038208,
+      "learning_rate": 0.00014211930568887088,
+      "loss": 0.2192,
+      "step": 41200
+    },
+    {
+      "epoch": 1.1440497280281783,
+      "grad_norm": 0.12678442895412445,
+      "learning_rate": 0.00014198347512473343,
+      "loss": 0.2311,
+      "step": 41250
+    },
+    {
+      "epoch": 1.1454364549712428,
+      "grad_norm": 0.12326008826494217,
+      "learning_rate": 0.0001418475504616116,
+      "loss": 0.2318,
+      "step": 41300
+    },
+    {
+      "epoch": 1.146823181914307,
+      "grad_norm": 0.11192907392978668,
+      "learning_rate": 0.00014171153200415797,
+      "loss": 0.2232,
+      "step": 41350
+    },
+    {
+      "epoch": 1.1482099088573716,
+      "grad_norm": 0.11843819916248322,
+      "learning_rate": 0.00014157542005723532,
+      "loss": 0.2277,
+      "step": 41400
+    },
+    {
+      "epoch": 1.1495966358004361,
+      "grad_norm": 0.12903502583503723,
+      "learning_rate": 0.0001414419399397752,
+      "loss": 0.2237,
+      "step": 41450
+    },
+    {
+      "epoch": 1.1509833627435007,
+      "grad_norm": 0.13532768189907074,
+      "learning_rate": 0.00014130564378392948,
+      "loss": 0.2291,
+      "step": 41500
+    },
+    {
+      "epoch": 1.152370089686565,
+      "grad_norm": 0.11242423951625824,
+      "learning_rate": 0.00014116925504834574,
+      "loss": 0.2263,
+      "step": 41550
+    },
+    {
+      "epoch": 1.1537568166296295,
+      "grad_norm": 0.14420267939567566,
+      "learning_rate": 0.00014103277403871667,
+      "loss": 0.231,
+      "step": 41600
+    },
+    {
+      "epoch": 1.155143543572694,
+      "grad_norm": 0.11390483379364014,
+      "learning_rate": 0.00014089620106094174,
+      "loss": 0.2281,
+      "step": 41650
+    },
+    {
+      "epoch": 1.1565302705157583,
+      "grad_norm": 0.10996092855930328,
+      "learning_rate": 0.0001407595364211267,
+      "loss": 0.223,
+      "step": 41700
+    },
+    {
+      "epoch": 1.1579169974588228,
+      "grad_norm": 0.1297358274459839,
+      "learning_rate": 0.00014062278042558253,
+      "loss": 0.2251,
+      "step": 41750
+    },
+    {
+      "epoch": 1.1593037244018873,
+      "grad_norm": 0.13994191586971283,
+      "learning_rate": 0.00014048593338082508,
+      "loss": 0.2261,
+      "step": 41800
+    },
+    {
+      "epoch": 1.1606904513449519,
+      "grad_norm": 0.15100865066051483,
+      "learning_rate": 0.00014034899559357432,
+      "loss": 0.2257,
+      "step": 41850
+    },
+    {
+      "epoch": 1.1620771782880164,
+      "grad_norm": 0.1151217371225357,
+      "learning_rate": 0.0001402119673707535,
+      "loss": 0.2278,
+      "step": 41900
+    },
+    {
+      "epoch": 1.1634639052310807,
+      "grad_norm": 0.1580880582332611,
+      "learning_rate": 0.00014007484901948865,
+      "loss": 0.2247,
+      "step": 41950
+    },
+    {
+      "epoch": 1.1648506321741452,
+      "grad_norm": 0.1323232203722,
+      "learning_rate": 0.00013993764084710777,
+      "loss": 0.2229,
+      "step": 42000
+    },
+    {
+      "epoch": 1.1648506321741452,
+      "eval_loss": 0.22439424693584442,
+      "eval_runtime": 501.4893,
+      "eval_samples_per_second": 5.697,
+      "eval_steps_per_second": 5.697,
+      "step": 42000
+    },
+    {
+      "epoch": 1.1662373591172097,
+      "grad_norm": 0.11002755165100098,
+      "learning_rate": 0.00013980034316114014,
+      "loss": 0.2287,
+      "step": 42050
+    },
+    {
+      "epoch": 1.167624086060274,
+      "grad_norm": 0.16875265538692474,
+      "learning_rate": 0.00013966295626931575,
+      "loss": 0.2268,
+      "step": 42100
+    },
+    {
+      "epoch": 1.1690108130033385,
+      "grad_norm": 0.1291196197271347,
+      "learning_rate": 0.0001395254804795645,
+      "loss": 0.2267,
+      "step": 42150
+    },
+    {
+      "epoch": 1.170397539946403,
+      "grad_norm": 0.12030452489852905,
+      "learning_rate": 0.0001393879161000155,
+      "loss": 0.2284,
+      "step": 42200
+    },
+    {
+      "epoch": 1.1717842668894676,
+      "grad_norm": 0.1254565715789795,
+      "learning_rate": 0.00013925026343899644,
+      "loss": 0.2325,
+      "step": 42250
+    },
+    {
+      "epoch": 1.1731709938325319,
+      "grad_norm": 0.10753902792930603,
+      "learning_rate": 0.000139112522805033,
+      "loss": 0.2265,
+      "step": 42300
+    },
+    {
+      "epoch": 1.1745577207755964,
+      "grad_norm": 0.14079649746418,
+      "learning_rate": 0.00013897469450684783,
+      "loss": 0.2279,
+      "step": 42350
+    },
+    {
+      "epoch": 1.175944447718661,
+      "grad_norm": 0.13644090294837952,
+      "learning_rate": 0.00013883677885336013,
+      "loss": 0.2264,
+      "step": 42400
+    },
+    {
+      "epoch": 1.1773311746617252,
+      "grad_norm": 0.15901681780815125,
+      "learning_rate": 0.000138698776153685,
+      "loss": 0.2274,
+      "step": 42450
+    },
+    {
+      "epoch": 1.1787179016047897,
+      "grad_norm": 0.14739197492599487,
+      "learning_rate": 0.00013856068671713254,
+      "loss": 0.2223,
+      "step": 42500
+    },
+    {
+      "epoch": 1.1801046285478543,
+      "grad_norm": 0.1077587679028511,
+      "learning_rate": 0.00013842251085320728,
+      "loss": 0.2257,
+      "step": 42550
+    },
+    {
+      "epoch": 1.1814913554909188,
+      "grad_norm": 0.12596414983272552,
+      "learning_rate": 0.00013828424887160745,
+      "loss": 0.2251,
+      "step": 42600
+    },
+    {
+      "epoch": 1.182878082433983,
+      "grad_norm": 0.11234478652477264,
+      "learning_rate": 0.0001381459010822243,
+      "loss": 0.2225,
+      "step": 42650
+    },
+    {
+      "epoch": 1.1842648093770476,
+      "grad_norm": 0.11206696927547455,
+      "learning_rate": 0.00013800746779514143,
+      "loss": 0.2266,
+      "step": 42700
+    },
+    {
+      "epoch": 1.185651536320112,
+      "grad_norm": 0.10260911285877228,
+      "learning_rate": 0.0001378689493206341,
+      "loss": 0.2241,
+      "step": 42750
+    },
+    {
+      "epoch": 1.1870382632631764,
+      "grad_norm": 0.12874187529087067,
+      "learning_rate": 0.0001377303459691684,
+      "loss": 0.2277,
+      "step": 42800
+    },
+    {
+      "epoch": 1.188424990206241,
+      "grad_norm": 0.1351606696844101,
+      "learning_rate": 0.0001375916580514007,
+      "loss": 0.2268,
+      "step": 42850
+    },
+    {
+      "epoch": 1.1898117171493054,
+      "grad_norm": 0.1250632107257843,
+      "learning_rate": 0.000137452885878177,
+      "loss": 0.2265,
+      "step": 42900
+    },
+    {
+      "epoch": 1.19119844409237,
+      "grad_norm": 0.12516459822654724,
+      "learning_rate": 0.00013731402976053202,
+      "loss": 0.2256,
+      "step": 42950
+    },
+    {
+      "epoch": 1.1925851710354343,
+      "grad_norm": 0.12791725993156433,
+      "learning_rate": 0.00013717509000968865,
+      "loss": 0.2252,
+      "step": 43000
+    },
+    {
+      "epoch": 1.1925851710354343,
+      "eval_loss": 0.22418725490570068,
+      "eval_runtime": 501.0375,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 43000
+    },
+    {
+      "epoch": 1.1939718979784988,
+      "grad_norm": 0.152371346950531,
+      "learning_rate": 0.00013703606693705732,
+      "loss": 0.2308,
+      "step": 43050
+    },
+    {
+      "epoch": 1.1953586249215633,
+      "grad_norm": 0.14723214507102966,
+      "learning_rate": 0.0001368969608542351,
+      "loss": 0.2258,
+      "step": 43100
+    },
+    {
+      "epoch": 1.1967453518646276,
+      "grad_norm": 0.1414303481578827,
+      "learning_rate": 0.00013675777207300524,
+      "loss": 0.2278,
+      "step": 43150
+    },
+    {
+      "epoch": 1.1981320788076921,
+      "grad_norm": 0.15416811406612396,
+      "learning_rate": 0.00013661850090533617,
+      "loss": 0.2324,
+      "step": 43200
+    },
+    {
+      "epoch": 1.1995188057507566,
+      "grad_norm": 0.11736203730106354,
+      "learning_rate": 0.00013647914766338112,
+      "loss": 0.2292,
+      "step": 43250
+    },
+    {
+      "epoch": 1.2009055326938212,
+      "grad_norm": 0.1547485738992691,
+      "learning_rate": 0.00013633971265947722,
+      "loss": 0.2281,
+      "step": 43300
+    },
+    {
+      "epoch": 1.2022922596368855,
+      "grad_norm": 0.15800827741622925,
+      "learning_rate": 0.0001362001962061449,
+      "loss": 0.2296,
+      "step": 43350
+    },
+    {
+      "epoch": 1.20367898657995,
+      "grad_norm": 0.15381957590579987,
+      "learning_rate": 0.0001360605986160871,
+      "loss": 0.2291,
+      "step": 43400
+    },
+    {
+      "epoch": 1.2050657135230145,
+      "grad_norm": 0.17754536867141724,
+      "learning_rate": 0.00013592092020218855,
+      "loss": 0.2285,
+      "step": 43450
+    },
+    {
+      "epoch": 1.2064524404660788,
+      "grad_norm": 0.1404140442609787,
+      "learning_rate": 0.0001357811612775153,
+      "loss": 0.2253,
+      "step": 43500
+    },
+    {
+      "epoch": 1.2078391674091433,
+      "grad_norm": 0.11709395796060562,
+      "learning_rate": 0.00013564132215531372,
+      "loss": 0.2261,
+      "step": 43550
+    },
+    {
+      "epoch": 1.2092258943522078,
+      "grad_norm": 0.11466790735721588,
+      "learning_rate": 0.00013550140314901,
+      "loss": 0.2295,
+      "step": 43600
+    },
+    {
+      "epoch": 1.2106126212952724,
+      "grad_norm": 0.14058195054531097,
+      "learning_rate": 0.00013536140457220933,
+      "loss": 0.2307,
+      "step": 43650
+    },
+    {
+      "epoch": 1.2119993482383369,
+      "grad_norm": 0.18355610966682434,
+      "learning_rate": 0.00013522132673869522,
+      "loss": 0.2283,
+      "step": 43700
+    },
+    {
+      "epoch": 1.2133860751814012,
+      "grad_norm": 0.1437745839357376,
+      "learning_rate": 0.00013508116996242893,
+      "loss": 0.2244,
+      "step": 43750
+    },
+    {
+      "epoch": 1.2147728021244657,
+      "grad_norm": 0.12281102687120438,
+      "learning_rate": 0.00013494093455754851,
+      "loss": 0.2266,
+      "step": 43800
+    },
+    {
+      "epoch": 1.2161595290675302,
+      "grad_norm": 0.15082257986068726,
+      "learning_rate": 0.00013480062083836842,
+      "loss": 0.2275,
+      "step": 43850
+    },
+    {
+      "epoch": 1.2175462560105945,
+      "grad_norm": 0.13360853493213654,
+      "learning_rate": 0.00013466022911937846,
+      "loss": 0.2293,
+      "step": 43900
+    },
+    {
+      "epoch": 1.218932982953659,
+      "grad_norm": 0.1245453953742981,
+      "learning_rate": 0.00013451975971524337,
+      "loss": 0.2252,
+      "step": 43950
+    },
+    {
+      "epoch": 1.2203197098967236,
+      "grad_norm": 0.12427138537168503,
+      "learning_rate": 0.00013437921294080202,
+      "loss": 0.2273,
+      "step": 44000
+    },
+    {
+      "epoch": 1.2203197098967236,
+      "eval_loss": 0.22416169941425323,
+      "eval_runtime": 501.199,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 44000
+    },
+    {
+      "epoch": 1.221706436839788,
+      "grad_norm": 0.13315744698047638,
+      "learning_rate": 0.00013423858911106664,
+      "loss": 0.2273,
+      "step": 44050
+    },
+    {
+      "epoch": 1.2230931637828524,
+      "grad_norm": 0.11731356382369995,
+      "learning_rate": 0.0001340978885412221,
+      "loss": 0.2284,
+      "step": 44100
+    },
+    {
+      "epoch": 1.224479890725917,
+      "grad_norm": 0.1332121342420578,
+      "learning_rate": 0.00013395711154662548,
+      "loss": 0.2311,
+      "step": 44150
+    },
+    {
+      "epoch": 1.2258666176689814,
+      "grad_norm": 0.11775799095630646,
+      "learning_rate": 0.00013381625844280495,
+      "loss": 0.2207,
+      "step": 44200
+    },
+    {
+      "epoch": 1.2272533446120457,
+      "grad_norm": 0.13608750700950623,
+      "learning_rate": 0.00013367532954545934,
+      "loss": 0.2259,
+      "step": 44250
+    },
+    {
+      "epoch": 1.2286400715551102,
+      "grad_norm": 0.11276783794164658,
+      "learning_rate": 0.00013353432517045739,
+      "loss": 0.2254,
+      "step": 44300
+    },
+    {
+      "epoch": 1.2300267984981748,
+      "grad_norm": 0.11962584406137466,
+      "learning_rate": 0.00013339324563383693,
+      "loss": 0.2231,
+      "step": 44350
+    },
+    {
+      "epoch": 1.2314135254412393,
+      "grad_norm": 0.14515165984630585,
+      "learning_rate": 0.0001332520912518044,
+      "loss": 0.2273,
+      "step": 44400
+    },
+    {
+      "epoch": 1.2328002523843036,
+      "grad_norm": 0.14967331290245056,
+      "learning_rate": 0.00013311086234073376,
+      "loss": 0.2292,
+      "step": 44450
+    },
+    {
+      "epoch": 1.234186979327368,
+      "grad_norm": 0.10794315487146378,
+      "learning_rate": 0.00013296955921716626,
+      "loss": 0.2213,
+      "step": 44500
+    },
+    {
+      "epoch": 1.2355737062704326,
+      "grad_norm": 0.1261892467737198,
+      "learning_rate": 0.0001328281821978093,
+      "loss": 0.2249,
+      "step": 44550
+    },
+    {
+      "epoch": 1.236960433213497,
+      "grad_norm": 0.16944009065628052,
+      "learning_rate": 0.00013268673159953608,
+      "loss": 0.2279,
+      "step": 44600
+    },
+    {
+      "epoch": 1.2383471601565614,
+      "grad_norm": 0.14991511404514313,
+      "learning_rate": 0.00013254520773938453,
+      "loss": 0.224,
+      "step": 44650
+    },
+    {
+      "epoch": 1.239733887099626,
+      "grad_norm": 0.16776132583618164,
+      "learning_rate": 0.00013240361093455686,
+      "loss": 0.2267,
+      "step": 44700
+    },
+    {
+      "epoch": 1.2411206140426905,
+      "grad_norm": 0.15971648693084717,
+      "learning_rate": 0.00013226194150241886,
+      "loss": 0.2269,
+      "step": 44750
+    },
+    {
+      "epoch": 1.2425073409857548,
+      "grad_norm": 0.16267691552639008,
+      "learning_rate": 0.00013212019976049897,
+      "loss": 0.2262,
+      "step": 44800
+    },
+    {
+      "epoch": 1.2438940679288193,
+      "grad_norm": 0.13528917729854584,
+      "learning_rate": 0.00013197838602648773,
+      "loss": 0.2282,
+      "step": 44850
+    },
+    {
+      "epoch": 1.2452807948718838,
+      "grad_norm": 0.13532580435276031,
+      "learning_rate": 0.0001318365006182371,
+      "loss": 0.2269,
+      "step": 44900
+    },
+    {
+      "epoch": 1.246667521814948,
+      "grad_norm": 0.15377886593341827,
+      "learning_rate": 0.00013169738368628263,
+      "loss": 0.2298,
+      "step": 44950
+    },
+    {
+      "epoch": 1.2480542487580126,
+      "grad_norm": 0.16382162272930145,
+      "learning_rate": 0.00013155535730139284,
+      "loss": 0.2301,
+      "step": 45000
+    },
+    {
+      "epoch": 1.2480542487580126,
+      "eval_loss": 0.22414694726467133,
+      "eval_runtime": 500.918,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 45000
+    },
+    {
+      "epoch": 1.2494409757010772,
+      "grad_norm": 0.13876722753047943,
+      "learning_rate": 0.00013141326019041228,
+      "loss": 0.2249,
+      "step": 45050
+    },
+    {
+      "epoch": 1.2508277026441417,
+      "grad_norm": 0.1360548585653305,
+      "learning_rate": 0.00013127393671013348,
+      "loss": 0.2255,
+      "step": 45100
+    },
+    {
+      "epoch": 1.2522144295872062,
+      "grad_norm": 0.1435881406068802,
+      "learning_rate": 0.00013113170050124578,
+      "loss": 0.2314,
+      "step": 45150
+    },
+    {
+      "epoch": 1.2536011565302705,
+      "grad_norm": 0.12622830271720886,
+      "learning_rate": 0.00013098939451582363,
+      "loss": 0.2248,
+      "step": 45200
+    },
+    {
+      "epoch": 1.254987883473335,
+      "grad_norm": 0.1429251879453659,
+      "learning_rate": 0.00013084701907282228,
+      "loss": 0.2312,
+      "step": 45250
+    },
+    {
+      "epoch": 1.2563746104163993,
+      "grad_norm": 0.12246144562959671,
+      "learning_rate": 0.00013070457449135262,
+      "loss": 0.2236,
+      "step": 45300
+    },
+    {
+      "epoch": 1.2577613373594638,
+      "grad_norm": 0.11872986704111099,
+      "learning_rate": 0.00013056206109068045,
+      "loss": 0.2263,
+      "step": 45350
+    },
+    {
+      "epoch": 1.2591480643025283,
+      "grad_norm": 0.12920017540454865,
+      "learning_rate": 0.00013041947919022594,
+      "loss": 0.2258,
+      "step": 45400
+    },
+    {
+      "epoch": 1.2605347912455929,
+      "grad_norm": 0.15954279899597168,
+      "learning_rate": 0.00013027682910956271,
+      "loss": 0.2272,
+      "step": 45450
+    },
+    {
+      "epoch": 1.2619215181886574,
+      "grad_norm": 0.16156534850597382,
+      "learning_rate": 0.00013013411116841723,
+      "loss": 0.2245,
+      "step": 45500
+    },
+    {
+      "epoch": 1.2633082451317217,
+      "grad_norm": 0.12423060089349747,
+      "learning_rate": 0.00012999132568666805,
+      "loss": 0.2271,
+      "step": 45550
+    },
+    {
+      "epoch": 1.2646949720747862,
+      "grad_norm": 0.1252107322216034,
+      "learning_rate": 0.0001298484729843451,
+      "loss": 0.2298,
+      "step": 45600
+    },
+    {
+      "epoch": 1.2660816990178507,
+      "grad_norm": 0.16947528719902039,
+      "learning_rate": 0.00012970555338162896,
+      "loss": 0.2273,
+      "step": 45650
+    },
+    {
+      "epoch": 1.267468425960915,
+      "grad_norm": 0.14459671080112457,
+      "learning_rate": 0.00012956256719885026,
+      "loss": 0.2282,
+      "step": 45700
+    },
+    {
+      "epoch": 1.2688551529039795,
+      "grad_norm": 0.1194702684879303,
+      "learning_rate": 0.00012941951475648866,
+      "loss": 0.2263,
+      "step": 45750
+    },
+    {
+      "epoch": 1.270241879847044,
+      "grad_norm": 0.12180822342634201,
+      "learning_rate": 0.00012927639637517249,
+      "loss": 0.227,
+      "step": 45800
+    },
+    {
+      "epoch": 1.2716286067901086,
+      "grad_norm": 0.14245355129241943,
+      "learning_rate": 0.00012913321237567783,
+      "loss": 0.2262,
+      "step": 45850
+    },
+    {
+      "epoch": 1.2730153337331729,
+      "grad_norm": 0.14033064246177673,
+      "learning_rate": 0.00012898996307892784,
+      "loss": 0.2249,
+      "step": 45900
+    },
+    {
+      "epoch": 1.2744020606762374,
+      "grad_norm": 0.11540055274963379,
+      "learning_rate": 0.00012884664880599198,
+      "loss": 0.2265,
+      "step": 45950
+    },
+    {
+      "epoch": 1.275788787619302,
+      "grad_norm": 0.10777000337839127,
+      "learning_rate": 0.00012870326987808538,
+      "loss": 0.2245,
+      "step": 46000
+    },
+    {
+      "epoch": 1.275788787619302,
+      "eval_loss": 0.2235965132713318,
+      "eval_runtime": 500.5657,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 46000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 108168,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.2909744594944e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}