diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,16038 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.999989599547927,
+  "eval_steps": 1000,
+  "global_step": 108168,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0013867269430644586,
+      "grad_norm": 1.8933687210083008,
+      "learning_rate": 2.957486136783734e-06,
+      "loss": 1.2241,
+      "step": 50
+    },
+    {
+      "epoch": 0.002773453886128917,
+      "grad_norm": 0.7502820491790771,
+      "learning_rate": 6.038200862600124e-06,
+      "loss": 1.0267,
+      "step": 100
+    },
+    {
+      "epoch": 0.004160180829193376,
+      "grad_norm": 0.5821689963340759,
+      "learning_rate": 9.118915588416513e-06,
+      "loss": 0.8167,
+      "step": 150
+    },
+    {
+      "epoch": 0.005546907772257834,
+      "grad_norm": 0.5138927698135376,
+      "learning_rate": 1.2199630314232902e-05,
+      "loss": 0.6408,
+      "step": 200
+    },
+    {
+      "epoch": 0.006933634715322293,
+      "grad_norm": 0.619263768196106,
+      "learning_rate": 1.5280345040049293e-05,
+      "loss": 0.5468,
+      "step": 250
+    },
+    {
+      "epoch": 0.008320361658386751,
+      "grad_norm": 0.5078439712524414,
+      "learning_rate": 1.836105976586568e-05,
+      "loss": 0.4952,
+      "step": 300
+    },
+    {
+      "epoch": 0.00970708860145121,
+      "grad_norm": 0.5653749108314514,
+      "learning_rate": 2.144177449168207e-05,
+      "loss": 0.4388,
+      "step": 350
+    },
+    {
+      "epoch": 0.011093815544515669,
+      "grad_norm": 0.6189213991165161,
+      "learning_rate": 2.452248921749846e-05,
+      "loss": 0.4232,
+      "step": 400
+    },
+    {
+      "epoch": 0.012480542487580126,
+      "grad_norm": 0.6082913875579834,
+      "learning_rate": 2.760320394331485e-05,
+      "loss": 0.401,
+      "step": 450
+    },
+    {
+      "epoch": 0.013867269430644586,
+      "grad_norm": 0.6956301331520081,
+      "learning_rate": 3.068391866913124e-05,
+      "loss": 0.3895,
+      "step": 500
+    },
+    {
+      "epoch": 0.015253996373709043,
+      "grad_norm": 0.7030412554740906,
+      "learning_rate": 3.3764633394947633e-05,
+      "loss": 0.3676,
+      "step": 550
+    },
+    {
+      "epoch": 0.016640723316773503,
+      "grad_norm": 0.6779190897941589,
+      "learning_rate": 3.684534812076402e-05,
+      "loss": 0.3653,
+      "step": 600
+    },
+    {
+      "epoch": 0.01802745025983796,
+      "grad_norm": 0.8930213451385498,
+      "learning_rate": 3.992606284658041e-05,
+      "loss": 0.3645,
+      "step": 650
+    },
+    {
+      "epoch": 0.01941417720290242,
+      "grad_norm": 0.6423994302749634,
+      "learning_rate": 4.30067775723968e-05,
+      "loss": 0.3514,
+      "step": 700
+    },
+    {
+      "epoch": 0.02080090414596688,
+      "grad_norm": 0.7728660106658936,
+      "learning_rate": 4.608749229821319e-05,
+      "loss": 0.3468,
+      "step": 750
+    },
+    {
+      "epoch": 0.022187631089031337,
+      "grad_norm": 0.7561061978340149,
+      "learning_rate": 4.916820702402958e-05,
+      "loss": 0.3499,
+      "step": 800
+    },
+    {
+      "epoch": 0.023574358032095795,
+      "grad_norm": 0.6163890957832336,
+      "learning_rate": 5.224892174984597e-05,
+      "loss": 0.3417,
+      "step": 850
+    },
+    {
+      "epoch": 0.024961084975160253,
+      "grad_norm": 0.7334563732147217,
+      "learning_rate": 5.532963647566236e-05,
+      "loss": 0.3299,
+      "step": 900
+    },
+    {
+      "epoch": 0.026347811918224714,
+      "grad_norm": 0.655237078666687,
+      "learning_rate": 5.841035120147874e-05,
+      "loss": 0.3306,
+      "step": 950
+    },
+    {
+      "epoch": 0.02773453886128917,
+      "grad_norm": 0.8147113919258118,
+      "learning_rate": 6.149106592729513e-05,
+      "loss": 0.3281,
+      "step": 1000
+    },
+    {
+      "epoch": 0.02773453886128917,
+      "eval_loss": 0.32194069027900696,
+      "eval_runtime": 501.2457,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 1000
+    },
+    {
+      "epoch": 0.02912126580435363,
+      "grad_norm": 0.6397083401679993,
+      "learning_rate": 6.457178065311152e-05,
+      "loss": 0.3204,
+      "step": 1050
+    },
+    {
+      "epoch": 0.030507992747418087,
+      "grad_norm": 0.5808627009391785,
+      "learning_rate": 6.765249537892791e-05,
+      "loss": 0.3229,
+      "step": 1100
+    },
+    {
+      "epoch": 0.03189471969048255,
+      "grad_norm": 0.6929567456245422,
+      "learning_rate": 7.073321010474431e-05,
+      "loss": 0.3148,
+      "step": 1150
+    },
+    {
+      "epoch": 0.033281446633547006,
+      "grad_norm": 0.620298445224762,
+      "learning_rate": 7.38139248305607e-05,
+      "loss": 0.32,
+      "step": 1200
+    },
+    {
+      "epoch": 0.034668173576611463,
+      "grad_norm": 0.5947968363761902,
+      "learning_rate": 7.689463955637708e-05,
+      "loss": 0.306,
+      "step": 1250
+    },
+    {
+      "epoch": 0.03605490051967592,
+      "grad_norm": 0.6097683906555176,
+      "learning_rate": 7.997535428219347e-05,
+      "loss": 0.3179,
+      "step": 1300
+    },
+    {
+      "epoch": 0.03744162746274038,
+      "grad_norm": 0.6339348554611206,
+      "learning_rate": 8.305606900800986e-05,
+      "loss": 0.3161,
+      "step": 1350
+    },
+    {
+      "epoch": 0.03882835440580484,
+      "grad_norm": 0.5278933644294739,
+      "learning_rate": 8.613678373382625e-05,
+      "loss": 0.3153,
+      "step": 1400
+    },
+    {
+      "epoch": 0.040215081348869294,
+      "grad_norm": 0.4927423894405365,
+      "learning_rate": 8.921749845964264e-05,
+      "loss": 0.3111,
+      "step": 1450
+    },
+    {
+      "epoch": 0.04160180829193376,
+      "grad_norm": 0.4745596945285797,
+      "learning_rate": 9.229821318545902e-05,
+      "loss": 0.304,
+      "step": 1500
+    },
+    {
+      "epoch": 0.04298853523499822,
+      "grad_norm": 0.6532231569290161,
+      "learning_rate": 9.537892791127541e-05,
+      "loss": 0.3084,
+      "step": 1550
+    },
+    {
+      "epoch": 0.044375262178062674,
+      "grad_norm": 0.5528659820556641,
+      "learning_rate": 9.84596426370918e-05,
+      "loss": 0.3084,
+      "step": 1600
+    },
+    {
+      "epoch": 0.04576198912112713,
+      "grad_norm": 0.45793089270591736,
+      "learning_rate": 0.0001015403573629082,
+      "loss": 0.2964,
+      "step": 1650
+    },
+    {
+      "epoch": 0.04714871606419159,
+      "grad_norm": 0.5063529014587402,
+      "learning_rate": 0.00010462107208872458,
+      "loss": 0.2924,
+      "step": 1700
+    },
+    {
+      "epoch": 0.04853544300725605,
+      "grad_norm": 0.48600247502326965,
+      "learning_rate": 0.00010770178681454097,
+      "loss": 0.2947,
+      "step": 1750
+    },
+    {
+      "epoch": 0.049922169950320505,
+      "grad_norm": 0.4872143268585205,
+      "learning_rate": 0.00011078250154035737,
+      "loss": 0.297,
+      "step": 1800
+    },
+    {
+      "epoch": 0.05130889689338496,
+      "grad_norm": 0.5091805458068848,
+      "learning_rate": 0.00011386321626617376,
+      "loss": 0.2888,
+      "step": 1850
+    },
+    {
+      "epoch": 0.05269562383644943,
+      "grad_norm": 0.41649994254112244,
+      "learning_rate": 0.00011694393099199015,
+      "loss": 0.2871,
+      "step": 1900
+    },
+    {
+      "epoch": 0.054082350779513885,
+      "grad_norm": 0.5174862146377563,
+      "learning_rate": 0.00012002464571780654,
+      "loss": 0.2922,
+      "step": 1950
+    },
+    {
+      "epoch": 0.05546907772257834,
+      "grad_norm": 0.45786553621292114,
+      "learning_rate": 0.00012310536044362293,
+      "loss": 0.2883,
+      "step": 2000
+    },
+    {
+      "epoch": 0.05546907772257834,
+      "eval_loss": 0.28488224744796753,
+      "eval_runtime": 500.9558,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0568558046656428,
+      "grad_norm": 0.4992533326148987,
+      "learning_rate": 0.00012606284658040666,
+      "loss": 0.3033,
+      "step": 2050
+    },
+    {
+      "epoch": 0.05824253160870726,
+      "grad_norm": 0.4205988049507141,
+      "learning_rate": 0.00012914356130622304,
+      "loss": 0.2867,
+      "step": 2100
+    },
+    {
+      "epoch": 0.059629258551771716,
+      "grad_norm": 0.4288152754306793,
+      "learning_rate": 0.00013222427603203944,
+      "loss": 0.2795,
+      "step": 2150
+    },
+    {
+      "epoch": 0.061015985494836174,
+      "grad_norm": 0.4856145977973938,
+      "learning_rate": 0.00013530499075785582,
+      "loss": 0.2833,
+      "step": 2200
+    },
+    {
+      "epoch": 0.06240271243790063,
+      "grad_norm": 0.4891654849052429,
+      "learning_rate": 0.00013838570548367222,
+      "loss": 0.2797,
+      "step": 2250
+    },
+    {
+      "epoch": 0.0637894393809651,
+      "grad_norm": 0.39899352192878723,
+      "learning_rate": 0.00014146642020948863,
+      "loss": 0.2785,
+      "step": 2300
+    },
+    {
+      "epoch": 0.06517616632402955,
+      "grad_norm": 0.3616255819797516,
+      "learning_rate": 0.000144547134935305,
+      "loss": 0.2798,
+      "step": 2350
+    },
+    {
+      "epoch": 0.06656289326709401,
+      "grad_norm": 0.3556617498397827,
+      "learning_rate": 0.0001476278496611214,
+      "loss": 0.2811,
+      "step": 2400
+    },
+    {
+      "epoch": 0.06794962021015846,
+      "grad_norm": 0.39639297127723694,
+      "learning_rate": 0.00015070856438693776,
+      "loss": 0.2813,
+      "step": 2450
+    },
+    {
+      "epoch": 0.06933634715322293,
+      "grad_norm": 0.35177573561668396,
+      "learning_rate": 0.00015378927911275416,
+      "loss": 0.2797,
+      "step": 2500
+    },
+    {
+      "epoch": 0.07072307409628739,
+      "grad_norm": 0.38610222935676575,
+      "learning_rate": 0.00015686999383857054,
+      "loss": 0.2747,
+      "step": 2550
+    },
+    {
+      "epoch": 0.07210980103935184,
+      "grad_norm": 0.36727309226989746,
+      "learning_rate": 0.00015995070856438694,
+      "loss": 0.2776,
+      "step": 2600
+    },
+    {
+      "epoch": 0.07349652798241631,
+      "grad_norm": 0.3905107378959656,
+      "learning_rate": 0.00016303142329020332,
+      "loss": 0.2772,
+      "step": 2650
+    },
+    {
+      "epoch": 0.07488325492548076,
+      "grad_norm": 0.3958912193775177,
+      "learning_rate": 0.00016611213801601973,
+      "loss": 0.2707,
+      "step": 2700
+    },
+    {
+      "epoch": 0.07626998186854522,
+      "grad_norm": 0.4029497504234314,
+      "learning_rate": 0.0001691928527418361,
+      "loss": 0.2692,
+      "step": 2750
+    },
+    {
+      "epoch": 0.07765670881160967,
+      "grad_norm": 0.3514055907726288,
+      "learning_rate": 0.0001722735674676525,
+      "loss": 0.2759,
+      "step": 2800
+    },
+    {
+      "epoch": 0.07904343575467414,
+      "grad_norm": 0.34912553429603577,
+      "learning_rate": 0.00017529266789895255,
+      "loss": 0.2793,
+      "step": 2850
+    },
+    {
+      "epoch": 0.08043016269773859,
+      "grad_norm": 0.3493233621120453,
+      "learning_rate": 0.00017831176833025262,
+      "loss": 0.2845,
+      "step": 2900
+    },
+    {
+      "epoch": 0.08181688964080305,
+      "grad_norm": 0.30080145597457886,
+      "learning_rate": 0.00018139248305606902,
+      "loss": 0.2686,
+      "step": 2950
+    },
+    {
+      "epoch": 0.08320361658386752,
+      "grad_norm": 0.3265998959541321,
+      "learning_rate": 0.0001844731977818854,
+      "loss": 0.2695,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08320361658386752,
+      "eval_loss": 0.26523345708847046,
+      "eval_runtime": 500.4565,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08459034352693197,
+      "grad_norm": 0.29866209626197815,
+      "learning_rate": 0.0001875539125077018,
+      "loss": 0.2679,
+      "step": 3050
+    },
+    {
+      "epoch": 0.08597707046999643,
+      "grad_norm": 0.3191625475883484,
+      "learning_rate": 0.00019063462723351818,
+      "loss": 0.267,
+      "step": 3100
+    },
+    {
+      "epoch": 0.08736379741306088,
+      "grad_norm": 0.3110339939594269,
+      "learning_rate": 0.00019371534195933459,
+      "loss": 0.2658,
+      "step": 3150
+    },
+    {
+      "epoch": 0.08875052435612535,
+      "grad_norm": 0.32120850682258606,
+      "learning_rate": 0.00019679605668515096,
+      "loss": 0.2724,
+      "step": 3200
+    },
+    {
+      "epoch": 0.0901372512991898,
+      "grad_norm": 0.28446418046951294,
+      "learning_rate": 0.00019987677141096734,
+      "loss": 0.268,
+      "step": 3250
+    },
+    {
+      "epoch": 0.09152397824225426,
+      "grad_norm": 0.2722443640232086,
+      "learning_rate": 0.00019999989671933422,
+      "loss": 0.2716,
+      "step": 3300
+    },
+    {
+      "epoch": 0.09291070518531871,
+      "grad_norm": 0.31304416060447693,
+      "learning_rate": 0.00019999956948482068,
+      "loss": 0.2631,
+      "step": 3350
+    },
+    {
+      "epoch": 0.09429743212838318,
+      "grad_norm": 0.2516928017139435,
+      "learning_rate": 0.00019999901811788604,
+      "loss": 0.2647,
+      "step": 3400
+    },
+    {
+      "epoch": 0.09568415907144764,
+      "grad_norm": 0.288006067276001,
+      "learning_rate": 0.00019999824261976613,
+      "loss": 0.263,
+      "step": 3450
+    },
+    {
+      "epoch": 0.0970708860145121,
+      "grad_norm": 0.2745107114315033,
+      "learning_rate": 0.00019999724299219913,
+      "loss": 0.2642,
+      "step": 3500
+    },
+    {
+      "epoch": 0.09845761295757656,
+      "grad_norm": 2.800987720489502,
+      "learning_rate": 0.00019999601923742548,
+      "loss": 0.7176,
+      "step": 3550
+    },
+    {
+      "epoch": 0.09984433990064101,
+      "grad_norm": 0.3590925931930542,
+      "learning_rate": 0.00019999457135818805,
+      "loss": 0.3146,
+      "step": 3600
+    },
+    {
+      "epoch": 0.10123106684370548,
+      "grad_norm": 0.32617494463920593,
+      "learning_rate": 0.00019999289935773202,
+      "loss": 0.2786,
+      "step": 3650
+    },
+    {
+      "epoch": 0.10261779378676993,
+      "grad_norm": 0.3239264488220215,
+      "learning_rate": 0.0001999910032398049,
+      "loss": 0.2807,
+      "step": 3700
+    },
+    {
+      "epoch": 0.10400452072983439,
+      "grad_norm": 0.3022274076938629,
+      "learning_rate": 0.00019998888300865652,
+      "loss": 0.2758,
+      "step": 3750
+    },
+    {
+      "epoch": 0.10539124767289886,
+      "grad_norm": 0.33024862408638,
+      "learning_rate": 0.000199986538669039,
+      "loss": 0.2687,
+      "step": 3800
+    },
+    {
+      "epoch": 0.1067779746159633,
+      "grad_norm": 0.6899451017379761,
+      "learning_rate": 0.00019998397022620687,
+      "loss": 0.2699,
+      "step": 3850
+    },
+    {
+      "epoch": 0.10816470155902777,
+      "grad_norm": 0.2794604003429413,
+      "learning_rate": 0.0001999811776859168,
+      "loss": 0.2667,
+      "step": 3900
+    },
+    {
+      "epoch": 0.10955142850209222,
+      "grad_norm": 0.2764255106449127,
+      "learning_rate": 0.00019997816105442778,
+      "loss": 0.2658,
+      "step": 3950
+    },
+    {
+      "epoch": 0.11093815544515669,
+      "grad_norm": 0.43574222922325134,
+      "learning_rate": 0.0001999749203385012,
+      "loss": 0.2664,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11093815544515669,
+      "eval_loss": 0.26065966486930847,
+      "eval_runtime": 500.842,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 4000
+    },
+    {
+      "epoch": 0.11232488238822114,
+      "grad_norm": 0.5340762734413147,
+      "learning_rate": 0.00019997145554540046,
+      "loss": 0.272,
+      "step": 4050
+    },
+    {
+      "epoch": 0.1137116093312856,
+      "grad_norm": 0.32403895258903503,
+      "learning_rate": 0.00019996776668289136,
+      "loss": 0.2679,
+      "step": 4100
+    },
+    {
+      "epoch": 0.11509833627435005,
+      "grad_norm": 0.2928290367126465,
+      "learning_rate": 0.0001999638537592419,
+      "loss": 0.2624,
+      "step": 4150
+    },
+    {
+      "epoch": 0.11648506321741452,
+      "grad_norm": 0.23226021230220795,
+      "learning_rate": 0.00019995971678322228,
+      "loss": 0.2557,
+      "step": 4200
+    },
+    {
+      "epoch": 0.11787179016047898,
+      "grad_norm": 0.2748055160045624,
+      "learning_rate": 0.00019995535576410476,
+      "loss": 0.2625,
+      "step": 4250
+    },
+    {
+      "epoch": 0.11925851710354343,
+      "grad_norm": 0.2713299095630646,
+      "learning_rate": 0.00019995077071166385,
+      "loss": 0.2611,
+      "step": 4300
+    },
+    {
+      "epoch": 0.1206452440466079,
+      "grad_norm": 0.24674977362155914,
+      "learning_rate": 0.00019994596163617624,
+      "loss": 0.2647,
+      "step": 4350
+    },
+    {
+      "epoch": 0.12203197098967235,
+      "grad_norm": 0.359017014503479,
+      "learning_rate": 0.00019994092854842065,
+      "loss": 0.2601,
+      "step": 4400
+    },
+    {
+      "epoch": 0.12341869793273681,
+      "grad_norm": 0.38051414489746094,
+      "learning_rate": 0.00019993567145967791,
+      "loss": 0.253,
+      "step": 4450
+    },
+    {
+      "epoch": 0.12480542487580126,
+      "grad_norm": 0.26227161288261414,
+      "learning_rate": 0.0001999301903817309,
+      "loss": 0.2584,
+      "step": 4500
+    },
+    {
+      "epoch": 0.12619215181886573,
+      "grad_norm": 0.21259668469429016,
+      "learning_rate": 0.00019992448532686453,
+      "loss": 0.2618,
+      "step": 4550
+    },
+    {
+      "epoch": 0.1275788787619302,
+      "grad_norm": 0.23226451873779297,
+      "learning_rate": 0.0001999185563078658,
+      "loss": 0.2526,
+      "step": 4600
+    },
+    {
+      "epoch": 0.12896560570499466,
+      "grad_norm": 0.24459871649742126,
+      "learning_rate": 0.00019991240333802352,
+      "loss": 0.2523,
+      "step": 4650
+    },
+    {
+      "epoch": 0.1303523326480591,
+      "grad_norm": 0.29185208678245544,
+      "learning_rate": 0.00019990602643112863,
+      "loss": 0.2546,
+      "step": 4700
+    },
+    {
+      "epoch": 0.13173905959112356,
+      "grad_norm": 0.23443324863910675,
+      "learning_rate": 0.00019989942560147387,
+      "loss": 0.2557,
+      "step": 4750
+    },
+    {
+      "epoch": 0.13312578653418802,
+      "grad_norm": 0.22915039956569672,
+      "learning_rate": 0.00019989260086385394,
+      "loss": 0.2546,
+      "step": 4800
+    },
+    {
+      "epoch": 0.1345125134772525,
+      "grad_norm": 0.2710748016834259,
+      "learning_rate": 0.00019988555223356531,
+      "loss": 0.2619,
+      "step": 4850
+    },
+    {
+      "epoch": 0.13589924042031692,
+      "grad_norm": 0.24671098589897156,
+      "learning_rate": 0.00019987827972640633,
+      "loss": 0.2594,
+      "step": 4900
+    },
+    {
+      "epoch": 0.1372859673633814,
+      "grad_norm": 0.2359282672405243,
+      "learning_rate": 0.00019987078335867713,
+      "loss": 0.2616,
+      "step": 4950
+    },
+    {
+      "epoch": 0.13867269430644585,
+      "grad_norm": 0.2197064608335495,
+      "learning_rate": 0.00019986306314717956,
+      "loss": 0.2507,
+      "step": 5000
+    },
+    {
+      "epoch": 0.13867269430644585,
+      "eval_loss": 0.25083017349243164,
+      "eval_runtime": 500.7995,
+      "eval_samples_per_second": 5.705,
+      "eval_steps_per_second": 5.705,
+      "step": 5000
+    },
+    {
+      "epoch": 0.14005942124951032,
+      "grad_norm": 0.2249370515346527,
+      "learning_rate": 0.0001998551191092172,
+      "loss": 0.2574,
+      "step": 5050
+    },
+    {
+      "epoch": 0.14144614819257478,
+      "grad_norm": 0.36345556378364563,
+      "learning_rate": 0.0001998469512625953,
+      "loss": 0.2493,
+      "step": 5100
+    },
+    {
+      "epoch": 0.14283287513563922,
+      "grad_norm": 0.24807791411876678,
+      "learning_rate": 0.00019983855962562067,
+      "loss": 0.2542,
+      "step": 5150
+    },
+    {
+      "epoch": 0.14421960207870368,
+      "grad_norm": 3.6125738620758057,
+      "learning_rate": 0.00019982994421710186,
+      "loss": 0.2595,
+      "step": 5200
+    },
+    {
+      "epoch": 0.14560632902176815,
+      "grad_norm": 0.4985048472881317,
+      "learning_rate": 0.0001998211050563488,
+      "loss": 0.2558,
+      "step": 5250
+    },
+    {
+      "epoch": 0.14699305596483261,
+      "grad_norm": 0.3320443332195282,
+      "learning_rate": 0.00019981204216317308,
+      "loss": 0.2545,
+      "step": 5300
+    },
+    {
+      "epoch": 0.14837978290789705,
+      "grad_norm": 0.2081877887248993,
+      "learning_rate": 0.00019980275555788759,
+      "loss": 0.2536,
+      "step": 5350
+    },
+    {
+      "epoch": 0.14976650985096152,
+      "grad_norm": 0.27258801460266113,
+      "learning_rate": 0.00019979324526130676,
+      "loss": 0.2505,
+      "step": 5400
+    },
+    {
+      "epoch": 0.15115323679402598,
+      "grad_norm": 0.23199999332427979,
+      "learning_rate": 0.00019978351129474632,
+      "loss": 0.2556,
+      "step": 5450
+    },
+    {
+      "epoch": 0.15253996373709044,
+      "grad_norm": 0.20929445326328278,
+      "learning_rate": 0.00019977355368002334,
+      "loss": 0.2486,
+      "step": 5500
+    },
+    {
+      "epoch": 0.1539266906801549,
+      "grad_norm": 0.23551955819129944,
+      "learning_rate": 0.00019976337243945617,
+      "loss": 0.2517,
+      "step": 5550
+    },
+    {
+      "epoch": 0.15531341762321935,
+      "grad_norm": 0.30231812596321106,
+      "learning_rate": 0.0001997529675958644,
+      "loss": 0.2498,
+      "step": 5600
+    },
+    {
+      "epoch": 0.1567001445662838,
+      "grad_norm": 0.24430635571479797,
+      "learning_rate": 0.00019974233917256865,
+      "loss": 0.2523,
+      "step": 5650
+    },
+    {
+      "epoch": 0.15808687150934828,
+      "grad_norm": 6.362756252288818,
+      "learning_rate": 0.0001997314871933909,
+      "loss": 0.2529,
+      "step": 5700
+    },
+    {
+      "epoch": 0.15947359845241274,
+      "grad_norm": 0.2339017242193222,
+      "learning_rate": 0.00019972041168265397,
+      "loss": 0.2524,
+      "step": 5750
+    },
+    {
+      "epoch": 0.16086032539547718,
+      "grad_norm": 0.22503100335597992,
+      "learning_rate": 0.0001997091126651818,
+      "loss": 0.251,
+      "step": 5800
+    },
+    {
+      "epoch": 0.16224705233854164,
+      "grad_norm": 0.26495125889778137,
+      "learning_rate": 0.00019969759016629928,
+      "loss": 0.2517,
+      "step": 5850
+    },
+    {
+      "epoch": 0.1636337792816061,
+      "grad_norm": 0.25339657068252563,
+      "learning_rate": 0.00019968584421183212,
+      "loss": 0.2505,
+      "step": 5900
+    },
+    {
+      "epoch": 0.16502050622467057,
+      "grad_norm": 0.20266841351985931,
+      "learning_rate": 0.000199673874828107,
+      "loss": 0.2501,
+      "step": 5950
+    },
+    {
+      "epoch": 0.16640723316773504,
+      "grad_norm": 0.19285647571086884,
+      "learning_rate": 0.00019966168204195125,
+      "loss": 0.2445,
+      "step": 6000
+    },
+    {
+      "epoch": 0.16640723316773504,
+      "eval_loss": 0.24731825292110443,
+      "eval_runtime": 500.9495,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 6000
+    },
+    {
+      "epoch": 0.16779396011079947,
+      "grad_norm": 0.2121065855026245,
+      "learning_rate": 0.000199649265880693,
+      "loss": 0.2466,
+      "step": 6050
+    },
+    {
+      "epoch": 0.16918068705386394,
+      "grad_norm": 0.2560518980026245,
+      "learning_rate": 0.000199636626372161,
+      "loss": 0.2572,
+      "step": 6100
+    },
+    {
+      "epoch": 0.1705674139969284,
+      "grad_norm": 0.22927352786064148,
+      "learning_rate": 0.00019962376354468466,
+      "loss": 0.2509,
+      "step": 6150
+    },
+    {
+      "epoch": 0.17195414093999287,
+      "grad_norm": 0.2201690673828125,
+      "learning_rate": 0.00019961067742709377,
+      "loss": 0.2501,
+      "step": 6200
+    },
+    {
+      "epoch": 0.1733408678830573,
+      "grad_norm": 0.23233374953269958,
+      "learning_rate": 0.0001995973680487188,
+      "loss": 0.2525,
+      "step": 6250
+    },
+    {
+      "epoch": 0.17472759482612177,
+      "grad_norm": 0.254256933927536,
+      "learning_rate": 0.00019958383543939041,
+      "loss": 0.2499,
+      "step": 6300
+    },
+    {
+      "epoch": 0.17611432176918623,
+      "grad_norm": 0.1754632294178009,
+      "learning_rate": 0.00019957007962943975,
+      "loss": 0.251,
+      "step": 6350
+    },
+    {
+      "epoch": 0.1775010487122507,
+      "grad_norm": 0.23628771305084229,
+      "learning_rate": 0.00019955610064969817,
+      "loss": 0.256,
+      "step": 6400
+    },
+    {
+      "epoch": 0.17888777565531516,
+      "grad_norm": 0.23698653280735016,
+      "learning_rate": 0.00019954189853149725,
+      "loss": 0.2474,
+      "step": 6450
+    },
+    {
+      "epoch": 0.1802745025983796,
+      "grad_norm": 0.27713823318481445,
+      "learning_rate": 0.00019952747330666867,
+      "loss": 0.2481,
+      "step": 6500
+    },
+    {
+      "epoch": 0.18166122954144406,
+      "grad_norm": 0.1710810512304306,
+      "learning_rate": 0.00019951282500754413,
+      "loss": 0.2564,
+      "step": 6550
+    },
+    {
+      "epoch": 0.18304795648450853,
+      "grad_norm": 0.21406157314777374,
+      "learning_rate": 0.00019949795366695544,
+      "loss": 0.2517,
+      "step": 6600
+    },
+    {
+      "epoch": 0.184434683427573,
+      "grad_norm": 0.20108449459075928,
+      "learning_rate": 0.00019948285931823415,
+      "loss": 0.2518,
+      "step": 6650
+    },
+    {
+      "epoch": 0.18582141037063743,
+      "grad_norm": 5.1352715492248535,
+      "learning_rate": 0.0001994675419952118,
+      "loss": 0.2546,
+      "step": 6700
+    },
+    {
+      "epoch": 0.1872081373137019,
+      "grad_norm": 0.22743810713291168,
+      "learning_rate": 0.00019945200173221962,
+      "loss": 0.2457,
+      "step": 6750
+    },
+    {
+      "epoch": 0.18859486425676636,
+      "grad_norm": 0.20475907623767853,
+      "learning_rate": 0.0001994362385640885,
+      "loss": 0.2529,
+      "step": 6800
+    },
+    {
+      "epoch": 0.18998159119983082,
+      "grad_norm": 0.22172316908836365,
+      "learning_rate": 0.000199420252526149,
+      "loss": 0.2554,
+      "step": 6850
+    },
+    {
+      "epoch": 0.1913683181428953,
+      "grad_norm": 2.967470407485962,
+      "learning_rate": 0.0001994040436542311,
+      "loss": 0.2555,
+      "step": 6900
+    },
+    {
+      "epoch": 0.19275504508595973,
+      "grad_norm": 0.23698735237121582,
+      "learning_rate": 0.00019938761198466437,
+      "loss": 0.2619,
+      "step": 6950
+    },
+    {
+      "epoch": 0.1941417720290242,
+      "grad_norm": 0.17891797423362732,
+      "learning_rate": 0.0001993709575542776,
+      "loss": 0.2464,
+      "step": 7000
+    },
+    {
+      "epoch": 0.1941417720290242,
+      "eval_loss": 0.24410127103328705,
+      "eval_runtime": 500.8833,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 7000
+    },
+    {
+      "epoch": 0.19552849897208865,
+      "grad_norm": 0.21030811965465546,
+      "learning_rate": 0.00019935408040039901,
+      "loss": 0.2517,
+      "step": 7050
+    },
+    {
+      "epoch": 0.19691522591515312,
+      "grad_norm": 0.1913098245859146,
+      "learning_rate": 0.00019933698056085586,
+      "loss": 0.249,
+      "step": 7100
+    },
+    {
+      "epoch": 0.19830195285821758,
+      "grad_norm": 0.2044433057308197,
+      "learning_rate": 0.00019931965807397465,
+      "loss": 0.2496,
+      "step": 7150
+    },
+    {
+      "epoch": 0.19968867980128202,
+      "grad_norm": 0.18698015809059143,
+      "learning_rate": 0.00019930211297858078,
+      "loss": 0.2537,
+      "step": 7200
+    },
+    {
+      "epoch": 0.20107540674434649,
+      "grad_norm": 0.22580522298812866,
+      "learning_rate": 0.00019928434531399876,
+      "loss": 0.2456,
+      "step": 7250
+    },
+    {
+      "epoch": 0.20246213368741095,
+      "grad_norm": 0.1749202162027359,
+      "learning_rate": 0.00019926635512005183,
+      "loss": 0.2504,
+      "step": 7300
+    },
+    {
+      "epoch": 0.20384886063047541,
+      "grad_norm": 0.2123364359140396,
+      "learning_rate": 0.00019924814243706197,
+      "loss": 0.2477,
+      "step": 7350
+    },
+    {
+      "epoch": 0.20523558757353985,
+      "grad_norm": 0.2234705090522766,
+      "learning_rate": 0.00019922970730584997,
+      "loss": 0.2457,
+      "step": 7400
+    },
+    {
+      "epoch": 0.20662231451660432,
+      "grad_norm": 0.20742256939411163,
+      "learning_rate": 0.00019921104976773505,
+      "loss": 0.249,
+      "step": 7450
+    },
+    {
+      "epoch": 0.20800904145966878,
+      "grad_norm": 0.18315458297729492,
+      "learning_rate": 0.000199192169864535,
+      "loss": 0.2459,
+      "step": 7500
+    },
+    {
+      "epoch": 0.20939576840273325,
+      "grad_norm": 0.19357183575630188,
+      "learning_rate": 0.000199173067638566,
+      "loss": 0.2439,
+      "step": 7550
+    },
+    {
+      "epoch": 0.2107824953457977,
+      "grad_norm": 0.2398926168680191,
+      "learning_rate": 0.00019915374313264248,
+      "loss": 0.2497,
+      "step": 7600
+    },
+    {
+      "epoch": 0.21216922228886215,
+      "grad_norm": 0.20313721895217896,
+      "learning_rate": 0.00019913419639007714,
+      "loss": 0.2447,
+      "step": 7650
+    },
+    {
+      "epoch": 0.2135559492319266,
+      "grad_norm": 0.17255066335201263,
+      "learning_rate": 0.00019911442745468075,
+      "loss": 0.2447,
+      "step": 7700
+    },
+    {
+      "epoch": 0.21494267617499108,
+      "grad_norm": 0.19140756130218506,
+      "learning_rate": 0.0001990944363707621,
+      "loss": 0.2383,
+      "step": 7750
+    },
+    {
+      "epoch": 0.21632940311805554,
+      "grad_norm": 0.15212053060531616,
+      "learning_rate": 0.00019907422318312783,
+      "loss": 0.2485,
+      "step": 7800
+    },
+    {
+      "epoch": 0.21771613006111998,
+      "grad_norm": 0.1841588169336319,
+      "learning_rate": 0.0001990537879370825,
+      "loss": 0.2432,
+      "step": 7850
+    },
+    {
+      "epoch": 0.21910285700418444,
+      "grad_norm": 0.2013355791568756,
+      "learning_rate": 0.00019903313067842833,
+      "loss": 0.2431,
+      "step": 7900
+    },
+    {
+      "epoch": 0.2204895839472489,
+      "grad_norm": 0.17149454355239868,
+      "learning_rate": 0.0001990122514534651,
+      "loss": 0.247,
+      "step": 7950
+    },
+    {
+      "epoch": 0.22187631089031337,
+      "grad_norm": 0.24272453784942627,
+      "learning_rate": 0.00019899115030899014,
+      "loss": 0.2468,
+      "step": 8000
+    },
+    {
+      "epoch": 0.22187631089031337,
+      "eval_loss": 0.24099861085414886,
+      "eval_runtime": 501.2129,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 8000
+    },
+    {
+      "epoch": 0.22326303783337784,
+      "grad_norm": 0.2419915497303009,
+      "learning_rate": 0.00019896982729229813,
+      "loss": 0.2454,
+      "step": 8050
+    },
+    {
+      "epoch": 0.22464976477644227,
+      "grad_norm": 0.16482336819171906,
+      "learning_rate": 0.0001989482824511811,
+      "loss": 0.2423,
+      "step": 8100
+    },
+    {
+      "epoch": 0.22603649171950674,
+      "grad_norm": 0.22351431846618652,
+      "learning_rate": 0.00019892651583392824,
+      "loss": 0.2501,
+      "step": 8150
+    },
+    {
+      "epoch": 0.2274232186625712,
+      "grad_norm": 0.19319549202919006,
+      "learning_rate": 0.0001989045274893258,
+      "loss": 0.2452,
+      "step": 8200
+    },
+    {
+      "epoch": 0.22880994560563567,
+      "grad_norm": 0.15613292157649994,
+      "learning_rate": 0.00019888231746665696,
+      "loss": 0.2428,
+      "step": 8250
+    },
+    {
+      "epoch": 0.2301966725487001,
+      "grad_norm": 0.18092665076255798,
+      "learning_rate": 0.00019885988581570184,
+      "loss": 0.2448,
+      "step": 8300
+    },
+    {
+      "epoch": 0.23158339949176457,
+      "grad_norm": 0.18928927183151245,
+      "learning_rate": 0.00019883723258673724,
+      "loss": 0.2493,
+      "step": 8350
+    },
+    {
+      "epoch": 0.23297012643482903,
+      "grad_norm": 0.19816988706588745,
+      "learning_rate": 0.0001988143578305366,
+      "loss": 0.2465,
+      "step": 8400
+    },
+    {
+      "epoch": 0.2343568533778935,
+      "grad_norm": 0.19853706657886505,
+      "learning_rate": 0.00019879126159836992,
+      "loss": 0.2443,
+      "step": 8450
+    },
+    {
+      "epoch": 0.23574358032095796,
+      "grad_norm": 0.17544203996658325,
+      "learning_rate": 0.00019876794394200353,
+      "loss": 0.2429,
+      "step": 8500
+    },
+    {
+      "epoch": 0.2371303072640224,
+      "grad_norm": 0.16583149135112762,
+      "learning_rate": 0.0001987444049137001,
+      "loss": 0.244,
+      "step": 8550
+    },
+    {
+      "epoch": 0.23851703420708686,
+      "grad_norm": 0.18239592015743256,
+      "learning_rate": 0.00019872064456621848,
+      "loss": 0.2447,
+      "step": 8600
+    },
+    {
+      "epoch": 0.23990376115015133,
+      "grad_norm": 0.15820704400539398,
+      "learning_rate": 0.0001986966629528135,
+      "loss": 0.2469,
+      "step": 8650
+    },
+    {
+      "epoch": 0.2412904880932158,
+      "grad_norm": 0.18477188050746918,
+      "learning_rate": 0.00019867246012723598,
+      "loss": 0.2407,
+      "step": 8700
+    },
+    {
+      "epoch": 0.24267721503628023,
+      "grad_norm": 0.1676979809999466,
+      "learning_rate": 0.0001986480361437325,
+      "loss": 0.2448,
+      "step": 8750
+    },
+    {
+      "epoch": 0.2440639419793447,
+      "grad_norm": 0.2173600196838379,
+      "learning_rate": 0.00019862339105704543,
+      "loss": 0.2409,
+      "step": 8800
+    },
+    {
+      "epoch": 0.24545066892240916,
+      "grad_norm": 0.17326687276363373,
+      "learning_rate": 0.00019859852492241256,
+      "loss": 0.2387,
+      "step": 8850
+    },
+    {
+      "epoch": 0.24683739586547362,
+      "grad_norm": 0.16229301691055298,
+      "learning_rate": 0.00019857343779556725,
+      "loss": 0.2467,
+      "step": 8900
+    },
+    {
+      "epoch": 0.2482241228085381,
+      "grad_norm": 0.21166543662548065,
+      "learning_rate": 0.0001985481297327381,
+      "loss": 0.2507,
+      "step": 8950
+    },
+    {
+      "epoch": 0.24961084975160253,
+      "grad_norm": 0.17892777919769287,
+      "learning_rate": 0.00019852260079064894,
+      "loss": 0.2416,
+      "step": 9000
+    },
+    {
+      "epoch": 0.24961084975160253,
+      "eval_loss": 0.23973840475082397,
+      "eval_runtime": 500.5349,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 9000
+    },
+    {
+      "epoch": 0.250997576694667,
+      "grad_norm": 0.20435132086277008,
+      "learning_rate": 0.00019849685102651867,
+      "loss": 0.2385,
+      "step": 9050
+    },
+    {
+      "epoch": 0.25238430363773146,
+      "grad_norm": 0.1890842318534851,
+      "learning_rate": 0.0001984708804980611,
+      "loss": 0.2416,
+      "step": 9100
+    },
+    {
+      "epoch": 0.2537710305807959,
+      "grad_norm": 0.18390174210071564,
+      "learning_rate": 0.00019844468926348482,
+      "loss": 0.2469,
+      "step": 9150
+    },
+    {
+      "epoch": 0.2551577575238604,
+      "grad_norm": 0.23599492013454437,
+      "learning_rate": 0.00019841827738149314,
+      "loss": 0.2417,
+      "step": 9200
+    },
+    {
+      "epoch": 0.25654448446692485,
+      "grad_norm": 0.1522965133190155,
+      "learning_rate": 0.00019839164491128398,
+      "loss": 0.2427,
+      "step": 9250
+    },
+    {
+      "epoch": 0.2579312114099893,
+      "grad_norm": 0.206534281373024,
+      "learning_rate": 0.00019836479191254948,
+      "loss": 0.2452,
+      "step": 9300
+    },
+    {
+      "epoch": 0.2593179383530537,
+      "grad_norm": 0.18928374350070953,
+      "learning_rate": 0.00019833771844547627,
+      "loss": 0.244,
+      "step": 9350
+    },
+    {
+      "epoch": 0.2607046652961182,
+      "grad_norm": 0.17130087316036224,
+      "learning_rate": 0.00019831042457074498,
+      "loss": 0.2488,
+      "step": 9400
+    },
+    {
+      "epoch": 0.26209139223918265,
+      "grad_norm": 0.17631781101226807,
+      "learning_rate": 0.00019828291034953033,
+      "loss": 0.2441,
+      "step": 9450
+    },
+    {
+      "epoch": 0.2634781191822471,
+      "grad_norm": 0.1852494180202484,
+      "learning_rate": 0.00019825517584350083,
+      "loss": 0.2414,
+      "step": 9500
+    },
+    {
+      "epoch": 0.2648648461253116,
+      "grad_norm": 0.21513506770133972,
+      "learning_rate": 0.0001982272211148188,
+      "loss": 0.2412,
+      "step": 9550
+    },
+    {
+      "epoch": 0.26625157306837605,
+      "grad_norm": 0.18172813951969147,
+      "learning_rate": 0.0001981990462261401,
+      "loss": 0.2435,
+      "step": 9600
+    },
+    {
+      "epoch": 0.2676383000114405,
+      "grad_norm": 0.1561124324798584,
+      "learning_rate": 0.00019817065124061407,
+      "loss": 0.238,
+      "step": 9650
+    },
+    {
+      "epoch": 0.269025026954505,
+      "grad_norm": 0.16663338243961334,
+      "learning_rate": 0.00019814203622188338,
+      "loss": 0.2383,
+      "step": 9700
+    },
+    {
+      "epoch": 0.27041175389756944,
+      "grad_norm": 0.17735238373279572,
+      "learning_rate": 0.0001981132012340838,
+      "loss": 0.2459,
+      "step": 9750
+    },
+    {
+      "epoch": 0.27179848084063385,
+      "grad_norm": 0.21334126591682434,
+      "learning_rate": 0.00019808414634184417,
+      "loss": 0.2425,
+      "step": 9800
+    },
+    {
+      "epoch": 0.2731852077836983,
+      "grad_norm": 0.16817434132099152,
+      "learning_rate": 0.00019805487161028625,
+      "loss": 0.2361,
+      "step": 9850
+    },
+    {
+      "epoch": 0.2745719347267628,
+      "grad_norm": 0.17149919271469116,
+      "learning_rate": 0.00019802537710502443,
+      "loss": 0.2431,
+      "step": 9900
+    },
+    {
+      "epoch": 0.27595866166982724,
+      "grad_norm": 0.1521356999874115,
+      "learning_rate": 0.00019799566289216576,
+      "loss": 0.2411,
+      "step": 9950
+    },
+    {
+      "epoch": 0.2773453886128917,
+      "grad_norm": 0.15583455562591553,
+      "learning_rate": 0.00019796572903830974,
+      "loss": 0.2388,
+      "step": 10000
+    },
+    {
+      "epoch": 0.2773453886128917,
+      "eval_loss": 0.23783154785633087,
+      "eval_runtime": 501.3932,
+      "eval_samples_per_second": 5.698,
+      "eval_steps_per_second": 5.698,
+      "step": 10000
+    },
+    {
+      "epoch": 0.2787321155559562,
+      "grad_norm": 0.15069644153118134,
+      "learning_rate": 0.00019793557561054807,
+      "loss": 0.245,
+      "step": 10050
+    },
+    {
+      "epoch": 0.28011884249902064,
+      "grad_norm": 0.16481320559978485,
+      "learning_rate": 0.0001979052026764647,
+      "loss": 0.2403,
+      "step": 10100
+    },
+    {
+      "epoch": 0.2815055694420851,
+      "grad_norm": 0.16549484431743622,
+      "learning_rate": 0.00019787461030413553,
+      "loss": 0.2404,
+      "step": 10150
+    },
+    {
+      "epoch": 0.28289229638514957,
+      "grad_norm": 0.1722942292690277,
+      "learning_rate": 0.0001978437985621282,
+      "loss": 0.2407,
+      "step": 10200
+    },
+    {
+      "epoch": 0.284279023328214,
+      "grad_norm": 1.554700255393982,
+      "learning_rate": 0.0001978127675195022,
+      "loss": 0.2423,
+      "step": 10250
+    },
+    {
+      "epoch": 0.28566575027127844,
+      "grad_norm": 0.18697640299797058,
+      "learning_rate": 0.0001977815172458084,
+      "loss": 0.2458,
+      "step": 10300
+    },
+    {
+      "epoch": 0.2870524772143429,
+      "grad_norm": 0.19721738994121552,
+      "learning_rate": 0.00019775004781108914,
+      "loss": 0.2423,
+      "step": 10350
+    },
+    {
+      "epoch": 0.28843920415740737,
+      "grad_norm": 0.13843601942062378,
+      "learning_rate": 0.00019771835928587787,
+      "loss": 0.249,
+      "step": 10400
+    },
+    {
+      "epoch": 0.28982593110047183,
+      "grad_norm": 0.19530989229679108,
+      "learning_rate": 0.0001976864517411992,
+      "loss": 0.2438,
+      "step": 10450
+    },
+    {
+      "epoch": 0.2912126580435363,
+      "grad_norm": 0.14896182715892792,
+      "learning_rate": 0.0001976543252485686,
+      "loss": 0.2392,
+      "step": 10500
+    },
+    {
+      "epoch": 0.29259938498660076,
+      "grad_norm": 0.1485060602426529,
+      "learning_rate": 0.00019762197987999223,
+      "loss": 0.2371,
+      "step": 10550
+    },
+    {
+      "epoch": 0.29398611192966523,
+      "grad_norm": 0.20084735751152039,
+      "learning_rate": 0.00019758941570796688,
+      "loss": 0.2461,
+      "step": 10600
+    },
+    {
+      "epoch": 0.2953728388727297,
+      "grad_norm": 0.1450163722038269,
+      "learning_rate": 0.0001975566328054797,
+      "loss": 0.2379,
+      "step": 10650
+    },
+    {
+      "epoch": 0.2967595658157941,
+      "grad_norm": 0.14225760102272034,
+      "learning_rate": 0.00019752363124600817,
+      "loss": 0.2465,
+      "step": 10700
+    },
+    {
+      "epoch": 0.29814629275885857,
+      "grad_norm": 0.182630255818367,
+      "learning_rate": 0.00019749041110351975,
+      "loss": 0.2382,
+      "step": 10750
+    },
+    {
+      "epoch": 0.29953301970192303,
+      "grad_norm": 0.18140457570552826,
+      "learning_rate": 0.00019745697245247194,
+      "loss": 0.2394,
+      "step": 10800
+    },
+    {
+      "epoch": 0.3009197466449875,
+      "grad_norm": 0.1756162941455841,
+      "learning_rate": 0.00019742331536781187,
+      "loss": 0.2377,
+      "step": 10850
+    },
+    {
+      "epoch": 0.30230647358805196,
+      "grad_norm": 0.14414621889591217,
+      "learning_rate": 0.0001973894399249763,
+      "loss": 0.2408,
+      "step": 10900
+    },
+    {
+      "epoch": 0.3036932005311164,
+      "grad_norm": 0.1697167605161667,
+      "learning_rate": 0.00019735534619989142,
+      "loss": 0.2442,
+      "step": 10950
+    },
+    {
+      "epoch": 0.3050799274741809,
+      "grad_norm": 0.15641078352928162,
+      "learning_rate": 0.00019732103426897265,
+      "loss": 0.2421,
+      "step": 11000
+    },
+    {
+      "epoch": 0.3050799274741809,
+      "eval_loss": 0.23684217035770416,
+      "eval_runtime": 500.474,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 11000
+    },
+    {
+      "epoch": 0.30646665441724535,
+      "grad_norm": 0.190172016620636,
+      "learning_rate": 0.00019728650420912448,
+      "loss": 0.2475,
+      "step": 11050
+    },
+    {
+      "epoch": 0.3078533813603098,
+      "grad_norm": 0.16632623970508575,
+      "learning_rate": 0.0001972517560977403,
+      "loss": 0.2426,
+      "step": 11100
+    },
+    {
+      "epoch": 0.30924010830337423,
+      "grad_norm": 0.16913548111915588,
+      "learning_rate": 0.00019721679001270226,
+      "loss": 0.2386,
+      "step": 11150
+    },
+    {
+      "epoch": 0.3106268352464387,
+      "grad_norm": 0.16081750392913818,
+      "learning_rate": 0.00019718160603238096,
+      "loss": 0.2358,
+      "step": 11200
+    },
+    {
+      "epoch": 0.31201356218950316,
+      "grad_norm": 0.19061852991580963,
+      "learning_rate": 0.00019714620423563552,
+      "loss": 0.238,
+      "step": 11250
+    },
+    {
+      "epoch": 0.3134002891325676,
+      "grad_norm": 0.16220314800739288,
+      "learning_rate": 0.00019711058470181316,
+      "loss": 0.2428,
+      "step": 11300
+    },
+    {
+      "epoch": 0.3147870160756321,
+      "grad_norm": 0.20064842700958252,
+      "learning_rate": 0.00019707474751074915,
+      "loss": 0.2393,
+      "step": 11350
+    },
+    {
+      "epoch": 0.31617374301869655,
+      "grad_norm": 0.14250491559505463,
+      "learning_rate": 0.00019703869274276657,
+      "loss": 0.2376,
+      "step": 11400
+    },
+    {
+      "epoch": 0.317560469961761,
+      "grad_norm": 0.18501660227775574,
+      "learning_rate": 0.00019700242047867623,
+      "loss": 0.2405,
+      "step": 11450
+    },
+    {
+      "epoch": 0.3189471969048255,
+      "grad_norm": 0.1680876910686493,
+      "learning_rate": 0.00019696593079977635,
+      "loss": 0.241,
+      "step": 11500
+    },
+    {
+      "epoch": 0.32033392384788995,
+      "grad_norm": 0.15119992196559906,
+      "learning_rate": 0.00019692922378785252,
+      "loss": 0.2371,
+      "step": 11550
+    },
+    {
+      "epoch": 0.32172065079095435,
+      "grad_norm": 0.15388673543930054,
+      "learning_rate": 0.0001968922995251774,
+      "loss": 0.2425,
+      "step": 11600
+    },
+    {
+      "epoch": 0.3231073777340188,
+      "grad_norm": 0.19946704804897308,
+      "learning_rate": 0.00019685515809451056,
+      "loss": 0.2476,
+      "step": 11650
+    },
+    {
+      "epoch": 0.3244941046770833,
+      "grad_norm": 0.17677927017211914,
+      "learning_rate": 0.0001968177995790984,
+      "loss": 0.2432,
+      "step": 11700
+    },
+    {
+      "epoch": 0.32588083162014775,
+      "grad_norm": 0.18418142199516296,
+      "learning_rate": 0.00019678022406267374,
+      "loss": 0.2387,
+      "step": 11750
+    },
+    {
+      "epoch": 0.3272675585632122,
+      "grad_norm": 0.1462264358997345,
+      "learning_rate": 0.00019674243162945594,
+      "loss": 0.2377,
+      "step": 11800
+    },
+    {
+      "epoch": 0.3286542855062767,
+      "grad_norm": 0.14166492223739624,
+      "learning_rate": 0.0001967044223641504,
+      "loss": 0.238,
+      "step": 11850
+    },
+    {
+      "epoch": 0.33004101244934114,
+      "grad_norm": 0.17436008155345917,
+      "learning_rate": 0.00019666619635194866,
+      "loss": 0.2429,
+      "step": 11900
+    },
+    {
+      "epoch": 0.3314277393924056,
+      "grad_norm": 0.15779553353786469,
+      "learning_rate": 0.00019662775367852787,
+      "loss": 0.2404,
+      "step": 11950
+    },
+    {
+      "epoch": 0.33281446633547007,
+      "grad_norm": 0.17796078324317932,
+      "learning_rate": 0.000196589094430051,
+      "loss": 0.235,
+      "step": 12000
+    },
+    {
+      "epoch": 0.33281446633547007,
+      "eval_loss": 0.235828697681427,
+      "eval_runtime": 500.6046,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 12000
+    },
+    {
+      "epoch": 0.3342011932785345,
+      "grad_norm": 0.14978894591331482,
+      "learning_rate": 0.0001965502186931662,
+      "loss": 0.2419,
+      "step": 12050
+    },
+    {
+      "epoch": 0.33558792022159895,
+      "grad_norm": 0.17456893622875214,
+      "learning_rate": 0.00019651112655500713,
+      "loss": 0.2389,
+      "step": 12100
+    },
+    {
+      "epoch": 0.3369746471646634,
+      "grad_norm": 0.1462843269109726,
+      "learning_rate": 0.0001964718181031922,
+      "loss": 0.2363,
+      "step": 12150
+    },
+    {
+      "epoch": 0.3383613741077279,
+      "grad_norm": 0.16996078193187714,
+      "learning_rate": 0.0001964322934258248,
+      "loss": 0.2404,
+      "step": 12200
+    },
+    {
+      "epoch": 0.33974810105079234,
+      "grad_norm": 0.1906641721725464,
+      "learning_rate": 0.00019639255261149298,
+      "loss": 0.2394,
+      "step": 12250
+    },
+    {
+      "epoch": 0.3411348279938568,
+      "grad_norm": 0.15007531642913818,
+      "learning_rate": 0.00019635259574926912,
+      "loss": 0.2371,
+      "step": 12300
+    },
+    {
+      "epoch": 0.34252155493692127,
+      "grad_norm": 0.18667016923427582,
+      "learning_rate": 0.00019631242292870993,
+      "loss": 0.24,
+      "step": 12350
+    },
+    {
+      "epoch": 0.34390828187998573,
+      "grad_norm": 0.1689510941505432,
+      "learning_rate": 0.0001962720342398561,
+      "loss": 0.2359,
+      "step": 12400
+    },
+    {
+      "epoch": 0.3452950088230502,
+      "grad_norm": 0.1622210294008255,
+      "learning_rate": 0.0001962314297732321,
+      "loss": 0.2405,
+      "step": 12450
+    },
+    {
+      "epoch": 0.3466817357661146,
+      "grad_norm": 0.20153377950191498,
+      "learning_rate": 0.0001961906096198462,
+      "loss": 0.2368,
+      "step": 12500
+    },
+    {
+      "epoch": 0.34806846270917907,
+      "grad_norm": 0.1634126603603363,
+      "learning_rate": 0.00019614957387118994,
+      "loss": 0.236,
+      "step": 12550
+    },
+    {
+      "epoch": 0.34945518965224354,
+      "grad_norm": 0.21276158094406128,
+      "learning_rate": 0.00019610832261923817,
+      "loss": 0.2397,
+      "step": 12600
+    },
+    {
+      "epoch": 0.350841916595308,
+      "grad_norm": 0.16108940541744232,
+      "learning_rate": 0.00019606685595644865,
+      "loss": 0.2424,
+      "step": 12650
+    },
+    {
+      "epoch": 0.35222864353837247,
+      "grad_norm": 0.20505978167057037,
+      "learning_rate": 0.00019602517397576205,
+      "loss": 0.2423,
+      "step": 12700
+    },
+    {
+      "epoch": 0.35361537048143693,
+      "grad_norm": 0.1431368589401245,
+      "learning_rate": 0.0001959832767706016,
+      "loss": 0.2353,
+      "step": 12750
+    },
+    {
+      "epoch": 0.3550020974245014,
+      "grad_norm": 0.1670791357755661,
+      "learning_rate": 0.00019594116443487293,
+      "loss": 0.2366,
+      "step": 12800
+    },
+    {
+      "epoch": 0.35638882436756586,
+      "grad_norm": 0.1353309154510498,
+      "learning_rate": 0.00019589883706296385,
+      "loss": 0.2387,
+      "step": 12850
+    },
+    {
+      "epoch": 0.3577755513106303,
+      "grad_norm": 0.16561363637447357,
+      "learning_rate": 0.00019585629474974415,
+      "loss": 0.2373,
+      "step": 12900
+    },
+    {
+      "epoch": 0.35916227825369473,
+      "grad_norm": 0.16978101432323456,
+      "learning_rate": 0.00019581353759056528,
+      "loss": 0.2383,
+      "step": 12950
+    },
+    {
+      "epoch": 0.3605490051967592,
+      "grad_norm": 0.13398033380508423,
+      "learning_rate": 0.0001957705656812604,
+      "loss": 0.2389,
+      "step": 13000
+    },
+    {
+      "epoch": 0.3605490051967592,
+      "eval_loss": 0.2349192500114441,
+      "eval_runtime": 500.9767,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 13000
+    },
+    {
+      "epoch": 0.36193573213982366,
+      "grad_norm": 0.17141664028167725,
+      "learning_rate": 0.00019572737911814387,
+      "loss": 0.2379,
+      "step": 13050
+    },
+    {
+      "epoch": 0.3633224590828881,
+      "grad_norm": 0.25635290145874023,
+      "learning_rate": 0.00019568397799801118,
+      "loss": 0.2354,
+      "step": 13100
+    },
+    {
+      "epoch": 0.3647091860259526,
+      "grad_norm": 0.19244590401649475,
+      "learning_rate": 0.00019564036241813876,
+      "loss": 0.2372,
+      "step": 13150
+    },
+    {
+      "epoch": 0.36609591296901706,
+      "grad_norm": 0.1587456613779068,
+      "learning_rate": 0.00019559653247628364,
+      "loss": 0.2399,
+      "step": 13200
+    },
+    {
+      "epoch": 0.3674826399120815,
+      "grad_norm": 0.22146746516227722,
+      "learning_rate": 0.0001955524882706834,
+      "loss": 0.2356,
+      "step": 13250
+    },
+    {
+      "epoch": 0.368869366855146,
+      "grad_norm": 0.21101641654968262,
+      "learning_rate": 0.0001955082299000558,
+      "loss": 0.2425,
+      "step": 13300
+    },
+    {
+      "epoch": 0.37025609379821045,
+      "grad_norm": 0.16459371149539948,
+      "learning_rate": 0.0001954637574635986,
+      "loss": 0.239,
+      "step": 13350
+    },
+    {
+      "epoch": 0.37164282074127486,
+      "grad_norm": 0.15547959506511688,
+      "learning_rate": 0.0001954190710609894,
+      "loss": 0.2358,
+      "step": 13400
+    },
+    {
+      "epoch": 0.3730295476843393,
+      "grad_norm": 0.1342894285917282,
+      "learning_rate": 0.00019537417079238534,
+      "loss": 0.2363,
+      "step": 13450
+    },
+    {
+      "epoch": 0.3744162746274038,
+      "grad_norm": 0.14169098436832428,
+      "learning_rate": 0.0001953290567584229,
+      "loss": 0.2355,
+      "step": 13500
+    },
+    {
+      "epoch": 0.37580300157046825,
+      "grad_norm": 0.17943793535232544,
+      "learning_rate": 0.00019528372906021772,
+      "loss": 0.2354,
+      "step": 13550
+    },
+    {
+      "epoch": 0.3771897285135327,
+      "grad_norm": 0.20254671573638916,
+      "learning_rate": 0.0001952381877993643,
+      "loss": 0.2411,
+      "step": 13600
+    },
+    {
+      "epoch": 0.3785764554565972,
+      "grad_norm": 0.1362125426530838,
+      "learning_rate": 0.0001951924330779358,
+      "loss": 0.2383,
+      "step": 13650
+    },
+    {
+      "epoch": 0.37996318239966165,
+      "grad_norm": 0.19201667606830597,
+      "learning_rate": 0.0001951464649984838,
+      "loss": 0.2398,
+      "step": 13700
+    },
+    {
+      "epoch": 0.3813499093427261,
+      "grad_norm": 0.15204668045043945,
+      "learning_rate": 0.0001951002836640382,
+      "loss": 0.2347,
+      "step": 13750
+    },
+    {
+      "epoch": 0.3827366362857906,
+      "grad_norm": 0.14426596462726593,
+      "learning_rate": 0.00019505388917810665,
+      "loss": 0.2399,
+      "step": 13800
+    },
+    {
+      "epoch": 0.38412336322885504,
+      "grad_norm": 0.1463170200586319,
+      "learning_rate": 0.0001950072816446748,
+      "loss": 0.2316,
+      "step": 13850
+    },
+    {
+      "epoch": 0.38551009017191945,
+      "grad_norm": 0.15552669763565063,
+      "learning_rate": 0.00019496046116820566,
+      "loss": 0.2354,
+      "step": 13900
+    },
+    {
+      "epoch": 0.3868968171149839,
+      "grad_norm": 0.16742919385433197,
+      "learning_rate": 0.00019491342785363952,
+      "loss": 0.2388,
+      "step": 13950
+    },
+    {
+      "epoch": 0.3882835440580484,
+      "grad_norm": 0.16111566126346588,
+      "learning_rate": 0.00019486618180639375,
+      "loss": 0.2385,
+      "step": 14000
+    },
+    {
+      "epoch": 0.3882835440580484,
+      "eval_loss": 0.23382489383220673,
+      "eval_runtime": 500.6533,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 14000
+    },
+    {
+      "epoch": 0.38967027100111284,
+      "grad_norm": 0.15741662681102753,
+      "learning_rate": 0.00019481872313236256,
+      "loss": 0.2374,
+      "step": 14050
+    },
+    {
+      "epoch": 0.3910569979441773,
+      "grad_norm": 0.15046770870685577,
+      "learning_rate": 0.00019477105193791664,
+      "loss": 0.2379,
+      "step": 14100
+    },
+    {
+      "epoch": 0.3924437248872418,
+      "grad_norm": 0.14219743013381958,
+      "learning_rate": 0.00019472316832990308,
+      "loss": 0.2434,
+      "step": 14150
+    },
+    {
+      "epoch": 0.39383045183030624,
+      "grad_norm": 0.15226851403713226,
+      "learning_rate": 0.000194675072415645,
+      "loss": 0.2427,
+      "step": 14200
+    },
+    {
+      "epoch": 0.3952171787733707,
+      "grad_norm": 0.19782114028930664,
+      "learning_rate": 0.00019462676430294143,
+      "loss": 0.2357,
+      "step": 14250
+    },
+    {
+      "epoch": 0.39660390571643517,
+      "grad_norm": 0.14243118464946747,
+      "learning_rate": 0.00019457824410006692,
+      "loss": 0.2343,
+      "step": 14300
+    },
+    {
+      "epoch": 0.3979906326594996,
+      "grad_norm": 0.22301803529262543,
+      "learning_rate": 0.00019452951191577155,
+      "loss": 0.2406,
+      "step": 14350
+    },
+    {
+      "epoch": 0.39937735960256404,
+      "grad_norm": 0.13103021681308746,
+      "learning_rate": 0.00019448056785928032,
+      "loss": 0.2398,
+      "step": 14400
+    },
+    {
+      "epoch": 0.4007640865456285,
+      "grad_norm": 0.16922806203365326,
+      "learning_rate": 0.00019443141204029325,
+      "loss": 0.2363,
+      "step": 14450
+    },
+    {
+      "epoch": 0.40215081348869297,
+      "grad_norm": 0.17801126837730408,
+      "learning_rate": 0.00019438204456898492,
+      "loss": 0.2377,
+      "step": 14500
+    },
+    {
+      "epoch": 0.40353754043175744,
+      "grad_norm": 0.14513610303401947,
+      "learning_rate": 0.0001943324655560043,
+      "loss": 0.241,
+      "step": 14550
+    },
+    {
+      "epoch": 0.4049242673748219,
+      "grad_norm": 0.14587055146694183,
+      "learning_rate": 0.00019428267511247457,
+      "loss": 0.2345,
+      "step": 14600
+    },
+    {
+      "epoch": 0.40631099431788636,
+      "grad_norm": 0.17200471460819244,
+      "learning_rate": 0.00019423267334999267,
+      "loss": 0.2345,
+      "step": 14650
+    },
+    {
+      "epoch": 0.40769772126095083,
+      "grad_norm": 0.16612234711647034,
+      "learning_rate": 0.00019418246038062928,
+      "loss": 0.235,
+      "step": 14700
+    },
+    {
+      "epoch": 0.4090844482040153,
+      "grad_norm": 0.14822156727313995,
+      "learning_rate": 0.00019413203631692843,
+      "loss": 0.2384,
+      "step": 14750
+    },
+    {
+      "epoch": 0.4104711751470797,
+      "grad_norm": 0.15960198640823364,
+      "learning_rate": 0.00019408140127190725,
+      "loss": 0.2375,
+      "step": 14800
+    },
+    {
+      "epoch": 0.41185790209014417,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019403157434308126,
+      "loss": 0.233,
+      "step": 14850
+    },
+    {
+      "epoch": 0.41324462903320863,
+      "grad_norm": 0.15910230576992035,
+      "learning_rate": 0.00019398154500404588,
+      "loss": 0.2728,
+      "step": 14900
+    },
+    {
+      "epoch": 0.4146313559762731,
+      "grad_norm": 0.16004903614521027,
+      "learning_rate": 0.0001939302861212685,
+      "loss": 0.2359,
+      "step": 14950
+    },
+    {
+      "epoch": 0.41601808291933756,
+      "grad_norm": 0.1622370034456253,
+      "learning_rate": 0.00019387881670936035,
+      "loss": 0.2413,
+      "step": 15000
+    },
+    {
+      "epoch": 0.41601808291933756,
+      "eval_loss": 0.23365913331508636,
+      "eval_runtime": 500.916,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 15000
+    },
+    {
+      "epoch": 0.417404809862402,
+      "grad_norm": 0.1744803488254547,
+      "learning_rate": 0.00019382713688368162,
+      "loss": 0.2406,
+      "step": 15050
+    },
+    {
+      "epoch": 0.4187915368054665,
+      "grad_norm": 0.19140714406967163,
+      "learning_rate": 0.00019377524676006397,
+      "loss": 0.2385,
+      "step": 15100
+    },
+    {
+      "epoch": 0.42017826374853096,
+      "grad_norm": 0.14320451021194458,
+      "learning_rate": 0.00019372314645481052,
+      "loss": 0.2384,
+      "step": 15150
+    },
+    {
+      "epoch": 0.4215649906915954,
+      "grad_norm": 0.18620997667312622,
+      "learning_rate": 0.00019367083608469546,
+      "loss": 0.2343,
+      "step": 15200
+    },
+    {
+      "epoch": 0.42295171763465983,
+      "grad_norm": 0.13473859429359436,
+      "learning_rate": 0.00019361831576696382,
+      "loss": 0.2399,
+      "step": 15250
+    },
+    {
+      "epoch": 0.4243384445777243,
+      "grad_norm": 0.15213748812675476,
+      "learning_rate": 0.00019356558561933108,
+      "loss": 0.2358,
+      "step": 15300
+    },
+    {
+      "epoch": 0.42572517152078876,
+      "grad_norm": 0.16841459274291992,
+      "learning_rate": 0.0001935126457599832,
+      "loss": 0.2332,
+      "step": 15350
+    },
+    {
+      "epoch": 0.4271118984638532,
+      "grad_norm": 0.14978626370429993,
+      "learning_rate": 0.00019345949630757603,
+      "loss": 0.2382,
+      "step": 15400
+    },
+    {
+      "epoch": 0.4284986254069177,
+      "grad_norm": 0.18397267162799835,
+      "learning_rate": 0.00019340613738123526,
+      "loss": 0.2328,
+      "step": 15450
+    },
+    {
+      "epoch": 0.42988535234998215,
+      "grad_norm": 0.13535378873348236,
+      "learning_rate": 0.000193352569100556,
+      "loss": 0.2278,
+      "step": 15500
+    },
+    {
+      "epoch": 0.4312720792930466,
+      "grad_norm": 0.1288972645998001,
+      "learning_rate": 0.00019329879158560274,
+      "loss": 0.2385,
+      "step": 15550
+    },
+    {
+      "epoch": 0.4326588062361111,
+      "grad_norm": 0.1488959789276123,
+      "learning_rate": 0.0001932448049569088,
+      "loss": 0.2352,
+      "step": 15600
+    },
+    {
+      "epoch": 0.43404553317917555,
+      "grad_norm": 0.16358473896980286,
+      "learning_rate": 0.00019319060933547624,
+      "loss": 0.2362,
+      "step": 15650
+    },
+    {
+      "epoch": 0.43543226012223996,
+      "grad_norm": 0.13347339630126953,
+      "learning_rate": 0.00019313620484277553,
+      "loss": 0.2376,
+      "step": 15700
+    },
+    {
+      "epoch": 0.4368189870653044,
+      "grad_norm": 0.13555756211280823,
+      "learning_rate": 0.0001930815916007453,
+      "loss": 0.2308,
+      "step": 15750
+    },
+    {
+      "epoch": 0.4382057140083689,
+      "grad_norm": 0.13955436646938324,
+      "learning_rate": 0.0001930267697317921,
+      "loss": 0.2329,
+      "step": 15800
+    },
+    {
+      "epoch": 0.43959244095143335,
+      "grad_norm": 0.1596931517124176,
+      "learning_rate": 0.00019297173935879,
+      "loss": 0.2322,
+      "step": 15850
+    },
+    {
+      "epoch": 0.4409791678944978,
+      "grad_norm": 0.14860297739505768,
+      "learning_rate": 0.00019291650060508045,
+      "loss": 0.234,
+      "step": 15900
+    },
+    {
+      "epoch": 0.4423658948375623,
+      "grad_norm": 0.14575625956058502,
+      "learning_rate": 0.00019286105359447194,
+      "loss": 0.2362,
+      "step": 15950
+    },
+    {
+      "epoch": 0.44375262178062674,
+      "grad_norm": 0.1400967240333557,
+      "learning_rate": 0.00019280539845123974,
+      "loss": 0.2358,
+      "step": 16000
+    },
+    {
+      "epoch": 0.44375262178062674,
+      "eval_loss": 0.23256094753742218,
+      "eval_runtime": 500.6637,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 16000
+    },
+    {
+      "epoch": 0.4451393487236912,
+      "grad_norm": 0.2537101209163666,
+      "learning_rate": 0.00019274953530012563,
+      "loss": 0.2363,
+      "step": 16050
+    },
+    {
+      "epoch": 0.4465260756667557,
+      "grad_norm": 0.192925825715065,
+      "learning_rate": 0.0001926934642663375,
+      "loss": 0.2343,
+      "step": 16100
+    },
+    {
+      "epoch": 0.4479128026098201,
+      "grad_norm": 0.17011120915412903,
+      "learning_rate": 0.0001926371854755493,
+      "loss": 0.2362,
+      "step": 16150
+    },
+    {
+      "epoch": 0.44929952955288455,
+      "grad_norm": 0.1474524289369583,
+      "learning_rate": 0.00019258069905390065,
+      "loss": 0.2359,
+      "step": 16200
+    },
+    {
+      "epoch": 0.450686256495949,
+      "grad_norm": 0.15591026842594147,
+      "learning_rate": 0.00019252400512799643,
+      "loss": 0.2338,
+      "step": 16250
+    },
+    {
+      "epoch": 0.4520729834390135,
+      "grad_norm": 0.14443908631801605,
+      "learning_rate": 0.00019246710382490664,
+      "loss": 0.2421,
+      "step": 16300
+    },
+    {
+      "epoch": 0.45345971038207794,
+      "grad_norm": 0.12614597380161285,
+      "learning_rate": 0.00019240999527216608,
+      "loss": 0.2373,
+      "step": 16350
+    },
+    {
+      "epoch": 0.4548464373251424,
+      "grad_norm": 0.1438266485929489,
+      "learning_rate": 0.00019235267959777415,
+      "loss": 0.2443,
+      "step": 16400
+    },
+    {
+      "epoch": 0.45623316426820687,
+      "grad_norm": 0.14473649859428406,
+      "learning_rate": 0.00019229515693019436,
+      "loss": 0.241,
+      "step": 16450
+    },
+    {
+      "epoch": 0.45761989121127133,
+      "grad_norm": 0.13498128950595856,
+      "learning_rate": 0.00019223742739835423,
+      "loss": 0.2393,
+      "step": 16500
+    },
+    {
+      "epoch": 0.4590066181543358,
+      "grad_norm": 0.14498169720172882,
+      "learning_rate": 0.0001921794911316449,
+      "loss": 0.2363,
+      "step": 16550
+    },
+    {
+      "epoch": 0.4603933450974002,
+      "grad_norm": 0.14319288730621338,
+      "learning_rate": 0.00019212134825992091,
+      "loss": 0.2359,
+      "step": 16600
+    },
+    {
+      "epoch": 0.4617800720404647,
+      "grad_norm": 0.12314629554748535,
+      "learning_rate": 0.00019206299891349983,
+      "loss": 0.23,
+      "step": 16650
+    },
+    {
+      "epoch": 0.46316679898352914,
+      "grad_norm": 0.14780518412590027,
+      "learning_rate": 0.00019200444322316207,
+      "loss": 0.2381,
+      "step": 16700
+    },
+    {
+      "epoch": 0.4645535259265936,
+      "grad_norm": 0.1493334025144577,
+      "learning_rate": 0.0001919456813201504,
+      "loss": 0.2345,
+      "step": 16750
+    },
+    {
+      "epoch": 0.46594025286965807,
+      "grad_norm": 0.11972863227128983,
+      "learning_rate": 0.00019188671333616992,
+      "loss": 0.235,
+      "step": 16800
+    },
+    {
+      "epoch": 0.46732697981272253,
+      "grad_norm": 0.13366112112998962,
+      "learning_rate": 0.00019182753940338753,
+      "loss": 0.2306,
+      "step": 16850
+    },
+    {
+      "epoch": 0.468713706755787,
+      "grad_norm": 0.13790684938430786,
+      "learning_rate": 0.00019176815965443186,
+      "loss": 0.2366,
+      "step": 16900
+    },
+    {
+      "epoch": 0.47010043369885146,
+      "grad_norm": 0.14081595838069916,
+      "learning_rate": 0.0001917085742223926,
+      "loss": 0.2368,
+      "step": 16950
+    },
+    {
+      "epoch": 0.4714871606419159,
+      "grad_norm": 0.13987073302268982,
+      "learning_rate": 0.00019164878324082074,
+      "loss": 0.2337,
+      "step": 17000
+    },
+    {
+      "epoch": 0.4714871606419159,
+      "eval_loss": 0.2317454218864441,
+      "eval_runtime": 500.9301,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 17000
+    },
+    {
+      "epoch": 0.47287388758498033,
+      "grad_norm": 0.1430695503950119,
+      "learning_rate": 0.00019158878684372778,
+      "loss": 0.2346,
+      "step": 17050
+    },
+    {
+      "epoch": 0.4742606145280448,
+      "grad_norm": 0.14264121651649475,
+      "learning_rate": 0.00019152858516558564,
+      "loss": 0.2339,
+      "step": 17100
+    },
+    {
+      "epoch": 0.47564734147110926,
+      "grad_norm": 0.15278013050556183,
+      "learning_rate": 0.00019146817834132644,
+      "loss": 0.2333,
+      "step": 17150
+    },
+    {
+      "epoch": 0.47703406841417373,
+      "grad_norm": 0.15283286571502686,
+      "learning_rate": 0.000191407566506342,
+      "loss": 0.2323,
+      "step": 17200
+    },
+    {
+      "epoch": 0.4784207953572382,
+      "grad_norm": 0.13433212041854858,
+      "learning_rate": 0.00019134674979648367,
+      "loss": 0.2406,
+      "step": 17250
+    },
+    {
+      "epoch": 0.47980752230030266,
+      "grad_norm": 0.14129064977169037,
+      "learning_rate": 0.00019128572834806203,
+      "loss": 0.2353,
+      "step": 17300
+    },
+    {
+      "epoch": 0.4811942492433671,
+      "grad_norm": 0.14736846089363098,
+      "learning_rate": 0.00019122450229784653,
+      "loss": 0.2312,
+      "step": 17350
+    },
+    {
+      "epoch": 0.4825809761864316,
+      "grad_norm": 0.14513076841831207,
+      "learning_rate": 0.00019116307178306514,
+      "loss": 0.2358,
+      "step": 17400
+    },
+    {
+      "epoch": 0.48396770312949605,
+      "grad_norm": 0.14358818531036377,
+      "learning_rate": 0.0001911014369414042,
+      "loss": 0.2376,
+      "step": 17450
+    },
+    {
+      "epoch": 0.48535443007256046,
+      "grad_norm": 0.14574295282363892,
+      "learning_rate": 0.00019103959791100792,
+      "loss": 0.2306,
+      "step": 17500
+    },
+    {
+      "epoch": 0.4867411570156249,
+      "grad_norm": 0.1347060352563858,
+      "learning_rate": 0.00019097755483047827,
+      "loss": 0.2341,
+      "step": 17550
+    },
+    {
+      "epoch": 0.4881278839586894,
+      "grad_norm": 0.1792859435081482,
+      "learning_rate": 0.00019091530783887448,
+      "loss": 0.2392,
+      "step": 17600
+    },
+    {
+      "epoch": 0.48951461090175385,
+      "grad_norm": 0.11206398904323578,
+      "learning_rate": 0.00019085285707571282,
+      "loss": 0.236,
+      "step": 17650
+    },
+    {
+      "epoch": 0.4909013378448183,
+      "grad_norm": 0.16337329149246216,
+      "learning_rate": 0.0001907902026809663,
+      "loss": 0.239,
+      "step": 17700
+    },
+    {
+      "epoch": 0.4922880647878828,
+      "grad_norm": 0.14579764008522034,
+      "learning_rate": 0.0001907273447950644,
+      "loss": 0.2258,
+      "step": 17750
+    },
+    {
+      "epoch": 0.49367479173094725,
+      "grad_norm": 0.1381896585226059,
+      "learning_rate": 0.00019066428355889257,
+      "loss": 0.2366,
+      "step": 17800
+    },
+    {
+      "epoch": 0.4950615186740117,
+      "grad_norm": 0.13557949662208557,
+      "learning_rate": 0.00019060101911379208,
+      "loss": 0.236,
+      "step": 17850
+    },
+    {
+      "epoch": 0.4964482456170762,
+      "grad_norm": 0.13205058872699738,
+      "learning_rate": 0.00019053755160155974,
+      "loss": 0.237,
+      "step": 17900
+    },
+    {
+      "epoch": 0.4978349725601406,
+      "grad_norm": 0.1766868382692337,
+      "learning_rate": 0.00019047388116444735,
+      "loss": 0.241,
+      "step": 17950
+    },
+    {
+      "epoch": 0.49922169950320505,
+      "grad_norm": 0.1567864567041397,
+      "learning_rate": 0.00019041000794516171,
+      "loss": 0.2269,
+      "step": 18000
+    },
+    {
+      "epoch": 0.49922169950320505,
+      "eval_loss": 0.23145872354507446,
+      "eval_runtime": 500.5681,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 18000
+    },
+    {
+      "epoch": 0.5006084264462696,
+      "grad_norm": 0.13615478575229645,
+      "learning_rate": 0.00019034593208686396,
+      "loss": 0.2347,
+      "step": 18050
+    },
+    {
+      "epoch": 0.501995153389334,
+      "grad_norm": 0.13786327838897705,
+      "learning_rate": 0.00019028165373316948,
+      "loss": 0.2335,
+      "step": 18100
+    },
+    {
+      "epoch": 0.5033818803323985,
+      "grad_norm": 0.14584092795848846,
+      "learning_rate": 0.0001902171730281476,
+      "loss": 0.2392,
+      "step": 18150
+    },
+    {
+      "epoch": 0.5047686072754629,
+      "grad_norm": 0.18500222265720367,
+      "learning_rate": 0.000190152490116321,
+      "loss": 0.2336,
+      "step": 18200
+    },
+    {
+      "epoch": 0.5061553342185273,
+      "grad_norm": 0.14118489623069763,
+      "learning_rate": 0.0001900876051426658,
+      "loss": 0.2362,
+      "step": 18250
+    },
+    {
+      "epoch": 0.5075420611615918,
+      "grad_norm": 0.18030238151550293,
+      "learning_rate": 0.00019002251825261078,
+      "loss": 0.2363,
+      "step": 18300
+    },
+    {
+      "epoch": 0.5089287881046562,
+      "grad_norm": 0.1916930228471756,
+      "learning_rate": 0.00018995722959203745,
+      "loss": 0.2342,
+      "step": 18350
+    },
+    {
+      "epoch": 0.5103155150477208,
+      "grad_norm": 0.1503581702709198,
+      "learning_rate": 0.00018989173930727951,
+      "loss": 0.2365,
+      "step": 18400
+    },
+    {
+      "epoch": 0.5117022419907852,
+      "grad_norm": 0.14816977083683014,
+      "learning_rate": 0.0001898260475451225,
+      "loss": 0.2387,
+      "step": 18450
+    },
+    {
+      "epoch": 0.5130889689338497,
+      "grad_norm": 0.13476118445396423,
+      "learning_rate": 0.00018976015445280363,
+      "loss": 0.2343,
+      "step": 18500
+    },
+    {
+      "epoch": 0.5144756958769141,
+      "grad_norm": 0.17522576451301575,
+      "learning_rate": 0.00018969406017801127,
+      "loss": 0.2299,
+      "step": 18550
+    },
+    {
+      "epoch": 0.5158624228199786,
+      "grad_norm": 0.13437584042549133,
+      "learning_rate": 0.00018962776486888485,
+      "loss": 0.2342,
+      "step": 18600
+    },
+    {
+      "epoch": 0.517249149763043,
+      "grad_norm": 0.14156264066696167,
+      "learning_rate": 0.0001895612686740142,
+      "loss": 0.2363,
+      "step": 18650
+    },
+    {
+      "epoch": 0.5186358767061074,
+      "grad_norm": 0.11037924140691757,
+      "learning_rate": 0.00018949457174243954,
+      "loss": 0.2343,
+      "step": 18700
+    },
+    {
+      "epoch": 0.520022603649172,
+      "grad_norm": 0.1362009048461914,
+      "learning_rate": 0.00018942767422365094,
+      "loss": 0.2363,
+      "step": 18750
+    },
+    {
+      "epoch": 0.5214093305922364,
+      "grad_norm": 0.1261095106601715,
+      "learning_rate": 0.00018936057626758808,
+      "loss": 0.2341,
+      "step": 18800
+    },
+    {
+      "epoch": 0.5227960575353009,
+      "grad_norm": 0.13382628560066223,
+      "learning_rate": 0.00018929327802463987,
+      "loss": 0.2309,
+      "step": 18850
+    },
+    {
+      "epoch": 0.5241827844783653,
+      "grad_norm": 0.15190520882606506,
+      "learning_rate": 0.00018922577964564417,
+      "loss": 0.2338,
+      "step": 18900
+    },
+    {
+      "epoch": 0.5255695114214298,
+      "grad_norm": 0.13708838820457458,
+      "learning_rate": 0.00018915808128188734,
+      "loss": 0.2338,
+      "step": 18950
+    },
+    {
+      "epoch": 0.5269562383644942,
+      "grad_norm": 0.20378737151622772,
+      "learning_rate": 0.0001890901830851041,
+      "loss": 0.2341,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5269562383644942,
+      "eval_loss": 0.23116359114646912,
+      "eval_runtime": 500.7638,
+      "eval_samples_per_second": 5.705,
+      "eval_steps_per_second": 5.705,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5283429653075588,
+      "grad_norm": 0.17179715633392334,
+      "learning_rate": 0.00018902208520747685,
+      "loss": 0.2363,
+      "step": 19050
+    },
+    {
+      "epoch": 0.5297296922506232,
+      "grad_norm": 0.13991795480251312,
+      "learning_rate": 0.00018895378780163578,
+      "loss": 0.2308,
+      "step": 19100
+    },
+    {
+      "epoch": 0.5311164191936876,
+      "grad_norm": 0.11662200093269348,
+      "learning_rate": 0.0001888852910206581,
+      "loss": 0.2354,
+      "step": 19150
+    },
+    {
+      "epoch": 0.5325031461367521,
+      "grad_norm": 0.1577063351869583,
+      "learning_rate": 0.00018881659501806804,
+      "loss": 0.2331,
+      "step": 19200
+    },
+    {
+      "epoch": 0.5338898730798165,
+      "grad_norm": 0.14893421530723572,
+      "learning_rate": 0.0001887476999478362,
+      "loss": 0.2345,
+      "step": 19250
+    },
+    {
+      "epoch": 0.535276600022881,
+      "grad_norm": 0.14458926022052765,
+      "learning_rate": 0.00018867860596437946,
+      "loss": 0.2364,
+      "step": 19300
+    },
+    {
+      "epoch": 0.5366633269659454,
+      "grad_norm": 0.18197046220302582,
+      "learning_rate": 0.00018860931322256056,
+      "loss": 0.2316,
+      "step": 19350
+    },
+    {
+      "epoch": 0.53805005390901,
+      "grad_norm": 0.12696345150470734,
+      "learning_rate": 0.0001885398218776876,
+      "loss": 0.2288,
+      "step": 19400
+    },
+    {
+      "epoch": 0.5394367808520744,
+      "grad_norm": 0.14459608495235443,
+      "learning_rate": 0.00018847013208551393,
+      "loss": 0.2342,
+      "step": 19450
+    },
+    {
+      "epoch": 0.5408235077951389,
+      "grad_norm": 0.13681089878082275,
+      "learning_rate": 0.00018840024400223758,
+      "loss": 0.2341,
+      "step": 19500
+    },
+    {
+      "epoch": 0.5422102347382033,
+      "grad_norm": 0.1358567178249359,
+      "learning_rate": 0.00018833015778450113,
+      "loss": 0.239,
+      "step": 19550
+    },
+    {
+      "epoch": 0.5435969616812677,
+      "grad_norm": 0.1429983228445053,
+      "learning_rate": 0.0001882598735893912,
+      "loss": 0.234,
+      "step": 19600
+    },
+    {
+      "epoch": 0.5449836886243322,
+      "grad_norm": 0.15259206295013428,
+      "learning_rate": 0.00018818939157443806,
+      "loss": 0.2333,
+      "step": 19650
+    },
+    {
+      "epoch": 0.5463704155673966,
+      "grad_norm": 0.1499055027961731,
+      "learning_rate": 0.00018811871189761554,
+      "loss": 0.2335,
+      "step": 19700
+    },
+    {
+      "epoch": 0.5477571425104611,
+      "grad_norm": 0.15547756850719452,
+      "learning_rate": 0.0001880478347173403,
+      "loss": 0.2331,
+      "step": 19750
+    },
+    {
+      "epoch": 0.5491438694535256,
+      "grad_norm": 0.13615499436855316,
+      "learning_rate": 0.00018797676019247187,
+      "loss": 0.2327,
+      "step": 19800
+    },
+    {
+      "epoch": 0.5505305963965901,
+      "grad_norm": 0.15891136229038239,
+      "learning_rate": 0.00018790548848231188,
+      "loss": 0.2293,
+      "step": 19850
+    },
+    {
+      "epoch": 0.5519173233396545,
+      "grad_norm": 0.1028260812163353,
+      "learning_rate": 0.0001878340197466041,
+      "loss": 0.2337,
+      "step": 19900
+    },
+    {
+      "epoch": 0.553304050282719,
+      "grad_norm": 0.15393692255020142,
+      "learning_rate": 0.0001877623541455338,
+      "loss": 0.2332,
+      "step": 19950
+    },
+    {
+      "epoch": 0.5546907772257834,
+      "grad_norm": 0.11807084083557129,
+      "learning_rate": 0.0001876904918397275,
+      "loss": 0.2352,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5546907772257834,
+      "eval_loss": 0.2310873419046402,
+      "eval_runtime": 501.0545,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5560775041688478,
+      "grad_norm": 0.1603621393442154,
+      "learning_rate": 0.00018761843299025267,
+      "loss": 0.2347,
+      "step": 20050
+    },
+    {
+      "epoch": 0.5574642311119123,
+      "grad_norm": 0.14295394718647003,
+      "learning_rate": 0.00018754617775861718,
+      "loss": 0.2335,
+      "step": 20100
+    },
+    {
+      "epoch": 0.5588509580549768,
+      "grad_norm": 0.1290232539176941,
+      "learning_rate": 0.0001874737263067692,
+      "loss": 0.2337,
+      "step": 20150
+    },
+    {
+      "epoch": 0.5602376849980413,
+      "grad_norm": 0.16112935543060303,
+      "learning_rate": 0.00018740107879709655,
+      "loss": 0.2354,
+      "step": 20200
+    },
+    {
+      "epoch": 0.5616244119411057,
+      "grad_norm": 0.13674217462539673,
+      "learning_rate": 0.00018732823539242664,
+      "loss": 0.23,
+      "step": 20250
+    },
+    {
+      "epoch": 0.5630111388841702,
+      "grad_norm": 0.18549004197120667,
+      "learning_rate": 0.00018725519625602578,
+      "loss": 0.2353,
+      "step": 20300
+    },
+    {
+      "epoch": 0.5643978658272346,
+      "grad_norm": 0.13107050955295563,
+      "learning_rate": 0.0001871819615515991,
+      "loss": 0.2392,
+      "step": 20350
+    },
+    {
+      "epoch": 0.5657845927702991,
+      "grad_norm": 0.13590605556964874,
+      "learning_rate": 0.00018710853144329002,
+      "loss": 0.2347,
+      "step": 20400
+    },
+    {
+      "epoch": 0.5671713197133635,
+      "grad_norm": 0.13591018319129944,
+      "learning_rate": 0.0001870349060956799,
+      "loss": 0.229,
+      "step": 20450
+    },
+    {
+      "epoch": 0.568558046656428,
+      "grad_norm": 0.11401943862438202,
+      "learning_rate": 0.00018696108567378773,
+      "loss": 0.2326,
+      "step": 20500
+    },
+    {
+      "epoch": 0.5699447735994925,
+      "grad_norm": 0.18518146872520447,
+      "learning_rate": 0.00018688707034306978,
+      "loss": 0.2351,
+      "step": 20550
+    },
+    {
+      "epoch": 0.5713315005425569,
+      "grad_norm": 0.1642865538597107,
+      "learning_rate": 0.00018681286026941905,
+      "loss": 0.2384,
+      "step": 20600
+    },
+    {
+      "epoch": 0.5727182274856214,
+      "grad_norm": 0.133639395236969,
+      "learning_rate": 0.00018673845561916513,
+      "loss": 0.2324,
+      "step": 20650
+    },
+    {
+      "epoch": 0.5741049544286858,
+      "grad_norm": 0.120590940117836,
+      "learning_rate": 0.00018666385655907367,
+      "loss": 0.2315,
+      "step": 20700
+    },
+    {
+      "epoch": 0.5754916813717503,
+      "grad_norm": 0.15754735469818115,
+      "learning_rate": 0.00018658906325634604,
+      "loss": 0.2388,
+      "step": 20750
+    },
+    {
+      "epoch": 0.5768784083148147,
+      "grad_norm": 0.15975181758403778,
+      "learning_rate": 0.00018651407587861905,
+      "loss": 0.2376,
+      "step": 20800
+    },
+    {
+      "epoch": 0.5782651352578793,
+      "grad_norm": 0.13276700675487518,
+      "learning_rate": 0.0001864388945939644,
+      "loss": 0.2379,
+      "step": 20850
+    },
+    {
+      "epoch": 0.5796518622009437,
+      "grad_norm": 0.16388626396656036,
+      "learning_rate": 0.0001863635195708885,
+      "loss": 0.2332,
+      "step": 20900
+    },
+    {
+      "epoch": 0.5810385891440081,
+      "grad_norm": 0.18847975134849548,
+      "learning_rate": 0.0001862879509783319,
+      "loss": 0.2381,
+      "step": 20950
+    },
+    {
+      "epoch": 0.5824253160870726,
+      "grad_norm": 0.24493199586868286,
+      "learning_rate": 0.00018621218898566907,
+      "loss": 0.2328,
+      "step": 21000
+    },
+    {
+      "epoch": 0.5824253160870726,
+      "eval_loss": 0.23020677268505096,
+      "eval_runtime": 499.9502,
+      "eval_samples_per_second": 5.715,
+      "eval_steps_per_second": 5.715,
+      "step": 21000
+    },
+    {
+      "epoch": 0.583812043030137,
+      "grad_norm": 0.16316668689250946,
+      "learning_rate": 0.00018613623376270794,
+      "loss": 0.2429,
+      "step": 21050
+    },
+    {
+      "epoch": 0.5851987699732015,
+      "grad_norm": 0.13449080288410187,
+      "learning_rate": 0.0001860600854796895,
+      "loss": 0.2298,
+      "step": 21100
+    },
+    {
+      "epoch": 0.5865854969162659,
+      "grad_norm": 0.11589767783880234,
+      "learning_rate": 0.00018598374430728746,
+      "loss": 0.2344,
+      "step": 21150
+    },
+    {
+      "epoch": 0.5879722238593305,
+      "grad_norm": 0.11659828573465347,
+      "learning_rate": 0.0001859072104166079,
+      "loss": 0.2333,
+      "step": 21200
+    },
+    {
+      "epoch": 0.5893589508023949,
+      "grad_norm": 0.155133455991745,
+      "learning_rate": 0.00018583048397918884,
+      "loss": 0.2362,
+      "step": 21250
+    },
+    {
+      "epoch": 0.5907456777454594,
+      "grad_norm": 0.16488181054592133,
+      "learning_rate": 0.00018575356516699977,
+      "loss": 0.2334,
+      "step": 21300
+    },
+    {
+      "epoch": 0.5921324046885238,
+      "grad_norm": 0.18307441473007202,
+      "learning_rate": 0.0001856764541524415,
+      "loss": 0.2272,
+      "step": 21350
+    },
+    {
+      "epoch": 0.5935191316315882,
+      "grad_norm": 0.1316101998090744,
+      "learning_rate": 0.00018559915110834553,
+      "loss": 0.2342,
+      "step": 21400
+    },
+    {
+      "epoch": 0.5949058585746527,
+      "grad_norm": 0.1548035889863968,
+      "learning_rate": 0.00018552165620797382,
+      "loss": 0.2323,
+      "step": 21450
+    },
+    {
+      "epoch": 0.5962925855177171,
+      "grad_norm": 0.13214810192584991,
+      "learning_rate": 0.00018544396962501828,
+      "loss": 0.2319,
+      "step": 21500
+    },
+    {
+      "epoch": 0.5976793124607817,
+      "grad_norm": 0.14733006060123444,
+      "learning_rate": 0.00018536609153360046,
+      "loss": 0.237,
+      "step": 21550
+    },
+    {
+      "epoch": 0.5990660394038461,
+      "grad_norm": 0.14465801417827606,
+      "learning_rate": 0.0001852880221082712,
+      "loss": 0.2318,
+      "step": 21600
+    },
+    {
+      "epoch": 0.6004527663469106,
+      "grad_norm": 0.14646270871162415,
+      "learning_rate": 0.00018520976152401012,
+      "loss": 0.2368,
+      "step": 21650
+    },
+    {
+      "epoch": 0.601839493289975,
+      "grad_norm": 0.14174975454807281,
+      "learning_rate": 0.00018513130995622535,
+      "loss": 0.2349,
+      "step": 21700
+    },
+    {
+      "epoch": 0.6032262202330395,
+      "grad_norm": 0.12805262207984924,
+      "learning_rate": 0.00018505266758075302,
+      "loss": 0.2315,
+      "step": 21750
+    },
+    {
+      "epoch": 0.6046129471761039,
+      "grad_norm": 0.1598140299320221,
+      "learning_rate": 0.00018497383457385697,
+      "loss": 0.2332,
+      "step": 21800
+    },
+    {
+      "epoch": 0.6059996741191683,
+      "grad_norm": 0.13651584088802338,
+      "learning_rate": 0.00018489481111222828,
+      "loss": 0.2348,
+      "step": 21850
+    },
+    {
+      "epoch": 0.6073864010622329,
+      "grad_norm": 0.13091818988323212,
+      "learning_rate": 0.0001848155973729849,
+      "loss": 0.2287,
+      "step": 21900
+    },
+    {
+      "epoch": 0.6087731280052973,
+      "grad_norm": 0.17191646993160248,
+      "learning_rate": 0.00018473619353367128,
+      "loss": 0.2342,
+      "step": 21950
+    },
+    {
+      "epoch": 0.6101598549483618,
+      "grad_norm": 0.10674546658992767,
+      "learning_rate": 0.0001846565997722579,
+      "loss": 0.2309,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6101598549483618,
+      "eval_loss": 0.22999995946884155,
+      "eval_runtime": 499.7816,
+      "eval_samples_per_second": 5.716,
+      "eval_steps_per_second": 5.716,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6115465818914262,
+      "grad_norm": 0.1321185827255249,
+      "learning_rate": 0.000184576816267141,
+      "loss": 0.2347,
+      "step": 22050
+    },
+    {
+      "epoch": 0.6129333088344907,
+      "grad_norm": 0.12945061922073364,
+      "learning_rate": 0.00018449684319714202,
+      "loss": 0.2298,
+      "step": 22100
+    },
+    {
+      "epoch": 0.6143200357775551,
+      "grad_norm": 0.16403023898601532,
+      "learning_rate": 0.00018441668074150732,
+      "loss": 0.2276,
+      "step": 22150
+    },
+    {
+      "epoch": 0.6157067627206196,
+      "grad_norm": 0.14253240823745728,
+      "learning_rate": 0.00018433632907990775,
+      "loss": 0.2315,
+      "step": 22200
+    },
+    {
+      "epoch": 0.617093489663684,
+      "grad_norm": 0.1752641350030899,
+      "learning_rate": 0.00018425578839243814,
+      "loss": 0.2327,
+      "step": 22250
+    },
+    {
+      "epoch": 0.6184802166067485,
+      "grad_norm": 0.11023511737585068,
+      "learning_rate": 0.00018417505885961712,
+      "loss": 0.2341,
+      "step": 22300
+    },
+    {
+      "epoch": 0.619866943549813,
+      "grad_norm": 0.1494046449661255,
+      "learning_rate": 0.00018409414066238654,
+      "loss": 0.2307,
+      "step": 22350
+    },
+    {
+      "epoch": 0.6212536704928774,
+      "grad_norm": 0.13288947939872742,
+      "learning_rate": 0.00018401303398211103,
+      "loss": 0.2307,
+      "step": 22400
+    },
+    {
+      "epoch": 0.6226403974359419,
+      "grad_norm": 0.13972090184688568,
+      "learning_rate": 0.0001839317390005778,
+      "loss": 0.231,
+      "step": 22450
+    },
+    {
+      "epoch": 0.6240271243790063,
+      "grad_norm": 0.16141022741794586,
+      "learning_rate": 0.000183850255899996,
+      "loss": 0.2395,
+      "step": 22500
+    },
+    {
+      "epoch": 0.6254138513220708,
+      "grad_norm": 0.17160941660404205,
+      "learning_rate": 0.00018376858486299647,
+      "loss": 0.2371,
+      "step": 22550
+    },
+    {
+      "epoch": 0.6268005782651352,
+      "grad_norm": 0.13852784037590027,
+      "learning_rate": 0.00018368672607263132,
+      "loss": 0.2286,
+      "step": 22600
+    },
+    {
+      "epoch": 0.6281873052081998,
+      "grad_norm": 0.16050252318382263,
+      "learning_rate": 0.00018360467971237338,
+      "loss": 0.2345,
+      "step": 22650
+    },
+    {
+      "epoch": 0.6295740321512642,
+      "grad_norm": 0.12499688565731049,
+      "learning_rate": 0.0001835224459661159,
+      "loss": 0.232,
+      "step": 22700
+    },
+    {
+      "epoch": 0.6309607590943286,
+      "grad_norm": 0.16804257035255432,
+      "learning_rate": 0.00018344002501817226,
+      "loss": 0.2336,
+      "step": 22750
+    },
+    {
+      "epoch": 0.6323474860373931,
+      "grad_norm": 0.15330076217651367,
+      "learning_rate": 0.00018335741705327526,
+      "loss": 0.2314,
+      "step": 22800
+    },
+    {
+      "epoch": 0.6337342129804575,
+      "grad_norm": 0.12613581120967865,
+      "learning_rate": 0.00018327462225657692,
+      "loss": 0.235,
+      "step": 22850
+    },
+    {
+      "epoch": 0.635120939923522,
+      "grad_norm": 0.16671714186668396,
+      "learning_rate": 0.00018319164081364802,
+      "loss": 0.2319,
+      "step": 22900
+    },
+    {
+      "epoch": 0.6365076668665864,
+      "grad_norm": 0.11536330729722977,
+      "learning_rate": 0.00018310847291047776,
+      "loss": 0.2296,
+      "step": 22950
+    },
+    {
+      "epoch": 0.637894393809651,
+      "grad_norm": 0.1565777063369751,
+      "learning_rate": 0.00018302511873347305,
+      "loss": 0.23,
+      "step": 23000
+    },
+    {
+      "epoch": 0.637894393809651,
+      "eval_loss": 0.22944478690624237,
+      "eval_runtime": 500.3715,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 23000
+    },
+    {
+      "epoch": 0.6392811207527154,
+      "grad_norm": 0.18740278482437134,
+      "learning_rate": 0.00018294157846945853,
+      "loss": 0.2315,
+      "step": 23050
+    },
+    {
+      "epoch": 0.6406678476957799,
+      "grad_norm": 0.14261969923973083,
+      "learning_rate": 0.00018285785230567577,
+      "loss": 0.2291,
+      "step": 23100
+    },
+    {
+      "epoch": 0.6420545746388443,
+      "grad_norm": 0.16137824952602386,
+      "learning_rate": 0.00018277394042978307,
+      "loss": 0.2325,
+      "step": 23150
+    },
+    {
+      "epoch": 0.6434413015819087,
+      "grad_norm": 0.1337035894393921,
+      "learning_rate": 0.00018268984302985495,
+      "loss": 0.2322,
+      "step": 23200
+    },
+    {
+      "epoch": 0.6448280285249732,
+      "grad_norm": 0.11618442833423615,
+      "learning_rate": 0.0001826055602943818,
+      "loss": 0.2349,
+      "step": 23250
+    },
+    {
+      "epoch": 0.6462147554680376,
+      "grad_norm": 0.12656192481517792,
+      "learning_rate": 0.0001825210924122693,
+      "loss": 0.234,
+      "step": 23300
+    },
+    {
+      "epoch": 0.6476014824111022,
+      "grad_norm": 0.11272765696048737,
+      "learning_rate": 0.0001824364395728382,
+      "loss": 0.2313,
+      "step": 23350
+    },
+    {
+      "epoch": 0.6489882093541666,
+      "grad_norm": 0.13132552802562714,
+      "learning_rate": 0.00018235160196582384,
+      "loss": 0.2289,
+      "step": 23400
+    },
+    {
+      "epoch": 0.6503749362972311,
+      "grad_norm": 0.11405663937330246,
+      "learning_rate": 0.00018226657978137554,
+      "loss": 0.2356,
+      "step": 23450
+    },
+    {
+      "epoch": 0.6517616632402955,
+      "grad_norm": 0.15040431916713715,
+      "learning_rate": 0.00018218137321005643,
+      "loss": 0.2303,
+      "step": 23500
+    },
+    {
+      "epoch": 0.65314839018336,
+      "grad_norm": 0.13074640929698944,
+      "learning_rate": 0.00018209598244284288,
+      "loss": 0.2319,
+      "step": 23550
+    },
+    {
+      "epoch": 0.6545351171264244,
+      "grad_norm": 0.14512640237808228,
+      "learning_rate": 0.00018201040767112413,
+      "loss": 0.2393,
+      "step": 23600
+    },
+    {
+      "epoch": 0.6559218440694888,
+      "grad_norm": 0.10800650715827942,
+      "learning_rate": 0.00018192464908670176,
+      "loss": 0.2318,
+      "step": 23650
+    },
+    {
+      "epoch": 0.6573085710125534,
+      "grad_norm": 0.12321613729000092,
+      "learning_rate": 0.00018183870688178946,
+      "loss": 0.2331,
+      "step": 23700
+    },
+    {
+      "epoch": 0.6586952979556178,
+      "grad_norm": 0.1868344396352768,
+      "learning_rate": 0.00018175258124901236,
+      "loss": 0.2317,
+      "step": 23750
+    },
+    {
+      "epoch": 0.6600820248986823,
+      "grad_norm": 0.11993540078401566,
+      "learning_rate": 0.00018166627238140674,
+      "loss": 0.2309,
+      "step": 23800
+    },
+    {
+      "epoch": 0.6614687518417467,
+      "grad_norm": 0.11594246327877045,
+      "learning_rate": 0.00018157978047241962,
+      "loss": 0.2322,
+      "step": 23850
+    },
+    {
+      "epoch": 0.6628554787848112,
+      "grad_norm": 0.18056848645210266,
+      "learning_rate": 0.00018149310571590824,
+      "loss": 0.2335,
+      "step": 23900
+    },
+    {
+      "epoch": 0.6642422057278756,
+      "grad_norm": 0.14387637376785278,
+      "learning_rate": 0.00018140624830613965,
+      "loss": 0.2366,
+      "step": 23950
+    },
+    {
+      "epoch": 0.6656289326709401,
+      "grad_norm": 0.16983430087566376,
+      "learning_rate": 0.00018131920843779035,
+      "loss": 0.2361,
+      "step": 24000
+    },
+    {
+      "epoch": 0.6656289326709401,
+      "eval_loss": 0.22958332300186157,
+      "eval_runtime": 500.0504,
+      "eval_samples_per_second": 5.713,
+      "eval_steps_per_second": 5.713,
+      "step": 24000
+    },
+    {
+      "epoch": 0.6670156596140046,
+      "grad_norm": 0.13279864192008972,
+      "learning_rate": 0.0001812319863059457,
+      "loss": 0.2359,
+      "step": 24050
+    },
+    {
+      "epoch": 0.668402386557069,
+      "grad_norm": 0.11594101786613464,
+      "learning_rate": 0.00018114458210609962,
+      "loss": 0.2358,
+      "step": 24100
+    },
+    {
+      "epoch": 0.6697891135001335,
+      "grad_norm": 0.13613513112068176,
+      "learning_rate": 0.0001810569960341541,
+      "loss": 0.2278,
+      "step": 24150
+    },
+    {
+      "epoch": 0.6711758404431979,
+      "grad_norm": 0.12295212596654892,
+      "learning_rate": 0.00018096922828641878,
+      "loss": 0.2315,
+      "step": 24200
+    },
+    {
+      "epoch": 0.6725625673862624,
+      "grad_norm": 0.17889654636383057,
+      "learning_rate": 0.00018088127905961047,
+      "loss": 0.2305,
+      "step": 24250
+    },
+    {
+      "epoch": 0.6739492943293268,
+      "grad_norm": 0.16525234282016754,
+      "learning_rate": 0.0001807931485508528,
+      "loss": 0.2304,
+      "step": 24300
+    },
+    {
+      "epoch": 0.6753360212723913,
+      "grad_norm": 0.11446121335029602,
+      "learning_rate": 0.0001807048369576756,
+      "loss": 0.2333,
+      "step": 24350
+    },
+    {
+      "epoch": 0.6767227482154557,
+      "grad_norm": 0.14533396065235138,
+      "learning_rate": 0.00018061634447801467,
+      "loss": 0.2354,
+      "step": 24400
+    },
+    {
+      "epoch": 0.6781094751585203,
+      "grad_norm": 0.14825408160686493,
+      "learning_rate": 0.0001805276713102112,
+      "loss": 0.2316,
+      "step": 24450
+    },
+    {
+      "epoch": 0.6794962021015847,
+      "grad_norm": 0.148117333650589,
+      "learning_rate": 0.00018043881765301135,
+      "loss": 0.2338,
+      "step": 24500
+    },
+    {
+      "epoch": 0.6808829290446491,
+      "grad_norm": 0.10264230519533157,
+      "learning_rate": 0.00018034978370556583,
+      "loss": 0.2298,
+      "step": 24550
+    },
+    {
+      "epoch": 0.6822696559877136,
+      "grad_norm": 0.12200962007045746,
+      "learning_rate": 0.00018026056966742945,
+      "loss": 0.2284,
+      "step": 24600
+    },
+    {
+      "epoch": 0.683656382930778,
+      "grad_norm": 0.14096751809120178,
+      "learning_rate": 0.00018017117573856063,
+      "loss": 0.2333,
+      "step": 24650
+    },
+    {
+      "epoch": 0.6850431098738425,
+      "grad_norm": 0.16554249823093414,
+      "learning_rate": 0.00018008160211932108,
+      "loss": 0.2316,
+      "step": 24700
+    },
+    {
+      "epoch": 0.686429836816907,
+      "grad_norm": 0.11679153889417648,
+      "learning_rate": 0.0001799918490104751,
+      "loss": 0.2287,
+      "step": 24750
+    },
+    {
+      "epoch": 0.6878165637599715,
+      "grad_norm": 0.1387365758419037,
+      "learning_rate": 0.00017990191661318943,
+      "loss": 0.2356,
+      "step": 24800
+    },
+    {
+      "epoch": 0.6892032907030359,
+      "grad_norm": 0.1255553960800171,
+      "learning_rate": 0.00017981180512903255,
+      "loss": 0.2342,
+      "step": 24850
+    },
+    {
+      "epoch": 0.6905900176461004,
+      "grad_norm": 0.17247521877288818,
+      "learning_rate": 0.00017972151475997443,
+      "loss": 0.2303,
+      "step": 24900
+    },
+    {
+      "epoch": 0.6919767445891648,
+      "grad_norm": 0.20023292303085327,
+      "learning_rate": 0.0001796310457083859,
+      "loss": 0.2346,
+      "step": 24950
+    },
+    {
+      "epoch": 0.6933634715322292,
+      "grad_norm": 0.11909276992082596,
+      "learning_rate": 0.0001795403981770383,
+      "loss": 0.2264,
+      "step": 25000
+    },
+    {
+      "epoch": 0.6933634715322292,
+      "eval_loss": 0.2287738025188446,
+      "eval_runtime": 500.5021,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 25000
+    },
+    {
+      "epoch": 0.6947501984752937,
+      "grad_norm": 0.13509905338287354,
+      "learning_rate": 0.00017944957236910308,
+      "loss": 0.2318,
+      "step": 25050
+    },
+    {
+      "epoch": 0.6961369254183581,
+      "grad_norm": 0.15455523133277893,
+      "learning_rate": 0.0001793585684881511,
+      "loss": 0.2325,
+      "step": 25100
+    },
+    {
+      "epoch": 0.6975236523614227,
+      "grad_norm": 0.1231105625629425,
+      "learning_rate": 0.00017926738673815248,
+      "loss": 0.2303,
+      "step": 25150
+    },
+    {
+      "epoch": 0.6989103793044871,
+      "grad_norm": 0.19073975086212158,
+      "learning_rate": 0.00017917602732347597,
+      "loss": 0.2309,
+      "step": 25200
+    },
+    {
+      "epoch": 0.7002971062475516,
+      "grad_norm": 0.16656789183616638,
+      "learning_rate": 0.00017908449044888854,
+      "loss": 0.2334,
+      "step": 25250
+    },
+    {
+      "epoch": 0.701683833190616,
+      "grad_norm": 0.12732850015163422,
+      "learning_rate": 0.00017899277631955486,
+      "loss": 0.2348,
+      "step": 25300
+    },
+    {
+      "epoch": 0.7030705601336805,
+      "grad_norm": 0.20655155181884766,
+      "learning_rate": 0.00017890088514103692,
+      "loss": 0.2355,
+      "step": 25350
+    },
+    {
+      "epoch": 0.7044572870767449,
+      "grad_norm": 0.10959596931934357,
+      "learning_rate": 0.00017880881711929353,
+      "loss": 0.2304,
+      "step": 25400
+    },
+    {
+      "epoch": 0.7058440140198093,
+      "grad_norm": 0.15412519872188568,
+      "learning_rate": 0.00017871657246067987,
+      "loss": 0.2336,
+      "step": 25450
+    },
+    {
+      "epoch": 0.7072307409628739,
+      "grad_norm": 0.16455277800559998,
+      "learning_rate": 0.00017862415137194702,
+      "loss": 0.2319,
+      "step": 25500
+    },
+    {
+      "epoch": 0.7086174679059383,
+      "grad_norm": 0.1389029622077942,
+      "learning_rate": 0.00017853340773211896,
+      "loss": 0.2294,
+      "step": 25550
+    },
+    {
+      "epoch": 0.7100041948490028,
+      "grad_norm": 0.14564301073551178,
+      "learning_rate": 0.0001784424950430794,
+      "loss": 0.2326,
+      "step": 25600
+    },
+    {
+      "epoch": 0.7113909217920672,
+      "grad_norm": 0.1606937199831009,
+      "learning_rate": 0.00017834955293674994,
+      "loss": 0.23,
+      "step": 25650
+    },
+    {
+      "epoch": 0.7127776487351317,
+      "grad_norm": 0.13401974737644196,
+      "learning_rate": 0.00017825643522291457,
+      "loss": 0.2361,
+      "step": 25700
+    },
+    {
+      "epoch": 0.7141643756781961,
+      "grad_norm": 0.12457278370857239,
+      "learning_rate": 0.0001781631421102812,
+      "loss": 0.232,
+      "step": 25750
+    },
+    {
+      "epoch": 0.7155511026212606,
+      "grad_norm": 0.13395826518535614,
+      "learning_rate": 0.0001780696738079508,
+      "loss": 0.2294,
+      "step": 25800
+    },
+    {
+      "epoch": 0.7169378295643251,
+      "grad_norm": 0.13083291053771973,
+      "learning_rate": 0.00017797603052541704,
+      "loss": 0.2328,
+      "step": 25850
+    },
+    {
+      "epoch": 0.7183245565073895,
+      "grad_norm": 0.14696165919303894,
+      "learning_rate": 0.00017788221247256583,
+      "loss": 0.233,
+      "step": 25900
+    },
+    {
+      "epoch": 0.719711283450454,
+      "grad_norm": 0.1512746810913086,
+      "learning_rate": 0.00017778821985967467,
+      "loss": 0.2319,
+      "step": 25950
+    },
+    {
+      "epoch": 0.7210980103935184,
+      "grad_norm": 0.1260426789522171,
+      "learning_rate": 0.00017769405289741247,
+      "loss": 0.2341,
+      "step": 26000
+    },
+    {
+      "epoch": 0.7210980103935184,
+      "eval_loss": 0.22873948514461517,
+      "eval_runtime": 500.274,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 26000
+    },
+    {
+      "epoch": 0.7224847373365829,
+      "grad_norm": 0.1653342843055725,
+      "learning_rate": 0.00017759971179683875,
+      "loss": 0.2316,
+      "step": 26050
+    },
+    {
+      "epoch": 0.7238714642796473,
+      "grad_norm": 0.13507039844989777,
+      "learning_rate": 0.00017750519676940348,
+      "loss": 0.2357,
+      "step": 26100
+    },
+    {
+      "epoch": 0.7252581912227118,
+      "grad_norm": 0.128819540143013,
+      "learning_rate": 0.00017741050802694635,
+      "loss": 0.231,
+      "step": 26150
+    },
+    {
+      "epoch": 0.7266449181657763,
+      "grad_norm": 0.13130728900432587,
+      "learning_rate": 0.00017731564578169647,
+      "loss": 0.2305,
+      "step": 26200
+    },
+    {
+      "epoch": 0.7280316451088408,
+      "grad_norm": 0.12267379462718964,
+      "learning_rate": 0.0001772206102462718,
+      "loss": 0.2345,
+      "step": 26250
+    },
+    {
+      "epoch": 0.7294183720519052,
+      "grad_norm": 0.14595343172550201,
+      "learning_rate": 0.0001771254016336787,
+      "loss": 0.2294,
+      "step": 26300
+    },
+    {
+      "epoch": 0.7308050989949696,
+      "grad_norm": 0.13935647904872894,
+      "learning_rate": 0.0001770300201573114,
+      "loss": 0.2358,
+      "step": 26350
+    },
+    {
+      "epoch": 0.7321918259380341,
+      "grad_norm": 0.11328408867120743,
+      "learning_rate": 0.00017693446603095174,
+      "loss": 0.2339,
+      "step": 26400
+    },
+    {
+      "epoch": 0.7335785528810985,
+      "grad_norm": 0.19857367873191833,
+      "learning_rate": 0.00017683873946876835,
+      "loss": 0.2269,
+      "step": 26450
+    },
+    {
+      "epoch": 0.734965279824163,
+      "grad_norm": 0.16225670278072357,
+      "learning_rate": 0.00017674284068531641,
+      "loss": 0.2307,
+      "step": 26500
+    },
+    {
+      "epoch": 0.7363520067672275,
+      "grad_norm": 0.1412588506937027,
+      "learning_rate": 0.00017664676989553714,
+      "loss": 0.229,
+      "step": 26550
+    },
+    {
+      "epoch": 0.737738733710292,
+      "grad_norm": 0.14530161023139954,
+      "learning_rate": 0.00017655052731475724,
+      "loss": 0.2308,
+      "step": 26600
+    },
+    {
+      "epoch": 0.7391254606533564,
+      "grad_norm": 0.12190265953540802,
+      "learning_rate": 0.0001764541131586885,
+      "loss": 0.2294,
+      "step": 26650
+    },
+    {
+      "epoch": 0.7405121875964209,
+      "grad_norm": 0.13169080018997192,
+      "learning_rate": 0.00017635752764342717,
+      "loss": 0.2275,
+      "step": 26700
+    },
+    {
+      "epoch": 0.7418989145394853,
+      "grad_norm": 0.12346599251031876,
+      "learning_rate": 0.00017626077098545367,
+      "loss": 0.2326,
+      "step": 26750
+    },
+    {
+      "epoch": 0.7432856414825497,
+      "grad_norm": 0.12645727396011353,
+      "learning_rate": 0.00017616384340163197,
+      "loss": 0.2369,
+      "step": 26800
+    },
+    {
+      "epoch": 0.7446723684256142,
+      "grad_norm": 0.12523086369037628,
+      "learning_rate": 0.00017606674510920915,
+      "loss": 0.2291,
+      "step": 26850
+    },
+    {
+      "epoch": 0.7460590953686786,
+      "grad_norm": 0.14181695878505707,
+      "learning_rate": 0.0001759694763258149,
+      "loss": 0.2266,
+      "step": 26900
+    },
+    {
+      "epoch": 0.7474458223117432,
+      "grad_norm": 0.13824765384197235,
+      "learning_rate": 0.00017587203726946102,
+      "loss": 0.2281,
+      "step": 26950
+    },
+    {
+      "epoch": 0.7488325492548076,
+      "grad_norm": 0.1162494495511055,
+      "learning_rate": 0.000175774428158541,
+      "loss": 0.2326,
+      "step": 27000
+    },
+    {
+      "epoch": 0.7488325492548076,
+      "eval_loss": 0.22845527529716492,
+      "eval_runtime": 500.3687,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 27000
+    },
+    {
+      "epoch": 0.7502192761978721,
+      "grad_norm": 0.1494184285402298,
+      "learning_rate": 0.0001756766492118294,
+      "loss": 0.2335,
+      "step": 27050
+    },
+    {
+      "epoch": 0.7516060031409365,
+      "grad_norm": 0.14270345866680145,
+      "learning_rate": 0.00017557870064848153,
+      "loss": 0.2378,
+      "step": 27100
+    },
+    {
+      "epoch": 0.752992730084001,
+      "grad_norm": 0.17542113363742828,
+      "learning_rate": 0.0001754805826880328,
+      "loss": 0.2344,
+      "step": 27150
+    },
+    {
+      "epoch": 0.7543794570270654,
+      "grad_norm": 0.14542442560195923,
+      "learning_rate": 0.0001753822955503983,
+      "loss": 0.2413,
+      "step": 27200
+    },
+    {
+      "epoch": 0.75576618397013,
+      "grad_norm": 0.13541916012763977,
+      "learning_rate": 0.00017528383945587236,
+      "loss": 0.2331,
+      "step": 27250
+    },
+    {
+      "epoch": 0.7571529109131944,
+      "grad_norm": 0.1555178165435791,
+      "learning_rate": 0.00017518521462512796,
+      "loss": 0.2314,
+      "step": 27300
+    },
+    {
+      "epoch": 0.7585396378562588,
+      "grad_norm": 0.10956469923257828,
+      "learning_rate": 0.0001750864212792162,
+      "loss": 0.2312,
+      "step": 27350
+    },
+    {
+      "epoch": 0.7599263647993233,
+      "grad_norm": 0.15572619438171387,
+      "learning_rate": 0.00017498745963956603,
+      "loss": 0.2334,
+      "step": 27400
+    },
+    {
+      "epoch": 0.7613130917423877,
+      "grad_norm": 0.1467774659395218,
+      "learning_rate": 0.0001748883299279835,
+      "loss": 0.231,
+      "step": 27450
+    },
+    {
+      "epoch": 0.7626998186854522,
+      "grad_norm": 0.12245896458625793,
+      "learning_rate": 0.00017478903236665136,
+      "loss": 0.2374,
+      "step": 27500
+    },
+    {
+      "epoch": 0.7640865456285166,
+      "grad_norm": 0.10392642766237259,
+      "learning_rate": 0.00017468956717812864,
+      "loss": 0.2313,
+      "step": 27550
+    },
+    {
+      "epoch": 0.7654732725715812,
+      "grad_norm": 0.1239921823143959,
+      "learning_rate": 0.00017458993458534998,
+      "loss": 0.2349,
+      "step": 27600
+    },
+    {
+      "epoch": 0.7668599995146456,
+      "grad_norm": 0.13776883482933044,
+      "learning_rate": 0.00017449013481162534,
+      "loss": 0.2362,
+      "step": 27650
+    },
+    {
+      "epoch": 0.7682467264577101,
+      "grad_norm": 0.1389874666929245,
+      "learning_rate": 0.00017439016808063932,
+      "loss": 0.2304,
+      "step": 27700
+    },
+    {
+      "epoch": 0.7696334534007745,
+      "grad_norm": 0.11973544955253601,
+      "learning_rate": 0.00017429003461645072,
+      "loss": 0.2352,
+      "step": 27750
+    },
+    {
+      "epoch": 0.7710201803438389,
+      "grad_norm": 0.13108691573143005,
+      "learning_rate": 0.00017418973464349209,
+      "loss": 0.2311,
+      "step": 27800
+    },
+    {
+      "epoch": 0.7724069072869034,
+      "grad_norm": 0.12594327330589294,
+      "learning_rate": 0.00017408926838656912,
+      "loss": 0.2332,
+      "step": 27850
+    },
+    {
+      "epoch": 0.7737936342299678,
+      "grad_norm": 0.14845065772533417,
+      "learning_rate": 0.00017398863607086024,
+      "loss": 0.2307,
+      "step": 27900
+    },
+    {
+      "epoch": 0.7751803611730324,
+      "grad_norm": 0.11298257112503052,
+      "learning_rate": 0.0001738878379219161,
+      "loss": 0.2331,
+      "step": 27950
+    },
+    {
+      "epoch": 0.7765670881160968,
+      "grad_norm": 0.11864858120679855,
+      "learning_rate": 0.000173786874165659,
+      "loss": 0.231,
+      "step": 28000
+    },
+    {
+      "epoch": 0.7765670881160968,
+      "eval_loss": 0.22779151797294617,
+      "eval_runtime": 501.235,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 28000
+    },
+    {
+      "epoch": 0.7779538150591613,
+      "grad_norm": 0.11632022261619568,
+      "learning_rate": 0.00017368574502838239,
+      "loss": 0.229,
+      "step": 28050
+    },
+    {
+      "epoch": 0.7793405420022257,
+      "grad_norm": 0.1431494504213333,
+      "learning_rate": 0.00017358445073675042,
+      "loss": 0.2318,
+      "step": 28100
+    },
+    {
+      "epoch": 0.7807272689452902,
+      "grad_norm": 0.12157493084669113,
+      "learning_rate": 0.00017348299151779748,
+      "loss": 0.2343,
+      "step": 28150
+    },
+    {
+      "epoch": 0.7821139958883546,
+      "grad_norm": 0.11989067494869232,
+      "learning_rate": 0.00017338136759892752,
+      "loss": 0.2347,
+      "step": 28200
+    },
+    {
+      "epoch": 0.783500722831419,
+      "grad_norm": 0.12739787995815277,
+      "learning_rate": 0.00017327957920791365,
+      "loss": 0.2328,
+      "step": 28250
+    },
+    {
+      "epoch": 0.7848874497744835,
+      "grad_norm": 0.15567833185195923,
+      "learning_rate": 0.00017317762657289768,
+      "loss": 0.2297,
+      "step": 28300
+    },
+    {
+      "epoch": 0.786274176717548,
+      "grad_norm": 0.12073542922735214,
+      "learning_rate": 0.00017307550992238943,
+      "loss": 0.2296,
+      "step": 28350
+    },
+    {
+      "epoch": 0.7876609036606125,
+      "grad_norm": 0.1477758288383484,
+      "learning_rate": 0.0001729732294852665,
+      "loss": 0.2328,
+      "step": 28400
+    },
+    {
+      "epoch": 0.7890476306036769,
+      "grad_norm": 0.1612139195203781,
+      "learning_rate": 0.00017287078549077343,
+      "loss": 0.2314,
+      "step": 28450
+    },
+    {
+      "epoch": 0.7904343575467414,
+      "grad_norm": 0.15718688070774078,
+      "learning_rate": 0.00017276817816852145,
+      "loss": 0.2289,
+      "step": 28500
+    },
+    {
+      "epoch": 0.7918210844898058,
+      "grad_norm": 0.1242058202624321,
+      "learning_rate": 0.0001726654077484878,
+      "loss": 0.2301,
+      "step": 28550
+    },
+    {
+      "epoch": 0.7932078114328703,
+      "grad_norm": 0.13269132375717163,
+      "learning_rate": 0.0001725624744610153,
+      "loss": 0.2303,
+      "step": 28600
+    },
+    {
+      "epoch": 0.7945945383759347,
+      "grad_norm": 0.12394677847623825,
+      "learning_rate": 0.0001724593785368118,
+      "loss": 0.2362,
+      "step": 28650
+    },
+    {
+      "epoch": 0.7959812653189992,
+      "grad_norm": 0.1323787420988083,
+      "learning_rate": 0.00017235612020694978,
+      "loss": 0.2281,
+      "step": 28700
+    },
+    {
+      "epoch": 0.7973679922620637,
+      "grad_norm": 0.1532479077577591,
+      "learning_rate": 0.00017225269970286552,
+      "loss": 0.2321,
+      "step": 28750
+    },
+    {
+      "epoch": 0.7987547192051281,
+      "grad_norm": 0.14882826805114746,
+      "learning_rate": 0.00017214911725635897,
+      "loss": 0.2316,
+      "step": 28800
+    },
+    {
+      "epoch": 0.8001414461481926,
+      "grad_norm": 0.11855613440275192,
+      "learning_rate": 0.00017204537309959292,
+      "loss": 0.2271,
+      "step": 28850
+    },
+    {
+      "epoch": 0.801528173091257,
+      "grad_norm": 0.15302914381027222,
+      "learning_rate": 0.00017194146746509268,
+      "loss": 0.2296,
+      "step": 28900
+    },
+    {
+      "epoch": 0.8029149000343215,
+      "grad_norm": 0.11822402477264404,
+      "learning_rate": 0.00017183740058574547,
+      "loss": 0.2301,
+      "step": 28950
+    },
+    {
+      "epoch": 0.8043016269773859,
+      "grad_norm": 0.1369016021490097,
+      "learning_rate": 0.00017173317269479992,
+      "loss": 0.2291,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8043016269773859,
+      "eval_loss": 0.2273886650800705,
+      "eval_runtime": 501.6607,
+      "eval_samples_per_second": 5.695,
+      "eval_steps_per_second": 5.695,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8056883539204505,
+      "grad_norm": 0.12872962653636932,
+      "learning_rate": 0.00017162878402586553,
+      "loss": 0.2344,
+      "step": 29050
+    },
+    {
+      "epoch": 0.8070750808635149,
+      "grad_norm": 0.13491351902484894,
+      "learning_rate": 0.00017152423481291216,
+      "loss": 0.2357,
+      "step": 29100
+    },
+    {
+      "epoch": 0.8084618078065793,
+      "grad_norm": 0.12680833041667938,
+      "learning_rate": 0.00017141952529026945,
+      "loss": 0.2333,
+      "step": 29150
+    },
+    {
+      "epoch": 0.8098485347496438,
+      "grad_norm": 0.12384926527738571,
+      "learning_rate": 0.0001713146556926265,
+      "loss": 0.2421,
+      "step": 29200
+    },
+    {
+      "epoch": 0.8112352616927082,
+      "grad_norm": 0.13864979147911072,
+      "learning_rate": 0.00017120962625503098,
+      "loss": 0.2262,
+      "step": 29250
+    },
+    {
+      "epoch": 0.8126219886357727,
+      "grad_norm": 0.12703485786914825,
+      "learning_rate": 0.00017110443721288901,
+      "loss": 0.2295,
+      "step": 29300
+    },
+    {
+      "epoch": 0.8140087155788371,
+      "grad_norm": 0.12121795862913132,
+      "learning_rate": 0.0001709990888019643,
+      "loss": 0.2286,
+      "step": 29350
+    },
+    {
+      "epoch": 0.8153954425219017,
+      "grad_norm": 0.11982162296772003,
+      "learning_rate": 0.00017089358125837783,
+      "loss": 0.2286,
+      "step": 29400
+    },
+    {
+      "epoch": 0.8167821694649661,
+      "grad_norm": 0.1372060328722,
+      "learning_rate": 0.00017078791481860725,
+      "loss": 0.2244,
+      "step": 29450
+    },
+    {
+      "epoch": 0.8181688964080306,
+      "grad_norm": 0.12731321156024933,
+      "learning_rate": 0.0001706820897194863,
+      "loss": 0.2259,
+      "step": 29500
+    },
+    {
+      "epoch": 0.819555623351095,
+      "grad_norm": 0.14031195640563965,
+      "learning_rate": 0.00017057610619820437,
+      "loss": 0.2297,
+      "step": 29550
+    },
+    {
+      "epoch": 0.8209423502941594,
+      "grad_norm": 0.13404880464076996,
+      "learning_rate": 0.0001704699644923059,
+      "loss": 0.2293,
+      "step": 29600
+    },
+    {
+      "epoch": 0.8223290772372239,
+      "grad_norm": 0.12400925159454346,
+      "learning_rate": 0.00017036366483968987,
+      "loss": 0.2263,
+      "step": 29650
+    },
+    {
+      "epoch": 0.8237158041802883,
+      "grad_norm": 0.14439739286899567,
+      "learning_rate": 0.00017025720747860937,
+      "loss": 0.2272,
+      "step": 29700
+    },
+    {
+      "epoch": 0.8251025311233529,
+      "grad_norm": 0.12196583300828934,
+      "learning_rate": 0.00017015059264767084,
+      "loss": 0.2337,
+      "step": 29750
+    },
+    {
+      "epoch": 0.8264892580664173,
+      "grad_norm": 0.13919509947299957,
+      "learning_rate": 0.00017004382058583367,
+      "loss": 0.2337,
+      "step": 29800
+    },
+    {
+      "epoch": 0.8278759850094818,
+      "grad_norm": 0.11371088027954102,
+      "learning_rate": 0.00016993689153240978,
+      "loss": 0.2252,
+      "step": 29850
+    },
+    {
+      "epoch": 0.8292627119525462,
+      "grad_norm": 0.1316608041524887,
+      "learning_rate": 0.00016982980572706282,
+      "loss": 0.2281,
+      "step": 29900
+    },
+    {
+      "epoch": 0.8306494388956107,
+      "grad_norm": 0.18003039062023163,
+      "learning_rate": 0.00016972256340980785,
+      "loss": 0.2296,
+      "step": 29950
+    },
+    {
+      "epoch": 0.8320361658386751,
+      "grad_norm": 0.16534283757209778,
+      "learning_rate": 0.0001696151648210107,
+      "loss": 0.2267,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8320361658386751,
+      "eval_loss": 0.22761212289333344,
+      "eval_runtime": 501.069,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8334228927817395,
+      "grad_norm": 0.11093872785568237,
+      "learning_rate": 0.00016950761020138747,
+      "loss": 0.234,
+      "step": 30050
+    },
+    {
+      "epoch": 0.834809619724804,
+      "grad_norm": 0.14647316932678223,
+      "learning_rate": 0.00016939989979200394,
+      "loss": 0.232,
+      "step": 30100
+    },
+    {
+      "epoch": 0.8361963466678685,
+      "grad_norm": 0.14312680065631866,
+      "learning_rate": 0.00016929203383427515,
+      "loss": 0.2299,
+      "step": 30150
+    },
+    {
+      "epoch": 0.837583073610933,
+      "grad_norm": 0.11662258952856064,
+      "learning_rate": 0.00016918401256996467,
+      "loss": 0.2298,
+      "step": 30200
+    },
+    {
+      "epoch": 0.8389698005539974,
+      "grad_norm": 0.11783650517463684,
+      "learning_rate": 0.0001690758362411843,
+      "loss": 0.2345,
+      "step": 30250
+    },
+    {
+      "epoch": 0.8403565274970619,
+      "grad_norm": 0.12562035024166107,
+      "learning_rate": 0.0001689675050903932,
+      "loss": 0.2341,
+      "step": 30300
+    },
+    {
+      "epoch": 0.8417432544401263,
+      "grad_norm": 0.1082848459482193,
+      "learning_rate": 0.00016885901936039774,
+      "loss": 0.2298,
+      "step": 30350
+    },
+    {
+      "epoch": 0.8431299813831908,
+      "grad_norm": 0.14080305397510529,
+      "learning_rate": 0.0001687503792943506,
+      "loss": 0.2364,
+      "step": 30400
+    },
+    {
+      "epoch": 0.8445167083262552,
+      "grad_norm": 0.133138969540596,
+      "learning_rate": 0.00016864158513575048,
+      "loss": 0.2293,
+      "step": 30450
+    },
+    {
+      "epoch": 0.8459034352693197,
+      "grad_norm": 0.13258026540279388,
+      "learning_rate": 0.00016853263712844136,
+      "loss": 0.2269,
+      "step": 30500
+    },
+    {
+      "epoch": 0.8472901622123842,
+      "grad_norm": 0.12311206012964249,
+      "learning_rate": 0.00016842353551661216,
+      "loss": 0.2297,
+      "step": 30550
+    },
+    {
+      "epoch": 0.8486768891554486,
+      "grad_norm": 0.12220294028520584,
+      "learning_rate": 0.00016831428054479597,
+      "loss": 0.2301,
+      "step": 30600
+    },
+    {
+      "epoch": 0.8500636160985131,
+      "grad_norm": 0.112845279276371,
+      "learning_rate": 0.00016820487245786968,
+      "loss": 0.2295,
+      "step": 30650
+    },
+    {
+      "epoch": 0.8514503430415775,
+      "grad_norm": 0.17439040541648865,
+      "learning_rate": 0.0001680953115010533,
+      "loss": 0.2299,
+      "step": 30700
+    },
+    {
+      "epoch": 0.852837069984642,
+      "grad_norm": 0.14124707877635956,
+      "learning_rate": 0.0001679855979199096,
+      "loss": 0.228,
+      "step": 30750
+    },
+    {
+      "epoch": 0.8542237969277064,
+      "grad_norm": 0.12298920005559921,
+      "learning_rate": 0.00016787573196034328,
+      "loss": 0.2293,
+      "step": 30800
+    },
+    {
+      "epoch": 0.855610523870771,
+      "grad_norm": 0.15425720810890198,
+      "learning_rate": 0.0001677657138686006,
+      "loss": 0.2263,
+      "step": 30850
+    },
+    {
+      "epoch": 0.8569972508138354,
+      "grad_norm": 0.13903729617595673,
+      "learning_rate": 0.0001676555438912689,
+      "loss": 0.2315,
+      "step": 30900
+    },
+    {
+      "epoch": 0.8583839777568998,
+      "grad_norm": 0.1249585896730423,
+      "learning_rate": 0.00016754522227527589,
+      "loss": 0.2289,
+      "step": 30950
+    },
+    {
+      "epoch": 0.8597707046999643,
+      "grad_norm": 0.13223236799240112,
+      "learning_rate": 0.00016743474926788908,
+      "loss": 0.2303,
+      "step": 31000
+    },
+    {
+      "epoch": 0.8597707046999643,
+      "eval_loss": 0.22721892595291138,
+      "eval_runtime": 500.5938,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 31000
+    },
+    {
+      "epoch": 0.8611574316430287,
+      "grad_norm": 0.15615518391132355,
+      "learning_rate": 0.00016732412511671544,
+      "loss": 0.2306,
+      "step": 31050
+    },
+    {
+      "epoch": 0.8625441585860932,
+      "grad_norm": 0.14526858925819397,
+      "learning_rate": 0.0001672133500697005,
+      "loss": 0.2307,
+      "step": 31100
+    },
+    {
+      "epoch": 0.8639308855291576,
+      "grad_norm": 0.11307808756828308,
+      "learning_rate": 0.00016710242437512825,
+      "loss": 0.237,
+      "step": 31150
+    },
+    {
+      "epoch": 0.8653176124722222,
+      "grad_norm": 0.1289224922657013,
+      "learning_rate": 0.00016699134828162017,
+      "loss": 0.2344,
+      "step": 31200
+    },
+    {
+      "epoch": 0.8667043394152866,
+      "grad_norm": 0.1631319522857666,
+      "learning_rate": 0.00016688012203813486,
+      "loss": 0.2305,
+      "step": 31250
+    },
+    {
+      "epoch": 0.8680910663583511,
+      "grad_norm": 0.1249733492732048,
+      "learning_rate": 0.00016676874589396744,
+      "loss": 0.2301,
+      "step": 31300
+    },
+    {
+      "epoch": 0.8694777933014155,
+      "grad_norm": 0.11502408981323242,
+      "learning_rate": 0.00016665722009874905,
+      "loss": 0.2319,
+      "step": 31350
+    },
+    {
+      "epoch": 0.8708645202444799,
+      "grad_norm": 0.13455846905708313,
+      "learning_rate": 0.00016654554490244628,
+      "loss": 0.228,
+      "step": 31400
+    },
+    {
+      "epoch": 0.8722512471875444,
+      "grad_norm": 0.1758633404970169,
+      "learning_rate": 0.00016643372055536048,
+      "loss": 0.2309,
+      "step": 31450
+    },
+    {
+      "epoch": 0.8736379741306088,
+      "grad_norm": 0.11880768090486526,
+      "learning_rate": 0.00016632174730812734,
+      "loss": 0.23,
+      "step": 31500
+    },
+    {
+      "epoch": 0.8750247010736734,
+      "grad_norm": 0.13718900084495544,
+      "learning_rate": 0.0001662096254117163,
+      "loss": 0.2279,
+      "step": 31550
+    },
+    {
+      "epoch": 0.8764114280167378,
+      "grad_norm": 0.1170978993177414,
+      "learning_rate": 0.00016609735511743,
+      "loss": 0.2306,
+      "step": 31600
+    },
+    {
+      "epoch": 0.8777981549598023,
+      "grad_norm": 0.15582193434238434,
+      "learning_rate": 0.0001659849366769036,
+      "loss": 0.2312,
+      "step": 31650
+    },
+    {
+      "epoch": 0.8791848819028667,
+      "grad_norm": 0.12351904064416885,
+      "learning_rate": 0.00016587237034210435,
+      "loss": 0.2292,
+      "step": 31700
+    },
+    {
+      "epoch": 0.8805716088459312,
+      "grad_norm": 0.18479709327220917,
+      "learning_rate": 0.000165759656365331,
+      "loss": 0.2274,
+      "step": 31750
+    },
+    {
+      "epoch": 0.8819583357889956,
+      "grad_norm": 0.14211027324199677,
+      "learning_rate": 0.00016564679499921328,
+      "loss": 0.2298,
+      "step": 31800
+    },
+    {
+      "epoch": 0.88334506273206,
+      "grad_norm": 0.1540357619524002,
+      "learning_rate": 0.00016553378649671112,
+      "loss": 0.2304,
+      "step": 31850
+    },
+    {
+      "epoch": 0.8847317896751246,
+      "grad_norm": 0.12503454089164734,
+      "learning_rate": 0.00016542063111111427,
+      "loss": 0.2294,
+      "step": 31900
+    },
+    {
+      "epoch": 0.886118516618189,
+      "grad_norm": 0.13658925890922546,
+      "learning_rate": 0.00016530732909604177,
+      "loss": 0.2291,
+      "step": 31950
+    },
+    {
+      "epoch": 0.8875052435612535,
+      "grad_norm": 0.15731070935726166,
+      "learning_rate": 0.00016519388070544128,
+      "loss": 0.2322,
+      "step": 32000
+    },
+    {
+      "epoch": 0.8875052435612535,
+      "eval_loss": 0.22673186659812927,
+      "eval_runtime": 500.5013,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 32000
+    },
+    {
+      "epoch": 0.8888919705043179,
+      "grad_norm": 0.11884371191263199,
+      "learning_rate": 0.0001650802861935885,
+      "loss": 0.2312,
+      "step": 32050
+    },
+    {
+      "epoch": 0.8902786974473824,
+      "grad_norm": 0.168379008769989,
+      "learning_rate": 0.00016496654581508663,
+      "loss": 0.2312,
+      "step": 32100
+    },
+    {
+      "epoch": 0.8916654243904468,
+      "grad_norm": 0.11641304939985275,
+      "learning_rate": 0.00016485265982486591,
+      "loss": 0.2271,
+      "step": 32150
+    },
+    {
+      "epoch": 0.8930521513335113,
+      "grad_norm": 0.12015505880117416,
+      "learning_rate": 0.00016473862847818277,
+      "loss": 0.2308,
+      "step": 32200
+    },
+    {
+      "epoch": 0.8944388782765758,
+      "grad_norm": 0.17053671181201935,
+      "learning_rate": 0.00016462445203061957,
+      "loss": 0.2324,
+      "step": 32250
+    },
+    {
+      "epoch": 0.8958256052196402,
+      "grad_norm": 0.12947635352611542,
+      "learning_rate": 0.0001645101307380839,
+      "loss": 0.2318,
+      "step": 32300
+    },
+    {
+      "epoch": 0.8972123321627047,
+      "grad_norm": 0.11198735982179642,
+      "learning_rate": 0.00016439566485680783,
+      "loss": 0.23,
+      "step": 32350
+    },
+    {
+      "epoch": 0.8985990591057691,
+      "grad_norm": 0.1204909086227417,
+      "learning_rate": 0.00016428105464334772,
+      "loss": 0.23,
+      "step": 32400
+    },
+    {
+      "epoch": 0.8999857860488336,
+      "grad_norm": 0.11191330850124359,
+      "learning_rate": 0.00016416630035458326,
+      "loss": 0.2295,
+      "step": 32450
+    },
+    {
+      "epoch": 0.901372512991898,
+      "grad_norm": 0.10705868154764175,
+      "learning_rate": 0.00016405140224771717,
+      "loss": 0.2246,
+      "step": 32500
+    },
+    {
+      "epoch": 0.9027592399349625,
+      "grad_norm": 0.11882634460926056,
+      "learning_rate": 0.0001639363605802744,
+      "loss": 0.2345,
+      "step": 32550
+    },
+    {
+      "epoch": 0.904145966878027,
+      "grad_norm": 0.1181696355342865,
+      "learning_rate": 0.0001638211756101018,
+      "loss": 0.2306,
+      "step": 32600
+    },
+    {
+      "epoch": 0.9055326938210915,
+      "grad_norm": 0.1270473152399063,
+      "learning_rate": 0.00016370584759536734,
+      "loss": 0.2297,
+      "step": 32650
+    },
+    {
+      "epoch": 0.9069194207641559,
+      "grad_norm": 0.11503591388463974,
+      "learning_rate": 0.00016359037679455955,
+      "loss": 0.2292,
+      "step": 32700
+    },
+    {
+      "epoch": 0.9083061477072203,
+      "grad_norm": 0.11596430093050003,
+      "learning_rate": 0.0001634747634664871,
+      "loss": 0.2324,
+      "step": 32750
+    },
+    {
+      "epoch": 0.9096928746502848,
+      "grad_norm": 0.16631336510181427,
+      "learning_rate": 0.00016335900787027802,
+      "loss": 0.23,
+      "step": 32800
+    },
+    {
+      "epoch": 0.9110796015933492,
+      "grad_norm": 0.12083205580711365,
+      "learning_rate": 0.0001632431102653793,
+      "loss": 0.2295,
+      "step": 32850
+    },
+    {
+      "epoch": 0.9124663285364137,
+      "grad_norm": 0.1268964558839798,
+      "learning_rate": 0.00016312707091155609,
+      "loss": 0.2299,
+      "step": 32900
+    },
+    {
+      "epoch": 0.9138530554794781,
+      "grad_norm": 0.1737286001443863,
+      "learning_rate": 0.00016301089006889137,
+      "loss": 0.2291,
+      "step": 32950
+    },
+    {
+      "epoch": 0.9152397824225427,
+      "grad_norm": 0.12454930692911148,
+      "learning_rate": 0.00016289456799778522,
+      "loss": 0.2289,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9152397824225427,
+      "eval_loss": 0.22642949223518372,
+      "eval_runtime": 500.8866,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9166265093656071,
+      "grad_norm": 0.12109609693288803,
+      "learning_rate": 0.00016277810495895419,
+      "loss": 0.2289,
+      "step": 33050
+    },
+    {
+      "epoch": 0.9180132363086716,
+      "grad_norm": 0.16857489943504333,
+      "learning_rate": 0.00016266150121343085,
+      "loss": 0.2265,
+      "step": 33100
+    },
+    {
+      "epoch": 0.919399963251736,
+      "grad_norm": 0.13193485140800476,
+      "learning_rate": 0.00016254475702256308,
+      "loss": 0.2277,
+      "step": 33150
+    },
+    {
+      "epoch": 0.9207866901948004,
+      "grad_norm": 0.13189518451690674,
+      "learning_rate": 0.0001624278726480137,
+      "loss": 0.2346,
+      "step": 33200
+    },
+    {
+      "epoch": 0.9221734171378649,
+      "grad_norm": 0.16021443903446198,
+      "learning_rate": 0.00016231084835175948,
+      "loss": 0.2273,
+      "step": 33250
+    },
+    {
+      "epoch": 0.9235601440809293,
+      "grad_norm": 0.14241939783096313,
+      "learning_rate": 0.00016219368439609103,
+      "loss": 0.236,
+      "step": 33300
+    },
+    {
+      "epoch": 0.9249468710239939,
+      "grad_norm": 0.18355390429496765,
+      "learning_rate": 0.0001620763810436119,
+      "loss": 0.2281,
+      "step": 33350
+    },
+    {
+      "epoch": 0.9263335979670583,
+      "grad_norm": 0.1321648508310318,
+      "learning_rate": 0.0001619612887687756,
+      "loss": 0.241,
+      "step": 33400
+    },
+    {
+      "epoch": 0.9277203249101228,
+      "grad_norm": 0.16118654608726501,
+      "learning_rate": 0.00016184371018656649,
+      "loss": 0.233,
+      "step": 33450
+    },
+    {
+      "epoch": 0.9291070518531872,
+      "grad_norm": 0.11974034458398819,
+      "learning_rate": 0.00016172599299195568,
+      "loss": 0.219,
+      "step": 33500
+    },
+    {
+      "epoch": 0.9304937787962517,
+      "grad_norm": 0.14652998745441437,
+      "learning_rate": 0.00016160813744878674,
+      "loss": 0.2316,
+      "step": 33550
+    },
+    {
+      "epoch": 0.9318805057393161,
+      "grad_norm": 0.09738484770059586,
+      "learning_rate": 0.0001614901438212133,
+      "loss": 0.2351,
+      "step": 33600
+    },
+    {
+      "epoch": 0.9332672326823805,
+      "grad_norm": 0.15131749212741852,
+      "learning_rate": 0.00016137201237369846,
+      "loss": 0.2281,
+      "step": 33650
+    },
+    {
+      "epoch": 0.9346539596254451,
+      "grad_norm": 0.16536715626716614,
+      "learning_rate": 0.00016125374337101422,
+      "loss": 0.2317,
+      "step": 33700
+    },
+    {
+      "epoch": 0.9360406865685095,
+      "grad_norm": 0.15788187086582184,
+      "learning_rate": 0.0001611353370782409,
+      "loss": 0.2261,
+      "step": 33750
+    },
+    {
+      "epoch": 0.937427413511574,
+      "grad_norm": 0.11554282158613205,
+      "learning_rate": 0.00016101679376076655,
+      "loss": 0.2288,
+      "step": 33800
+    },
+    {
+      "epoch": 0.9388141404546384,
+      "grad_norm": 0.1376064121723175,
+      "learning_rate": 0.00016089811368428633,
+      "loss": 0.2287,
+      "step": 33850
+    },
+    {
+      "epoch": 0.9402008673977029,
+      "grad_norm": 0.1270899623632431,
+      "learning_rate": 0.0001607792971148019,
+      "loss": 0.2232,
+      "step": 33900
+    },
+    {
+      "epoch": 0.9415875943407673,
+      "grad_norm": 0.1187126636505127,
+      "learning_rate": 0.00016066034431862084,
+      "loss": 0.2321,
+      "step": 33950
+    },
+    {
+      "epoch": 0.9429743212838319,
+      "grad_norm": 0.14895334839820862,
+      "learning_rate": 0.00016054125556235613,
+      "loss": 0.2306,
+      "step": 34000
+    },
+    {
+      "epoch": 0.9429743212838319,
+      "eval_loss": 0.22613388299942017,
+      "eval_runtime": 500.7207,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 34000
+    },
+    {
+      "epoch": 0.9443610482268963,
+      "grad_norm": 0.12219640612602234,
+      "learning_rate": 0.00016042203111292538,
+      "loss": 0.2315,
+      "step": 34050
+    },
+    {
+      "epoch": 0.9457477751699607,
+      "grad_norm": 0.1677113175392151,
+      "learning_rate": 0.00016030267123755038,
+      "loss": 0.2327,
+      "step": 34100
+    },
+    {
+      "epoch": 0.9471345021130252,
+      "grad_norm": 0.12030269205570221,
+      "learning_rate": 0.00016018317620375652,
+      "loss": 0.2282,
+      "step": 34150
+    },
+    {
+      "epoch": 0.9485212290560896,
+      "grad_norm": 0.13181360065937042,
+      "learning_rate": 0.00016006354627937203,
+      "loss": 0.2287,
+      "step": 34200
+    },
+    {
+      "epoch": 0.9499079559991541,
+      "grad_norm": 0.13087068498134613,
+      "learning_rate": 0.00015994378173252752,
+      "loss": 0.2282,
+      "step": 34250
+    },
+    {
+      "epoch": 0.9512946829422185,
+      "grad_norm": 0.14467494189739227,
+      "learning_rate": 0.0001598238828316553,
+      "loss": 0.2254,
+      "step": 34300
+    },
+    {
+      "epoch": 0.952681409885283,
+      "grad_norm": 0.14921946823596954,
+      "learning_rate": 0.00015970384984548885,
+      "loss": 0.2324,
+      "step": 34350
+    },
+    {
+      "epoch": 0.9540681368283475,
+      "grad_norm": 0.19342415034770966,
+      "learning_rate": 0.0001595836830430622,
+      "loss": 0.2342,
+      "step": 34400
+    },
+    {
+      "epoch": 0.955454863771412,
+      "grad_norm": 0.12381652742624283,
+      "learning_rate": 0.00015946338269370923,
+      "loss": 0.2262,
+      "step": 34450
+    },
+    {
+      "epoch": 0.9568415907144764,
+      "grad_norm": 0.1456434279680252,
+      "learning_rate": 0.00015934294906706315,
+      "loss": 0.2277,
+      "step": 34500
+    },
+    {
+      "epoch": 0.9582283176575408,
+      "grad_norm": 0.11485321074724197,
+      "learning_rate": 0.000159222382433056,
+      "loss": 0.2355,
+      "step": 34550
+    },
+    {
+      "epoch": 0.9596150446006053,
+      "grad_norm": 0.10027427971363068,
+      "learning_rate": 0.00015910168306191785,
+      "loss": 0.2269,
+      "step": 34600
+    },
+    {
+      "epoch": 0.9610017715436697,
+      "grad_norm": 0.16801820695400238,
+      "learning_rate": 0.0001589808512241763,
+      "loss": 0.2282,
+      "step": 34650
+    },
+    {
+      "epoch": 0.9623884984867342,
+      "grad_norm": 0.11840588599443436,
+      "learning_rate": 0.00015885988719065573,
+      "loss": 0.2304,
+      "step": 34700
+    },
+    {
+      "epoch": 0.9637752254297987,
+      "grad_norm": 0.16810324788093567,
+      "learning_rate": 0.00015873879123247706,
+      "loss": 0.231,
+      "step": 34750
+    },
+    {
+      "epoch": 0.9651619523728632,
+      "grad_norm": 0.1277480274438858,
+      "learning_rate": 0.0001586175636210567,
+      "loss": 0.2292,
+      "step": 34800
+    },
+    {
+      "epoch": 0.9665486793159276,
+      "grad_norm": 0.13225620985031128,
+      "learning_rate": 0.0001584962046281062,
+      "loss": 0.2255,
+      "step": 34850
+    },
+    {
+      "epoch": 0.9679354062589921,
+      "grad_norm": 0.14994849264621735,
+      "learning_rate": 0.00015837471452563159,
+      "loss": 0.2306,
+      "step": 34900
+    },
+    {
+      "epoch": 0.9693221332020565,
+      "grad_norm": 0.11426250636577606,
+      "learning_rate": 0.00015825309358593272,
+      "loss": 0.2311,
+      "step": 34950
+    },
+    {
+      "epoch": 0.9707088601451209,
+      "grad_norm": 0.1453811228275299,
+      "learning_rate": 0.00015813134208160276,
+      "loss": 0.2276,
+      "step": 35000
+    },
+    {
+      "epoch": 0.9707088601451209,
+      "eval_loss": 0.22605940699577332,
+      "eval_runtime": 500.6317,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 35000
+    },
+    {
+      "epoch": 0.9720955870881854,
+      "grad_norm": 0.14036044478416443,
+      "learning_rate": 0.0001580094602855275,
+      "loss": 0.2241,
+      "step": 35050
+    },
+    {
+      "epoch": 0.9734823140312499,
+      "grad_norm": 0.1456310898065567,
+      "learning_rate": 0.00015788744847088464,
+      "loss": 0.2352,
+      "step": 35100
+    },
+    {
+      "epoch": 0.9748690409743144,
+      "grad_norm": 0.1325587034225464,
+      "learning_rate": 0.0001577653069111435,
+      "loss": 0.2267,
+      "step": 35150
+    },
+    {
+      "epoch": 0.9762557679173788,
+      "grad_norm": 0.13475272059440613,
+      "learning_rate": 0.000157643035880064,
+      "loss": 0.232,
+      "step": 35200
+    },
+    {
+      "epoch": 0.9776424948604433,
+      "grad_norm": 0.13557064533233643,
+      "learning_rate": 0.00015752063565169645,
+      "loss": 0.2342,
+      "step": 35250
+    },
+    {
+      "epoch": 0.9790292218035077,
+      "grad_norm": 0.149173304438591,
+      "learning_rate": 0.00015739810650038054,
+      "loss": 0.2284,
+      "step": 35300
+    },
+    {
+      "epoch": 0.9804159487465722,
+      "grad_norm": 0.11646503955125809,
+      "learning_rate": 0.00015727544870074503,
+      "loss": 0.2259,
+      "step": 35350
+    },
+    {
+      "epoch": 0.9818026756896366,
+      "grad_norm": 0.126033216714859,
+      "learning_rate": 0.000157152662527707,
+      "loss": 0.2289,
+      "step": 35400
+    },
+    {
+      "epoch": 0.983189402632701,
+      "grad_norm": 0.17162640392780304,
+      "learning_rate": 0.00015702974825647123,
+      "loss": 0.2293,
+      "step": 35450
+    },
+    {
+      "epoch": 0.9845761295757656,
+      "grad_norm": 0.12047728151082993,
+      "learning_rate": 0.0001569067061625297,
+      "loss": 0.2265,
+      "step": 35500
+    },
+    {
+      "epoch": 0.98596285651883,
+      "grad_norm": 0.1183520033955574,
+      "learning_rate": 0.00015678353652166078,
+      "loss": 0.2272,
+      "step": 35550
+    },
+    {
+      "epoch": 0.9873495834618945,
+      "grad_norm": 0.13919849693775177,
+      "learning_rate": 0.00015666023960992878,
+      "loss": 0.2295,
+      "step": 35600
+    },
+    {
+      "epoch": 0.9887363104049589,
+      "grad_norm": 0.14626280963420868,
+      "learning_rate": 0.00015653681570368318,
+      "loss": 0.2293,
+      "step": 35650
+    },
+    {
+      "epoch": 0.9901230373480234,
+      "grad_norm": 0.11618024855852127,
+      "learning_rate": 0.00015641326507955823,
+      "loss": 0.2264,
+      "step": 35700
+    },
+    {
+      "epoch": 0.9915097642910878,
+      "grad_norm": 0.12280390411615372,
+      "learning_rate": 0.0001562895880144721,
+      "loss": 0.233,
+      "step": 35750
+    },
+    {
+      "epoch": 0.9928964912341524,
+      "grad_norm": 0.11896737664937973,
+      "learning_rate": 0.0001561657847856264,
+      "loss": 0.2276,
+      "step": 35800
+    },
+    {
+      "epoch": 0.9942832181772168,
+      "grad_norm": 0.1226055920124054,
+      "learning_rate": 0.0001560418556705055,
+      "loss": 0.2364,
+      "step": 35850
+    },
+    {
+      "epoch": 0.9956699451202812,
+      "grad_norm": 0.1566486656665802,
+      "learning_rate": 0.00015591780094687587,
+      "loss": 0.2315,
+      "step": 35900
+    },
+    {
+      "epoch": 0.9970566720633457,
+      "grad_norm": 0.12156879901885986,
+      "learning_rate": 0.0001557936208927856,
+      "loss": 0.2284,
+      "step": 35950
+    },
+    {
+      "epoch": 0.9984433990064101,
+      "grad_norm": 0.12765392661094666,
+      "learning_rate": 0.00015566931578656366,
+      "loss": 0.2319,
+      "step": 36000
+    },
+    {
+      "epoch": 0.9984433990064101,
+      "eval_loss": 0.22568126022815704,
+      "eval_runtime": 500.5568,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 36000
+    },
+    {
+      "epoch": 0.9998301259494746,
+      "grad_norm": 0.11263388395309448,
+      "learning_rate": 0.00015554488590681934,
+      "loss": 0.2249,
+      "step": 36050
+    },
+    {
+      "epoch": 1.0012168528925391,
+      "grad_norm": 0.12134028226137161,
+      "learning_rate": 0.00015542033153244142,
+      "loss": 0.2296,
+      "step": 36100
+    },
+    {
+      "epoch": 1.0026035798356034,
+      "grad_norm": 0.12478175759315491,
+      "learning_rate": 0.00015529565294259795,
+      "loss": 0.2295,
+      "step": 36150
+    },
+    {
+      "epoch": 1.003990306778668,
+      "grad_norm": 0.1091291755437851,
+      "learning_rate": 0.0001551708504167352,
+      "loss": 0.2285,
+      "step": 36200
+    },
+    {
+      "epoch": 1.0053770337217325,
+      "grad_norm": 0.11158731579780579,
+      "learning_rate": 0.00015504592423457733,
+      "loss": 0.2267,
+      "step": 36250
+    },
+    {
+      "epoch": 1.006763760664797,
+      "grad_norm": 0.17226600646972656,
+      "learning_rate": 0.00015492087467612562,
+      "loss": 0.2369,
+      "step": 36300
+    },
+    {
+      "epoch": 1.0081504876078613,
+      "grad_norm": 0.10548936575651169,
+      "learning_rate": 0.00015479570202165784,
+      "loss": 0.2257,
+      "step": 36350
+    },
+    {
+      "epoch": 1.0095372145509258,
+      "grad_norm": 0.12710842490196228,
+      "learning_rate": 0.0001546704065517278,
+      "loss": 0.2283,
+      "step": 36400
+    },
+    {
+      "epoch": 1.0109239414939903,
+      "grad_norm": 0.13734006881713867,
+      "learning_rate": 0.0001545449885471644,
+      "loss": 0.2266,
+      "step": 36450
+    },
+    {
+      "epoch": 1.0123106684370546,
+      "grad_norm": 0.14669275283813477,
+      "learning_rate": 0.00015441944828907124,
+      "loss": 0.2265,
+      "step": 36500
+    },
+    {
+      "epoch": 1.0136973953801192,
+      "grad_norm": 0.10941125452518463,
+      "learning_rate": 0.000154293786058826,
+      "loss": 0.231,
+      "step": 36550
+    },
+    {
+      "epoch": 1.0150841223231837,
+      "grad_norm": 0.12528035044670105,
+      "learning_rate": 0.00015416800213807972,
+      "loss": 0.2286,
+      "step": 36600
+    },
+    {
+      "epoch": 1.0164708492662482,
+      "grad_norm": 0.1242556944489479,
+      "learning_rate": 0.00015404209680875607,
+      "loss": 0.2277,
+      "step": 36650
+    },
+    {
+      "epoch": 1.0178575762093125,
+      "grad_norm": 0.09937360137701035,
+      "learning_rate": 0.000153916070353051,
+      "loss": 0.2247,
+      "step": 36700
+    },
+    {
+      "epoch": 1.019244303152377,
+      "grad_norm": 0.11109854280948639,
+      "learning_rate": 0.00015378992305343183,
+      "loss": 0.2248,
+      "step": 36750
+    },
+    {
+      "epoch": 1.0206310300954415,
+      "grad_norm": 0.14019356667995453,
+      "learning_rate": 0.00015366365519263683,
+      "loss": 0.2252,
+      "step": 36800
+    },
+    {
+      "epoch": 1.0220177570385058,
+      "grad_norm": 0.11496023088693619,
+      "learning_rate": 0.00015353979599334788,
+      "loss": 0.2228,
+      "step": 36850
+    },
+    {
+      "epoch": 1.0234044839815704,
+      "grad_norm": 0.15292219817638397,
+      "learning_rate": 0.0001534132902566159,
+      "loss": 0.2307,
+      "step": 36900
+    },
+    {
+      "epoch": 1.0247912109246349,
+      "grad_norm": 0.12410300970077515,
+      "learning_rate": 0.00015328666480286793,
+      "loss": 0.2263,
+      "step": 36950
+    },
+    {
+      "epoch": 1.0261779378676994,
+      "grad_norm": 0.14905387163162231,
+      "learning_rate": 0.00015315991991591386,
+      "loss": 0.2228,
+      "step": 37000
+    },
+    {
+      "epoch": 1.0261779378676994,
+      "eval_loss": 0.22574713826179504,
+      "eval_runtime": 500.6484,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 37000
+    },
+    {
+      "epoch": 1.0275646648107637,
+      "grad_norm": 0.12682612240314484,
+      "learning_rate": 0.0001530330558798313,
+      "loss": 0.2257,
+      "step": 37050
+    },
+    {
+      "epoch": 1.0289513917538282,
+      "grad_norm": 0.15558844804763794,
+      "learning_rate": 0.00015290607297896482,
+      "loss": 0.2259,
+      "step": 37100
+    },
+    {
+      "epoch": 1.0303381186968927,
+      "grad_norm": 0.16526414453983307,
+      "learning_rate": 0.00015277897149792562,
+      "loss": 0.2301,
+      "step": 37150
+    },
+    {
+      "epoch": 1.0317248456399573,
+      "grad_norm": 0.1130262240767479,
+      "learning_rate": 0.0001526517517215905,
+      "loss": 0.2244,
+      "step": 37200
+    },
+    {
+      "epoch": 1.0331115725830216,
+      "grad_norm": 0.12639841437339783,
+      "learning_rate": 0.00015252441393510146,
+      "loss": 0.2269,
+      "step": 37250
+    },
+    {
+      "epoch": 1.034498299526086,
+      "grad_norm": 0.12753638625144958,
+      "learning_rate": 0.000152396958423865,
+      "loss": 0.2277,
+      "step": 37300
+    },
+    {
+      "epoch": 1.0358850264691506,
+      "grad_norm": 0.1574636995792389,
+      "learning_rate": 0.00015226938547355145,
+      "loss": 0.2302,
+      "step": 37350
+    },
+    {
+      "epoch": 1.037271753412215,
+      "grad_norm": 0.1075245812535286,
+      "learning_rate": 0.0001521416953700944,
+      "loss": 0.2318,
+      "step": 37400
+    },
+    {
+      "epoch": 1.0386584803552794,
+      "grad_norm": 0.15765556693077087,
+      "learning_rate": 0.00015201388839969005,
+      "loss": 0.2271,
+      "step": 37450
+    },
+    {
+      "epoch": 1.040045207298344,
+      "grad_norm": 0.14305494725704193,
+      "learning_rate": 0.00015188596484879636,
+      "loss": 0.2268,
+      "step": 37500
+    },
+    {
+      "epoch": 1.0414319342414085,
+      "grad_norm": 0.14217057824134827,
+      "learning_rate": 0.0001517579250041328,
+      "loss": 0.2302,
+      "step": 37550
+    },
+    {
+      "epoch": 1.0428186611844728,
+      "grad_norm": 0.12122397124767303,
+      "learning_rate": 0.00015162976915267948,
+      "loss": 0.2264,
+      "step": 37600
+    },
+    {
+      "epoch": 1.0442053881275373,
+      "grad_norm": 0.1215621680021286,
+      "learning_rate": 0.00015150149758167634,
+      "loss": 0.2239,
+      "step": 37650
+    },
+    {
+      "epoch": 1.0455921150706018,
+      "grad_norm": 0.1759423315525055,
+      "learning_rate": 0.00015137311057862279,
+      "loss": 0.2244,
+      "step": 37700
+    },
+    {
+      "epoch": 1.046978842013666,
+      "grad_norm": 0.11546457558870316,
+      "learning_rate": 0.00015124460843127704,
+      "loss": 0.226,
+      "step": 37750
+    },
+    {
+      "epoch": 1.0483655689567306,
+      "grad_norm": 0.16507115960121155,
+      "learning_rate": 0.00015111599142765526,
+      "loss": 0.2267,
+      "step": 37800
+    },
+    {
+      "epoch": 1.0497522958997951,
+      "grad_norm": 0.15918377041816711,
+      "learning_rate": 0.0001509872598560311,
+      "loss": 0.2265,
+      "step": 37850
+    },
+    {
+      "epoch": 1.0511390228428596,
+      "grad_norm": 0.12590187788009644,
+      "learning_rate": 0.000150858414004935,
+      "loss": 0.2285,
+      "step": 37900
+    },
+    {
+      "epoch": 1.052525749785924,
+      "grad_norm": 0.11883638054132462,
+      "learning_rate": 0.0001507294541631535,
+      "loss": 0.2233,
+      "step": 37950
+    },
+    {
+      "epoch": 1.0539124767289885,
+      "grad_norm": 0.11353275179862976,
+      "learning_rate": 0.00015060038061972874,
+      "loss": 0.2238,
+      "step": 38000
+    },
+    {
+      "epoch": 1.0539124767289885,
+      "eval_loss": 0.22568707168102264,
+      "eval_runtime": 500.8783,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 38000
+    },
+    {
+      "epoch": 1.055299203672053,
+      "grad_norm": 0.1161685511469841,
+      "learning_rate": 0.00015047119366395757,
+      "loss": 0.2292,
+      "step": 38050
+    },
+    {
+      "epoch": 1.0566859306151175,
+      "grad_norm": 0.13814447820186615,
+      "learning_rate": 0.00015034189358539103,
+      "loss": 0.2251,
+      "step": 38100
+    },
+    {
+      "epoch": 1.0580726575581818,
+      "grad_norm": 0.15208768844604492,
+      "learning_rate": 0.00015021248067383387,
+      "loss": 0.2286,
+      "step": 38150
+    },
+    {
+      "epoch": 1.0594593845012463,
+      "grad_norm": 0.12832270562648773,
+      "learning_rate": 0.00015008295521934354,
+      "loss": 0.229,
+      "step": 38200
+    },
+    {
+      "epoch": 1.0608461114443108,
+      "grad_norm": 0.12442856281995773,
+      "learning_rate": 0.00014995331751222992,
+      "loss": 0.2286,
+      "step": 38250
+    },
+    {
+      "epoch": 1.0622328383873751,
+      "grad_norm": 0.14005307853221893,
+      "learning_rate": 0.00014982356784305428,
+      "loss": 0.2293,
+      "step": 38300
+    },
+    {
+      "epoch": 1.0636195653304397,
+      "grad_norm": 0.14418749511241913,
+      "learning_rate": 0.00014969370650262903,
+      "loss": 0.2328,
+      "step": 38350
+    },
+    {
+      "epoch": 1.0650062922735042,
+      "grad_norm": 0.11833231151103973,
+      "learning_rate": 0.00014956373378201677,
+      "loss": 0.2273,
+      "step": 38400
+    },
+    {
+      "epoch": 1.0663930192165687,
+      "grad_norm": 0.12782081961631775,
+      "learning_rate": 0.00014943364997252977,
+      "loss": 0.2224,
+      "step": 38450
+    },
+    {
+      "epoch": 1.067779746159633,
+      "grad_norm": 0.11903475224971771,
+      "learning_rate": 0.00014930345536572924,
+      "loss": 0.2256,
+      "step": 38500
+    },
+    {
+      "epoch": 1.0691664731026975,
+      "grad_norm": 0.17546679079532623,
+      "learning_rate": 0.00014917315025342483,
+      "loss": 0.2306,
+      "step": 38550
+    },
+    {
+      "epoch": 1.070553200045762,
+      "grad_norm": 0.16552455723285675,
+      "learning_rate": 0.0001490427349276737,
+      "loss": 0.2242,
+      "step": 38600
+    },
+    {
+      "epoch": 1.0719399269888266,
+      "grad_norm": 0.11756553500890732,
+      "learning_rate": 0.00014891220968078024,
+      "loss": 0.223,
+      "step": 38650
+    },
+    {
+      "epoch": 1.0733266539318909,
+      "grad_norm": 0.13542614877223969,
+      "learning_rate": 0.000148781574805295,
+      "loss": 0.2293,
+      "step": 38700
+    },
+    {
+      "epoch": 1.0747133808749554,
+      "grad_norm": 0.1370215266942978,
+      "learning_rate": 0.00014865083059401445,
+      "loss": 0.2291,
+      "step": 38750
+    },
+    {
+      "epoch": 1.07610010781802,
+      "grad_norm": 0.1472005844116211,
+      "learning_rate": 0.00014851997733997992,
+      "loss": 0.2272,
+      "step": 38800
+    },
+    {
+      "epoch": 1.0774868347610842,
+      "grad_norm": 0.1240694522857666,
+      "learning_rate": 0.00014838901533647733,
+      "loss": 0.2237,
+      "step": 38850
+    },
+    {
+      "epoch": 1.0788735617041487,
+      "grad_norm": 0.11901194602251053,
+      "learning_rate": 0.0001482579448770362,
+      "loss": 0.2285,
+      "step": 38900
+    },
+    {
+      "epoch": 1.0802602886472132,
+      "grad_norm": 0.2202654331922531,
+      "learning_rate": 0.0001481267662554292,
+      "loss": 0.2321,
+      "step": 38950
+    },
+    {
+      "epoch": 1.0816470155902778,
+      "grad_norm": 0.11475471407175064,
+      "learning_rate": 0.00014799547976567144,
+      "loss": 0.2296,
+      "step": 39000
+    },
+    {
+      "epoch": 1.0816470155902778,
+      "eval_loss": 0.2248746156692505,
+      "eval_runtime": 500.4656,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 39000
+    },
+    {
+      "epoch": 1.083033742533342,
+      "grad_norm": 0.1217503771185875,
+      "learning_rate": 0.00014786408570201975,
+      "loss": 0.2223,
+      "step": 39050
+    },
+    {
+      "epoch": 1.0844204694764066,
+      "grad_norm": 0.14427083730697632,
+      "learning_rate": 0.00014773258435897207,
+      "loss": 0.2279,
+      "step": 39100
+    },
+    {
+      "epoch": 1.085807196419471,
+      "grad_norm": 0.11865708976984024,
+      "learning_rate": 0.00014760097603126689,
+      "loss": 0.2295,
+      "step": 39150
+    },
+    {
+      "epoch": 1.0871939233625354,
+      "grad_norm": 0.14178717136383057,
+      "learning_rate": 0.0001474718963578798,
+      "loss": 0.2261,
+      "step": 39200
+    },
+    {
+      "epoch": 1.0885806503056,
+      "grad_norm": 0.15393276512622833,
+      "learning_rate": 0.0001473400770710278,
+      "loss": 0.2308,
+      "step": 39250
+    },
+    {
+      "epoch": 1.0899673772486644,
+      "grad_norm": 0.11602922528982162,
+      "learning_rate": 0.00014720815167925812,
+      "loss": 0.2283,
+      "step": 39300
+    },
+    {
+      "epoch": 1.091354104191729,
+      "grad_norm": 0.16645793616771698,
+      "learning_rate": 0.00014707612047825964,
+      "loss": 0.233,
+      "step": 39350
+    },
+    {
+      "epoch": 1.0927408311347933,
+      "grad_norm": 0.10213354974985123,
+      "learning_rate": 0.00014694398376395825,
+      "loss": 0.2277,
+      "step": 39400
+    },
+    {
+      "epoch": 1.0941275580778578,
+      "grad_norm": 0.11264722794294357,
+      "learning_rate": 0.0001468117418325166,
+      "loss": 0.2267,
+      "step": 39450
+    },
+    {
+      "epoch": 1.0955142850209223,
+      "grad_norm": 0.12596255540847778,
+      "learning_rate": 0.00014667939498033293,
+      "loss": 0.2226,
+      "step": 39500
+    },
+    {
+      "epoch": 1.0969010119639866,
+      "grad_norm": 0.10382383316755295,
+      "learning_rate": 0.0001465469435040407,
+      "loss": 0.2297,
+      "step": 39550
+    },
+    {
+      "epoch": 1.0982877389070511,
+      "grad_norm": 0.12972958385944366,
+      "learning_rate": 0.00014641438770050794,
+      "loss": 0.2256,
+      "step": 39600
+    },
+    {
+      "epoch": 1.0996744658501156,
+      "grad_norm": 0.13036096096038818,
+      "learning_rate": 0.00014628172786683641,
+      "loss": 0.2235,
+      "step": 39650
+    },
+    {
+      "epoch": 1.1010611927931802,
+      "grad_norm": 0.1233506128191948,
+      "learning_rate": 0.00014614896430036113,
+      "loss": 0.2243,
+      "step": 39700
+    },
+    {
+      "epoch": 1.1024479197362445,
+      "grad_norm": 0.11503315716981888,
+      "learning_rate": 0.00014601609729864956,
+      "loss": 0.2285,
+      "step": 39750
+    },
+    {
+      "epoch": 1.103834646679309,
+      "grad_norm": 0.12343501299619675,
+      "learning_rate": 0.000145883127159501,
+      "loss": 0.2272,
+      "step": 39800
+    },
+    {
+      "epoch": 1.1052213736223735,
+      "grad_norm": 0.1226864606142044,
+      "learning_rate": 0.00014575005418094594,
+      "loss": 0.2332,
+      "step": 39850
+    },
+    {
+      "epoch": 1.106608100565438,
+      "grad_norm": 0.1333167850971222,
+      "learning_rate": 0.00014561687866124535,
+      "loss": 0.2304,
+      "step": 39900
+    },
+    {
+      "epoch": 1.1079948275085023,
+      "grad_norm": 0.1088777631521225,
+      "learning_rate": 0.00014548360089889002,
+      "loss": 0.2296,
+      "step": 39950
+    },
+    {
+      "epoch": 1.1093815544515668,
+      "grad_norm": 0.11975093185901642,
+      "learning_rate": 0.00014535022119259994,
+      "loss": 0.2255,
+      "step": 40000
+    },
+    {
+      "epoch": 1.1093815544515668,
+      "eval_loss": 0.22516606748104095,
+      "eval_runtime": 500.4411,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 40000
+    },
+    {
+      "epoch": 1.1107682813946314,
+      "grad_norm": 0.19725576043128967,
+      "learning_rate": 0.0001452167398413235,
+      "loss": 0.2317,
+      "step": 40050
+    },
+    {
+      "epoch": 1.1121550083376956,
+      "grad_norm": 0.12385617196559906,
+      "learning_rate": 0.00014508315714423706,
+      "loss": 0.2269,
+      "step": 40100
+    },
+    {
+      "epoch": 1.1135417352807602,
+      "grad_norm": 0.12559738755226135,
+      "learning_rate": 0.000144949473400744,
+      "loss": 0.2295,
+      "step": 40150
+    },
+    {
+      "epoch": 1.1149284622238247,
+      "grad_norm": 0.1279434859752655,
+      "learning_rate": 0.0001448156889104742,
+      "loss": 0.2283,
+      "step": 40200
+    },
+    {
+      "epoch": 1.1163151891668892,
+      "grad_norm": 0.14756010472774506,
+      "learning_rate": 0.0001446818039732834,
+      "loss": 0.2267,
+      "step": 40250
+    },
+    {
+      "epoch": 1.1177019161099535,
+      "grad_norm": 0.11476084589958191,
+      "learning_rate": 0.00014454781888925238,
+      "loss": 0.2265,
+      "step": 40300
+    },
+    {
+      "epoch": 1.119088643053018,
+      "grad_norm": 0.12701088190078735,
+      "learning_rate": 0.00014441373395868653,
+      "loss": 0.2255,
+      "step": 40350
+    },
+    {
+      "epoch": 1.1204753699960825,
+      "grad_norm": 0.14300104975700378,
+      "learning_rate": 0.00014427954948211493,
+      "loss": 0.227,
+      "step": 40400
+    },
+    {
+      "epoch": 1.121862096939147,
+      "grad_norm": 0.11292553693056107,
+      "learning_rate": 0.00014414526576028973,
+      "loss": 0.2239,
+      "step": 40450
+    },
+    {
+      "epoch": 1.1232488238822114,
+      "grad_norm": 0.1404883861541748,
+      "learning_rate": 0.00014401088309418564,
+      "loss": 0.2234,
+      "step": 40500
+    },
+    {
+      "epoch": 1.1246355508252759,
+      "grad_norm": 0.15262041985988617,
+      "learning_rate": 0.00014387640178499905,
+      "loss": 0.2319,
+      "step": 40550
+    },
+    {
+      "epoch": 1.1260222777683404,
+      "grad_norm": 0.16456229984760284,
+      "learning_rate": 0.0001437418221341475,
+      "loss": 0.2264,
+      "step": 40600
+    },
+    {
+      "epoch": 1.1274090047114047,
+      "grad_norm": 0.12468329817056656,
+      "learning_rate": 0.0001436071444432689,
+      "loss": 0.2273,
+      "step": 40650
+    },
+    {
+      "epoch": 1.1287957316544692,
+      "grad_norm": 0.12449460476636887,
+      "learning_rate": 0.0001434723690142209,
+      "loss": 0.2333,
+      "step": 40700
+    },
+    {
+      "epoch": 1.1301824585975337,
+      "grad_norm": 0.12426210194826126,
+      "learning_rate": 0.0001433374961490803,
+      "loss": 0.2328,
+      "step": 40750
+    },
+    {
+      "epoch": 1.1315691855405983,
+      "grad_norm": 0.1501815766096115,
+      "learning_rate": 0.00014320252615014216,
+      "loss": 0.2214,
+      "step": 40800
+    },
+    {
+      "epoch": 1.1329559124836626,
+      "grad_norm": 0.15881818532943726,
+      "learning_rate": 0.00014306745931991932,
+      "loss": 0.2292,
+      "step": 40850
+    },
+    {
+      "epoch": 1.134342639426727,
+      "grad_norm": 0.12299991399049759,
+      "learning_rate": 0.00014293229596114163,
+      "loss": 0.2238,
+      "step": 40900
+    },
+    {
+      "epoch": 1.1357293663697916,
+      "grad_norm": 0.14259304106235504,
+      "learning_rate": 0.0001427970363767553,
+      "loss": 0.2291,
+      "step": 40950
+    },
+    {
+      "epoch": 1.137116093312856,
+      "grad_norm": 0.12536148726940155,
+      "learning_rate": 0.00014266168086992225,
+      "loss": 0.2252,
+      "step": 41000
+    },
+    {
+      "epoch": 1.137116093312856,
+      "eval_loss": 0.2245665341615677,
+      "eval_runtime": 501.2828,
+      "eval_samples_per_second": 5.699,
+      "eval_steps_per_second": 5.699,
+      "step": 41000
+    },
+    {
+      "epoch": 1.1385028202559204,
+      "grad_norm": 0.12410587817430496,
+      "learning_rate": 0.00014252622974401932,
+      "loss": 0.2268,
+      "step": 41050
+    },
+    {
+      "epoch": 1.139889547198985,
+      "grad_norm": 0.12877434492111206,
+      "learning_rate": 0.00014239068330263775,
+      "loss": 0.2258,
+      "step": 41100
+    },
+    {
+      "epoch": 1.1412762741420495,
+      "grad_norm": 0.1299249529838562,
+      "learning_rate": 0.00014225504184958232,
+      "loss": 0.2301,
+      "step": 41150
+    },
+    {
+      "epoch": 1.1426630010851138,
+      "grad_norm": 0.15234452486038208,
+      "learning_rate": 0.00014211930568887088,
+      "loss": 0.2192,
+      "step": 41200
+    },
+    {
+      "epoch": 1.1440497280281783,
+      "grad_norm": 0.12678442895412445,
+      "learning_rate": 0.00014198347512473343,
+      "loss": 0.2311,
+      "step": 41250
+    },
+    {
+      "epoch": 1.1454364549712428,
+      "grad_norm": 0.12326008826494217,
+      "learning_rate": 0.0001418475504616116,
+      "loss": 0.2318,
+      "step": 41300
+    },
+    {
+      "epoch": 1.146823181914307,
+      "grad_norm": 0.11192907392978668,
+      "learning_rate": 0.00014171153200415797,
+      "loss": 0.2232,
+      "step": 41350
+    },
+    {
+      "epoch": 1.1482099088573716,
+      "grad_norm": 0.11843819916248322,
+      "learning_rate": 0.00014157542005723532,
+      "loss": 0.2277,
+      "step": 41400
+    },
+    {
+      "epoch": 1.1495966358004361,
+      "grad_norm": 0.12903502583503723,
+      "learning_rate": 0.0001414419399397752,
+      "loss": 0.2237,
+      "step": 41450
+    },
+    {
+      "epoch": 1.1509833627435007,
+      "grad_norm": 0.13532768189907074,
+      "learning_rate": 0.00014130564378392948,
+      "loss": 0.2291,
+      "step": 41500
+    },
+    {
+      "epoch": 1.152370089686565,
+      "grad_norm": 0.11242423951625824,
+      "learning_rate": 0.00014116925504834574,
+      "loss": 0.2263,
+      "step": 41550
+    },
+    {
+      "epoch": 1.1537568166296295,
+      "grad_norm": 0.14420267939567566,
+      "learning_rate": 0.00014103277403871667,
+      "loss": 0.231,
+      "step": 41600
+    },
+    {
+      "epoch": 1.155143543572694,
+      "grad_norm": 0.11390483379364014,
+      "learning_rate": 0.00014089620106094174,
+      "loss": 0.2281,
+      "step": 41650
+    },
+    {
+      "epoch": 1.1565302705157583,
+      "grad_norm": 0.10996092855930328,
+      "learning_rate": 0.0001407595364211267,
+      "loss": 0.223,
+      "step": 41700
+    },
+    {
+      "epoch": 1.1579169974588228,
+      "grad_norm": 0.1297358274459839,
+      "learning_rate": 0.00014062278042558253,
+      "loss": 0.2251,
+      "step": 41750
+    },
+    {
+      "epoch": 1.1593037244018873,
+      "grad_norm": 0.13994191586971283,
+      "learning_rate": 0.00014048593338082508,
+      "loss": 0.2261,
+      "step": 41800
+    },
+    {
+      "epoch": 1.1606904513449519,
+      "grad_norm": 0.15100865066051483,
+      "learning_rate": 0.00014034899559357432,
+      "loss": 0.2257,
+      "step": 41850
+    },
+    {
+      "epoch": 1.1620771782880164,
+      "grad_norm": 0.1151217371225357,
+      "learning_rate": 0.0001402119673707535,
+      "loss": 0.2278,
+      "step": 41900
+    },
+    {
+      "epoch": 1.1634639052310807,
+      "grad_norm": 0.1580880582332611,
+      "learning_rate": 0.00014007484901948865,
+      "loss": 0.2247,
+      "step": 41950
+    },
+    {
+      "epoch": 1.1648506321741452,
+      "grad_norm": 0.1323232203722,
+      "learning_rate": 0.00013993764084710777,
+      "loss": 0.2229,
+      "step": 42000
+    },
+    {
+      "epoch": 1.1648506321741452,
+      "eval_loss": 0.22439424693584442,
+      "eval_runtime": 501.4893,
+      "eval_samples_per_second": 5.697,
+      "eval_steps_per_second": 5.697,
+      "step": 42000
+    },
+    {
+      "epoch": 1.1662373591172097,
+      "grad_norm": 0.11002755165100098,
+      "learning_rate": 0.00013980034316114014,
+      "loss": 0.2287,
+      "step": 42050
+    },
+    {
+      "epoch": 1.167624086060274,
+      "grad_norm": 0.16875265538692474,
+      "learning_rate": 0.00013966295626931575,
+      "loss": 0.2268,
+      "step": 42100
+    },
+    {
+      "epoch": 1.1690108130033385,
+      "grad_norm": 0.1291196197271347,
+      "learning_rate": 0.0001395254804795645,
+      "loss": 0.2267,
+      "step": 42150
+    },
+    {
+      "epoch": 1.170397539946403,
+      "grad_norm": 0.12030452489852905,
+      "learning_rate": 0.0001393879161000155,
+      "loss": 0.2284,
+      "step": 42200
+    },
+    {
+      "epoch": 1.1717842668894676,
+      "grad_norm": 0.1254565715789795,
+      "learning_rate": 0.00013925026343899644,
+      "loss": 0.2325,
+      "step": 42250
+    },
+    {
+      "epoch": 1.1731709938325319,
+      "grad_norm": 0.10753902792930603,
+      "learning_rate": 0.000139112522805033,
+      "loss": 0.2265,
+      "step": 42300
+    },
+    {
+      "epoch": 1.1745577207755964,
+      "grad_norm": 0.14079649746418,
+      "learning_rate": 0.00013897469450684783,
+      "loss": 0.2279,
+      "step": 42350
+    },
+    {
+      "epoch": 1.175944447718661,
+      "grad_norm": 0.13644090294837952,
+      "learning_rate": 0.00013883677885336013,
+      "loss": 0.2264,
+      "step": 42400
+    },
+    {
+      "epoch": 1.1773311746617252,
+      "grad_norm": 0.15901681780815125,
+      "learning_rate": 0.000138698776153685,
+      "loss": 0.2274,
+      "step": 42450
+    },
+    {
+      "epoch": 1.1787179016047897,
+      "grad_norm": 0.14739197492599487,
+      "learning_rate": 0.00013856068671713254,
+      "loss": 0.2223,
+      "step": 42500
+    },
+    {
+      "epoch": 1.1801046285478543,
+      "grad_norm": 0.1077587679028511,
+      "learning_rate": 0.00013842251085320728,
+      "loss": 0.2257,
+      "step": 42550
+    },
+    {
+      "epoch": 1.1814913554909188,
+      "grad_norm": 0.12596414983272552,
+      "learning_rate": 0.00013828424887160745,
+      "loss": 0.2251,
+      "step": 42600
+    },
+    {
+      "epoch": 1.182878082433983,
+      "grad_norm": 0.11234478652477264,
+      "learning_rate": 0.0001381459010822243,
+      "loss": 0.2225,
+      "step": 42650
+    },
+    {
+      "epoch": 1.1842648093770476,
+      "grad_norm": 0.11206696927547455,
+      "learning_rate": 0.00013800746779514143,
+      "loss": 0.2266,
+      "step": 42700
+    },
+    {
+      "epoch": 1.185651536320112,
+      "grad_norm": 0.10260911285877228,
+      "learning_rate": 0.0001378689493206341,
+      "loss": 0.2241,
+      "step": 42750
+    },
+    {
+      "epoch": 1.1870382632631764,
+      "grad_norm": 0.12874187529087067,
+      "learning_rate": 0.0001377303459691684,
+      "loss": 0.2277,
+      "step": 42800
+    },
+    {
+      "epoch": 1.188424990206241,
+      "grad_norm": 0.1351606696844101,
+      "learning_rate": 0.0001375916580514007,
+      "loss": 0.2268,
+      "step": 42850
+    },
+    {
+      "epoch": 1.1898117171493054,
+      "grad_norm": 0.1250632107257843,
+      "learning_rate": 0.000137452885878177,
+      "loss": 0.2265,
+      "step": 42900
+    },
+    {
+      "epoch": 1.19119844409237,
+      "grad_norm": 0.12516459822654724,
+      "learning_rate": 0.00013731402976053202,
+      "loss": 0.2256,
+      "step": 42950
+    },
+    {
+      "epoch": 1.1925851710354343,
+      "grad_norm": 0.12791725993156433,
+      "learning_rate": 0.00013717509000968865,
+      "loss": 0.2252,
+      "step": 43000
+    },
+    {
+      "epoch": 1.1925851710354343,
+      "eval_loss": 0.22418725490570068,
+      "eval_runtime": 501.0375,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 43000
+    },
+    {
+      "epoch": 1.1939718979784988,
+      "grad_norm": 0.152371346950531,
+      "learning_rate": 0.00013703606693705732,
+      "loss": 0.2308,
+      "step": 43050
+    },
+    {
+      "epoch": 1.1953586249215633,
+      "grad_norm": 0.14723214507102966,
+      "learning_rate": 0.0001368969608542351,
+      "loss": 0.2258,
+      "step": 43100
+    },
+    {
+      "epoch": 1.1967453518646276,
+      "grad_norm": 0.1414303481578827,
+      "learning_rate": 0.00013675777207300524,
+      "loss": 0.2278,
+      "step": 43150
+    },
+    {
+      "epoch": 1.1981320788076921,
+      "grad_norm": 0.15416811406612396,
+      "learning_rate": 0.00013661850090533617,
+      "loss": 0.2324,
+      "step": 43200
+    },
+    {
+      "epoch": 1.1995188057507566,
+      "grad_norm": 0.11736203730106354,
+      "learning_rate": 0.00013647914766338112,
+      "loss": 0.2292,
+      "step": 43250
+    },
+    {
+      "epoch": 1.2009055326938212,
+      "grad_norm": 0.1547485738992691,
+      "learning_rate": 0.00013633971265947722,
+      "loss": 0.2281,
+      "step": 43300
+    },
+    {
+      "epoch": 1.2022922596368855,
+      "grad_norm": 0.15800827741622925,
+      "learning_rate": 0.0001362001962061449,
+      "loss": 0.2296,
+      "step": 43350
+    },
+    {
+      "epoch": 1.20367898657995,
+      "grad_norm": 0.15381957590579987,
+      "learning_rate": 0.0001360605986160871,
+      "loss": 0.2291,
+      "step": 43400
+    },
+    {
+      "epoch": 1.2050657135230145,
+      "grad_norm": 0.17754536867141724,
+      "learning_rate": 0.00013592092020218855,
+      "loss": 0.2285,
+      "step": 43450
+    },
+    {
+      "epoch": 1.2064524404660788,
+      "grad_norm": 0.1404140442609787,
+      "learning_rate": 0.0001357811612775153,
+      "loss": 0.2253,
+      "step": 43500
+    },
+    {
+      "epoch": 1.2078391674091433,
+      "grad_norm": 0.11709395796060562,
+      "learning_rate": 0.00013564132215531372,
+      "loss": 0.2261,
+      "step": 43550
+    },
+    {
+      "epoch": 1.2092258943522078,
+      "grad_norm": 0.11466790735721588,
+      "learning_rate": 0.00013550140314901,
+      "loss": 0.2295,
+      "step": 43600
+    },
+    {
+      "epoch": 1.2106126212952724,
+      "grad_norm": 0.14058195054531097,
+      "learning_rate": 0.00013536140457220933,
+      "loss": 0.2307,
+      "step": 43650
+    },
+    {
+      "epoch": 1.2119993482383369,
+      "grad_norm": 0.18355610966682434,
+      "learning_rate": 0.00013522132673869522,
+      "loss": 0.2283,
+      "step": 43700
+    },
+    {
+      "epoch": 1.2133860751814012,
+      "grad_norm": 0.1437745839357376,
+      "learning_rate": 0.00013508116996242893,
+      "loss": 0.2244,
+      "step": 43750
+    },
+    {
+      "epoch": 1.2147728021244657,
+      "grad_norm": 0.12281102687120438,
+      "learning_rate": 0.00013494093455754851,
+      "loss": 0.2266,
+      "step": 43800
+    },
+    {
+      "epoch": 1.2161595290675302,
+      "grad_norm": 0.15082257986068726,
+      "learning_rate": 0.00013480062083836842,
+      "loss": 0.2275,
+      "step": 43850
+    },
+    {
+      "epoch": 1.2175462560105945,
+      "grad_norm": 0.13360853493213654,
+      "learning_rate": 0.00013466022911937846,
+      "loss": 0.2293,
+      "step": 43900
+    },
+    {
+      "epoch": 1.218932982953659,
+      "grad_norm": 0.1245453953742981,
+      "learning_rate": 0.00013451975971524337,
+      "loss": 0.2252,
+      "step": 43950
+    },
+    {
+      "epoch": 1.2203197098967236,
+      "grad_norm": 0.12427138537168503,
+      "learning_rate": 0.00013437921294080202,
+      "loss": 0.2273,
+      "step": 44000
+    },
+    {
+      "epoch": 1.2203197098967236,
+      "eval_loss": 0.22416169941425323,
+      "eval_runtime": 501.199,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 5.7,
+      "step": 44000
+    },
+    {
+      "epoch": 1.221706436839788,
+      "grad_norm": 0.13315744698047638,
+      "learning_rate": 0.00013423858911106664,
+      "loss": 0.2273,
+      "step": 44050
+    },
+    {
+      "epoch": 1.2230931637828524,
+      "grad_norm": 0.11731356382369995,
+      "learning_rate": 0.0001340978885412221,
+      "loss": 0.2284,
+      "step": 44100
+    },
+    {
+      "epoch": 1.224479890725917,
+      "grad_norm": 0.1332121342420578,
+      "learning_rate": 0.00013395711154662548,
+      "loss": 0.2311,
+      "step": 44150
+    },
+    {
+      "epoch": 1.2258666176689814,
+      "grad_norm": 0.11775799095630646,
+      "learning_rate": 0.00013381625844280495,
+      "loss": 0.2207,
+      "step": 44200
+    },
+    {
+      "epoch": 1.2272533446120457,
+      "grad_norm": 0.13608750700950623,
+      "learning_rate": 0.00013367532954545934,
+      "loss": 0.2259,
+      "step": 44250
+    },
+    {
+      "epoch": 1.2286400715551102,
+      "grad_norm": 0.11276783794164658,
+      "learning_rate": 0.00013353432517045739,
+      "loss": 0.2254,
+      "step": 44300
+    },
+    {
+      "epoch": 1.2300267984981748,
+      "grad_norm": 0.11962584406137466,
+      "learning_rate": 0.00013339324563383693,
+      "loss": 0.2231,
+      "step": 44350
+    },
+    {
+      "epoch": 1.2314135254412393,
+      "grad_norm": 0.14515165984630585,
+      "learning_rate": 0.0001332520912518044,
+      "loss": 0.2273,
+      "step": 44400
+    },
+    {
+      "epoch": 1.2328002523843036,
+      "grad_norm": 0.14967331290245056,
+      "learning_rate": 0.00013311086234073376,
+      "loss": 0.2292,
+      "step": 44450
+    },
+    {
+      "epoch": 1.234186979327368,
+      "grad_norm": 0.10794315487146378,
+      "learning_rate": 0.00013296955921716626,
+      "loss": 0.2213,
+      "step": 44500
+    },
+    {
+      "epoch": 1.2355737062704326,
+      "grad_norm": 0.1261892467737198,
+      "learning_rate": 0.0001328281821978093,
+      "loss": 0.2249,
+      "step": 44550
+    },
+    {
+      "epoch": 1.236960433213497,
+      "grad_norm": 0.16944009065628052,
+      "learning_rate": 0.00013268673159953608,
+      "loss": 0.2279,
+      "step": 44600
+    },
+    {
+      "epoch": 1.2383471601565614,
+      "grad_norm": 0.14991511404514313,
+      "learning_rate": 0.00013254520773938453,
+      "loss": 0.224,
+      "step": 44650
+    },
+    {
+      "epoch": 1.239733887099626,
+      "grad_norm": 0.16776132583618164,
+      "learning_rate": 0.00013240361093455686,
+      "loss": 0.2267,
+      "step": 44700
+    },
+    {
+      "epoch": 1.2411206140426905,
+      "grad_norm": 0.15971648693084717,
+      "learning_rate": 0.00013226194150241886,
+      "loss": 0.2269,
+      "step": 44750
+    },
+    {
+      "epoch": 1.2425073409857548,
+      "grad_norm": 0.16267691552639008,
+      "learning_rate": 0.00013212019976049897,
+      "loss": 0.2262,
+      "step": 44800
+    },
+    {
+      "epoch": 1.2438940679288193,
+      "grad_norm": 0.13528917729854584,
+      "learning_rate": 0.00013197838602648773,
+      "loss": 0.2282,
+      "step": 44850
+    },
+    {
+      "epoch": 1.2452807948718838,
+      "grad_norm": 0.13532580435276031,
+      "learning_rate": 0.0001318365006182371,
+      "loss": 0.2269,
+      "step": 44900
+    },
+    {
+      "epoch": 1.246667521814948,
+      "grad_norm": 0.15377886593341827,
+      "learning_rate": 0.00013169738368628263,
+      "loss": 0.2298,
+      "step": 44950
+    },
+    {
+      "epoch": 1.2480542487580126,
+      "grad_norm": 0.16382162272930145,
+      "learning_rate": 0.00013155535730139284,
+      "loss": 0.2301,
+      "step": 45000
+    },
+    {
+      "epoch": 1.2480542487580126,
+      "eval_loss": 0.22414694726467133,
+      "eval_runtime": 500.918,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 45000
+    },
+    {
+      "epoch": 1.2494409757010772,
+      "grad_norm": 0.13876722753047943,
+      "learning_rate": 0.00013141326019041228,
+      "loss": 0.2249,
+      "step": 45050
+    },
+    {
+      "epoch": 1.2508277026441417,
+      "grad_norm": 0.1360548585653305,
+      "learning_rate": 0.00013127393671013348,
+      "loss": 0.2255,
+      "step": 45100
+    },
+    {
+      "epoch": 1.2522144295872062,
+      "grad_norm": 0.1435881406068802,
+      "learning_rate": 0.00013113170050124578,
+      "loss": 0.2314,
+      "step": 45150
+    },
+    {
+      "epoch": 1.2536011565302705,
+      "grad_norm": 0.12622830271720886,
+      "learning_rate": 0.00013098939451582363,
+      "loss": 0.2248,
+      "step": 45200
+    },
+    {
+      "epoch": 1.254987883473335,
+      "grad_norm": 0.1429251879453659,
+      "learning_rate": 0.00013084701907282228,
+      "loss": 0.2312,
+      "step": 45250
+    },
+    {
+      "epoch": 1.2563746104163993,
+      "grad_norm": 0.12246144562959671,
+      "learning_rate": 0.00013070457449135262,
+      "loss": 0.2236,
+      "step": 45300
+    },
+    {
+      "epoch": 1.2577613373594638,
+      "grad_norm": 0.11872986704111099,
+      "learning_rate": 0.00013056206109068045,
+      "loss": 0.2263,
+      "step": 45350
+    },
+    {
+      "epoch": 1.2591480643025283,
+      "grad_norm": 0.12920017540454865,
+      "learning_rate": 0.00013041947919022594,
+      "loss": 0.2258,
+      "step": 45400
+    },
+    {
+      "epoch": 1.2605347912455929,
+      "grad_norm": 0.15954279899597168,
+      "learning_rate": 0.00013027682910956271,
+      "loss": 0.2272,
+      "step": 45450
+    },
+    {
+      "epoch": 1.2619215181886574,
+      "grad_norm": 0.16156534850597382,
+      "learning_rate": 0.00013013411116841723,
+      "loss": 0.2245,
+      "step": 45500
+    },
+    {
+      "epoch": 1.2633082451317217,
+      "grad_norm": 0.12423060089349747,
+      "learning_rate": 0.00012999132568666805,
+      "loss": 0.2271,
+      "step": 45550
+    },
+    {
+      "epoch": 1.2646949720747862,
+      "grad_norm": 0.1252107322216034,
+      "learning_rate": 0.0001298484729843451,
+      "loss": 0.2298,
+      "step": 45600
+    },
+    {
+      "epoch": 1.2660816990178507,
+      "grad_norm": 0.16947528719902039,
+      "learning_rate": 0.00012970555338162896,
+      "loss": 0.2273,
+      "step": 45650
+    },
+    {
+      "epoch": 1.267468425960915,
+      "grad_norm": 0.14459671080112457,
+      "learning_rate": 0.00012956256719885026,
+      "loss": 0.2282,
+      "step": 45700
+    },
+    {
+      "epoch": 1.2688551529039795,
+      "grad_norm": 0.1194702684879303,
+      "learning_rate": 0.00012941951475648866,
+      "loss": 0.2263,
+      "step": 45750
+    },
+    {
+      "epoch": 1.270241879847044,
+      "grad_norm": 0.12180822342634201,
+      "learning_rate": 0.00012927639637517249,
+      "loss": 0.227,
+      "step": 45800
+    },
+    {
+      "epoch": 1.2716286067901086,
+      "grad_norm": 0.14245355129241943,
+      "learning_rate": 0.00012913321237567783,
+      "loss": 0.2262,
+      "step": 45850
+    },
+    {
+      "epoch": 1.2730153337331729,
+      "grad_norm": 0.14033064246177673,
+      "learning_rate": 0.00012898996307892784,
+      "loss": 0.2249,
+      "step": 45900
+    },
+    {
+      "epoch": 1.2744020606762374,
+      "grad_norm": 0.11540055274963379,
+      "learning_rate": 0.00012884664880599198,
+      "loss": 0.2265,
+      "step": 45950
+    },
+    {
+      "epoch": 1.275788787619302,
+      "grad_norm": 0.10777000337839127,
+      "learning_rate": 0.00012870326987808538,
+      "loss": 0.2245,
+      "step": 46000
+    },
+    {
+      "epoch": 1.275788787619302,
+      "eval_loss": 0.2235965132713318,
+      "eval_runtime": 500.5657,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 46000
+    },
+    {
+      "epoch": 1.2771755145623662,
+      "grad_norm": 0.13470718264579773,
+      "learning_rate": 0.00012855982661656815,
+      "loss": 0.2226,
+      "step": 46050
+    },
+    {
+      "epoch": 1.2785622415054307,
+      "grad_norm": 0.12822124361991882,
+      "learning_rate": 0.0001284163193429445,
+      "loss": 0.2294,
+      "step": 46100
+    },
+    {
+      "epoch": 1.2799489684484953,
+      "grad_norm": 0.14290271699428558,
+      "learning_rate": 0.0001282727483788621,
+      "loss": 0.2231,
+      "step": 46150
+    },
+    {
+      "epoch": 1.2813356953915598,
+      "grad_norm": 0.13675449788570404,
+      "learning_rate": 0.00012812911404611144,
+      "loss": 0.2283,
+      "step": 46200
+    },
+    {
+      "epoch": 1.282722422334624,
+      "grad_norm": 0.16636592149734497,
+      "learning_rate": 0.00012798541666662506,
+      "loss": 0.223,
+      "step": 46250
+    },
+    {
+      "epoch": 1.2841091492776886,
+      "grad_norm": 0.12275688350200653,
+      "learning_rate": 0.00012784165656247665,
+      "loss": 0.2271,
+      "step": 46300
+    },
+    {
+      "epoch": 1.2854958762207531,
+      "grad_norm": 0.13686712086200714,
+      "learning_rate": 0.00012769783405588072,
+      "loss": 0.223,
+      "step": 46350
+    },
+    {
+      "epoch": 1.2868826031638174,
+      "grad_norm": 0.1588650494813919,
+      "learning_rate": 0.00012755394946919145,
+      "loss": 0.2296,
+      "step": 46400
+    },
+    {
+      "epoch": 1.288269330106882,
+      "grad_norm": 0.12250595539808273,
+      "learning_rate": 0.00012741000312490228,
+      "loss": 0.2257,
+      "step": 46450
+    },
+    {
+      "epoch": 1.2896560570499465,
+      "grad_norm": 0.12048181891441345,
+      "learning_rate": 0.00012726599534564496,
+      "loss": 0.2231,
+      "step": 46500
+    },
+    {
+      "epoch": 1.291042783993011,
+      "grad_norm": 0.12862320244312286,
+      "learning_rate": 0.00012712192645418909,
+      "loss": 0.2274,
+      "step": 46550
+    },
+    {
+      "epoch": 1.2924295109360753,
+      "grad_norm": 0.13872814178466797,
+      "learning_rate": 0.00012697779677344108,
+      "loss": 0.2253,
+      "step": 46600
+    },
+    {
+      "epoch": 1.2938162378791398,
+      "grad_norm": 0.15560327470302582,
+      "learning_rate": 0.0001268336066264437,
+      "loss": 0.2243,
+      "step": 46650
+    },
+    {
+      "epoch": 1.2952029648222043,
+      "grad_norm": 0.12047038972377777,
+      "learning_rate": 0.0001266893563363752,
+      "loss": 0.2222,
+      "step": 46700
+    },
+    {
+      "epoch": 1.2965896917652686,
+      "grad_norm": 0.11812040954828262,
+      "learning_rate": 0.00012654504622654867,
+      "loss": 0.2278,
+      "step": 46750
+    },
+    {
+      "epoch": 1.2979764187083331,
+      "grad_norm": 0.12126338481903076,
+      "learning_rate": 0.00012640067662041118,
+      "loss": 0.2221,
+      "step": 46800
+    },
+    {
+      "epoch": 1.2993631456513977,
+      "grad_norm": 0.12661194801330566,
+      "learning_rate": 0.0001262562478415433,
+      "loss": 0.2259,
+      "step": 46850
+    },
+    {
+      "epoch": 1.3007498725944622,
+      "grad_norm": 0.1360771805047989,
+      "learning_rate": 0.00012611176021365807,
+      "loss": 0.2266,
+      "step": 46900
+    },
+    {
+      "epoch": 1.3021365995375267,
+      "grad_norm": 0.11665530502796173,
+      "learning_rate": 0.0001259672140606005,
+      "loss": 0.2254,
+      "step": 46950
+    },
+    {
+      "epoch": 1.303523326480591,
+      "grad_norm": 0.15935631096363068,
+      "learning_rate": 0.00012582260970634684,
+      "loss": 0.2274,
+      "step": 47000
+    },
+    {
+      "epoch": 1.303523326480591,
+      "eval_loss": 0.223495215177536,
+      "eval_runtime": 500.5307,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 47000
+    },
+    {
+      "epoch": 1.3049100534236555,
+      "grad_norm": 0.11267057806253433,
+      "learning_rate": 0.0001256779474750037,
+      "loss": 0.2268,
+      "step": 47050
+    },
+    {
+      "epoch": 1.3062967803667198,
+      "grad_norm": 0.12521906197071075,
+      "learning_rate": 0.0001255332276908074,
+      "loss": 0.2288,
+      "step": 47100
+    },
+    {
+      "epoch": 1.3076835073097843,
+      "grad_norm": 0.17053671181201935,
+      "learning_rate": 0.00012538845067812333,
+      "loss": 0.2263,
+      "step": 47150
+    },
+    {
+      "epoch": 1.3090702342528489,
+      "grad_norm": 0.11105658113956451,
+      "learning_rate": 0.0001252436167614451,
+      "loss": 0.2288,
+      "step": 47200
+    },
+    {
+      "epoch": 1.3104569611959134,
+      "grad_norm": 0.11419650167226791,
+      "learning_rate": 0.00012509872626539388,
+      "loss": 0.2229,
+      "step": 47250
+    },
+    {
+      "epoch": 1.311843688138978,
+      "grad_norm": 0.12066779285669327,
+      "learning_rate": 0.00012495377951471766,
+      "loss": 0.2245,
+      "step": 47300
+    },
+    {
+      "epoch": 1.3132304150820422,
+      "grad_norm": 0.12847816944122314,
+      "learning_rate": 0.00012480877683429043,
+      "loss": 0.2239,
+      "step": 47350
+    },
+    {
+      "epoch": 1.3146171420251067,
+      "grad_norm": 0.1426202654838562,
+      "learning_rate": 0.00012466371854911169,
+      "loss": 0.22,
+      "step": 47400
+    },
+    {
+      "epoch": 1.3160038689681712,
+      "grad_norm": 0.16877411305904388,
+      "learning_rate": 0.00012451860498430547,
+      "loss": 0.2233,
+      "step": 47450
+    },
+    {
+      "epoch": 1.3173905959112355,
+      "grad_norm": 0.12421774864196777,
+      "learning_rate": 0.00012437343646511966,
+      "loss": 0.2253,
+      "step": 47500
+    },
+    {
+      "epoch": 1.3187773228543,
+      "grad_norm": 0.12950457632541656,
+      "learning_rate": 0.00012422821331692542,
+      "loss": 0.222,
+      "step": 47550
+    },
+    {
+      "epoch": 1.3201640497973646,
+      "grad_norm": 0.11517184227705002,
+      "learning_rate": 0.00012408293586521632,
+      "loss": 0.226,
+      "step": 47600
+    },
+    {
+      "epoch": 1.321550776740429,
+      "grad_norm": 0.17244720458984375,
+      "learning_rate": 0.0001239376044356076,
+      "loss": 0.2228,
+      "step": 47650
+    },
+    {
+      "epoch": 1.3229375036834934,
+      "grad_norm": 0.11641084402799606,
+      "learning_rate": 0.00012379221935383553,
+      "loss": 0.2279,
+      "step": 47700
+    },
+    {
+      "epoch": 1.324324230626558,
+      "grad_norm": 0.1356060951948166,
+      "learning_rate": 0.00012364678094575665,
+      "loss": 0.227,
+      "step": 47750
+    },
+    {
+      "epoch": 1.3257109575696224,
+      "grad_norm": 0.21041586995124817,
+      "learning_rate": 0.00012350128953734693,
+      "loss": 0.2263,
+      "step": 47800
+    },
+    {
+      "epoch": 1.3270976845126867,
+      "grad_norm": 0.14442645013332367,
+      "learning_rate": 0.00012335574545470124,
+      "loss": 0.2254,
+      "step": 47850
+    },
+    {
+      "epoch": 1.3284844114557512,
+      "grad_norm": 0.11860388517379761,
+      "learning_rate": 0.0001232101490240324,
+      "loss": 0.2296,
+      "step": 47900
+    },
+    {
+      "epoch": 1.3298711383988158,
+      "grad_norm": 0.1434333771467209,
+      "learning_rate": 0.0001230645005716707,
+      "loss": 0.2259,
+      "step": 47950
+    },
+    {
+      "epoch": 1.3312578653418803,
+      "grad_norm": 0.13076238334178925,
+      "learning_rate": 0.0001229188004240629,
+      "loss": 0.2191,
+      "step": 48000
+    },
+    {
+      "epoch": 1.3312578653418803,
+      "eval_loss": 0.22342181205749512,
+      "eval_runtime": 500.8608,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 48000
+    },
+    {
+      "epoch": 1.3326445922849446,
+      "grad_norm": 0.13471820950508118,
+      "learning_rate": 0.00012277304890777164,
+      "loss": 0.225,
+      "step": 48050
+    },
+    {
+      "epoch": 1.334031319228009,
+      "grad_norm": 0.11158140748739243,
+      "learning_rate": 0.00012262724634947477,
+      "loss": 0.2241,
+      "step": 48100
+    },
+    {
+      "epoch": 1.3354180461710736,
+      "grad_norm": 0.131271094083786,
+      "learning_rate": 0.00012248139307596451,
+      "loss": 0.2282,
+      "step": 48150
+    },
+    {
+      "epoch": 1.336804773114138,
+      "grad_norm": 0.12760807573795319,
+      "learning_rate": 0.00012233548941414677,
+      "loss": 0.2269,
+      "step": 48200
+    },
+    {
+      "epoch": 1.3381915000572024,
+      "grad_norm": 0.10696883499622345,
+      "learning_rate": 0.00012218953569104025,
+      "loss": 0.2313,
+      "step": 48250
+    },
+    {
+      "epoch": 1.339578227000267,
+      "grad_norm": 0.11438623070716858,
+      "learning_rate": 0.00012204353223377612,
+      "loss": 0.2261,
+      "step": 48300
+    },
+    {
+      "epoch": 1.3409649539433315,
+      "grad_norm": 0.1496392786502838,
+      "learning_rate": 0.00012189747936959677,
+      "loss": 0.2287,
+      "step": 48350
+    },
+    {
+      "epoch": 1.3423516808863958,
+      "grad_norm": 0.11277805268764496,
+      "learning_rate": 0.00012175137742585546,
+      "loss": 0.2226,
+      "step": 48400
+    },
+    {
+      "epoch": 1.3437384078294603,
+      "grad_norm": 0.14849400520324707,
+      "learning_rate": 0.00012160522673001542,
+      "loss": 0.2211,
+      "step": 48450
+    },
+    {
+      "epoch": 1.3451251347725248,
+      "grad_norm": 0.11590241640806198,
+      "learning_rate": 0.00012145902760964916,
+      "loss": 0.2277,
+      "step": 48500
+    },
+    {
+      "epoch": 1.3465118617155891,
+      "grad_norm": 0.12213417887687683,
+      "learning_rate": 0.00012131278039243772,
+      "loss": 0.2218,
+      "step": 48550
+    },
+    {
+      "epoch": 1.3478985886586536,
+      "grad_norm": 0.11973270773887634,
+      "learning_rate": 0.00012116648540616996,
+      "loss": 0.228,
+      "step": 48600
+    },
+    {
+      "epoch": 1.3492853156017182,
+      "grad_norm": 0.12690085172653198,
+      "learning_rate": 0.00012102014297874171,
+      "loss": 0.2243,
+      "step": 48650
+    },
+    {
+      "epoch": 1.3506720425447827,
+      "grad_norm": 0.12574374675750732,
+      "learning_rate": 0.00012087375343815526,
+      "loss": 0.2261,
+      "step": 48700
+    },
+    {
+      "epoch": 1.3520587694878472,
+      "grad_norm": 0.11568839848041534,
+      "learning_rate": 0.00012072731711251848,
+      "loss": 0.2254,
+      "step": 48750
+    },
+    {
+      "epoch": 1.3534454964309115,
+      "grad_norm": 0.1170002669095993,
+      "learning_rate": 0.00012058083433004403,
+      "loss": 0.2298,
+      "step": 48800
+    },
+    {
+      "epoch": 1.354832223373976,
+      "grad_norm": 0.1263497769832611,
+      "learning_rate": 0.0001204343054190487,
+      "loss": 0.2257,
+      "step": 48850
+    },
+    {
+      "epoch": 1.3562189503170403,
+      "grad_norm": 0.14301252365112305,
+      "learning_rate": 0.00012028773070795275,
+      "loss": 0.2253,
+      "step": 48900
+    },
+    {
+      "epoch": 1.3576056772601048,
+      "grad_norm": 0.1464497148990631,
+      "learning_rate": 0.000120141110525279,
+      "loss": 0.2256,
+      "step": 48950
+    },
+    {
+      "epoch": 1.3589924042031694,
+      "grad_norm": 0.1061500534415245,
+      "learning_rate": 0.00011999444519965228,
+      "loss": 0.2212,
+      "step": 49000
+    },
+    {
+      "epoch": 1.3589924042031694,
+      "eval_loss": 0.22327525913715363,
+      "eval_runtime": 501.03,
+      "eval_samples_per_second": 5.702,
+      "eval_steps_per_second": 5.702,
+      "step": 49000
+    },
+    {
+      "epoch": 1.3603791311462339,
+      "grad_norm": 0.10947709530591965,
+      "learning_rate": 0.00011984773505979852,
+      "loss": 0.2176,
+      "step": 49050
+    },
+    {
+      "epoch": 1.3617658580892984,
+      "grad_norm": 0.11933697760105133,
+      "learning_rate": 0.00011970098043454412,
+      "loss": 0.2254,
+      "step": 49100
+    },
+    {
+      "epoch": 1.3631525850323627,
+      "grad_norm": 0.11835236102342606,
+      "learning_rate": 0.0001195541816528152,
+      "loss": 0.2236,
+      "step": 49150
+    },
+    {
+      "epoch": 1.3645393119754272,
+      "grad_norm": 0.15372909605503082,
+      "learning_rate": 0.00011940733904363681,
+      "loss": 0.2225,
+      "step": 49200
+    },
+    {
+      "epoch": 1.3659260389184917,
+      "grad_norm": 0.137434184551239,
+      "learning_rate": 0.00011926045293613228,
+      "loss": 0.2249,
+      "step": 49250
+    },
+    {
+      "epoch": 1.367312765861556,
+      "grad_norm": 0.1446930319070816,
+      "learning_rate": 0.00011911352365952247,
+      "loss": 0.228,
+      "step": 49300
+    },
+    {
+      "epoch": 1.3686994928046206,
+      "grad_norm": 0.12815728783607483,
+      "learning_rate": 0.0001189665515431249,
+      "loss": 0.2256,
+      "step": 49350
+    },
+    {
+      "epoch": 1.370086219747685,
+      "grad_norm": 0.12660840153694153,
+      "learning_rate": 0.00011881953691635312,
+      "loss": 0.2235,
+      "step": 49400
+    },
+    {
+      "epoch": 1.3714729466907496,
+      "grad_norm": 0.15400618314743042,
+      "learning_rate": 0.00011867248010871604,
+      "loss": 0.2271,
+      "step": 49450
+    },
+    {
+      "epoch": 1.372859673633814,
+      "grad_norm": 0.1160617545247078,
+      "learning_rate": 0.00011852538144981701,
+      "loss": 0.222,
+      "step": 49500
+    },
+    {
+      "epoch": 1.3742464005768784,
+      "grad_norm": 0.10030169039964676,
+      "learning_rate": 0.0001183782412693533,
+      "loss": 0.227,
+      "step": 49550
+    },
+    {
+      "epoch": 1.375633127519943,
+      "grad_norm": 0.1285124272108078,
+      "learning_rate": 0.00011823105989711515,
+      "loss": 0.2227,
+      "step": 49600
+    },
+    {
+      "epoch": 1.3770198544630072,
+      "grad_norm": 0.16413910686969757,
+      "learning_rate": 0.00011808383766298512,
+      "loss": 0.2241,
+      "step": 49650
+    },
+    {
+      "epoch": 1.3784065814060718,
+      "grad_norm": 0.12963584065437317,
+      "learning_rate": 0.00011793657489693743,
+      "loss": 0.2268,
+      "step": 49700
+    },
+    {
+      "epoch": 1.3797933083491363,
+      "grad_norm": 0.11898767948150635,
+      "learning_rate": 0.00011778927192903709,
+      "loss": 0.2244,
+      "step": 49750
+    },
+    {
+      "epoch": 1.3811800352922008,
+      "grad_norm": 0.10802606493234634,
+      "learning_rate": 0.00011764192908943925,
+      "loss": 0.2227,
+      "step": 49800
+    },
+    {
+      "epoch": 1.382566762235265,
+      "grad_norm": 0.17253124713897705,
+      "learning_rate": 0.00011749749474137916,
+      "loss": 0.2278,
+      "step": 49850
+    },
+    {
+      "epoch": 1.3839534891783296,
+      "grad_norm": 0.11553626507520676,
+      "learning_rate": 0.00011735007393019295,
+      "loss": 0.2232,
+      "step": 49900
+    },
+    {
+      "epoch": 1.3853402161213941,
+      "grad_norm": 0.16757309436798096,
+      "learning_rate": 0.00011720261423169856,
+      "loss": 0.2288,
+      "step": 49950
+    },
+    {
+      "epoch": 1.3867269430644584,
+      "grad_norm": 0.17031162977218628,
+      "learning_rate": 0.0001170551159764024,
+      "loss": 0.2249,
+      "step": 50000
+    },
+    {
+      "epoch": 1.3867269430644584,
+      "eval_loss": 0.22317548096179962,
+      "eval_runtime": 500.2665,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 50000
+    },
+    {
+      "epoch": 1.388113670007523,
+      "grad_norm": 0.11588902771472931,
+      "learning_rate": 0.00011690757949489732,
+      "loss": 0.2284,
+      "step": 50050
+    },
+    {
+      "epoch": 1.3895003969505875,
+      "grad_norm": 0.12574197351932526,
+      "learning_rate": 0.00011676000511786185,
+      "loss": 0.2316,
+      "step": 50100
+    },
+    {
+      "epoch": 1.390887123893652,
+      "grad_norm": 0.11579444259405136,
+      "learning_rate": 0.0001166123931760594,
+      "loss": 0.2239,
+      "step": 50150
+    },
+    {
+      "epoch": 1.3922738508367163,
+      "grad_norm": 0.1079888865351677,
+      "learning_rate": 0.00011646474400033762,
+      "loss": 0.2256,
+      "step": 50200
+    },
+    {
+      "epoch": 1.3936605777797808,
+      "grad_norm": 0.1480822116136551,
+      "learning_rate": 0.00011631705792162764,
+      "loss": 0.2254,
+      "step": 50250
+    },
+    {
+      "epoch": 1.3950473047228453,
+      "grad_norm": 0.12621110677719116,
+      "learning_rate": 0.0001161693352709432,
+      "loss": 0.2262,
+      "step": 50300
+    },
+    {
+      "epoch": 1.3964340316659096,
+      "grad_norm": 0.1542740762233734,
+      "learning_rate": 0.00011602157637938016,
+      "loss": 0.222,
+      "step": 50350
+    },
+    {
+      "epoch": 1.3978207586089741,
+      "grad_norm": 0.11742950975894928,
+      "learning_rate": 0.00011587378157811545,
+      "loss": 0.2197,
+      "step": 50400
+    },
+    {
+      "epoch": 1.3992074855520387,
+      "grad_norm": 0.1457587331533432,
+      "learning_rate": 0.00011572595119840666,
+      "loss": 0.2236,
+      "step": 50450
+    },
+    {
+      "epoch": 1.4005942124951032,
+      "grad_norm": 0.12402471154928207,
+      "learning_rate": 0.00011557808557159093,
+      "loss": 0.2251,
+      "step": 50500
+    },
+    {
+      "epoch": 1.4019809394381677,
+      "grad_norm": 0.1243479996919632,
+      "learning_rate": 0.00011543018502908455,
+      "loss": 0.2192,
+      "step": 50550
+    },
+    {
+      "epoch": 1.403367666381232,
+      "grad_norm": 0.126913920044899,
+      "learning_rate": 0.00011528224990238199,
+      "loss": 0.2248,
+      "step": 50600
+    },
+    {
+      "epoch": 1.4047543933242965,
+      "grad_norm": 0.13005682826042175,
+      "learning_rate": 0.00011513428052305528,
+      "loss": 0.2287,
+      "step": 50650
+    },
+    {
+      "epoch": 1.4061411202673608,
+      "grad_norm": 0.12297997623682022,
+      "learning_rate": 0.00011498627722275319,
+      "loss": 0.2244,
+      "step": 50700
+    },
+    {
+      "epoch": 1.4075278472104253,
+      "grad_norm": 0.14594142138957977,
+      "learning_rate": 0.00011483824033320053,
+      "loss": 0.2262,
+      "step": 50750
+    },
+    {
+      "epoch": 1.4089145741534899,
+      "grad_norm": 0.14156201481819153,
+      "learning_rate": 0.00011469017018619734,
+      "loss": 0.2251,
+      "step": 50800
+    },
+    {
+      "epoch": 1.4103013010965544,
+      "grad_norm": 0.11722792685031891,
+      "learning_rate": 0.00011454206711361835,
+      "loss": 0.223,
+      "step": 50850
+    },
+    {
+      "epoch": 1.411688028039619,
+      "grad_norm": 0.10951201617717743,
+      "learning_rate": 0.0001143939314474119,
+      "loss": 0.2276,
+      "step": 50900
+    },
+    {
+      "epoch": 1.4130747549826832,
+      "grad_norm": 0.13954362273216248,
+      "learning_rate": 0.00011424576351959957,
+      "loss": 0.2233,
+      "step": 50950
+    },
+    {
+      "epoch": 1.4144614819257477,
+      "grad_norm": 0.12842465937137604,
+      "learning_rate": 0.00011409756366227509,
+      "loss": 0.2244,
+      "step": 51000
+    },
+    {
+      "epoch": 1.4144614819257477,
+      "eval_loss": 0.2229885309934616,
+      "eval_runtime": 500.7247,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 51000
+    },
+    {
+      "epoch": 1.4158482088688122,
+      "grad_norm": 0.11812816560268402,
+      "learning_rate": 0.0001139493322076038,
+      "loss": 0.2243,
+      "step": 51050
+    },
+    {
+      "epoch": 1.4172349358118765,
+      "grad_norm": 0.11909503489732742,
+      "learning_rate": 0.00011380106948782194,
+      "loss": 0.2241,
+      "step": 51100
+    },
+    {
+      "epoch": 1.418621662754941,
+      "grad_norm": 0.12617436051368713,
+      "learning_rate": 0.00011365277583523573,
+      "loss": 0.2247,
+      "step": 51150
+    },
+    {
+      "epoch": 1.4200083896980056,
+      "grad_norm": 0.1255272477865219,
+      "learning_rate": 0.00011350445158222074,
+      "loss": 0.2222,
+      "step": 51200
+    },
+    {
+      "epoch": 1.42139511664107,
+      "grad_norm": 0.1265612095594406,
+      "learning_rate": 0.00011335609706122117,
+      "loss": 0.2248,
+      "step": 51250
+    },
+    {
+      "epoch": 1.4227818435841344,
+      "grad_norm": 0.1541653722524643,
+      "learning_rate": 0.000113207712604749,
+      "loss": 0.2325,
+      "step": 51300
+    },
+    {
+      "epoch": 1.424168570527199,
+      "grad_norm": 0.14105264842510223,
+      "learning_rate": 0.00011305929854538338,
+      "loss": 0.2222,
+      "step": 51350
+    },
+    {
+      "epoch": 1.4255552974702634,
+      "grad_norm": 0.13859054446220398,
+      "learning_rate": 0.00011291085521576972,
+      "loss": 0.225,
+      "step": 51400
+    },
+    {
+      "epoch": 1.4269420244133277,
+      "grad_norm": 0.14842507243156433,
+      "learning_rate": 0.00011276238294861912,
+      "loss": 0.2233,
+      "step": 51450
+    },
+    {
+      "epoch": 1.4283287513563923,
+      "grad_norm": 0.1151481494307518,
+      "learning_rate": 0.00011261388207670747,
+      "loss": 0.2241,
+      "step": 51500
+    },
+    {
+      "epoch": 1.4297154782994568,
+      "grad_norm": 0.13431167602539062,
+      "learning_rate": 0.00011246535293287483,
+      "loss": 0.2259,
+      "step": 51550
+    },
+    {
+      "epoch": 1.4311022052425213,
+      "grad_norm": 0.10691921412944794,
+      "learning_rate": 0.0001123167958500246,
+      "loss": 0.2257,
+      "step": 51600
+    },
+    {
+      "epoch": 1.4324889321855856,
+      "grad_norm": 0.12425126880407333,
+      "learning_rate": 0.00011216821116112275,
+      "loss": 0.2272,
+      "step": 51650
+    },
+    {
+      "epoch": 1.4338756591286501,
+      "grad_norm": 0.1344369500875473,
+      "learning_rate": 0.00011201959919919722,
+      "loss": 0.2236,
+      "step": 51700
+    },
+    {
+      "epoch": 1.4352623860717146,
+      "grad_norm": 0.154687762260437,
+      "learning_rate": 0.00011187096029733704,
+      "loss": 0.2248,
+      "step": 51750
+    },
+    {
+      "epoch": 1.436649113014779,
+      "grad_norm": 0.14304719865322113,
+      "learning_rate": 0.00011172229478869158,
+      "loss": 0.2269,
+      "step": 51800
+    },
+    {
+      "epoch": 1.4380358399578435,
+      "grad_norm": 0.14757920801639557,
+      "learning_rate": 0.00011157360300646988,
+      "loss": 0.2224,
+      "step": 51850
+    },
+    {
+      "epoch": 1.439422566900908,
+      "grad_norm": 0.13079752027988434,
+      "learning_rate": 0.00011142488528393989,
+      "loss": 0.2249,
+      "step": 51900
+    },
+    {
+      "epoch": 1.4408092938439725,
+      "grad_norm": 0.1415751427412033,
+      "learning_rate": 0.00011127614195442766,
+      "loss": 0.2308,
+      "step": 51950
+    },
+    {
+      "epoch": 1.442196020787037,
+      "grad_norm": 0.14857710897922516,
+      "learning_rate": 0.00011112737335131667,
+      "loss": 0.2276,
+      "step": 52000
+    },
+    {
+      "epoch": 1.442196020787037,
+      "eval_loss": 0.22276601195335388,
+      "eval_runtime": 500.893,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 52000
+    },
+    {
+      "epoch": 1.4435827477301013,
+      "grad_norm": 0.12997597455978394,
+      "learning_rate": 0.000110978579808047,
+      "loss": 0.2244,
+      "step": 52050
+    },
+    {
+      "epoch": 1.4449694746731658,
+      "grad_norm": 0.13583651185035706,
+      "learning_rate": 0.00011082976165811469,
+      "loss": 0.2271,
+      "step": 52100
+    },
+    {
+      "epoch": 1.4463562016162301,
+      "grad_norm": 0.15908506512641907,
+      "learning_rate": 0.00011068091923507087,
+      "loss": 0.2276,
+      "step": 52150
+    },
+    {
+      "epoch": 1.4477429285592947,
+      "grad_norm": 0.10295715928077698,
+      "learning_rate": 0.00011053205287252113,
+      "loss": 0.2268,
+      "step": 52200
+    },
+    {
+      "epoch": 1.4491296555023592,
+      "grad_norm": 0.14841599762439728,
+      "learning_rate": 0.00011038316290412463,
+      "loss": 0.227,
+      "step": 52250
+    },
+    {
+      "epoch": 1.4505163824454237,
+      "grad_norm": 0.11808385699987411,
+      "learning_rate": 0.00011023722815431241,
+      "loss": 0.2212,
+      "step": 52300
+    },
+    {
+      "epoch": 1.4519031093884882,
+      "grad_norm": 0.14345276355743408,
+      "learning_rate": 0.00011008829243090724,
+      "loss": 0.2264,
+      "step": 52350
+    },
+    {
+      "epoch": 1.4532898363315525,
+      "grad_norm": 0.1435631662607193,
+      "learning_rate": 0.00010993933409627062,
+      "loss": 0.222,
+      "step": 52400
+    },
+    {
+      "epoch": 1.454676563274617,
+      "grad_norm": 0.10484310239553452,
+      "learning_rate": 0.00010979035348426798,
+      "loss": 0.2253,
+      "step": 52450
+    },
+    {
+      "epoch": 1.4560632902176813,
+      "grad_norm": 0.12916283309459686,
+      "learning_rate": 0.00010964135092881453,
+      "loss": 0.2261,
+      "step": 52500
+    },
+    {
+      "epoch": 1.4574500171607458,
+      "grad_norm": 0.15381525456905365,
+      "learning_rate": 0.00010949232676387484,
+      "loss": 0.2276,
+      "step": 52550
+    },
+    {
+      "epoch": 1.4588367441038104,
+      "grad_norm": 0.11151523888111115,
+      "learning_rate": 0.00010934328132346172,
+      "loss": 0.2262,
+      "step": 52600
+    },
+    {
+      "epoch": 1.4602234710468749,
+      "grad_norm": 0.11262549459934235,
+      "learning_rate": 0.00010919421494163582,
+      "loss": 0.222,
+      "step": 52650
+    },
+    {
+      "epoch": 1.4616101979899394,
+      "grad_norm": 0.12128688395023346,
+      "learning_rate": 0.00010904512795250468,
+      "loss": 0.223,
+      "step": 52700
+    },
+    {
+      "epoch": 1.4629969249330037,
+      "grad_norm": 0.12478487193584442,
+      "learning_rate": 0.00010889602069022198,
+      "loss": 0.2229,
+      "step": 52750
+    },
+    {
+      "epoch": 1.4643836518760682,
+      "grad_norm": 0.12064434587955475,
+      "learning_rate": 0.00010874689348898685,
+      "loss": 0.2216,
+      "step": 52800
+    },
+    {
+      "epoch": 1.4657703788191327,
+      "grad_norm": 0.1457664519548416,
+      "learning_rate": 0.00010859774668304321,
+      "loss": 0.225,
+      "step": 52850
+    },
+    {
+      "epoch": 1.467157105762197,
+      "grad_norm": 0.1602487713098526,
+      "learning_rate": 0.00010844858060667881,
+      "loss": 0.2242,
+      "step": 52900
+    },
+    {
+      "epoch": 1.4685438327052616,
+      "grad_norm": 0.12469302117824554,
+      "learning_rate": 0.00010829939559422464,
+      "loss": 0.2252,
+      "step": 52950
+    },
+    {
+      "epoch": 1.469930559648326,
+      "grad_norm": 0.1300072968006134,
+      "learning_rate": 0.00010815019198005407,
+      "loss": 0.2294,
+      "step": 53000
+    },
+    {
+      "epoch": 1.469930559648326,
+      "eval_loss": 0.2224954217672348,
+      "eval_runtime": 500.39,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 53000
+    },
+    {
+      "epoch": 1.4713172865913906,
+      "grad_norm": 0.11393048614263535,
+      "learning_rate": 0.00010800097009858226,
+      "loss": 0.2211,
+      "step": 53050
+    },
+    {
+      "epoch": 1.472704013534455,
+      "grad_norm": 0.11365407705307007,
+      "learning_rate": 0.00010785173028426525,
+      "loss": 0.2247,
+      "step": 53100
+    },
+    {
+      "epoch": 1.4740907404775194,
+      "grad_norm": 0.13408495485782623,
+      "learning_rate": 0.00010770247287159932,
+      "loss": 0.2243,
+      "step": 53150
+    },
+    {
+      "epoch": 1.475477467420584,
+      "grad_norm": 0.12727609276771545,
+      "learning_rate": 0.00010755319819512011,
+      "loss": 0.2314,
+      "step": 53200
+    },
+    {
+      "epoch": 1.4768641943636482,
+      "grad_norm": 0.11358857899904251,
+      "learning_rate": 0.00010740390658940205,
+      "loss": 0.224,
+      "step": 53250
+    },
+    {
+      "epoch": 1.4782509213067128,
+      "grad_norm": 0.13022546470165253,
+      "learning_rate": 0.00010725459838905748,
+      "loss": 0.2241,
+      "step": 53300
+    },
+    {
+      "epoch": 1.4796376482497773,
+      "grad_norm": 0.125900000333786,
+      "learning_rate": 0.00010710527392873587,
+      "loss": 0.2252,
+      "step": 53350
+    },
+    {
+      "epoch": 1.4810243751928418,
+      "grad_norm": 0.13671207427978516,
+      "learning_rate": 0.00010695593354312321,
+      "loss": 0.2249,
+      "step": 53400
+    },
+    {
+      "epoch": 1.482411102135906,
+      "grad_norm": 0.15761978924274445,
+      "learning_rate": 0.0001068065775669412,
+      "loss": 0.2255,
+      "step": 53450
+    },
+    {
+      "epoch": 1.4837978290789706,
+      "grad_norm": 0.1286703646183014,
+      "learning_rate": 0.00010665720633494641,
+      "loss": 0.2214,
+      "step": 53500
+    },
+    {
+      "epoch": 1.4851845560220351,
+      "grad_norm": 0.1332877278327942,
+      "learning_rate": 0.00010650782018192962,
+      "loss": 0.2226,
+      "step": 53550
+    },
+    {
+      "epoch": 1.4865712829650994,
+      "grad_norm": 0.13217736780643463,
+      "learning_rate": 0.00010635841944271511,
+      "loss": 0.2268,
+      "step": 53600
+    },
+    {
+      "epoch": 1.487958009908164,
+      "grad_norm": 0.12781362235546112,
+      "learning_rate": 0.0001062090044521598,
+      "loss": 0.2244,
+      "step": 53650
+    },
+    {
+      "epoch": 1.4893447368512285,
+      "grad_norm": 0.1170235425233841,
+      "learning_rate": 0.0001060595755451526,
+      "loss": 0.2278,
+      "step": 53700
+    },
+    {
+      "epoch": 1.490731463794293,
+      "grad_norm": 0.11187135428190231,
+      "learning_rate": 0.00010591013305661357,
+      "loss": 0.2257,
+      "step": 53750
+    },
+    {
+      "epoch": 1.4921181907373575,
+      "grad_norm": 0.11063262075185776,
+      "learning_rate": 0.00010576067732149315,
+      "loss": 0.2233,
+      "step": 53800
+    },
+    {
+      "epoch": 1.4935049176804218,
+      "grad_norm": 0.18011736869812012,
+      "learning_rate": 0.00010561120867477164,
+      "loss": 0.2236,
+      "step": 53850
+    },
+    {
+      "epoch": 1.4948916446234863,
+      "grad_norm": 0.11368495970964432,
+      "learning_rate": 0.00010546172745145812,
+      "loss": 0.2275,
+      "step": 53900
+    },
+    {
+      "epoch": 1.4962783715665506,
+      "grad_norm": 0.11830084025859833,
+      "learning_rate": 0.00010531223398658993,
+      "loss": 0.2243,
+      "step": 53950
+    },
+    {
+      "epoch": 1.4976650985096152,
+      "grad_norm": 0.129221111536026,
+      "learning_rate": 0.00010516272861523182,
+      "loss": 0.2249,
+      "step": 54000
+    },
+    {
+      "epoch": 1.4976650985096152,
+      "eval_loss": 0.222273588180542,
+      "eval_runtime": 500.3973,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 54000
+    },
+    {
+      "epoch": 1.4990518254526797,
+      "grad_norm": 0.1275608390569687,
+      "learning_rate": 0.00010501321167247526,
+      "loss": 0.2209,
+      "step": 54050
+    },
+    {
+      "epoch": 1.5004385523957442,
+      "grad_norm": 0.126673623919487,
+      "learning_rate": 0.0001048636834934376,
+      "loss": 0.2213,
+      "step": 54100
+    },
+    {
+      "epoch": 1.5018252793388087,
+      "grad_norm": 0.09999104589223862,
+      "learning_rate": 0.0001047141444132615,
+      "loss": 0.2253,
+      "step": 54150
+    },
+    {
+      "epoch": 1.503212006281873,
+      "grad_norm": 0.11277350038290024,
+      "learning_rate": 0.00010456459476711389,
+      "loss": 0.2246,
+      "step": 54200
+    },
+    {
+      "epoch": 1.5045987332249375,
+      "grad_norm": 0.1197759360074997,
+      "learning_rate": 0.00010441503489018545,
+      "loss": 0.2221,
+      "step": 54250
+    },
+    {
+      "epoch": 1.5059854601680018,
+      "grad_norm": 0.11321547627449036,
+      "learning_rate": 0.00010426546511768982,
+      "loss": 0.222,
+      "step": 54300
+    },
+    {
+      "epoch": 1.5073721871110664,
+      "grad_norm": 0.13402992486953735,
+      "learning_rate": 0.00010411588578486282,
+      "loss": 0.2201,
+      "step": 54350
+    },
+    {
+      "epoch": 1.5087589140541309,
+      "grad_norm": 0.11394736170768738,
+      "learning_rate": 0.00010396629722696163,
+      "loss": 0.2237,
+      "step": 54400
+    },
+    {
+      "epoch": 1.5101456409971954,
+      "grad_norm": 0.14211580157279968,
+      "learning_rate": 0.00010381669977926414,
+      "loss": 0.2238,
+      "step": 54450
+    },
+    {
+      "epoch": 1.51153236794026,
+      "grad_norm": 0.15154938399791718,
+      "learning_rate": 0.00010366709377706825,
+      "loss": 0.2225,
+      "step": 54500
+    },
+    {
+      "epoch": 1.5129190948833242,
+      "grad_norm": 0.14525644481182098,
+      "learning_rate": 0.00010351747955569088,
+      "loss": 0.2245,
+      "step": 54550
+    },
+    {
+      "epoch": 1.5143058218263887,
+      "grad_norm": 0.1171354353427887,
+      "learning_rate": 0.00010336785745046747,
+      "loss": 0.2209,
+      "step": 54600
+    },
+    {
+      "epoch": 1.515692548769453,
+      "grad_norm": 0.10528494417667389,
+      "learning_rate": 0.00010321822779675115,
+      "loss": 0.2247,
+      "step": 54650
+    },
+    {
+      "epoch": 1.5170792757125175,
+      "grad_norm": 0.13853302597999573,
+      "learning_rate": 0.00010306859092991188,
+      "loss": 0.2243,
+      "step": 54700
+    },
+    {
+      "epoch": 1.518466002655582,
+      "grad_norm": 0.15869873762130737,
+      "learning_rate": 0.00010291894718533585,
+      "loss": 0.2257,
+      "step": 54750
+    },
+    {
+      "epoch": 1.5198527295986466,
+      "grad_norm": 0.11144551634788513,
+      "learning_rate": 0.0001027692968984247,
+      "loss": 0.2249,
+      "step": 54800
+    },
+    {
+      "epoch": 1.521239456541711,
+      "grad_norm": 0.1125815361738205,
+      "learning_rate": 0.00010261964040459458,
+      "loss": 0.2243,
+      "step": 54850
+    },
+    {
+      "epoch": 1.5226261834847756,
+      "grad_norm": 0.11292921751737595,
+      "learning_rate": 0.00010246997803927576,
+      "loss": 0.2219,
+      "step": 54900
+    },
+    {
+      "epoch": 1.52401291042784,
+      "grad_norm": 0.1216253936290741,
+      "learning_rate": 0.00010232031013791152,
+      "loss": 0.223,
+      "step": 54950
+    },
+    {
+      "epoch": 1.5253996373709042,
+      "grad_norm": 0.12706224620342255,
+      "learning_rate": 0.00010217063703595761,
+      "loss": 0.2214,
+      "step": 55000
+    },
+    {
+      "epoch": 1.5253996373709042,
+      "eval_loss": 0.22219954431056976,
+      "eval_runtime": 500.7255,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 55000
+    },
+    {
+      "epoch": 1.5267863643139687,
+      "grad_norm": 0.12699660658836365,
+      "learning_rate": 0.0001020209590688814,
+      "loss": 0.2274,
+      "step": 55050
+    },
+    {
+      "epoch": 1.5281730912570333,
+      "grad_norm": 0.11792019754648209,
+      "learning_rate": 0.00010187127657216122,
+      "loss": 0.2263,
+      "step": 55100
+    },
+    {
+      "epoch": 1.5295598182000978,
+      "grad_norm": 0.11735875904560089,
+      "learning_rate": 0.00010172158988128548,
+      "loss": 0.224,
+      "step": 55150
+    },
+    {
+      "epoch": 1.5309465451431623,
+      "grad_norm": 0.1137237474322319,
+      "learning_rate": 0.00010157189933175203,
+      "loss": 0.2225,
+      "step": 55200
+    },
+    {
+      "epoch": 1.5323332720862268,
+      "grad_norm": 0.12405762076377869,
+      "learning_rate": 0.0001014222052590674,
+      "loss": 0.2183,
+      "step": 55250
+    },
+    {
+      "epoch": 1.5337199990292911,
+      "grad_norm": 0.1327371746301651,
+      "learning_rate": 0.00010127250799874596,
+      "loss": 0.2211,
+      "step": 55300
+    },
+    {
+      "epoch": 1.5351067259723556,
+      "grad_norm": 0.14912429451942444,
+      "learning_rate": 0.00010112280788630928,
+      "loss": 0.2236,
+      "step": 55350
+    },
+    {
+      "epoch": 1.53649345291542,
+      "grad_norm": 0.1276470422744751,
+      "learning_rate": 0.00010097310525728527,
+      "loss": 0.2264,
+      "step": 55400
+    },
+    {
+      "epoch": 1.5378801798584845,
+      "grad_norm": 0.157034233212471,
+      "learning_rate": 0.00010082340044720746,
+      "loss": 0.2252,
+      "step": 55450
+    },
+    {
+      "epoch": 1.539266906801549,
+      "grad_norm": 0.12907952070236206,
+      "learning_rate": 0.00010067369379161437,
+      "loss": 0.2252,
+      "step": 55500
+    },
+    {
+      "epoch": 1.5406536337446135,
+      "grad_norm": 0.12835729122161865,
+      "learning_rate": 0.00010052398562604856,
+      "loss": 0.2231,
+      "step": 55550
+    },
+    {
+      "epoch": 1.542040360687678,
+      "grad_norm": 0.10687188804149628,
+      "learning_rate": 0.00010037427628605604,
+      "loss": 0.2267,
+      "step": 55600
+    },
+    {
+      "epoch": 1.5434270876307423,
+      "grad_norm": 0.15926331281661987,
+      "learning_rate": 0.0001002245661071854,
+      "loss": 0.2252,
+      "step": 55650
+    },
+    {
+      "epoch": 1.5448138145738068,
+      "grad_norm": 0.1307857781648636,
+      "learning_rate": 0.00010007485542498716,
+      "loss": 0.222,
+      "step": 55700
+    },
+    {
+      "epoch": 1.5462005415168711,
+      "grad_norm": 0.13711467385292053,
+      "learning_rate": 9.992813879148622e-05,
+      "loss": 0.225,
+      "step": 55750
+    },
+    {
+      "epoch": 1.5475872684599357,
+      "grad_norm": 0.12018255889415741,
+      "learning_rate": 9.977842810264401e-05,
+      "loss": 0.2236,
+      "step": 55800
+    },
+    {
+      "epoch": 1.5489739954030002,
+      "grad_norm": 0.17128078639507294,
+      "learning_rate": 9.962871791041844e-05,
+      "loss": 0.2258,
+      "step": 55850
+    },
+    {
+      "epoch": 1.5503607223460647,
+      "grad_norm": 0.13206107914447784,
+      "learning_rate": 9.947900855035997e-05,
+      "loss": 0.2215,
+      "step": 55900
+    },
+    {
+      "epoch": 1.5517474492891292,
+      "grad_norm": 0.11228550225496292,
+      "learning_rate": 9.932930035801728e-05,
+      "loss": 0.2247,
+      "step": 55950
+    },
+    {
+      "epoch": 1.5531341762321935,
+      "grad_norm": 0.11278003454208374,
+      "learning_rate": 9.91795936689364e-05,
+      "loss": 0.2231,
+      "step": 56000
+    },
+    {
+      "epoch": 1.5531341762321935,
+      "eval_loss": 0.2218015044927597,
+      "eval_runtime": 500.47,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 56000
+    },
+    {
+      "epoch": 1.554520903175258,
+      "grad_norm": 0.12507981061935425,
+      "learning_rate": 9.903288289547497e-05,
+      "loss": 0.2251,
+      "step": 56050
+    },
+    {
+      "epoch": 1.5559076301183223,
+      "grad_norm": 0.109380342066288,
+      "learning_rate": 9.888318017276653e-05,
+      "loss": 0.2241,
+      "step": 56100
+    },
+    {
+      "epoch": 1.5572943570613869,
+      "grad_norm": 0.16204926371574402,
+      "learning_rate": 9.873347995322417e-05,
+      "loss": 0.2275,
+      "step": 56150
+    },
+    {
+      "epoch": 1.5586810840044514,
+      "grad_norm": 0.15255117416381836,
+      "learning_rate": 9.858378257237604e-05,
+      "loss": 0.2266,
+      "step": 56200
+    },
+    {
+      "epoch": 1.560067810947516,
+      "grad_norm": 0.10249169170856476,
+      "learning_rate": 9.843408836574402e-05,
+      "loss": 0.2281,
+      "step": 56250
+    },
+    {
+      "epoch": 1.5614545378905804,
+      "grad_norm": 0.11107359826564789,
+      "learning_rate": 9.828439766884277e-05,
+      "loss": 0.221,
+      "step": 56300
+    },
+    {
+      "epoch": 1.5628412648336447,
+      "grad_norm": 0.11863153427839279,
+      "learning_rate": 9.813471081717909e-05,
+      "loss": 0.2303,
+      "step": 56350
+    },
+    {
+      "epoch": 1.5642279917767092,
+      "grad_norm": 0.12891624867916107,
+      "learning_rate": 9.798502814625123e-05,
+      "loss": 0.2217,
+      "step": 56400
+    },
+    {
+      "epoch": 1.5656147187197735,
+      "grad_norm": 0.13360735774040222,
+      "learning_rate": 9.783534999154802e-05,
+      "loss": 0.2252,
+      "step": 56450
+    },
+    {
+      "epoch": 1.567001445662838,
+      "grad_norm": 0.13507115840911865,
+      "learning_rate": 9.768567668854817e-05,
+      "loss": 0.2259,
+      "step": 56500
+    },
+    {
+      "epoch": 1.5683881726059026,
+      "grad_norm": 0.11616547405719757,
+      "learning_rate": 9.753600857271952e-05,
+      "loss": 0.2235,
+      "step": 56550
+    },
+    {
+      "epoch": 1.569774899548967,
+      "grad_norm": 0.12262886762619019,
+      "learning_rate": 9.738634597951829e-05,
+      "loss": 0.2229,
+      "step": 56600
+    },
+    {
+      "epoch": 1.5711616264920316,
+      "grad_norm": 0.11454316973686218,
+      "learning_rate": 9.723668924438826e-05,
+      "loss": 0.2235,
+      "step": 56650
+    },
+    {
+      "epoch": 1.5725483534350961,
+      "grad_norm": 0.1089673787355423,
+      "learning_rate": 9.708703870276025e-05,
+      "loss": 0.2188,
+      "step": 56700
+    },
+    {
+      "epoch": 1.5739350803781604,
+      "grad_norm": 0.14894092082977295,
+      "learning_rate": 9.693739469005102e-05,
+      "loss": 0.2259,
+      "step": 56750
+    },
+    {
+      "epoch": 1.5753218073212247,
+      "grad_norm": 0.13055862486362457,
+      "learning_rate": 9.678775754166277e-05,
+      "loss": 0.2264,
+      "step": 56800
+    },
+    {
+      "epoch": 1.5767085342642893,
+      "grad_norm": 0.13678143918514252,
+      "learning_rate": 9.66381275929823e-05,
+      "loss": 0.2262,
+      "step": 56850
+    },
+    {
+      "epoch": 1.5780952612073538,
+      "grad_norm": 0.1054852306842804,
+      "learning_rate": 9.648850517938029e-05,
+      "loss": 0.224,
+      "step": 56900
+    },
+    {
+      "epoch": 1.5794819881504183,
+      "grad_norm": 0.13340288400650024,
+      "learning_rate": 9.633889063621053e-05,
+      "loss": 0.2204,
+      "step": 56950
+    },
+    {
+      "epoch": 1.5808687150934828,
+      "grad_norm": 0.1335633248090744,
+      "learning_rate": 9.618928429880915e-05,
+      "loss": 0.2255,
+      "step": 57000
+    },
+    {
+      "epoch": 1.5808687150934828,
+      "eval_loss": 0.22177007794380188,
+      "eval_runtime": 500.5574,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 57000
+    },
+    {
+      "epoch": 1.5822554420365473,
+      "grad_norm": 0.14497853815555573,
+      "learning_rate": 9.603968650249387e-05,
+      "loss": 0.2215,
+      "step": 57050
+    },
+    {
+      "epoch": 1.5836421689796116,
+      "grad_norm": 0.12169457972049713,
+      "learning_rate": 9.589009758256336e-05,
+      "loss": 0.2248,
+      "step": 57100
+    },
+    {
+      "epoch": 1.5850288959226762,
+      "grad_norm": 0.15703269839286804,
+      "learning_rate": 9.57405178742963e-05,
+      "loss": 0.2312,
+      "step": 57150
+    },
+    {
+      "epoch": 1.5864156228657404,
+      "grad_norm": 0.12093982100486755,
+      "learning_rate": 9.559094771295076e-05,
+      "loss": 0.2217,
+      "step": 57200
+    },
+    {
+      "epoch": 1.587802349808805,
+      "grad_norm": 0.11299122124910355,
+      "learning_rate": 9.544138743376341e-05,
+      "loss": 0.2225,
+      "step": 57250
+    },
+    {
+      "epoch": 1.5891890767518695,
+      "grad_norm": 0.12226125597953796,
+      "learning_rate": 9.529183737194875e-05,
+      "loss": 0.2235,
+      "step": 57300
+    },
+    {
+      "epoch": 1.590575803694934,
+      "grad_norm": 0.09305635094642639,
+      "learning_rate": 9.514229786269836e-05,
+      "loss": 0.2207,
+      "step": 57350
+    },
+    {
+      "epoch": 1.5919625306379985,
+      "grad_norm": 0.13961461186408997,
+      "learning_rate": 9.499276924118032e-05,
+      "loss": 0.2259,
+      "step": 57400
+    },
+    {
+      "epoch": 1.5933492575810628,
+      "grad_norm": 0.11887528002262115,
+      "learning_rate": 9.484325184253808e-05,
+      "loss": 0.2232,
+      "step": 57450
+    },
+    {
+      "epoch": 1.5947359845241273,
+      "grad_norm": 0.16336101293563843,
+      "learning_rate": 9.469374600189009e-05,
+      "loss": 0.2175,
+      "step": 57500
+    },
+    {
+      "epoch": 1.5961227114671916,
+      "grad_norm": 0.10530219227075577,
+      "learning_rate": 9.454425205432887e-05,
+      "loss": 0.222,
+      "step": 57550
+    },
+    {
+      "epoch": 1.5975094384102562,
+      "grad_norm": 0.10913381725549698,
+      "learning_rate": 9.439477033492027e-05,
+      "loss": 0.2242,
+      "step": 57600
+    },
+    {
+      "epoch": 1.5988961653533207,
+      "grad_norm": 0.12135528773069382,
+      "learning_rate": 9.424530117870271e-05,
+      "loss": 0.226,
+      "step": 57650
+    },
+    {
+      "epoch": 1.6002828922963852,
+      "grad_norm": 0.13996323943138123,
+      "learning_rate": 9.409584492068646e-05,
+      "loss": 0.224,
+      "step": 57700
+    },
+    {
+      "epoch": 1.6016696192394497,
+      "grad_norm": 0.12019480764865875,
+      "learning_rate": 9.394640189585291e-05,
+      "loss": 0.2251,
+      "step": 57750
+    },
+    {
+      "epoch": 1.603056346182514,
+      "grad_norm": 0.14267127215862274,
+      "learning_rate": 9.379697243915376e-05,
+      "loss": 0.2231,
+      "step": 57800
+    },
+    {
+      "epoch": 1.6044430731255785,
+      "grad_norm": 0.1371176540851593,
+      "learning_rate": 9.364755688551027e-05,
+      "loss": 0.2234,
+      "step": 57850
+    },
+    {
+      "epoch": 1.6058298000686428,
+      "grad_norm": 0.11706887930631638,
+      "learning_rate": 9.349815556981269e-05,
+      "loss": 0.2248,
+      "step": 57900
+    },
+    {
+      "epoch": 1.6072165270117074,
+      "grad_norm": 0.14251373708248138,
+      "learning_rate": 9.334876882691918e-05,
+      "loss": 0.2239,
+      "step": 57950
+    },
+    {
+      "epoch": 1.6086032539547719,
+      "grad_norm": 0.1372220814228058,
+      "learning_rate": 9.319939699165527e-05,
+      "loss": 0.2252,
+      "step": 58000
+    },
+    {
+      "epoch": 1.6086032539547719,
+      "eval_loss": 0.22149445116519928,
+      "eval_runtime": 500.9696,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 58000
+    },
+    {
+      "epoch": 1.6099899808978364,
+      "grad_norm": 0.10551794618368149,
+      "learning_rate": 9.305004039881319e-05,
+      "loss": 0.2268,
+      "step": 58050
+    },
+    {
+      "epoch": 1.611376707840901,
+      "grad_norm": 0.11267270147800446,
+      "learning_rate": 9.290069938315087e-05,
+      "loss": 0.2248,
+      "step": 58100
+    },
+    {
+      "epoch": 1.6127634347839652,
+      "grad_norm": 0.1272066831588745,
+      "learning_rate": 9.275137427939142e-05,
+      "loss": 0.2219,
+      "step": 58150
+    },
+    {
+      "epoch": 1.6141501617270297,
+      "grad_norm": 0.11539309471845627,
+      "learning_rate": 9.260206542222224e-05,
+      "loss": 0.223,
+      "step": 58200
+    },
+    {
+      "epoch": 1.615536888670094,
+      "grad_norm": 0.13046038150787354,
+      "learning_rate": 9.245277314629432e-05,
+      "loss": 0.2204,
+      "step": 58250
+    },
+    {
+      "epoch": 1.6169236156131586,
+      "grad_norm": 0.13138452172279358,
+      "learning_rate": 9.230349778622144e-05,
+      "loss": 0.2227,
+      "step": 58300
+    },
+    {
+      "epoch": 1.618310342556223,
+      "grad_norm": 0.12613414227962494,
+      "learning_rate": 9.215423967657963e-05,
+      "loss": 0.2218,
+      "step": 58350
+    },
+    {
+      "epoch": 1.6196970694992876,
+      "grad_norm": 0.11702366173267365,
+      "learning_rate": 9.200499915190609e-05,
+      "loss": 0.2256,
+      "step": 58400
+    },
+    {
+      "epoch": 1.6210837964423521,
+      "grad_norm": 0.16387322545051575,
+      "learning_rate": 9.185577654669866e-05,
+      "loss": 0.2261,
+      "step": 58450
+    },
+    {
+      "epoch": 1.6224705233854166,
+      "grad_norm": 0.1341577172279358,
+      "learning_rate": 9.1706572195415e-05,
+      "loss": 0.2225,
+      "step": 58500
+    },
+    {
+      "epoch": 1.623857250328481,
+      "grad_norm": 0.11689752340316772,
+      "learning_rate": 9.155738643247191e-05,
+      "loss": 0.2209,
+      "step": 58550
+    },
+    {
+      "epoch": 1.6252439772715452,
+      "grad_norm": 0.14046530425548553,
+      "learning_rate": 9.140821959224448e-05,
+      "loss": 0.224,
+      "step": 58600
+    },
+    {
+      "epoch": 1.6266307042146098,
+      "grad_norm": 0.13324092328548431,
+      "learning_rate": 9.125907200906539e-05,
+      "loss": 0.2229,
+      "step": 58650
+    },
+    {
+      "epoch": 1.6280174311576743,
+      "grad_norm": 0.10926762223243713,
+      "learning_rate": 9.110994401722413e-05,
+      "loss": 0.2231,
+      "step": 58700
+    },
+    {
+      "epoch": 1.6294041581007388,
+      "grad_norm": 0.144717276096344,
+      "learning_rate": 9.096083595096642e-05,
+      "loss": 0.2252,
+      "step": 58750
+    },
+    {
+      "epoch": 1.6307908850438033,
+      "grad_norm": 0.11615584790706635,
+      "learning_rate": 9.081174814449314e-05,
+      "loss": 0.2203,
+      "step": 58800
+    },
+    {
+      "epoch": 1.6321776119868678,
+      "grad_norm": 0.1326524317264557,
+      "learning_rate": 9.066268093195987e-05,
+      "loss": 0.2261,
+      "step": 58850
+    },
+    {
+      "epoch": 1.6335643389299321,
+      "grad_norm": 0.18373699486255646,
+      "learning_rate": 9.051363464747599e-05,
+      "loss": 0.2248,
+      "step": 58900
+    },
+    {
+      "epoch": 1.6349510658729967,
+      "grad_norm": 0.12923288345336914,
+      "learning_rate": 9.036460962510398e-05,
+      "loss": 0.2265,
+      "step": 58950
+    },
+    {
+      "epoch": 1.636337792816061,
+      "grad_norm": 0.12572887539863586,
+      "learning_rate": 9.021560619885865e-05,
+      "loss": 0.2273,
+      "step": 59000
+    },
+    {
+      "epoch": 1.636337792816061,
+      "eval_loss": 0.22130924463272095,
+      "eval_runtime": 501.6665,
+      "eval_samples_per_second": 5.695,
+      "eval_steps_per_second": 5.695,
+      "step": 59000
+    },
+    {
+      "epoch": 1.6377245197591255,
+      "grad_norm": 0.12783953547477722,
+      "learning_rate": 9.006662470270646e-05,
+      "loss": 0.2276,
+      "step": 59050
+    },
+    {
+      "epoch": 1.63911124670219,
+      "grad_norm": 0.1579081267118454,
+      "learning_rate": 8.991766547056464e-05,
+      "loss": 0.2258,
+      "step": 59100
+    },
+    {
+      "epoch": 1.6404979736452545,
+      "grad_norm": 0.13635234534740448,
+      "learning_rate": 8.976872883630062e-05,
+      "loss": 0.2187,
+      "step": 59150
+    },
+    {
+      "epoch": 1.641884700588319,
+      "grad_norm": 0.13230207562446594,
+      "learning_rate": 8.961981513373109e-05,
+      "loss": 0.22,
+      "step": 59200
+    },
+    {
+      "epoch": 1.6432714275313833,
+      "grad_norm": 0.13722644746303558,
+      "learning_rate": 8.947092469662137e-05,
+      "loss": 0.2231,
+      "step": 59250
+    },
+    {
+      "epoch": 1.6446581544744479,
+      "grad_norm": 0.10856415331363678,
+      "learning_rate": 8.932205785868466e-05,
+      "loss": 0.2182,
+      "step": 59300
+    },
+    {
+      "epoch": 1.6460448814175122,
+      "grad_norm": 0.11690490692853928,
+      "learning_rate": 8.91732149535812e-05,
+      "loss": 0.2179,
+      "step": 59350
+    },
+    {
+      "epoch": 1.6474316083605767,
+      "grad_norm": 0.12944932281970978,
+      "learning_rate": 8.902439631491768e-05,
+      "loss": 0.224,
+      "step": 59400
+    },
+    {
+      "epoch": 1.6488183353036412,
+      "grad_norm": 0.10274020582437515,
+      "learning_rate": 8.887560227624632e-05,
+      "loss": 0.2217,
+      "step": 59450
+    },
+    {
+      "epoch": 1.6502050622467057,
+      "grad_norm": 0.10526053607463837,
+      "learning_rate": 8.872683317106422e-05,
+      "loss": 0.2205,
+      "step": 59500
+    },
+    {
+      "epoch": 1.6515917891897702,
+      "grad_norm": 0.13197946548461914,
+      "learning_rate": 8.857808933281269e-05,
+      "loss": 0.2233,
+      "step": 59550
+    },
+    {
+      "epoch": 1.6529785161328345,
+      "grad_norm": 0.12400202453136444,
+      "learning_rate": 8.842937109487625e-05,
+      "loss": 0.2239,
+      "step": 59600
+    },
+    {
+      "epoch": 1.654365243075899,
+      "grad_norm": 0.13808733224868774,
+      "learning_rate": 8.828067879058219e-05,
+      "loss": 0.2239,
+      "step": 59650
+    },
+    {
+      "epoch": 1.6557519700189633,
+      "grad_norm": 0.1406649500131607,
+      "learning_rate": 8.813201275319957e-05,
+      "loss": 0.2234,
+      "step": 59700
+    },
+    {
+      "epoch": 1.6571386969620279,
+      "grad_norm": 0.13243602216243744,
+      "learning_rate": 8.798337331593862e-05,
+      "loss": 0.2255,
+      "step": 59750
+    },
+    {
+      "epoch": 1.6585254239050924,
+      "grad_norm": 0.12087827175855637,
+      "learning_rate": 8.783476081194993e-05,
+      "loss": 0.223,
+      "step": 59800
+    },
+    {
+      "epoch": 1.659912150848157,
+      "grad_norm": 0.12543760240077972,
+      "learning_rate": 8.768617557432374e-05,
+      "loss": 0.2232,
+      "step": 59850
+    },
+    {
+      "epoch": 1.6612988777912214,
+      "grad_norm": 0.11448033154010773,
+      "learning_rate": 8.753761793608915e-05,
+      "loss": 0.2214,
+      "step": 59900
+    },
+    {
+      "epoch": 1.662685604734286,
+      "grad_norm": 0.13189347088336945,
+      "learning_rate": 8.738908823021349e-05,
+      "loss": 0.2256,
+      "step": 59950
+    },
+    {
+      "epoch": 1.6640723316773502,
+      "grad_norm": 0.12486356496810913,
+      "learning_rate": 8.724058678960139e-05,
+      "loss": 0.2226,
+      "step": 60000
+    },
+    {
+      "epoch": 1.6640723316773502,
+      "eval_loss": 0.2211039811372757,
+      "eval_runtime": 501.6014,
+      "eval_samples_per_second": 5.696,
+      "eval_steps_per_second": 5.696,
+      "step": 60000
+    },
+    {
+      "epoch": 1.6654590586204145,
+      "grad_norm": 0.12641596794128418,
+      "learning_rate": 8.709211394709415e-05,
+      "loss": 0.2244,
+      "step": 60050
+    },
+    {
+      "epoch": 1.666845785563479,
+      "grad_norm": 0.14593897759914398,
+      "learning_rate": 8.694367003546897e-05,
+      "loss": 0.2257,
+      "step": 60100
+    },
+    {
+      "epoch": 1.6682325125065436,
+      "grad_norm": 0.11628296971321106,
+      "learning_rate": 8.679525538743825e-05,
+      "loss": 0.2253,
+      "step": 60150
+    },
+    {
+      "epoch": 1.669619239449608,
+      "grad_norm": 0.10328439623117447,
+      "learning_rate": 8.664687033564874e-05,
+      "loss": 0.2214,
+      "step": 60200
+    },
+    {
+      "epoch": 1.6710059663926726,
+      "grad_norm": 0.13650208711624146,
+      "learning_rate": 8.649851521268087e-05,
+      "loss": 0.2234,
+      "step": 60250
+    },
+    {
+      "epoch": 1.6723926933357371,
+      "grad_norm": 0.1255771368741989,
+      "learning_rate": 8.635019035104798e-05,
+      "loss": 0.2223,
+      "step": 60300
+    },
+    {
+      "epoch": 1.6737794202788014,
+      "grad_norm": 0.10108000785112381,
+      "learning_rate": 8.620189608319568e-05,
+      "loss": 0.2273,
+      "step": 60350
+    },
+    {
+      "epoch": 1.675166147221866,
+      "grad_norm": 0.11289294809103012,
+      "learning_rate": 8.605363274150089e-05,
+      "loss": 0.2252,
+      "step": 60400
+    },
+    {
+      "epoch": 1.6765528741649303,
+      "grad_norm": 0.18194100260734558,
+      "learning_rate": 8.590540065827126e-05,
+      "loss": 0.2287,
+      "step": 60450
+    },
+    {
+      "epoch": 1.6779396011079948,
+      "grad_norm": 0.1335909515619278,
+      "learning_rate": 8.575720016574438e-05,
+      "loss": 0.2223,
+      "step": 60500
+    },
+    {
+      "epoch": 1.6793263280510593,
+      "grad_norm": 0.17010542750358582,
+      "learning_rate": 8.561199465248794e-05,
+      "loss": 0.2241,
+      "step": 60550
+    },
+    {
+      "epoch": 1.6807130549941238,
+      "grad_norm": 0.14800529181957245,
+      "learning_rate": 8.546385768944199e-05,
+      "loss": 0.2253,
+      "step": 60600
+    },
+    {
+      "epoch": 1.6820997819371883,
+      "grad_norm": 0.138923779129982,
+      "learning_rate": 8.531575330674397e-05,
+      "loss": 0.2229,
+      "step": 60650
+    },
+    {
+      "epoch": 1.6834865088802526,
+      "grad_norm": 0.1428767740726471,
+      "learning_rate": 8.51676818363453e-05,
+      "loss": 0.2241,
+      "step": 60700
+    },
+    {
+      "epoch": 1.6848732358233172,
+      "grad_norm": 0.15303942561149597,
+      "learning_rate": 8.501964361012355e-05,
+      "loss": 0.2221,
+      "step": 60750
+    },
+    {
+      "epoch": 1.6862599627663815,
+      "grad_norm": 0.12015249580144882,
+      "learning_rate": 8.487163895988181e-05,
+      "loss": 0.2289,
+      "step": 60800
+    },
+    {
+      "epoch": 1.687646689709446,
+      "grad_norm": 0.1367703676223755,
+      "learning_rate": 8.472366821734789e-05,
+      "loss": 0.2264,
+      "step": 60850
+    },
+    {
+      "epoch": 1.6890334166525105,
+      "grad_norm": 0.14325445890426636,
+      "learning_rate": 8.457573171417366e-05,
+      "loss": 0.2247,
+      "step": 60900
+    },
+    {
+      "epoch": 1.690420143595575,
+      "grad_norm": 0.18521633744239807,
+      "learning_rate": 8.442782978193423e-05,
+      "loss": 0.2229,
+      "step": 60950
+    },
+    {
+      "epoch": 1.6918068705386395,
+      "grad_norm": 0.11883826553821564,
+      "learning_rate": 8.427996275212719e-05,
+      "loss": 0.2245,
+      "step": 61000
+    },
+    {
+      "epoch": 1.6918068705386395,
+      "eval_loss": 0.2209625393152237,
+      "eval_runtime": 501.1761,
+      "eval_samples_per_second": 5.701,
+      "eval_steps_per_second": 5.701,
+      "step": 61000
+    },
+    {
+      "epoch": 1.6931935974817038,
+      "grad_norm": 0.15255926549434662,
+      "learning_rate": 8.413213095617189e-05,
+      "loss": 0.2231,
+      "step": 61050
+    },
+    {
+      "epoch": 1.6945803244247684,
+      "grad_norm": 0.11666153371334076,
+      "learning_rate": 8.398433472540878e-05,
+      "loss": 0.2232,
+      "step": 61100
+    },
+    {
+      "epoch": 1.6959670513678327,
+      "grad_norm": 0.1510268598794937,
+      "learning_rate": 8.383657439109852e-05,
+      "loss": 0.2279,
+      "step": 61150
+    },
+    {
+      "epoch": 1.6973537783108972,
+      "grad_norm": 0.12121795862913132,
+      "learning_rate": 8.368885028442138e-05,
+      "loss": 0.218,
+      "step": 61200
+    },
+    {
+      "epoch": 1.6987405052539617,
+      "grad_norm": 0.1254698634147644,
+      "learning_rate": 8.354116273647637e-05,
+      "loss": 0.2244,
+      "step": 61250
+    },
+    {
+      "epoch": 1.7001272321970262,
+      "grad_norm": 0.1407916098833084,
+      "learning_rate": 8.339351207828064e-05,
+      "loss": 0.2235,
+      "step": 61300
+    },
+    {
+      "epoch": 1.7015139591400907,
+      "grad_norm": 0.16278910636901855,
+      "learning_rate": 8.324589864076858e-05,
+      "loss": 0.2218,
+      "step": 61350
+    },
+    {
+      "epoch": 1.702900686083155,
+      "grad_norm": 0.12832941114902496,
+      "learning_rate": 8.30983227547912e-05,
+      "loss": 0.2237,
+      "step": 61400
+    },
+    {
+      "epoch": 1.7042874130262196,
+      "grad_norm": 0.10571594536304474,
+      "learning_rate": 8.295078475111532e-05,
+      "loss": 0.2241,
+      "step": 61450
+    },
+    {
+      "epoch": 1.7056741399692839,
+      "grad_norm": 0.12712325155735016,
+      "learning_rate": 8.280328496042287e-05,
+      "loss": 0.2253,
+      "step": 61500
+    },
+    {
+      "epoch": 1.7070608669123484,
+      "grad_norm": 0.12702125310897827,
+      "learning_rate": 8.265582371331011e-05,
+      "loss": 0.2225,
+      "step": 61550
+    },
+    {
+      "epoch": 1.708447593855413,
+      "grad_norm": 0.10937677323818207,
+      "learning_rate": 8.250840134028694e-05,
+      "loss": 0.226,
+      "step": 61600
+    },
+    {
+      "epoch": 1.7098343207984774,
+      "grad_norm": 0.15084721148014069,
+      "learning_rate": 8.236101817177609e-05,
+      "loss": 0.224,
+      "step": 61650
+    },
+    {
+      "epoch": 1.711221047741542,
+      "grad_norm": 0.10948482900857925,
+      "learning_rate": 8.221367453811247e-05,
+      "loss": 0.2222,
+      "step": 61700
+    },
+    {
+      "epoch": 1.7126077746846065,
+      "grad_norm": 0.12602409720420837,
+      "learning_rate": 8.206637076954236e-05,
+      "loss": 0.2221,
+      "step": 61750
+    },
+    {
+      "epoch": 1.7139945016276708,
+      "grad_norm": 0.11353398859500885,
+      "learning_rate": 8.191910719622267e-05,
+      "loss": 0.219,
+      "step": 61800
+    },
+    {
+      "epoch": 1.715381228570735,
+      "grad_norm": 0.17882102727890015,
+      "learning_rate": 8.177188414822025e-05,
+      "loss": 0.2259,
+      "step": 61850
+    },
+    {
+      "epoch": 1.7167679555137996,
+      "grad_norm": 0.14457036554813385,
+      "learning_rate": 8.162470195551111e-05,
+      "loss": 0.2232,
+      "step": 61900
+    },
+    {
+      "epoch": 1.718154682456864,
+      "grad_norm": 0.13123807311058044,
+      "learning_rate": 8.147756094797964e-05,
+      "loss": 0.2189,
+      "step": 61950
+    },
+    {
+      "epoch": 1.7195414093999286,
+      "grad_norm": 0.13137926161289215,
+      "learning_rate": 8.133046145541801e-05,
+      "loss": 0.223,
+      "step": 62000
+    },
+    {
+      "epoch": 1.7195414093999286,
+      "eval_loss": 0.22088629007339478,
+      "eval_runtime": 501.7917,
+      "eval_samples_per_second": 5.694,
+      "eval_steps_per_second": 5.694,
+      "step": 62000
+    },
+    {
+      "epoch": 1.7209281363429931,
+      "grad_norm": 0.13826268911361694,
+      "learning_rate": 8.118340380752526e-05,
+      "loss": 0.2267,
+      "step": 62050
+    },
+    {
+      "epoch": 1.7223148632860577,
+      "grad_norm": 0.11485900729894638,
+      "learning_rate": 8.103638833390666e-05,
+      "loss": 0.2245,
+      "step": 62100
+    },
+    {
+      "epoch": 1.723701590229122,
+      "grad_norm": 0.14433616399765015,
+      "learning_rate": 8.088941536407302e-05,
+      "loss": 0.2254,
+      "step": 62150
+    },
+    {
+      "epoch": 1.7250883171721865,
+      "grad_norm": 0.15891560912132263,
+      "learning_rate": 8.07424852274398e-05,
+      "loss": 0.2259,
+      "step": 62200
+    },
+    {
+      "epoch": 1.7264750441152508,
+      "grad_norm": 0.15121401846408844,
+      "learning_rate": 8.059559825332653e-05,
+      "loss": 0.2238,
+      "step": 62250
+    },
+    {
+      "epoch": 1.7278617710583153,
+      "grad_norm": 0.1441587507724762,
+      "learning_rate": 8.044875477095589e-05,
+      "loss": 0.2222,
+      "step": 62300
+    },
+    {
+      "epoch": 1.7292484980013798,
+      "grad_norm": 0.1155286505818367,
+      "learning_rate": 8.03019551094532e-05,
+      "loss": 0.2264,
+      "step": 62350
+    },
+    {
+      "epoch": 1.7306352249444443,
+      "grad_norm": 0.11506728082895279,
+      "learning_rate": 8.01551995978455e-05,
+      "loss": 0.2168,
+      "step": 62400
+    },
+    {
+      "epoch": 1.7320219518875088,
+      "grad_norm": 0.11319868266582489,
+      "learning_rate": 8.00084885650609e-05,
+      "loss": 0.2199,
+      "step": 62450
+    },
+    {
+      "epoch": 1.7334086788305731,
+      "grad_norm": 0.11964423209428787,
+      "learning_rate": 7.986182233992773e-05,
+      "loss": 0.2293,
+      "step": 62500
+    },
+    {
+      "epoch": 1.7347954057736377,
+      "grad_norm": 0.14446033537387848,
+      "learning_rate": 7.971520125117408e-05,
+      "loss": 0.2209,
+      "step": 62550
+    },
+    {
+      "epoch": 1.736182132716702,
+      "grad_norm": 0.1368509829044342,
+      "learning_rate": 7.95686256274267e-05,
+      "loss": 0.2276,
+      "step": 62600
+    },
+    {
+      "epoch": 1.7375688596597665,
+      "grad_norm": 0.11722143739461899,
+      "learning_rate": 7.942209579721052e-05,
+      "loss": 0.222,
+      "step": 62650
+    },
+    {
+      "epoch": 1.738955586602831,
+      "grad_norm": 0.11798449605703354,
+      "learning_rate": 7.927561208894781e-05,
+      "loss": 0.2211,
+      "step": 62700
+    },
+    {
+      "epoch": 1.7403423135458955,
+      "grad_norm": 0.13891583681106567,
+      "learning_rate": 7.912917483095743e-05,
+      "loss": 0.2218,
+      "step": 62750
+    },
+    {
+      "epoch": 1.74172904048896,
+      "grad_norm": 0.11691787093877792,
+      "learning_rate": 7.898278435145419e-05,
+      "loss": 0.2216,
+      "step": 62800
+    },
+    {
+      "epoch": 1.7431157674320243,
+      "grad_norm": 0.1415584832429886,
+      "learning_rate": 7.883644097854802e-05,
+      "loss": 0.2185,
+      "step": 62850
+    },
+    {
+      "epoch": 1.7445024943750889,
+      "grad_norm": 0.14408282935619354,
+      "learning_rate": 7.869014504024328e-05,
+      "loss": 0.2261,
+      "step": 62900
+    },
+    {
+      "epoch": 1.7458892213181532,
+      "grad_norm": 0.11771035194396973,
+      "learning_rate": 7.8543896864438e-05,
+      "loss": 0.2207,
+      "step": 62950
+    },
+    {
+      "epoch": 1.7472759482612177,
+      "grad_norm": 0.12419497966766357,
+      "learning_rate": 7.839769677892322e-05,
+      "loss": 0.2201,
+      "step": 63000
+    },
+    {
+      "epoch": 1.7472759482612177,
+      "eval_loss": 0.22052037715911865,
+      "eval_runtime": 501.3675,
+      "eval_samples_per_second": 5.698,
+      "eval_steps_per_second": 5.698,
+      "step": 63000
+    },
+    {
+      "epoch": 1.7486626752042822,
+      "grad_norm": 0.13703902065753937,
+      "learning_rate": 7.825154511138208e-05,
+      "loss": 0.2231,
+      "step": 63050
+    },
+    {
+      "epoch": 1.7500494021473467,
+      "grad_norm": 0.131145641207695,
+      "learning_rate": 7.810544218938931e-05,
+      "loss": 0.2242,
+      "step": 63100
+    },
+    {
+      "epoch": 1.7514361290904112,
+      "grad_norm": 0.12763576209545135,
+      "learning_rate": 7.79593883404103e-05,
+      "loss": 0.2217,
+      "step": 63150
+    },
+    {
+      "epoch": 1.7528228560334755,
+      "grad_norm": 0.13587799668312073,
+      "learning_rate": 7.781338389180049e-05,
+      "loss": 0.2236,
+      "step": 63200
+    },
+    {
+      "epoch": 1.75420958297654,
+      "grad_norm": 0.1352778673171997,
+      "learning_rate": 7.766742917080461e-05,
+      "loss": 0.2255,
+      "step": 63250
+    },
+    {
+      "epoch": 1.7555963099196044,
+      "grad_norm": 0.13144232332706451,
+      "learning_rate": 7.752152450455587e-05,
+      "loss": 0.2224,
+      "step": 63300
+    },
+    {
+      "epoch": 1.7569830368626689,
+      "grad_norm": 0.13210804760456085,
+      "learning_rate": 7.737567022007541e-05,
+      "loss": 0.2226,
+      "step": 63350
+    },
+    {
+      "epoch": 1.7583697638057334,
+      "grad_norm": 0.12364811450242996,
+      "learning_rate": 7.722986664427134e-05,
+      "loss": 0.2266,
+      "step": 63400
+    },
+    {
+      "epoch": 1.759756490748798,
+      "grad_norm": 0.13110807538032532,
+      "learning_rate": 7.708411410393817e-05,
+      "loss": 0.2207,
+      "step": 63450
+    },
+    {
+      "epoch": 1.7611432176918624,
+      "grad_norm": 0.11447171866893768,
+      "learning_rate": 7.693841292575598e-05,
+      "loss": 0.2276,
+      "step": 63500
+    },
+    {
+      "epoch": 1.762529944634927,
+      "grad_norm": 0.13065434992313385,
+      "learning_rate": 7.679276343628978e-05,
+      "loss": 0.2215,
+      "step": 63550
+    },
+    {
+      "epoch": 1.7639166715779913,
+      "grad_norm": 0.13254573941230774,
+      "learning_rate": 7.664716596198869e-05,
+      "loss": 0.2217,
+      "step": 63600
+    },
+    {
+      "epoch": 1.7653033985210556,
+      "grad_norm": 0.12096452713012695,
+      "learning_rate": 7.650162082918525e-05,
+      "loss": 0.2226,
+      "step": 63650
+    },
+    {
+      "epoch": 1.76669012546412,
+      "grad_norm": 0.11871534585952759,
+      "learning_rate": 7.635612836409466e-05,
+      "loss": 0.2219,
+      "step": 63700
+    },
+    {
+      "epoch": 1.7680768524071846,
+      "grad_norm": 0.12863662838935852,
+      "learning_rate": 7.621068889281419e-05,
+      "loss": 0.2189,
+      "step": 63750
+    },
+    {
+      "epoch": 1.7694635793502491,
+      "grad_norm": 0.12479788810014725,
+      "learning_rate": 7.60653027413222e-05,
+      "loss": 0.224,
+      "step": 63800
+    },
+    {
+      "epoch": 1.7708503062933136,
+      "grad_norm": 0.11679526418447495,
+      "learning_rate": 7.591997023547763e-05,
+      "loss": 0.2266,
+      "step": 63850
+    },
+    {
+      "epoch": 1.7722370332363782,
+      "grad_norm": 0.13567224144935608,
+      "learning_rate": 7.577469170101908e-05,
+      "loss": 0.2249,
+      "step": 63900
+    },
+    {
+      "epoch": 1.7736237601794425,
+      "grad_norm": 0.11825359612703323,
+      "learning_rate": 7.562946746356432e-05,
+      "loss": 0.2164,
+      "step": 63950
+    },
+    {
+      "epoch": 1.775010487122507,
+      "grad_norm": 0.1773427426815033,
+      "learning_rate": 7.548429784860931e-05,
+      "loss": 0.2246,
+      "step": 64000
+    },
+    {
+      "epoch": 1.775010487122507,
+      "eval_loss": 0.22055774927139282,
+      "eval_runtime": 500.8502,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 64000
+    },
+    {
+      "epoch": 1.7763972140655713,
+      "grad_norm": 0.14357559382915497,
+      "learning_rate": 7.533918318152764e-05,
+      "loss": 0.2252,
+      "step": 64050
+    },
+    {
+      "epoch": 1.7777839410086358,
+      "grad_norm": 0.13654856383800507,
+      "learning_rate": 7.519412378756967e-05,
+      "loss": 0.2244,
+      "step": 64100
+    },
+    {
+      "epoch": 1.7791706679517003,
+      "grad_norm": 0.13761308789253235,
+      "learning_rate": 7.504911999186203e-05,
+      "loss": 0.2216,
+      "step": 64150
+    },
+    {
+      "epoch": 1.7805573948947648,
+      "grad_norm": 0.1746763437986374,
+      "learning_rate": 7.490707052670636e-05,
+      "loss": 0.2252,
+      "step": 64200
+    },
+    {
+      "epoch": 1.7819441218378294,
+      "grad_norm": 0.12090858817100525,
+      "learning_rate": 7.476217777423408e-05,
+      "loss": 0.2202,
+      "step": 64250
+    },
+    {
+      "epoch": 1.7833308487808937,
+      "grad_norm": 0.1337929368019104,
+      "learning_rate": 7.461734158814738e-05,
+      "loss": 0.2266,
+      "step": 64300
+    },
+    {
+      "epoch": 1.7847175757239582,
+      "grad_norm": 0.11186370998620987,
+      "learning_rate": 7.447256229307243e-05,
+      "loss": 0.2205,
+      "step": 64350
+    },
+    {
+      "epoch": 1.7861043026670225,
+      "grad_norm": 0.11708831042051315,
+      "learning_rate": 7.432784021350796e-05,
+      "loss": 0.2221,
+      "step": 64400
+    },
+    {
+      "epoch": 1.787491029610087,
+      "grad_norm": 0.13799749314785004,
+      "learning_rate": 7.418317567382446e-05,
+      "loss": 0.2222,
+      "step": 64450
+    },
+    {
+      "epoch": 1.7888777565531515,
+      "grad_norm": 0.12827137112617493,
+      "learning_rate": 7.403856899826352e-05,
+      "loss": 0.2204,
+      "step": 64500
+    },
+    {
+      "epoch": 1.790264483496216,
+      "grad_norm": 0.1483440101146698,
+      "learning_rate": 7.389402051093692e-05,
+      "loss": 0.2245,
+      "step": 64550
+    },
+    {
+      "epoch": 1.7916512104392806,
+      "grad_norm": 0.1483563780784607,
+      "learning_rate": 7.37495305358261e-05,
+      "loss": 0.2199,
+      "step": 64600
+    },
+    {
+      "epoch": 1.7930379373823448,
+      "grad_norm": 0.12296409904956818,
+      "learning_rate": 7.360509939678129e-05,
+      "loss": 0.2211,
+      "step": 64650
+    },
+    {
+      "epoch": 1.7944246643254094,
+      "grad_norm": 0.10277026891708374,
+      "learning_rate": 7.346072741752098e-05,
+      "loss": 0.22,
+      "step": 64700
+    },
+    {
+      "epoch": 1.7958113912684737,
+      "grad_norm": 0.11957128345966339,
+      "learning_rate": 7.331641492163092e-05,
+      "loss": 0.2218,
+      "step": 64750
+    },
+    {
+      "epoch": 1.7971981182115382,
+      "grad_norm": 0.1205180436372757,
+      "learning_rate": 7.317216223256362e-05,
+      "loss": 0.2238,
+      "step": 64800
+    },
+    {
+      "epoch": 1.7985848451546027,
+      "grad_norm": 0.12323558330535889,
+      "learning_rate": 7.302796967363748e-05,
+      "loss": 0.2238,
+      "step": 64850
+    },
+    {
+      "epoch": 1.7999715720976672,
+      "grad_norm": 0.1744375079870224,
+      "learning_rate": 7.288383756803618e-05,
+      "loss": 0.2236,
+      "step": 64900
+    },
+    {
+      "epoch": 1.8013582990407317,
+      "grad_norm": 0.11908925324678421,
+      "learning_rate": 7.27397662388079e-05,
+      "loss": 0.2199,
+      "step": 64950
+    },
+    {
+      "epoch": 1.802745025983796,
+      "grad_norm": 0.16596545279026031,
+      "learning_rate": 7.259575600886457e-05,
+      "loss": 0.2201,
+      "step": 65000
+    },
+    {
+      "epoch": 1.802745025983796,
+      "eval_loss": 0.22035908699035645,
+      "eval_runtime": 501.5752,
+      "eval_samples_per_second": 5.696,
+      "eval_steps_per_second": 5.696,
+      "step": 65000
+    },
+    {
+      "epoch": 1.8041317529268606,
+      "grad_norm": 0.10997340828180313,
+      "learning_rate": 7.245180720098122e-05,
+      "loss": 0.2226,
+      "step": 65050
+    },
+    {
+      "epoch": 1.8055184798699249,
+      "grad_norm": 0.13975024223327637,
+      "learning_rate": 7.230792013779512e-05,
+      "loss": 0.2227,
+      "step": 65100
+    },
+    {
+      "epoch": 1.8069052068129894,
+      "grad_norm": 0.13971950113773346,
+      "learning_rate": 7.216409514180532e-05,
+      "loss": 0.2287,
+      "step": 65150
+    },
+    {
+      "epoch": 1.808291933756054,
+      "grad_norm": 0.1235787570476532,
+      "learning_rate": 7.20203325353716e-05,
+      "loss": 0.2237,
+      "step": 65200
+    },
+    {
+      "epoch": 1.8096786606991184,
+      "grad_norm": 0.15590088069438934,
+      "learning_rate": 7.187663264071396e-05,
+      "loss": 0.222,
+      "step": 65250
+    },
+    {
+      "epoch": 1.811065387642183,
+      "grad_norm": 0.12113209813833237,
+      "learning_rate": 7.173299577991184e-05,
+      "loss": 0.2226,
+      "step": 65300
+    },
+    {
+      "epoch": 1.8124521145852475,
+      "grad_norm": 0.10521169751882553,
+      "learning_rate": 7.158942227490341e-05,
+      "loss": 0.2229,
+      "step": 65350
+    },
+    {
+      "epoch": 1.8138388415283118,
+      "grad_norm": 0.1138349398970604,
+      "learning_rate": 7.14459124474848e-05,
+      "loss": 0.2185,
+      "step": 65400
+    },
+    {
+      "epoch": 1.815225568471376,
+      "grad_norm": 0.11943230032920837,
+      "learning_rate": 7.130246661930945e-05,
+      "loss": 0.2202,
+      "step": 65450
+    },
+    {
+      "epoch": 1.8166122954144406,
+      "grad_norm": 0.12407558411359787,
+      "learning_rate": 7.115908511188736e-05,
+      "loss": 0.2209,
+      "step": 65500
+    },
+    {
+      "epoch": 1.817999022357505,
+      "grad_norm": 0.10693726688623428,
+      "learning_rate": 7.101576824658439e-05,
+      "loss": 0.2211,
+      "step": 65550
+    },
+    {
+      "epoch": 1.8193857493005696,
+      "grad_norm": 0.12432532757520676,
+      "learning_rate": 7.087251634462143e-05,
+      "loss": 0.2281,
+      "step": 65600
+    },
+    {
+      "epoch": 1.8207724762436341,
+      "grad_norm": 0.16990961134433746,
+      "learning_rate": 7.072932972707387e-05,
+      "loss": 0.2286,
+      "step": 65650
+    },
+    {
+      "epoch": 1.8221592031866987,
+      "grad_norm": 0.1222047433257103,
+      "learning_rate": 7.05862087148707e-05,
+      "loss": 0.2183,
+      "step": 65700
+    },
+    {
+      "epoch": 1.823545930129763,
+      "grad_norm": 0.13230758905410767,
+      "learning_rate": 7.044315362879388e-05,
+      "loss": 0.2244,
+      "step": 65750
+    },
+    {
+      "epoch": 1.8249326570728275,
+      "grad_norm": 0.11451806128025055,
+      "learning_rate": 7.030016478947762e-05,
+      "loss": 0.2208,
+      "step": 65800
+    },
+    {
+      "epoch": 1.8263193840158918,
+      "grad_norm": 0.1575726568698883,
+      "learning_rate": 7.015724251740766e-05,
+      "loss": 0.2233,
+      "step": 65850
+    },
+    {
+      "epoch": 1.8277061109589563,
+      "grad_norm": 0.11409944295883179,
+      "learning_rate": 7.001438713292047e-05,
+      "loss": 0.2232,
+      "step": 65900
+    },
+    {
+      "epoch": 1.8290928379020208,
+      "grad_norm": 0.1506188064813614,
+      "learning_rate": 6.987159895620277e-05,
+      "loss": 0.2272,
+      "step": 65950
+    },
+    {
+      "epoch": 1.8304795648450853,
+      "grad_norm": 0.12321347743272781,
+      "learning_rate": 6.972887830729048e-05,
+      "loss": 0.2225,
+      "step": 66000
+    },
+    {
+      "epoch": 1.8304795648450853,
+      "eval_loss": 0.22010773420333862,
+      "eval_runtime": 500.7778,
+      "eval_samples_per_second": 5.705,
+      "eval_steps_per_second": 5.705,
+      "step": 66000
+    },
+    {
+      "epoch": 1.8318662917881499,
+      "grad_norm": 0.10877233743667603,
+      "learning_rate": 6.958622550606821e-05,
+      "loss": 0.2208,
+      "step": 66050
+    },
+    {
+      "epoch": 1.8332530187312142,
+      "grad_norm": 0.12374629080295563,
+      "learning_rate": 6.944364087226851e-05,
+      "loss": 0.2198,
+      "step": 66100
+    },
+    {
+      "epoch": 1.8346397456742787,
+      "grad_norm": 0.12141034007072449,
+      "learning_rate": 6.930112472547118e-05,
+      "loss": 0.2202,
+      "step": 66150
+    },
+    {
+      "epoch": 1.836026472617343,
+      "grad_norm": 0.11637747287750244,
+      "learning_rate": 6.915867738510247e-05,
+      "loss": 0.2196,
+      "step": 66200
+    },
+    {
+      "epoch": 1.8374131995604075,
+      "grad_norm": 0.11352310329675674,
+      "learning_rate": 6.901629917043442e-05,
+      "loss": 0.2169,
+      "step": 66250
+    },
+    {
+      "epoch": 1.838799926503472,
+      "grad_norm": 0.13060016930103302,
+      "learning_rate": 6.887399040058408e-05,
+      "loss": 0.2259,
+      "step": 66300
+    },
+    {
+      "epoch": 1.8401866534465365,
+      "grad_norm": 0.13059192895889282,
+      "learning_rate": 6.873175139451306e-05,
+      "loss": 0.2235,
+      "step": 66350
+    },
+    {
+      "epoch": 1.841573380389601,
+      "grad_norm": 0.1436002552509308,
+      "learning_rate": 6.858958247102638e-05,
+      "loss": 0.2223,
+      "step": 66400
+    },
+    {
+      "epoch": 1.8429601073326654,
+      "grad_norm": 0.11226887255907059,
+      "learning_rate": 6.844748394877205e-05,
+      "loss": 0.2216,
+      "step": 66450
+    },
+    {
+      "epoch": 1.8443468342757299,
+      "grad_norm": 0.1210029125213623,
+      "learning_rate": 6.83054561462403e-05,
+      "loss": 0.2211,
+      "step": 66500
+    },
+    {
+      "epoch": 1.8457335612187942,
+      "grad_norm": 0.12150629609823227,
+      "learning_rate": 6.81634993817629e-05,
+      "loss": 0.2206,
+      "step": 66550
+    },
+    {
+      "epoch": 1.8471202881618587,
+      "grad_norm": 0.11927852034568787,
+      "learning_rate": 6.80216139735123e-05,
+      "loss": 0.2204,
+      "step": 66600
+    },
+    {
+      "epoch": 1.8485070151049232,
+      "grad_norm": 0.1343168169260025,
+      "learning_rate": 6.787980023950108e-05,
+      "loss": 0.2217,
+      "step": 66650
+    },
+    {
+      "epoch": 1.8498937420479877,
+      "grad_norm": 0.11947305500507355,
+      "learning_rate": 6.773805849758116e-05,
+      "loss": 0.2175,
+      "step": 66700
+    },
+    {
+      "epoch": 1.8512804689910523,
+      "grad_norm": 0.11362255364656448,
+      "learning_rate": 6.759638906544313e-05,
+      "loss": 0.2233,
+      "step": 66750
+    },
+    {
+      "epoch": 1.8526671959341166,
+      "grad_norm": 0.13201679289340973,
+      "learning_rate": 6.745479226061548e-05,
+      "loss": 0.22,
+      "step": 66800
+    },
+    {
+      "epoch": 1.854053922877181,
+      "grad_norm": 0.12694838643074036,
+      "learning_rate": 6.731326840046395e-05,
+      "loss": 0.2245,
+      "step": 66850
+    },
+    {
+      "epoch": 1.8554406498202454,
+      "grad_norm": 0.12625135481357574,
+      "learning_rate": 6.71718178021907e-05,
+      "loss": 0.226,
+      "step": 66900
+    },
+    {
+      "epoch": 1.85682737676331,
+      "grad_norm": 0.11247176676988602,
+      "learning_rate": 6.703044078283378e-05,
+      "loss": 0.2251,
+      "step": 66950
+    },
+    {
+      "epoch": 1.8582141037063744,
+      "grad_norm": 0.11113060265779495,
+      "learning_rate": 6.688913765926627e-05,
+      "loss": 0.2218,
+      "step": 67000
+    },
+    {
+      "epoch": 1.8582141037063744,
+      "eval_loss": 0.22002293169498444,
+      "eval_runtime": 500.6759,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 67000
+    },
+    {
+      "epoch": 1.859600830649439,
+      "grad_norm": 0.12426327913999557,
+      "learning_rate": 6.674790874819566e-05,
+      "loss": 0.2283,
+      "step": 67050
+    },
+    {
+      "epoch": 1.8609875575925034,
+      "grad_norm": 0.12066414952278137,
+      "learning_rate": 6.660675436616306e-05,
+      "loss": 0.2195,
+      "step": 67100
+    },
+    {
+      "epoch": 1.862374284535568,
+      "grad_norm": 0.1106274425983429,
+      "learning_rate": 6.646567482954262e-05,
+      "loss": 0.2203,
+      "step": 67150
+    },
+    {
+      "epoch": 1.8637610114786323,
+      "grad_norm": 0.15133588016033173,
+      "learning_rate": 6.632467045454068e-05,
+      "loss": 0.2222,
+      "step": 67200
+    },
+    {
+      "epoch": 1.8651477384216966,
+      "grad_norm": 0.1420869678258896,
+      "learning_rate": 6.618374155719507e-05,
+      "loss": 0.2216,
+      "step": 67250
+    },
+    {
+      "epoch": 1.866534465364761,
+      "grad_norm": 0.1180604100227356,
+      "learning_rate": 6.604288845337453e-05,
+      "loss": 0.2234,
+      "step": 67300
+    },
+    {
+      "epoch": 1.8679211923078256,
+      "grad_norm": 0.1143430843949318,
+      "learning_rate": 6.59021114587779e-05,
+      "loss": 0.2215,
+      "step": 67350
+    },
+    {
+      "epoch": 1.8693079192508901,
+      "grad_norm": 0.11402673274278641,
+      "learning_rate": 6.57614108889334e-05,
+      "loss": 0.2195,
+      "step": 67400
+    },
+    {
+      "epoch": 1.8706946461939546,
+      "grad_norm": 0.11139002442359924,
+      "learning_rate": 6.5620787059198e-05,
+      "loss": 0.2245,
+      "step": 67450
+    },
+    {
+      "epoch": 1.8720813731370192,
+      "grad_norm": 0.12388894706964493,
+      "learning_rate": 6.548024028475661e-05,
+      "loss": 0.2223,
+      "step": 67500
+    },
+    {
+      "epoch": 1.8734681000800835,
+      "grad_norm": 0.1154908537864685,
+      "learning_rate": 6.533977088062155e-05,
+      "loss": 0.2255,
+      "step": 67550
+    },
+    {
+      "epoch": 1.874854827023148,
+      "grad_norm": 0.13296931982040405,
+      "learning_rate": 6.519937916163161e-05,
+      "loss": 0.2223,
+      "step": 67600
+    },
+    {
+      "epoch": 1.8762415539662123,
+      "grad_norm": 0.1450251191854477,
+      "learning_rate": 6.505906544245151e-05,
+      "loss": 0.2235,
+      "step": 67650
+    },
+    {
+      "epoch": 1.8776282809092768,
+      "grad_norm": 0.13592007756233215,
+      "learning_rate": 6.491883003757108e-05,
+      "loss": 0.2212,
+      "step": 67700
+    },
+    {
+      "epoch": 1.8790150078523413,
+      "grad_norm": 0.12879903614521027,
+      "learning_rate": 6.47786732613048e-05,
+      "loss": 0.2212,
+      "step": 67750
+    },
+    {
+      "epoch": 1.8804017347954058,
+      "grad_norm": 0.11880529671907425,
+      "learning_rate": 6.463859542779072e-05,
+      "loss": 0.2249,
+      "step": 67800
+    },
+    {
+      "epoch": 1.8817884617384704,
+      "grad_norm": 0.09874891489744186,
+      "learning_rate": 6.449859685099002e-05,
+      "loss": 0.2205,
+      "step": 67850
+    },
+    {
+      "epoch": 1.8831751886815347,
+      "grad_norm": 0.12535731494426727,
+      "learning_rate": 6.43586778446863e-05,
+      "loss": 0.2236,
+      "step": 67900
+    },
+    {
+      "epoch": 1.8845619156245992,
+      "grad_norm": 0.10726474970579147,
+      "learning_rate": 6.42188387224847e-05,
+      "loss": 0.2228,
+      "step": 67950
+    },
+    {
+      "epoch": 1.8859486425676635,
+      "grad_norm": 0.1287633180618286,
+      "learning_rate": 6.407907979781145e-05,
+      "loss": 0.2181,
+      "step": 68000
+    },
+    {
+      "epoch": 1.8859486425676635,
+      "eval_loss": 0.2199305146932602,
+      "eval_runtime": 500.7534,
+      "eval_samples_per_second": 5.705,
+      "eval_steps_per_second": 5.705,
+      "step": 68000
+    },
+    {
+      "epoch": 1.887335369510728,
+      "grad_norm": 0.09685543924570084,
+      "learning_rate": 6.393940138391295e-05,
+      "loss": 0.2223,
+      "step": 68050
+    },
+    {
+      "epoch": 1.8887220964537925,
+      "grad_norm": 0.18234391510486603,
+      "learning_rate": 6.379980379385513e-05,
+      "loss": 0.2216,
+      "step": 68100
+    },
+    {
+      "epoch": 1.890108823396857,
+      "grad_norm": 0.11249957233667374,
+      "learning_rate": 6.366028734052279e-05,
+      "loss": 0.2183,
+      "step": 68150
+    },
+    {
+      "epoch": 1.8914955503399216,
+      "grad_norm": 0.14707712829113007,
+      "learning_rate": 6.35208523366189e-05,
+      "loss": 0.2239,
+      "step": 68200
+    },
+    {
+      "epoch": 1.8928822772829859,
+      "grad_norm": 0.1354626715183258,
+      "learning_rate": 6.338149909466387e-05,
+      "loss": 0.2211,
+      "step": 68250
+    },
+    {
+      "epoch": 1.8942690042260504,
+      "grad_norm": 0.11492297798395157,
+      "learning_rate": 6.324222792699481e-05,
+      "loss": 0.2233,
+      "step": 68300
+    },
+    {
+      "epoch": 1.8956557311691147,
+      "grad_norm": 0.15535590052604675,
+      "learning_rate": 6.310303914576487e-05,
+      "loss": 0.215,
+      "step": 68350
+    },
+    {
+      "epoch": 1.8970424581121792,
+      "grad_norm": 0.12155631929636002,
+      "learning_rate": 6.296393306294268e-05,
+      "loss": 0.2186,
+      "step": 68400
+    },
+    {
+      "epoch": 1.8984291850552437,
+      "grad_norm": 0.16767702996730804,
+      "learning_rate": 6.282490999031134e-05,
+      "loss": 0.2221,
+      "step": 68450
+    },
+    {
+      "epoch": 1.8998159119983082,
+      "grad_norm": 0.14361165463924408,
+      "learning_rate": 6.2685970239468e-05,
+      "loss": 0.2254,
+      "step": 68500
+    },
+    {
+      "epoch": 1.9012026389413728,
+      "grad_norm": 0.1382211595773697,
+      "learning_rate": 6.254711412182303e-05,
+      "loss": 0.2227,
+      "step": 68550
+    },
+    {
+      "epoch": 1.902589365884437,
+      "grad_norm": 0.1300189197063446,
+      "learning_rate": 6.240834194859931e-05,
+      "loss": 0.2181,
+      "step": 68600
+    },
+    {
+      "epoch": 1.9039760928275016,
+      "grad_norm": 0.1218055710196495,
+      "learning_rate": 6.227242696147264e-05,
+      "loss": 0.2234,
+      "step": 68650
+    },
+    {
+      "epoch": 1.9053628197705659,
+      "grad_norm": 0.14510662853717804,
+      "learning_rate": 6.213382191563584e-05,
+      "loss": 0.2212,
+      "step": 68700
+    },
+    {
+      "epoch": 1.9067495467136304,
+      "grad_norm": 0.1312136948108673,
+      "learning_rate": 6.19953017405461e-05,
+      "loss": 0.22,
+      "step": 68750
+    },
+    {
+      "epoch": 1.908136273656695,
+      "grad_norm": 0.1256617307662964,
+      "learning_rate": 6.185686674667344e-05,
+      "loss": 0.2247,
+      "step": 68800
+    },
+    {
+      "epoch": 1.9095230005997594,
+      "grad_norm": 0.15276198089122772,
+      "learning_rate": 6.171851724429687e-05,
+      "loss": 0.2203,
+      "step": 68850
+    },
+    {
+      "epoch": 1.910909727542824,
+      "grad_norm": 0.1289031058549881,
+      "learning_rate": 6.158025354350377e-05,
+      "loss": 0.2185,
+      "step": 68900
+    },
+    {
+      "epoch": 1.9122964544858885,
+      "grad_norm": 0.14539048075675964,
+      "learning_rate": 6.144207595418932e-05,
+      "loss": 0.2226,
+      "step": 68950
+    },
+    {
+      "epoch": 1.9136831814289528,
+      "grad_norm": 0.12622995674610138,
+      "learning_rate": 6.130398478605562e-05,
+      "loss": 0.2212,
+      "step": 69000
+    },
+    {
+      "epoch": 1.9136831814289528,
+      "eval_loss": 0.21972130239009857,
+      "eval_runtime": 500.141,
+      "eval_samples_per_second": 5.712,
+      "eval_steps_per_second": 5.712,
+      "step": 69000
+    },
+    {
+      "epoch": 1.915069908372017,
+      "grad_norm": 0.1135367900133133,
+      "learning_rate": 6.116598034861105e-05,
+      "loss": 0.2255,
+      "step": 69050
+    },
+    {
+      "epoch": 1.9164566353150816,
+      "grad_norm": 0.12307292968034744,
+      "learning_rate": 6.102806295116965e-05,
+      "loss": 0.2236,
+      "step": 69100
+    },
+    {
+      "epoch": 1.9178433622581461,
+      "grad_norm": 0.15067149698734283,
+      "learning_rate": 6.089023290285036e-05,
+      "loss": 0.2245,
+      "step": 69150
+    },
+    {
+      "epoch": 1.9192300892012106,
+      "grad_norm": 0.13182760775089264,
+      "learning_rate": 6.075249051257632e-05,
+      "loss": 0.2234,
+      "step": 69200
+    },
+    {
+      "epoch": 1.9206168161442752,
+      "grad_norm": 0.13820376992225647,
+      "learning_rate": 6.061483608907419e-05,
+      "loss": 0.2232,
+      "step": 69250
+    },
+    {
+      "epoch": 1.9220035430873397,
+      "grad_norm": 0.12792882323265076,
+      "learning_rate": 6.0477269940873505e-05,
+      "loss": 0.2184,
+      "step": 69300
+    },
+    {
+      "epoch": 1.923390270030404,
+      "grad_norm": 0.11212699115276337,
+      "learning_rate": 6.0339792376305974e-05,
+      "loss": 0.2206,
+      "step": 69350
+    },
+    {
+      "epoch": 1.9247769969734685,
+      "grad_norm": 0.12071092426776886,
+      "learning_rate": 6.020240370350465e-05,
+      "loss": 0.2214,
+      "step": 69400
+    },
+    {
+      "epoch": 1.9261637239165328,
+      "grad_norm": 0.14231905341148376,
+      "learning_rate": 6.006510423040349e-05,
+      "loss": 0.228,
+      "step": 69450
+    },
+    {
+      "epoch": 1.9275504508595973,
+      "grad_norm": 0.13944728672504425,
+      "learning_rate": 5.99278942647364e-05,
+      "loss": 0.2199,
+      "step": 69500
+    },
+    {
+      "epoch": 1.9289371778026618,
+      "grad_norm": 0.12536093592643738,
+      "learning_rate": 5.979077411403675e-05,
+      "loss": 0.2242,
+      "step": 69550
+    },
+    {
+      "epoch": 1.9303239047457263,
+      "grad_norm": 0.11361142992973328,
+      "learning_rate": 5.965374408563655e-05,
+      "loss": 0.2275,
+      "step": 69600
+    },
+    {
+      "epoch": 1.9317106316887909,
+      "grad_norm": 0.13786083459854126,
+      "learning_rate": 5.9516804486665866e-05,
+      "loss": 0.2276,
+      "step": 69650
+    },
+    {
+      "epoch": 1.9330973586318552,
+      "grad_norm": 0.11472176760435104,
+      "learning_rate": 5.9379955624052006e-05,
+      "loss": 0.2235,
+      "step": 69700
+    },
+    {
+      "epoch": 1.9344840855749197,
+      "grad_norm": 0.17608420550823212,
+      "learning_rate": 5.9243197804519036e-05,
+      "loss": 0.2197,
+      "step": 69750
+    },
+    {
+      "epoch": 1.935870812517984,
+      "grad_norm": 0.12841780483722687,
+      "learning_rate": 5.9106531334586856e-05,
+      "loss": 0.2228,
+      "step": 69800
+    },
+    {
+      "epoch": 1.9372575394610485,
+      "grad_norm": 0.17407533526420593,
+      "learning_rate": 5.8969956520570646e-05,
+      "loss": 0.221,
+      "step": 69850
+    },
+    {
+      "epoch": 1.938644266404113,
+      "grad_norm": 0.12569575011730194,
+      "learning_rate": 5.883347366858014e-05,
+      "loss": 0.2228,
+      "step": 69900
+    },
+    {
+      "epoch": 1.9400309933471775,
+      "grad_norm": 0.1247481182217598,
+      "learning_rate": 5.8697083084519025e-05,
+      "loss": 0.2252,
+      "step": 69950
+    },
+    {
+      "epoch": 1.941417720290242,
+      "grad_norm": 0.15431839227676392,
+      "learning_rate": 5.85607850740841e-05,
+      "loss": 0.22,
+      "step": 70000
+    },
+    {
+      "epoch": 1.941417720290242,
+      "eval_loss": 0.21946103870868683,
+      "eval_runtime": 500.6645,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 70000
+    },
+    {
+      "epoch": 1.9428044472333064,
+      "grad_norm": 0.13469503819942474,
+      "learning_rate": 5.842457994276473e-05,
+      "loss": 0.2178,
+      "step": 70050
+    },
+    {
+      "epoch": 1.9441911741763709,
+      "grad_norm": 0.103294737637043,
+      "learning_rate": 5.828846799584203e-05,
+      "loss": 0.2227,
+      "step": 70100
+    },
+    {
+      "epoch": 1.9455779011194352,
+      "grad_norm": 0.12273615598678589,
+      "learning_rate": 5.815244953838843e-05,
+      "loss": 0.2227,
+      "step": 70150
+    },
+    {
+      "epoch": 1.9469646280624997,
+      "grad_norm": 0.1339019238948822,
+      "learning_rate": 5.80165248752666e-05,
+      "loss": 0.215,
+      "step": 70200
+    },
+    {
+      "epoch": 1.9483513550055642,
+      "grad_norm": 0.10198177397251129,
+      "learning_rate": 5.788069431112913e-05,
+      "loss": 0.2222,
+      "step": 70250
+    },
+    {
+      "epoch": 1.9497380819486287,
+      "grad_norm": 0.14309309422969818,
+      "learning_rate": 5.7744958150417685e-05,
+      "loss": 0.223,
+      "step": 70300
+    },
+    {
+      "epoch": 1.9511248088916933,
+      "grad_norm": 0.11085857450962067,
+      "learning_rate": 5.760931669736226e-05,
+      "loss": 0.2246,
+      "step": 70350
+    },
+    {
+      "epoch": 1.9525115358347578,
+      "grad_norm": 0.13969945907592773,
+      "learning_rate": 5.7473770255980705e-05,
+      "loss": 0.2223,
+      "step": 70400
+    },
+    {
+      "epoch": 1.953898262777822,
+      "grad_norm": 0.14510655403137207,
+      "learning_rate": 5.734102721654016e-05,
+      "loss": 0.221,
+      "step": 70450
+    },
+    {
+      "epoch": 1.9552849897208864,
+      "grad_norm": 0.12342046946287155,
+      "learning_rate": 5.720566979435193e-05,
+      "loss": 0.2223,
+      "step": 70500
+    },
+    {
+      "epoch": 1.956671716663951,
+      "grad_norm": 0.1510363519191742,
+      "learning_rate": 5.707040828854496e-05,
+      "loss": 0.2236,
+      "step": 70550
+    },
+    {
+      "epoch": 1.9580584436070154,
+      "grad_norm": 0.1418016403913498,
+      "learning_rate": 5.6935243002285547e-05,
+      "loss": 0.2258,
+      "step": 70600
+    },
+    {
+      "epoch": 1.95944517055008,
+      "grad_norm": 0.12346237152814865,
+      "learning_rate": 5.680017423852406e-05,
+      "loss": 0.2184,
+      "step": 70650
+    },
+    {
+      "epoch": 1.9608318974931445,
+      "grad_norm": 0.1376170963048935,
+      "learning_rate": 5.666520229999489e-05,
+      "loss": 0.2241,
+      "step": 70700
+    },
+    {
+      "epoch": 1.962218624436209,
+      "grad_norm": 0.13145020604133606,
+      "learning_rate": 5.6530327489215084e-05,
+      "loss": 0.2245,
+      "step": 70750
+    },
+    {
+      "epoch": 1.9636053513792733,
+      "grad_norm": 0.14705248177051544,
+      "learning_rate": 5.639555010848416e-05,
+      "loss": 0.2217,
+      "step": 70800
+    },
+    {
+      "epoch": 1.9649920783223378,
+      "grad_norm": 0.1190200001001358,
+      "learning_rate": 5.6260870459883264e-05,
+      "loss": 0.2212,
+      "step": 70850
+    },
+    {
+      "epoch": 1.966378805265402,
+      "grad_norm": 0.1403389275074005,
+      "learning_rate": 5.612628884527436e-05,
+      "loss": 0.2227,
+      "step": 70900
+    },
+    {
+      "epoch": 1.9677655322084666,
+      "grad_norm": 0.10089599341154099,
+      "learning_rate": 5.5991805566299884e-05,
+      "loss": 0.2242,
+      "step": 70950
+    },
+    {
+      "epoch": 1.9691522591515311,
+      "grad_norm": 0.1129344254732132,
+      "learning_rate": 5.5857420924381665e-05,
+      "loss": 0.218,
+      "step": 71000
+    },
+    {
+      "epoch": 1.9691522591515311,
+      "eval_loss": 0.21950487792491913,
+      "eval_runtime": 500.1742,
+      "eval_samples_per_second": 5.712,
+      "eval_steps_per_second": 5.712,
+      "step": 71000
+    },
+    {
+      "epoch": 1.9705389860945957,
+      "grad_norm": 0.11886442452669144,
+      "learning_rate": 5.57231352207206e-05,
+      "loss": 0.2208,
+      "step": 71050
+    },
+    {
+      "epoch": 1.9719257130376602,
+      "grad_norm": 0.13062913715839386,
+      "learning_rate": 5.5588948756295787e-05,
+      "loss": 0.222,
+      "step": 71100
+    },
+    {
+      "epoch": 1.9733124399807245,
+      "grad_norm": 0.11810411512851715,
+      "learning_rate": 5.5454861831863905e-05,
+      "loss": 0.2198,
+      "step": 71150
+    },
+    {
+      "epoch": 1.974699166923789,
+      "grad_norm": 0.1369456797838211,
+      "learning_rate": 5.5320874747958475e-05,
+      "loss": 0.2224,
+      "step": 71200
+    },
+    {
+      "epoch": 1.9760858938668533,
+      "grad_norm": 0.1086314469575882,
+      "learning_rate": 5.51869878048893e-05,
+      "loss": 0.2207,
+      "step": 71250
+    },
+    {
+      "epoch": 1.9774726208099178,
+      "grad_norm": 0.13363894820213318,
+      "learning_rate": 5.5053201302741765e-05,
+      "loss": 0.2219,
+      "step": 71300
+    },
+    {
+      "epoch": 1.9788593477529823,
+      "grad_norm": 0.12869518995285034,
+      "learning_rate": 5.491951554137602e-05,
+      "loss": 0.223,
+      "step": 71350
+    },
+    {
+      "epoch": 1.9802460746960469,
+      "grad_norm": 0.11446097493171692,
+      "learning_rate": 5.478593082042655e-05,
+      "loss": 0.22,
+      "step": 71400
+    },
+    {
+      "epoch": 1.9816328016391114,
+      "grad_norm": 0.1284862756729126,
+      "learning_rate": 5.4652447439301204e-05,
+      "loss": 0.219,
+      "step": 71450
+    },
+    {
+      "epoch": 1.9830195285821757,
+      "grad_norm": 0.1428086757659912,
+      "learning_rate": 5.451906569718095e-05,
+      "loss": 0.2222,
+      "step": 71500
+    },
+    {
+      "epoch": 1.9844062555252402,
+      "grad_norm": 0.15157289803028107,
+      "learning_rate": 5.43857858930187e-05,
+      "loss": 0.2199,
+      "step": 71550
+    },
+    {
+      "epoch": 1.9857929824683045,
+      "grad_norm": 0.12832427024841309,
+      "learning_rate": 5.4252608325539066e-05,
+      "loss": 0.2248,
+      "step": 71600
+    },
+    {
+      "epoch": 1.987179709411369,
+      "grad_norm": 0.16493399441242218,
+      "learning_rate": 5.411953329323736e-05,
+      "loss": 0.2212,
+      "step": 71650
+    },
+    {
+      "epoch": 1.9885664363544335,
+      "grad_norm": 0.1472720503807068,
+      "learning_rate": 5.3986561094379226e-05,
+      "loss": 0.2229,
+      "step": 71700
+    },
+    {
+      "epoch": 1.989953163297498,
+      "grad_norm": 0.17510546743869781,
+      "learning_rate": 5.3853692026999704e-05,
+      "loss": 0.2217,
+      "step": 71750
+    },
+    {
+      "epoch": 1.9913398902405626,
+      "grad_norm": 0.12741264700889587,
+      "learning_rate": 5.372092638890274e-05,
+      "loss": 0.2205,
+      "step": 71800
+    },
+    {
+      "epoch": 1.9927266171836269,
+      "grad_norm": 0.13984480500221252,
+      "learning_rate": 5.358826447766052e-05,
+      "loss": 0.218,
+      "step": 71850
+    },
+    {
+      "epoch": 1.9941133441266914,
+      "grad_norm": 0.14489765465259552,
+      "learning_rate": 5.345570659061254e-05,
+      "loss": 0.2256,
+      "step": 71900
+    },
+    {
+      "epoch": 1.9955000710697557,
+      "grad_norm": 0.11793538182973862,
+      "learning_rate": 5.332325302486545e-05,
+      "loss": 0.2214,
+      "step": 71950
+    },
+    {
+      "epoch": 1.9968867980128202,
+      "grad_norm": 0.11862395703792572,
+      "learning_rate": 5.3190904077291794e-05,
+      "loss": 0.2185,
+      "step": 72000
+    },
+    {
+      "epoch": 1.9968867980128202,
+      "eval_loss": 0.21919070184230804,
+      "eval_runtime": 500.6392,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 72000
+    },
+    {
+      "epoch": 1.9982735249558847,
+      "grad_norm": 0.14340032637119293,
+      "learning_rate": 5.305866004452982e-05,
+      "loss": 0.2204,
+      "step": 72050
+    },
+    {
+      "epoch": 1.9996602518989492,
+      "grad_norm": 0.10452345758676529,
+      "learning_rate": 5.2926521222982494e-05,
+      "loss": 0.2243,
+      "step": 72100
+    },
+    {
+      "epoch": 2.0010469788420138,
+      "grad_norm": 0.1440684199333191,
+      "learning_rate": 5.279448790881709e-05,
+      "loss": 0.2194,
+      "step": 72150
+    },
+    {
+      "epoch": 2.0024337057850783,
+      "grad_norm": 0.1383608728647232,
+      "learning_rate": 5.2662560397964265e-05,
+      "loss": 0.2185,
+      "step": 72200
+    },
+    {
+      "epoch": 2.0038204327281424,
+      "grad_norm": 0.12105223536491394,
+      "learning_rate": 5.253073898611769e-05,
+      "loss": 0.2176,
+      "step": 72250
+    },
+    {
+      "epoch": 2.005207159671207,
+      "grad_norm": 0.13564659655094147,
+      "learning_rate": 5.239902396873312e-05,
+      "loss": 0.2188,
+      "step": 72300
+    },
+    {
+      "epoch": 2.0065938866142714,
+      "grad_norm": 0.10812518000602722,
+      "learning_rate": 5.226741564102793e-05,
+      "loss": 0.221,
+      "step": 72350
+    },
+    {
+      "epoch": 2.007980613557336,
+      "grad_norm": 0.1458885222673416,
+      "learning_rate": 5.2135914297980257e-05,
+      "loss": 0.222,
+      "step": 72400
+    },
+    {
+      "epoch": 2.0093673405004004,
+      "grad_norm": 0.13904157280921936,
+      "learning_rate": 5.2004520234328556e-05,
+      "loss": 0.2226,
+      "step": 72450
+    },
+    {
+      "epoch": 2.010754067443465,
+      "grad_norm": 0.12214989960193634,
+      "learning_rate": 5.18732337445708e-05,
+      "loss": 0.2179,
+      "step": 72500
+    },
+    {
+      "epoch": 2.0121407943865295,
+      "grad_norm": 0.1347937136888504,
+      "learning_rate": 5.1742055122963804e-05,
+      "loss": 0.2153,
+      "step": 72550
+    },
+    {
+      "epoch": 2.013527521329594,
+      "grad_norm": 0.17244890332221985,
+      "learning_rate": 5.161098466352271e-05,
+      "loss": 0.2201,
+      "step": 72600
+    },
+    {
+      "epoch": 2.014914248272658,
+      "grad_norm": 0.1166064664721489,
+      "learning_rate": 5.14800226600201e-05,
+      "loss": 0.2212,
+      "step": 72650
+    },
+    {
+      "epoch": 2.0163009752157226,
+      "grad_norm": 0.11647620797157288,
+      "learning_rate": 5.134916940598558e-05,
+      "loss": 0.2161,
+      "step": 72700
+    },
+    {
+      "epoch": 2.017687702158787,
+      "grad_norm": 0.133785679936409,
+      "learning_rate": 5.121842519470501e-05,
+      "loss": 0.2214,
+      "step": 72750
+    },
+    {
+      "epoch": 2.0190744291018516,
+      "grad_norm": 0.11569181829690933,
+      "learning_rate": 5.108779031921982e-05,
+      "loss": 0.2181,
+      "step": 72800
+    },
+    {
+      "epoch": 2.020461156044916,
+      "grad_norm": 0.16622135043144226,
+      "learning_rate": 5.095726507232631e-05,
+      "loss": 0.2178,
+      "step": 72850
+    },
+    {
+      "epoch": 2.0218478829879807,
+      "grad_norm": 0.13199181854724884,
+      "learning_rate": 5.082684974657519e-05,
+      "loss": 0.2218,
+      "step": 72900
+    },
+    {
+      "epoch": 2.023234609931045,
+      "grad_norm": 0.12465277314186096,
+      "learning_rate": 5.069654463427077e-05,
+      "loss": 0.2211,
+      "step": 72950
+    },
+    {
+      "epoch": 2.0246213368741093,
+      "grad_norm": 0.11640169471502304,
+      "learning_rate": 5.0566350027470235e-05,
+      "loss": 0.2189,
+      "step": 73000
+    },
+    {
+      "epoch": 2.0246213368741093,
+      "eval_loss": 0.21916010975837708,
+      "eval_runtime": 500.4715,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 73000
+    },
+    {
+      "epoch": 2.026008063817174,
+      "grad_norm": 0.13460282981395721,
+      "learning_rate": 5.0436266217983255e-05,
+      "loss": 0.2175,
+      "step": 73050
+    },
+    {
+      "epoch": 2.0273947907602383,
+      "grad_norm": 0.14442716538906097,
+      "learning_rate": 5.030629349737095e-05,
+      "loss": 0.2217,
+      "step": 73100
+    },
+    {
+      "epoch": 2.028781517703303,
+      "grad_norm": 0.12992683053016663,
+      "learning_rate": 5.017643215694573e-05,
+      "loss": 0.2183,
+      "step": 73150
+    },
+    {
+      "epoch": 2.0301682446463674,
+      "grad_norm": 0.12550853192806244,
+      "learning_rate": 5.00466824877701e-05,
+      "loss": 0.2186,
+      "step": 73200
+    },
+    {
+      "epoch": 2.031554971589432,
+      "grad_norm": 0.153733029961586,
+      "learning_rate": 4.9917044780656474e-05,
+      "loss": 0.2175,
+      "step": 73250
+    },
+    {
+      "epoch": 2.0329416985324964,
+      "grad_norm": 0.13093726336956024,
+      "learning_rate": 4.978751932616615e-05,
+      "loss": 0.2206,
+      "step": 73300
+    },
+    {
+      "epoch": 2.0343284254755605,
+      "grad_norm": 0.13688836991786957,
+      "learning_rate": 4.9658106414608995e-05,
+      "loss": 0.2213,
+      "step": 73350
+    },
+    {
+      "epoch": 2.035715152418625,
+      "grad_norm": 0.1340765506029129,
+      "learning_rate": 4.9528806336042475e-05,
+      "loss": 0.2195,
+      "step": 73400
+    },
+    {
+      "epoch": 2.0371018793616895,
+      "grad_norm": 0.13744907081127167,
+      "learning_rate": 4.9399619380271267e-05,
+      "loss": 0.2196,
+      "step": 73450
+    },
+    {
+      "epoch": 2.038488606304754,
+      "grad_norm": 0.11359039694070816,
+      "learning_rate": 4.927054583684647e-05,
+      "loss": 0.2175,
+      "step": 73500
+    },
+    {
+      "epoch": 2.0398753332478186,
+      "grad_norm": 0.1374887377023697,
+      "learning_rate": 4.914158599506499e-05,
+      "loss": 0.2191,
+      "step": 73550
+    },
+    {
+      "epoch": 2.041262060190883,
+      "grad_norm": 0.1326727718114853,
+      "learning_rate": 4.901274014396892e-05,
+      "loss": 0.2174,
+      "step": 73600
+    },
+    {
+      "epoch": 2.0426487871339476,
+      "grad_norm": 0.1448783129453659,
+      "learning_rate": 4.8884008572344753e-05,
+      "loss": 0.218,
+      "step": 73650
+    },
+    {
+      "epoch": 2.0440355140770117,
+      "grad_norm": 0.15006962418556213,
+      "learning_rate": 4.8755391568723e-05,
+      "loss": 0.2153,
+      "step": 73700
+    },
+    {
+      "epoch": 2.045422241020076,
+      "grad_norm": 0.11637504398822784,
+      "learning_rate": 4.862688942137723e-05,
+      "loss": 0.2209,
+      "step": 73750
+    },
+    {
+      "epoch": 2.0468089679631407,
+      "grad_norm": 0.15461041033267975,
+      "learning_rate": 4.849850241832373e-05,
+      "loss": 0.2218,
+      "step": 73800
+    },
+    {
+      "epoch": 2.0481956949062052,
+      "grad_norm": 0.11848367750644684,
+      "learning_rate": 4.837023084732056e-05,
+      "loss": 0.2199,
+      "step": 73850
+    },
+    {
+      "epoch": 2.0495824218492698,
+      "grad_norm": 0.1128213182091713,
+      "learning_rate": 4.824207499586719e-05,
+      "loss": 0.2235,
+      "step": 73900
+    },
+    {
+      "epoch": 2.0509691487923343,
+      "grad_norm": 0.13028693199157715,
+      "learning_rate": 4.811403515120364e-05,
+      "loss": 0.2189,
+      "step": 73950
+    },
+    {
+      "epoch": 2.052355875735399,
+      "grad_norm": 0.1149388998746872,
+      "learning_rate": 4.798611160031001e-05,
+      "loss": 0.222,
+      "step": 74000
+    },
+    {
+      "epoch": 2.052355875735399,
+      "eval_loss": 0.21903079748153687,
+      "eval_runtime": 500.4713,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 74000
+    },
+    {
+      "epoch": 2.0537426026784633,
+      "grad_norm": 0.11896523833274841,
+      "learning_rate": 4.7858304629905606e-05,
+      "loss": 0.2157,
+      "step": 74050
+    },
+    {
+      "epoch": 2.0551293296215274,
+      "grad_norm": 0.12097673863172531,
+      "learning_rate": 4.7730614526448546e-05,
+      "loss": 0.2204,
+      "step": 74100
+    },
+    {
+      "epoch": 2.056516056564592,
+      "grad_norm": 0.1441742181777954,
+      "learning_rate": 4.760304157613503e-05,
+      "loss": 0.2195,
+      "step": 74150
+    },
+    {
+      "epoch": 2.0579027835076564,
+      "grad_norm": 0.12328975647687912,
+      "learning_rate": 4.7475586064898545e-05,
+      "loss": 0.2183,
+      "step": 74200
+    },
+    {
+      "epoch": 2.059289510450721,
+      "grad_norm": 0.11700959503650665,
+      "learning_rate": 4.734824827840954e-05,
+      "loss": 0.2219,
+      "step": 74250
+    },
+    {
+      "epoch": 2.0606762373937855,
+      "grad_norm": 0.11523609608411789,
+      "learning_rate": 4.722102850207437e-05,
+      "loss": 0.2197,
+      "step": 74300
+    },
+    {
+      "epoch": 2.06206296433685,
+      "grad_norm": 0.10766173154115677,
+      "learning_rate": 4.70939270210352e-05,
+      "loss": 0.2222,
+      "step": 74350
+    },
+    {
+      "epoch": 2.0634496912799145,
+      "grad_norm": 0.1485738754272461,
+      "learning_rate": 4.6966944120168754e-05,
+      "loss": 0.2171,
+      "step": 74400
+    },
+    {
+      "epoch": 2.0648364182229786,
+      "grad_norm": 0.12434457242488861,
+      "learning_rate": 4.684008008408619e-05,
+      "loss": 0.2194,
+      "step": 74450
+    },
+    {
+      "epoch": 2.066223145166043,
+      "grad_norm": 0.11212220788002014,
+      "learning_rate": 4.671333519713209e-05,
+      "loss": 0.2244,
+      "step": 74500
+    },
+    {
+      "epoch": 2.0676098721091076,
+      "grad_norm": 0.1226951852440834,
+      "learning_rate": 4.658670974338409e-05,
+      "loss": 0.2176,
+      "step": 74550
+    },
+    {
+      "epoch": 2.068996599052172,
+      "grad_norm": 0.1348971724510193,
+      "learning_rate": 4.6460204006652174e-05,
+      "loss": 0.2161,
+      "step": 74600
+    },
+    {
+      "epoch": 2.0703833259952367,
+      "grad_norm": 0.11871132254600525,
+      "learning_rate": 4.633381827047782e-05,
+      "loss": 0.2216,
+      "step": 74650
+    },
+    {
+      "epoch": 2.071770052938301,
+      "grad_norm": 0.13671475648880005,
+      "learning_rate": 4.620755281813376e-05,
+      "loss": 0.2143,
+      "step": 74700
+    },
+    {
+      "epoch": 2.0731567798813657,
+      "grad_norm": 0.11762852221727371,
+      "learning_rate": 4.60814079326229e-05,
+      "loss": 0.2194,
+      "step": 74750
+    },
+    {
+      "epoch": 2.07454350682443,
+      "grad_norm": 0.14862984418869019,
+      "learning_rate": 4.59553838966782e-05,
+      "loss": 0.2223,
+      "step": 74800
+    },
+    {
+      "epoch": 2.0759302337674943,
+      "grad_norm": 0.16189099848270416,
+      "learning_rate": 4.58294809927615e-05,
+      "loss": 0.2203,
+      "step": 74850
+    },
+    {
+      "epoch": 2.077316960710559,
+      "grad_norm": 0.15650849044322968,
+      "learning_rate": 4.5703699503063294e-05,
+      "loss": 0.2204,
+      "step": 74900
+    },
+    {
+      "epoch": 2.0787036876536233,
+      "grad_norm": 0.1271338164806366,
+      "learning_rate": 4.557803970950182e-05,
+      "loss": 0.2179,
+      "step": 74950
+    },
+    {
+      "epoch": 2.080090414596688,
+      "grad_norm": 0.14354722201824188,
+      "learning_rate": 4.545250189372268e-05,
+      "loss": 0.2166,
+      "step": 75000
+    },
+    {
+      "epoch": 2.080090414596688,
+      "eval_loss": 0.2189180701971054,
+      "eval_runtime": 500.4567,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 75000
+    },
+    {
+      "epoch": 2.0814771415397524,
+      "grad_norm": 0.15187138319015503,
+      "learning_rate": 4.5327086337098056e-05,
+      "loss": 0.2223,
+      "step": 75050
+    },
+    {
+      "epoch": 2.082863868482817,
+      "grad_norm": 0.13435395061969757,
+      "learning_rate": 4.5201793320726016e-05,
+      "loss": 0.2206,
+      "step": 75100
+    },
+    {
+      "epoch": 2.084250595425881,
+      "grad_norm": 0.13874699175357819,
+      "learning_rate": 4.507662312543007e-05,
+      "loss": 0.2208,
+      "step": 75150
+    },
+    {
+      "epoch": 2.0856373223689455,
+      "grad_norm": 0.1275242120027542,
+      "learning_rate": 4.495157603175842e-05,
+      "loss": 0.2204,
+      "step": 75200
+    },
+    {
+      "epoch": 2.08702404931201,
+      "grad_norm": 0.17980913817882538,
+      "learning_rate": 4.482665231998338e-05,
+      "loss": 0.2254,
+      "step": 75250
+    },
+    {
+      "epoch": 2.0884107762550745,
+      "grad_norm": 0.1556035280227661,
+      "learning_rate": 4.470185227010064e-05,
+      "loss": 0.2167,
+      "step": 75300
+    },
+    {
+      "epoch": 2.089797503198139,
+      "grad_norm": 0.12536190450191498,
+      "learning_rate": 4.4577176161828835e-05,
+      "loss": 0.2204,
+      "step": 75350
+    },
+    {
+      "epoch": 2.0911842301412036,
+      "grad_norm": 0.16269785165786743,
+      "learning_rate": 4.445262427460868e-05,
+      "loss": 0.218,
+      "step": 75400
+    },
+    {
+      "epoch": 2.092570957084268,
+      "grad_norm": 0.16465267539024353,
+      "learning_rate": 4.4328196887602616e-05,
+      "loss": 0.2201,
+      "step": 75450
+    },
+    {
+      "epoch": 2.093957684027332,
+      "grad_norm": 0.12975232303142548,
+      "learning_rate": 4.420389427969386e-05,
+      "loss": 0.22,
+      "step": 75500
+    },
+    {
+      "epoch": 2.0953444109703967,
+      "grad_norm": 0.11653002351522446,
+      "learning_rate": 4.407971672948612e-05,
+      "loss": 0.2144,
+      "step": 75550
+    },
+    {
+      "epoch": 2.096731137913461,
+      "grad_norm": 0.1448681503534317,
+      "learning_rate": 4.3955664515302744e-05,
+      "loss": 0.2237,
+      "step": 75600
+    },
+    {
+      "epoch": 2.0981178648565257,
+      "grad_norm": 0.1351730227470398,
+      "learning_rate": 4.3831737915186144e-05,
+      "loss": 0.2204,
+      "step": 75650
+    },
+    {
+      "epoch": 2.0995045917995903,
+      "grad_norm": 0.12176606059074402,
+      "learning_rate": 4.370793720689724e-05,
+      "loss": 0.2219,
+      "step": 75700
+    },
+    {
+      "epoch": 2.100891318742655,
+      "grad_norm": 0.152371346950531,
+      "learning_rate": 4.3584262667914696e-05,
+      "loss": 0.2211,
+      "step": 75750
+    },
+    {
+      "epoch": 2.1022780456857193,
+      "grad_norm": 0.11460871249437332,
+      "learning_rate": 4.3460714575434517e-05,
+      "loss": 0.2199,
+      "step": 75800
+    },
+    {
+      "epoch": 2.1036647726287834,
+      "grad_norm": 0.12700140476226807,
+      "learning_rate": 4.3337293206369125e-05,
+      "loss": 0.2194,
+      "step": 75850
+    },
+    {
+      "epoch": 2.105051499571848,
+      "grad_norm": 0.15812121331691742,
+      "learning_rate": 4.3213998837347116e-05,
+      "loss": 0.2196,
+      "step": 75900
+    },
+    {
+      "epoch": 2.1064382265149124,
+      "grad_norm": 0.13531926274299622,
+      "learning_rate": 4.309083174471221e-05,
+      "loss": 0.2209,
+      "step": 75950
+    },
+    {
+      "epoch": 2.107824953457977,
+      "grad_norm": 0.13829369843006134,
+      "learning_rate": 4.2967792204523136e-05,
+      "loss": 0.2183,
+      "step": 76000
+    },
+    {
+      "epoch": 2.107824953457977,
+      "eval_loss": 0.2188321202993393,
+      "eval_runtime": 500.5809,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 76000
+    },
+    {
+      "epoch": 2.1092116804010415,
+      "grad_norm": 0.13063254952430725,
+      "learning_rate": 4.284488049255246e-05,
+      "loss": 0.2162,
+      "step": 76050
+    },
+    {
+      "epoch": 2.110598407344106,
+      "grad_norm": 0.13592955470085144,
+      "learning_rate": 4.272209688428646e-05,
+      "loss": 0.2186,
+      "step": 76100
+    },
+    {
+      "epoch": 2.1119851342871705,
+      "grad_norm": 0.1165243461728096,
+      "learning_rate": 4.25994416549241e-05,
+      "loss": 0.2158,
+      "step": 76150
+    },
+    {
+      "epoch": 2.113371861230235,
+      "grad_norm": 0.14689646661281586,
+      "learning_rate": 4.247691507937673e-05,
+      "loss": 0.2211,
+      "step": 76200
+    },
+    {
+      "epoch": 2.114758588173299,
+      "grad_norm": 0.11267469078302383,
+      "learning_rate": 4.235451743226737e-05,
+      "loss": 0.2178,
+      "step": 76250
+    },
+    {
+      "epoch": 2.1161453151163636,
+      "grad_norm": 0.15382416546344757,
+      "learning_rate": 4.2232248987929936e-05,
+      "loss": 0.2197,
+      "step": 76300
+    },
+    {
+      "epoch": 2.117532042059428,
+      "grad_norm": 0.15660510957241058,
+      "learning_rate": 4.2110110020408855e-05,
+      "loss": 0.2201,
+      "step": 76350
+    },
+    {
+      "epoch": 2.1189187690024927,
+      "grad_norm": 0.13238658010959625,
+      "learning_rate": 4.198810080345834e-05,
+      "loss": 0.2208,
+      "step": 76400
+    },
+    {
+      "epoch": 2.120305495945557,
+      "grad_norm": 0.11956652998924255,
+      "learning_rate": 4.186622161054181e-05,
+      "loss": 0.2193,
+      "step": 76450
+    },
+    {
+      "epoch": 2.1216922228886217,
+      "grad_norm": 0.12554652988910675,
+      "learning_rate": 4.174690641406727e-05,
+      "loss": 0.219,
+      "step": 76500
+    },
+    {
+      "epoch": 2.123078949831686,
+      "grad_norm": 0.13725616037845612,
+      "learning_rate": 4.162528547436844e-05,
+      "loss": 0.2175,
+      "step": 76550
+    },
+    {
+      "epoch": 2.1244656767747503,
+      "grad_norm": 0.12407530844211578,
+      "learning_rate": 4.1503795371893814e-05,
+      "loss": 0.2188,
+      "step": 76600
+    },
+    {
+      "epoch": 2.125852403717815,
+      "grad_norm": 0.1060599759221077,
+      "learning_rate": 4.1382436378943334e-05,
+      "loss": 0.219,
+      "step": 76650
+    },
+    {
+      "epoch": 2.1272391306608793,
+      "grad_norm": 0.12038925290107727,
+      "learning_rate": 4.126120876752295e-05,
+      "loss": 0.2237,
+      "step": 76700
+    },
+    {
+      "epoch": 2.128625857603944,
+      "grad_norm": 0.12510186433792114,
+      "learning_rate": 4.114011280934425e-05,
+      "loss": 0.2172,
+      "step": 76750
+    },
+    {
+      "epoch": 2.1300125845470084,
+      "grad_norm": 0.12675727903842926,
+      "learning_rate": 4.102156676187841e-05,
+      "loss": 0.2233,
+      "step": 76800
+    },
+    {
+      "epoch": 2.131399311490073,
+      "grad_norm": 0.14749465882778168,
+      "learning_rate": 4.090073227756616e-05,
+      "loss": 0.221,
+      "step": 76850
+    },
+    {
+      "epoch": 2.1327860384331374,
+      "grad_norm": 0.1391042023897171,
+      "learning_rate": 4.07800302544438e-05,
+      "loss": 0.221,
+      "step": 76900
+    },
+    {
+      "epoch": 2.1341727653762015,
+      "grad_norm": 0.10255371034145355,
+      "learning_rate": 4.0659460963044785e-05,
+      "loss": 0.2123,
+      "step": 76950
+    },
+    {
+      "epoch": 2.135559492319266,
+      "grad_norm": 0.11783240735530853,
+      "learning_rate": 4.0539024673605206e-05,
+      "loss": 0.219,
+      "step": 77000
+    },
+    {
+      "epoch": 2.135559492319266,
+      "eval_loss": 0.21858830749988556,
+      "eval_runtime": 500.5181,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 77000
+    },
+    {
+      "epoch": 2.1369462192623305,
+      "grad_norm": 0.1376359611749649,
+      "learning_rate": 4.041872165606292e-05,
+      "loss": 0.2194,
+      "step": 77050
+    },
+    {
+      "epoch": 2.138332946205395,
+      "grad_norm": 0.12311622500419617,
+      "learning_rate": 4.029855218005727e-05,
+      "loss": 0.2212,
+      "step": 77100
+    },
+    {
+      "epoch": 2.1397196731484596,
+      "grad_norm": 0.11871961504220963,
+      "learning_rate": 4.017851651492808e-05,
+      "loss": 0.2196,
+      "step": 77150
+    },
+    {
+      "epoch": 2.141106400091524,
+      "grad_norm": 0.11650574952363968,
+      "learning_rate": 4.005861492971541e-05,
+      "loss": 0.2181,
+      "step": 77200
+    },
+    {
+      "epoch": 2.1424931270345886,
+      "grad_norm": 0.13727551698684692,
+      "learning_rate": 3.9938847693158685e-05,
+      "loss": 0.2218,
+      "step": 77250
+    },
+    {
+      "epoch": 2.143879853977653,
+      "grad_norm": 0.11267198622226715,
+      "learning_rate": 3.981921507369629e-05,
+      "loss": 0.2196,
+      "step": 77300
+    },
+    {
+      "epoch": 2.145266580920717,
+      "grad_norm": 0.1510041058063507,
+      "learning_rate": 3.9699717339464915e-05,
+      "loss": 0.2168,
+      "step": 77350
+    },
+    {
+      "epoch": 2.1466533078637817,
+      "grad_norm": 0.12016372382640839,
+      "learning_rate": 3.95803547582988e-05,
+      "loss": 0.2202,
+      "step": 77400
+    },
+    {
+      "epoch": 2.1480400348068462,
+      "grad_norm": 0.13027046620845795,
+      "learning_rate": 3.9461127597729366e-05,
+      "loss": 0.2192,
+      "step": 77450
+    },
+    {
+      "epoch": 2.1494267617499108,
+      "grad_norm": 0.10763117671012878,
+      "learning_rate": 3.934203612498449e-05,
+      "loss": 0.2174,
+      "step": 77500
+    },
+    {
+      "epoch": 2.1508134886929753,
+      "grad_norm": 0.12586219608783722,
+      "learning_rate": 3.922308060698797e-05,
+      "loss": 0.2196,
+      "step": 77550
+    },
+    {
+      "epoch": 2.15220021563604,
+      "grad_norm": 0.13267678022384644,
+      "learning_rate": 3.910426131035876e-05,
+      "loss": 0.2164,
+      "step": 77600
+    },
+    {
+      "epoch": 2.1535869425791043,
+      "grad_norm": 0.14239874482154846,
+      "learning_rate": 3.8985578501410635e-05,
+      "loss": 0.2182,
+      "step": 77650
+    },
+    {
+      "epoch": 2.1549736695221684,
+      "grad_norm": 0.11145935207605362,
+      "learning_rate": 3.886703244615132e-05,
+      "loss": 0.2174,
+      "step": 77700
+    },
+    {
+      "epoch": 2.156360396465233,
+      "grad_norm": 0.14922165870666504,
+      "learning_rate": 3.874862341028216e-05,
+      "loss": 0.2225,
+      "step": 77750
+    },
+    {
+      "epoch": 2.1577471234082974,
+      "grad_norm": 0.16182860732078552,
+      "learning_rate": 3.863035165919735e-05,
+      "loss": 0.216,
+      "step": 77800
+    },
+    {
+      "epoch": 2.159133850351362,
+      "grad_norm": 0.11681864410638809,
+      "learning_rate": 3.85122174579833e-05,
+      "loss": 0.2222,
+      "step": 77850
+    },
+    {
+      "epoch": 2.1605205772944265,
+      "grad_norm": 0.11636471748352051,
+      "learning_rate": 3.839422107141826e-05,
+      "loss": 0.2156,
+      "step": 77900
+    },
+    {
+      "epoch": 2.161907304237491,
+      "grad_norm": 0.12071753293275833,
+      "learning_rate": 3.827636276397149e-05,
+      "loss": 0.2176,
+      "step": 77950
+    },
+    {
+      "epoch": 2.1632940311805555,
+      "grad_norm": 0.14000236988067627,
+      "learning_rate": 3.815864279980284e-05,
+      "loss": 0.2192,
+      "step": 78000
+    },
+    {
+      "epoch": 2.1632940311805555,
+      "eval_loss": 0.21850600838661194,
+      "eval_runtime": 500.629,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 78000
+    },
+    {
+      "epoch": 2.1646807581236196,
+      "grad_norm": 0.12112339586019516,
+      "learning_rate": 3.8041061442762015e-05,
+      "loss": 0.2194,
+      "step": 78050
+    },
+    {
+      "epoch": 2.166067485066684,
+      "grad_norm": 0.1251995861530304,
+      "learning_rate": 3.792361895638814e-05,
+      "loss": 0.2201,
+      "step": 78100
+    },
+    {
+      "epoch": 2.1674542120097486,
+      "grad_norm": 0.11946742981672287,
+      "learning_rate": 3.780631560390897e-05,
+      "loss": 0.2214,
+      "step": 78150
+    },
+    {
+      "epoch": 2.168840938952813,
+      "grad_norm": 0.11297029256820679,
+      "learning_rate": 3.768915164824055e-05,
+      "loss": 0.2169,
+      "step": 78200
+    },
+    {
+      "epoch": 2.1702276658958777,
+      "grad_norm": 0.13184936344623566,
+      "learning_rate": 3.7572127351986316e-05,
+      "loss": 0.2201,
+      "step": 78250
+    },
+    {
+      "epoch": 2.171614392838942,
+      "grad_norm": 0.14334504306316376,
+      "learning_rate": 3.7455242977436924e-05,
+      "loss": 0.2201,
+      "step": 78300
+    },
+    {
+      "epoch": 2.1730011197820067,
+      "grad_norm": 0.1557203084230423,
+      "learning_rate": 3.733849878656918e-05,
+      "loss": 0.2134,
+      "step": 78350
+    },
+    {
+      "epoch": 2.174387846725071,
+      "grad_norm": 0.16170355677604675,
+      "learning_rate": 3.722189504104583e-05,
+      "loss": 0.2184,
+      "step": 78400
+    },
+    {
+      "epoch": 2.1757745736681353,
+      "grad_norm": 0.15113234519958496,
+      "learning_rate": 3.7105432002214815e-05,
+      "loss": 0.2219,
+      "step": 78450
+    },
+    {
+      "epoch": 2.1771613006112,
+      "grad_norm": 0.14761875569820404,
+      "learning_rate": 3.698910993110864e-05,
+      "loss": 0.2217,
+      "step": 78500
+    },
+    {
+      "epoch": 2.1785480275542644,
+      "grad_norm": 0.1473364681005478,
+      "learning_rate": 3.6872929088443945e-05,
+      "loss": 0.2228,
+      "step": 78550
+    },
+    {
+      "epoch": 2.179934754497329,
+      "grad_norm": 0.11361527442932129,
+      "learning_rate": 3.6756889734620735e-05,
+      "loss": 0.2166,
+      "step": 78600
+    },
+    {
+      "epoch": 2.1813214814403934,
+      "grad_norm": 0.12869326770305634,
+      "learning_rate": 3.664099212972202e-05,
+      "loss": 0.221,
+      "step": 78650
+    },
+    {
+      "epoch": 2.182708208383458,
+      "grad_norm": 0.12593664228916168,
+      "learning_rate": 3.6525236533512896e-05,
+      "loss": 0.2193,
+      "step": 78700
+    },
+    {
+      "epoch": 2.184094935326522,
+      "grad_norm": 0.13714058697223663,
+      "learning_rate": 3.640962320544047e-05,
+      "loss": 0.22,
+      "step": 78750
+    },
+    {
+      "epoch": 2.1854816622695865,
+      "grad_norm": 0.12551981210708618,
+      "learning_rate": 3.6294152404632685e-05,
+      "loss": 0.2217,
+      "step": 78800
+    },
+    {
+      "epoch": 2.186868389212651,
+      "grad_norm": 0.139542818069458,
+      "learning_rate": 3.617882438989822e-05,
+      "loss": 0.2167,
+      "step": 78850
+    },
+    {
+      "epoch": 2.1882551161557156,
+      "grad_norm": 0.15694187581539154,
+      "learning_rate": 3.606363941972561e-05,
+      "loss": 0.2223,
+      "step": 78900
+    },
+    {
+      "epoch": 2.18964184309878,
+      "grad_norm": 0.12061590701341629,
+      "learning_rate": 3.5948597752282854e-05,
+      "loss": 0.2199,
+      "step": 78950
+    },
+    {
+      "epoch": 2.1910285700418446,
+      "grad_norm": 0.1186976209282875,
+      "learning_rate": 3.583369964541677e-05,
+      "loss": 0.221,
+      "step": 79000
+    },
+    {
+      "epoch": 2.1910285700418446,
+      "eval_loss": 0.21846945583820343,
+      "eval_runtime": 500.8819,
+      "eval_samples_per_second": 5.704,
+      "eval_steps_per_second": 5.704,
+      "step": 79000
+    },
+    {
+      "epoch": 2.192415296984909,
+      "grad_norm": 0.14024211466312408,
+      "learning_rate": 3.5718945356652314e-05,
+      "loss": 0.2195,
+      "step": 79050
+    },
+    {
+      "epoch": 2.193802023927973,
+      "grad_norm": 0.14544863998889923,
+      "learning_rate": 3.560433514319217e-05,
+      "loss": 0.2194,
+      "step": 79100
+    },
+    {
+      "epoch": 2.1951887508710377,
+      "grad_norm": 0.14944560825824738,
+      "learning_rate": 3.548986926191612e-05,
+      "loss": 0.2171,
+      "step": 79150
+    },
+    {
+      "epoch": 2.1965754778141022,
+      "grad_norm": 0.1533380150794983,
+      "learning_rate": 3.537554796938044e-05,
+      "loss": 0.2174,
+      "step": 79200
+    },
+    {
+      "epoch": 2.1979622047571667,
+      "grad_norm": 0.1631087213754654,
+      "learning_rate": 3.526365362963201e-05,
+      "loss": 0.2197,
+      "step": 79250
+    },
+    {
+      "epoch": 2.1993489317002313,
+      "grad_norm": 0.11946437507867813,
+      "learning_rate": 3.514961937842551e-05,
+      "loss": 0.2172,
+      "step": 79300
+    },
+    {
+      "epoch": 2.200735658643296,
+      "grad_norm": 0.11075513809919357,
+      "learning_rate": 3.5035730478572906e-05,
+      "loss": 0.2198,
+      "step": 79350
+    },
+    {
+      "epoch": 2.2021223855863603,
+      "grad_norm": 0.151596337556839,
+      "learning_rate": 3.49219871853373e-05,
+      "loss": 0.2214,
+      "step": 79400
+    },
+    {
+      "epoch": 2.2035091125294244,
+      "grad_norm": 0.15046928822994232,
+      "learning_rate": 3.4808389753655324e-05,
+      "loss": 0.2189,
+      "step": 79450
+    },
+    {
+      "epoch": 2.204895839472489,
+      "grad_norm": 0.1315099596977234,
+      "learning_rate": 3.469493843813677e-05,
+      "loss": 0.2205,
+      "step": 79500
+    },
+    {
+      "epoch": 2.2062825664155534,
+      "grad_norm": 0.15399102866649628,
+      "learning_rate": 3.458163349306397e-05,
+      "loss": 0.2191,
+      "step": 79550
+    },
+    {
+      "epoch": 2.207669293358618,
+      "grad_norm": 0.1285402774810791,
+      "learning_rate": 3.4468475172391054e-05,
+      "loss": 0.2211,
+      "step": 79600
+    },
+    {
+      "epoch": 2.2090560203016825,
+      "grad_norm": 0.13388928771018982,
+      "learning_rate": 3.435546372974363e-05,
+      "loss": 0.2197,
+      "step": 79650
+    },
+    {
+      "epoch": 2.210442747244747,
+      "grad_norm": 0.13818307220935822,
+      "learning_rate": 3.424259941841807e-05,
+      "loss": 0.2174,
+      "step": 79700
+    },
+    {
+      "epoch": 2.2118294741878115,
+      "grad_norm": 0.11872979253530502,
+      "learning_rate": 3.4129882491381015e-05,
+      "loss": 0.2184,
+      "step": 79750
+    },
+    {
+      "epoch": 2.213216201130876,
+      "grad_norm": 0.156584233045578,
+      "learning_rate": 3.4017313201268655e-05,
+      "loss": 0.222,
+      "step": 79800
+    },
+    {
+      "epoch": 2.21460292807394,
+      "grad_norm": 0.13037458062171936,
+      "learning_rate": 3.3904891800386426e-05,
+      "loss": 0.2176,
+      "step": 79850
+    },
+    {
+      "epoch": 2.2159896550170046,
+      "grad_norm": 0.13034582138061523,
+      "learning_rate": 3.379261854070815e-05,
+      "loss": 0.2204,
+      "step": 79900
+    },
+    {
+      "epoch": 2.217376381960069,
+      "grad_norm": 0.14262603223323822,
+      "learning_rate": 3.3680493673875735e-05,
+      "loss": 0.2204,
+      "step": 79950
+    },
+    {
+      "epoch": 2.2187631089031337,
+      "grad_norm": 0.12774142622947693,
+      "learning_rate": 3.3568517451198454e-05,
+      "loss": 0.2186,
+      "step": 80000
+    },
+    {
+      "epoch": 2.2187631089031337,
+      "eval_loss": 0.21821601688861847,
+      "eval_runtime": 500.5342,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 80000
+    },
+    {
+      "epoch": 2.220149835846198,
+      "grad_norm": 0.12721596658229828,
+      "learning_rate": 3.345669012365234e-05,
+      "loss": 0.2158,
+      "step": 80050
+    },
+    {
+      "epoch": 2.2215365627892627,
+      "grad_norm": 0.11402986943721771,
+      "learning_rate": 3.334501194187981e-05,
+      "loss": 0.2203,
+      "step": 80100
+    },
+    {
+      "epoch": 2.2229232897323272,
+      "grad_norm": 0.15116162598133087,
+      "learning_rate": 3.323348315618896e-05,
+      "loss": 0.2191,
+      "step": 80150
+    },
+    {
+      "epoch": 2.2243100166753913,
+      "grad_norm": 0.1261793076992035,
+      "learning_rate": 3.312210401655306e-05,
+      "loss": 0.2175,
+      "step": 80200
+    },
+    {
+      "epoch": 2.225696743618456,
+      "grad_norm": 0.14110060036182404,
+      "learning_rate": 3.301087477260987e-05,
+      "loss": 0.2175,
+      "step": 80250
+    },
+    {
+      "epoch": 2.2270834705615203,
+      "grad_norm": 0.11654798686504364,
+      "learning_rate": 3.2899795673661335e-05,
+      "loss": 0.2186,
+      "step": 80300
+    },
+    {
+      "epoch": 2.228470197504585,
+      "grad_norm": 0.11238402873277664,
+      "learning_rate": 3.278886696867275e-05,
+      "loss": 0.2174,
+      "step": 80350
+    },
+    {
+      "epoch": 2.2298569244476494,
+      "grad_norm": 0.14211712777614594,
+      "learning_rate": 3.267808890627239e-05,
+      "loss": 0.2206,
+      "step": 80400
+    },
+    {
+      "epoch": 2.231243651390714,
+      "grad_norm": 0.12510523200035095,
+      "learning_rate": 3.256746173475088e-05,
+      "loss": 0.2158,
+      "step": 80450
+    },
+    {
+      "epoch": 2.2326303783337784,
+      "grad_norm": 0.11722100526094437,
+      "learning_rate": 3.2456985702060694e-05,
+      "loss": 0.2249,
+      "step": 80500
+    },
+    {
+      "epoch": 2.2340171052768425,
+      "grad_norm": 0.15641877055168152,
+      "learning_rate": 3.234666105581542e-05,
+      "loss": 0.2159,
+      "step": 80550
+    },
+    {
+      "epoch": 2.235403832219907,
+      "grad_norm": 0.15180495381355286,
+      "learning_rate": 3.223648804328946e-05,
+      "loss": 0.2197,
+      "step": 80600
+    },
+    {
+      "epoch": 2.2367905591629715,
+      "grad_norm": 0.11919309943914413,
+      "learning_rate": 3.212646691141736e-05,
+      "loss": 0.2214,
+      "step": 80650
+    },
+    {
+      "epoch": 2.238177286106036,
+      "grad_norm": 0.12934933602809906,
+      "learning_rate": 3.2016597906793134e-05,
+      "loss": 0.2177,
+      "step": 80700
+    },
+    {
+      "epoch": 2.2395640130491006,
+      "grad_norm": 0.1465185284614563,
+      "learning_rate": 3.1906881275669975e-05,
+      "loss": 0.2174,
+      "step": 80750
+    },
+    {
+      "epoch": 2.240950739992165,
+      "grad_norm": 0.14115314185619354,
+      "learning_rate": 3.1797317263959415e-05,
+      "loss": 0.2173,
+      "step": 80800
+    },
+    {
+      "epoch": 2.2423374669352296,
+      "grad_norm": 0.12480226904153824,
+      "learning_rate": 3.1687906117231e-05,
+      "loss": 0.218,
+      "step": 80850
+    },
+    {
+      "epoch": 2.243724193878294,
+      "grad_norm": 0.1279391646385193,
+      "learning_rate": 3.157864808071167e-05,
+      "loss": 0.2171,
+      "step": 80900
+    },
+    {
+      "epoch": 2.245110920821358,
+      "grad_norm": 0.16663573682308197,
+      "learning_rate": 3.146954339928516e-05,
+      "loss": 0.218,
+      "step": 80950
+    },
+    {
+      "epoch": 2.2464976477644227,
+      "grad_norm": 0.14335867762565613,
+      "learning_rate": 3.136059231749145e-05,
+      "loss": 0.2205,
+      "step": 81000
+    },
+    {
+      "epoch": 2.2464976477644227,
+      "eval_loss": 0.21820572018623352,
+      "eval_runtime": 500.4609,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 81000
+    },
+    {
+      "epoch": 2.2478843747074873,
+      "grad_norm": 0.11844130605459213,
+      "learning_rate": 3.12517950795263e-05,
+      "loss": 0.2227,
+      "step": 81050
+    },
+    {
+      "epoch": 2.2492711016505518,
+      "grad_norm": 0.13108013570308685,
+      "learning_rate": 3.1143151929240696e-05,
+      "loss": 0.2228,
+      "step": 81100
+    },
+    {
+      "epoch": 2.2506578285936163,
+      "grad_norm": 0.1188526302576065,
+      "learning_rate": 3.103466311014013e-05,
+      "loss": 0.2151,
+      "step": 81150
+    },
+    {
+      "epoch": 2.252044555536681,
+      "grad_norm": 0.15748703479766846,
+      "learning_rate": 3.092632886538432e-05,
+      "loss": 0.2161,
+      "step": 81200
+    },
+    {
+      "epoch": 2.2534312824797453,
+      "grad_norm": 0.1411810666322708,
+      "learning_rate": 3.0818149437786414e-05,
+      "loss": 0.2204,
+      "step": 81250
+    },
+    {
+      "epoch": 2.2548180094228094,
+      "grad_norm": 0.16991600394248962,
+      "learning_rate": 3.0710125069812724e-05,
+      "loss": 0.2221,
+      "step": 81300
+    },
+    {
+      "epoch": 2.256204736365874,
+      "grad_norm": 0.14075744152069092,
+      "learning_rate": 3.060225600358184e-05,
+      "loss": 0.2181,
+      "step": 81350
+    },
+    {
+      "epoch": 2.2575914633089385,
+      "grad_norm": 0.15437312424182892,
+      "learning_rate": 3.0494542480864418e-05,
+      "loss": 0.2196,
+      "step": 81400
+    },
+    {
+      "epoch": 2.258978190252003,
+      "grad_norm": 0.1334267407655716,
+      "learning_rate": 3.038698474308236e-05,
+      "loss": 0.2194,
+      "step": 81450
+    },
+    {
+      "epoch": 2.2603649171950675,
+      "grad_norm": 0.11487523466348648,
+      "learning_rate": 3.0279583031308524e-05,
+      "loss": 0.2199,
+      "step": 81500
+    },
+    {
+      "epoch": 2.261751644138132,
+      "grad_norm": 0.1350301206111908,
+      "learning_rate": 3.017233758626593e-05,
+      "loss": 0.2249,
+      "step": 81550
+    },
+    {
+      "epoch": 2.2631383710811965,
+      "grad_norm": 0.13608673214912415,
+      "learning_rate": 3.006524864832748e-05,
+      "loss": 0.2176,
+      "step": 81600
+    },
+    {
+      "epoch": 2.2645250980242606,
+      "grad_norm": 0.11125027388334274,
+      "learning_rate": 2.9958316457515222e-05,
+      "loss": 0.2203,
+      "step": 81650
+    },
+    {
+      "epoch": 2.265911824967325,
+      "grad_norm": 0.11785724014043808,
+      "learning_rate": 2.9851541253499894e-05,
+      "loss": 0.2174,
+      "step": 81700
+    },
+    {
+      "epoch": 2.2672985519103896,
+      "grad_norm": 0.14160412549972534,
+      "learning_rate": 2.974492327560042e-05,
+      "loss": 0.2173,
+      "step": 81750
+    },
+    {
+      "epoch": 2.268685278853454,
+      "grad_norm": 0.10561185330152512,
+      "learning_rate": 2.9638462762783215e-05,
+      "loss": 0.2209,
+      "step": 81800
+    },
+    {
+      "epoch": 2.2700720057965187,
+      "grad_norm": 0.12722249329090118,
+      "learning_rate": 2.9532159953661886e-05,
+      "loss": 0.2187,
+      "step": 81850
+    },
+    {
+      "epoch": 2.271458732739583,
+      "grad_norm": 0.12994156777858734,
+      "learning_rate": 2.9426015086496474e-05,
+      "loss": 0.218,
+      "step": 81900
+    },
+    {
+      "epoch": 2.2728454596826477,
+      "grad_norm": 0.13776080310344696,
+      "learning_rate": 2.93200283991931e-05,
+      "loss": 0.2197,
+      "step": 81950
+    },
+    {
+      "epoch": 2.274232186625712,
+      "grad_norm": 0.14488635957241058,
+      "learning_rate": 2.921420012930328e-05,
+      "loss": 0.2177,
+      "step": 82000
+    },
+    {
+      "epoch": 2.274232186625712,
+      "eval_loss": 0.2180478274822235,
+      "eval_runtime": 500.2847,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 82000
+    },
+    {
+      "epoch": 2.2756189135687763,
+      "grad_norm": 0.15002746880054474,
+      "learning_rate": 2.9108530514023512e-05,
+      "loss": 0.219,
+      "step": 82050
+    },
+    {
+      "epoch": 2.277005640511841,
+      "grad_norm": 0.14248649775981903,
+      "learning_rate": 2.9003019790194684e-05,
+      "loss": 0.2169,
+      "step": 82100
+    },
+    {
+      "epoch": 2.2783923674549054,
+      "grad_norm": 0.11851143091917038,
+      "learning_rate": 2.8897668194301598e-05,
+      "loss": 0.2186,
+      "step": 82150
+    },
+    {
+      "epoch": 2.27977909439797,
+      "grad_norm": 0.14575442671775818,
+      "learning_rate": 2.8792475962472277e-05,
+      "loss": 0.22,
+      "step": 82200
+    },
+    {
+      "epoch": 2.2811658213410344,
+      "grad_norm": 0.12261557579040527,
+      "learning_rate": 2.868744333047767e-05,
+      "loss": 0.2196,
+      "step": 82250
+    },
+    {
+      "epoch": 2.282552548284099,
+      "grad_norm": 0.12256131321191788,
+      "learning_rate": 2.8582570533731002e-05,
+      "loss": 0.2174,
+      "step": 82300
+    },
+    {
+      "epoch": 2.283939275227163,
+      "grad_norm": 0.13776589930057526,
+      "learning_rate": 2.8477857807287156e-05,
+      "loss": 0.2138,
+      "step": 82350
+    },
+    {
+      "epoch": 2.2853260021702275,
+      "grad_norm": 0.16855372488498688,
+      "learning_rate": 2.8373305385842385e-05,
+      "loss": 0.2188,
+      "step": 82400
+    },
+    {
+      "epoch": 2.286712729113292,
+      "grad_norm": 0.13965797424316406,
+      "learning_rate": 2.8268913503733498e-05,
+      "loss": 0.2199,
+      "step": 82450
+    },
+    {
+      "epoch": 2.2880994560563566,
+      "grad_norm": 0.1284012645483017,
+      "learning_rate": 2.816468239493758e-05,
+      "loss": 0.2178,
+      "step": 82500
+    },
+    {
+      "epoch": 2.289486182999421,
+      "grad_norm": 0.1288595348596573,
+      "learning_rate": 2.8060612293071363e-05,
+      "loss": 0.2162,
+      "step": 82550
+    },
+    {
+      "epoch": 2.2908729099424856,
+      "grad_norm": 0.1225946918129921,
+      "learning_rate": 2.795670343139072e-05,
+      "loss": 0.2135,
+      "step": 82600
+    },
+    {
+      "epoch": 2.29225963688555,
+      "grad_norm": 0.1356947422027588,
+      "learning_rate": 2.7852956042790023e-05,
+      "loss": 0.2204,
+      "step": 82650
+    },
+    {
+      "epoch": 2.293646363828614,
+      "grad_norm": 0.11796356737613678,
+      "learning_rate": 2.774937035980184e-05,
+      "loss": 0.2209,
+      "step": 82700
+    },
+    {
+      "epoch": 2.2950330907716787,
+      "grad_norm": 0.14882323145866394,
+      "learning_rate": 2.7648013501010216e-05,
+      "loss": 0.2201,
+      "step": 82750
+    },
+    {
+      "epoch": 2.2964198177147432,
+      "grad_norm": 0.1550634354352951,
+      "learning_rate": 2.7544748679733266e-05,
+      "loss": 0.2214,
+      "step": 82800
+    },
+    {
+      "epoch": 2.2978065446578078,
+      "grad_norm": 0.12448450177907944,
+      "learning_rate": 2.7441646254864463e-05,
+      "loss": 0.2206,
+      "step": 82850
+    },
+    {
+      "epoch": 2.2991932716008723,
+      "grad_norm": 0.13499294221401215,
+      "learning_rate": 2.7338706457490704e-05,
+      "loss": 0.2182,
+      "step": 82900
+    },
+    {
+      "epoch": 2.300579998543937,
+      "grad_norm": 0.1396186500787735,
+      "learning_rate": 2.7235929518334515e-05,
+      "loss": 0.2214,
+      "step": 82950
+    },
+    {
+      "epoch": 2.3019667254870013,
+      "grad_norm": 0.1256234049797058,
+      "learning_rate": 2.7133315667753244e-05,
+      "loss": 0.2177,
+      "step": 83000
+    },
+    {
+      "epoch": 2.3019667254870013,
+      "eval_loss": 0.21797436475753784,
+      "eval_runtime": 500.3137,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 83000
+    },
+    {
+      "epoch": 2.3033534524300654,
+      "grad_norm": 0.13521268963813782,
+      "learning_rate": 2.7030865135738836e-05,
+      "loss": 0.2164,
+      "step": 83050
+    },
+    {
+      "epoch": 2.30474017937313,
+      "grad_norm": 0.1525382250547409,
+      "learning_rate": 2.692857815191714e-05,
+      "loss": 0.218,
+      "step": 83100
+    },
+    {
+      "epoch": 2.3061269063161944,
+      "grad_norm": 0.11054780334234238,
+      "learning_rate": 2.6826454945547452e-05,
+      "loss": 0.2179,
+      "step": 83150
+    },
+    {
+      "epoch": 2.307513633259259,
+      "grad_norm": 0.10860705375671387,
+      "learning_rate": 2.6724495745521928e-05,
+      "loss": 0.2198,
+      "step": 83200
+    },
+    {
+      "epoch": 2.3089003602023235,
+      "grad_norm": 0.1467871069908142,
+      "learning_rate": 2.6622700780365195e-05,
+      "loss": 0.2182,
+      "step": 83250
+    },
+    {
+      "epoch": 2.310287087145388,
+      "grad_norm": 0.13562606275081635,
+      "learning_rate": 2.6521070278233783e-05,
+      "loss": 0.219,
+      "step": 83300
+    },
+    {
+      "epoch": 2.3116738140884525,
+      "grad_norm": 0.14807383716106415,
+      "learning_rate": 2.6419604466915526e-05,
+      "loss": 0.2187,
+      "step": 83350
+    },
+    {
+      "epoch": 2.3130605410315166,
+      "grad_norm": 0.13751082122325897,
+      "learning_rate": 2.631830357382925e-05,
+      "loss": 0.2183,
+      "step": 83400
+    },
+    {
+      "epoch": 2.314447267974581,
+      "grad_norm": 0.13988618552684784,
+      "learning_rate": 2.621716782602396e-05,
+      "loss": 0.2189,
+      "step": 83450
+    },
+    {
+      "epoch": 2.3158339949176456,
+      "grad_norm": 0.13154689967632294,
+      "learning_rate": 2.611619745017878e-05,
+      "loss": 0.2189,
+      "step": 83500
+    },
+    {
+      "epoch": 2.31722072186071,
+      "grad_norm": 0.11987145990133286,
+      "learning_rate": 2.6015392672601924e-05,
+      "loss": 0.2176,
+      "step": 83550
+    },
+    {
+      "epoch": 2.3186074488037747,
+      "grad_norm": 0.12398877739906311,
+      "learning_rate": 2.5914753719230623e-05,
+      "loss": 0.2197,
+      "step": 83600
+    },
+    {
+      "epoch": 2.319994175746839,
+      "grad_norm": 0.11837327480316162,
+      "learning_rate": 2.581428081563031e-05,
+      "loss": 0.2177,
+      "step": 83650
+    },
+    {
+      "epoch": 2.3213809026899037,
+      "grad_norm": 0.14284634590148926,
+      "learning_rate": 2.571397418699436e-05,
+      "loss": 0.2183,
+      "step": 83700
+    },
+    {
+      "epoch": 2.3227676296329682,
+      "grad_norm": 0.11705781519412994,
+      "learning_rate": 2.561383405814336e-05,
+      "loss": 0.2185,
+      "step": 83750
+    },
+    {
+      "epoch": 2.3241543565760328,
+      "grad_norm": 0.13301438093185425,
+      "learning_rate": 2.55138606535248e-05,
+      "loss": 0.2187,
+      "step": 83800
+    },
+    {
+      "epoch": 2.325541083519097,
+      "grad_norm": 0.15416869521141052,
+      "learning_rate": 2.5414054197212467e-05,
+      "loss": 0.2185,
+      "step": 83850
+    },
+    {
+      "epoch": 2.3269278104621614,
+      "grad_norm": 0.13096120953559875,
+      "learning_rate": 2.5314414912905938e-05,
+      "loss": 0.2181,
+      "step": 83900
+    },
+    {
+      "epoch": 2.328314537405226,
+      "grad_norm": 0.19566649198532104,
+      "learning_rate": 2.5214943023930137e-05,
+      "loss": 0.2175,
+      "step": 83950
+    },
+    {
+      "epoch": 2.3297012643482904,
+      "grad_norm": 0.1359308511018753,
+      "learning_rate": 2.511563875323474e-05,
+      "loss": 0.2185,
+      "step": 84000
+    },
+    {
+      "epoch": 2.3297012643482904,
+      "eval_loss": 0.21784663200378418,
+      "eval_runtime": 500.2701,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 84000
+    },
+    {
+      "epoch": 2.331087991291355,
+      "grad_norm": 0.12440098077058792,
+      "learning_rate": 2.501650232339382e-05,
+      "loss": 0.2178,
+      "step": 84050
+    },
+    {
+      "epoch": 2.3324747182344194,
+      "grad_norm": 0.13704413175582886,
+      "learning_rate": 2.4917533956605153e-05,
+      "loss": 0.218,
+      "step": 84100
+    },
+    {
+      "epoch": 2.333861445177484,
+      "grad_norm": 0.13410316407680511,
+      "learning_rate": 2.481873387468995e-05,
+      "loss": 0.2135,
+      "step": 84150
+    },
+    {
+      "epoch": 2.335248172120548,
+      "grad_norm": 0.13458983600139618,
+      "learning_rate": 2.4720102299092117e-05,
+      "loss": 0.2206,
+      "step": 84200
+    },
+    {
+      "epoch": 2.3366348990636125,
+      "grad_norm": 0.13513872027397156,
+      "learning_rate": 2.4621639450877987e-05,
+      "loss": 0.216,
+      "step": 84250
+    },
+    {
+      "epoch": 2.338021626006677,
+      "grad_norm": 0.12012195587158203,
+      "learning_rate": 2.4523345550735665e-05,
+      "loss": 0.2154,
+      "step": 84300
+    },
+    {
+      "epoch": 2.3394083529497416,
+      "grad_norm": 0.1352149099111557,
+      "learning_rate": 2.4425220818974624e-05,
+      "loss": 0.2162,
+      "step": 84350
+    },
+    {
+      "epoch": 2.340795079892806,
+      "grad_norm": 0.12285764515399933,
+      "learning_rate": 2.4327265475525097e-05,
+      "loss": 0.2205,
+      "step": 84400
+    },
+    {
+      "epoch": 2.3421818068358706,
+      "grad_norm": 0.12704753875732422,
+      "learning_rate": 2.4229479739937745e-05,
+      "loss": 0.2186,
+      "step": 84450
+    },
+    {
+      "epoch": 2.343568533778935,
+      "grad_norm": 0.131776824593544,
+      "learning_rate": 2.4131863831383062e-05,
+      "loss": 0.219,
+      "step": 84500
+    },
+    {
+      "epoch": 2.3449552607219992,
+      "grad_norm": 0.12816810607910156,
+      "learning_rate": 2.4034417968650834e-05,
+      "loss": 0.2227,
+      "step": 84550
+    },
+    {
+      "epoch": 2.3463419876650637,
+      "grad_norm": 0.12194739282131195,
+      "learning_rate": 2.3937142370149857e-05,
+      "loss": 0.2175,
+      "step": 84600
+    },
+    {
+      "epoch": 2.3477287146081283,
+      "grad_norm": 0.17542821168899536,
+      "learning_rate": 2.3840037253907098e-05,
+      "loss": 0.2232,
+      "step": 84650
+    },
+    {
+      "epoch": 2.349115441551193,
+      "grad_norm": 0.14499437808990479,
+      "learning_rate": 2.3743102837567688e-05,
+      "loss": 0.2222,
+      "step": 84700
+    },
+    {
+      "epoch": 2.3505021684942573,
+      "grad_norm": 0.1500397026538849,
+      "learning_rate": 2.364633933839391e-05,
+      "loss": 0.2233,
+      "step": 84750
+    },
+    {
+      "epoch": 2.351888895437322,
+      "grad_norm": 0.11950129270553589,
+      "learning_rate": 2.354974697326514e-05,
+      "loss": 0.2174,
+      "step": 84800
+    },
+    {
+      "epoch": 2.3532756223803863,
+      "grad_norm": 0.11446011066436768,
+      "learning_rate": 2.3453325958677053e-05,
+      "loss": 0.2177,
+      "step": 84850
+    },
+    {
+      "epoch": 2.3546623493234504,
+      "grad_norm": 0.14888976514339447,
+      "learning_rate": 2.335707651074137e-05,
+      "loss": 0.2172,
+      "step": 84900
+    },
+    {
+      "epoch": 2.356049076266515,
+      "grad_norm": 0.14163066446781158,
+      "learning_rate": 2.3260998845185254e-05,
+      "loss": 0.218,
+      "step": 84950
+    },
+    {
+      "epoch": 2.3574358032095795,
+      "grad_norm": 0.12244392931461334,
+      "learning_rate": 2.3165093177350793e-05,
+      "loss": 0.2175,
+      "step": 85000
+    },
+    {
+      "epoch": 2.3574358032095795,
+      "eval_loss": 0.21762743592262268,
+      "eval_runtime": 500.9336,
+      "eval_samples_per_second": 5.703,
+      "eval_steps_per_second": 5.703,
+      "step": 85000
+    },
+    {
+      "epoch": 2.358822530152644,
+      "grad_norm": 0.15694329142570496,
+      "learning_rate": 2.3069359722194617e-05,
+      "loss": 0.222,
+      "step": 85050
+    },
+    {
+      "epoch": 2.3602092570957085,
+      "grad_norm": 0.14646472036838531,
+      "learning_rate": 2.2973798694287362e-05,
+      "loss": 0.2174,
+      "step": 85100
+    },
+    {
+      "epoch": 2.361595984038773,
+      "grad_norm": 0.1558624804019928,
+      "learning_rate": 2.2878410307813235e-05,
+      "loss": 0.2209,
+      "step": 85150
+    },
+    {
+      "epoch": 2.3629827109818375,
+      "grad_norm": 0.15354704856872559,
+      "learning_rate": 2.2783194776569394e-05,
+      "loss": 0.2182,
+      "step": 85200
+    },
+    {
+      "epoch": 2.3643694379249016,
+      "grad_norm": 0.1462877094745636,
+      "learning_rate": 2.2688152313965684e-05,
+      "loss": 0.2188,
+      "step": 85250
+    },
+    {
+      "epoch": 2.365756164867966,
+      "grad_norm": 0.15048891305923462,
+      "learning_rate": 2.2593283133023945e-05,
+      "loss": 0.2161,
+      "step": 85300
+    },
+    {
+      "epoch": 2.3671428918110307,
+      "grad_norm": 0.12739485502243042,
+      "learning_rate": 2.2498587446377716e-05,
+      "loss": 0.2173,
+      "step": 85350
+    },
+    {
+      "epoch": 2.368529618754095,
+      "grad_norm": 0.12822887301445007,
+      "learning_rate": 2.2404065466271673e-05,
+      "loss": 0.222,
+      "step": 85400
+    },
+    {
+      "epoch": 2.3699163456971597,
+      "grad_norm": 0.1442478448152542,
+      "learning_rate": 2.2311602660026586e-05,
+      "loss": 0.2203,
+      "step": 85450
+    },
+    {
+      "epoch": 2.371303072640224,
+      "grad_norm": 0.15499895811080933,
+      "learning_rate": 2.2217425243509928e-05,
+      "loss": 0.2198,
+      "step": 85500
+    },
+    {
+      "epoch": 2.3726897995832887,
+      "grad_norm": 0.12855781614780426,
+      "learning_rate": 2.212342216371176e-05,
+      "loss": 0.221,
+      "step": 85550
+    },
+    {
+      "epoch": 2.374076526526353,
+      "grad_norm": 0.13640813529491425,
+      "learning_rate": 2.2029593631324417e-05,
+      "loss": 0.2175,
+      "step": 85600
+    },
+    {
+      "epoch": 2.3754632534694173,
+      "grad_norm": 0.13200990855693817,
+      "learning_rate": 2.19359398566489e-05,
+      "loss": 0.2175,
+      "step": 85650
+    },
+    {
+      "epoch": 2.376849980412482,
+      "grad_norm": 0.14753013849258423,
+      "learning_rate": 2.1842461049594677e-05,
+      "loss": 0.2188,
+      "step": 85700
+    },
+    {
+      "epoch": 2.3782367073555464,
+      "grad_norm": 0.11965631693601608,
+      "learning_rate": 2.17491574196789e-05,
+      "loss": 0.2191,
+      "step": 85750
+    },
+    {
+      "epoch": 2.379623434298611,
+      "grad_norm": 0.11599334329366684,
+      "learning_rate": 2.1656029176026193e-05,
+      "loss": 0.2199,
+      "step": 85800
+    },
+    {
+      "epoch": 2.3810101612416754,
+      "grad_norm": 0.14910413324832916,
+      "learning_rate": 2.1563076527367996e-05,
+      "loss": 0.2175,
+      "step": 85850
+    },
+    {
+      "epoch": 2.38239688818474,
+      "grad_norm": 0.1290595680475235,
+      "learning_rate": 2.147029968204226e-05,
+      "loss": 0.2163,
+      "step": 85900
+    },
+    {
+      "epoch": 2.383783615127804,
+      "grad_norm": 0.13954681158065796,
+      "learning_rate": 2.1377698847992878e-05,
+      "loss": 0.2253,
+      "step": 85950
+    },
+    {
+      "epoch": 2.3851703420708685,
+      "grad_norm": 0.10592395067214966,
+      "learning_rate": 2.1285274232769194e-05,
+      "loss": 0.2204,
+      "step": 86000
+    },
+    {
+      "epoch": 2.3851703420708685,
+      "eval_loss": 0.2176109105348587,
+      "eval_runtime": 500.5123,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 86000
+    },
+    {
+      "epoch": 2.386557069013933,
+      "grad_norm": 0.1263597160577774,
+      "learning_rate": 2.1193026043525655e-05,
+      "loss": 0.2194,
+      "step": 86050
+    },
+    {
+      "epoch": 2.3879437959569976,
+      "grad_norm": 0.13431338965892792,
+      "learning_rate": 2.1100954487021252e-05,
+      "loss": 0.2156,
+      "step": 86100
+    },
+    {
+      "epoch": 2.389330522900062,
+      "grad_norm": 0.13657350838184357,
+      "learning_rate": 2.10090597696191e-05,
+      "loss": 0.2163,
+      "step": 86150
+    },
+    {
+      "epoch": 2.3907172498431266,
+      "grad_norm": 0.1250746250152588,
+      "learning_rate": 2.0917342097285897e-05,
+      "loss": 0.2192,
+      "step": 86200
+    },
+    {
+      "epoch": 2.392103976786191,
+      "grad_norm": 0.11769583076238632,
+      "learning_rate": 2.0825801675591618e-05,
+      "loss": 0.22,
+      "step": 86250
+    },
+    {
+      "epoch": 2.393490703729255,
+      "grad_norm": 0.1266479194164276,
+      "learning_rate": 2.073443870970886e-05,
+      "loss": 0.2205,
+      "step": 86300
+    },
+    {
+      "epoch": 2.3948774306723197,
+      "grad_norm": 0.1402258574962616,
+      "learning_rate": 2.0643253404412564e-05,
+      "loss": 0.2161,
+      "step": 86350
+    },
+    {
+      "epoch": 2.3962641576153842,
+      "grad_norm": 0.15390367805957794,
+      "learning_rate": 2.055224596407942e-05,
+      "loss": 0.2153,
+      "step": 86400
+    },
+    {
+      "epoch": 2.3976508845584488,
+      "grad_norm": 0.15276868641376495,
+      "learning_rate": 2.0461416592687487e-05,
+      "loss": 0.2193,
+      "step": 86450
+    },
+    {
+      "epoch": 2.3990376115015133,
+      "grad_norm": 0.14889566600322723,
+      "learning_rate": 2.0370765493815735e-05,
+      "loss": 0.2186,
+      "step": 86500
+    },
+    {
+      "epoch": 2.400424338444578,
+      "grad_norm": 0.12452477216720581,
+      "learning_rate": 2.0280292870643524e-05,
+      "loss": 0.2249,
+      "step": 86550
+    },
+    {
+      "epoch": 2.4018110653876423,
+      "grad_norm": 0.12419555336236954,
+      "learning_rate": 2.0189998925950227e-05,
+      "loss": 0.219,
+      "step": 86600
+    },
+    {
+      "epoch": 2.4031977923307064,
+      "grad_norm": 0.12875986099243164,
+      "learning_rate": 2.0099883862114688e-05,
+      "loss": 0.2214,
+      "step": 86650
+    },
+    {
+      "epoch": 2.404584519273771,
+      "grad_norm": 0.18010209500789642,
+      "learning_rate": 2.0009947881114888e-05,
+      "loss": 0.2176,
+      "step": 86700
+    },
+    {
+      "epoch": 2.4059712462168354,
+      "grad_norm": 0.142224982380867,
+      "learning_rate": 1.992019118452735e-05,
+      "loss": 0.2147,
+      "step": 86750
+    },
+    {
+      "epoch": 2.4073579731599,
+      "grad_norm": 0.13554586470127106,
+      "learning_rate": 1.9830613973526823e-05,
+      "loss": 0.2146,
+      "step": 86800
+    },
+    {
+      "epoch": 2.4087447001029645,
+      "grad_norm": 0.16745540499687195,
+      "learning_rate": 1.974121644888569e-05,
+      "loss": 0.2172,
+      "step": 86850
+    },
+    {
+      "epoch": 2.410131427046029,
+      "grad_norm": 0.138095885515213,
+      "learning_rate": 1.9651998810973737e-05,
+      "loss": 0.2236,
+      "step": 86900
+    },
+    {
+      "epoch": 2.4115181539890935,
+      "grad_norm": 0.12732721865177155,
+      "learning_rate": 1.9562961259757418e-05,
+      "loss": 0.2195,
+      "step": 86950
+    },
+    {
+      "epoch": 2.4129048809321576,
+      "grad_norm": 0.14291352033615112,
+      "learning_rate": 1.9474103994799643e-05,
+      "loss": 0.2199,
+      "step": 87000
+    },
+    {
+      "epoch": 2.4129048809321576,
+      "eval_loss": 0.21756121516227722,
+      "eval_runtime": 500.6311,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 87000
+    },
+    {
+      "epoch": 2.414291607875222,
+      "grad_norm": 0.15151503682136536,
+      "learning_rate": 1.9385427215259166e-05,
+      "loss": 0.22,
+      "step": 87050
+    },
+    {
+      "epoch": 2.4156783348182866,
+      "grad_norm": 0.14093713462352753,
+      "learning_rate": 1.9296931119890283e-05,
+      "loss": 0.2168,
+      "step": 87100
+    },
+    {
+      "epoch": 2.417065061761351,
+      "grad_norm": 0.15471357107162476,
+      "learning_rate": 1.9208615907042316e-05,
+      "loss": 0.2209,
+      "step": 87150
+    },
+    {
+      "epoch": 2.4184517887044157,
+      "grad_norm": 0.15164633095264435,
+      "learning_rate": 1.9120481774659083e-05,
+      "loss": 0.2207,
+      "step": 87200
+    },
+    {
+      "epoch": 2.41983851564748,
+      "grad_norm": 0.1304929554462433,
+      "learning_rate": 1.9032528920278625e-05,
+      "loss": 0.2192,
+      "step": 87250
+    },
+    {
+      "epoch": 2.4212252425905447,
+      "grad_norm": 0.1501680463552475,
+      "learning_rate": 1.8944757541032664e-05,
+      "loss": 0.2209,
+      "step": 87300
+    },
+    {
+      "epoch": 2.4226119695336092,
+      "grad_norm": 0.12184792011976242,
+      "learning_rate": 1.8857167833646184e-05,
+      "loss": 0.219,
+      "step": 87350
+    },
+    {
+      "epoch": 2.4239986964766738,
+      "grad_norm": 0.14270274341106415,
+      "learning_rate": 1.8769759994436896e-05,
+      "loss": 0.2161,
+      "step": 87400
+    },
+    {
+      "epoch": 2.425385423419738,
+      "grad_norm": 0.14052851498126984,
+      "learning_rate": 1.868253421931503e-05,
+      "loss": 0.2177,
+      "step": 87450
+    },
+    {
+      "epoch": 2.4267721503628024,
+      "grad_norm": 0.14709405601024628,
+      "learning_rate": 1.859549070378259e-05,
+      "loss": 0.218,
+      "step": 87500
+    },
+    {
+      "epoch": 2.428158877305867,
+      "grad_norm": 0.15017499029636383,
+      "learning_rate": 1.8508629642933207e-05,
+      "loss": 0.2171,
+      "step": 87550
+    },
+    {
+      "epoch": 2.4295456042489314,
+      "grad_norm": 0.11489958316087723,
+      "learning_rate": 1.842195123145152e-05,
+      "loss": 0.2208,
+      "step": 87600
+    },
+    {
+      "epoch": 2.430932331191996,
+      "grad_norm": 0.1223435029387474,
+      "learning_rate": 1.8335455663612744e-05,
+      "loss": 0.2186,
+      "step": 87650
+    },
+    {
+      "epoch": 2.4323190581350604,
+      "grad_norm": 0.10422486811876297,
+      "learning_rate": 1.8249143133282344e-05,
+      "loss": 0.2169,
+      "step": 87700
+    },
+    {
+      "epoch": 2.433705785078125,
+      "grad_norm": 0.11790075153112411,
+      "learning_rate": 1.8163013833915532e-05,
+      "loss": 0.2201,
+      "step": 87750
+    },
+    {
+      "epoch": 2.435092512021189,
+      "grad_norm": 0.12330956012010574,
+      "learning_rate": 1.807706795855685e-05,
+      "loss": 0.2181,
+      "step": 87800
+    },
+    {
+      "epoch": 2.4364792389642536,
+      "grad_norm": 0.12955226004123688,
+      "learning_rate": 1.7991305699839623e-05,
+      "loss": 0.2158,
+      "step": 87850
+    },
+    {
+      "epoch": 2.437865965907318,
+      "grad_norm": 0.11457941681146622,
+      "learning_rate": 1.790572724998577e-05,
+      "loss": 0.2155,
+      "step": 87900
+    },
+    {
+      "epoch": 2.4392526928503826,
+      "grad_norm": 0.14329944550991058,
+      "learning_rate": 1.782033280080513e-05,
+      "loss": 0.2175,
+      "step": 87950
+    },
+    {
+      "epoch": 2.440639419793447,
+      "grad_norm": 0.15041890740394592,
+      "learning_rate": 1.7735122543695205e-05,
+      "loss": 0.2199,
+      "step": 88000
+    },
+    {
+      "epoch": 2.440639419793447,
+      "eval_loss": 0.21746821701526642,
+      "eval_runtime": 500.6597,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 88000
+    },
+    {
+      "epoch": 2.4420261467365116,
+      "grad_norm": 0.13580143451690674,
+      "learning_rate": 1.765009666964056e-05,
+      "loss": 0.2163,
+      "step": 88050
+    },
+    {
+      "epoch": 2.443412873679576,
+      "grad_norm": 0.14240418374538422,
+      "learning_rate": 1.7565255369212662e-05,
+      "loss": 0.2221,
+      "step": 88100
+    },
+    {
+      "epoch": 2.4447996006226402,
+      "grad_norm": 0.13065385818481445,
+      "learning_rate": 1.748059883256913e-05,
+      "loss": 0.2226,
+      "step": 88150
+    },
+    {
+      "epoch": 2.4461863275657048,
+      "grad_norm": 0.13227535784244537,
+      "learning_rate": 1.7396127249453542e-05,
+      "loss": 0.2152,
+      "step": 88200
+    },
+    {
+      "epoch": 2.4475730545087693,
+      "grad_norm": 0.13343574106693268,
+      "learning_rate": 1.7311840809194934e-05,
+      "loss": 0.2162,
+      "step": 88250
+    },
+    {
+      "epoch": 2.448959781451834,
+      "grad_norm": 0.14660726487636566,
+      "learning_rate": 1.7227739700707322e-05,
+      "loss": 0.2183,
+      "step": 88300
+    },
+    {
+      "epoch": 2.4503465083948983,
+      "grad_norm": 0.13315744698047638,
+      "learning_rate": 1.7143824112489413e-05,
+      "loss": 0.218,
+      "step": 88350
+    },
+    {
+      "epoch": 2.451733235337963,
+      "grad_norm": 0.15965452790260315,
+      "learning_rate": 1.7060094232624012e-05,
+      "loss": 0.2169,
+      "step": 88400
+    },
+    {
+      "epoch": 2.4531199622810274,
+      "grad_norm": 0.1352396458387375,
+      "learning_rate": 1.6976550248777747e-05,
+      "loss": 0.217,
+      "step": 88450
+    },
+    {
+      "epoch": 2.4545066892240914,
+      "grad_norm": 0.16009649634361267,
+      "learning_rate": 1.6893192348200582e-05,
+      "loss": 0.2179,
+      "step": 88500
+    },
+    {
+      "epoch": 2.455893416167156,
+      "grad_norm": 0.13520574569702148,
+      "learning_rate": 1.6810020717725427e-05,
+      "loss": 0.2179,
+      "step": 88550
+    },
+    {
+      "epoch": 2.4572801431102205,
+      "grad_norm": 0.1435248851776123,
+      "learning_rate": 1.6727035543767634e-05,
+      "loss": 0.2185,
+      "step": 88600
+    },
+    {
+      "epoch": 2.458666870053285,
+      "grad_norm": 0.12575069069862366,
+      "learning_rate": 1.6644237012324716e-05,
+      "loss": 0.2224,
+      "step": 88650
+    },
+    {
+      "epoch": 2.4600535969963495,
+      "grad_norm": 0.143171489238739,
+      "learning_rate": 1.6561625308975782e-05,
+      "loss": 0.2159,
+      "step": 88700
+    },
+    {
+      "epoch": 2.461440323939414,
+      "grad_norm": 0.1348053365945816,
+      "learning_rate": 1.6479200618881275e-05,
+      "loss": 0.2171,
+      "step": 88750
+    },
+    {
+      "epoch": 2.4628270508824786,
+      "grad_norm": 0.12457796931266785,
+      "learning_rate": 1.639696312678245e-05,
+      "loss": 0.2211,
+      "step": 88800
+    },
+    {
+      "epoch": 2.4642137778255426,
+      "grad_norm": 0.1280537098646164,
+      "learning_rate": 1.6314913017000955e-05,
+      "loss": 0.218,
+      "step": 88850
+    },
+    {
+      "epoch": 2.465600504768607,
+      "grad_norm": 0.12102089077234268,
+      "learning_rate": 1.6233050473438483e-05,
+      "loss": 0.2186,
+      "step": 88900
+    },
+    {
+      "epoch": 2.4669872317116717,
+      "grad_norm": 0.15358726680278778,
+      "learning_rate": 1.615137567957634e-05,
+      "loss": 0.2192,
+      "step": 88950
+    },
+    {
+      "epoch": 2.468373958654736,
+      "grad_norm": 0.1613384634256363,
+      "learning_rate": 1.6069888818475022e-05,
+      "loss": 0.2191,
+      "step": 89000
+    },
+    {
+      "epoch": 2.468373958654736,
+      "eval_loss": 0.21739034354686737,
+      "eval_runtime": 500.3264,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 89000
+    },
+    {
+      "epoch": 2.4697606855978007,
+      "grad_norm": 0.12117033451795578,
+      "learning_rate": 1.5988590072773744e-05,
+      "loss": 0.2202,
+      "step": 89050
+    },
+    {
+      "epoch": 2.4711474125408652,
+      "grad_norm": 0.13079126179218292,
+      "learning_rate": 1.590747962469018e-05,
+      "loss": 0.2146,
+      "step": 89100
+    },
+    {
+      "epoch": 2.4725341394839297,
+      "grad_norm": 0.12103881686925888,
+      "learning_rate": 1.582655765601989e-05,
+      "loss": 0.2182,
+      "step": 89150
+    },
+    {
+      "epoch": 2.473920866426994,
+      "grad_norm": 0.12128273397684097,
+      "learning_rate": 1.574582434813604e-05,
+      "loss": 0.217,
+      "step": 89200
+    },
+    {
+      "epoch": 2.4753075933700583,
+      "grad_norm": 0.13186509907245636,
+      "learning_rate": 1.5665279881988946e-05,
+      "loss": 0.2181,
+      "step": 89250
+    },
+    {
+      "epoch": 2.476694320313123,
+      "grad_norm": 0.17148008942604065,
+      "learning_rate": 1.5584924438105586e-05,
+      "loss": 0.2235,
+      "step": 89300
+    },
+    {
+      "epoch": 2.4780810472561874,
+      "grad_norm": 0.12658308446407318,
+      "learning_rate": 1.550475819658942e-05,
+      "loss": 0.219,
+      "step": 89350
+    },
+    {
+      "epoch": 2.479467774199252,
+      "grad_norm": 0.13896940648555756,
+      "learning_rate": 1.5424781337119685e-05,
+      "loss": 0.2193,
+      "step": 89400
+    },
+    {
+      "epoch": 2.4808545011423164,
+      "grad_norm": 0.13384407758712769,
+      "learning_rate": 1.534658792605652e-05,
+      "loss": 0.2173,
+      "step": 89450
+    },
+    {
+      "epoch": 2.482241228085381,
+      "grad_norm": 0.13054698705673218,
+      "learning_rate": 1.526698657146697e-05,
+      "loss": 0.217,
+      "step": 89500
+    },
+    {
+      "epoch": 2.483627955028445,
+      "grad_norm": 0.11624244600534439,
+      "learning_rate": 1.5187575131849496e-05,
+      "loss": 0.2174,
+      "step": 89550
+    },
+    {
+      "epoch": 2.4850146819715095,
+      "grad_norm": 0.13589714467525482,
+      "learning_rate": 1.5109936348064579e-05,
+      "loss": 0.2108,
+      "step": 89600
+    },
+    {
+      "epoch": 2.486401408914574,
+      "grad_norm": 0.1458209604024887,
+      "learning_rate": 1.5030901464780044e-05,
+      "loss": 0.2138,
+      "step": 89650
+    },
+    {
+      "epoch": 2.4877881358576386,
+      "grad_norm": 0.1267354041337967,
+      "learning_rate": 1.4952057025613075e-05,
+      "loss": 0.2154,
+      "step": 89700
+    },
+    {
+      "epoch": 2.489174862800703,
+      "grad_norm": 0.13832834362983704,
+      "learning_rate": 1.4873403207280445e-05,
+      "loss": 0.2182,
+      "step": 89750
+    },
+    {
+      "epoch": 2.4905615897437676,
+      "grad_norm": 0.15721414983272552,
+      "learning_rate": 1.4794940186071582e-05,
+      "loss": 0.2181,
+      "step": 89800
+    },
+    {
+      "epoch": 2.491948316686832,
+      "grad_norm": 0.13558489084243774,
+      "learning_rate": 1.471666813784831e-05,
+      "loss": 0.2194,
+      "step": 89850
+    },
+    {
+      "epoch": 2.493335043629896,
+      "grad_norm": 0.14502158761024475,
+      "learning_rate": 1.4638587238044466e-05,
+      "loss": 0.2197,
+      "step": 89900
+    },
+    {
+      "epoch": 2.4947217705729607,
+      "grad_norm": 0.14285750687122345,
+      "learning_rate": 1.4560697661665346e-05,
+      "loss": 0.2185,
+      "step": 89950
+    },
+    {
+      "epoch": 2.4961084975160253,
+      "grad_norm": 0.14996595680713654,
+      "learning_rate": 1.4482999583287549e-05,
+      "loss": 0.2188,
+      "step": 90000
+    },
+    {
+      "epoch": 2.4961084975160253,
+      "eval_loss": 0.21735654771327972,
+      "eval_runtime": 500.6813,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 90000
+    },
+    {
+      "epoch": 2.49749522445909,
+      "grad_norm": 0.12452629953622818,
+      "learning_rate": 1.4405493177058382e-05,
+      "loss": 0.2141,
+      "step": 90050
+    },
+    {
+      "epoch": 2.4988819514021543,
+      "grad_norm": 0.11585172265768051,
+      "learning_rate": 1.432817861669561e-05,
+      "loss": 0.22,
+      "step": 90100
+    },
+    {
+      "epoch": 2.500268678345219,
+      "grad_norm": 0.1297215074300766,
+      "learning_rate": 1.4251056075486935e-05,
+      "loss": 0.217,
+      "step": 90150
+    },
+    {
+      "epoch": 2.5016554052882833,
+      "grad_norm": 0.1487419456243515,
+      "learning_rate": 1.4174125726289755e-05,
+      "loss": 0.2192,
+      "step": 90200
+    },
+    {
+      "epoch": 2.5030421322313474,
+      "grad_norm": 0.13567551970481873,
+      "learning_rate": 1.409738774153062e-05,
+      "loss": 0.2171,
+      "step": 90250
+    },
+    {
+      "epoch": 2.5044288591744124,
+      "grad_norm": 0.12301863729953766,
+      "learning_rate": 1.4020842293205016e-05,
+      "loss": 0.2144,
+      "step": 90300
+    },
+    {
+      "epoch": 2.5058155861174765,
+      "grad_norm": 0.12166890501976013,
+      "learning_rate": 1.394448955287685e-05,
+      "loss": 0.2216,
+      "step": 90350
+    },
+    {
+      "epoch": 2.507202313060541,
+      "grad_norm": 0.1442263275384903,
+      "learning_rate": 1.386832969167805e-05,
+      "loss": 0.2166,
+      "step": 90400
+    },
+    {
+      "epoch": 2.5085890400036055,
+      "grad_norm": 0.1268533319234848,
+      "learning_rate": 1.3792362880308374e-05,
+      "loss": 0.2185,
+      "step": 90450
+    },
+    {
+      "epoch": 2.50997576694667,
+      "grad_norm": 0.11250314116477966,
+      "learning_rate": 1.3716589289034731e-05,
+      "loss": 0.2174,
+      "step": 90500
+    },
+    {
+      "epoch": 2.5113624938897345,
+      "grad_norm": 0.12006182223558426,
+      "learning_rate": 1.3641009087691103e-05,
+      "loss": 0.219,
+      "step": 90550
+    },
+    {
+      "epoch": 2.5127492208327986,
+      "grad_norm": 0.15108701586723328,
+      "learning_rate": 1.3565622445677906e-05,
+      "loss": 0.2219,
+      "step": 90600
+    },
+    {
+      "epoch": 2.5141359477758636,
+      "grad_norm": 0.1432884782552719,
+      "learning_rate": 1.3490429531961802e-05,
+      "loss": 0.2158,
+      "step": 90650
+    },
+    {
+      "epoch": 2.5155226747189277,
+      "grad_norm": 0.15546324849128723,
+      "learning_rate": 1.3415430515075178e-05,
+      "loss": 0.2208,
+      "step": 90700
+    },
+    {
+      "epoch": 2.516909401661992,
+      "grad_norm": 0.126853808760643,
+      "learning_rate": 1.3340625563115905e-05,
+      "loss": 0.217,
+      "step": 90750
+    },
+    {
+      "epoch": 2.5182961286050567,
+      "grad_norm": 0.1568535566329956,
+      "learning_rate": 1.3266014843746832e-05,
+      "loss": 0.2153,
+      "step": 90800
+    },
+    {
+      "epoch": 2.519682855548121,
+      "grad_norm": 0.15077820420265198,
+      "learning_rate": 1.3191598524195537e-05,
+      "loss": 0.22,
+      "step": 90850
+    },
+    {
+      "epoch": 2.5210695824911857,
+      "grad_norm": 0.12230156362056732,
+      "learning_rate": 1.3117376771253775e-05,
+      "loss": 0.2172,
+      "step": 90900
+    },
+    {
+      "epoch": 2.52245630943425,
+      "grad_norm": 0.15573225915431976,
+      "learning_rate": 1.30433497512773e-05,
+      "loss": 0.2178,
+      "step": 90950
+    },
+    {
+      "epoch": 2.5238430363773148,
+      "grad_norm": 0.13438810408115387,
+      "learning_rate": 1.2969517630185401e-05,
+      "loss": 0.2168,
+      "step": 91000
+    },
+    {
+      "epoch": 2.5238430363773148,
+      "eval_loss": 0.21724413335323334,
+      "eval_runtime": 500.5096,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 91000
+    },
+    {
+      "epoch": 2.525229763320379,
+      "grad_norm": 0.11917885392904282,
+      "learning_rate": 1.2895880573460462e-05,
+      "loss": 0.2173,
+      "step": 91050
+    },
+    {
+      "epoch": 2.5266164902634434,
+      "grad_norm": 0.127394899725914,
+      "learning_rate": 1.2822438746147769e-05,
+      "loss": 0.2163,
+      "step": 91100
+    },
+    {
+      "epoch": 2.528003217206508,
+      "grad_norm": 0.16986310482025146,
+      "learning_rate": 1.2749192312854929e-05,
+      "loss": 0.2234,
+      "step": 91150
+    },
+    {
+      "epoch": 2.5293899441495724,
+      "grad_norm": 0.12022554129362106,
+      "learning_rate": 1.2676141437751665e-05,
+      "loss": 0.2173,
+      "step": 91200
+    },
+    {
+      "epoch": 2.530776671092637,
+      "grad_norm": 0.13442516326904297,
+      "learning_rate": 1.2603286284569382e-05,
+      "loss": 0.213,
+      "step": 91250
+    },
+    {
+      "epoch": 2.5321633980357015,
+      "grad_norm": 0.1352687031030655,
+      "learning_rate": 1.2530627016600826e-05,
+      "loss": 0.2209,
+      "step": 91300
+    },
+    {
+      "epoch": 2.533550124978766,
+      "grad_norm": 0.11729501932859421,
+      "learning_rate": 1.245816379669963e-05,
+      "loss": 0.2173,
+      "step": 91350
+    },
+    {
+      "epoch": 2.53493685192183,
+      "grad_norm": 0.13084493577480316,
+      "learning_rate": 1.2385896787280072e-05,
+      "loss": 0.2149,
+      "step": 91400
+    },
+    {
+      "epoch": 2.5363235788648946,
+      "grad_norm": 0.12369387596845627,
+      "learning_rate": 1.2313826150316698e-05,
+      "loss": 0.2196,
+      "step": 91450
+    },
+    {
+      "epoch": 2.537710305807959,
+      "grad_norm": 0.13943591713905334,
+      "learning_rate": 1.2241952047343796e-05,
+      "loss": 0.22,
+      "step": 91500
+    },
+    {
+      "epoch": 2.5390970327510236,
+      "grad_norm": 0.13443367183208466,
+      "learning_rate": 1.2170274639455282e-05,
+      "loss": 0.2201,
+      "step": 91550
+    },
+    {
+      "epoch": 2.540483759694088,
+      "grad_norm": 0.12157223373651505,
+      "learning_rate": 1.2098794087304088e-05,
+      "loss": 0.2199,
+      "step": 91600
+    },
+    {
+      "epoch": 2.5418704866371526,
+      "grad_norm": 0.13608892261981964,
+      "learning_rate": 1.2027510551102084e-05,
+      "loss": 0.2219,
+      "step": 91650
+    },
+    {
+      "epoch": 2.543257213580217,
+      "grad_norm": 0.11314541846513748,
+      "learning_rate": 1.1956424190619408e-05,
+      "loss": 0.2161,
+      "step": 91700
+    },
+    {
+      "epoch": 2.5446439405232812,
+      "grad_norm": 0.12370922416448593,
+      "learning_rate": 1.188553516518437e-05,
+      "loss": 0.2159,
+      "step": 91750
+    },
+    {
+      "epoch": 2.5460306674663458,
+      "grad_norm": 0.1209016963839531,
+      "learning_rate": 1.1814843633682904e-05,
+      "loss": 0.2144,
+      "step": 91800
+    },
+    {
+      "epoch": 2.5474173944094103,
+      "grad_norm": 0.1325283944606781,
+      "learning_rate": 1.174434975455837e-05,
+      "loss": 0.2193,
+      "step": 91850
+    },
+    {
+      "epoch": 2.548804121352475,
+      "grad_norm": 0.15919071435928345,
+      "learning_rate": 1.1674053685811048e-05,
+      "loss": 0.2152,
+      "step": 91900
+    },
+    {
+      "epoch": 2.5501908482955393,
+      "grad_norm": 0.11343590915203094,
+      "learning_rate": 1.1603955584997916e-05,
+      "loss": 0.2174,
+      "step": 91950
+    },
+    {
+      "epoch": 2.551577575238604,
+      "grad_norm": 0.13005082309246063,
+      "learning_rate": 1.1534055609232219e-05,
+      "loss": 0.218,
+      "step": 92000
+    },
+    {
+      "epoch": 2.551577575238604,
+      "eval_loss": 0.21716812252998352,
+      "eval_runtime": 500.3626,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 92000
+    },
+    {
+      "epoch": 2.5529643021816684,
+      "grad_norm": 0.11071674525737762,
+      "learning_rate": 1.1464353915183146e-05,
+      "loss": 0.2193,
+      "step": 92050
+    },
+    {
+      "epoch": 2.5543510291247324,
+      "grad_norm": 0.13590101897716522,
+      "learning_rate": 1.1394850659075484e-05,
+      "loss": 0.2188,
+      "step": 92100
+    },
+    {
+      "epoch": 2.555737756067797,
+      "grad_norm": 0.13247303664684296,
+      "learning_rate": 1.1325545996689192e-05,
+      "loss": 0.217,
+      "step": 92150
+    },
+    {
+      "epoch": 2.5571244830108615,
+      "grad_norm": 0.1305960714817047,
+      "learning_rate": 1.1256440083359188e-05,
+      "loss": 0.2148,
+      "step": 92200
+    },
+    {
+      "epoch": 2.558511209953926,
+      "grad_norm": 0.13151520490646362,
+      "learning_rate": 1.1187533073974855e-05,
+      "loss": 0.2177,
+      "step": 92250
+    },
+    {
+      "epoch": 2.5598979368969905,
+      "grad_norm": 0.13488546013832092,
+      "learning_rate": 1.111882512297986e-05,
+      "loss": 0.2202,
+      "step": 92300
+    },
+    {
+      "epoch": 2.561284663840055,
+      "grad_norm": 0.1446569859981537,
+      "learning_rate": 1.1050316384371617e-05,
+      "loss": 0.2173,
+      "step": 92350
+    },
+    {
+      "epoch": 2.5626713907831196,
+      "grad_norm": 0.1253671795129776,
+      "learning_rate": 1.0982007011701101e-05,
+      "loss": 0.2166,
+      "step": 92400
+    },
+    {
+      "epoch": 2.5640581177261836,
+      "grad_norm": 0.12608706951141357,
+      "learning_rate": 1.0913897158072405e-05,
+      "loss": 0.2158,
+      "step": 92450
+    },
+    {
+      "epoch": 2.565444844669248,
+      "grad_norm": 0.12183616310358047,
+      "learning_rate": 1.0845986976142497e-05,
+      "loss": 0.2156,
+      "step": 92500
+    },
+    {
+      "epoch": 2.5668315716123127,
+      "grad_norm": 0.12636205554008484,
+      "learning_rate": 1.0778276618120708e-05,
+      "loss": 0.2159,
+      "step": 92550
+    },
+    {
+      "epoch": 2.568218298555377,
+      "grad_norm": 0.1183227151632309,
+      "learning_rate": 1.0710766235768588e-05,
+      "loss": 0.2125,
+      "step": 92600
+    },
+    {
+      "epoch": 2.5696050254984417,
+      "grad_norm": 0.12173448503017426,
+      "learning_rate": 1.0643455980399453e-05,
+      "loss": 0.2161,
+      "step": 92650
+    },
+    {
+      "epoch": 2.5709917524415062,
+      "grad_norm": 0.1476822942495346,
+      "learning_rate": 1.0576346002878023e-05,
+      "loss": 0.2192,
+      "step": 92700
+    },
+    {
+      "epoch": 2.5723784793845708,
+      "grad_norm": 0.11871439963579178,
+      "learning_rate": 1.0509436453620202e-05,
+      "loss": 0.2204,
+      "step": 92750
+    },
+    {
+      "epoch": 2.573765206327635,
+      "grad_norm": 0.15758539736270905,
+      "learning_rate": 1.0442727482592596e-05,
+      "loss": 0.2204,
+      "step": 92800
+    },
+    {
+      "epoch": 2.5751519332706994,
+      "grad_norm": 0.13511809706687927,
+      "learning_rate": 1.0376219239312279e-05,
+      "loss": 0.2153,
+      "step": 92850
+    },
+    {
+      "epoch": 2.576538660213764,
+      "grad_norm": 0.13542090356349945,
+      "learning_rate": 1.0309911872846455e-05,
+      "loss": 0.218,
+      "step": 92900
+    },
+    {
+      "epoch": 2.5779253871568284,
+      "grad_norm": 0.15104569494724274,
+      "learning_rate": 1.0243805531812067e-05,
+      "loss": 0.2207,
+      "step": 92950
+    },
+    {
+      "epoch": 2.579312114099893,
+      "grad_norm": 0.14258405566215515,
+      "learning_rate": 1.017790036437547e-05,
+      "loss": 0.2187,
+      "step": 93000
+    },
+    {
+      "epoch": 2.579312114099893,
+      "eval_loss": 0.2171117663383484,
+      "eval_runtime": 500.4014,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 93000
+    },
+    {
+      "epoch": 2.5806988410429574,
+      "grad_norm": 0.13689257204532623,
+      "learning_rate": 1.0112196518252159e-05,
+      "loss": 0.2162,
+      "step": 93050
+    },
+    {
+      "epoch": 2.582085567986022,
+      "grad_norm": 0.1326596736907959,
+      "learning_rate": 1.0046694140706415e-05,
+      "loss": 0.2179,
+      "step": 93100
+    },
+    {
+      "epoch": 2.583472294929086,
+      "grad_norm": 0.1485782414674759,
+      "learning_rate": 9.981393378550896e-06,
+      "loss": 0.2176,
+      "step": 93150
+    },
+    {
+      "epoch": 2.5848590218721506,
+      "grad_norm": 0.10767937451601028,
+      "learning_rate": 9.91629437814644e-06,
+      "loss": 0.2224,
+      "step": 93200
+    },
+    {
+      "epoch": 2.586245748815215,
+      "grad_norm": 0.12032176554203033,
+      "learning_rate": 9.851397285401597e-06,
+      "loss": 0.2171,
+      "step": 93250
+    },
+    {
+      "epoch": 2.5876324757582796,
+      "grad_norm": 0.13087831437587738,
+      "learning_rate": 9.786702245772484e-06,
+      "loss": 0.2161,
+      "step": 93300
+    },
+    {
+      "epoch": 2.589019202701344,
+      "grad_norm": 0.15828031301498413,
+      "learning_rate": 9.722209404262228e-06,
+      "loss": 0.2203,
+      "step": 93350
+    },
+    {
+      "epoch": 2.5904059296444086,
+      "grad_norm": 0.1374100148677826,
+      "learning_rate": 9.65791890542087e-06,
+      "loss": 0.222,
+      "step": 93400
+    },
+    {
+      "epoch": 2.591792656587473,
+      "grad_norm": 0.1500755399465561,
+      "learning_rate": 9.593830893344824e-06,
+      "loss": 0.2158,
+      "step": 93450
+    },
+    {
+      "epoch": 2.5931793835305372,
+      "grad_norm": 0.14486078917980194,
+      "learning_rate": 9.529945511676774e-06,
+      "loss": 0.2222,
+      "step": 93500
+    },
+    {
+      "epoch": 2.594566110473602,
+      "grad_norm": 0.14060664176940918,
+      "learning_rate": 9.466262903605138e-06,
+      "loss": 0.2129,
+      "step": 93550
+    },
+    {
+      "epoch": 2.5959528374166663,
+      "grad_norm": 0.15706004202365875,
+      "learning_rate": 9.40278321186394e-06,
+      "loss": 0.2192,
+      "step": 93600
+    },
+    {
+      "epoch": 2.597339564359731,
+      "grad_norm": 0.16333714127540588,
+      "learning_rate": 9.339506578732348e-06,
+      "loss": 0.2146,
+      "step": 93650
+    },
+    {
+      "epoch": 2.5987262913027953,
+      "grad_norm": 0.12438689172267914,
+      "learning_rate": 9.276433146034425e-06,
+      "loss": 0.2166,
+      "step": 93700
+    },
+    {
+      "epoch": 2.60011301824586,
+      "grad_norm": 0.1589004248380661,
+      "learning_rate": 9.213563055138807e-06,
+      "loss": 0.2181,
+      "step": 93750
+    },
+    {
+      "epoch": 2.6014997451889244,
+      "grad_norm": 0.11392497271299362,
+      "learning_rate": 9.150896446958324e-06,
+      "loss": 0.2142,
+      "step": 93800
+    },
+    {
+      "epoch": 2.6028864721319884,
+      "grad_norm": 0.13960012793540955,
+      "learning_rate": 9.088433461949809e-06,
+      "loss": 0.2166,
+      "step": 93850
+    },
+    {
+      "epoch": 2.6042731990750534,
+      "grad_norm": 0.13375328481197357,
+      "learning_rate": 9.027417426767926e-06,
+      "loss": 0.2122,
+      "step": 93900
+    },
+    {
+      "epoch": 2.6056599260181175,
+      "grad_norm": 0.13815660774707794,
+      "learning_rate": 8.96535802822891e-06,
+      "loss": 0.2178,
+      "step": 93950
+    },
+    {
+      "epoch": 2.607046652961182,
+      "grad_norm": 0.12065936625003815,
+      "learning_rate": 8.903502668715357e-06,
+      "loss": 0.2213,
+      "step": 94000
+    },
+    {
+      "epoch": 2.607046652961182,
+      "eval_loss": 0.21707016229629517,
+      "eval_runtime": 500.3557,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 94000
+    },
+    {
+      "epoch": 2.6084333799042465,
+      "grad_norm": 0.12994138896465302,
+      "learning_rate": 8.843082508667189e-06,
+      "loss": 0.218,
+      "step": 94050
+    },
+    {
+      "epoch": 2.609820106847311,
+      "grad_norm": 0.13512447476387024,
+      "learning_rate": 8.781631554994407e-06,
+      "loss": 0.2198,
+      "step": 94100
+    },
+    {
+      "epoch": 2.6112068337903755,
+      "grad_norm": 0.15621940791606903,
+      "learning_rate": 8.720385052139468e-06,
+      "loss": 0.2192,
+      "step": 94150
+    },
+    {
+      "epoch": 2.6125935607334396,
+      "grad_norm": 0.13967250287532806,
+      "learning_rate": 8.659343137376263e-06,
+      "loss": 0.2193,
+      "step": 94200
+    },
+    {
+      "epoch": 2.6139802876765046,
+      "grad_norm": 0.1347551941871643,
+      "learning_rate": 8.598505947520063e-06,
+      "loss": 0.2199,
+      "step": 94250
+    },
+    {
+      "epoch": 2.6153670146195687,
+      "grad_norm": 0.1365870088338852,
+      "learning_rate": 8.53787361892735e-06,
+      "loss": 0.2209,
+      "step": 94300
+    },
+    {
+      "epoch": 2.616753741562633,
+      "grad_norm": 0.12225038558244705,
+      "learning_rate": 8.477446287495371e-06,
+      "loss": 0.2139,
+      "step": 94350
+    },
+    {
+      "epoch": 2.6181404685056977,
+      "grad_norm": 0.14266717433929443,
+      "learning_rate": 8.417224088662012e-06,
+      "loss": 0.2165,
+      "step": 94400
+    },
+    {
+      "epoch": 2.6195271954487622,
+      "grad_norm": 0.1277306079864502,
+      "learning_rate": 8.357207157405277e-06,
+      "loss": 0.2168,
+      "step": 94450
+    },
+    {
+      "epoch": 2.6209139223918267,
+      "grad_norm": 0.1608654409646988,
+      "learning_rate": 8.2973956282432e-06,
+      "loss": 0.2172,
+      "step": 94500
+    },
+    {
+      "epoch": 2.622300649334891,
+      "grad_norm": 0.12185334414243698,
+      "learning_rate": 8.237789635233317e-06,
+      "loss": 0.2178,
+      "step": 94550
+    },
+    {
+      "epoch": 2.623687376277956,
+      "grad_norm": 0.1386982798576355,
+      "learning_rate": 8.178389311972612e-06,
+      "loss": 0.2169,
+      "step": 94600
+    },
+    {
+      "epoch": 2.62507410322102,
+      "grad_norm": 0.1339062601327896,
+      "learning_rate": 8.119194791597006e-06,
+      "loss": 0.2166,
+      "step": 94650
+    },
+    {
+      "epoch": 2.6264608301640844,
+      "grad_norm": 0.15275806188583374,
+      "learning_rate": 8.060206206781206e-06,
+      "loss": 0.2182,
+      "step": 94700
+    },
+    {
+      "epoch": 2.627847557107149,
+      "grad_norm": 0.1413796991109848,
+      "learning_rate": 8.001423689738308e-06,
+      "loss": 0.2204,
+      "step": 94750
+    },
+    {
+      "epoch": 2.6292342840502134,
+      "grad_norm": 0.1281316876411438,
+      "learning_rate": 7.942847372219564e-06,
+      "loss": 0.2189,
+      "step": 94800
+    },
+    {
+      "epoch": 2.630621010993278,
+      "grad_norm": 0.14383311569690704,
+      "learning_rate": 7.884477385514089e-06,
+      "loss": 0.2203,
+      "step": 94850
+    },
+    {
+      "epoch": 2.6320077379363425,
+      "grad_norm": 0.1202847883105278,
+      "learning_rate": 7.826313860448454e-06,
+      "loss": 0.2153,
+      "step": 94900
+    },
+    {
+      "epoch": 2.633394464879407,
+      "grad_norm": 0.12072198837995529,
+      "learning_rate": 7.768356927386589e-06,
+      "loss": 0.216,
+      "step": 94950
+    },
+    {
+      "epoch": 2.634781191822471,
+      "grad_norm": 0.12037204205989838,
+      "learning_rate": 7.710606716229285e-06,
+      "loss": 0.2228,
+      "step": 95000
+    },
+    {
+      "epoch": 2.634781191822471,
+      "eval_loss": 0.21694940328598022,
+      "eval_runtime": 500.561,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 95000
+    },
+    {
+      "epoch": 2.6361679187655356,
+      "grad_norm": 0.12889185547828674,
+      "learning_rate": 7.653063356414081e-06,
+      "loss": 0.2203,
+      "step": 95050
+    },
+    {
+      "epoch": 2.6375546457086,
+      "grad_norm": 0.1323080062866211,
+      "learning_rate": 7.59572697691483e-06,
+      "loss": 0.2183,
+      "step": 95100
+    },
+    {
+      "epoch": 2.6389413726516646,
+      "grad_norm": 0.12650491297245026,
+      "learning_rate": 7.538597706241513e-06,
+      "loss": 0.2143,
+      "step": 95150
+    },
+    {
+      "epoch": 2.640328099594729,
+      "grad_norm": 0.15389423072338104,
+      "learning_rate": 7.481675672439903e-06,
+      "loss": 0.2158,
+      "step": 95200
+    },
+    {
+      "epoch": 2.6417148265377937,
+      "grad_norm": 0.14535321295261383,
+      "learning_rate": 7.424961003091291e-06,
+      "loss": 0.2153,
+      "step": 95250
+    },
+    {
+      "epoch": 2.643101553480858,
+      "grad_norm": 0.1973002701997757,
+      "learning_rate": 7.368453825312161e-06,
+      "loss": 0.2192,
+      "step": 95300
+    },
+    {
+      "epoch": 2.6444882804239223,
+      "grad_norm": 0.1256517767906189,
+      "learning_rate": 7.312154265753978e-06,
+      "loss": 0.2209,
+      "step": 95350
+    },
+    {
+      "epoch": 2.6458750073669868,
+      "grad_norm": 0.13010665774345398,
+      "learning_rate": 7.256062450602863e-06,
+      "loss": 0.2207,
+      "step": 95400
+    },
+    {
+      "epoch": 2.6472617343100513,
+      "grad_norm": 0.11461709439754486,
+      "learning_rate": 7.200178505579269e-06,
+      "loss": 0.2164,
+      "step": 95450
+    },
+    {
+      "epoch": 2.648648461253116,
+      "grad_norm": 0.14307774603366852,
+      "learning_rate": 7.144502555937815e-06,
+      "loss": 0.2221,
+      "step": 95500
+    },
+    {
+      "epoch": 2.6500351881961803,
+      "grad_norm": 0.14916153252124786,
+      "learning_rate": 7.0890347264668255e-06,
+      "loss": 0.2202,
+      "step": 95550
+    },
+    {
+      "epoch": 2.651421915139245,
+      "grad_norm": 0.10386321693658829,
+      "learning_rate": 7.033775141488308e-06,
+      "loss": 0.2192,
+      "step": 95600
+    },
+    {
+      "epoch": 2.6528086420823094,
+      "grad_norm": 0.1536647379398346,
+      "learning_rate": 6.9787239248573885e-06,
+      "loss": 0.2157,
+      "step": 95650
+    },
+    {
+      "epoch": 2.6541953690253735,
+      "grad_norm": 0.12528762221336365,
+      "learning_rate": 6.9238811999622565e-06,
+      "loss": 0.2197,
+      "step": 95700
+    },
+    {
+      "epoch": 2.655582095968438,
+      "grad_norm": 0.13676822185516357,
+      "learning_rate": 6.869247089723729e-06,
+      "loss": 0.2214,
+      "step": 95750
+    },
+    {
+      "epoch": 2.6569688229115025,
+      "grad_norm": 0.13648496568202972,
+      "learning_rate": 6.81482171659511e-06,
+      "loss": 0.2178,
+      "step": 95800
+    },
+    {
+      "epoch": 2.658355549854567,
+      "grad_norm": 0.14560841023921967,
+      "learning_rate": 6.760605202561832e-06,
+      "loss": 0.2193,
+      "step": 95850
+    },
+    {
+      "epoch": 2.6597422767976315,
+      "grad_norm": 0.14555421471595764,
+      "learning_rate": 6.7065976691411904e-06,
+      "loss": 0.2193,
+      "step": 95900
+    },
+    {
+      "epoch": 2.661129003740696,
+      "grad_norm": 0.14283110201358795,
+      "learning_rate": 6.652799237382112e-06,
+      "loss": 0.2191,
+      "step": 95950
+    },
+    {
+      "epoch": 2.6625157306837606,
+      "grad_norm": 0.19822004437446594,
+      "learning_rate": 6.599210027864833e-06,
+      "loss": 0.2208,
+      "step": 96000
+    },
+    {
+      "epoch": 2.6625157306837606,
+      "eval_loss": 0.21687686443328857,
+      "eval_runtime": 500.1004,
+      "eval_samples_per_second": 5.713,
+      "eval_steps_per_second": 5.713,
+      "step": 96000
+    },
+    {
+      "epoch": 2.6639024576268246,
+      "grad_norm": 0.1323961615562439,
+      "learning_rate": 6.545830160700695e-06,
+      "loss": 0.2157,
+      "step": 96050
+    },
+    {
+      "epoch": 2.665289184569889,
+      "grad_norm": 0.1361609548330307,
+      "learning_rate": 6.492659755531749e-06,
+      "loss": 0.2166,
+      "step": 96100
+    },
+    {
+      "epoch": 2.6666759115129537,
+      "grad_norm": 0.1284680813550949,
+      "learning_rate": 6.439698931530669e-06,
+      "loss": 0.2152,
+      "step": 96150
+    },
+    {
+      "epoch": 2.668062638456018,
+      "grad_norm": 0.12301739305257797,
+      "learning_rate": 6.386947807400323e-06,
+      "loss": 0.217,
+      "step": 96200
+    },
+    {
+      "epoch": 2.6694493653990827,
+      "grad_norm": 0.1513117402791977,
+      "learning_rate": 6.33440650137358e-06,
+      "loss": 0.2157,
+      "step": 96250
+    },
+    {
+      "epoch": 2.6708360923421472,
+      "grad_norm": 0.15935802459716797,
+      "learning_rate": 6.282075131213083e-06,
+      "loss": 0.2193,
+      "step": 96300
+    },
+    {
+      "epoch": 2.6722228192852118,
+      "grad_norm": 0.1329813152551651,
+      "learning_rate": 6.229953814210865e-06,
+      "loss": 0.2126,
+      "step": 96350
+    },
+    {
+      "epoch": 2.673609546228276,
+      "grad_norm": 0.14577654004096985,
+      "learning_rate": 6.178042667188222e-06,
+      "loss": 0.2135,
+      "step": 96400
+    },
+    {
+      "epoch": 2.6749962731713404,
+      "grad_norm": 0.12831629812717438,
+      "learning_rate": 6.126341806495361e-06,
+      "loss": 0.2185,
+      "step": 96450
+    },
+    {
+      "epoch": 2.676383000114405,
+      "grad_norm": 0.13574399054050446,
+      "learning_rate": 6.074851348011179e-06,
+      "loss": 0.2171,
+      "step": 96500
+    },
+    {
+      "epoch": 2.6777697270574694,
+      "grad_norm": 0.15019351243972778,
+      "learning_rate": 6.023571407142969e-06,
+      "loss": 0.2188,
+      "step": 96550
+    },
+    {
+      "epoch": 2.679156454000534,
+      "grad_norm": 0.11595098674297333,
+      "learning_rate": 5.972502098826216e-06,
+      "loss": 0.2193,
+      "step": 96600
+    },
+    {
+      "epoch": 2.6805431809435984,
+      "grad_norm": 0.11979762464761734,
+      "learning_rate": 5.9216435375242685e-06,
+      "loss": 0.2215,
+      "step": 96650
+    },
+    {
+      "epoch": 2.681929907886663,
+      "grad_norm": 0.13104109466075897,
+      "learning_rate": 5.870995837228166e-06,
+      "loss": 0.2163,
+      "step": 96700
+    },
+    {
+      "epoch": 2.683316634829727,
+      "grad_norm": 0.15309420228004456,
+      "learning_rate": 5.820559111456292e-06,
+      "loss": 0.2147,
+      "step": 96750
+    },
+    {
+      "epoch": 2.6847033617727916,
+      "grad_norm": 0.13580960035324097,
+      "learning_rate": 5.7703334732541855e-06,
+      "loss": 0.2169,
+      "step": 96800
+    },
+    {
+      "epoch": 2.686090088715856,
+      "grad_norm": 0.14888140559196472,
+      "learning_rate": 5.720319035194299e-06,
+      "loss": 0.2193,
+      "step": 96850
+    },
+    {
+      "epoch": 2.6874768156589206,
+      "grad_norm": 0.1494724303483963,
+      "learning_rate": 5.670515909375651e-06,
+      "loss": 0.2186,
+      "step": 96900
+    },
+    {
+      "epoch": 2.688863542601985,
+      "grad_norm": 0.11105342954397202,
+      "learning_rate": 5.6209242074237165e-06,
+      "loss": 0.2171,
+      "step": 96950
+    },
+    {
+      "epoch": 2.6902502695450496,
+      "grad_norm": 0.1481621414422989,
+      "learning_rate": 5.5715440404900175e-06,
+      "loss": 0.2189,
+      "step": 97000
+    },
+    {
+      "epoch": 2.6902502695450496,
+      "eval_loss": 0.21687200665473938,
+      "eval_runtime": 500.2659,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 97000
+    },
+    {
+      "epoch": 2.691636996488114,
+      "grad_norm": 0.1304822862148285,
+      "learning_rate": 5.522375519252021e-06,
+      "loss": 0.2215,
+      "step": 97050
+    },
+    {
+      "epoch": 2.6930237234311782,
+      "grad_norm": 0.12556232511997223,
+      "learning_rate": 5.474395813301237e-06,
+      "loss": 0.2177,
+      "step": 97100
+    },
+    {
+      "epoch": 2.694410450374243,
+      "grad_norm": 0.10873960703611374,
+      "learning_rate": 5.42564667520441e-06,
+      "loss": 0.2182,
+      "step": 97150
+    },
+    {
+      "epoch": 2.6957971773173073,
+      "grad_norm": 0.1206154152750969,
+      "learning_rate": 5.377109509807965e-06,
+      "loss": 0.2183,
+      "step": 97200
+    },
+    {
+      "epoch": 2.697183904260372,
+      "grad_norm": 0.12211555987596512,
+      "learning_rate": 5.3287844258999135e-06,
+      "loss": 0.2175,
+      "step": 97250
+    },
+    {
+      "epoch": 2.6985706312034363,
+      "grad_norm": 0.13117828965187073,
+      "learning_rate": 5.28067153179288e-06,
+      "loss": 0.2208,
+      "step": 97300
+    },
+    {
+      "epoch": 2.699957358146501,
+      "grad_norm": 0.12565045058727264,
+      "learning_rate": 5.23277093532395e-06,
+      "loss": 0.2154,
+      "step": 97350
+    },
+    {
+      "epoch": 2.7013440850895654,
+      "grad_norm": 0.132746160030365,
+      "learning_rate": 5.1850827438543305e-06,
+      "loss": 0.2196,
+      "step": 97400
+    },
+    {
+      "epoch": 2.7027308120326294,
+      "grad_norm": 0.12424889206886292,
+      "learning_rate": 5.1376070642691896e-06,
+      "loss": 0.2143,
+      "step": 97450
+    },
+    {
+      "epoch": 2.7041175389756944,
+      "grad_norm": 0.13726244866847992,
+      "learning_rate": 5.0903440029773985e-06,
+      "loss": 0.2176,
+      "step": 97500
+    },
+    {
+      "epoch": 2.7055042659187585,
+      "grad_norm": 0.13994988799095154,
+      "learning_rate": 5.043293665911219e-06,
+      "loss": 0.217,
+      "step": 97550
+    },
+    {
+      "epoch": 2.706890992861823,
+      "grad_norm": 0.13315385580062866,
+      "learning_rate": 4.996456158526197e-06,
+      "loss": 0.2136,
+      "step": 97600
+    },
+    {
+      "epoch": 2.7082777198048875,
+      "grad_norm": 0.12573137879371643,
+      "learning_rate": 4.949831585800779e-06,
+      "loss": 0.2172,
+      "step": 97650
+    },
+    {
+      "epoch": 2.709664446747952,
+      "grad_norm": 0.12147074192762375,
+      "learning_rate": 4.903420052236252e-06,
+      "loss": 0.2215,
+      "step": 97700
+    },
+    {
+      "epoch": 2.7110511736910166,
+      "grad_norm": 0.12491251528263092,
+      "learning_rate": 4.857221661856304e-06,
+      "loss": 0.2188,
+      "step": 97750
+    },
+    {
+      "epoch": 2.7124379006340806,
+      "grad_norm": 0.11851975321769714,
+      "learning_rate": 4.8112365182070075e-06,
+      "loss": 0.218,
+      "step": 97800
+    },
+    {
+      "epoch": 2.7138246275771456,
+      "grad_norm": 0.14051896333694458,
+      "learning_rate": 4.765464724356383e-06,
+      "loss": 0.2167,
+      "step": 97850
+    },
+    {
+      "epoch": 2.7152113545202097,
+      "grad_norm": 0.14575433731079102,
+      "learning_rate": 4.719906382894324e-06,
+      "loss": 0.2172,
+      "step": 97900
+    },
+    {
+      "epoch": 2.716598081463274,
+      "grad_norm": 0.11300890892744064,
+      "learning_rate": 4.674561595932259e-06,
+      "loss": 0.2206,
+      "step": 97950
+    },
+    {
+      "epoch": 2.7179848084063387,
+      "grad_norm": 0.13367551565170288,
+      "learning_rate": 4.629430465103002e-06,
+      "loss": 0.2165,
+      "step": 98000
+    },
+    {
+      "epoch": 2.7179848084063387,
+      "eval_loss": 0.21681541204452515,
+      "eval_runtime": 500.7329,
+      "eval_samples_per_second": 5.706,
+      "eval_steps_per_second": 5.706,
+      "step": 98000
+    },
+    {
+      "epoch": 2.7193715353494032,
+      "grad_norm": 0.13205870985984802,
+      "learning_rate": 4.5845130915605165e-06,
+      "loss": 0.2165,
+      "step": 98050
+    },
+    {
+      "epoch": 2.7207582622924678,
+      "grad_norm": 0.13241606950759888,
+      "learning_rate": 4.539809575979581e-06,
+      "loss": 0.2193,
+      "step": 98100
+    },
+    {
+      "epoch": 2.722144989235532,
+      "grad_norm": 0.15105368196964264,
+      "learning_rate": 4.495320018555738e-06,
+      "loss": 0.2168,
+      "step": 98150
+    },
+    {
+      "epoch": 2.723531716178597,
+      "grad_norm": 0.16313302516937256,
+      "learning_rate": 4.451044519004921e-06,
+      "loss": 0.217,
+      "step": 98200
+    },
+    {
+      "epoch": 2.724918443121661,
+      "grad_norm": 0.13042916357517242,
+      "learning_rate": 4.406983176563329e-06,
+      "loss": 0.2197,
+      "step": 98250
+    },
+    {
+      "epoch": 2.7263051700647254,
+      "grad_norm": 0.11967332661151886,
+      "learning_rate": 4.363136089987096e-06,
+      "loss": 0.2216,
+      "step": 98300
+    },
+    {
+      "epoch": 2.72769189700779,
+      "grad_norm": 0.13527044653892517,
+      "learning_rate": 4.319503357552235e-06,
+      "loss": 0.2187,
+      "step": 98350
+    },
+    {
+      "epoch": 2.7290786239508544,
+      "grad_norm": 0.13896431028842926,
+      "learning_rate": 4.276085077054226e-06,
+      "loss": 0.218,
+      "step": 98400
+    },
+    {
+      "epoch": 2.730465350893919,
+      "grad_norm": 0.13394545018672943,
+      "learning_rate": 4.2328813458079374e-06,
+      "loss": 0.2211,
+      "step": 98450
+    },
+    {
+      "epoch": 2.7318520778369835,
+      "grad_norm": 0.1567400097846985,
+      "learning_rate": 4.189892260647388e-06,
+      "loss": 0.2192,
+      "step": 98500
+    },
+    {
+      "epoch": 2.733238804780048,
+      "grad_norm": 0.13998396694660187,
+      "learning_rate": 4.147117917925425e-06,
+      "loss": 0.2173,
+      "step": 98550
+    },
+    {
+      "epoch": 2.734625531723112,
+      "grad_norm": 0.11015547066926956,
+      "learning_rate": 4.104558413513649e-06,
+      "loss": 0.2178,
+      "step": 98600
+    },
+    {
+      "epoch": 2.7360122586661766,
+      "grad_norm": 0.1378924697637558,
+      "learning_rate": 4.062213842802121e-06,
+      "loss": 0.2193,
+      "step": 98650
+    },
+    {
+      "epoch": 2.737398985609241,
+      "grad_norm": 0.13427558541297913,
+      "learning_rate": 4.020084300699178e-06,
+      "loss": 0.2165,
+      "step": 98700
+    },
+    {
+      "epoch": 2.7387857125523056,
+      "grad_norm": 0.11761970818042755,
+      "learning_rate": 3.978169881631166e-06,
+      "loss": 0.2171,
+      "step": 98750
+    },
+    {
+      "epoch": 2.74017243949537,
+      "grad_norm": 0.12466447800397873,
+      "learning_rate": 3.936470679542292e-06,
+      "loss": 0.2211,
+      "step": 98800
+    },
+    {
+      "epoch": 2.7415591664384347,
+      "grad_norm": 0.15706753730773926,
+      "learning_rate": 3.894986787894394e-06,
+      "loss": 0.219,
+      "step": 98850
+    },
+    {
+      "epoch": 2.742945893381499,
+      "grad_norm": 0.14471718668937683,
+      "learning_rate": 3.853718299666742e-06,
+      "loss": 0.2158,
+      "step": 98900
+    },
+    {
+      "epoch": 2.7443326203245633,
+      "grad_norm": 0.12557634711265564,
+      "learning_rate": 3.812665307355745e-06,
+      "loss": 0.2208,
+      "step": 98950
+    },
+    {
+      "epoch": 2.745719347267628,
+      "grad_norm": 0.12567783892154694,
+      "learning_rate": 3.7718279029749225e-06,
+      "loss": 0.2175,
+      "step": 99000
+    },
+    {
+      "epoch": 2.745719347267628,
+      "eval_loss": 0.21679456532001495,
+      "eval_runtime": 500.082,
+      "eval_samples_per_second": 5.713,
+      "eval_steps_per_second": 5.713,
+      "step": 99000
+    },
+    {
+      "epoch": 2.7471060742106923,
+      "grad_norm": 0.13429369032382965,
+      "learning_rate": 3.731206178054503e-06,
+      "loss": 0.2174,
+      "step": 99050
+    },
+    {
+      "epoch": 2.748492801153757,
+      "grad_norm": 0.13248606026172638,
+      "learning_rate": 3.690800223641322e-06,
+      "loss": 0.2192,
+      "step": 99100
+    },
+    {
+      "epoch": 2.7498795280968213,
+      "grad_norm": 0.13042642176151276,
+      "learning_rate": 3.6506101302986595e-06,
+      "loss": 0.2166,
+      "step": 99150
+    },
+    {
+      "epoch": 2.751266255039886,
+      "grad_norm": 0.15077584981918335,
+      "learning_rate": 3.6106359881058815e-06,
+      "loss": 0.2152,
+      "step": 99200
+    },
+    {
+      "epoch": 2.7526529819829504,
+      "grad_norm": 0.13205015659332275,
+      "learning_rate": 3.570877886658419e-06,
+      "loss": 0.2196,
+      "step": 99250
+    },
+    {
+      "epoch": 2.7540397089260145,
+      "grad_norm": 0.14192979037761688,
+      "learning_rate": 3.531335915067424e-06,
+      "loss": 0.2182,
+      "step": 99300
+    },
+    {
+      "epoch": 2.755426435869079,
+      "grad_norm": 0.13009020686149597,
+      "learning_rate": 3.49201016195968e-06,
+      "loss": 0.2194,
+      "step": 99350
+    },
+    {
+      "epoch": 2.7568131628121435,
+      "grad_norm": 0.11861055344343185,
+      "learning_rate": 3.4529007154773142e-06,
+      "loss": 0.2181,
+      "step": 99400
+    },
+    {
+      "epoch": 2.758199889755208,
+      "grad_norm": 0.15699726343154907,
+      "learning_rate": 3.414007663277674e-06,
+      "loss": 0.2175,
+      "step": 99450
+    },
+    {
+      "epoch": 2.7595866166982725,
+      "grad_norm": 0.12084462493658066,
+      "learning_rate": 3.3753310925330516e-06,
+      "loss": 0.2188,
+      "step": 99500
+    },
+    {
+      "epoch": 2.760973343641337,
+      "grad_norm": 0.14574265480041504,
+      "learning_rate": 3.336871089930571e-06,
+      "loss": 0.2213,
+      "step": 99550
+    },
+    {
+      "epoch": 2.7623600705844016,
+      "grad_norm": 0.1578470766544342,
+      "learning_rate": 3.2986277416719227e-06,
+      "loss": 0.2185,
+      "step": 99600
+    },
+    {
+      "epoch": 2.7637467975274657,
+      "grad_norm": 0.1389865130186081,
+      "learning_rate": 3.2606011334732178e-06,
+      "loss": 0.2196,
+      "step": 99650
+    },
+    {
+      "epoch": 2.76513352447053,
+      "grad_norm": 0.12240829318761826,
+      "learning_rate": 3.222791350564802e-06,
+      "loss": 0.2209,
+      "step": 99700
+    },
+    {
+      "epoch": 2.7665202514135947,
+      "grad_norm": 0.13817352056503296,
+      "learning_rate": 3.1851984776909984e-06,
+      "loss": 0.2158,
+      "step": 99750
+    },
+    {
+      "epoch": 2.767906978356659,
+      "grad_norm": 0.16704636812210083,
+      "learning_rate": 3.1478225991099954e-06,
+      "loss": 0.2181,
+      "step": 99800
+    },
+    {
+      "epoch": 2.7692937052997237,
+      "grad_norm": 0.12111228704452515,
+      "learning_rate": 3.110663798593616e-06,
+      "loss": 0.2161,
+      "step": 99850
+    },
+    {
+      "epoch": 2.7706804322427883,
+      "grad_norm": 0.10894995182752609,
+      "learning_rate": 3.0737221594271616e-06,
+      "loss": 0.2138,
+      "step": 99900
+    },
+    {
+      "epoch": 2.772067159185853,
+      "grad_norm": 0.13423964381217957,
+      "learning_rate": 3.036997764409133e-06,
+      "loss": 0.2189,
+      "step": 99950
+    },
+    {
+      "epoch": 2.773453886128917,
+      "grad_norm": 0.17966866493225098,
+      "learning_rate": 3.000490695851188e-06,
+      "loss": 0.2156,
+      "step": 100000
+    },
+    {
+      "epoch": 2.773453886128917,
+      "eval_loss": 0.216772198677063,
+      "eval_runtime": 500.5232,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 100000
+    },
+    {
+      "epoch": 2.7748406130719814,
+      "grad_norm": 0.12200125306844711,
+      "learning_rate": 2.9642010355778403e-06,
+      "loss": 0.2163,
+      "step": 100050
+    },
+    {
+      "epoch": 2.776227340015046,
+      "grad_norm": 0.14171266555786133,
+      "learning_rate": 2.9281288649263496e-06,
+      "loss": 0.2199,
+      "step": 100100
+    },
+    {
+      "epoch": 2.7776140669581104,
+      "grad_norm": 0.14463447034358978,
+      "learning_rate": 2.8922742647464974e-06,
+      "loss": 0.2167,
+      "step": 100150
+    },
+    {
+      "epoch": 2.779000793901175,
+      "grad_norm": 0.14498113095760345,
+      "learning_rate": 2.8566373154003788e-06,
+      "loss": 0.2215,
+      "step": 100200
+    },
+    {
+      "epoch": 2.7803875208442395,
+      "grad_norm": 0.14089851081371307,
+      "learning_rate": 2.821218096762346e-06,
+      "loss": 0.2179,
+      "step": 100250
+    },
+    {
+      "epoch": 2.781774247787304,
+      "grad_norm": 0.1408817619085312,
+      "learning_rate": 2.786016688218651e-06,
+      "loss": 0.2221,
+      "step": 100300
+    },
+    {
+      "epoch": 2.783160974730368,
+      "grad_norm": 0.1360626369714737,
+      "learning_rate": 2.7510331686674383e-06,
+      "loss": 0.2169,
+      "step": 100350
+    },
+    {
+      "epoch": 2.7845477016734326,
+      "grad_norm": 0.12629085779190063,
+      "learning_rate": 2.7162676165184197e-06,
+      "loss": 0.2169,
+      "step": 100400
+    },
+    {
+      "epoch": 2.785934428616497,
+      "grad_norm": 0.14268292486667633,
+      "learning_rate": 2.68172010969282e-06,
+      "loss": 0.2142,
+      "step": 100450
+    },
+    {
+      "epoch": 2.7873211555595616,
+      "grad_norm": 0.1308322250843048,
+      "learning_rate": 2.6473907256231333e-06,
+      "loss": 0.2178,
+      "step": 100500
+    },
+    {
+      "epoch": 2.788707882502626,
+      "grad_norm": 0.13275696337223053,
+      "learning_rate": 2.6132795412529777e-06,
+      "loss": 0.2191,
+      "step": 100550
+    },
+    {
+      "epoch": 2.7900946094456907,
+      "grad_norm": 0.14593924582004547,
+      "learning_rate": 2.5793866330368954e-06,
+      "loss": 0.2164,
+      "step": 100600
+    },
+    {
+      "epoch": 2.791481336388755,
+      "grad_norm": 0.13292020559310913,
+      "learning_rate": 2.5457120769402208e-06,
+      "loss": 0.2161,
+      "step": 100650
+    },
+    {
+      "epoch": 2.7928680633318193,
+      "grad_norm": 0.13708774745464325,
+      "learning_rate": 2.5122559484388685e-06,
+      "loss": 0.2139,
+      "step": 100700
+    },
+    {
+      "epoch": 2.794254790274884,
+      "grad_norm": 0.1346159726381302,
+      "learning_rate": 2.479018322519189e-06,
+      "loss": 0.2186,
+      "step": 100750
+    },
+    {
+      "epoch": 2.7956415172179483,
+      "grad_norm": 0.1296459138393402,
+      "learning_rate": 2.4459992736778125e-06,
+      "loss": 0.2153,
+      "step": 100800
+    },
+    {
+      "epoch": 2.797028244161013,
+      "grad_norm": 0.15982100367546082,
+      "learning_rate": 2.413198875921441e-06,
+      "loss": 0.2162,
+      "step": 100850
+    },
+    {
+      "epoch": 2.7984149711040773,
+      "grad_norm": 0.1282055675983429,
+      "learning_rate": 2.3806172027667216e-06,
+      "loss": 0.2142,
+      "step": 100900
+    },
+    {
+      "epoch": 2.799801698047142,
+      "grad_norm": 0.12306920439004898,
+      "learning_rate": 2.3482543272400403e-06,
+      "loss": 0.2176,
+      "step": 100950
+    },
+    {
+      "epoch": 2.8011884249902064,
+      "grad_norm": 0.11848998814821243,
+      "learning_rate": 2.3161103218774404e-06,
+      "loss": 0.218,
+      "step": 101000
+    },
+    {
+      "epoch": 2.8011884249902064,
+      "eval_loss": 0.21674667298793793,
+      "eval_runtime": 500.3702,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 101000
+    },
+    {
+      "epoch": 2.8025751519332704,
+      "grad_norm": 0.1424969583749771,
+      "learning_rate": 2.284185258724336e-06,
+      "loss": 0.2143,
+      "step": 101050
+    },
+    {
+      "epoch": 2.8039618788763354,
+      "grad_norm": 0.1375187486410141,
+      "learning_rate": 2.2524792093354897e-06,
+      "loss": 0.2186,
+      "step": 101100
+    },
+    {
+      "epoch": 2.8053486058193995,
+      "grad_norm": 0.13417352735996246,
+      "learning_rate": 2.220992244774711e-06,
+      "loss": 0.2152,
+      "step": 101150
+    },
+    {
+      "epoch": 2.806735332762464,
+      "grad_norm": 0.149616077542305,
+      "learning_rate": 2.189724435614815e-06,
+      "loss": 0.2155,
+      "step": 101200
+    },
+    {
+      "epoch": 2.8081220597055285,
+      "grad_norm": 0.12678956985473633,
+      "learning_rate": 2.1586758519373973e-06,
+      "loss": 0.2163,
+      "step": 101250
+    },
+    {
+      "epoch": 2.809508786648593,
+      "grad_norm": 0.12688349187374115,
+      "learning_rate": 2.127846563332703e-06,
+      "loss": 0.219,
+      "step": 101300
+    },
+    {
+      "epoch": 2.8108955135916576,
+      "grad_norm": 0.12490526586771011,
+      "learning_rate": 2.097846687174676e-06,
+      "loss": 0.2185,
+      "step": 101350
+    },
+    {
+      "epoch": 2.8122822405347216,
+      "grad_norm": 0.11337302625179291,
+      "learning_rate": 2.0674518061951975e-06,
+      "loss": 0.2178,
+      "step": 101400
+    },
+    {
+      "epoch": 2.8136689674777866,
+      "grad_norm": 0.14391370117664337,
+      "learning_rate": 2.037276424751977e-06,
+      "loss": 0.2172,
+      "step": 101450
+    },
+    {
+      "epoch": 2.8150556944208507,
+      "grad_norm": 0.15447823703289032,
+      "learning_rate": 2.007320610478136e-06,
+      "loss": 0.2175,
+      "step": 101500
+    },
+    {
+      "epoch": 2.816442421363915,
+      "grad_norm": 0.1505586951971054,
+      "learning_rate": 1.977584430514623e-06,
+      "loss": 0.2214,
+      "step": 101550
+    },
+    {
+      "epoch": 2.8178291483069797,
+      "grad_norm": 0.1483069509267807,
+      "learning_rate": 1.9480679515101797e-06,
+      "loss": 0.2172,
+      "step": 101600
+    },
+    {
+      "epoch": 2.8192158752500442,
+      "grad_norm": 0.11997207999229431,
+      "learning_rate": 1.9187712396210756e-06,
+      "loss": 0.221,
+      "step": 101650
+    },
+    {
+      "epoch": 2.8206026021931088,
+      "grad_norm": 0.1311604231595993,
+      "learning_rate": 1.8896943605110185e-06,
+      "loss": 0.2171,
+      "step": 101700
+    },
+    {
+      "epoch": 2.821989329136173,
+      "grad_norm": 0.12765543162822723,
+      "learning_rate": 1.8608373793510102e-06,
+      "loss": 0.2168,
+      "step": 101750
+    },
+    {
+      "epoch": 2.823376056079238,
+      "grad_norm": 0.17365878820419312,
+      "learning_rate": 1.8322003608191696e-06,
+      "loss": 0.2174,
+      "step": 101800
+    },
+    {
+      "epoch": 2.824762783022302,
+      "grad_norm": 0.11939753592014313,
+      "learning_rate": 1.8037833691006312e-06,
+      "loss": 0.2167,
+      "step": 101850
+    },
+    {
+      "epoch": 2.8261495099653664,
+      "grad_norm": 0.14094018936157227,
+      "learning_rate": 1.7755864678873468e-06,
+      "loss": 0.2171,
+      "step": 101900
+    },
+    {
+      "epoch": 2.827536236908431,
+      "grad_norm": 0.1488160490989685,
+      "learning_rate": 1.7476097203779852e-06,
+      "loss": 0.2189,
+      "step": 101950
+    },
+    {
+      "epoch": 2.8289229638514954,
+      "grad_norm": 0.12787774205207825,
+      "learning_rate": 1.719853189277787e-06,
+      "loss": 0.2199,
+      "step": 102000
+    },
+    {
+      "epoch": 2.8289229638514954,
+      "eval_loss": 0.21673431992530823,
+      "eval_runtime": 500.5835,
+      "eval_samples_per_second": 5.707,
+      "eval_steps_per_second": 5.707,
+      "step": 102000
+    },
+    {
+      "epoch": 2.83030969079456,
+      "grad_norm": 0.12656597793102264,
+      "learning_rate": 1.6923169367983994e-06,
+      "loss": 0.2173,
+      "step": 102050
+    },
+    {
+      "epoch": 2.8316964177376245,
+      "grad_norm": 0.12270744144916534,
+      "learning_rate": 1.6650010246577751e-06,
+      "loss": 0.2189,
+      "step": 102100
+    },
+    {
+      "epoch": 2.833083144680689,
+      "grad_norm": 0.13541977107524872,
+      "learning_rate": 1.6379055140799626e-06,
+      "loss": 0.2195,
+      "step": 102150
+    },
+    {
+      "epoch": 2.834469871623753,
+      "grad_norm": 0.17381368577480316,
+      "learning_rate": 1.6110304657950715e-06,
+      "loss": 0.2198,
+      "step": 102200
+    },
+    {
+      "epoch": 2.8358565985668176,
+      "grad_norm": 0.11857634037733078,
+      "learning_rate": 1.584375940039029e-06,
+      "loss": 0.2178,
+      "step": 102250
+    },
+    {
+      "epoch": 2.837243325509882,
+      "grad_norm": 0.13569362461566925,
+      "learning_rate": 1.557941996553558e-06,
+      "loss": 0.2151,
+      "step": 102300
+    },
+    {
+      "epoch": 2.8386300524529466,
+      "grad_norm": 0.1297358274459839,
+      "learning_rate": 1.5317286945859433e-06,
+      "loss": 0.2149,
+      "step": 102350
+    },
+    {
+      "epoch": 2.840016779396011,
+      "grad_norm": 0.14771656692028046,
+      "learning_rate": 1.505736092888932e-06,
+      "loss": 0.2152,
+      "step": 102400
+    },
+    {
+      "epoch": 2.8414035063390757,
+      "grad_norm": 0.11793384701013565,
+      "learning_rate": 1.4799642497206334e-06,
+      "loss": 0.2184,
+      "step": 102450
+    },
+    {
+      "epoch": 2.84279023328214,
+      "grad_norm": 0.12428101897239685,
+      "learning_rate": 1.454413222844353e-06,
+      "loss": 0.2151,
+      "step": 102500
+    },
+    {
+      "epoch": 2.8441769602252043,
+      "grad_norm": 0.14781787991523743,
+      "learning_rate": 1.4290830695284807e-06,
+      "loss": 0.2156,
+      "step": 102550
+    },
+    {
+      "epoch": 2.845563687168269,
+      "grad_norm": 0.1396929919719696,
+      "learning_rate": 1.4039738465463136e-06,
+      "loss": 0.2198,
+      "step": 102600
+    },
+    {
+      "epoch": 2.8469504141113333,
+      "grad_norm": 0.14160382747650146,
+      "learning_rate": 1.3790856101760452e-06,
+      "loss": 0.2195,
+      "step": 102650
+    },
+    {
+      "epoch": 2.848337141054398,
+      "grad_norm": 0.1488640010356903,
+      "learning_rate": 1.354418416200498e-06,
+      "loss": 0.2191,
+      "step": 102700
+    },
+    {
+      "epoch": 2.8497238679974624,
+      "grad_norm": 0.17428374290466309,
+      "learning_rate": 1.3299723199070802e-06,
+      "loss": 0.217,
+      "step": 102750
+    },
+    {
+      "epoch": 2.851110594940527,
+      "grad_norm": 0.1595117300748825,
+      "learning_rate": 1.3057473760876848e-06,
+      "loss": 0.2167,
+      "step": 102800
+    },
+    {
+      "epoch": 2.8524973218835914,
+      "grad_norm": 0.14677169919013977,
+      "learning_rate": 1.2817436390384796e-06,
+      "loss": 0.217,
+      "step": 102850
+    },
+    {
+      "epoch": 2.8538840488266555,
+      "grad_norm": 0.13991901278495789,
+      "learning_rate": 1.2579611625598509e-06,
+      "loss": 0.217,
+      "step": 102900
+    },
+    {
+      "epoch": 2.85527077576972,
+      "grad_norm": 0.13375407457351685,
+      "learning_rate": 1.2343999999562817e-06,
+      "loss": 0.2193,
+      "step": 102950
+    },
+    {
+      "epoch": 2.8566575027127845,
+      "grad_norm": 0.1192784383893013,
+      "learning_rate": 1.2110602040361963e-06,
+      "loss": 0.2198,
+      "step": 103000
+    },
+    {
+      "epoch": 2.8566575027127845,
+      "eval_loss": 0.21671663224697113,
+      "eval_runtime": 500.3054,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 103000
+    },
+    {
+      "epoch": 2.858044229655849,
+      "grad_norm": 0.13127942383289337,
+      "learning_rate": 1.1879418271118603e-06,
+      "loss": 0.2189,
+      "step": 103050
+    },
+    {
+      "epoch": 2.8594309565989136,
+      "grad_norm": 0.130979984998703,
+      "learning_rate": 1.165044920999292e-06,
+      "loss": 0.2188,
+      "step": 103100
+    },
+    {
+      "epoch": 2.860817683541978,
+      "grad_norm": 0.13157400488853455,
+      "learning_rate": 1.1423695370180954e-06,
+      "loss": 0.2198,
+      "step": 103150
+    },
+    {
+      "epoch": 2.8622044104850426,
+      "grad_norm": 0.1267186552286148,
+      "learning_rate": 1.1199157259913606e-06,
+      "loss": 0.2161,
+      "step": 103200
+    },
+    {
+      "epoch": 2.8635911374281067,
+      "grad_norm": 0.11663077026605606,
+      "learning_rate": 1.0976835382455975e-06,
+      "loss": 0.2161,
+      "step": 103250
+    },
+    {
+      "epoch": 2.864977864371171,
+      "grad_norm": 0.1856304109096527,
+      "learning_rate": 1.0756730236105572e-06,
+      "loss": 0.2205,
+      "step": 103300
+    },
+    {
+      "epoch": 2.8663645913142357,
+      "grad_norm": 0.11844471096992493,
+      "learning_rate": 1.0538842314191444e-06,
+      "loss": 0.2181,
+      "step": 103350
+    },
+    {
+      "epoch": 2.8677513182573002,
+      "grad_norm": 0.12963257730007172,
+      "learning_rate": 1.0323172105073164e-06,
+      "loss": 0.2195,
+      "step": 103400
+    },
+    {
+      "epoch": 2.8691380452003648,
+      "grad_norm": null,
+      "learning_rate": 1.0113967390973257e-06,
+      "loss": 0.2195,
+      "step": 103450
+    },
+    {
+      "epoch": 2.8705247721434293,
+      "grad_norm": 0.15201549232006073,
+      "learning_rate": 9.90268967449348e-07,
+      "loss": 0.2255,
+      "step": 103500
+    },
+    {
+      "epoch": 2.871911499086494,
+      "grad_norm": 0.15500032901763916,
+      "learning_rate": 9.69363109664001e-07,
+      "loss": 0.2233,
+      "step": 103550
+    },
+    {
+      "epoch": 2.873298226029558,
+      "grad_norm": 0.12956801056861877,
+      "learning_rate": 9.486792125983024e-07,
+      "loss": 0.2187,
+      "step": 103600
+    },
+    {
+      "epoch": 2.8746849529726224,
+      "grad_norm": 0.12632031738758087,
+      "learning_rate": 9.282173226117574e-07,
+      "loss": 0.2189,
+      "step": 103650
+    },
+    {
+      "epoch": 2.876071679915687,
+      "grad_norm": 0.12151964753866196,
+      "learning_rate": 9.079774855663026e-07,
+      "loss": 0.2189,
+      "step": 103700
+    },
+    {
+      "epoch": 2.8774584068587514,
+      "grad_norm": 0.1441679447889328,
+      "learning_rate": 8.879597468261502e-07,
+      "loss": 0.2161,
+      "step": 103750
+    },
+    {
+      "epoch": 2.878845133801816,
+      "grad_norm": 0.14770221710205078,
+      "learning_rate": 8.681641512577665e-07,
+      "loss": 0.2201,
+      "step": 103800
+    },
+    {
+      "epoch": 2.8802318607448805,
+      "grad_norm": 0.14278040826320648,
+      "learning_rate": 8.485907432296714e-07,
+      "loss": 0.2181,
+      "step": 103850
+    },
+    {
+      "epoch": 2.881618587687945,
+      "grad_norm": 0.12169021368026733,
+      "learning_rate": 8.292395666124053e-07,
+      "loss": 0.2158,
+      "step": 103900
+    },
+    {
+      "epoch": 2.883005314631009,
+      "grad_norm": 0.1436878740787506,
+      "learning_rate": 8.101106647784295e-07,
+      "loss": 0.2195,
+      "step": 103950
+    },
+    {
+      "epoch": 2.884392041574074,
+      "grad_norm": 0.15572527050971985,
+      "learning_rate": 7.912040806019816e-07,
+      "loss": 0.2187,
+      "step": 104000
+    },
+    {
+      "epoch": 2.884392041574074,
+      "eval_loss": 0.21670910716056824,
+      "eval_runtime": 500.2765,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 104000
+    },
+    {
+      "epoch": 2.885778768517138,
+      "grad_norm": 0.15793730318546295,
+      "learning_rate": 7.725198564590419e-07,
+      "loss": 0.2154,
+      "step": 104050
+    },
+    {
+      "epoch": 2.8871654954602026,
+      "grad_norm": 0.12704993784427643,
+      "learning_rate": 7.540580342272007e-07,
+      "loss": 0.2175,
+      "step": 104100
+    },
+    {
+      "epoch": 2.888552222403267,
+      "grad_norm": 0.1465180665254593,
+      "learning_rate": 7.358186552855362e-07,
+      "loss": 0.2154,
+      "step": 104150
+    },
+    {
+      "epoch": 2.8899389493463317,
+      "grad_norm": 0.13564006984233856,
+      "learning_rate": 7.178017605146137e-07,
+      "loss": 0.2194,
+      "step": 104200
+    },
+    {
+      "epoch": 2.891325676289396,
+      "grad_norm": 0.12637755274772644,
+      "learning_rate": 7.000073902962978e-07,
+      "loss": 0.2167,
+      "step": 104250
+    },
+    {
+      "epoch": 2.8927124032324603,
+      "grad_norm": 0.1291232705116272,
+      "learning_rate": 6.824355845137298e-07,
+      "loss": 0.2167,
+      "step": 104300
+    },
+    {
+      "epoch": 2.8940991301755252,
+      "grad_norm": 0.12603652477264404,
+      "learning_rate": 6.650863825511611e-07,
+      "loss": 0.2153,
+      "step": 104350
+    },
+    {
+      "epoch": 2.8954858571185893,
+      "grad_norm": 0.11706072837114334,
+      "learning_rate": 6.479598232939754e-07,
+      "loss": 0.2164,
+      "step": 104400
+    },
+    {
+      "epoch": 2.896872584061654,
+      "grad_norm": 0.1363930106163025,
+      "learning_rate": 6.310559451284892e-07,
+      "loss": 0.2175,
+      "step": 104450
+    },
+    {
+      "epoch": 2.8982593110047183,
+      "grad_norm": 0.12287319451570511,
+      "learning_rate": 6.143747859419513e-07,
+      "loss": 0.2166,
+      "step": 104500
+    },
+    {
+      "epoch": 2.899646037947783,
+      "grad_norm": 0.1426548957824707,
+      "learning_rate": 5.979163831223989e-07,
+      "loss": 0.2192,
+      "step": 104550
+    },
+    {
+      "epoch": 2.9010327648908474,
+      "grad_norm": 0.13957655429840088,
+      "learning_rate": 5.816807735586127e-07,
+      "loss": 0.2205,
+      "step": 104600
+    },
+    {
+      "epoch": 2.9024194918339115,
+      "grad_norm": 0.11439050734043121,
+      "learning_rate": 5.656679936400178e-07,
+      "loss": 0.2222,
+      "step": 104650
+    },
+    {
+      "epoch": 2.9038062187769764,
+      "grad_norm": 0.11388220638036728,
+      "learning_rate": 5.498780792565938e-07,
+      "loss": 0.2183,
+      "step": 104700
+    },
+    {
+      "epoch": 2.9051929457200405,
+      "grad_norm": 0.13717804849147797,
+      "learning_rate": 5.343110657988093e-07,
+      "loss": 0.2211,
+      "step": 104750
+    },
+    {
+      "epoch": 2.906579672663105,
+      "grad_norm": 0.1320059895515442,
+      "learning_rate": 5.189669881575432e-07,
+      "loss": 0.2221,
+      "step": 104800
+    },
+    {
+      "epoch": 2.9079663996061695,
+      "grad_norm": 0.16280671954154968,
+      "learning_rate": 5.041461175445905e-07,
+      "loss": 0.2166,
+      "step": 104850
+    },
+    {
+      "epoch": 2.909353126549234,
+      "grad_norm": 0.1369704157114029,
+      "learning_rate": 4.892435537993234e-07,
+      "loss": 0.2219,
+      "step": 104900
+    },
+    {
+      "epoch": 2.9107398534922986,
+      "grad_norm": 0.15099848806858063,
+      "learning_rate": 4.7456402688191845e-07,
+      "loss": 0.2212,
+      "step": 104950
+    },
+    {
+      "epoch": 2.9121265804353627,
+      "grad_norm": 0.14291705191135406,
+      "learning_rate": 4.601075696940793e-07,
+      "loss": 0.2164,
+      "step": 105000
+    },
+    {
+      "epoch": 2.9121265804353627,
+      "eval_loss": 0.21670198440551758,
+      "eval_runtime": 500.5524,
+      "eval_samples_per_second": 5.708,
+      "eval_steps_per_second": 5.708,
+      "step": 105000
+    },
+    {
+      "epoch": 2.9135133073784276,
+      "grad_norm": 0.1311856508255005,
+      "learning_rate": 4.4587421463757604e-07,
+      "loss": 0.224,
+      "step": 105050
+    },
+    {
+      "epoch": 2.9149000343214917,
+      "grad_norm": 0.161887988448143,
+      "learning_rate": 4.3186399361409003e-07,
+      "loss": 0.2213,
+      "step": 105100
+    },
+    {
+      "epoch": 2.916286761264556,
+      "grad_norm": 0.11059953272342682,
+      "learning_rate": 4.1807693802521364e-07,
+      "loss": 0.2192,
+      "step": 105150
+    },
+    {
+      "epoch": 2.9176734882076207,
+      "grad_norm": 0.12950967252254486,
+      "learning_rate": 4.0451307877233947e-07,
+      "loss": 0.217,
+      "step": 105200
+    },
+    {
+      "epoch": 2.9190602151506853,
+      "grad_norm": 0.16520391404628754,
+      "learning_rate": 3.9117244625660467e-07,
+      "loss": 0.2232,
+      "step": 105250
+    },
+    {
+      "epoch": 2.9204469420937498,
+      "grad_norm": 0.12426646798849106,
+      "learning_rate": 3.780550703788355e-07,
+      "loss": 0.2138,
+      "step": 105300
+    },
+    {
+      "epoch": 2.9218336690368143,
+      "grad_norm": 0.13209545612335205,
+      "learning_rate": 3.651609805394252e-07,
+      "loss": 0.2167,
+      "step": 105350
+    },
+    {
+      "epoch": 2.923220395979879,
+      "grad_norm": 0.13010719418525696,
+      "learning_rate": 3.52490205638345e-07,
+      "loss": 0.2172,
+      "step": 105400
+    },
+    {
+      "epoch": 2.924607122922943,
+      "grad_norm": 0.15640610456466675,
+      "learning_rate": 3.4004277407502226e-07,
+      "loss": 0.2241,
+      "step": 105450
+    },
+    {
+      "epoch": 2.9259938498660074,
+      "grad_norm": 0.14542129635810852,
+      "learning_rate": 3.2781871374832907e-07,
+      "loss": 0.2229,
+      "step": 105500
+    },
+    {
+      "epoch": 2.927380576809072,
+      "grad_norm": 0.18153244256973267,
+      "learning_rate": 3.158180520564491e-07,
+      "loss": 0.2188,
+      "step": 105550
+    },
+    {
+      "epoch": 2.9287673037521365,
+      "grad_norm": 0.10919786989688873,
+      "learning_rate": 3.040408158968777e-07,
+      "loss": 0.2179,
+      "step": 105600
+    },
+    {
+      "epoch": 2.930154030695201,
+      "grad_norm": 0.14857454597949982,
+      "learning_rate": 2.9248703166633305e-07,
+      "loss": 0.2178,
+      "step": 105650
+    },
+    {
+      "epoch": 2.9315407576382655,
+      "grad_norm": 0.1264246106147766,
+      "learning_rate": 2.8115672526068947e-07,
+      "loss": 0.2194,
+      "step": 105700
+    },
+    {
+      "epoch": 2.93292748458133,
+      "grad_norm": 0.1416803002357483,
+      "learning_rate": 2.700499220749664e-07,
+      "loss": 0.217,
+      "step": 105750
+    },
+    {
+      "epoch": 2.934314211524394,
+      "grad_norm": 0.12332191318273544,
+      "learning_rate": 2.5916664700320615e-07,
+      "loss": 0.2174,
+      "step": 105800
+    },
+    {
+      "epoch": 2.9357009384674586,
+      "grad_norm": 0.14106211066246033,
+      "learning_rate": 2.4850692443847413e-07,
+      "loss": 0.2226,
+      "step": 105850
+    },
+    {
+      "epoch": 2.937087665410523,
+      "grad_norm": 0.11803478747606277,
+      "learning_rate": 2.380707782727476e-07,
+      "loss": 0.2179,
+      "step": 105900
+    },
+    {
+      "epoch": 2.9384743923535876,
+      "grad_norm": 0.15033772587776184,
+      "learning_rate": 2.278582318969269e-07,
+      "loss": 0.2241,
+      "step": 105950
+    },
+    {
+      "epoch": 2.939861119296652,
+      "grad_norm": 0.1364557147026062,
+      "learning_rate": 2.178693082007355e-07,
+      "loss": 0.2161,
+      "step": 106000
+    },
+    {
+      "epoch": 2.939861119296652,
+      "eval_loss": 0.21669968962669373,
+      "eval_runtime": 500.2368,
+      "eval_samples_per_second": 5.711,
+      "eval_steps_per_second": 5.711,
+      "step": 106000
+    },
+    {
+      "epoch": 2.9412478462397167,
+      "grad_norm": 0.1393050253391266,
+      "learning_rate": 2.081040295726866e-07,
+      "loss": 0.2236,
+      "step": 106050
+    },
+    {
+      "epoch": 2.942634573182781,
+      "grad_norm": 0.1260671466588974,
+      "learning_rate": 1.9856241790003892e-07,
+      "loss": 0.2192,
+      "step": 106100
+    },
+    {
+      "epoch": 2.9440213001258453,
+      "grad_norm": 0.1376418024301529,
+      "learning_rate": 1.8924449456870773e-07,
+      "loss": 0.2175,
+      "step": 106150
+    },
+    {
+      "epoch": 2.94540802706891,
+      "grad_norm": 0.1473878175020218,
+      "learning_rate": 1.8015028046328707e-07,
+      "loss": 0.2202,
+      "step": 106200
+    },
+    {
+      "epoch": 2.9467947540119743,
+      "grad_norm": 0.12721039354801178,
+      "learning_rate": 1.7127979596694987e-07,
+      "loss": 0.2177,
+      "step": 106250
+    },
+    {
+      "epoch": 2.948181480955039,
+      "grad_norm": 0.13388779759407043,
+      "learning_rate": 1.626330609613924e-07,
+      "loss": 0.2177,
+      "step": 106300
+    },
+    {
+      "epoch": 2.9495682078981034,
+      "grad_norm": 0.1170138493180275,
+      "learning_rate": 1.5421009482686766e-07,
+      "loss": 0.2192,
+      "step": 106350
+    },
+    {
+      "epoch": 2.950954934841168,
+      "grad_norm": 0.13245512545108795,
+      "learning_rate": 1.460109164420187e-07,
+      "loss": 0.2164,
+      "step": 106400
+    },
+    {
+      "epoch": 2.9523416617842324,
+      "grad_norm": 0.11573006212711334,
+      "learning_rate": 1.3803554418396758e-07,
+      "loss": 0.2164,
+      "step": 106450
+    },
+    {
+      "epoch": 2.9537283887272965,
+      "grad_norm": 0.1261189877986908,
+      "learning_rate": 1.3028399592818208e-07,
+      "loss": 0.2161,
+      "step": 106500
+    },
+    {
+      "epoch": 2.955115115670361,
+      "grad_norm": 0.1416768878698349,
+      "learning_rate": 1.227562890484535e-07,
+      "loss": 0.2183,
+      "step": 106550
+    },
+    {
+      "epoch": 2.9565018426134255,
+      "grad_norm": 0.1263783723115921,
+      "learning_rate": 1.1545244041690773e-07,
+      "loss": 0.2154,
+      "step": 106600
+    },
+    {
+      "epoch": 2.95788856955649,
+      "grad_norm": 0.11391854286193848,
+      "learning_rate": 1.0837246640389432e-07,
+      "loss": 0.2137,
+      "step": 106650
+    },
+    {
+      "epoch": 2.9592752964995546,
+      "grad_norm": 0.12810936570167542,
+      "learning_rate": 1.0151638287799747e-07,
+      "loss": 0.2192,
+      "step": 106700
+    },
+    {
+      "epoch": 2.960662023442619,
+      "grad_norm": 0.11619652062654495,
+      "learning_rate": 9.488420520600283e-08,
+      "loss": 0.2171,
+      "step": 106750
+    },
+    {
+      "epoch": 2.9620487503856836,
+      "grad_norm": 0.1461794227361679,
+      "learning_rate": 8.847594825281968e-08,
+      "loss": 0.2173,
+      "step": 106800
+    },
+    {
+      "epoch": 2.9634354773287477,
+      "grad_norm": 0.13875767588615417,
+      "learning_rate": 8.229162638150323e-08,
+      "loss": 0.2177,
+      "step": 106850
+    },
+    {
+      "epoch": 2.964822204271812,
+      "grad_norm": 0.1234852597117424,
+      "learning_rate": 7.633125345317682e-08,
+      "loss": 0.216,
+      "step": 106900
+    },
+    {
+      "epoch": 2.9662089312148767,
+      "grad_norm": 0.12513309717178345,
+      "learning_rate": 7.0594842827032e-08,
+      "loss": 0.2188,
+      "step": 106950
+    },
+    {
+      "epoch": 2.9675956581579412,
+      "grad_norm": 0.1280387043952942,
+      "learning_rate": 6.508240736027294e-08,
+      "loss": 0.2176,
+      "step": 107000
+    },
+    {
+      "epoch": 2.9675956581579412,
+      "eval_loss": 0.2166997194290161,
+      "eval_runtime": 500.3572,
+      "eval_samples_per_second": 5.71,
+      "eval_steps_per_second": 5.71,
+      "step": 107000
+    },
+    {
+      "epoch": 2.9689823851010058,
+      "grad_norm": 0.14746126532554626,
+      "learning_rate": 5.97939594081054e-08,
+      "loss": 0.2189,
+      "step": 107050
+    },
+    {
+      "epoch": 2.9703691120440703,
+      "grad_norm": 0.13869601488113403,
+      "learning_rate": 5.472951082371447e-08,
+      "loss": 0.2153,
+      "step": 107100
+    },
+    {
+      "epoch": 2.971755838987135,
+      "grad_norm": 0.1652906984090805,
+      "learning_rate": 4.9889072958220203e-08,
+      "loss": 0.2179,
+      "step": 107150
+    },
+    {
+      "epoch": 2.973142565930199,
+      "grad_norm": 0.15118339657783508,
+      "learning_rate": 4.5272656660655385e-08,
+      "loss": 0.2175,
+      "step": 107200
+    },
+    {
+      "epoch": 2.9745292928732634,
+      "grad_norm": 0.12483932077884674,
+      "learning_rate": 4.088027227795444e-08,
+      "loss": 0.2159,
+      "step": 107250
+    },
+    {
+      "epoch": 2.975916019816328,
+      "grad_norm": 0.12288288027048111,
+      "learning_rate": 3.6711929654920096e-08,
+      "loss": 0.2176,
+      "step": 107300
+    },
+    {
+      "epoch": 2.9773027467593924,
+      "grad_norm": 0.14908748865127563,
+      "learning_rate": 3.2767638134190146e-08,
+      "loss": 0.2165,
+      "step": 107350
+    },
+    {
+      "epoch": 2.978689473702457,
+      "grad_norm": 0.1312543898820877,
+      "learning_rate": 2.904740655623739e-08,
+      "loss": 0.2121,
+      "step": 107400
+    },
+    {
+      "epoch": 2.9800762006455215,
+      "grad_norm": 0.13569709658622742,
+      "learning_rate": 2.5551243259358537e-08,
+      "loss": 0.2172,
+      "step": 107450
+    },
+    {
+      "epoch": 2.981462927588586,
+      "grad_norm": 0.1605551540851593,
+      "learning_rate": 2.227915607960762e-08,
+      "loss": 0.2165,
+      "step": 107500
+    },
+    {
+      "epoch": 2.98284965453165,
+      "grad_norm": 0.14773668348789215,
+      "learning_rate": 1.9231152350829285e-08,
+      "loss": 0.2155,
+      "step": 107550
+    },
+    {
+      "epoch": 2.984236381474715,
+      "grad_norm": 0.12570437788963318,
+      "learning_rate": 1.6407238904625476e-08,
+      "loss": 0.2206,
+      "step": 107600
+    },
+    {
+      "epoch": 2.985623108417779,
+      "grad_norm": 0.15974928438663483,
+      "learning_rate": 1.380742207031105e-08,
+      "loss": 0.2159,
+      "step": 107650
+    },
+    {
+      "epoch": 2.9870098353608436,
+      "grad_norm": 0.14371749758720398,
+      "learning_rate": 1.1431707674958158e-08,
+      "loss": 0.2214,
+      "step": 107700
+    },
+    {
+      "epoch": 2.988396562303908,
+      "grad_norm": 0.1482096165418625,
+      "learning_rate": 9.28010104334076e-09,
+      "loss": 0.217,
+      "step": 107750
+    },
+    {
+      "epoch": 2.9897832892469727,
+      "grad_norm": 0.14239954948425293,
+      "learning_rate": 7.3526069979013015e-09,
+      "loss": 0.2184,
+      "step": 107800
+    },
+    {
+      "epoch": 2.991170016190037,
+      "grad_norm": 0.16931667923927307,
+      "learning_rate": 5.649229858828431e-09,
+      "loss": 0.2217,
+      "step": 107850
+    },
+    {
+      "epoch": 2.9925567431331013,
+      "grad_norm": 0.11450044065713882,
+      "learning_rate": 4.169973443945985e-09,
+      "loss": 0.2111,
+      "step": 107900
+    },
+    {
+      "epoch": 2.9939434700761662,
+      "grad_norm": 0.13328483700752258,
+      "learning_rate": 2.91484106875739e-09,
+      "loss": 0.2185,
+      "step": 107950
+    },
+    {
+      "epoch": 2.9953301970192303,
+      "grad_norm": 0.15810194611549377,
+      "learning_rate": 1.8838355464345647e-09,
+      "loss": 0.2183,
+      "step": 108000
+    },
+    {
+      "epoch": 2.9953301970192303,
+      "eval_loss": 0.2166995406150818,
+      "eval_runtime": 500.4679,
+      "eval_samples_per_second": 5.709,
+      "eval_steps_per_second": 5.709,
+      "step": 108000
+    },
+    {
+      "epoch": 2.996716923962295,
+      "grad_norm": 0.16397058963775635,
+      "learning_rate": 1.0769591878068142e-09,
+      "loss": 0.2181,
+      "step": 108050
+    },
+    {
+      "epoch": 2.9981036509053594,
+      "grad_norm": 0.1313973069190979,
+      "learning_rate": 4.942138013608322e-10,
+      "loss": 0.2171,
+      "step": 108100
+    },
+    {
+      "epoch": 2.999490377848424,
+      "grad_norm": 0.13321100175380707,
+      "learning_rate": 1.3560069320739388e-10,
+      "loss": 0.2169,
+      "step": 108150
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 108168,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9496046202925875e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}