diff --git "a/last_to_hit_frequency_1001/checkpoint-70000/trainer_state.json" "b/last_to_hit_frequency_1001/checkpoint-70000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last_to_hit_frequency_1001/checkpoint-70000/trainer_state.json"
@@ -0,0 +1,10473 @@
+{
+  "best_global_step": 65000,
+  "best_metric": 3.5310399532318115,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_1001/checkpoint-40000",
+  "epoch": 20.390351899324166,
+  "eval_steps": 1000,
+  "global_step": 70000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 0.9732208251953125,
+      "learning_rate": 0.000294,
+      "loss": 8.429,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.9100438356399536,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7354,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.792998194694519,
+      "learning_rate": 0.0005998286213931798,
+      "loss": 6.3577,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.47094255685806274,
+      "learning_rate": 0.0005996537452637714,
+      "loss": 6.1522,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.5149641633033752,
+      "learning_rate": 0.0005994788691343632,
+      "loss": 6.0111,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.4672093391418457,
+      "learning_rate": 0.0005993039930049548,
+      "loss": 5.8874,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.47426706552505493,
+      "learning_rate": 0.0005991291168755465,
+      "loss": 5.7519,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.447353333234787,
+      "learning_rate": 0.0005989542407461382,
+      "loss": 5.6271,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.48666468262672424,
+      "learning_rate": 0.0005987793646167297,
+      "loss": 5.5233,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.4567582607269287,
+      "learning_rate": 0.0005986044884873214,
+      "loss": 5.4158,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.42497554421424866,
+      "learning_rate": 0.0005984296123579131,
+      "loss": 5.3416,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.5244524478912354,
+      "learning_rate": 0.0005982547362285047,
+      "loss": 5.2606,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.43386486172676086,
+      "learning_rate": 0.0005980798600990964,
+      "loss": 5.2099,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.4284716248512268,
+      "learning_rate": 0.0005979049839696881,
+      "loss": 5.141,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.7061876058578491,
+      "learning_rate": 0.0005977301078402798,
+      "loss": 5.0658,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.38007256388664246,
+      "learning_rate": 0.0005975552317108715,
+      "loss": 5.0179,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4920196533203125,
+      "learning_rate": 0.0005973803555814631,
+      "loss": 4.9658,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.41616320610046387,
+      "learning_rate": 0.0005972054794520547,
+      "loss": 4.9388,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.45925629138946533,
+      "learning_rate": 0.0005970306033226464,
+      "loss": 4.8788,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.43381017446517944,
+      "learning_rate": 0.0005968557271932381,
+      "loss": 4.8292,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.25263935092136414,
+      "eval_loss": 4.762474536895752,
+      "eval_runtime": 180.7317,
+      "eval_samples_per_second": 92.098,
+      "eval_steps_per_second": 5.76,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.6765235662460327,
+      "learning_rate": 0.0005966808510638297,
+      "loss": 4.7894,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.4921708405017853,
+      "learning_rate": 0.0005965059749344214,
+      "loss": 4.7573,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.42672333121299744,
+      "learning_rate": 0.0005963310988050131,
+      "loss": 4.7064,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.44009578227996826,
+      "learning_rate": 0.0005961562226756047,
+      "loss": 4.6818,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.45628976821899414,
+      "learning_rate": 0.0005959813465461965,
+      "loss": 4.64,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.4491446912288666,
+      "learning_rate": 0.000595806470416788,
+      "loss": 4.6006,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.4491466283798218,
+      "learning_rate": 0.0005956315942873797,
+      "loss": 4.5835,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.4405411183834076,
+      "learning_rate": 0.0005954567181579714,
+      "loss": 4.5493,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.45015254616737366,
+      "learning_rate": 0.000595281842028563,
+      "loss": 4.5427,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.4123348593711853,
+      "learning_rate": 0.0005951069658991547,
+      "loss": 4.5047,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.4243234395980835,
+      "learning_rate": 0.0005949320897697464,
+      "loss": 4.4801,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.4439113140106201,
+      "learning_rate": 0.0005947572136403381,
+      "loss": 4.476,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.38935843110084534,
+      "learning_rate": 0.0005945823375109297,
+      "loss": 4.4492,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.4441373348236084,
+      "learning_rate": 0.0005944074613815215,
+      "loss": 4.4366,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.4098525643348694,
+      "learning_rate": 0.000594232585252113,
+      "loss": 4.4203,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.5222740173339844,
+      "learning_rate": 0.0005940577091227047,
+      "loss": 4.3938,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.42251360416412354,
+      "learning_rate": 0.0005938828329932964,
+      "loss": 4.3761,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.4452977180480957,
+      "learning_rate": 0.000593707956863888,
+      "loss": 4.3494,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.3798687160015106,
+      "learning_rate": 0.0005935330807344797,
+      "loss": 4.347,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.39384129643440247,
+      "learning_rate": 0.0005933582046050714,
+      "loss": 4.343,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.30013608689339194,
+      "eval_loss": 4.277587890625,
+      "eval_runtime": 180.1511,
+      "eval_samples_per_second": 92.395,
+      "eval_steps_per_second": 5.778,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.3870452344417572,
+      "learning_rate": 0.000593183328475663,
+      "loss": 4.3225,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.3716549277305603,
+      "learning_rate": 0.0005930084523462546,
+      "loss": 4.3048,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.3902703523635864,
+      "learning_rate": 0.0005928335762168463,
+      "loss": 4.2988,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.4183005690574646,
+      "learning_rate": 0.000592658700087438,
+      "loss": 4.2755,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.40300023555755615,
+      "learning_rate": 0.0005924838239580297,
+      "loss": 4.2868,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.3854156732559204,
+      "learning_rate": 0.0005923089478286214,
+      "loss": 4.2729,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.38318294286727905,
+      "learning_rate": 0.000592134071699213,
+      "loss": 4.2674,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.3910939395427704,
+      "learning_rate": 0.0005919591955698047,
+      "loss": 4.2482,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.380487322807312,
+      "learning_rate": 0.0005917843194403964,
+      "loss": 4.2344,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.3697279095649719,
+      "learning_rate": 0.000591609443310988,
+      "loss": 4.231,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.3669740557670593,
+      "learning_rate": 0.0005914345671815796,
+      "loss": 4.221,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.4364064931869507,
+      "learning_rate": 0.0005912596910521713,
+      "loss": 4.2146,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.36517658829689026,
+      "learning_rate": 0.0005910848149227629,
+      "loss": 4.1886,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3450356423854828,
+      "learning_rate": 0.0005909099387933547,
+      "loss": 4.1916,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.38316479325294495,
+      "learning_rate": 0.0005907350626639463,
+      "loss": 4.1823,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.39378035068511963,
+      "learning_rate": 0.000590560186534538,
+      "loss": 4.1847,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.36056962609291077,
+      "learning_rate": 0.0005903853104051297,
+      "loss": 4.1647,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.3993781507015228,
+      "learning_rate": 0.0005902104342757214,
+      "loss": 4.1662,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.3729277551174164,
+      "learning_rate": 0.000590035558146313,
+      "loss": 4.1351,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.3638637065887451,
+      "learning_rate": 0.0005898606820169046,
+      "loss": 4.1519,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.31632108041824236,
+      "eval_loss": 4.091220855712891,
+      "eval_runtime": 180.2511,
+      "eval_samples_per_second": 92.343,
+      "eval_steps_per_second": 5.775,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.3821454346179962,
+      "learning_rate": 0.0005896858058874963,
+      "loss": 4.1453,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.35036420822143555,
+      "learning_rate": 0.0005895109297580879,
+      "loss": 4.1283,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.3927556276321411,
+      "learning_rate": 0.0005893360536286797,
+      "loss": 4.1126,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.36200571060180664,
+      "learning_rate": 0.0005891611774992713,
+      "loss": 4.1101,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.39992815256118774,
+      "learning_rate": 0.000588986301369863,
+      "loss": 4.095,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.3586205840110779,
+      "learning_rate": 0.0005888114252404547,
+      "loss": 4.1105,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.35303160548210144,
+      "learning_rate": 0.0005886365491110463,
+      "loss": 4.0899,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.401599258184433,
+      "learning_rate": 0.000588461672981638,
+      "loss": 4.0786,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.3272145986557007,
+      "learning_rate": 0.0005882867968522296,
+      "loss": 4.0585,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.38398465514183044,
+      "learning_rate": 0.0005881119207228212,
+      "loss": 4.0137,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.35750123858451843,
+      "learning_rate": 0.0005879370445934129,
+      "loss": 4.0097,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.3321663737297058,
+      "learning_rate": 0.0005877621684640046,
+      "loss": 4.005,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3508462607860565,
+      "learning_rate": 0.0005875872923345963,
+      "loss": 4.0109,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.3782167434692383,
+      "learning_rate": 0.000587412416205188,
+      "loss": 3.998,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.33389317989349365,
+      "learning_rate": 0.0005872375400757797,
+      "loss": 4.0072,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.33711186051368713,
+      "learning_rate": 0.0005870626639463713,
+      "loss": 3.9995,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.38124826550483704,
+      "learning_rate": 0.0005868877878169629,
+      "loss": 3.9936,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.37443825602531433,
+      "learning_rate": 0.0005867129116875546,
+      "loss": 3.9935,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.34981346130371094,
+      "learning_rate": 0.0005865380355581462,
+      "loss": 4.0015,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.3286009132862091,
+      "learning_rate": 0.0005863631594287379,
+      "loss": 3.9832,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.32566798677811487,
+      "eval_loss": 3.987405300140381,
+      "eval_runtime": 180.1237,
+      "eval_samples_per_second": 92.409,
+      "eval_steps_per_second": 5.779,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.3387763798236847,
+      "learning_rate": 0.0005861882832993296,
+      "loss": 3.9905,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.32717615365982056,
+      "learning_rate": 0.0005860134071699212,
+      "loss": 3.9715,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.3719293177127838,
+      "learning_rate": 0.000585838531040513,
+      "loss": 3.9672,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.3480449318885803,
+      "learning_rate": 0.0005856636549111046,
+      "loss": 3.9802,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.35181939601898193,
+      "learning_rate": 0.0005854887787816963,
+      "loss": 3.9749,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.3547437787055969,
+      "learning_rate": 0.0005853139026522879,
+      "loss": 3.968,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.34064048528671265,
+      "learning_rate": 0.0005851390265228796,
+      "loss": 3.9684,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.34372982382774353,
+      "learning_rate": 0.0005849641503934712,
+      "loss": 3.9613,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.36687299609184265,
+      "learning_rate": 0.0005847892742640629,
+      "loss": 3.9657,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.3261411786079407,
+      "learning_rate": 0.0005846143981346546,
+      "loss": 3.9638,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.34175243973731995,
+      "learning_rate": 0.0005844395220052462,
+      "loss": 3.9506,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.3380016088485718,
+      "learning_rate": 0.000584264645875838,
+      "loss": 3.9473,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.31564953923225403,
+      "learning_rate": 0.0005840897697464296,
+      "loss": 3.9512,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.3103531002998352,
+      "learning_rate": 0.0005839148936170212,
+      "loss": 3.942,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.3472263216972351,
+      "learning_rate": 0.0005837400174876129,
+      "loss": 3.9413,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.3497699201107025,
+      "learning_rate": 0.0005835651413582045,
+      "loss": 3.9362,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.347032368183136,
+      "learning_rate": 0.0005833902652287962,
+      "loss": 3.935,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.33469849824905396,
+      "learning_rate": 0.0005832153890993879,
+      "loss": 3.9344,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.3481560945510864,
+      "learning_rate": 0.0005830405129699796,
+      "loss": 3.9247,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.3438446521759033,
+      "learning_rate": 0.0005828656368405712,
+      "loss": 3.9231,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.33212456036291405,
+      "eval_loss": 3.913280487060547,
+      "eval_runtime": 180.0039,
+      "eval_samples_per_second": 92.47,
+      "eval_steps_per_second": 5.783,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.3285478949546814,
+      "learning_rate": 0.0005826907607111629,
+      "loss": 3.9283,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.3346214294433594,
+      "learning_rate": 0.0005825158845817546,
+      "loss": 3.9257,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.34109315276145935,
+      "learning_rate": 0.0005823410084523462,
+      "loss": 3.9206,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.32906198501586914,
+      "learning_rate": 0.0005821661323229379,
+      "loss": 3.9209,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.3165450394153595,
+      "learning_rate": 0.0005819912561935295,
+      "loss": 3.9042,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.3268945515155792,
+      "learning_rate": 0.0005818163800641212,
+      "loss": 3.9095,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.34344053268432617,
+      "learning_rate": 0.0005816415039347129,
+      "loss": 3.9064,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.33473026752471924,
+      "learning_rate": 0.0005814666278053045,
+      "loss": 3.9084,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.3239467442035675,
+      "learning_rate": 0.0005812917516758962,
+      "loss": 3.9066,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.32284918427467346,
+      "learning_rate": 0.0005811168755464879,
+      "loss": 3.8854,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.34899482131004333,
+      "learning_rate": 0.0005809419994170794,
+      "loss": 3.8977,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.3288831114768982,
+      "learning_rate": 0.0005807671232876712,
+      "loss": 3.9047,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.32496726512908936,
+      "learning_rate": 0.0005805922471582628,
+      "loss": 3.8927,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.3319661021232605,
+      "learning_rate": 0.0005804173710288545,
+      "loss": 3.8783,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.32201242446899414,
+      "learning_rate": 0.0005802424948994462,
+      "loss": 3.8909,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.3138631284236908,
+      "learning_rate": 0.0005800676187700379,
+      "loss": 3.8901,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.3181164264678955,
+      "learning_rate": 0.0005798927426406295,
+      "loss": 3.8813,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.33025944232940674,
+      "learning_rate": 0.0005797178665112212,
+      "loss": 3.873,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.3095895051956177,
+      "learning_rate": 0.0005795429903818129,
+      "loss": 3.884,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.3384963572025299,
+      "learning_rate": 0.0005793681142524044,
+      "loss": 3.8709,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.3377239334814319,
+      "eval_loss": 3.851867914199829,
+      "eval_runtime": 180.0309,
+      "eval_samples_per_second": 92.456,
+      "eval_steps_per_second": 5.782,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.33476731181144714,
+      "learning_rate": 0.0005791932381229961,
+      "loss": 3.8649,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.318613737821579,
+      "learning_rate": 0.0005790183619935878,
+      "loss": 3.8848,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.31928250193595886,
+      "learning_rate": 0.0005788434858641795,
+      "loss": 3.8653,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.32100650668144226,
+      "learning_rate": 0.0005786686097347712,
+      "loss": 3.8764,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.3220023214817047,
+      "learning_rate": 0.0005784937336053628,
+      "loss": 3.8634,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.31344911456108093,
+      "learning_rate": 0.0005783188574759545,
+      "loss": 3.8532,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.32198983430862427,
+      "learning_rate": 0.0005781439813465462,
+      "loss": 3.8574,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.3163566589355469,
+      "learning_rate": 0.0005779691052171379,
+      "loss": 3.8587,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.32121846079826355,
+      "learning_rate": 0.0005777942290877294,
+      "loss": 3.8634,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.33083975315093994,
+      "learning_rate": 0.0005776193529583211,
+      "loss": 3.8503,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.30773088335990906,
+      "learning_rate": 0.0005774444768289128,
+      "loss": 3.8501,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.39235207438468933,
+      "learning_rate": 0.0005772696006995045,
+      "loss": 3.8508,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.32834747433662415,
+      "learning_rate": 0.0005770947245700962,
+      "loss": 3.8371,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.3106667995452881,
+      "learning_rate": 0.0005769198484406878,
+      "loss": 3.868,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.31455743312835693,
+      "learning_rate": 0.0005767449723112795,
+      "loss": 3.8564,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.33098679780960083,
+      "learning_rate": 0.0005765700961818712,
+      "loss": 3.8458,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.3470974266529083,
+      "learning_rate": 0.0005763952200524627,
+      "loss": 3.8377,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.32027867436408997,
+      "learning_rate": 0.0005762203439230544,
+      "loss": 3.7672,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.33726635575294495,
+      "learning_rate": 0.0005760454677936461,
+      "loss": 3.7396,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.3217654526233673,
+      "learning_rate": 0.0005758705916642378,
+      "loss": 3.753,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34205719882030594,
+      "eval_loss": 3.81199049949646,
+      "eval_runtime": 180.0121,
+      "eval_samples_per_second": 92.466,
+      "eval_steps_per_second": 5.783,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.33013007044792175,
+      "learning_rate": 0.0005756957155348294,
+      "loss": 3.7468,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.3293043375015259,
+      "learning_rate": 0.0005755208394054211,
+      "loss": 3.7574,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.32888132333755493,
+      "learning_rate": 0.0005753459632760128,
+      "loss": 3.7502,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.32330965995788574,
+      "learning_rate": 0.0005751710871466045,
+      "loss": 3.7581,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.3227783739566803,
+      "learning_rate": 0.0005749962110171962,
+      "loss": 3.7375,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.3198298215866089,
+      "learning_rate": 0.0005748213348877877,
+      "loss": 3.7408,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.31929996609687805,
+      "learning_rate": 0.0005746464587583794,
+      "loss": 3.7467,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.3041769564151764,
+      "learning_rate": 0.0005744715826289711,
+      "loss": 3.7418,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.32159364223480225,
+      "learning_rate": 0.0005742967064995627,
+      "loss": 3.7448,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.32719555497169495,
+      "learning_rate": 0.0005741218303701544,
+      "loss": 3.7598,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.32142728567123413,
+      "learning_rate": 0.0005739469542407461,
+      "loss": 3.7616,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.3248656392097473,
+      "learning_rate": 0.0005737720781113378,
+      "loss": 3.7569,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.32039493322372437,
+      "learning_rate": 0.0005735972019819295,
+      "loss": 3.7451,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.31409886479377747,
+      "learning_rate": 0.000573422325852521,
+      "loss": 3.7631,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.32058414816856384,
+      "learning_rate": 0.0005732474497231127,
+      "loss": 3.7544,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.3249547481536865,
+      "learning_rate": 0.0005730725735937044,
+      "loss": 3.7484,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.33606165647506714,
+      "learning_rate": 0.0005728976974642961,
+      "loss": 3.7481,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.3388115465641022,
+      "learning_rate": 0.0005727228213348877,
+      "loss": 3.7435,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.3214958906173706,
+      "learning_rate": 0.0005725479452054794,
+      "loss": 3.7479,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.32727494835853577,
+      "learning_rate": 0.0005723730690760711,
+      "loss": 3.7469,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.3452295812344698,
+      "eval_loss": 3.780815839767456,
+      "eval_runtime": 180.1741,
+      "eval_samples_per_second": 92.383,
+      "eval_steps_per_second": 5.778,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.31999602913856506,
+      "learning_rate": 0.0005721981929466627,
+      "loss": 3.7439,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.3452329635620117,
+      "learning_rate": 0.0005720233168172545,
+      "loss": 3.7511,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.3058473765850067,
+      "learning_rate": 0.000571848440687846,
+      "loss": 3.7567,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.338764488697052,
+      "learning_rate": 0.0005716735645584377,
+      "loss": 3.7437,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.3135814964771271,
+      "learning_rate": 0.0005714986884290294,
+      "loss": 3.7574,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.32909590005874634,
+      "learning_rate": 0.000571323812299621,
+      "loss": 3.7459,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.3048388361930847,
+      "learning_rate": 0.0005711489361702127,
+      "loss": 3.737,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.3441195785999298,
+      "learning_rate": 0.0005709740600408044,
+      "loss": 3.7551,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.32695695757865906,
+      "learning_rate": 0.0005707991839113961,
+      "loss": 3.7378,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.3172362446784973,
+      "learning_rate": 0.0005706243077819877,
+      "loss": 3.7473,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.3251330256462097,
+      "learning_rate": 0.0005704494316525793,
+      "loss": 3.7485,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.3160916268825531,
+      "learning_rate": 0.000570274555523171,
+      "loss": 3.7484,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.3058483898639679,
+      "learning_rate": 0.0005700996793937627,
+      "loss": 3.7459,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.3010760247707367,
+      "learning_rate": 0.0005699248032643544,
+      "loss": 3.7445,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.31458958983421326,
+      "learning_rate": 0.000569749927134946,
+      "loss": 3.7402,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.3097609877586365,
+      "learning_rate": 0.0005695750510055377,
+      "loss": 3.7328,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.3084356188774109,
+      "learning_rate": 0.0005694001748761294,
+      "loss": 3.7358,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.3240303695201874,
+      "learning_rate": 0.000569225298746721,
+      "loss": 3.7288,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.30645912885665894,
+      "learning_rate": 0.0005690504226173127,
+      "loss": 3.7361,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3031748831272125,
+      "learning_rate": 0.0005688755464879043,
+      "loss": 3.7452,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.34781717210847685,
+      "eval_loss": 3.7518084049224854,
+      "eval_runtime": 180.2025,
+      "eval_samples_per_second": 92.368,
+      "eval_steps_per_second": 5.777,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.30560678243637085,
+      "learning_rate": 0.000568700670358496,
+      "loss": 3.7391,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.33370503783226013,
+      "learning_rate": 0.0005685257942290877,
+      "loss": 3.7257,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.32426562905311584,
+      "learning_rate": 0.0005683509180996793,
+      "loss": 3.7313,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.31184735894203186,
+      "learning_rate": 0.000568176041970271,
+      "loss": 3.7244,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.30784913897514343,
+      "learning_rate": 0.0005680011658408627,
+      "loss": 3.7268,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.2977924942970276,
+      "learning_rate": 0.0005678262897114544,
+      "loss": 3.7179,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.3281992971897125,
+      "learning_rate": 0.000567651413582046,
+      "loss": 3.7368,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.32184773683547974,
+      "learning_rate": 0.0005674765374526377,
+      "loss": 3.7217,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.3028261363506317,
+      "learning_rate": 0.0005673016613232293,
+      "loss": 3.7288,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.36276352405548096,
+      "learning_rate": 0.0005671267851938209,
+      "loss": 3.7211,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.3130638301372528,
+      "learning_rate": 0.0005669519090644127,
+      "loss": 3.7186,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.31529319286346436,
+      "learning_rate": 0.0005667770329350043,
+      "loss": 3.7311,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.3072699010372162,
+      "learning_rate": 0.000566602156805596,
+      "loss": 3.7297,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.3379790186882019,
+      "learning_rate": 0.0005664272806761877,
+      "loss": 3.7178,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.3173193037509918,
+      "learning_rate": 0.0005662524045467793,
+      "loss": 3.7132,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.33217954635620117,
+      "learning_rate": 0.000566077528417371,
+      "loss": 3.7239,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.331225723028183,
+      "learning_rate": 0.0005659026522879626,
+      "loss": 3.7334,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.31492242217063904,
+      "learning_rate": 0.0005657277761585543,
+      "loss": 3.7225,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.327841192483902,
+      "learning_rate": 0.0005655529000291459,
+      "loss": 3.7265,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.3129492998123169,
+      "learning_rate": 0.0005653780238997376,
+      "loss": 3.7285,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.34999950032890115,
+      "eval_loss": 3.7232048511505127,
+      "eval_runtime": 179.9799,
+      "eval_samples_per_second": 92.483,
+      "eval_steps_per_second": 5.784,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.3213813006877899,
+      "learning_rate": 0.0005652031477703293,
+      "loss": 3.7164,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.313442200422287,
+      "learning_rate": 0.000565028271640921,
+      "loss": 3.7237,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.31209588050842285,
+      "learning_rate": 0.0005648533955115127,
+      "loss": 3.7229,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.309455007314682,
+      "learning_rate": 0.0005646785193821043,
+      "loss": 3.7254,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.30592969059944153,
+      "learning_rate": 0.000564503643252696,
+      "loss": 3.7176,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.32156163454055786,
+      "learning_rate": 0.0005643287671232876,
+      "loss": 3.7071,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.31477323174476624,
+      "learning_rate": 0.0005641538909938792,
+      "loss": 3.6161,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.3122144043445587,
+      "learning_rate": 0.0005639790148644709,
+      "loss": 3.6061,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.30838510394096375,
+      "learning_rate": 0.0005638041387350626,
+      "loss": 3.615,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.3112572133541107,
+      "learning_rate": 0.0005636292626056543,
+      "loss": 3.6097,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.31443509459495544,
+      "learning_rate": 0.000563454386476246,
+      "loss": 3.6103,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.3189432621002197,
+      "learning_rate": 0.0005632795103468376,
+      "loss": 3.6205,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.3305325210094452,
+      "learning_rate": 0.0005631046342174293,
+      "loss": 3.6255,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.33095782995224,
+      "learning_rate": 0.000562929758088021,
+      "loss": 3.6204,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.3245634436607361,
+      "learning_rate": 0.0005627548819586126,
+      "loss": 3.6275,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.3105281889438629,
+      "learning_rate": 0.0005625800058292042,
+      "loss": 3.6391,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.31318241357803345,
+      "learning_rate": 0.0005624051296997959,
+      "loss": 3.6262,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.3430389165878296,
+      "learning_rate": 0.0005622302535703876,
+      "loss": 3.6341,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.3307541608810425,
+      "learning_rate": 0.0005620553774409792,
+      "loss": 3.6337,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.3157060444355011,
+      "learning_rate": 0.000561880501311571,
+      "loss": 3.6361,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.35203486646142923,
+      "eval_loss": 3.712493419647217,
+      "eval_runtime": 180.1969,
+      "eval_samples_per_second": 92.371,
+      "eval_steps_per_second": 5.777,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.31381699442863464,
+      "learning_rate": 0.0005617056251821626,
+      "loss": 3.6288,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.3294530510902405,
+      "learning_rate": 0.0005615307490527543,
+      "loss": 3.6322,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.34993889927864075,
+      "learning_rate": 0.000561355872923346,
+      "loss": 3.6331,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.31110501289367676,
+      "learning_rate": 0.0005611809967939375,
+      "loss": 3.6428,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.3065360486507416,
+      "learning_rate": 0.0005610061206645292,
+      "loss": 3.6328,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.30535629391670227,
+      "learning_rate": 0.0005608312445351209,
+      "loss": 3.6385,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.31860092282295227,
+      "learning_rate": 0.0005606563684057126,
+      "loss": 3.6457,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.30683571100234985,
+      "learning_rate": 0.0005604814922763042,
+      "loss": 3.6281,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.34014976024627686,
+      "learning_rate": 0.0005603066161468959,
+      "loss": 3.6465,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.32539984583854675,
+      "learning_rate": 0.0005601317400174876,
+      "loss": 3.6275,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.32786324620246887,
+      "learning_rate": 0.0005599568638880793,
+      "loss": 3.6468,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.31274884939193726,
+      "learning_rate": 0.0005597819877586709,
+      "loss": 3.6499,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.32175853848457336,
+      "learning_rate": 0.0005596071116292625,
+      "loss": 3.6414,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.3347005248069763,
+      "learning_rate": 0.0005594322354998542,
+      "loss": 3.6366,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.3298018276691437,
+      "learning_rate": 0.0005592573593704459,
+      "loss": 3.6363,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.3186033368110657,
+      "learning_rate": 0.0005590824832410375,
+      "loss": 3.6394,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.317862868309021,
+      "learning_rate": 0.0005589076071116292,
+      "loss": 3.6429,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.3186003267765045,
+      "learning_rate": 0.0005587327309822209,
+      "loss": 3.6498,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.3352317810058594,
+      "learning_rate": 0.0005585578548528126,
+      "loss": 3.649,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.31718602776527405,
+      "learning_rate": 0.0005583829787234043,
+      "loss": 3.6428,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.3535988957856564,
+      "eval_loss": 3.6945624351501465,
+      "eval_runtime": 180.0673,
+      "eval_samples_per_second": 92.438,
+      "eval_steps_per_second": 5.781,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.31061556935310364,
+      "learning_rate": 0.0005582081025939958,
+      "loss": 3.6393,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.30603349208831787,
+      "learning_rate": 0.0005580332264645875,
+      "loss": 3.6306,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.329946368932724,
+      "learning_rate": 0.0005578583503351792,
+      "loss": 3.645,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.3200969099998474,
+      "learning_rate": 0.0005576834742057709,
+      "loss": 3.6239,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.34068846702575684,
+      "learning_rate": 0.0005575085980763625,
+      "loss": 3.6462,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.32341474294662476,
+      "learning_rate": 0.0005573337219469542,
+      "loss": 3.6488,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.3050375282764435,
+      "learning_rate": 0.0005571588458175459,
+      "loss": 3.6434,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.32204675674438477,
+      "learning_rate": 0.0005569839696881374,
+      "loss": 3.6311,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.3137733042240143,
+      "learning_rate": 0.0005568090935587292,
+      "loss": 3.6475,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.3006216585636139,
+      "learning_rate": 0.0005566342174293208,
+      "loss": 3.6297,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.3186657428741455,
+      "learning_rate": 0.0005564593412999125,
+      "loss": 3.6436,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.316648930311203,
+      "learning_rate": 0.0005562844651705042,
+      "loss": 3.6409,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.3677234649658203,
+      "learning_rate": 0.0005561095890410958,
+      "loss": 3.6333,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.33473852276802063,
+      "learning_rate": 0.0005559347129116875,
+      "loss": 3.6376,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.3314206600189209,
+      "learning_rate": 0.0005557598367822792,
+      "loss": 3.6366,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.3139813244342804,
+      "learning_rate": 0.0005555849606528709,
+      "loss": 3.6416,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.3106994032859802,
+      "learning_rate": 0.0005554100845234624,
+      "loss": 3.641,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.3093438446521759,
+      "learning_rate": 0.0005552352083940541,
+      "loss": 3.6312,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.30269870162010193,
+      "learning_rate": 0.0005550603322646458,
+      "loss": 3.6342,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.34101831912994385,
+      "learning_rate": 0.0005548854561352375,
+      "loss": 3.6387,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.35501196565319654,
+      "eval_loss": 3.6745212078094482,
+      "eval_runtime": 180.0817,
+      "eval_samples_per_second": 92.43,
+      "eval_steps_per_second": 5.781,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.3471841812133789,
+      "learning_rate": 0.0005547105800058292,
+      "loss": 3.6363,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.3232862651348114,
+      "learning_rate": 0.0005545357038764208,
+      "loss": 3.6401,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.32816365361213684,
+      "learning_rate": 0.0005543608277470125,
+      "loss": 3.6346,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.3071276545524597,
+      "learning_rate": 0.0005541859516176042,
+      "loss": 3.6315,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.31473508477211,
+      "learning_rate": 0.0005540110754881958,
+      "loss": 3.6417,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.33018672466278076,
+      "learning_rate": 0.0005538361993587874,
+      "loss": 3.6422,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.29861611127853394,
+      "learning_rate": 0.0005536613232293791,
+      "loss": 3.6341,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.3709903955459595,
+      "learning_rate": 0.0005534864470999708,
+      "loss": 3.6334,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.30411627888679504,
+      "learning_rate": 0.0005533115709705625,
+      "loss": 3.6261,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.30915218591690063,
+      "learning_rate": 0.0005531366948411541,
+      "loss": 3.6405,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.3229111135005951,
+      "learning_rate": 0.0005529618187117458,
+      "loss": 3.6408,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.3003472089767456,
+      "learning_rate": 0.0005527869425823375,
+      "loss": 3.6375,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.31997328996658325,
+      "learning_rate": 0.0005526120664529292,
+      "loss": 3.6224,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.3155164420604706,
+      "learning_rate": 0.0005524371903235207,
+      "loss": 3.64,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.30095911026000977,
+      "learning_rate": 0.0005522623141941124,
+      "loss": 3.5881,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.3097637891769409,
+      "learning_rate": 0.0005520874380647041,
+      "loss": 3.5144,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.3112574815750122,
+      "learning_rate": 0.0005519125619352957,
+      "loss": 3.5128,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.3345850110054016,
+      "learning_rate": 0.0005517376858058875,
+      "loss": 3.5277,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.3319241404533386,
+      "learning_rate": 0.0005515628096764791,
+      "loss": 3.5387,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.32741302251815796,
+      "learning_rate": 0.0005513879335470708,
+      "loss": 3.5437,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.3564399668688669,
+      "eval_loss": 3.6676642894744873,
+      "eval_runtime": 180.2205,
+      "eval_samples_per_second": 92.359,
+      "eval_steps_per_second": 5.776,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.3242168426513672,
+      "learning_rate": 0.0005512130574176625,
+      "loss": 3.5313,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.3130347728729248,
+      "learning_rate": 0.000551038181288254,
+      "loss": 3.5486,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.31881386041641235,
+      "learning_rate": 0.0005508633051588457,
+      "loss": 3.5395,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.3165348768234253,
+      "learning_rate": 0.0005506884290294374,
+      "loss": 3.5563,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.31050053238868713,
+      "learning_rate": 0.0005505135529000291,
+      "loss": 3.5441,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.31978413462638855,
+      "learning_rate": 0.0005503386767706207,
+      "loss": 3.5413,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.3227480947971344,
+      "learning_rate": 0.0005501638006412124,
+      "loss": 3.5426,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.30381980538368225,
+      "learning_rate": 0.0005499889245118041,
+      "loss": 3.5495,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.3120647966861725,
+      "learning_rate": 0.0005498140483823958,
+      "loss": 3.5646,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.3166692852973938,
+      "learning_rate": 0.0005496391722529875,
+      "loss": 3.5624,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.31486088037490845,
+      "learning_rate": 0.000549464296123579,
+      "loss": 3.5641,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.32626253366470337,
+      "learning_rate": 0.0005492894199941707,
+      "loss": 3.5515,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.30358177423477173,
+      "learning_rate": 0.0005491145438647624,
+      "loss": 3.5685,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.3053493797779083,
+      "learning_rate": 0.000548939667735354,
+      "loss": 3.5679,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": 0.3187207579612732,
+      "learning_rate": 0.0005487647916059457,
+      "loss": 3.5632,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.32234564423561096,
+      "learning_rate": 0.0005485899154765374,
+      "loss": 3.5622,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.31007739901542664,
+      "learning_rate": 0.0005484150393471291,
+      "loss": 3.5543,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.315043181180954,
+      "learning_rate": 0.0005482401632177208,
+      "loss": 3.5664,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.30842092633247375,
+      "learning_rate": 0.0005480652870883124,
+      "loss": 3.5638,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.31273534893989563,
+      "learning_rate": 0.000547890410958904,
+      "loss": 3.5684,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.3577464010454295,
+      "eval_loss": 3.658200979232788,
+      "eval_runtime": 180.2541,
+      "eval_samples_per_second": 92.342,
+      "eval_steps_per_second": 5.775,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.32522329688072205,
+      "learning_rate": 0.0005477155348294957,
+      "loss": 3.5632,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.3170122504234314,
+      "learning_rate": 0.0005475406587000874,
+      "loss": 3.5591,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.30741000175476074,
+      "learning_rate": 0.000547365782570679,
+      "loss": 3.5775,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.32984650135040283,
+      "learning_rate": 0.0005471909064412707,
+      "loss": 3.5699,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.32355785369873047,
+      "learning_rate": 0.0005470160303118624,
+      "loss": 3.5782,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.31487616896629333,
+      "learning_rate": 0.000546841154182454,
+      "loss": 3.5612,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": 0.3270156681537628,
+      "learning_rate": 0.0005466662780530458,
+      "loss": 3.5596,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.3146279752254486,
+      "learning_rate": 0.0005464914019236374,
+      "loss": 3.5732,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.297494113445282,
+      "learning_rate": 0.000546316525794229,
+      "loss": 3.5721,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.3217732012271881,
+      "learning_rate": 0.0005461416496648207,
+      "loss": 3.5773,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.3060513734817505,
+      "learning_rate": 0.0005459667735354123,
+      "loss": 3.5638,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.33157843351364136,
+      "learning_rate": 0.000545791897406004,
+      "loss": 3.5654,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.3019466698169708,
+      "learning_rate": 0.0005456170212765957,
+      "loss": 3.5716,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.3082275688648224,
+      "learning_rate": 0.0005454421451471874,
+      "loss": 3.5811,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.3220166563987732,
+      "learning_rate": 0.000545267269017779,
+      "loss": 3.5781,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.30748656392097473,
+      "learning_rate": 0.0005450923928883708,
+      "loss": 3.5713,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3191044330596924,
+      "learning_rate": 0.0005449175167589623,
+      "loss": 3.5861,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.31128981709480286,
+      "learning_rate": 0.000544742640629554,
+      "loss": 3.5644,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.3463725447654724,
+      "learning_rate": 0.0005445677645001457,
+      "loss": 3.5708,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.32188108563423157,
+      "learning_rate": 0.0005443928883707373,
+      "loss": 3.5745,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.35857467937281284,
+      "eval_loss": 3.6439030170440674,
+      "eval_runtime": 180.24,
+      "eval_samples_per_second": 92.349,
+      "eval_steps_per_second": 5.776,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.32182472944259644,
+      "learning_rate": 0.000544218012241329,
+      "loss": 3.5742,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.33084315061569214,
+      "learning_rate": 0.0005440431361119207,
+      "loss": 3.577,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.3214638829231262,
+      "learning_rate": 0.0005438682599825123,
+      "loss": 3.5804,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.30823513865470886,
+      "learning_rate": 0.000543693383853104,
+      "loss": 3.5711,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.31343257427215576,
+      "learning_rate": 0.0005435185077236957,
+      "loss": 3.5697,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.30805066227912903,
+      "learning_rate": 0.0005433436315942873,
+      "loss": 3.5826,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.31302279233932495,
+      "learning_rate": 0.000543168755464879,
+      "loss": 3.5666,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.32919782400131226,
+      "learning_rate": 0.0005429938793354706,
+      "loss": 3.5658,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.31369245052337646,
+      "learning_rate": 0.0005428190032060623,
+      "loss": 3.5781,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.31175729632377625,
+      "learning_rate": 0.000542644127076654,
+      "loss": 3.5749,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.3278850317001343,
+      "learning_rate": 0.0005424692509472457,
+      "loss": 3.5751,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.30360278487205505,
+      "learning_rate": 0.0005422943748178373,
+      "loss": 3.5759,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.30331000685691833,
+      "learning_rate": 0.000542119498688429,
+      "loss": 3.5919,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.3112633526325226,
+      "learning_rate": 0.0005419446225590207,
+      "loss": 3.5798,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.29969000816345215,
+      "learning_rate": 0.0005417697464296122,
+      "loss": 3.5732,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.30441269278526306,
+      "learning_rate": 0.000541594870300204,
+      "loss": 3.5842,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.292208731174469,
+      "learning_rate": 0.0005414199941707956,
+      "loss": 3.5711,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.3113518953323364,
+      "learning_rate": 0.0005412451180413873,
+      "loss": 3.5742,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.3084389567375183,
+      "learning_rate": 0.000541070241911979,
+      "loss": 3.5705,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.3114304542541504,
+      "learning_rate": 0.0005408953657825706,
+      "loss": 3.5827,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.3599802247814527,
+      "eval_loss": 3.6305477619171143,
+      "eval_runtime": 180.3299,
+      "eval_samples_per_second": 92.303,
+      "eval_steps_per_second": 5.773,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.3147881329059601,
+      "learning_rate": 0.0005407204896531623,
+      "loss": 3.5661,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.33159729838371277,
+      "learning_rate": 0.000540545613523754,
+      "loss": 3.5697,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.31183063983917236,
+      "learning_rate": 0.0005403707373943456,
+      "loss": 3.5775,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.3240616023540497,
+      "learning_rate": 0.0005401958612649372,
+      "loss": 3.4929,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.367702454328537,
+      "learning_rate": 0.000540020985135529,
+      "loss": 3.4545,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.2979840636253357,
+      "learning_rate": 0.0005398461090061206,
+      "loss": 3.4727,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.3102603852748871,
+      "learning_rate": 0.0005396712328767123,
+      "loss": 3.472,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.3156759738922119,
+      "learning_rate": 0.000539496356747304,
+      "loss": 3.4673,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.31477898359298706,
+      "learning_rate": 0.0005393214806178956,
+      "loss": 3.4812,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.3182123601436615,
+      "learning_rate": 0.0005391466044884873,
+      "loss": 3.4805,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.33958199620246887,
+      "learning_rate": 0.000538971728359079,
+      "loss": 3.4871,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.30561456084251404,
+      "learning_rate": 0.0005387968522296705,
+      "loss": 3.4898,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.33050698041915894,
+      "learning_rate": 0.0005386219761002622,
+      "loss": 3.4846,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.3352583944797516,
+      "learning_rate": 0.0005384470999708539,
+      "loss": 3.4927,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.3153831958770752,
+      "learning_rate": 0.0005382722238414456,
+      "loss": 3.4836,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.3268112540245056,
+      "learning_rate": 0.0005380973477120373,
+      "loss": 3.4846,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.3340913951396942,
+      "learning_rate": 0.000537922471582629,
+      "loss": 3.5008,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.330852210521698,
+      "learning_rate": 0.0005377475954532206,
+      "loss": 3.5119,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.32476457953453064,
+      "learning_rate": 0.0005375727193238123,
+      "loss": 3.4917,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.30598244071006775,
+      "learning_rate": 0.000537397843194404,
+      "loss": 3.4979,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.359971054347168,
+      "eval_loss": 3.6366615295410156,
+      "eval_runtime": 180.2334,
+      "eval_samples_per_second": 92.352,
+      "eval_steps_per_second": 5.776,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.3335484266281128,
+      "learning_rate": 0.0005372229670649955,
+      "loss": 3.4986,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.3260400593280792,
+      "learning_rate": 0.0005370480909355872,
+      "loss": 3.4974,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.3084673583507538,
+      "learning_rate": 0.0005368732148061789,
+      "loss": 3.506,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.32834941148757935,
+      "learning_rate": 0.0005366983386767705,
+      "loss": 3.5106,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.3387486934661865,
+      "learning_rate": 0.0005365234625473623,
+      "loss": 3.5101,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.30053550004959106,
+      "learning_rate": 0.0005363485864179539,
+      "loss": 3.5012,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.3416799306869507,
+      "learning_rate": 0.0005361737102885456,
+      "loss": 3.5223,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.3232312500476837,
+      "learning_rate": 0.0005359988341591373,
+      "loss": 3.504,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.3403994143009186,
+      "learning_rate": 0.000535823958029729,
+      "loss": 3.505,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.3213067352771759,
+      "learning_rate": 0.0005356490819003205,
+      "loss": 3.5181,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.31317710876464844,
+      "learning_rate": 0.0005354742057709122,
+      "loss": 3.5133,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.3116888701915741,
+      "learning_rate": 0.0005352993296415039,
+      "loss": 3.5131,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.31915172934532166,
+      "learning_rate": 0.0005351244535120955,
+      "loss": 3.5048,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.33018389344215393,
+      "learning_rate": 0.0005349495773826873,
+      "loss": 3.5235,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.32426100969314575,
+      "learning_rate": 0.0005347747012532789,
+      "loss": 3.5113,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.31989872455596924,
+      "learning_rate": 0.0005345998251238706,
+      "loss": 3.5174,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.30927300453186035,
+      "learning_rate": 0.0005344249489944623,
+      "loss": 3.5063,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.3078673183917999,
+      "learning_rate": 0.0005342500728650538,
+      "loss": 3.5206,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.3324156701564789,
+      "learning_rate": 0.0005340751967356455,
+      "loss": 3.5154,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.3175108730792999,
+      "learning_rate": 0.0005339003206062372,
+      "loss": 3.5219,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.36131487567889137,
+      "eval_loss": 3.623619556427002,
+      "eval_runtime": 180.1823,
+      "eval_samples_per_second": 92.379,
+      "eval_steps_per_second": 5.777,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.3166046440601349,
+      "learning_rate": 0.0005337254444768288,
+      "loss": 3.5227,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.3559969961643219,
+      "learning_rate": 0.0005335505683474205,
+      "loss": 3.5278,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.31637391448020935,
+      "learning_rate": 0.0005333756922180122,
+      "loss": 3.5251,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.33826208114624023,
+      "learning_rate": 0.0005332008160886039,
+      "loss": 3.5306,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.3146003484725952,
+      "learning_rate": 0.0005330259399591956,
+      "loss": 3.5176,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.33475756645202637,
+      "learning_rate": 0.0005328510638297873,
+      "loss": 3.5252,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.31839892268180847,
+      "learning_rate": 0.0005326761877003788,
+      "loss": 3.5227,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.3179383873939514,
+      "learning_rate": 0.0005325013115709705,
+      "loss": 3.5276,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.32419490814208984,
+      "learning_rate": 0.0005323264354415622,
+      "loss": 3.5182,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.30967098474502563,
+      "learning_rate": 0.0005321515593121538,
+      "loss": 3.5231,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.33187001943588257,
+      "learning_rate": 0.0005319766831827455,
+      "loss": 3.5163,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.3205404281616211,
+      "learning_rate": 0.0005318018070533372,
+      "loss": 3.5162,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.3075636923313141,
+      "learning_rate": 0.0005316269309239288,
+      "loss": 3.5245,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.3177931010723114,
+      "learning_rate": 0.0005314520547945206,
+      "loss": 3.5179,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": 0.3217032849788666,
+      "learning_rate": 0.0005312771786651121,
+      "loss": 3.5328,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.3182378113269806,
+      "learning_rate": 0.0005311023025357038,
+      "loss": 3.5269,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3116445541381836,
+      "learning_rate": 0.0005309274264062955,
+      "loss": 3.5313,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.3098820447921753,
+      "learning_rate": 0.0005307525502768872,
+      "loss": 3.5347,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.3134811818599701,
+      "learning_rate": 0.0005305776741474788,
+      "loss": 3.5424,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.3062841296195984,
+      "learning_rate": 0.0005304027980180705,
+      "loss": 3.5406,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.36257216573326145,
+      "eval_loss": 3.6113312244415283,
+      "eval_runtime": 180.3944,
+      "eval_samples_per_second": 92.27,
+      "eval_steps_per_second": 5.771,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.32947519421577454,
+      "learning_rate": 0.0005302279218886622,
+      "loss": 3.5347,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.3123909533023834,
+      "learning_rate": 0.0005300530457592538,
+      "loss": 3.5286,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.34147006273269653,
+      "learning_rate": 0.0005298781696298456,
+      "loss": 3.5263,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.3259049654006958,
+      "learning_rate": 0.0005297032935004371,
+      "loss": 3.5195,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.31503719091415405,
+      "learning_rate": 0.0005295284173710288,
+      "loss": 3.5295,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.32215699553489685,
+      "learning_rate": 0.0005293535412416205,
+      "loss": 3.5369,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.33477988839149475,
+      "learning_rate": 0.0005291786651122121,
+      "loss": 3.5262,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.3120667040348053,
+      "learning_rate": 0.0005290037889828038,
+      "loss": 3.5411,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.31322258710861206,
+      "learning_rate": 0.0005288289128533955,
+      "loss": 3.5162,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.3116031885147095,
+      "learning_rate": 0.0005286540367239872,
+      "loss": 3.5246,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.3136381506919861,
+      "learning_rate": 0.0005284791605945788,
+      "loss": 3.5164,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.314301460981369,
+      "learning_rate": 0.0005283042844651704,
+      "loss": 3.5269,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.3147795498371124,
+      "learning_rate": 0.0005281294083357621,
+      "loss": 3.4183,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.31358394026756287,
+      "learning_rate": 0.0005279545322063538,
+      "loss": 3.4052,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.3503838777542114,
+      "learning_rate": 0.0005277796560769455,
+      "loss": 3.4225,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.336975634098053,
+      "learning_rate": 0.0005276047799475371,
+      "loss": 3.4233,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.34221938252449036,
+      "learning_rate": 0.0005274299038181288,
+      "loss": 3.4379,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.30639681220054626,
+      "learning_rate": 0.0005272550276887205,
+      "loss": 3.4234,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.3100702464580536,
+      "learning_rate": 0.0005270801515593121,
+      "loss": 3.4353,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.31649187207221985,
+      "learning_rate": 0.0005269052754299037,
+      "loss": 3.4266,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.3626251896545744,
+      "eval_loss": 3.617746114730835,
+      "eval_runtime": 180.2077,
+      "eval_samples_per_second": 92.366,
+      "eval_steps_per_second": 5.777,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.31790605187416077,
+      "learning_rate": 0.0005267303993004954,
+      "loss": 3.4448,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.3185005486011505,
+      "learning_rate": 0.000526555523171087,
+      "loss": 3.4435,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.32815855741500854,
+      "learning_rate": 0.0005263806470416788,
+      "loss": 3.4373,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.3747202754020691,
+      "learning_rate": 0.0005262057709122704,
+      "loss": 3.4414,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.3081320524215698,
+      "learning_rate": 0.0005260308947828621,
+      "loss": 3.4516,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.33076637983322144,
+      "learning_rate": 0.0005258560186534538,
+      "loss": 3.445,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.34362319111824036,
+      "learning_rate": 0.0005256811425240455,
+      "loss": 3.4611,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.31307855248451233,
+      "learning_rate": 0.0005255062663946371,
+      "loss": 3.4547,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.33912956714630127,
+      "learning_rate": 0.0005253313902652287,
+      "loss": 3.4538,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.33405831456184387,
+      "learning_rate": 0.0005251565141358204,
+      "loss": 3.4433,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.32273417711257935,
+      "learning_rate": 0.000524981638006412,
+      "loss": 3.4635,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.3156605660915375,
+      "learning_rate": 0.0005248067618770038,
+      "loss": 3.4561,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.3214246928691864,
+      "learning_rate": 0.0005246318857475954,
+      "loss": 3.4539,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.30894410610198975,
+      "learning_rate": 0.0005244570096181871,
+      "loss": 3.4651,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.36027565598487854,
+      "learning_rate": 0.0005242821334887788,
+      "loss": 3.4633,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.33685246109962463,
+      "learning_rate": 0.0005241072573593704,
+      "loss": 3.4651,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.3304831087589264,
+      "learning_rate": 0.000523932381229962,
+      "loss": 3.4685,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.30572712421417236,
+      "learning_rate": 0.0005237575051005537,
+      "loss": 3.4666,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.3293927013874054,
+      "learning_rate": 0.0005235826289711454,
+      "loss": 3.4804,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.31569355726242065,
+      "learning_rate": 0.000523407752841737,
+      "loss": 3.4681,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.3631648344413295,
+      "eval_loss": 3.6071176528930664,
+      "eval_runtime": 180.3173,
+      "eval_samples_per_second": 92.31,
+      "eval_steps_per_second": 5.773,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.3268168568611145,
+      "learning_rate": 0.0005232328767123287,
+      "loss": 3.4681,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.3143610656261444,
+      "learning_rate": 0.0005230580005829204,
+      "loss": 3.4788,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.3352193534374237,
+      "learning_rate": 0.0005228831244535121,
+      "loss": 3.4648,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.3402239680290222,
+      "learning_rate": 0.0005227082483241038,
+      "loss": 3.4847,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.32882171869277954,
+      "learning_rate": 0.0005225333721946954,
+      "loss": 3.4692,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.3083683252334595,
+      "learning_rate": 0.000522358496065287,
+      "loss": 3.4799,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.32959502935409546,
+      "learning_rate": 0.0005221836199358787,
+      "loss": 3.4769,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.31663578748703003,
+      "learning_rate": 0.0005220087438064703,
+      "loss": 3.4759,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.3328782916069031,
+      "learning_rate": 0.000521833867677062,
+      "loss": 3.476,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.32036277651786804,
+      "learning_rate": 0.0005216589915476537,
+      "loss": 3.4838,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.31284037232398987,
+      "learning_rate": 0.0005214841154182454,
+      "loss": 3.4818,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.3226317763328552,
+      "learning_rate": 0.0005213092392888371,
+      "loss": 3.4757,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.31363436579704285,
+      "learning_rate": 0.0005211343631594287,
+      "loss": 3.4836,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.33146846294403076,
+      "learning_rate": 0.0005209594870300204,
+      "loss": 3.4733,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.3308202624320984,
+      "learning_rate": 0.000520784610900612,
+      "loss": 3.4909,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.331926554441452,
+      "learning_rate": 0.0005206097347712037,
+      "loss": 3.4868,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.31595379114151,
+      "learning_rate": 0.0005204348586417953,
+      "loss": 3.4817,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.3580048680305481,
+      "learning_rate": 0.000520259982512387,
+      "loss": 3.484,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.30390664935112,
+      "learning_rate": 0.0005200851063829787,
+      "loss": 3.4853,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.323244571685791,
+      "learning_rate": 0.0005199102302535703,
+      "loss": 3.4797,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.36416934970451803,
+      "eval_loss": 3.600242853164673,
+      "eval_runtime": 180.297,
+      "eval_samples_per_second": 92.32,
+      "eval_steps_per_second": 5.774,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.3307446539402008,
+      "learning_rate": 0.0005197353541241621,
+      "loss": 3.4901,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.3092699944972992,
+      "learning_rate": 0.0005195604779947537,
+      "loss": 3.4878,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.33178552985191345,
+      "learning_rate": 0.0005193856018653454,
+      "loss": 3.4827,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.35661131143569946,
+      "learning_rate": 0.000519210725735937,
+      "loss": 3.4937,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.319477915763855,
+      "learning_rate": 0.0005190358496065286,
+      "loss": 3.4894,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.3135737478733063,
+      "learning_rate": 0.0005188609734771203,
+      "loss": 3.5,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.3119906187057495,
+      "learning_rate": 0.000518686097347712,
+      "loss": 3.4882,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.31904417276382446,
+      "learning_rate": 0.0005185112212183037,
+      "loss": 3.4952,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.33189624547958374,
+      "learning_rate": 0.0005183363450888953,
+      "loss": 3.4895,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.3348432779312134,
+      "learning_rate": 0.000518161468959487,
+      "loss": 3.4911,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.3141877055168152,
+      "learning_rate": 0.0005179865928300787,
+      "loss": 3.5002,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.3227143883705139,
+      "learning_rate": 0.0005178117167006703,
+      "loss": 3.4831,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.317911833524704,
+      "learning_rate": 0.000517636840571262,
+      "loss": 3.494,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.3200852572917938,
+      "learning_rate": 0.0005174619644418536,
+      "loss": 3.4797,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.32029202580451965,
+      "learning_rate": 0.0005172870883124453,
+      "loss": 3.4846,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.322200208902359,
+      "learning_rate": 0.000517112212183037,
+      "loss": 3.4984,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.3308181166648865,
+      "learning_rate": 0.0005169373360536286,
+      "loss": 3.4858,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.3258877992630005,
+      "learning_rate": 0.0005167624599242203,
+      "loss": 3.4966,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.32055163383483887,
+      "learning_rate": 0.000516587583794812,
+      "loss": 3.4928,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.32862454652786255,
+      "learning_rate": 0.0005164127076654037,
+      "loss": 3.4969,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.3648933437343302,
+      "eval_loss": 3.586930274963379,
+      "eval_runtime": 185.7598,
+      "eval_samples_per_second": 89.605,
+      "eval_steps_per_second": 5.604,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.3120695948600769,
+      "learning_rate": 0.0005162378315359953,
+      "loss": 3.463,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.32572463154792786,
+      "learning_rate": 0.0005160629554065869,
+      "loss": 3.3853,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.3158482015132904,
+      "learning_rate": 0.0005158880792771786,
+      "loss": 3.3814,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.322257936000824,
+      "learning_rate": 0.0005157132031477703,
+      "loss": 3.3902,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.3240783214569092,
+      "learning_rate": 0.000515538327018362,
+      "loss": 3.3809,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.33611229062080383,
+      "learning_rate": 0.0005153634508889536,
+      "loss": 3.3813,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.3258345425128937,
+      "learning_rate": 0.0005151885747595453,
+      "loss": 3.4111,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.3187943994998932,
+      "learning_rate": 0.000515013698630137,
+      "loss": 3.3978,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.3566177487373352,
+      "learning_rate": 0.0005148388225007285,
+      "loss": 3.4093,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.32738471031188965,
+      "learning_rate": 0.0005146639463713203,
+      "loss": 3.4058,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.3418164849281311,
+      "learning_rate": 0.0005144890702419119,
+      "loss": 3.4025,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.3443870544433594,
+      "learning_rate": 0.0005143141941125036,
+      "loss": 3.4091,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.3457808494567871,
+      "learning_rate": 0.0005141393179830953,
+      "loss": 3.4135,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.3336106240749359,
+      "learning_rate": 0.0005139644418536869,
+      "loss": 3.4172,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.33773717284202576,
+      "learning_rate": 0.0005137895657242786,
+      "loss": 3.4075,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.3136851489543915,
+      "learning_rate": 0.0005136146895948703,
+      "loss": 3.4221,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.34105175733566284,
+      "learning_rate": 0.000513439813465462,
+      "loss": 3.4201,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.34829896688461304,
+      "learning_rate": 0.0005132649373360535,
+      "loss": 3.4305,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.32784610986709595,
+      "learning_rate": 0.0005130900612066452,
+      "loss": 3.4303,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.34191837906837463,
+      "learning_rate": 0.0005129151850772369,
+      "loss": 3.4076,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.36452899532601774,
+      "eval_loss": 3.5990021228790283,
+      "eval_runtime": 182.0119,
+      "eval_samples_per_second": 91.45,
+      "eval_steps_per_second": 5.719,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.32772010564804077,
+      "learning_rate": 0.0005127403089478286,
+      "loss": 3.4265,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.33544066548347473,
+      "learning_rate": 0.0005125654328184203,
+      "loss": 3.4307,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.3163904845714569,
+      "learning_rate": 0.0005123905566890119,
+      "loss": 3.4129,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.3212714195251465,
+      "learning_rate": 0.0005122156805596036,
+      "loss": 3.4289,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.3180087208747864,
+      "learning_rate": 0.0005120408044301953,
+      "loss": 3.441,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.3278086483478546,
+      "learning_rate": 0.0005118659283007868,
+      "loss": 3.4319,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.3221387267112732,
+      "learning_rate": 0.0005116910521713785,
+      "loss": 3.435,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.3392038941383362,
+      "learning_rate": 0.0005115161760419702,
+      "loss": 3.4346,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.30694445967674255,
+      "learning_rate": 0.0005113412999125619,
+      "loss": 3.4151,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.3072811961174011,
+      "learning_rate": 0.0005111664237831536,
+      "loss": 3.4429,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.354181170463562,
+      "learning_rate": 0.0005109915476537452,
+      "loss": 3.4374,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.33871990442276,
+      "learning_rate": 0.0005108166715243369,
+      "loss": 3.4306,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.33174610137939453,
+      "learning_rate": 0.0005106417953949286,
+      "loss": 3.4468,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.3390611410140991,
+      "learning_rate": 0.0005104669192655203,
+      "loss": 3.4303,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.3334279954433441,
+      "learning_rate": 0.0005102920431361118,
+      "loss": 3.4352,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.31724613904953003,
+      "learning_rate": 0.0005101171670067035,
+      "loss": 3.4353,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.3118076026439667,
+      "learning_rate": 0.0005099422908772952,
+      "loss": 3.4293,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.3568933606147766,
+      "learning_rate": 0.0005097674147478868,
+      "loss": 3.4445,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.3219817578792572,
+      "learning_rate": 0.0005095925386184786,
+      "loss": 3.4461,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.3226459324359894,
+      "learning_rate": 0.0005094176624890702,
+      "loss": 3.4535,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.3653061308468132,
+      "eval_loss": 3.5887749195098877,
+      "eval_runtime": 180.3689,
+      "eval_samples_per_second": 92.283,
+      "eval_steps_per_second": 5.772,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.33434975147247314,
+      "learning_rate": 0.0005092427863596619,
+      "loss": 3.433,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.33903586864471436,
+      "learning_rate": 0.0005090679102302536,
+      "loss": 3.4554,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.3249543309211731,
+      "learning_rate": 0.0005088930341008451,
+      "loss": 3.4551,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.33772045373916626,
+      "learning_rate": 0.0005087181579714368,
+      "loss": 3.4496,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.3366949260234833,
+      "learning_rate": 0.0005085432818420285,
+      "loss": 3.453,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.32443875074386597,
+      "learning_rate": 0.0005083684057126202,
+      "loss": 3.448,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.322132408618927,
+      "learning_rate": 0.0005081935295832118,
+      "loss": 3.4588,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.35065576434135437,
+      "learning_rate": 0.0005080186534538035,
+      "loss": 3.445,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.32592296600341797,
+      "learning_rate": 0.0005078437773243952,
+      "loss": 3.4581,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.3161769509315491,
+      "learning_rate": 0.0005076689011949869,
+      "loss": 3.4519,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.3316858410835266,
+      "learning_rate": 0.0005074940250655786,
+      "loss": 3.4564,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.3109246790409088,
+      "learning_rate": 0.0005073191489361701,
+      "loss": 3.4531,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.33062729239463806,
+      "learning_rate": 0.0005071442728067618,
+      "loss": 3.4741,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.31517359614372253,
+      "learning_rate": 0.0005069693966773535,
+      "loss": 3.4568,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.3394618332386017,
+      "learning_rate": 0.0005067945205479451,
+      "loss": 3.4625,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.3200554847717285,
+      "learning_rate": 0.0005066196444185368,
+      "loss": 3.4538,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.30703428387641907,
+      "learning_rate": 0.0005064447682891285,
+      "loss": 3.4588,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.31843116879463196,
+      "learning_rate": 0.0005062698921597202,
+      "loss": 3.4522,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.3061220943927765,
+      "learning_rate": 0.0005060950160303119,
+      "loss": 3.4653,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.3118315637111664,
+      "learning_rate": 0.0005059201399009035,
+      "loss": 3.4522,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.3663362762981308,
+      "eval_loss": 3.5790724754333496,
+      "eval_runtime": 180.2755,
+      "eval_samples_per_second": 92.331,
+      "eval_steps_per_second": 5.774,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.31885090470314026,
+      "learning_rate": 0.0005057452637714951,
+      "loss": 3.4557,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.29833894968032837,
+      "learning_rate": 0.0005055703876420868,
+      "loss": 3.4556,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.3107469975948334,
+      "learning_rate": 0.0005053955115126785,
+      "loss": 3.4487,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.3197011649608612,
+      "learning_rate": 0.0005052206353832701,
+      "loss": 3.4516,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.3332655727863312,
+      "learning_rate": 0.0005050457592538618,
+      "loss": 3.4542,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.3343665301799774,
+      "learning_rate": 0.0005048708831244535,
+      "loss": 3.4726,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.33328115940093994,
+      "learning_rate": 0.0005046960069950451,
+      "loss": 3.4682,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.30137884616851807,
+      "learning_rate": 0.0005045211308656369,
+      "loss": 3.457,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.33149558305740356,
+      "learning_rate": 0.0005043462547362284,
+      "loss": 3.4767,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.32484644651412964,
+      "learning_rate": 0.0005041713786068201,
+      "loss": 3.3776,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.3401671051979065,
+      "learning_rate": 0.0005039965024774118,
+      "loss": 3.3496,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.32481470704078674,
+      "learning_rate": 0.0005038216263480034,
+      "loss": 3.3514,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.33603399991989136,
+      "learning_rate": 0.0005036467502185951,
+      "loss": 3.3551,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.3104502558708191,
+      "learning_rate": 0.0005034718740891868,
+      "loss": 3.3531,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.3129752278327942,
+      "learning_rate": 0.0005032969979597785,
+      "loss": 3.3728,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.3313758075237274,
+      "learning_rate": 0.0005031221218303701,
+      "loss": 3.3705,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.3319389522075653,
+      "learning_rate": 0.0005029472457009618,
+      "loss": 3.3781,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.33893296122550964,
+      "learning_rate": 0.0005027723695715534,
+      "loss": 3.3757,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.34440794587135315,
+      "learning_rate": 0.0005025974934421451,
+      "loss": 3.3669,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.3383232057094574,
+      "learning_rate": 0.0005024226173127368,
+      "loss": 3.3902,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.36587481534213656,
+      "eval_loss": 3.5893173217773438,
+      "eval_runtime": 180.5544,
+      "eval_samples_per_second": 92.188,
+      "eval_steps_per_second": 5.766,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.3452877104282379,
+      "learning_rate": 0.0005022477411833284,
+      "loss": 3.3954,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.31373968720436096,
+      "learning_rate": 0.0005020728650539201,
+      "loss": 3.3883,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.3412425220012665,
+      "learning_rate": 0.0005018979889245118,
+      "loss": 3.3806,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.3370135724544525,
+      "learning_rate": 0.0005017231127951034,
+      "loss": 3.3744,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.32666200399398804,
+      "learning_rate": 0.0005015482366656951,
+      "loss": 3.3909,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.35053926706314087,
+      "learning_rate": 0.0005013733605362868,
+      "loss": 3.3971,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.32368576526641846,
+      "learning_rate": 0.0005011984844068784,
+      "loss": 3.3959,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.3504972755908966,
+      "learning_rate": 0.0005010236082774701,
+      "loss": 3.4046,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.3331746459007263,
+      "learning_rate": 0.0005008487321480617,
+      "loss": 3.3945,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.33394086360931396,
+      "learning_rate": 0.0005006738560186534,
+      "loss": 3.3922,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.3699552118778229,
+      "learning_rate": 0.0005004989798892451,
+      "loss": 3.3974,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.3196350038051605,
+      "learning_rate": 0.0005003241037598368,
+      "loss": 3.3833,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.3315056264400482,
+      "learning_rate": 0.0005001492276304284,
+      "loss": 3.4032,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.322210431098938,
+      "learning_rate": 0.0004999743515010201,
+      "loss": 3.4025,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.31940722465515137,
+      "learning_rate": 0.0004997994753716117,
+      "loss": 3.4013,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.3116293251514435,
+      "learning_rate": 0.0004996245992422033,
+      "loss": 3.4114,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.33897754549980164,
+      "learning_rate": 0.0004994497231127951,
+      "loss": 3.409,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.3248927891254425,
+      "learning_rate": 0.0004992748469833867,
+      "loss": 3.4177,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.34883642196655273,
+      "learning_rate": 0.0004990999708539784,
+      "loss": 3.412,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.3144770860671997,
+      "learning_rate": 0.0004989250947245701,
+      "loss": 3.4024,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.36662279358469335,
+      "eval_loss": 3.5800533294677734,
+      "eval_runtime": 180.2514,
+      "eval_samples_per_second": 92.343,
+      "eval_steps_per_second": 5.775,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.3243623375892639,
+      "learning_rate": 0.0004987502185951617,
+      "loss": 3.4076,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.33650779724121094,
+      "learning_rate": 0.0004985753424657534,
+      "loss": 3.4107,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.3219531178474426,
+      "learning_rate": 0.000498400466336345,
+      "loss": 3.4279,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.33923518657684326,
+      "learning_rate": 0.0004982255902069367,
+      "loss": 3.4111,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.3253156244754791,
+      "learning_rate": 0.0004980507140775283,
+      "loss": 3.4206,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.32380804419517517,
+      "learning_rate": 0.0004978758379481201,
+      "loss": 3.418,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.3461204767227173,
+      "learning_rate": 0.0004977009618187117,
+      "loss": 3.423,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.31913691759109497,
+      "learning_rate": 0.0004975260856893034,
+      "loss": 3.4112,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.3127973675727844,
+      "learning_rate": 0.0004973512095598951,
+      "loss": 3.4102,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.35194820165634155,
+      "learning_rate": 0.0004971763334304867,
+      "loss": 3.4137,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.322832852602005,
+      "learning_rate": 0.0004970014573010784,
+      "loss": 3.4251,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.3397376537322998,
+      "learning_rate": 0.00049682658117167,
+      "loss": 3.4134,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.34631067514419556,
+      "learning_rate": 0.0004966517050422616,
+      "loss": 3.4247,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.30945879220962524,
+      "learning_rate": 0.0004964768289128533,
+      "loss": 3.4248,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.3211749792098999,
+      "learning_rate": 0.000496301952783445,
+      "loss": 3.4199,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.3627347946166992,
+      "learning_rate": 0.0004961270766540367,
+      "loss": 3.4054,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.32827696204185486,
+      "learning_rate": 0.0004959522005246284,
+      "loss": 3.4218,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.3287445902824402,
+      "learning_rate": 0.00049577732439522,
+      "loss": 3.4173,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.3574911952018738,
+      "learning_rate": 0.0004956024482658117,
+      "loss": 3.4242,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.33523955941200256,
+      "learning_rate": 0.0004954275721364034,
+      "loss": 3.4241,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.36724073977187954,
+      "eval_loss": 3.5715675354003906,
+      "eval_runtime": 184.6641,
+      "eval_samples_per_second": 90.137,
+      "eval_steps_per_second": 5.637,
+      "step": 30000
+    },
+    {
+      "epoch": 8.753320904218132,
+      "grad_norm": 0.3205890655517578,
+      "learning_rate": 0.000495252696006995,
+      "loss": 3.4306,
+      "step": 30050
+    },
+    {
+      "epoch": 8.767886273595899,
+      "grad_norm": 0.37505707144737244,
+      "learning_rate": 0.0004950778198775866,
+      "loss": 3.431,
+      "step": 30100
+    },
+    {
+      "epoch": 8.782451642973665,
+      "grad_norm": 0.34911301732063293,
+      "learning_rate": 0.0004949029437481783,
+      "loss": 3.4269,
+      "step": 30150
+    },
+    {
+      "epoch": 8.797017012351434,
+      "grad_norm": 0.3340999186038971,
+      "learning_rate": 0.00049472806761877,
+      "loss": 3.4223,
+      "step": 30200
+    },
+    {
+      "epoch": 8.8115823817292,
+      "grad_norm": 0.34287843108177185,
+      "learning_rate": 0.0004945531914893616,
+      "loss": 3.431,
+      "step": 30250
+    },
+    {
+      "epoch": 8.826147751106967,
+      "grad_norm": 0.3214537799358368,
+      "learning_rate": 0.0004943783153599534,
+      "loss": 3.428,
+      "step": 30300
+    },
+    {
+      "epoch": 8.840713120484736,
+      "grad_norm": 0.33591654896736145,
+      "learning_rate": 0.000494203439230545,
+      "loss": 3.4299,
+      "step": 30350
+    },
+    {
+      "epoch": 8.855278489862503,
+      "grad_norm": 0.3329029679298401,
+      "learning_rate": 0.0004940285631011367,
+      "loss": 3.4187,
+      "step": 30400
+    },
+    {
+      "epoch": 8.86984385924027,
+      "grad_norm": 0.3495054543018341,
+      "learning_rate": 0.0004938536869717284,
+      "loss": 3.4225,
+      "step": 30450
+    },
+    {
+      "epoch": 8.884409228618038,
+      "grad_norm": 0.32809290289878845,
+      "learning_rate": 0.0004936788108423199,
+      "loss": 3.4434,
+      "step": 30500
+    },
+    {
+      "epoch": 8.898974597995805,
+      "grad_norm": 0.3029363751411438,
+      "learning_rate": 0.0004935039347129116,
+      "loss": 3.4226,
+      "step": 30550
+    },
+    {
+      "epoch": 8.913539967373573,
+      "grad_norm": 0.3327055871486664,
+      "learning_rate": 0.0004933290585835033,
+      "loss": 3.4335,
+      "step": 30600
+    },
+    {
+      "epoch": 8.92810533675134,
+      "grad_norm": 0.3230193555355072,
+      "learning_rate": 0.000493154182454095,
+      "loss": 3.4221,
+      "step": 30650
+    },
+    {
+      "epoch": 8.942670706129107,
+      "grad_norm": 0.3522266745567322,
+      "learning_rate": 0.0004929793063246866,
+      "loss": 3.4195,
+      "step": 30700
+    },
+    {
+      "epoch": 8.957236075506875,
+      "grad_norm": 0.32179632782936096,
+      "learning_rate": 0.0004928044301952783,
+      "loss": 3.437,
+      "step": 30750
+    },
+    {
+      "epoch": 8.971801444884642,
+      "grad_norm": 0.32391178607940674,
+      "learning_rate": 0.00049262955406587,
+      "loss": 3.4254,
+      "step": 30800
+    },
+    {
+      "epoch": 8.986366814262409,
+      "grad_norm": 0.31046491861343384,
+      "learning_rate": 0.0004924546779364617,
+      "loss": 3.4317,
+      "step": 30850
+    },
+    {
+      "epoch": 9.000873922162667,
+      "grad_norm": 0.341305673122406,
+      "learning_rate": 0.0004922798018070533,
+      "loss": 3.4248,
+      "step": 30900
+    },
+    {
+      "epoch": 9.015439291540433,
+      "grad_norm": 0.3179487884044647,
+      "learning_rate": 0.0004921049256776449,
+      "loss": 3.317,
+      "step": 30950
+    },
+    {
+      "epoch": 9.0300046609182,
+      "grad_norm": 0.3541235029697418,
+      "learning_rate": 0.0004919300495482366,
+      "loss": 3.3306,
+      "step": 31000
+    },
+    {
+      "epoch": 9.0300046609182,
+      "eval_accuracy": 0.3668760386545562,
+      "eval_loss": 3.5815300941467285,
+      "eval_runtime": 180.3287,
+      "eval_samples_per_second": 92.304,
+      "eval_steps_per_second": 5.773,
+      "step": 31000
+    },
+    {
+      "epoch": 9.044570030295969,
+      "grad_norm": 0.3544383943080902,
+      "learning_rate": 0.0004917551734188283,
+      "loss": 3.3247,
+      "step": 31050
+    },
+    {
+      "epoch": 9.059135399673735,
+      "grad_norm": 0.32461535930633545,
+      "learning_rate": 0.0004915802972894199,
+      "loss": 3.3379,
+      "step": 31100
+    },
+    {
+      "epoch": 9.073700769051504,
+      "grad_norm": 0.3214743137359619,
+      "learning_rate": 0.0004914054211600116,
+      "loss": 3.3344,
+      "step": 31150
+    },
+    {
+      "epoch": 9.08826613842927,
+      "grad_norm": 0.3329453468322754,
+      "learning_rate": 0.0004912305450306033,
+      "loss": 3.3366,
+      "step": 31200
+    },
+    {
+      "epoch": 9.102831507807037,
+      "grad_norm": 0.33429789543151855,
+      "learning_rate": 0.000491055668901195,
+      "loss": 3.3535,
+      "step": 31250
+    },
+    {
+      "epoch": 9.117396877184806,
+      "grad_norm": 0.3591921031475067,
+      "learning_rate": 0.0004908807927717865,
+      "loss": 3.3407,
+      "step": 31300
+    },
+    {
+      "epoch": 9.131962246562573,
+      "grad_norm": 0.3514149785041809,
+      "learning_rate": 0.0004907059166423783,
+      "loss": 3.3391,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14652761594034,
+      "grad_norm": 0.3230554163455963,
+      "learning_rate": 0.0004905310405129699,
+      "loss": 3.36,
+      "step": 31400
+    },
+    {
+      "epoch": 9.161092985318108,
+      "grad_norm": 0.3351898193359375,
+      "learning_rate": 0.0004903561643835616,
+      "loss": 3.3516,
+      "step": 31450
+    },
+    {
+      "epoch": 9.175658354695875,
+      "grad_norm": 0.32898926734924316,
+      "learning_rate": 0.0004901812882541533,
+      "loss": 3.3637,
+      "step": 31500
+    },
+    {
+      "epoch": 9.190223724073643,
+      "grad_norm": 0.37594887614250183,
+      "learning_rate": 0.0004900064121247449,
+      "loss": 3.3559,
+      "step": 31550
+    },
+    {
+      "epoch": 9.20478909345141,
+      "grad_norm": 0.3707999289035797,
+      "learning_rate": 0.0004898315359953366,
+      "loss": 3.3511,
+      "step": 31600
+    },
+    {
+      "epoch": 9.219354462829177,
+      "grad_norm": 0.34326040744781494,
+      "learning_rate": 0.0004896566598659283,
+      "loss": 3.3577,
+      "step": 31650
+    },
+    {
+      "epoch": 9.233919832206945,
+      "grad_norm": 0.3109116554260254,
+      "learning_rate": 0.0004894817837365199,
+      "loss": 3.3553,
+      "step": 31700
+    },
+    {
+      "epoch": 9.248485201584712,
+      "grad_norm": 0.343200147151947,
+      "learning_rate": 0.0004893069076071115,
+      "loss": 3.3755,
+      "step": 31750
+    },
+    {
+      "epoch": 9.263050570962479,
+      "grad_norm": 0.34142202138900757,
+      "learning_rate": 0.0004891320314777032,
+      "loss": 3.3705,
+      "step": 31800
+    },
+    {
+      "epoch": 9.277615940340247,
+      "grad_norm": 0.35971346497535706,
+      "learning_rate": 0.0004889571553482949,
+      "loss": 3.3715,
+      "step": 31850
+    },
+    {
+      "epoch": 9.292181309718014,
+      "grad_norm": 0.3268764913082123,
+      "learning_rate": 0.0004887822792188866,
+      "loss": 3.3794,
+      "step": 31900
+    },
+    {
+      "epoch": 9.306746679095783,
+      "grad_norm": 0.3492770493030548,
+      "learning_rate": 0.0004886074030894782,
+      "loss": 3.3712,
+      "step": 31950
+    },
+    {
+      "epoch": 9.32131204847355,
+      "grad_norm": 0.34961649775505066,
+      "learning_rate": 0.0004884325269600699,
+      "loss": 3.3808,
+      "step": 32000
+    },
+    {
+      "epoch": 9.32131204847355,
+      "eval_accuracy": 0.36758063368876603,
+      "eval_loss": 3.577639102935791,
+      "eval_runtime": 180.0042,
+      "eval_samples_per_second": 92.47,
+      "eval_steps_per_second": 5.783,
+      "step": 32000
+    },
+    {
+      "epoch": 9.335877417851316,
+      "grad_norm": 0.354534387588501,
+      "learning_rate": 0.0004882576508306615,
+      "loss": 3.3778,
+      "step": 32050
+    },
+    {
+      "epoch": 9.350442787229085,
+      "grad_norm": 0.3517109453678131,
+      "learning_rate": 0.00048808277470125327,
+      "loss": 3.3701,
+      "step": 32100
+    },
+    {
+      "epoch": 9.365008156606851,
+      "grad_norm": 0.333987295627594,
+      "learning_rate": 0.0004879078985718449,
+      "loss": 3.3867,
+      "step": 32150
+    },
+    {
+      "epoch": 9.379573525984618,
+      "grad_norm": 0.34107351303100586,
+      "learning_rate": 0.0004877330224424366,
+      "loss": 3.374,
+      "step": 32200
+    },
+    {
+      "epoch": 9.394138895362387,
+      "grad_norm": 0.3421863615512848,
+      "learning_rate": 0.00048755814631302823,
+      "loss": 3.377,
+      "step": 32250
+    },
+    {
+      "epoch": 9.408704264740154,
+      "grad_norm": 0.3399357795715332,
+      "learning_rate": 0.00048738327018361987,
+      "loss": 3.3739,
+      "step": 32300
+    },
+    {
+      "epoch": 9.423269634117922,
+      "grad_norm": 0.3419632613658905,
+      "learning_rate": 0.00048720839405421156,
+      "loss": 3.381,
+      "step": 32350
+    },
+    {
+      "epoch": 9.437835003495689,
+      "grad_norm": 0.3296915292739868,
+      "learning_rate": 0.0004870335179248032,
+      "loss": 3.3788,
+      "step": 32400
+    },
+    {
+      "epoch": 9.452400372873456,
+      "grad_norm": 0.3411202132701874,
+      "learning_rate": 0.0004868586417953949,
+      "loss": 3.3895,
+      "step": 32450
+    },
+    {
+      "epoch": 9.466965742251224,
+      "grad_norm": 0.35072267055511475,
+      "learning_rate": 0.0004866837656659865,
+      "loss": 3.3781,
+      "step": 32500
+    },
+    {
+      "epoch": 9.48153111162899,
+      "grad_norm": 0.3408583700656891,
+      "learning_rate": 0.00048650888953657816,
+      "loss": 3.3765,
+      "step": 32550
+    },
+    {
+      "epoch": 9.496096481006758,
+      "grad_norm": 0.3342673182487488,
+      "learning_rate": 0.0004863340134071699,
+      "loss": 3.384,
+      "step": 32600
+    },
+    {
+      "epoch": 9.510661850384526,
+      "grad_norm": 0.3424014151096344,
+      "learning_rate": 0.00048615913727776154,
+      "loss": 3.3787,
+      "step": 32650
+    },
+    {
+      "epoch": 9.525227219762293,
+      "grad_norm": 0.3425311744213104,
+      "learning_rate": 0.00048598426114835323,
+      "loss": 3.3784,
+      "step": 32700
+    },
+    {
+      "epoch": 9.53979258914006,
+      "grad_norm": 0.35819771885871887,
+      "learning_rate": 0.00048580938501894486,
+      "loss": 3.3871,
+      "step": 32750
+    },
+    {
+      "epoch": 9.554357958517828,
+      "grad_norm": 0.3209281265735626,
+      "learning_rate": 0.00048563450888953655,
+      "loss": 3.3878,
+      "step": 32800
+    },
+    {
+      "epoch": 9.568923327895595,
+      "grad_norm": 0.35813918709754944,
+      "learning_rate": 0.0004854596327601282,
+      "loss": 3.3865,
+      "step": 32850
+    },
+    {
+      "epoch": 9.583488697273363,
+      "grad_norm": 0.3399278521537781,
+      "learning_rate": 0.0004852847566307198,
+      "loss": 3.3913,
+      "step": 32900
+    },
+    {
+      "epoch": 9.59805406665113,
+      "grad_norm": 0.3297010362148285,
+      "learning_rate": 0.0004851098805013115,
+      "loss": 3.3961,
+      "step": 32950
+    },
+    {
+      "epoch": 9.612619436028897,
+      "grad_norm": 0.3605840504169464,
+      "learning_rate": 0.00048493500437190315,
+      "loss": 3.3982,
+      "step": 33000
+    },
+    {
+      "epoch": 9.612619436028897,
+      "eval_accuracy": 0.3680854778531073,
+      "eval_loss": 3.5693137645721436,
+      "eval_runtime": 181.9882,
+      "eval_samples_per_second": 91.462,
+      "eval_steps_per_second": 5.72,
+      "step": 33000
+    },
+    {
+      "epoch": 9.627184805406666,
+      "grad_norm": 0.3284258246421814,
+      "learning_rate": 0.0004847601282424949,
+      "loss": 3.3981,
+      "step": 33050
+    },
+    {
+      "epoch": 9.641750174784432,
+      "grad_norm": 0.3391687273979187,
+      "learning_rate": 0.00048458525211308653,
+      "loss": 3.4001,
+      "step": 33100
+    },
+    {
+      "epoch": 9.6563155441622,
+      "grad_norm": 0.33698102831840515,
+      "learning_rate": 0.00048441037598367817,
+      "loss": 3.3997,
+      "step": 33150
+    },
+    {
+      "epoch": 9.670880913539968,
+      "grad_norm": 0.34788277745246887,
+      "learning_rate": 0.00048423549985426986,
+      "loss": 3.4001,
+      "step": 33200
+    },
+    {
+      "epoch": 9.685446282917734,
+      "grad_norm": 0.3518361747264862,
+      "learning_rate": 0.0004840606237248615,
+      "loss": 3.3944,
+      "step": 33250
+    },
+    {
+      "epoch": 9.700011652295503,
+      "grad_norm": 0.3476645052433014,
+      "learning_rate": 0.0004838857475954532,
+      "loss": 3.4001,
+      "step": 33300
+    },
+    {
+      "epoch": 9.71457702167327,
+      "grad_norm": 0.34337055683135986,
+      "learning_rate": 0.0004837108714660448,
+      "loss": 3.3963,
+      "step": 33350
+    },
+    {
+      "epoch": 9.729142391051036,
+      "grad_norm": 0.37030595541000366,
+      "learning_rate": 0.0004835359953366365,
+      "loss": 3.4053,
+      "step": 33400
+    },
+    {
+      "epoch": 9.743707760428805,
+      "grad_norm": 0.343307763338089,
+      "learning_rate": 0.00048336111920722815,
+      "loss": 3.4102,
+      "step": 33450
+    },
+    {
+      "epoch": 9.758273129806572,
+      "grad_norm": 0.3450109660625458,
+      "learning_rate": 0.0004831862430778198,
+      "loss": 3.3953,
+      "step": 33500
+    },
+    {
+      "epoch": 9.772838499184338,
+      "grad_norm": 0.3278052508831024,
+      "learning_rate": 0.00048301136694841153,
+      "loss": 3.3951,
+      "step": 33550
+    },
+    {
+      "epoch": 9.787403868562107,
+      "grad_norm": 0.33179140090942383,
+      "learning_rate": 0.00048283649081900317,
+      "loss": 3.4122,
+      "step": 33600
+    },
+    {
+      "epoch": 9.801969237939874,
+      "grad_norm": 0.32797035574913025,
+      "learning_rate": 0.00048266161468959486,
+      "loss": 3.4051,
+      "step": 33650
+    },
+    {
+      "epoch": 9.816534607317642,
+      "grad_norm": 0.3269261419773102,
+      "learning_rate": 0.0004824867385601865,
+      "loss": 3.4172,
+      "step": 33700
+    },
+    {
+      "epoch": 9.831099976695409,
+      "grad_norm": 0.3510269522666931,
+      "learning_rate": 0.00048231186243077813,
+      "loss": 3.4084,
+      "step": 33750
+    },
+    {
+      "epoch": 9.845665346073176,
+      "grad_norm": 0.31174078583717346,
+      "learning_rate": 0.0004821369863013698,
+      "loss": 3.3988,
+      "step": 33800
+    },
+    {
+      "epoch": 9.860230715450944,
+      "grad_norm": 0.3384522497653961,
+      "learning_rate": 0.00048196211017196146,
+      "loss": 3.4053,
+      "step": 33850
+    },
+    {
+      "epoch": 9.874796084828711,
+      "grad_norm": 0.3284479081630707,
+      "learning_rate": 0.00048178723404255315,
+      "loss": 3.4068,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88936145420648,
+      "grad_norm": 0.3625960052013397,
+      "learning_rate": 0.0004816123579131448,
+      "loss": 3.4109,
+      "step": 33950
+    },
+    {
+      "epoch": 9.903926823584246,
+      "grad_norm": 0.31597065925598145,
+      "learning_rate": 0.0004814374817837364,
+      "loss": 3.4131,
+      "step": 34000
+    },
+    {
+      "epoch": 9.903926823584246,
+      "eval_accuracy": 0.3685431765796514,
+      "eval_loss": 3.559751510620117,
+      "eval_runtime": 180.1509,
+      "eval_samples_per_second": 92.395,
+      "eval_steps_per_second": 5.778,
+      "step": 34000
+    },
+    {
+      "epoch": 9.918492192962013,
+      "grad_norm": 0.3383018672466278,
+      "learning_rate": 0.00048126260565432816,
+      "loss": 3.3986,
+      "step": 34050
+    },
+    {
+      "epoch": 9.933057562339782,
+      "grad_norm": 0.3599507510662079,
+      "learning_rate": 0.0004810877295249198,
+      "loss": 3.4032,
+      "step": 34100
+    },
+    {
+      "epoch": 9.947622931717548,
+      "grad_norm": 0.31639835238456726,
+      "learning_rate": 0.0004809128533955115,
+      "loss": 3.4118,
+      "step": 34150
+    },
+    {
+      "epoch": 9.962188301095315,
+      "grad_norm": 0.3275424540042877,
+      "learning_rate": 0.0004807379772661031,
+      "loss": 3.407,
+      "step": 34200
+    },
+    {
+      "epoch": 9.976753670473084,
+      "grad_norm": 0.33506444096565247,
+      "learning_rate": 0.0004805631011366948,
+      "loss": 3.4083,
+      "step": 34250
+    },
+    {
+      "epoch": 9.99131903985085,
+      "grad_norm": 0.3339884877204895,
+      "learning_rate": 0.00048038822500728645,
+      "loss": 3.4199,
+      "step": 34300
+    },
+    {
+      "epoch": 10.005826147751106,
+      "grad_norm": 0.35206592082977295,
+      "learning_rate": 0.0004802133488778781,
+      "loss": 3.3579,
+      "step": 34350
+    },
+    {
+      "epoch": 10.020391517128875,
+      "grad_norm": 0.3488442301750183,
+      "learning_rate": 0.0004800384727484698,
+      "loss": 3.2863,
+      "step": 34400
+    },
+    {
+      "epoch": 10.034956886506642,
+      "grad_norm": 0.35724684596061707,
+      "learning_rate": 0.0004798635966190614,
+      "loss": 3.3027,
+      "step": 34450
+    },
+    {
+      "epoch": 10.049522255884408,
+      "grad_norm": 0.3650865852832794,
+      "learning_rate": 0.00047968872048965316,
+      "loss": 3.315,
+      "step": 34500
+    },
+    {
+      "epoch": 10.064087625262177,
+      "grad_norm": 0.34090808033943176,
+      "learning_rate": 0.0004795138443602448,
+      "loss": 3.3156,
+      "step": 34550
+    },
+    {
+      "epoch": 10.078652994639944,
+      "grad_norm": 0.3337085247039795,
+      "learning_rate": 0.00047933896823083643,
+      "loss": 3.3165,
+      "step": 34600
+    },
+    {
+      "epoch": 10.093218364017712,
+      "grad_norm": 0.3569350838661194,
+      "learning_rate": 0.0004791640921014281,
+      "loss": 3.3215,
+      "step": 34650
+    },
+    {
+      "epoch": 10.107783733395479,
+      "grad_norm": 0.35704872012138367,
+      "learning_rate": 0.00047898921597201976,
+      "loss": 3.3143,
+      "step": 34700
+    },
+    {
+      "epoch": 10.122349102773246,
+      "grad_norm": 0.3637121617794037,
+      "learning_rate": 0.00047881433984261145,
+      "loss": 3.3123,
+      "step": 34750
+    },
+    {
+      "epoch": 10.136914472151014,
+      "grad_norm": 0.37336090207099915,
+      "learning_rate": 0.0004786394637132031,
+      "loss": 3.3213,
+      "step": 34800
+    },
+    {
+      "epoch": 10.151479841528781,
+      "grad_norm": 0.35010263323783875,
+      "learning_rate": 0.0004784645875837948,
+      "loss": 3.3336,
+      "step": 34850
+    },
+    {
+      "epoch": 10.166045210906548,
+      "grad_norm": 0.33684661984443665,
+      "learning_rate": 0.0004782897114543864,
+      "loss": 3.3302,
+      "step": 34900
+    },
+    {
+      "epoch": 10.180610580284316,
+      "grad_norm": 0.34124529361724854,
+      "learning_rate": 0.00047811483532497805,
+      "loss": 3.3294,
+      "step": 34950
+    },
+    {
+      "epoch": 10.195175949662083,
+      "grad_norm": 0.35172775387763977,
+      "learning_rate": 0.0004779399591955698,
+      "loss": 3.3494,
+      "step": 35000
+    },
+    {
+      "epoch": 10.195175949662083,
+      "eval_accuracy": 0.36845782099900126,
+      "eval_loss": 3.5712480545043945,
+      "eval_runtime": 180.0647,
+      "eval_samples_per_second": 92.439,
+      "eval_steps_per_second": 5.781,
+      "step": 35000
+    },
+    {
+      "epoch": 10.209741319039852,
+      "grad_norm": 0.33524951338768005,
+      "learning_rate": 0.00047776508306616143,
+      "loss": 3.3295,
+      "step": 35050
+    },
+    {
+      "epoch": 10.224306688417618,
+      "grad_norm": 0.32747432589530945,
+      "learning_rate": 0.0004775902069367531,
+      "loss": 3.3427,
+      "step": 35100
+    },
+    {
+      "epoch": 10.238872057795385,
+      "grad_norm": 0.37668707966804504,
+      "learning_rate": 0.00047741533080734476,
+      "loss": 3.3447,
+      "step": 35150
+    },
+    {
+      "epoch": 10.253437427173154,
+      "grad_norm": 0.35878944396972656,
+      "learning_rate": 0.0004772404546779364,
+      "loss": 3.3322,
+      "step": 35200
+    },
+    {
+      "epoch": 10.26800279655092,
+      "grad_norm": 0.343294233083725,
+      "learning_rate": 0.0004770655785485281,
+      "loss": 3.3489,
+      "step": 35250
+    },
+    {
+      "epoch": 10.282568165928687,
+      "grad_norm": 0.35359030961990356,
+      "learning_rate": 0.0004768907024191197,
+      "loss": 3.3337,
+      "step": 35300
+    },
+    {
+      "epoch": 10.297133535306456,
+      "grad_norm": 0.3531194031238556,
+      "learning_rate": 0.0004767158262897114,
+      "loss": 3.3502,
+      "step": 35350
+    },
+    {
+      "epoch": 10.311698904684222,
+      "grad_norm": 0.34489333629608154,
+      "learning_rate": 0.00047654095016030305,
+      "loss": 3.3373,
+      "step": 35400
+    },
+    {
+      "epoch": 10.326264274061991,
+      "grad_norm": 0.35143405199050903,
+      "learning_rate": 0.0004763660740308948,
+      "loss": 3.3543,
+      "step": 35450
+    },
+    {
+      "epoch": 10.340829643439758,
+      "grad_norm": 0.345109760761261,
+      "learning_rate": 0.0004761911979014864,
+      "loss": 3.3522,
+      "step": 35500
+    },
+    {
+      "epoch": 10.355395012817525,
+      "grad_norm": 0.3299683928489685,
+      "learning_rate": 0.00047601632177207806,
+      "loss": 3.3578,
+      "step": 35550
+    },
+    {
+      "epoch": 10.369960382195293,
+      "grad_norm": 0.3354707360267639,
+      "learning_rate": 0.00047584144564266975,
+      "loss": 3.3613,
+      "step": 35600
+    },
+    {
+      "epoch": 10.38452575157306,
+      "grad_norm": 0.36335381865501404,
+      "learning_rate": 0.0004756665695132614,
+      "loss": 3.3644,
+      "step": 35650
+    },
+    {
+      "epoch": 10.399091120950827,
+      "grad_norm": 0.3393089771270752,
+      "learning_rate": 0.0004754916933838531,
+      "loss": 3.3578,
+      "step": 35700
+    },
+    {
+      "epoch": 10.413656490328595,
+      "grad_norm": 0.3224678337574005,
+      "learning_rate": 0.0004753168172544447,
+      "loss": 3.3585,
+      "step": 35750
+    },
+    {
+      "epoch": 10.428221859706362,
+      "grad_norm": 0.34779009222984314,
+      "learning_rate": 0.00047514194112503635,
+      "loss": 3.3551,
+      "step": 35800
+    },
+    {
+      "epoch": 10.44278722908413,
+      "grad_norm": 0.34187746047973633,
+      "learning_rate": 0.00047496706499562804,
+      "loss": 3.3558,
+      "step": 35850
+    },
+    {
+      "epoch": 10.457352598461897,
+      "grad_norm": 0.34698963165283203,
+      "learning_rate": 0.0004747921888662197,
+      "loss": 3.3626,
+      "step": 35900
+    },
+    {
+      "epoch": 10.471917967839664,
+      "grad_norm": 0.333051860332489,
+      "learning_rate": 0.0004746173127368114,
+      "loss": 3.3626,
+      "step": 35950
+    },
+    {
+      "epoch": 10.486483337217432,
+      "grad_norm": 0.3579745888710022,
+      "learning_rate": 0.00047444243660740306,
+      "loss": 3.3611,
+      "step": 36000
+    },
+    {
+      "epoch": 10.486483337217432,
+      "eval_accuracy": 0.3689163427132376,
+      "eval_loss": 3.564375638961792,
+      "eval_runtime": 180.2426,
+      "eval_samples_per_second": 92.348,
+      "eval_steps_per_second": 5.776,
+      "step": 36000
+    },
+    {
+      "epoch": 10.5010487065952,
+      "grad_norm": 0.3685716986656189,
+      "learning_rate": 0.0004742675604779947,
+      "loss": 3.3582,
+      "step": 36050
+    },
+    {
+      "epoch": 10.515614075972966,
+      "grad_norm": 0.3455582559108734,
+      "learning_rate": 0.0004740926843485864,
+      "loss": 3.3756,
+      "step": 36100
+    },
+    {
+      "epoch": 10.530179445350734,
+      "grad_norm": 0.34760144352912903,
+      "learning_rate": 0.000473917808219178,
+      "loss": 3.3706,
+      "step": 36150
+    },
+    {
+      "epoch": 10.544744814728501,
+      "grad_norm": 0.3610580861568451,
+      "learning_rate": 0.0004737429320897697,
+      "loss": 3.3661,
+      "step": 36200
+    },
+    {
+      "epoch": 10.55931018410627,
+      "grad_norm": 0.37062379717826843,
+      "learning_rate": 0.00047356805596036135,
+      "loss": 3.3746,
+      "step": 36250
+    },
+    {
+      "epoch": 10.573875553484037,
+      "grad_norm": 0.33157944679260254,
+      "learning_rate": 0.00047339317983095304,
+      "loss": 3.3633,
+      "step": 36300
+    },
+    {
+      "epoch": 10.588440922861803,
+      "grad_norm": 0.33509373664855957,
+      "learning_rate": 0.0004732183037015447,
+      "loss": 3.3758,
+      "step": 36350
+    },
+    {
+      "epoch": 10.603006292239572,
+      "grad_norm": 0.33611050248146057,
+      "learning_rate": 0.0004730434275721363,
+      "loss": 3.3778,
+      "step": 36400
+    },
+    {
+      "epoch": 10.617571661617339,
+      "grad_norm": 0.34995800256729126,
+      "learning_rate": 0.00047286855144272806,
+      "loss": 3.3576,
+      "step": 36450
+    },
+    {
+      "epoch": 10.632137030995105,
+      "grad_norm": 0.3561765253543854,
+      "learning_rate": 0.0004726936753133197,
+      "loss": 3.376,
+      "step": 36500
+    },
+    {
+      "epoch": 10.646702400372874,
+      "grad_norm": 0.34412702918052673,
+      "learning_rate": 0.0004725187991839114,
+      "loss": 3.3736,
+      "step": 36550
+    },
+    {
+      "epoch": 10.66126776975064,
+      "grad_norm": 0.34151721000671387,
+      "learning_rate": 0.000472343923054503,
+      "loss": 3.3746,
+      "step": 36600
+    },
+    {
+      "epoch": 10.675833139128407,
+      "grad_norm": 0.3609941601753235,
+      "learning_rate": 0.00047216904692509465,
+      "loss": 3.3746,
+      "step": 36650
+    },
+    {
+      "epoch": 10.690398508506176,
+      "grad_norm": 0.31918060779571533,
+      "learning_rate": 0.00047199417079568634,
+      "loss": 3.3827,
+      "step": 36700
+    },
+    {
+      "epoch": 10.704963877883943,
+      "grad_norm": 0.35245707631111145,
+      "learning_rate": 0.000471819294666278,
+      "loss": 3.3743,
+      "step": 36750
+    },
+    {
+      "epoch": 10.719529247261711,
+      "grad_norm": 0.333524614572525,
+      "learning_rate": 0.00047164441853686967,
+      "loss": 3.3766,
+      "step": 36800
+    },
+    {
+      "epoch": 10.734094616639478,
+      "grad_norm": 0.3567027747631073,
+      "learning_rate": 0.0004714695424074613,
+      "loss": 3.3738,
+      "step": 36850
+    },
+    {
+      "epoch": 10.748659986017245,
+      "grad_norm": 0.35120370984077454,
+      "learning_rate": 0.00047129466627805305,
+      "loss": 3.3758,
+      "step": 36900
+    },
+    {
+      "epoch": 10.763225355395013,
+      "grad_norm": 0.35132184624671936,
+      "learning_rate": 0.0004711197901486447,
+      "loss": 3.3646,
+      "step": 36950
+    },
+    {
+      "epoch": 10.77779072477278,
+      "grad_norm": 0.3338523507118225,
+      "learning_rate": 0.0004709449140192363,
+      "loss": 3.3847,
+      "step": 37000
+    },
+    {
+      "epoch": 10.77779072477278,
+      "eval_accuracy": 0.3693834470134071,
+      "eval_loss": 3.5580570697784424,
+      "eval_runtime": 180.1912,
+      "eval_samples_per_second": 92.374,
+      "eval_steps_per_second": 5.777,
+      "step": 37000
+    },
+    {
+      "epoch": 10.792356094150549,
+      "grad_norm": 0.339985728263855,
+      "learning_rate": 0.000470770037889828,
+      "loss": 3.3951,
+      "step": 37050
+    },
+    {
+      "epoch": 10.806921463528315,
+      "grad_norm": 0.34119653701782227,
+      "learning_rate": 0.00047059516176041965,
+      "loss": 3.375,
+      "step": 37100
+    },
+    {
+      "epoch": 10.821486832906082,
+      "grad_norm": 0.3279690742492676,
+      "learning_rate": 0.00047042028563101134,
+      "loss": 3.3907,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83605220228385,
+      "grad_norm": 0.34971579909324646,
+      "learning_rate": 0.000470245409501603,
+      "loss": 3.3908,
+      "step": 37200
+    },
+    {
+      "epoch": 10.850617571661617,
+      "grad_norm": 0.3415312170982361,
+      "learning_rate": 0.0004700705333721946,
+      "loss": 3.3816,
+      "step": 37250
+    },
+    {
+      "epoch": 10.865182941039384,
+      "grad_norm": 0.3437788188457489,
+      "learning_rate": 0.0004698956572427863,
+      "loss": 3.3843,
+      "step": 37300
+    },
+    {
+      "epoch": 10.879748310417153,
+      "grad_norm": 0.3201417028903961,
+      "learning_rate": 0.00046972078111337794,
+      "loss": 3.3878,
+      "step": 37350
+    },
+    {
+      "epoch": 10.89431367979492,
+      "grad_norm": 0.3246171176433563,
+      "learning_rate": 0.0004695459049839697,
+      "loss": 3.3897,
+      "step": 37400
+    },
+    {
+      "epoch": 10.908879049172686,
+      "grad_norm": 0.3374573886394501,
+      "learning_rate": 0.0004693710288545613,
+      "loss": 3.3913,
+      "step": 37450
+    },
+    {
+      "epoch": 10.923444418550455,
+      "grad_norm": 0.3322266638278961,
+      "learning_rate": 0.000469196152725153,
+      "loss": 3.3877,
+      "step": 37500
+    },
+    {
+      "epoch": 10.938009787928221,
+      "grad_norm": 0.32962697744369507,
+      "learning_rate": 0.00046902127659574465,
+      "loss": 3.3924,
+      "step": 37550
+    },
+    {
+      "epoch": 10.95257515730599,
+      "grad_norm": 0.34244483709335327,
+      "learning_rate": 0.0004688464004663363,
+      "loss": 3.394,
+      "step": 37600
+    },
+    {
+      "epoch": 10.967140526683757,
+      "grad_norm": 0.3375890552997589,
+      "learning_rate": 0.000468671524336928,
+      "loss": 3.3964,
+      "step": 37650
+    },
+    {
+      "epoch": 10.981705896061523,
+      "grad_norm": 0.33612722158432007,
+      "learning_rate": 0.0004684966482075196,
+      "loss": 3.3835,
+      "step": 37700
+    },
+    {
+      "epoch": 10.996271265439292,
+      "grad_norm": 0.30385395884513855,
+      "learning_rate": 0.0004683217720781113,
+      "loss": 3.384,
+      "step": 37750
+    },
+    {
+      "epoch": 11.010778373339548,
+      "grad_norm": 0.3476569652557373,
+      "learning_rate": 0.00046814689594870294,
+      "loss": 3.3007,
+      "step": 37800
+    },
+    {
+      "epoch": 11.025343742717315,
+      "grad_norm": 0.3526822626590729,
+      "learning_rate": 0.0004679720198192946,
+      "loss": 3.2748,
+      "step": 37850
+    },
+    {
+      "epoch": 11.039909112095083,
+      "grad_norm": 0.33929720520973206,
+      "learning_rate": 0.0004677971436898863,
+      "loss": 3.2772,
+      "step": 37900
+    },
+    {
+      "epoch": 11.05447448147285,
+      "grad_norm": 0.34976741671562195,
+      "learning_rate": 0.00046762226756047795,
+      "loss": 3.2923,
+      "step": 37950
+    },
+    {
+      "epoch": 11.069039850850617,
+      "grad_norm": 0.362759530544281,
+      "learning_rate": 0.00046744739143106964,
+      "loss": 3.2988,
+      "step": 38000
+    },
+    {
+      "epoch": 11.069039850850617,
+      "eval_accuracy": 0.36924154042133445,
+      "eval_loss": 3.5658700466156006,
+      "eval_runtime": 180.2854,
+      "eval_samples_per_second": 92.326,
+      "eval_steps_per_second": 5.774,
+      "step": 38000
+    },
+    {
+      "epoch": 11.083605220228385,
+      "grad_norm": 0.33938461542129517,
+      "learning_rate": 0.0004672725153016613,
+      "loss": 3.2849,
+      "step": 38050
+    },
+    {
+      "epoch": 11.098170589606152,
+      "grad_norm": 0.32288795709609985,
+      "learning_rate": 0.00046709763917225297,
+      "loss": 3.3015,
+      "step": 38100
+    },
+    {
+      "epoch": 11.11273595898392,
+      "grad_norm": 0.3679756820201874,
+      "learning_rate": 0.0004669227630428446,
+      "loss": 3.3041,
+      "step": 38150
+    },
+    {
+      "epoch": 11.127301328361687,
+      "grad_norm": 0.3594905734062195,
+      "learning_rate": 0.00046674788691343624,
+      "loss": 3.2987,
+      "step": 38200
+    },
+    {
+      "epoch": 11.141866697739454,
+      "grad_norm": 0.3440781235694885,
+      "learning_rate": 0.00046657301078402793,
+      "loss": 3.3041,
+      "step": 38250
+    },
+    {
+      "epoch": 11.156432067117223,
+      "grad_norm": 0.35526901483535767,
+      "learning_rate": 0.00046639813465461957,
+      "loss": 3.3127,
+      "step": 38300
+    },
+    {
+      "epoch": 11.17099743649499,
+      "grad_norm": 0.3565421998500824,
+      "learning_rate": 0.0004662232585252113,
+      "loss": 3.3124,
+      "step": 38350
+    },
+    {
+      "epoch": 11.185562805872756,
+      "grad_norm": 0.3327281177043915,
+      "learning_rate": 0.00046604838239580295,
+      "loss": 3.3171,
+      "step": 38400
+    },
+    {
+      "epoch": 11.200128175250525,
+      "grad_norm": 0.35897812247276306,
+      "learning_rate": 0.0004658735062663946,
+      "loss": 3.3073,
+      "step": 38450
+    },
+    {
+      "epoch": 11.214693544628291,
+      "grad_norm": 0.36023351550102234,
+      "learning_rate": 0.0004656986301369863,
+      "loss": 3.3196,
+      "step": 38500
+    },
+    {
+      "epoch": 11.22925891400606,
+      "grad_norm": 0.3633512556552887,
+      "learning_rate": 0.0004655237540075779,
+      "loss": 3.333,
+      "step": 38550
+    },
+    {
+      "epoch": 11.243824283383827,
+      "grad_norm": 0.36204254627227783,
+      "learning_rate": 0.0004653488778781696,
+      "loss": 3.3122,
+      "step": 38600
+    },
+    {
+      "epoch": 11.258389652761593,
+      "grad_norm": 0.3590092360973358,
+      "learning_rate": 0.00046517400174876124,
+      "loss": 3.3238,
+      "step": 38650
+    },
+    {
+      "epoch": 11.272955022139362,
+      "grad_norm": 0.33919012546539307,
+      "learning_rate": 0.0004649991256193529,
+      "loss": 3.3255,
+      "step": 38700
+    },
+    {
+      "epoch": 11.287520391517129,
+      "grad_norm": 0.3492119312286377,
+      "learning_rate": 0.00046482424948994457,
+      "loss": 3.3148,
+      "step": 38750
+    },
+    {
+      "epoch": 11.302085760894895,
+      "grad_norm": 0.3562859296798706,
+      "learning_rate": 0.0004646493733605362,
+      "loss": 3.3275,
+      "step": 38800
+    },
+    {
+      "epoch": 11.316651130272664,
+      "grad_norm": 0.3738594055175781,
+      "learning_rate": 0.00046447449723112795,
+      "loss": 3.3265,
+      "step": 38850
+    },
+    {
+      "epoch": 11.33121649965043,
+      "grad_norm": 0.34161487221717834,
+      "learning_rate": 0.0004642996211017196,
+      "loss": 3.3333,
+      "step": 38900
+    },
+    {
+      "epoch": 11.3457818690282,
+      "grad_norm": 0.3317897617816925,
+      "learning_rate": 0.0004641247449723113,
+      "loss": 3.3255,
+      "step": 38950
+    },
+    {
+      "epoch": 11.360347238405966,
+      "grad_norm": 0.3387396037578583,
+      "learning_rate": 0.0004639498688429029,
+      "loss": 3.3317,
+      "step": 39000
+    },
+    {
+      "epoch": 11.360347238405966,
+      "eval_accuracy": 0.3696292851940399,
+      "eval_loss": 3.560234785079956,
+      "eval_runtime": 180.1583,
+      "eval_samples_per_second": 92.391,
+      "eval_steps_per_second": 5.778,
+      "step": 39000
+    },
+    {
+      "epoch": 11.374912607783733,
+      "grad_norm": 0.3764457106590271,
+      "learning_rate": 0.00046377499271349455,
+      "loss": 3.33,
+      "step": 39050
+    },
+    {
+      "epoch": 11.389477977161501,
+      "grad_norm": 0.3497089147567749,
+      "learning_rate": 0.00046360011658408624,
+      "loss": 3.3435,
+      "step": 39100
+    },
+    {
+      "epoch": 11.404043346539268,
+      "grad_norm": 0.3582172095775604,
+      "learning_rate": 0.00046342524045467787,
+      "loss": 3.3332,
+      "step": 39150
+    },
+    {
+      "epoch": 11.418608715917035,
+      "grad_norm": 0.3324296176433563,
+      "learning_rate": 0.00046325036432526956,
+      "loss": 3.3399,
+      "step": 39200
+    },
+    {
+      "epoch": 11.433174085294803,
+      "grad_norm": 0.33598825335502625,
+      "learning_rate": 0.0004630754881958612,
+      "loss": 3.3347,
+      "step": 39250
+    },
+    {
+      "epoch": 11.44773945467257,
+      "grad_norm": 0.3521103858947754,
+      "learning_rate": 0.00046290061206645284,
+      "loss": 3.3471,
+      "step": 39300
+    },
+    {
+      "epoch": 11.462304824050339,
+      "grad_norm": 0.364227831363678,
+      "learning_rate": 0.0004627257359370446,
+      "loss": 3.3419,
+      "step": 39350
+    },
+    {
+      "epoch": 11.476870193428105,
+      "grad_norm": 0.34679660201072693,
+      "learning_rate": 0.0004625508598076362,
+      "loss": 3.336,
+      "step": 39400
+    },
+    {
+      "epoch": 11.491435562805872,
+      "grad_norm": 0.3476027250289917,
+      "learning_rate": 0.0004623759836782279,
+      "loss": 3.336,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50600093218364,
+      "grad_norm": 0.3519826829433441,
+      "learning_rate": 0.00046220110754881954,
+      "loss": 3.3531,
+      "step": 39500
+    },
+    {
+      "epoch": 11.520566301561407,
+      "grad_norm": 0.3578990697860718,
+      "learning_rate": 0.00046202623141941123,
+      "loss": 3.3458,
+      "step": 39550
+    },
+    {
+      "epoch": 11.535131670939174,
+      "grad_norm": 0.3664044439792633,
+      "learning_rate": 0.00046185135529000287,
+      "loss": 3.3502,
+      "step": 39600
+    },
+    {
+      "epoch": 11.549697040316943,
+      "grad_norm": 0.3508833050727844,
+      "learning_rate": 0.0004616764791605945,
+      "loss": 3.354,
+      "step": 39650
+    },
+    {
+      "epoch": 11.56426240969471,
+      "grad_norm": 0.3304528295993805,
+      "learning_rate": 0.0004615016030311862,
+      "loss": 3.3462,
+      "step": 39700
+    },
+    {
+      "epoch": 11.578827779072478,
+      "grad_norm": 0.3349263668060303,
+      "learning_rate": 0.00046132672690177783,
+      "loss": 3.3452,
+      "step": 39750
+    },
+    {
+      "epoch": 11.593393148450245,
+      "grad_norm": 0.3545154631137848,
+      "learning_rate": 0.0004611518507723696,
+      "loss": 3.3546,
+      "step": 39800
+    },
+    {
+      "epoch": 11.607958517828012,
+      "grad_norm": 0.3606700301170349,
+      "learning_rate": 0.0004609769746429612,
+      "loss": 3.3595,
+      "step": 39850
+    },
+    {
+      "epoch": 11.62252388720578,
+      "grad_norm": 0.36492571234703064,
+      "learning_rate": 0.00046080209851355285,
+      "loss": 3.3623,
+      "step": 39900
+    },
+    {
+      "epoch": 11.637089256583547,
+      "grad_norm": 0.3475785255432129,
+      "learning_rate": 0.00046062722238414454,
+      "loss": 3.3574,
+      "step": 39950
+    },
+    {
+      "epoch": 11.651654625961314,
+      "grad_norm": 0.33172130584716797,
+      "learning_rate": 0.0004604523462547362,
+      "loss": 3.3584,
+      "step": 40000
+    },
+    {
+      "epoch": 11.651654625961314,
+      "eval_accuracy": 0.369777305408969,
+      "eval_loss": 3.554752826690674,
+      "eval_runtime": 180.1092,
+      "eval_samples_per_second": 92.416,
+      "eval_steps_per_second": 5.78,
+      "step": 40000
+    },
+    {
+      "epoch": 11.666219995339082,
+      "grad_norm": 0.3251342475414276,
+      "learning_rate": 0.00046027747012532787,
+      "loss": 3.3516,
+      "step": 40050
+    },
+    {
+      "epoch": 11.680785364716849,
+      "grad_norm": 0.3509256839752197,
+      "learning_rate": 0.0004601025939959195,
+      "loss": 3.3697,
+      "step": 40100
+    },
+    {
+      "epoch": 11.695350734094617,
+      "grad_norm": 0.33280062675476074,
+      "learning_rate": 0.0004599277178665112,
+      "loss": 3.3534,
+      "step": 40150
+    },
+    {
+      "epoch": 11.709916103472384,
+      "grad_norm": 0.3410335183143616,
+      "learning_rate": 0.00045975284173710283,
+      "loss": 3.358,
+      "step": 40200
+    },
+    {
+      "epoch": 11.724481472850151,
+      "grad_norm": 0.3339652419090271,
+      "learning_rate": 0.00045957796560769446,
+      "loss": 3.3617,
+      "step": 40250
+    },
+    {
+      "epoch": 11.73904684222792,
+      "grad_norm": 0.34843549132347107,
+      "learning_rate": 0.0004594030894782862,
+      "loss": 3.3564,
+      "step": 40300
+    },
+    {
+      "epoch": 11.753612211605686,
+      "grad_norm": 0.3603808879852295,
+      "learning_rate": 0.00045922821334887785,
+      "loss": 3.3563,
+      "step": 40350
+    },
+    {
+      "epoch": 11.768177580983453,
+      "grad_norm": 0.35512664914131165,
+      "learning_rate": 0.00045905333721946954,
+      "loss": 3.3686,
+      "step": 40400
+    },
+    {
+      "epoch": 11.782742950361222,
+      "grad_norm": 0.3335427939891815,
+      "learning_rate": 0.00045887846109006117,
+      "loss": 3.3768,
+      "step": 40450
+    },
+    {
+      "epoch": 11.797308319738988,
+      "grad_norm": 0.34287896752357483,
+      "learning_rate": 0.0004587035849606528,
+      "loss": 3.3561,
+      "step": 40500
+    },
+    {
+      "epoch": 11.811873689116755,
+      "grad_norm": 0.3359990417957306,
+      "learning_rate": 0.0004585287088312445,
+      "loss": 3.3572,
+      "step": 40550
+    },
+    {
+      "epoch": 11.826439058494524,
+      "grad_norm": 0.3517856299877167,
+      "learning_rate": 0.00045835383270183613,
+      "loss": 3.3605,
+      "step": 40600
+    },
+    {
+      "epoch": 11.84100442787229,
+      "grad_norm": 0.36212751269340515,
+      "learning_rate": 0.0004581789565724278,
+      "loss": 3.3682,
+      "step": 40650
+    },
+    {
+      "epoch": 11.855569797250059,
+      "grad_norm": 0.3393063545227051,
+      "learning_rate": 0.00045800408044301946,
+      "loss": 3.3724,
+      "step": 40700
+    },
+    {
+      "epoch": 11.870135166627826,
+      "grad_norm": 0.3907620310783386,
+      "learning_rate": 0.0004578292043136111,
+      "loss": 3.3667,
+      "step": 40750
+    },
+    {
+      "epoch": 11.884700536005592,
+      "grad_norm": 0.3694191873073578,
+      "learning_rate": 0.00045765432818420284,
+      "loss": 3.3633,
+      "step": 40800
+    },
+    {
+      "epoch": 11.899265905383361,
+      "grad_norm": 0.3315177261829376,
+      "learning_rate": 0.0004574794520547945,
+      "loss": 3.3846,
+      "step": 40850
+    },
+    {
+      "epoch": 11.913831274761128,
+      "grad_norm": 0.37399667501449585,
+      "learning_rate": 0.00045730457592538617,
+      "loss": 3.3641,
+      "step": 40900
+    },
+    {
+      "epoch": 11.928396644138896,
+      "grad_norm": 0.35480326414108276,
+      "learning_rate": 0.0004571296997959778,
+      "loss": 3.3734,
+      "step": 40950
+    },
+    {
+      "epoch": 11.942962013516663,
+      "grad_norm": 0.3346291482448578,
+      "learning_rate": 0.0004569548236665695,
+      "loss": 3.3695,
+      "step": 41000
+    },
+    {
+      "epoch": 11.942962013516663,
+      "eval_accuracy": 0.37045638782472007,
+      "eval_loss": 3.548724412918091,
+      "eval_runtime": 180.0916,
+      "eval_samples_per_second": 92.425,
+      "eval_steps_per_second": 5.78,
+      "step": 41000
+    },
+    {
+      "epoch": 11.95752738289443,
+      "grad_norm": 0.3365858197212219,
+      "learning_rate": 0.00045677994753716113,
+      "loss": 3.3701,
+      "step": 41050
+    },
+    {
+      "epoch": 11.972092752272198,
+      "grad_norm": 0.3484339118003845,
+      "learning_rate": 0.00045660507140775277,
+      "loss": 3.3669,
+      "step": 41100
+    },
+    {
+      "epoch": 11.986658121649965,
+      "grad_norm": 0.35748663544654846,
+      "learning_rate": 0.00045643019527834446,
+      "loss": 3.3608,
+      "step": 41150
+    },
+    {
+      "epoch": 12.001165229550221,
+      "grad_norm": 0.3389853537082672,
+      "learning_rate": 0.0004562553191489361,
+      "loss": 3.3647,
+      "step": 41200
+    },
+    {
+      "epoch": 12.01573059892799,
+      "grad_norm": 0.3526709973812103,
+      "learning_rate": 0.00045608044301952784,
+      "loss": 3.2626,
+      "step": 41250
+    },
+    {
+      "epoch": 12.030295968305756,
+      "grad_norm": 0.3826402425765991,
+      "learning_rate": 0.0004559055668901195,
+      "loss": 3.2511,
+      "step": 41300
+    },
+    {
+      "epoch": 12.044861337683523,
+      "grad_norm": 0.35080841183662415,
+      "learning_rate": 0.0004557306907607111,
+      "loss": 3.2537,
+      "step": 41350
+    },
+    {
+      "epoch": 12.059426707061291,
+      "grad_norm": 0.34643131494522095,
+      "learning_rate": 0.0004555558146313028,
+      "loss": 3.279,
+      "step": 41400
+    },
+    {
+      "epoch": 12.073992076439058,
+      "grad_norm": 0.35819053649902344,
+      "learning_rate": 0.00045538093850189444,
+      "loss": 3.2798,
+      "step": 41450
+    },
+    {
+      "epoch": 12.088557445816827,
+      "grad_norm": 0.35073381662368774,
+      "learning_rate": 0.00045520606237248613,
+      "loss": 3.282,
+      "step": 41500
+    },
+    {
+      "epoch": 12.103122815194594,
+      "grad_norm": 0.36466553807258606,
+      "learning_rate": 0.00045503118624307776,
+      "loss": 3.2781,
+      "step": 41550
+    },
+    {
+      "epoch": 12.11768818457236,
+      "grad_norm": 0.3526218831539154,
+      "learning_rate": 0.00045485631011366945,
+      "loss": 3.2747,
+      "step": 41600
+    },
+    {
+      "epoch": 12.132253553950129,
+      "grad_norm": 0.34587806463241577,
+      "learning_rate": 0.0004546814339842611,
+      "loss": 3.2836,
+      "step": 41650
+    },
+    {
+      "epoch": 12.146818923327896,
+      "grad_norm": 0.3643037974834442,
+      "learning_rate": 0.0004545065578548527,
+      "loss": 3.2837,
+      "step": 41700
+    },
+    {
+      "epoch": 12.161384292705662,
+      "grad_norm": 0.3476613759994507,
+      "learning_rate": 0.00045433168172544447,
+      "loss": 3.2871,
+      "step": 41750
+    },
+    {
+      "epoch": 12.17594966208343,
+      "grad_norm": 0.35576480627059937,
+      "learning_rate": 0.0004541568055960361,
+      "loss": 3.2914,
+      "step": 41800
+    },
+    {
+      "epoch": 12.190515031461198,
+      "grad_norm": 0.33326926827430725,
+      "learning_rate": 0.0004539819294666278,
+      "loss": 3.3,
+      "step": 41850
+    },
+    {
+      "epoch": 12.205080400838966,
+      "grad_norm": 0.3574065864086151,
+      "learning_rate": 0.00045380705333721943,
+      "loss": 3.296,
+      "step": 41900
+    },
+    {
+      "epoch": 12.219645770216733,
+      "grad_norm": 0.33230534195899963,
+      "learning_rate": 0.00045363217720781107,
+      "loss": 3.2943,
+      "step": 41950
+    },
+    {
+      "epoch": 12.2342111395945,
+      "grad_norm": 0.3553365170955658,
+      "learning_rate": 0.00045345730107840276,
+      "loss": 3.3134,
+      "step": 42000
+    },
+    {
+      "epoch": 12.2342111395945,
+      "eval_accuracy": 0.37017045838650914,
+      "eval_loss": 3.5607879161834717,
+      "eval_runtime": 180.0939,
+      "eval_samples_per_second": 92.424,
+      "eval_steps_per_second": 5.78,
+      "step": 42000
+    },
+    {
+      "epoch": 12.248776508972268,
+      "grad_norm": 0.3781476616859436,
+      "learning_rate": 0.0004532824249489944,
+      "loss": 3.3071,
+      "step": 42050
+    },
+    {
+      "epoch": 12.263341878350035,
+      "grad_norm": 0.34425944089889526,
+      "learning_rate": 0.0004531075488195861,
+      "loss": 3.3064,
+      "step": 42100
+    },
+    {
+      "epoch": 12.277907247727802,
+      "grad_norm": 0.3801526129245758,
+      "learning_rate": 0.0004529326726901777,
+      "loss": 3.302,
+      "step": 42150
+    },
+    {
+      "epoch": 12.29247261710557,
+      "grad_norm": 0.3539813160896301,
+      "learning_rate": 0.00045275779656076947,
+      "loss": 3.3085,
+      "step": 42200
+    },
+    {
+      "epoch": 12.307037986483337,
+      "grad_norm": 0.3623591661453247,
+      "learning_rate": 0.0004525829204313611,
+      "loss": 3.3184,
+      "step": 42250
+    },
+    {
+      "epoch": 12.321603355861104,
+      "grad_norm": 0.4033423364162445,
+      "learning_rate": 0.00045240804430195274,
+      "loss": 3.2947,
+      "step": 42300
+    },
+    {
+      "epoch": 12.336168725238872,
+      "grad_norm": 0.4017917811870575,
+      "learning_rate": 0.00045223316817254443,
+      "loss": 3.3224,
+      "step": 42350
+    },
+    {
+      "epoch": 12.350734094616639,
+      "grad_norm": 0.3531772792339325,
+      "learning_rate": 0.00045205829204313607,
+      "loss": 3.3133,
+      "step": 42400
+    },
+    {
+      "epoch": 12.365299463994408,
+      "grad_norm": 0.3572141230106354,
+      "learning_rate": 0.00045188341591372776,
+      "loss": 3.2987,
+      "step": 42450
+    },
+    {
+      "epoch": 12.379864833372174,
+      "grad_norm": 0.3440505266189575,
+      "learning_rate": 0.0004517085397843194,
+      "loss": 3.3061,
+      "step": 42500
+    },
+    {
+      "epoch": 12.394430202749941,
+      "grad_norm": 0.38698717951774597,
+      "learning_rate": 0.00045153366365491103,
+      "loss": 3.3218,
+      "step": 42550
+    },
+    {
+      "epoch": 12.40899557212771,
+      "grad_norm": 0.34767866134643555,
+      "learning_rate": 0.0004513587875255027,
+      "loss": 3.3256,
+      "step": 42600
+    },
+    {
+      "epoch": 12.423560941505476,
+      "grad_norm": 0.36052611470222473,
+      "learning_rate": 0.00045118391139609436,
+      "loss": 3.3373,
+      "step": 42650
+    },
+    {
+      "epoch": 12.438126310883243,
+      "grad_norm": 0.36833488941192627,
+      "learning_rate": 0.0004510090352666861,
+      "loss": 3.3225,
+      "step": 42700
+    },
+    {
+      "epoch": 12.452691680261012,
+      "grad_norm": 0.36538413166999817,
+      "learning_rate": 0.00045083415913727774,
+      "loss": 3.3232,
+      "step": 42750
+    },
+    {
+      "epoch": 12.467257049638778,
+      "grad_norm": 0.38341739773750305,
+      "learning_rate": 0.0004506592830078694,
+      "loss": 3.3253,
+      "step": 42800
+    },
+    {
+      "epoch": 12.481822419016547,
+      "grad_norm": 0.35237112641334534,
+      "learning_rate": 0.00045048440687846106,
+      "loss": 3.3285,
+      "step": 42850
+    },
+    {
+      "epoch": 12.496387788394314,
+      "grad_norm": 0.3648768961429596,
+      "learning_rate": 0.0004503095307490527,
+      "loss": 3.3253,
+      "step": 42900
+    },
+    {
+      "epoch": 12.51095315777208,
+      "grad_norm": 0.37519994378089905,
+      "learning_rate": 0.0004501346546196444,
+      "loss": 3.3266,
+      "step": 42950
+    },
+    {
+      "epoch": 12.525518527149849,
+      "grad_norm": 0.342909574508667,
+      "learning_rate": 0.000449959778490236,
+      "loss": 3.3341,
+      "step": 43000
+    },
+    {
+      "epoch": 12.525518527149849,
+      "eval_accuracy": 0.3702159578489218,
+      "eval_loss": 3.556406021118164,
+      "eval_runtime": 180.1154,
+      "eval_samples_per_second": 92.413,
+      "eval_steps_per_second": 5.78,
+      "step": 43000
+    },
+    {
+      "epoch": 12.540083896527616,
+      "grad_norm": 0.34145408868789673,
+      "learning_rate": 0.0004497849023608277,
+      "loss": 3.3171,
+      "step": 43050
+    },
+    {
+      "epoch": 12.554649265905383,
+      "grad_norm": 0.34982040524482727,
+      "learning_rate": 0.00044961002623141935,
+      "loss": 3.3283,
+      "step": 43100
+    },
+    {
+      "epoch": 12.569214635283151,
+      "grad_norm": 0.35648754239082336,
+      "learning_rate": 0.000449435150102011,
+      "loss": 3.3203,
+      "step": 43150
+    },
+    {
+      "epoch": 12.583780004660918,
+      "grad_norm": 0.35109570622444153,
+      "learning_rate": 0.00044926027397260273,
+      "loss": 3.333,
+      "step": 43200
+    },
+    {
+      "epoch": 12.598345374038686,
+      "grad_norm": 0.36943650245666504,
+      "learning_rate": 0.00044908539784319437,
+      "loss": 3.343,
+      "step": 43250
+    },
+    {
+      "epoch": 12.612910743416453,
+      "grad_norm": 0.33632126450538635,
+      "learning_rate": 0.00044891052171378606,
+      "loss": 3.3266,
+      "step": 43300
+    },
+    {
+      "epoch": 12.62747611279422,
+      "grad_norm": 0.34395912289619446,
+      "learning_rate": 0.0004487356455843777,
+      "loss": 3.3298,
+      "step": 43350
+    },
+    {
+      "epoch": 12.642041482171988,
+      "grad_norm": 0.3842009902000427,
+      "learning_rate": 0.00044856076945496933,
+      "loss": 3.3248,
+      "step": 43400
+    },
+    {
+      "epoch": 12.656606851549755,
+      "grad_norm": 0.33698466420173645,
+      "learning_rate": 0.000448385893325561,
+      "loss": 3.34,
+      "step": 43450
+    },
+    {
+      "epoch": 12.671172220927522,
+      "grad_norm": 0.36925747990608215,
+      "learning_rate": 0.00044821101719615266,
+      "loss": 3.3425,
+      "step": 43500
+    },
+    {
+      "epoch": 12.68573759030529,
+      "grad_norm": 0.34613460302352905,
+      "learning_rate": 0.00044803614106674435,
+      "loss": 3.334,
+      "step": 43550
+    },
+    {
+      "epoch": 12.700302959683057,
+      "grad_norm": 0.34533941745758057,
+      "learning_rate": 0.000447861264937336,
+      "loss": 3.3415,
+      "step": 43600
+    },
+    {
+      "epoch": 12.714868329060826,
+      "grad_norm": 0.35583844780921936,
+      "learning_rate": 0.00044768638880792773,
+      "loss": 3.3443,
+      "step": 43650
+    },
+    {
+      "epoch": 12.729433698438593,
+      "grad_norm": 0.3499312996864319,
+      "learning_rate": 0.00044751151267851937,
+      "loss": 3.3507,
+      "step": 43700
+    },
+    {
+      "epoch": 12.74399906781636,
+      "grad_norm": 0.3817194402217865,
+      "learning_rate": 0.000447336636549111,
+      "loss": 3.3313,
+      "step": 43750
+    },
+    {
+      "epoch": 12.758564437194128,
+      "grad_norm": 0.3556305170059204,
+      "learning_rate": 0.0004471617604197027,
+      "loss": 3.3493,
+      "step": 43800
+    },
+    {
+      "epoch": 12.773129806571895,
+      "grad_norm": 0.3326449394226074,
+      "learning_rate": 0.00044698688429029433,
+      "loss": 3.3476,
+      "step": 43850
+    },
+    {
+      "epoch": 12.787695175949661,
+      "grad_norm": 0.34188127517700195,
+      "learning_rate": 0.000446812008160886,
+      "loss": 3.3484,
+      "step": 43900
+    },
+    {
+      "epoch": 12.80226054532743,
+      "grad_norm": 0.36247870326042175,
+      "learning_rate": 0.00044663713203147766,
+      "loss": 3.3548,
+      "step": 43950
+    },
+    {
+      "epoch": 12.816825914705197,
+      "grad_norm": 0.3323318660259247,
+      "learning_rate": 0.0004464622559020693,
+      "loss": 3.34,
+      "step": 44000
+    },
+    {
+      "epoch": 12.816825914705197,
+      "eval_accuracy": 0.3710601080817979,
+      "eval_loss": 3.5469486713409424,
+      "eval_runtime": 180.4604,
+      "eval_samples_per_second": 92.236,
+      "eval_steps_per_second": 5.769,
+      "step": 44000
+    },
+    {
+      "epoch": 12.831391284082965,
+      "grad_norm": 0.3398094177246094,
+      "learning_rate": 0.000446287379772661,
+      "loss": 3.3637,
+      "step": 44050
+    },
+    {
+      "epoch": 12.845956653460732,
+      "grad_norm": 0.3441350758075714,
+      "learning_rate": 0.0004461125036432526,
+      "loss": 3.3565,
+      "step": 44100
+    },
+    {
+      "epoch": 12.860522022838499,
+      "grad_norm": 0.3864867091178894,
+      "learning_rate": 0.00044593762751384436,
+      "loss": 3.3531,
+      "step": 44150
+    },
+    {
+      "epoch": 12.875087392216267,
+      "grad_norm": 0.33451637625694275,
+      "learning_rate": 0.000445762751384436,
+      "loss": 3.3514,
+      "step": 44200
+    },
+    {
+      "epoch": 12.889652761594034,
+      "grad_norm": 0.35273078083992004,
+      "learning_rate": 0.0004455878752550277,
+      "loss": 3.3488,
+      "step": 44250
+    },
+    {
+      "epoch": 12.9042181309718,
+      "grad_norm": 0.35807234048843384,
+      "learning_rate": 0.0004454129991256193,
+      "loss": 3.3618,
+      "step": 44300
+    },
+    {
+      "epoch": 12.91878350034957,
+      "grad_norm": 0.37384292483329773,
+      "learning_rate": 0.00044523812299621096,
+      "loss": 3.355,
+      "step": 44350
+    },
+    {
+      "epoch": 12.933348869727336,
+      "grad_norm": 0.356778085231781,
+      "learning_rate": 0.00044506324686680265,
+      "loss": 3.3615,
+      "step": 44400
+    },
+    {
+      "epoch": 12.947914239105105,
+      "grad_norm": 0.3216100335121155,
+      "learning_rate": 0.0004448883707373943,
+      "loss": 3.3523,
+      "step": 44450
+    },
+    {
+      "epoch": 12.962479608482871,
+      "grad_norm": 0.34460195899009705,
+      "learning_rate": 0.000444713494607986,
+      "loss": 3.3419,
+      "step": 44500
+    },
+    {
+      "epoch": 12.977044977860638,
+      "grad_norm": 0.3426755368709564,
+      "learning_rate": 0.0004445386184785776,
+      "loss": 3.3618,
+      "step": 44550
+    },
+    {
+      "epoch": 12.991610347238407,
+      "grad_norm": 0.33371907472610474,
+      "learning_rate": 0.00044436374234916925,
+      "loss": 3.3606,
+      "step": 44600
+    },
+    {
+      "epoch": 13.006117455138662,
+      "grad_norm": 0.3647765815258026,
+      "learning_rate": 0.000444188866219761,
+      "loss": 3.3039,
+      "step": 44650
+    },
+    {
+      "epoch": 13.02068282451643,
+      "grad_norm": 0.35740309953689575,
+      "learning_rate": 0.00044401399009035263,
+      "loss": 3.2546,
+      "step": 44700
+    },
+    {
+      "epoch": 13.035248193894198,
+      "grad_norm": 0.34486448764801025,
+      "learning_rate": 0.0004438391139609443,
+      "loss": 3.2578,
+      "step": 44750
+    },
+    {
+      "epoch": 13.049813563271965,
+      "grad_norm": 0.3458172380924225,
+      "learning_rate": 0.00044366423783153596,
+      "loss": 3.2525,
+      "step": 44800
+    },
+    {
+      "epoch": 13.064378932649731,
+      "grad_norm": 0.3752981126308441,
+      "learning_rate": 0.0004434893617021276,
+      "loss": 3.2551,
+      "step": 44850
+    },
+    {
+      "epoch": 13.0789443020275,
+      "grad_norm": 0.3851625919342041,
+      "learning_rate": 0.0004433144855727193,
+      "loss": 3.2575,
+      "step": 44900
+    },
+    {
+      "epoch": 13.093509671405267,
+      "grad_norm": 0.3515491187572479,
+      "learning_rate": 0.0004431396094433109,
+      "loss": 3.2561,
+      "step": 44950
+    },
+    {
+      "epoch": 13.108075040783035,
+      "grad_norm": 0.3828083872795105,
+      "learning_rate": 0.0004429647333139026,
+      "loss": 3.2644,
+      "step": 45000
+    },
+    {
+      "epoch": 13.108075040783035,
+      "eval_accuracy": 0.3702774467864976,
+      "eval_loss": 3.5636725425720215,
+      "eval_runtime": 180.05,
+      "eval_samples_per_second": 92.447,
+      "eval_steps_per_second": 5.782,
+      "step": 45000
+    },
+    {
+      "epoch": 13.122640410160802,
+      "grad_norm": 0.3796162009239197,
+      "learning_rate": 0.00044278985718449425,
+      "loss": 3.2588,
+      "step": 45050
+    },
+    {
+      "epoch": 13.137205779538569,
+      "grad_norm": 0.3526037633419037,
+      "learning_rate": 0.000442614981055086,
+      "loss": 3.2679,
+      "step": 45100
+    },
+    {
+      "epoch": 13.151771148916337,
+      "grad_norm": 0.3708899915218353,
+      "learning_rate": 0.00044244010492567763,
+      "loss": 3.2736,
+      "step": 45150
+    },
+    {
+      "epoch": 13.166336518294104,
+      "grad_norm": 0.36107999086380005,
+      "learning_rate": 0.00044226522879626927,
+      "loss": 3.2702,
+      "step": 45200
+    },
+    {
+      "epoch": 13.18090188767187,
+      "grad_norm": 0.36529046297073364,
+      "learning_rate": 0.00044209035266686096,
+      "loss": 3.2764,
+      "step": 45250
+    },
+    {
+      "epoch": 13.19546725704964,
+      "grad_norm": 0.32587742805480957,
+      "learning_rate": 0.0004419154765374526,
+      "loss": 3.2871,
+      "step": 45300
+    },
+    {
+      "epoch": 13.210032626427406,
+      "grad_norm": 0.3842274844646454,
+      "learning_rate": 0.0004417406004080443,
+      "loss": 3.2774,
+      "step": 45350
+    },
+    {
+      "epoch": 13.224597995805174,
+      "grad_norm": 0.3462216556072235,
+      "learning_rate": 0.0004415657242786359,
+      "loss": 3.2834,
+      "step": 45400
+    },
+    {
+      "epoch": 13.239163365182941,
+      "grad_norm": 0.3876231610774994,
+      "learning_rate": 0.00044139084814922755,
+      "loss": 3.2794,
+      "step": 45450
+    },
+    {
+      "epoch": 13.253728734560708,
+      "grad_norm": 0.3549770712852478,
+      "learning_rate": 0.00044121597201981924,
+      "loss": 3.29,
+      "step": 45500
+    },
+    {
+      "epoch": 13.268294103938477,
+      "grad_norm": 0.36762481927871704,
+      "learning_rate": 0.0004410410958904109,
+      "loss": 3.2889,
+      "step": 45550
+    },
+    {
+      "epoch": 13.282859473316243,
+      "grad_norm": 0.3711779713630676,
+      "learning_rate": 0.0004408662197610026,
+      "loss": 3.2911,
+      "step": 45600
+    },
+    {
+      "epoch": 13.29742484269401,
+      "grad_norm": 0.38503533601760864,
+      "learning_rate": 0.00044069134363159426,
+      "loss": 3.2988,
+      "step": 45650
+    },
+    {
+      "epoch": 13.311990212071779,
+      "grad_norm": 0.35451096296310425,
+      "learning_rate": 0.00044051646750218595,
+      "loss": 3.2966,
+      "step": 45700
+    },
+    {
+      "epoch": 13.326555581449545,
+      "grad_norm": 0.36363229155540466,
+      "learning_rate": 0.0004403415913727776,
+      "loss": 3.2889,
+      "step": 45750
+    },
+    {
+      "epoch": 13.341120950827314,
+      "grad_norm": 0.3751690089702606,
+      "learning_rate": 0.0004401667152433692,
+      "loss": 3.3034,
+      "step": 45800
+    },
+    {
+      "epoch": 13.35568632020508,
+      "grad_norm": 0.3827272057533264,
+      "learning_rate": 0.0004399918391139609,
+      "loss": 3.2939,
+      "step": 45850
+    },
+    {
+      "epoch": 13.370251689582847,
+      "grad_norm": 0.3678208887577057,
+      "learning_rate": 0.00043981696298455255,
+      "loss": 3.2945,
+      "step": 45900
+    },
+    {
+      "epoch": 13.384817058960616,
+      "grad_norm": 0.367930144071579,
+      "learning_rate": 0.00043964208685514424,
+      "loss": 3.3054,
+      "step": 45950
+    },
+    {
+      "epoch": 13.399382428338383,
+      "grad_norm": 0.4058651924133301,
+      "learning_rate": 0.0004394672107257359,
+      "loss": 3.3033,
+      "step": 46000
+    },
+    {
+      "epoch": 13.399382428338383,
+      "eval_accuracy": 0.37058559689239845,
+      "eval_loss": 3.558558940887451,
+      "eval_runtime": 180.1666,
+      "eval_samples_per_second": 92.387,
+      "eval_steps_per_second": 5.778,
+      "step": 46000
+    },
+    {
+      "epoch": 13.41394779771615,
+      "grad_norm": 0.33443596959114075,
+      "learning_rate": 0.0004392923345963275,
+      "loss": 3.3057,
+      "step": 46050
+    },
+    {
+      "epoch": 13.428513167093918,
+      "grad_norm": 0.35828691720962524,
+      "learning_rate": 0.00043911745846691926,
+      "loss": 3.3038,
+      "step": 46100
+    },
+    {
+      "epoch": 13.443078536471685,
+      "grad_norm": 0.3267059922218323,
+      "learning_rate": 0.0004389425823375109,
+      "loss": 3.3025,
+      "step": 46150
+    },
+    {
+      "epoch": 13.457643905849451,
+      "grad_norm": 0.3633759319782257,
+      "learning_rate": 0.0004387677062081026,
+      "loss": 3.3072,
+      "step": 46200
+    },
+    {
+      "epoch": 13.47220927522722,
+      "grad_norm": 0.3840404748916626,
+      "learning_rate": 0.0004385928300786942,
+      "loss": 3.3114,
+      "step": 46250
+    },
+    {
+      "epoch": 13.486774644604987,
+      "grad_norm": 0.37473759055137634,
+      "learning_rate": 0.0004384179539492859,
+      "loss": 3.3016,
+      "step": 46300
+    },
+    {
+      "epoch": 13.501340013982755,
+      "grad_norm": 0.3564804792404175,
+      "learning_rate": 0.00043824307781987755,
+      "loss": 3.3094,
+      "step": 46350
+    },
+    {
+      "epoch": 13.515905383360522,
+      "grad_norm": 0.3550237715244293,
+      "learning_rate": 0.0004380682016904692,
+      "loss": 3.3267,
+      "step": 46400
+    },
+    {
+      "epoch": 13.530470752738289,
+      "grad_norm": 0.3277883529663086,
+      "learning_rate": 0.0004378933255610609,
+      "loss": 3.3101,
+      "step": 46450
+    },
+    {
+      "epoch": 13.545036122116057,
+      "grad_norm": 0.3435348570346832,
+      "learning_rate": 0.0004377184494316525,
+      "loss": 3.3102,
+      "step": 46500
+    },
+    {
+      "epoch": 13.559601491493824,
+      "grad_norm": 0.3904513120651245,
+      "learning_rate": 0.00043754357330224426,
+      "loss": 3.3174,
+      "step": 46550
+    },
+    {
+      "epoch": 13.574166860871593,
+      "grad_norm": 0.3408344089984894,
+      "learning_rate": 0.0004373686971728359,
+      "loss": 3.3124,
+      "step": 46600
+    },
+    {
+      "epoch": 13.58873223024936,
+      "grad_norm": 0.35491397976875305,
+      "learning_rate": 0.00043719382104342753,
+      "loss": 3.3261,
+      "step": 46650
+    },
+    {
+      "epoch": 13.603297599627126,
+      "grad_norm": 0.374520868062973,
+      "learning_rate": 0.0004370189449140192,
+      "loss": 3.3192,
+      "step": 46700
+    },
+    {
+      "epoch": 13.617862969004895,
+      "grad_norm": 0.36937081813812256,
+      "learning_rate": 0.00043684406878461085,
+      "loss": 3.3193,
+      "step": 46750
+    },
+    {
+      "epoch": 13.632428338382661,
+      "grad_norm": 0.36104243993759155,
+      "learning_rate": 0.00043666919265520254,
+      "loss": 3.3299,
+      "step": 46800
+    },
+    {
+      "epoch": 13.646993707760428,
+      "grad_norm": 0.38521620631217957,
+      "learning_rate": 0.0004364943165257942,
+      "loss": 3.3275,
+      "step": 46850
+    },
+    {
+      "epoch": 13.661559077138197,
+      "grad_norm": 0.36809036135673523,
+      "learning_rate": 0.0004363194403963858,
+      "loss": 3.3141,
+      "step": 46900
+    },
+    {
+      "epoch": 13.676124446515963,
+      "grad_norm": 0.36899736523628235,
+      "learning_rate": 0.0004361445642669775,
+      "loss": 3.3134,
+      "step": 46950
+    },
+    {
+      "epoch": 13.69068981589373,
+      "grad_norm": 0.340524286031723,
+      "learning_rate": 0.00043596968813756914,
+      "loss": 3.321,
+      "step": 47000
+    },
+    {
+      "epoch": 13.69068981589373,
+      "eval_accuracy": 0.3713288723481426,
+      "eval_loss": 3.546154022216797,
+      "eval_runtime": 180.3194,
+      "eval_samples_per_second": 92.308,
+      "eval_steps_per_second": 5.773,
+      "step": 47000
+    },
+    {
+      "epoch": 13.705255185271499,
+      "grad_norm": 0.4061141610145569,
+      "learning_rate": 0.0004357948120081609,
+      "loss": 3.3105,
+      "step": 47050
+    },
+    {
+      "epoch": 13.719820554649266,
+      "grad_norm": 0.38260596990585327,
+      "learning_rate": 0.0004356199358787525,
+      "loss": 3.3253,
+      "step": 47100
+    },
+    {
+      "epoch": 13.734385924027034,
+      "grad_norm": 0.33172404766082764,
+      "learning_rate": 0.0004354450597493442,
+      "loss": 3.3317,
+      "step": 47150
+    },
+    {
+      "epoch": 13.7489512934048,
+      "grad_norm": 0.37275978922843933,
+      "learning_rate": 0.00043527018361993585,
+      "loss": 3.316,
+      "step": 47200
+    },
+    {
+      "epoch": 13.763516662782568,
+      "grad_norm": 0.34196189045906067,
+      "learning_rate": 0.0004350953074905275,
+      "loss": 3.3284,
+      "step": 47250
+    },
+    {
+      "epoch": 13.778082032160336,
+      "grad_norm": 0.38566258549690247,
+      "learning_rate": 0.0004349204313611192,
+      "loss": 3.3284,
+      "step": 47300
+    },
+    {
+      "epoch": 13.792647401538103,
+      "grad_norm": 0.34335869550704956,
+      "learning_rate": 0.0004347455552317108,
+      "loss": 3.3268,
+      "step": 47350
+    },
+    {
+      "epoch": 13.80721277091587,
+      "grad_norm": 0.347888320684433,
+      "learning_rate": 0.0004345706791023025,
+      "loss": 3.3291,
+      "step": 47400
+    },
+    {
+      "epoch": 13.821778140293638,
+      "grad_norm": 0.34273475408554077,
+      "learning_rate": 0.00043439580297289414,
+      "loss": 3.3313,
+      "step": 47450
+    },
+    {
+      "epoch": 13.836343509671405,
+      "grad_norm": 0.35187360644340515,
+      "learning_rate": 0.0004342209268434858,
+      "loss": 3.3271,
+      "step": 47500
+    },
+    {
+      "epoch": 13.850908879049173,
+      "grad_norm": 0.35880064964294434,
+      "learning_rate": 0.0004340460507140775,
+      "loss": 3.3399,
+      "step": 47550
+    },
+    {
+      "epoch": 13.86547424842694,
+      "grad_norm": 0.3588651418685913,
+      "learning_rate": 0.00043387117458466916,
+      "loss": 3.3369,
+      "step": 47600
+    },
+    {
+      "epoch": 13.880039617804707,
+      "grad_norm": 0.3750631809234619,
+      "learning_rate": 0.00043369629845526085,
+      "loss": 3.3353,
+      "step": 47650
+    },
+    {
+      "epoch": 13.894604987182475,
+      "grad_norm": 0.3625471889972687,
+      "learning_rate": 0.0004335214223258525,
+      "loss": 3.3288,
+      "step": 47700
+    },
+    {
+      "epoch": 13.909170356560242,
+      "grad_norm": 0.34764859080314636,
+      "learning_rate": 0.0004333465461964442,
+      "loss": 3.3396,
+      "step": 47750
+    },
+    {
+      "epoch": 13.923735725938009,
+      "grad_norm": 0.37005481123924255,
+      "learning_rate": 0.0004331716700670358,
+      "loss": 3.345,
+      "step": 47800
+    },
+    {
+      "epoch": 13.938301095315778,
+      "grad_norm": 0.36900705099105835,
+      "learning_rate": 0.00043299679393762745,
+      "loss": 3.3468,
+      "step": 47850
+    },
+    {
+      "epoch": 13.952866464693544,
+      "grad_norm": 0.36026182770729065,
+      "learning_rate": 0.00043282191780821914,
+      "loss": 3.3357,
+      "step": 47900
+    },
+    {
+      "epoch": 13.967431834071313,
+      "grad_norm": 0.41666218638420105,
+      "learning_rate": 0.00043264704167881077,
+      "loss": 3.3406,
+      "step": 47950
+    },
+    {
+      "epoch": 13.98199720344908,
+      "grad_norm": 0.3536786735057831,
+      "learning_rate": 0.0004324721655494025,
+      "loss": 3.3478,
+      "step": 48000
+    },
+    {
+      "epoch": 13.98199720344908,
+      "eval_accuracy": 0.37184794244259217,
+      "eval_loss": 3.5400092601776123,
+      "eval_runtime": 180.0491,
+      "eval_samples_per_second": 92.447,
+      "eval_steps_per_second": 5.782,
+      "step": 48000
+    },
+    {
+      "epoch": 13.996562572826846,
+      "grad_norm": 0.36019057035446167,
+      "learning_rate": 0.00043229728941999415,
+      "loss": 3.3448,
+      "step": 48050
+    },
+    {
+      "epoch": 14.011069680727104,
+      "grad_norm": 0.36364710330963135,
+      "learning_rate": 0.0004321224132905858,
+      "loss": 3.2384,
+      "step": 48100
+    },
+    {
+      "epoch": 14.02563505010487,
+      "grad_norm": 0.37141087651252747,
+      "learning_rate": 0.0004319475371611775,
+      "loss": 3.2325,
+      "step": 48150
+    },
+    {
+      "epoch": 14.040200419482638,
+      "grad_norm": 0.3692861497402191,
+      "learning_rate": 0.0004317726610317691,
+      "loss": 3.2346,
+      "step": 48200
+    },
+    {
+      "epoch": 14.054765788860406,
+      "grad_norm": 0.3691665530204773,
+      "learning_rate": 0.0004315977849023608,
+      "loss": 3.2326,
+      "step": 48250
+    },
+    {
+      "epoch": 14.069331158238173,
+      "grad_norm": 0.3210379481315613,
+      "learning_rate": 0.00043142290877295244,
+      "loss": 3.2295,
+      "step": 48300
+    },
+    {
+      "epoch": 14.08389652761594,
+      "grad_norm": 0.3676183819770813,
+      "learning_rate": 0.00043124803264354413,
+      "loss": 3.2278,
+      "step": 48350
+    },
+    {
+      "epoch": 14.098461896993708,
+      "grad_norm": 0.3678940236568451,
+      "learning_rate": 0.00043107315651413577,
+      "loss": 3.2426,
+      "step": 48400
+    },
+    {
+      "epoch": 14.113027266371475,
+      "grad_norm": 0.35546252131462097,
+      "learning_rate": 0.0004308982803847274,
+      "loss": 3.2504,
+      "step": 48450
+    },
+    {
+      "epoch": 14.127592635749243,
+      "grad_norm": 0.36175820231437683,
+      "learning_rate": 0.00043072340425531915,
+      "loss": 3.2673,
+      "step": 48500
+    },
+    {
+      "epoch": 14.14215800512701,
+      "grad_norm": 0.3764038681983948,
+      "learning_rate": 0.0004305485281259108,
+      "loss": 3.2708,
+      "step": 48550
+    },
+    {
+      "epoch": 14.156723374504777,
+      "grad_norm": 0.35129106044769287,
+      "learning_rate": 0.0004303736519965025,
+      "loss": 3.2607,
+      "step": 48600
+    },
+    {
+      "epoch": 14.171288743882545,
+      "grad_norm": 0.3690101206302643,
+      "learning_rate": 0.0004301987758670941,
+      "loss": 3.2637,
+      "step": 48650
+    },
+    {
+      "epoch": 14.185854113260312,
+      "grad_norm": 0.3953830897808075,
+      "learning_rate": 0.00043002389973768575,
+      "loss": 3.2679,
+      "step": 48700
+    },
+    {
+      "epoch": 14.200419482638079,
+      "grad_norm": 0.4228529930114746,
+      "learning_rate": 0.00042984902360827744,
+      "loss": 3.2749,
+      "step": 48750
+    },
+    {
+      "epoch": 14.214984852015847,
+      "grad_norm": 0.3526473641395569,
+      "learning_rate": 0.0004296741474788691,
+      "loss": 3.2685,
+      "step": 48800
+    },
+    {
+      "epoch": 14.229550221393614,
+      "grad_norm": 0.3725534677505493,
+      "learning_rate": 0.00042949927134946077,
+      "loss": 3.2709,
+      "step": 48850
+    },
+    {
+      "epoch": 14.244115590771383,
+      "grad_norm": 0.4134677052497864,
+      "learning_rate": 0.0004293243952200524,
+      "loss": 3.2807,
+      "step": 48900
+    },
+    {
+      "epoch": 14.25868096014915,
+      "grad_norm": 0.373789519071579,
+      "learning_rate": 0.00042914951909064415,
+      "loss": 3.2732,
+      "step": 48950
+    },
+    {
+      "epoch": 14.273246329526916,
+      "grad_norm": 0.3695909082889557,
+      "learning_rate": 0.0004289746429612358,
+      "loss": 3.2776,
+      "step": 49000
+    },
+    {
+      "epoch": 14.273246329526916,
+      "eval_accuracy": 0.37125997652133685,
+      "eval_loss": 3.555926561355591,
+      "eval_runtime": 180.0211,
+      "eval_samples_per_second": 92.461,
+      "eval_steps_per_second": 5.783,
+      "step": 49000
+    },
+    {
+      "epoch": 14.287811698904685,
+      "grad_norm": 0.3744010031223297,
+      "learning_rate": 0.0004287997668318274,
+      "loss": 3.2625,
+      "step": 49050
+    },
+    {
+      "epoch": 14.302377068282452,
+      "grad_norm": 0.3698079288005829,
+      "learning_rate": 0.0004286248907024191,
+      "loss": 3.2764,
+      "step": 49100
+    },
+    {
+      "epoch": 14.316942437660218,
+      "grad_norm": 0.40776345133781433,
+      "learning_rate": 0.00042845001457301075,
+      "loss": 3.2732,
+      "step": 49150
+    },
+    {
+      "epoch": 14.331507807037987,
+      "grad_norm": 0.3719671368598938,
+      "learning_rate": 0.00042827513844360244,
+      "loss": 3.2849,
+      "step": 49200
+    },
+    {
+      "epoch": 14.346073176415754,
+      "grad_norm": 0.36249688267707825,
+      "learning_rate": 0.00042810026231419407,
+      "loss": 3.2847,
+      "step": 49250
+    },
+    {
+      "epoch": 14.360638545793522,
+      "grad_norm": 0.3670303523540497,
+      "learning_rate": 0.0004279253861847857,
+      "loss": 3.278,
+      "step": 49300
+    },
+    {
+      "epoch": 14.375203915171289,
+      "grad_norm": 0.36930960416793823,
+      "learning_rate": 0.0004277505100553774,
+      "loss": 3.3051,
+      "step": 49350
+    },
+    {
+      "epoch": 14.389769284549056,
+      "grad_norm": 0.3947696387767792,
+      "learning_rate": 0.00042757563392596904,
+      "loss": 3.2951,
+      "step": 49400
+    },
+    {
+      "epoch": 14.404334653926824,
+      "grad_norm": 0.3678928315639496,
+      "learning_rate": 0.0004274007577965608,
+      "loss": 3.2893,
+      "step": 49450
+    },
+    {
+      "epoch": 14.418900023304591,
+      "grad_norm": 0.3470092713832855,
+      "learning_rate": 0.0004272258816671524,
+      "loss": 3.3077,
+      "step": 49500
+    },
+    {
+      "epoch": 14.433465392682358,
+      "grad_norm": 0.380835622549057,
+      "learning_rate": 0.00042705100553774405,
+      "loss": 3.29,
+      "step": 49550
+    },
+    {
+      "epoch": 14.448030762060126,
+      "grad_norm": 0.3495819568634033,
+      "learning_rate": 0.00042687612940833574,
+      "loss": 3.289,
+      "step": 49600
+    },
+    {
+      "epoch": 14.462596131437893,
+      "grad_norm": 0.3957236707210541,
+      "learning_rate": 0.0004267012532789274,
+      "loss": 3.3002,
+      "step": 49650
+    },
+    {
+      "epoch": 14.477161500815662,
+      "grad_norm": 0.35682642459869385,
+      "learning_rate": 0.00042652637714951907,
+      "loss": 3.2979,
+      "step": 49700
+    },
+    {
+      "epoch": 14.491726870193428,
+      "grad_norm": 0.37553074955940247,
+      "learning_rate": 0.0004263515010201107,
+      "loss": 3.2958,
+      "step": 49750
+    },
+    {
+      "epoch": 14.506292239571195,
+      "grad_norm": 0.35635843873023987,
+      "learning_rate": 0.0004261766248907024,
+      "loss": 3.2884,
+      "step": 49800
+    },
+    {
+      "epoch": 14.520857608948964,
+      "grad_norm": 0.3419688940048218,
+      "learning_rate": 0.00042600174876129403,
+      "loss": 3.301,
+      "step": 49850
+    },
+    {
+      "epoch": 14.53542297832673,
+      "grad_norm": 0.34367799758911133,
+      "learning_rate": 0.00042582687263188567,
+      "loss": 3.2852,
+      "step": 49900
+    },
+    {
+      "epoch": 14.549988347704497,
+      "grad_norm": 0.40431395173072815,
+      "learning_rate": 0.0004256519965024774,
+      "loss": 3.2959,
+      "step": 49950
+    },
+    {
+      "epoch": 14.564553717082266,
+      "grad_norm": 0.35024210810661316,
+      "learning_rate": 0.00042547712037306905,
+      "loss": 3.3024,
+      "step": 50000
+    },
+    {
+      "epoch": 14.564553717082266,
+      "eval_accuracy": 0.3717955063696308,
+      "eval_loss": 3.5450422763824463,
+      "eval_runtime": 179.9316,
+      "eval_samples_per_second": 92.507,
+      "eval_steps_per_second": 5.786,
+      "step": 50000
+    },
+    {
+      "epoch": 14.579119086460032,
+      "grad_norm": 0.3777245581150055,
+      "learning_rate": 0.00042530224424366074,
+      "loss": 3.2999,
+      "step": 50050
+    },
+    {
+      "epoch": 14.5936844558378,
+      "grad_norm": 0.38056281208992004,
+      "learning_rate": 0.0004251273681142524,
+      "loss": 3.3083,
+      "step": 50100
+    },
+    {
+      "epoch": 14.608249825215568,
+      "grad_norm": 0.34788084030151367,
+      "learning_rate": 0.000424952491984844,
+      "loss": 3.31,
+      "step": 50150
+    },
+    {
+      "epoch": 14.622815194593334,
+      "grad_norm": 0.3699731230735779,
+      "learning_rate": 0.0004247776158554357,
+      "loss": 3.3078,
+      "step": 50200
+    },
+    {
+      "epoch": 14.637380563971103,
+      "grad_norm": 0.37978971004486084,
+      "learning_rate": 0.00042460273972602734,
+      "loss": 3.308,
+      "step": 50250
+    },
+    {
+      "epoch": 14.65194593334887,
+      "grad_norm": 0.34603598713874817,
+      "learning_rate": 0.00042442786359661903,
+      "loss": 3.3014,
+      "step": 50300
+    },
+    {
+      "epoch": 14.666511302726637,
+      "grad_norm": 0.36627814173698425,
+      "learning_rate": 0.00042425298746721066,
+      "loss": 3.3084,
+      "step": 50350
+    },
+    {
+      "epoch": 14.681076672104405,
+      "grad_norm": 0.37918820977211,
+      "learning_rate": 0.0004240781113378024,
+      "loss": 3.3007,
+      "step": 50400
+    },
+    {
+      "epoch": 14.695642041482172,
+      "grad_norm": 0.373757004737854,
+      "learning_rate": 0.00042390323520839405,
+      "loss": 3.3043,
+      "step": 50450
+    },
+    {
+      "epoch": 14.71020741085994,
+      "grad_norm": 0.36418765783309937,
+      "learning_rate": 0.0004237283590789857,
+      "loss": 3.31,
+      "step": 50500
+    },
+    {
+      "epoch": 14.724772780237707,
+      "grad_norm": 0.36318641901016235,
+      "learning_rate": 0.00042355348294957737,
+      "loss": 3.313,
+      "step": 50550
+    },
+    {
+      "epoch": 14.739338149615474,
+      "grad_norm": 0.3450626730918884,
+      "learning_rate": 0.000423378606820169,
+      "loss": 3.3003,
+      "step": 50600
+    },
+    {
+      "epoch": 14.753903518993242,
+      "grad_norm": 0.35747483372688293,
+      "learning_rate": 0.0004232037306907607,
+      "loss": 3.3018,
+      "step": 50650
+    },
+    {
+      "epoch": 14.76846888837101,
+      "grad_norm": 0.37540534138679504,
+      "learning_rate": 0.00042302885456135233,
+      "loss": 3.3205,
+      "step": 50700
+    },
+    {
+      "epoch": 14.783034257748776,
+      "grad_norm": 0.3449225425720215,
+      "learning_rate": 0.00042285397843194397,
+      "loss": 3.3091,
+      "step": 50750
+    },
+    {
+      "epoch": 14.797599627126544,
+      "grad_norm": 0.3874026834964752,
+      "learning_rate": 0.00042267910230253566,
+      "loss": 3.3202,
+      "step": 50800
+    },
+    {
+      "epoch": 14.812164996504311,
+      "grad_norm": 0.37947866320610046,
+      "learning_rate": 0.0004225042261731273,
+      "loss": 3.313,
+      "step": 50850
+    },
+    {
+      "epoch": 14.826730365882078,
+      "grad_norm": 0.337933212518692,
+      "learning_rate": 0.00042232935004371904,
+      "loss": 3.316,
+      "step": 50900
+    },
+    {
+      "epoch": 14.841295735259846,
+      "grad_norm": 0.3563149869441986,
+      "learning_rate": 0.0004221544739143107,
+      "loss": 3.323,
+      "step": 50950
+    },
+    {
+      "epoch": 14.855861104637613,
+      "grad_norm": 0.38767126202583313,
+      "learning_rate": 0.00042197959778490237,
+      "loss": 3.3119,
+      "step": 51000
+    },
+    {
+      "epoch": 14.855861104637613,
+      "eval_accuracy": 0.3718384192992965,
+      "eval_loss": 3.544301748275757,
+      "eval_runtime": 180.083,
+      "eval_samples_per_second": 92.43,
+      "eval_steps_per_second": 5.781,
+      "step": 51000
+    },
+    {
+      "epoch": 14.870426474015382,
+      "grad_norm": 0.37552863359451294,
+      "learning_rate": 0.000421804721655494,
+      "loss": 3.3259,
+      "step": 51050
+    },
+    {
+      "epoch": 14.884991843393149,
+      "grad_norm": 0.36282673478126526,
+      "learning_rate": 0.00042162984552608564,
+      "loss": 3.3247,
+      "step": 51100
+    },
+    {
+      "epoch": 14.899557212770915,
+      "grad_norm": 0.3442082703113556,
+      "learning_rate": 0.00042145496939667733,
+      "loss": 3.3353,
+      "step": 51150
+    },
+    {
+      "epoch": 14.914122582148684,
+      "grad_norm": 0.37182241678237915,
+      "learning_rate": 0.00042128009326726897,
+      "loss": 3.3221,
+      "step": 51200
+    },
+    {
+      "epoch": 14.92868795152645,
+      "grad_norm": 0.35627925395965576,
+      "learning_rate": 0.00042110521713786066,
+      "loss": 3.3225,
+      "step": 51250
+    },
+    {
+      "epoch": 14.943253320904217,
+      "grad_norm": 0.3497479557991028,
+      "learning_rate": 0.0004209303410084523,
+      "loss": 3.3091,
+      "step": 51300
+    },
+    {
+      "epoch": 14.957818690281986,
+      "grad_norm": 0.3572853207588196,
+      "learning_rate": 0.00042075546487904393,
+      "loss": 3.3194,
+      "step": 51350
+    },
+    {
+      "epoch": 14.972384059659753,
+      "grad_norm": 0.34908217191696167,
+      "learning_rate": 0.0004205805887496357,
+      "loss": 3.3197,
+      "step": 51400
+    },
+    {
+      "epoch": 14.986949429037521,
+      "grad_norm": 0.3747841417789459,
+      "learning_rate": 0.0004204057126202273,
+      "loss": 3.3196,
+      "step": 51450
+    },
+    {
+      "epoch": 15.001456536937777,
+      "grad_norm": 0.3852183520793915,
+      "learning_rate": 0.000420230836490819,
+      "loss": 3.3223,
+      "step": 51500
+    },
+    {
+      "epoch": 15.016021906315544,
+      "grad_norm": 0.36252638697624207,
+      "learning_rate": 0.00042005596036141064,
+      "loss": 3.2195,
+      "step": 51550
+    },
+    {
+      "epoch": 15.030587275693312,
+      "grad_norm": 0.37796682119369507,
+      "learning_rate": 0.0004198810842320023,
+      "loss": 3.2084,
+      "step": 51600
+    },
+    {
+      "epoch": 15.045152645071079,
+      "grad_norm": 0.35127121210098267,
+      "learning_rate": 0.00041970620810259396,
+      "loss": 3.2168,
+      "step": 51650
+    },
+    {
+      "epoch": 15.059718014448846,
+      "grad_norm": 0.3487504720687866,
+      "learning_rate": 0.0004195313319731856,
+      "loss": 3.2213,
+      "step": 51700
+    },
+    {
+      "epoch": 15.074283383826614,
+      "grad_norm": 0.3657950460910797,
+      "learning_rate": 0.0004193564558437773,
+      "loss": 3.2238,
+      "step": 51750
+    },
+    {
+      "epoch": 15.088848753204381,
+      "grad_norm": 0.3831111490726471,
+      "learning_rate": 0.0004191815797143689,
+      "loss": 3.2249,
+      "step": 51800
+    },
+    {
+      "epoch": 15.103414122582148,
+      "grad_norm": 0.35564419627189636,
+      "learning_rate": 0.00041900670358496067,
+      "loss": 3.2333,
+      "step": 51850
+    },
+    {
+      "epoch": 15.117979491959916,
+      "grad_norm": 0.36560797691345215,
+      "learning_rate": 0.0004188318274555523,
+      "loss": 3.2305,
+      "step": 51900
+    },
+    {
+      "epoch": 15.132544861337683,
+      "grad_norm": 0.34069278836250305,
+      "learning_rate": 0.00041865695132614394,
+      "loss": 3.2335,
+      "step": 51950
+    },
+    {
+      "epoch": 15.147110230715452,
+      "grad_norm": 0.3655010163784027,
+      "learning_rate": 0.00041848207519673563,
+      "loss": 3.242,
+      "step": 52000
+    },
+    {
+      "epoch": 15.147110230715452,
+      "eval_accuracy": 0.37159199327031206,
+      "eval_loss": 3.5560638904571533,
+      "eval_runtime": 180.1032,
+      "eval_samples_per_second": 92.419,
+      "eval_steps_per_second": 5.78,
+      "step": 52000
+    },
+    {
+      "epoch": 15.161675600093218,
+      "grad_norm": 0.35667118430137634,
+      "learning_rate": 0.00041830719906732727,
+      "loss": 3.2559,
+      "step": 52050
+    },
+    {
+      "epoch": 15.176240969470985,
+      "grad_norm": 0.38437923789024353,
+      "learning_rate": 0.00041813232293791896,
+      "loss": 3.2499,
+      "step": 52100
+    },
+    {
+      "epoch": 15.190806338848754,
+      "grad_norm": 0.361551433801651,
+      "learning_rate": 0.0004179574468085106,
+      "loss": 3.2595,
+      "step": 52150
+    },
+    {
+      "epoch": 15.20537170822652,
+      "grad_norm": 0.3916930854320526,
+      "learning_rate": 0.00041778257067910223,
+      "loss": 3.2515,
+      "step": 52200
+    },
+    {
+      "epoch": 15.219937077604287,
+      "grad_norm": 0.3800903558731079,
+      "learning_rate": 0.0004176076945496939,
+      "loss": 3.2494,
+      "step": 52250
+    },
+    {
+      "epoch": 15.234502446982056,
+      "grad_norm": 0.3692256808280945,
+      "learning_rate": 0.00041743281842028556,
+      "loss": 3.2523,
+      "step": 52300
+    },
+    {
+      "epoch": 15.249067816359823,
+      "grad_norm": 0.3822937607765198,
+      "learning_rate": 0.0004172579422908773,
+      "loss": 3.242,
+      "step": 52350
+    },
+    {
+      "epoch": 15.263633185737591,
+      "grad_norm": 0.3688431680202484,
+      "learning_rate": 0.00041708306616146894,
+      "loss": 3.2595,
+      "step": 52400
+    },
+    {
+      "epoch": 15.278198555115358,
+      "grad_norm": 0.37444230914115906,
+      "learning_rate": 0.00041690819003206063,
+      "loss": 3.2575,
+      "step": 52450
+    },
+    {
+      "epoch": 15.292763924493125,
+      "grad_norm": 0.36975809931755066,
+      "learning_rate": 0.00041673331390265227,
+      "loss": 3.2686,
+      "step": 52500
+    },
+    {
+      "epoch": 15.307329293870893,
+      "grad_norm": 0.363016813993454,
+      "learning_rate": 0.0004165584377732439,
+      "loss": 3.258,
+      "step": 52550
+    },
+    {
+      "epoch": 15.32189466324866,
+      "grad_norm": 0.39108654856681824,
+      "learning_rate": 0.0004163835616438356,
+      "loss": 3.2627,
+      "step": 52600
+    },
+    {
+      "epoch": 15.336460032626427,
+      "grad_norm": 0.3677827715873718,
+      "learning_rate": 0.00041620868551442723,
+      "loss": 3.2786,
+      "step": 52650
+    },
+    {
+      "epoch": 15.351025402004195,
+      "grad_norm": 0.3940332531929016,
+      "learning_rate": 0.0004160338093850189,
+      "loss": 3.2634,
+      "step": 52700
+    },
+    {
+      "epoch": 15.365590771381962,
+      "grad_norm": 0.3877134919166565,
+      "learning_rate": 0.00041585893325561056,
+      "loss": 3.2748,
+      "step": 52750
+    },
+    {
+      "epoch": 15.38015614075973,
+      "grad_norm": 0.37534767389297485,
+      "learning_rate": 0.0004156840571262022,
+      "loss": 3.2739,
+      "step": 52800
+    },
+    {
+      "epoch": 15.394721510137497,
+      "grad_norm": 0.35570940375328064,
+      "learning_rate": 0.00041550918099679394,
+      "loss": 3.2879,
+      "step": 52850
+    },
+    {
+      "epoch": 15.409286879515264,
+      "grad_norm": 0.39126497507095337,
+      "learning_rate": 0.0004153343048673856,
+      "loss": 3.2646,
+      "step": 52900
+    },
+    {
+      "epoch": 15.423852248893033,
+      "grad_norm": 0.3982384204864502,
+      "learning_rate": 0.00041515942873797726,
+      "loss": 3.2762,
+      "step": 52950
+    },
+    {
+      "epoch": 15.4384176182708,
+      "grad_norm": 0.37969404458999634,
+      "learning_rate": 0.0004149845526085689,
+      "loss": 3.2813,
+      "step": 53000
+    },
+    {
+      "epoch": 15.4384176182708,
+      "eval_accuracy": 0.37195399028521814,
+      "eval_loss": 3.5497677326202393,
+      "eval_runtime": 179.951,
+      "eval_samples_per_second": 92.497,
+      "eval_steps_per_second": 5.785,
+      "step": 53000
+    },
+    {
+      "epoch": 15.452982987648566,
+      "grad_norm": 0.35420140624046326,
+      "learning_rate": 0.0004148096764791606,
+      "loss": 3.2798,
+      "step": 53050
+    },
+    {
+      "epoch": 15.467548357026335,
+      "grad_norm": 0.37509068846702576,
+      "learning_rate": 0.0004146348003497522,
+      "loss": 3.28,
+      "step": 53100
+    },
+    {
+      "epoch": 15.482113726404101,
+      "grad_norm": 0.3744305372238159,
+      "learning_rate": 0.00041445992422034386,
+      "loss": 3.2838,
+      "step": 53150
+    },
+    {
+      "epoch": 15.49667909578187,
+      "grad_norm": 0.39247238636016846,
+      "learning_rate": 0.00041428504809093555,
+      "loss": 3.2829,
+      "step": 53200
+    },
+    {
+      "epoch": 15.511244465159637,
+      "grad_norm": 0.35992783308029175,
+      "learning_rate": 0.0004141101719615272,
+      "loss": 3.2892,
+      "step": 53250
+    },
+    {
+      "epoch": 15.525809834537403,
+      "grad_norm": 0.3768855035305023,
+      "learning_rate": 0.00041393529583211893,
+      "loss": 3.2853,
+      "step": 53300
+    },
+    {
+      "epoch": 15.540375203915172,
+      "grad_norm": 0.4003032147884369,
+      "learning_rate": 0.00041376041970271057,
+      "loss": 3.2819,
+      "step": 53350
+    },
+    {
+      "epoch": 15.554940573292939,
+      "grad_norm": 0.3577634394168854,
+      "learning_rate": 0.0004135855435733022,
+      "loss": 3.2958,
+      "step": 53400
+    },
+    {
+      "epoch": 15.569505942670705,
+      "grad_norm": 0.3649856150150299,
+      "learning_rate": 0.0004134106674438939,
+      "loss": 3.2893,
+      "step": 53450
+    },
+    {
+      "epoch": 15.584071312048474,
+      "grad_norm": 0.363534539937973,
+      "learning_rate": 0.00041323579131448553,
+      "loss": 3.2882,
+      "step": 53500
+    },
+    {
+      "epoch": 15.59863668142624,
+      "grad_norm": 0.36388495564460754,
+      "learning_rate": 0.0004130609151850772,
+      "loss": 3.2978,
+      "step": 53550
+    },
+    {
+      "epoch": 15.61320205080401,
+      "grad_norm": 0.37562793493270874,
+      "learning_rate": 0.00041288603905566886,
+      "loss": 3.3022,
+      "step": 53600
+    },
+    {
+      "epoch": 15.627767420181776,
+      "grad_norm": 0.38429683446884155,
+      "learning_rate": 0.0004127111629262605,
+      "loss": 3.2914,
+      "step": 53650
+    },
+    {
+      "epoch": 15.642332789559543,
+      "grad_norm": 0.3696308434009552,
+      "learning_rate": 0.0004125362867968522,
+      "loss": 3.3008,
+      "step": 53700
+    },
+    {
+      "epoch": 15.656898158937311,
+      "grad_norm": 0.3845422565937042,
+      "learning_rate": 0.0004123614106674438,
+      "loss": 3.3035,
+      "step": 53750
+    },
+    {
+      "epoch": 15.671463528315078,
+      "grad_norm": 0.372842401266098,
+      "learning_rate": 0.00041218653453803557,
+      "loss": 3.2908,
+      "step": 53800
+    },
+    {
+      "epoch": 15.686028897692845,
+      "grad_norm": 0.34964442253112793,
+      "learning_rate": 0.0004120116584086272,
+      "loss": 3.2873,
+      "step": 53850
+    },
+    {
+      "epoch": 15.700594267070613,
+      "grad_norm": 0.3756067752838135,
+      "learning_rate": 0.0004118367822792189,
+      "loss": 3.3061,
+      "step": 53900
+    },
+    {
+      "epoch": 15.71515963644838,
+      "grad_norm": 0.3714199960231781,
+      "learning_rate": 0.00041166190614981053,
+      "loss": 3.2919,
+      "step": 53950
+    },
+    {
+      "epoch": 15.729725005826147,
+      "grad_norm": 0.36421453952789307,
+      "learning_rate": 0.00041148703002040217,
+      "loss": 3.3013,
+      "step": 54000
+    },
+    {
+      "epoch": 15.729725005826147,
+      "eval_accuracy": 0.3723953468275882,
+      "eval_loss": 3.541537046432495,
+      "eval_runtime": 180.0597,
+      "eval_samples_per_second": 92.442,
+      "eval_steps_per_second": 5.781,
+      "step": 54000
+    },
+    {
+      "epoch": 15.744290375203915,
+      "grad_norm": 0.3969421982765198,
+      "learning_rate": 0.00041131215389099386,
+      "loss": 3.2974,
+      "step": 54050
+    },
+    {
+      "epoch": 15.758855744581682,
+      "grad_norm": 0.357675701379776,
+      "learning_rate": 0.0004111372777615855,
+      "loss": 3.3099,
+      "step": 54100
+    },
+    {
+      "epoch": 15.77342111395945,
+      "grad_norm": 0.37822505831718445,
+      "learning_rate": 0.0004109624016321772,
+      "loss": 3.2978,
+      "step": 54150
+    },
+    {
+      "epoch": 15.787986483337217,
+      "grad_norm": 0.3701728880405426,
+      "learning_rate": 0.0004107875255027688,
+      "loss": 3.2945,
+      "step": 54200
+    },
+    {
+      "epoch": 15.802551852714984,
+      "grad_norm": 0.39551228284835815,
+      "learning_rate": 0.00041061264937336045,
+      "loss": 3.2964,
+      "step": 54250
+    },
+    {
+      "epoch": 15.817117222092753,
+      "grad_norm": 0.36080774664878845,
+      "learning_rate": 0.0004104377732439522,
+      "loss": 3.3008,
+      "step": 54300
+    },
+    {
+      "epoch": 15.83168259147052,
+      "grad_norm": 0.37071558833122253,
+      "learning_rate": 0.00041026289711454384,
+      "loss": 3.3056,
+      "step": 54350
+    },
+    {
+      "epoch": 15.846247960848288,
+      "grad_norm": 0.36288759112358093,
+      "learning_rate": 0.0004100880209851355,
+      "loss": 3.2979,
+      "step": 54400
+    },
+    {
+      "epoch": 15.860813330226055,
+      "grad_norm": 0.3693920075893402,
+      "learning_rate": 0.00040991314485572716,
+      "loss": 3.3088,
+      "step": 54450
+    },
+    {
+      "epoch": 15.875378699603822,
+      "grad_norm": 0.37087926268577576,
+      "learning_rate": 0.00040973826872631885,
+      "loss": 3.3055,
+      "step": 54500
+    },
+    {
+      "epoch": 15.88994406898159,
+      "grad_norm": 0.36043593287467957,
+      "learning_rate": 0.0004095633925969105,
+      "loss": 3.3004,
+      "step": 54550
+    },
+    {
+      "epoch": 15.904509438359357,
+      "grad_norm": 0.3625832200050354,
+      "learning_rate": 0.0004093885164675021,
+      "loss": 3.3118,
+      "step": 54600
+    },
+    {
+      "epoch": 15.919074807737124,
+      "grad_norm": 0.36735785007476807,
+      "learning_rate": 0.0004092136403380938,
+      "loss": 3.2942,
+      "step": 54650
+    },
+    {
+      "epoch": 15.933640177114892,
+      "grad_norm": 0.3763796389102936,
+      "learning_rate": 0.00040903876420868545,
+      "loss": 3.3066,
+      "step": 54700
+    },
+    {
+      "epoch": 15.948205546492659,
+      "grad_norm": 0.36403748393058777,
+      "learning_rate": 0.00040886388807927714,
+      "loss": 3.3197,
+      "step": 54750
+    },
+    {
+      "epoch": 15.962770915870426,
+      "grad_norm": 0.38384634256362915,
+      "learning_rate": 0.00040868901194986883,
+      "loss": 3.3177,
+      "step": 54800
+    },
+    {
+      "epoch": 15.977336285248194,
+      "grad_norm": 0.38456591963768005,
+      "learning_rate": 0.00040851413582046047,
+      "loss": 3.3092,
+      "step": 54850
+    },
+    {
+      "epoch": 15.991901654625961,
+      "grad_norm": 0.36568573117256165,
+      "learning_rate": 0.00040833925969105216,
+      "loss": 3.308,
+      "step": 54900
+    },
+    {
+      "epoch": 16.006408762526217,
+      "grad_norm": 0.3781602680683136,
+      "learning_rate": 0.0004081643835616438,
+      "loss": 3.2568,
+      "step": 54950
+    },
+    {
+      "epoch": 16.020974131903984,
+      "grad_norm": 0.3532155454158783,
+      "learning_rate": 0.0004079895074322355,
+      "loss": 3.2009,
+      "step": 55000
+    },
+    {
+      "epoch": 16.020974131903984,
+      "eval_accuracy": 0.3720104237269703,
+      "eval_loss": 3.548476219177246,
+      "eval_runtime": 180.062,
+      "eval_samples_per_second": 92.44,
+      "eval_steps_per_second": 5.781,
+      "step": 55000
+    },
+    {
+      "epoch": 16.035539501281754,
+      "grad_norm": 0.37421002984046936,
+      "learning_rate": 0.0004078146313028271,
+      "loss": 3.1894,
+      "step": 55050
+    },
+    {
+      "epoch": 16.05010487065952,
+      "grad_norm": 0.38773152232170105,
+      "learning_rate": 0.0004076397551734188,
+      "loss": 3.2052,
+      "step": 55100
+    },
+    {
+      "epoch": 16.064670240037287,
+      "grad_norm": 0.40175601840019226,
+      "learning_rate": 0.00040746487904401045,
+      "loss": 3.2037,
+      "step": 55150
+    },
+    {
+      "epoch": 16.079235609415054,
+      "grad_norm": 0.4081617593765259,
+      "learning_rate": 0.0004072900029146021,
+      "loss": 3.2144,
+      "step": 55200
+    },
+    {
+      "epoch": 16.09380097879282,
+      "grad_norm": 0.40000903606414795,
+      "learning_rate": 0.0004071151267851938,
+      "loss": 3.2147,
+      "step": 55250
+    },
+    {
+      "epoch": 16.10836634817059,
+      "grad_norm": 0.3925228416919708,
+      "learning_rate": 0.00040694025065578546,
+      "loss": 3.2222,
+      "step": 55300
+    },
+    {
+      "epoch": 16.122931717548358,
+      "grad_norm": 0.3875502943992615,
+      "learning_rate": 0.00040676537452637716,
+      "loss": 3.2168,
+      "step": 55350
+    },
+    {
+      "epoch": 16.137497086926125,
+      "grad_norm": 0.3993126153945923,
+      "learning_rate": 0.0004065904983969688,
+      "loss": 3.2285,
+      "step": 55400
+    },
+    {
+      "epoch": 16.15206245630389,
+      "grad_norm": 0.3767448365688324,
+      "learning_rate": 0.00040641562226756043,
+      "loss": 3.2361,
+      "step": 55450
+    },
+    {
+      "epoch": 16.16662782568166,
+      "grad_norm": 0.3723169267177582,
+      "learning_rate": 0.0004062407461381521,
+      "loss": 3.2316,
+      "step": 55500
+    },
+    {
+      "epoch": 16.181193195059425,
+      "grad_norm": 0.3401069939136505,
+      "learning_rate": 0.00040606587000874375,
+      "loss": 3.2341,
+      "step": 55550
+    },
+    {
+      "epoch": 16.195758564437195,
+      "grad_norm": 0.3760260343551636,
+      "learning_rate": 0.00040589099387933544,
+      "loss": 3.2417,
+      "step": 55600
+    },
+    {
+      "epoch": 16.210323933814962,
+      "grad_norm": 0.39141103625297546,
+      "learning_rate": 0.0004057161177499271,
+      "loss": 3.2435,
+      "step": 55650
+    },
+    {
+      "epoch": 16.22488930319273,
+      "grad_norm": 0.36476460099220276,
+      "learning_rate": 0.0004055412416205187,
+      "loss": 3.2431,
+      "step": 55700
+    },
+    {
+      "epoch": 16.239454672570496,
+      "grad_norm": 0.391431599855423,
+      "learning_rate": 0.0004053663654911104,
+      "loss": 3.2551,
+      "step": 55750
+    },
+    {
+      "epoch": 16.254020041948262,
+      "grad_norm": 0.3892253041267395,
+      "learning_rate": 0.0004051914893617021,
+      "loss": 3.2572,
+      "step": 55800
+    },
+    {
+      "epoch": 16.268585411326033,
+      "grad_norm": 0.39836394786834717,
+      "learning_rate": 0.0004050166132322938,
+      "loss": 3.2445,
+      "step": 55850
+    },
+    {
+      "epoch": 16.2831507807038,
+      "grad_norm": 0.4005641043186188,
+      "learning_rate": 0.0004048417371028854,
+      "loss": 3.2473,
+      "step": 55900
+    },
+    {
+      "epoch": 16.297716150081566,
+      "grad_norm": 0.35440683364868164,
+      "learning_rate": 0.0004046668609734771,
+      "loss": 3.2538,
+      "step": 55950
+    },
+    {
+      "epoch": 16.312281519459333,
+      "grad_norm": 0.37304195761680603,
+      "learning_rate": 0.00040449198484406875,
+      "loss": 3.2631,
+      "step": 56000
+    },
+    {
+      "epoch": 16.312281519459333,
+      "eval_accuracy": 0.3721091822500366,
+      "eval_loss": 3.550828456878662,
+      "eval_runtime": 180.285,
+      "eval_samples_per_second": 92.326,
+      "eval_steps_per_second": 5.774,
+      "step": 56000
+    },
+    {
+      "epoch": 16.3268468888371,
+      "grad_norm": 0.4370158612728119,
+      "learning_rate": 0.0004043171087146604,
+      "loss": 3.2598,
+      "step": 56050
+    },
+    {
+      "epoch": 16.34141225821487,
+      "grad_norm": 0.40278568863868713,
+      "learning_rate": 0.0004041422325852521,
+      "loss": 3.2515,
+      "step": 56100
+    },
+    {
+      "epoch": 16.355977627592637,
+      "grad_norm": 0.36991235613822937,
+      "learning_rate": 0.0004039673564558437,
+      "loss": 3.2632,
+      "step": 56150
+    },
+    {
+      "epoch": 16.370542996970403,
+      "grad_norm": 0.35354509949684143,
+      "learning_rate": 0.0004037924803264354,
+      "loss": 3.265,
+      "step": 56200
+    },
+    {
+      "epoch": 16.38510836634817,
+      "grad_norm": 0.3710688650608063,
+      "learning_rate": 0.00040361760419702704,
+      "loss": 3.2659,
+      "step": 56250
+    },
+    {
+      "epoch": 16.399673735725937,
+      "grad_norm": 0.3730112910270691,
+      "learning_rate": 0.00040344272806761873,
+      "loss": 3.2531,
+      "step": 56300
+    },
+    {
+      "epoch": 16.414239105103704,
+      "grad_norm": 0.4005134403705597,
+      "learning_rate": 0.0004032678519382104,
+      "loss": 3.2692,
+      "step": 56350
+    },
+    {
+      "epoch": 16.428804474481474,
+      "grad_norm": 0.3491421937942505,
+      "learning_rate": 0.00040309297580880206,
+      "loss": 3.2599,
+      "step": 56400
+    },
+    {
+      "epoch": 16.44336984385924,
+      "grad_norm": 0.35259345173835754,
+      "learning_rate": 0.00040291809967939375,
+      "loss": 3.2715,
+      "step": 56450
+    },
+    {
+      "epoch": 16.457935213237008,
+      "grad_norm": 0.3865564167499542,
+      "learning_rate": 0.0004027432235499854,
+      "loss": 3.2702,
+      "step": 56500
+    },
+    {
+      "epoch": 16.472500582614774,
+      "grad_norm": 0.3678795099258423,
+      "learning_rate": 0.0004025683474205771,
+      "loss": 3.2733,
+      "step": 56550
+    },
+    {
+      "epoch": 16.48706595199254,
+      "grad_norm": 0.36398905515670776,
+      "learning_rate": 0.0004023934712911687,
+      "loss": 3.2619,
+      "step": 56600
+    },
+    {
+      "epoch": 16.50163132137031,
+      "grad_norm": 0.3686988949775696,
+      "learning_rate": 0.00040221859516176035,
+      "loss": 3.2789,
+      "step": 56650
+    },
+    {
+      "epoch": 16.516196690748078,
+      "grad_norm": 0.3928549587726593,
+      "learning_rate": 0.00040204371903235204,
+      "loss": 3.2703,
+      "step": 56700
+    },
+    {
+      "epoch": 16.530762060125845,
+      "grad_norm": 0.38307324051856995,
+      "learning_rate": 0.0004018688429029437,
+      "loss": 3.2629,
+      "step": 56750
+    },
+    {
+      "epoch": 16.54532742950361,
+      "grad_norm": 0.3929668366909027,
+      "learning_rate": 0.0004016939667735354,
+      "loss": 3.2684,
+      "step": 56800
+    },
+    {
+      "epoch": 16.55989279888138,
+      "grad_norm": 0.36345812678337097,
+      "learning_rate": 0.00040151909064412705,
+      "loss": 3.2756,
+      "step": 56850
+    },
+    {
+      "epoch": 16.57445816825915,
+      "grad_norm": 0.3730520009994507,
+      "learning_rate": 0.0004013442145147187,
+      "loss": 3.2684,
+      "step": 56900
+    },
+    {
+      "epoch": 16.589023537636916,
+      "grad_norm": 0.3729363977909088,
+      "learning_rate": 0.0004011693383853104,
+      "loss": 3.2854,
+      "step": 56950
+    },
+    {
+      "epoch": 16.603588907014682,
+      "grad_norm": 0.37262773513793945,
+      "learning_rate": 0.000400994462255902,
+      "loss": 3.2698,
+      "step": 57000
+    },
+    {
+      "epoch": 16.603588907014682,
+      "eval_accuracy": 0.3724431976834072,
+      "eval_loss": 3.54331111907959,
+      "eval_runtime": 180.2789,
+      "eval_samples_per_second": 92.329,
+      "eval_steps_per_second": 5.774,
+      "step": 57000
+    },
+    {
+      "epoch": 16.61815427639245,
+      "grad_norm": 0.3967365026473999,
+      "learning_rate": 0.0004008195861264937,
+      "loss": 3.2847,
+      "step": 57050
+    },
+    {
+      "epoch": 16.632719645770216,
+      "grad_norm": 0.3768179416656494,
+      "learning_rate": 0.00040064470999708534,
+      "loss": 3.2724,
+      "step": 57100
+    },
+    {
+      "epoch": 16.647285015147983,
+      "grad_norm": 0.3650953769683838,
+      "learning_rate": 0.00040046983386767703,
+      "loss": 3.2813,
+      "step": 57150
+    },
+    {
+      "epoch": 16.661850384525753,
+      "grad_norm": 0.35720765590667725,
+      "learning_rate": 0.00040029495773826867,
+      "loss": 3.2817,
+      "step": 57200
+    },
+    {
+      "epoch": 16.67641575390352,
+      "grad_norm": 0.39140841364860535,
+      "learning_rate": 0.0004001200816088603,
+      "loss": 3.2895,
+      "step": 57250
+    },
+    {
+      "epoch": 16.690981123281286,
+      "grad_norm": 0.3720303475856781,
+      "learning_rate": 0.00039994520547945205,
+      "loss": 3.2812,
+      "step": 57300
+    },
+    {
+      "epoch": 16.705546492659053,
+      "grad_norm": 0.37557291984558105,
+      "learning_rate": 0.0003997703293500437,
+      "loss": 3.2861,
+      "step": 57350
+    },
+    {
+      "epoch": 16.72011186203682,
+      "grad_norm": 0.36764585971832275,
+      "learning_rate": 0.0003995954532206354,
+      "loss": 3.2853,
+      "step": 57400
+    },
+    {
+      "epoch": 16.73467723141459,
+      "grad_norm": 0.38060253858566284,
+      "learning_rate": 0.000399420577091227,
+      "loss": 3.2809,
+      "step": 57450
+    },
+    {
+      "epoch": 16.749242600792357,
+      "grad_norm": 0.3591197729110718,
+      "learning_rate": 0.00039924570096181865,
+      "loss": 3.2897,
+      "step": 57500
+    },
+    {
+      "epoch": 16.763807970170124,
+      "grad_norm": 0.3743264079093933,
+      "learning_rate": 0.00039907082483241034,
+      "loss": 3.2838,
+      "step": 57550
+    },
+    {
+      "epoch": 16.77837333954789,
+      "grad_norm": 0.3858964145183563,
+      "learning_rate": 0.000398895948703002,
+      "loss": 3.2924,
+      "step": 57600
+    },
+    {
+      "epoch": 16.792938708925657,
+      "grad_norm": 0.372662216424942,
+      "learning_rate": 0.00039872107257359367,
+      "loss": 3.2809,
+      "step": 57650
+    },
+    {
+      "epoch": 16.807504078303424,
+      "grad_norm": 0.3536543846130371,
+      "learning_rate": 0.0003985461964441853,
+      "loss": 3.2989,
+      "step": 57700
+    },
+    {
+      "epoch": 16.822069447681194,
+      "grad_norm": 0.38918742537498474,
+      "learning_rate": 0.00039837132031477694,
+      "loss": 3.2845,
+      "step": 57750
+    },
+    {
+      "epoch": 16.83663481705896,
+      "grad_norm": 0.3828890323638916,
+      "learning_rate": 0.0003981964441853687,
+      "loss": 3.2902,
+      "step": 57800
+    },
+    {
+      "epoch": 16.851200186436728,
+      "grad_norm": 0.36972659826278687,
+      "learning_rate": 0.0003980215680559603,
+      "loss": 3.2828,
+      "step": 57850
+    },
+    {
+      "epoch": 16.865765555814495,
+      "grad_norm": 0.36914464831352234,
+      "learning_rate": 0.000397846691926552,
+      "loss": 3.2819,
+      "step": 57900
+    },
+    {
+      "epoch": 16.88033092519226,
+      "grad_norm": 0.41757625341415405,
+      "learning_rate": 0.00039767181579714365,
+      "loss": 3.2939,
+      "step": 57950
+    },
+    {
+      "epoch": 16.89489629457003,
+      "grad_norm": 0.3668311536312103,
+      "learning_rate": 0.00039749693966773534,
+      "loss": 3.2943,
+      "step": 58000
+    },
+    {
+      "epoch": 16.89489629457003,
+      "eval_accuracy": 0.37297237876950406,
+      "eval_loss": 3.535468101501465,
+      "eval_runtime": 180.2238,
+      "eval_samples_per_second": 92.357,
+      "eval_steps_per_second": 5.776,
+      "step": 58000
+    },
+    {
+      "epoch": 16.9094616639478,
+      "grad_norm": 0.3850747346878052,
+      "learning_rate": 0.00039732206353832697,
+      "loss": 3.3026,
+      "step": 58050
+    },
+    {
+      "epoch": 16.924027033325565,
+      "grad_norm": 0.3471890091896057,
+      "learning_rate": 0.0003971471874089186,
+      "loss": 3.3096,
+      "step": 58100
+    },
+    {
+      "epoch": 16.938592402703332,
+      "grad_norm": 0.371502548456192,
+      "learning_rate": 0.0003969723112795103,
+      "loss": 3.2999,
+      "step": 58150
+    },
+    {
+      "epoch": 16.9531577720811,
+      "grad_norm": 0.36119985580444336,
+      "learning_rate": 0.00039679743515010194,
+      "loss": 3.2996,
+      "step": 58200
+    },
+    {
+      "epoch": 16.96772314145887,
+      "grad_norm": 0.38665369153022766,
+      "learning_rate": 0.0003966225590206937,
+      "loss": 3.2906,
+      "step": 58250
+    },
+    {
+      "epoch": 16.982288510836636,
+      "grad_norm": 0.3815176784992218,
+      "learning_rate": 0.0003964476828912853,
+      "loss": 3.2889,
+      "step": 58300
+    },
+    {
+      "epoch": 16.996853880214402,
+      "grad_norm": 0.3887826204299927,
+      "learning_rate": 0.00039627280676187695,
+      "loss": 3.2914,
+      "step": 58350
+    },
+    {
+      "epoch": 17.01136098811466,
+      "grad_norm": 0.4290904402732849,
+      "learning_rate": 0.00039609793063246864,
+      "loss": 3.2189,
+      "step": 58400
+    },
+    {
+      "epoch": 17.025926357492427,
+      "grad_norm": 0.37529537081718445,
+      "learning_rate": 0.0003959230545030603,
+      "loss": 3.1944,
+      "step": 58450
+    },
+    {
+      "epoch": 17.040491726870194,
+      "grad_norm": 0.364271879196167,
+      "learning_rate": 0.00039574817837365197,
+      "loss": 3.1825,
+      "step": 58500
+    },
+    {
+      "epoch": 17.05505709624796,
+      "grad_norm": 0.37614959478378296,
+      "learning_rate": 0.0003955733022442436,
+      "loss": 3.2019,
+      "step": 58550
+    },
+    {
+      "epoch": 17.069622465625727,
+      "grad_norm": 0.3827800750732422,
+      "learning_rate": 0.0003953984261148353,
+      "loss": 3.211,
+      "step": 58600
+    },
+    {
+      "epoch": 17.084187835003497,
+      "grad_norm": 0.38689175248146057,
+      "learning_rate": 0.00039522354998542693,
+      "loss": 3.2085,
+      "step": 58650
+    },
+    {
+      "epoch": 17.098753204381264,
+      "grad_norm": 0.4140167534351349,
+      "learning_rate": 0.00039504867385601857,
+      "loss": 3.2028,
+      "step": 58700
+    },
+    {
+      "epoch": 17.11331857375903,
+      "grad_norm": 0.396028608083725,
+      "learning_rate": 0.0003948737977266103,
+      "loss": 3.2235,
+      "step": 58750
+    },
+    {
+      "epoch": 17.127883943136798,
+      "grad_norm": 0.37700241804122925,
+      "learning_rate": 0.00039469892159720195,
+      "loss": 3.192,
+      "step": 58800
+    },
+    {
+      "epoch": 17.142449312514564,
+      "grad_norm": 0.41701769828796387,
+      "learning_rate": 0.00039452404546779364,
+      "loss": 3.2142,
+      "step": 58850
+    },
+    {
+      "epoch": 17.15701468189233,
+      "grad_norm": 0.3866487741470337,
+      "learning_rate": 0.0003943491693383853,
+      "loss": 3.2286,
+      "step": 58900
+    },
+    {
+      "epoch": 17.1715800512701,
+      "grad_norm": 0.4011995494365692,
+      "learning_rate": 0.0003941742932089769,
+      "loss": 3.2263,
+      "step": 58950
+    },
+    {
+      "epoch": 17.18614542064787,
+      "grad_norm": 0.37379980087280273,
+      "learning_rate": 0.0003939994170795686,
+      "loss": 3.2083,
+      "step": 59000
+    },
+    {
+      "epoch": 17.18614542064787,
+      "eval_accuracy": 0.37220688264607005,
+      "eval_loss": 3.5509185791015625,
+      "eval_runtime": 180.2102,
+      "eval_samples_per_second": 92.364,
+      "eval_steps_per_second": 5.777,
+      "step": 59000
+    },
+    {
+      "epoch": 17.200710790025635,
+      "grad_norm": 0.38473978638648987,
+      "learning_rate": 0.00039382454095016024,
+      "loss": 3.224,
+      "step": 59050
+    },
+    {
+      "epoch": 17.215276159403402,
+      "grad_norm": 0.3696572184562683,
+      "learning_rate": 0.00039364966482075193,
+      "loss": 3.2346,
+      "step": 59100
+    },
+    {
+      "epoch": 17.22984152878117,
+      "grad_norm": 0.3958011269569397,
+      "learning_rate": 0.00039347478869134356,
+      "loss": 3.2397,
+      "step": 59150
+    },
+    {
+      "epoch": 17.24440689815894,
+      "grad_norm": 0.38348454236984253,
+      "learning_rate": 0.0003932999125619353,
+      "loss": 3.2252,
+      "step": 59200
+    },
+    {
+      "epoch": 17.258972267536706,
+      "grad_norm": 0.3725033402442932,
+      "learning_rate": 0.00039312503643252695,
+      "loss": 3.2384,
+      "step": 59250
+    },
+    {
+      "epoch": 17.273537636914472,
+      "grad_norm": 0.4087284803390503,
+      "learning_rate": 0.0003929501603031186,
+      "loss": 3.2321,
+      "step": 59300
+    },
+    {
+      "epoch": 17.28810300629224,
+      "grad_norm": 0.3712259531021118,
+      "learning_rate": 0.00039277528417371027,
+      "loss": 3.2538,
+      "step": 59350
+    },
+    {
+      "epoch": 17.302668375670006,
+      "grad_norm": 0.38870859146118164,
+      "learning_rate": 0.0003926004080443019,
+      "loss": 3.2529,
+      "step": 59400
+    },
+    {
+      "epoch": 17.317233745047773,
+      "grad_norm": 0.4209974706172943,
+      "learning_rate": 0.0003924255319148936,
+      "loss": 3.2472,
+      "step": 59450
+    },
+    {
+      "epoch": 17.331799114425543,
+      "grad_norm": 0.3777424097061157,
+      "learning_rate": 0.00039225065578548523,
+      "loss": 3.2365,
+      "step": 59500
+    },
+    {
+      "epoch": 17.34636448380331,
+      "grad_norm": 0.37616270780563354,
+      "learning_rate": 0.00039207577965607687,
+      "loss": 3.2432,
+      "step": 59550
+    },
+    {
+      "epoch": 17.360929853181077,
+      "grad_norm": 0.4067479968070984,
+      "learning_rate": 0.00039190090352666856,
+      "loss": 3.2597,
+      "step": 59600
+    },
+    {
+      "epoch": 17.375495222558843,
+      "grad_norm": 0.3870598375797272,
+      "learning_rate": 0.0003917260273972602,
+      "loss": 3.2533,
+      "step": 59650
+    },
+    {
+      "epoch": 17.39006059193661,
+      "grad_norm": 0.39382418990135193,
+      "learning_rate": 0.00039155115126785194,
+      "loss": 3.2419,
+      "step": 59700
+    },
+    {
+      "epoch": 17.40462596131438,
+      "grad_norm": 0.36751487851142883,
+      "learning_rate": 0.0003913762751384436,
+      "loss": 3.233,
+      "step": 59750
+    },
+    {
+      "epoch": 17.419191330692147,
+      "grad_norm": 0.3841138184070587,
+      "learning_rate": 0.00039120139900903527,
+      "loss": 3.2467,
+      "step": 59800
+    },
+    {
+      "epoch": 17.433756700069914,
+      "grad_norm": 0.37457749247550964,
+      "learning_rate": 0.0003910265228796269,
+      "loss": 3.2391,
+      "step": 59850
+    },
+    {
+      "epoch": 17.44832206944768,
+      "grad_norm": 0.3810558021068573,
+      "learning_rate": 0.00039085164675021854,
+      "loss": 3.2447,
+      "step": 59900
+    },
+    {
+      "epoch": 17.462887438825447,
+      "grad_norm": 0.3752453625202179,
+      "learning_rate": 0.00039067677062081023,
+      "loss": 3.2476,
+      "step": 59950
+    },
+    {
+      "epoch": 17.477452808203218,
+      "grad_norm": 0.3891676664352417,
+      "learning_rate": 0.00039050189449140187,
+      "loss": 3.2538,
+      "step": 60000
+    },
+    {
+      "epoch": 17.477452808203218,
+      "eval_accuracy": 0.3728713864227018,
+      "eval_loss": 3.544705629348755,
+      "eval_runtime": 180.1568,
+      "eval_samples_per_second": 92.392,
+      "eval_steps_per_second": 5.778,
+      "step": 60000
+    },
+    {
+      "epoch": 17.492018177580984,
+      "grad_norm": 0.38783255219459534,
+      "learning_rate": 0.00039032701836199356,
+      "loss": 3.2714,
+      "step": 60050
+    },
+    {
+      "epoch": 17.50658354695875,
+      "grad_norm": 0.36609965562820435,
+      "learning_rate": 0.0003901521422325852,
+      "loss": 3.2597,
+      "step": 60100
+    },
+    {
+      "epoch": 17.521148916336518,
+      "grad_norm": 0.38936126232147217,
+      "learning_rate": 0.00038997726610317683,
+      "loss": 3.2579,
+      "step": 60150
+    },
+    {
+      "epoch": 17.535714285714285,
+      "grad_norm": 0.3598592281341553,
+      "learning_rate": 0.0003898023899737686,
+      "loss": 3.2579,
+      "step": 60200
+    },
+    {
+      "epoch": 17.55027965509205,
+      "grad_norm": 0.37373456358909607,
+      "learning_rate": 0.0003896275138443602,
+      "loss": 3.2591,
+      "step": 60250
+    },
+    {
+      "epoch": 17.56484502446982,
+      "grad_norm": 0.3937729299068451,
+      "learning_rate": 0.0003894526377149519,
+      "loss": 3.2699,
+      "step": 60300
+    },
+    {
+      "epoch": 17.57941039384759,
+      "grad_norm": 0.434922993183136,
+      "learning_rate": 0.00038927776158554354,
+      "loss": 3.2521,
+      "step": 60350
+    },
+    {
+      "epoch": 17.593975763225355,
+      "grad_norm": 0.3775573670864105,
+      "learning_rate": 0.0003891028854561352,
+      "loss": 3.2672,
+      "step": 60400
+    },
+    {
+      "epoch": 17.608541132603122,
+      "grad_norm": 0.386683851480484,
+      "learning_rate": 0.00038892800932672686,
+      "loss": 3.2778,
+      "step": 60450
+    },
+    {
+      "epoch": 17.62310650198089,
+      "grad_norm": 0.3631393015384674,
+      "learning_rate": 0.0003887531331973185,
+      "loss": 3.2626,
+      "step": 60500
+    },
+    {
+      "epoch": 17.63767187135866,
+      "grad_norm": 0.3855550289154053,
+      "learning_rate": 0.0003885782570679102,
+      "loss": 3.2725,
+      "step": 60550
+    },
+    {
+      "epoch": 17.652237240736426,
+      "grad_norm": 0.41759514808654785,
+      "learning_rate": 0.0003884033809385018,
+      "loss": 3.2726,
+      "step": 60600
+    },
+    {
+      "epoch": 17.666802610114193,
+      "grad_norm": 0.37082067131996155,
+      "learning_rate": 0.00038822850480909357,
+      "loss": 3.2743,
+      "step": 60650
+    },
+    {
+      "epoch": 17.68136797949196,
+      "grad_norm": 0.3791321814060211,
+      "learning_rate": 0.0003880536286796852,
+      "loss": 3.2711,
+      "step": 60700
+    },
+    {
+      "epoch": 17.695933348869726,
+      "grad_norm": 0.3907877504825592,
+      "learning_rate": 0.00038787875255027684,
+      "loss": 3.2657,
+      "step": 60750
+    },
+    {
+      "epoch": 17.710498718247496,
+      "grad_norm": 0.3546347916126251,
+      "learning_rate": 0.00038770387642086853,
+      "loss": 3.2659,
+      "step": 60800
+    },
+    {
+      "epoch": 17.725064087625263,
+      "grad_norm": 0.393719881772995,
+      "learning_rate": 0.00038752900029146017,
+      "loss": 3.277,
+      "step": 60850
+    },
+    {
+      "epoch": 17.73962945700303,
+      "grad_norm": 0.3926842510700226,
+      "learning_rate": 0.00038735412416205186,
+      "loss": 3.2704,
+      "step": 60900
+    },
+    {
+      "epoch": 17.754194826380797,
+      "grad_norm": 0.3844298720359802,
+      "learning_rate": 0.0003871792480326435,
+      "loss": 3.271,
+      "step": 60950
+    },
+    {
+      "epoch": 17.768760195758563,
+      "grad_norm": 0.3942425549030304,
+      "learning_rate": 0.00038700437190323513,
+      "loss": 3.2726,
+      "step": 61000
+    },
+    {
+      "epoch": 17.768760195758563,
+      "eval_accuracy": 0.37308395238663494,
+      "eval_loss": 3.5373306274414062,
+      "eval_runtime": 180.211,
+      "eval_samples_per_second": 92.364,
+      "eval_steps_per_second": 5.777,
+      "step": 61000
+    },
+    {
+      "epoch": 17.78332556513633,
+      "grad_norm": 0.416614294052124,
+      "learning_rate": 0.0003868294957738268,
+      "loss": 3.2786,
+      "step": 61050
+    },
+    {
+      "epoch": 17.7978909345141,
+      "grad_norm": 0.3919004201889038,
+      "learning_rate": 0.00038665461964441846,
+      "loss": 3.2755,
+      "step": 61100
+    },
+    {
+      "epoch": 17.812456303891867,
+      "grad_norm": 0.3852967321872711,
+      "learning_rate": 0.0003864797435150102,
+      "loss": 3.2774,
+      "step": 61150
+    },
+    {
+      "epoch": 17.827021673269634,
+      "grad_norm": 0.4006671607494354,
+      "learning_rate": 0.00038630486738560184,
+      "loss": 3.2849,
+      "step": 61200
+    },
+    {
+      "epoch": 17.8415870426474,
+      "grad_norm": 0.3728589713573456,
+      "learning_rate": 0.00038612999125619353,
+      "loss": 3.2795,
+      "step": 61250
+    },
+    {
+      "epoch": 17.856152412025168,
+      "grad_norm": 0.38556793332099915,
+      "learning_rate": 0.00038595511512678517,
+      "loss": 3.2734,
+      "step": 61300
+    },
+    {
+      "epoch": 17.870717781402938,
+      "grad_norm": 0.3805278241634369,
+      "learning_rate": 0.0003857802389973768,
+      "loss": 3.2765,
+      "step": 61350
+    },
+    {
+      "epoch": 17.885283150780705,
+      "grad_norm": 0.3954722583293915,
+      "learning_rate": 0.0003856053628679685,
+      "loss": 3.296,
+      "step": 61400
+    },
+    {
+      "epoch": 17.89984852015847,
+      "grad_norm": 0.37796393036842346,
+      "learning_rate": 0.00038543048673856013,
+      "loss": 3.283,
+      "step": 61450
+    },
+    {
+      "epoch": 17.914413889536238,
+      "grad_norm": 0.35778191685676575,
+      "learning_rate": 0.0003852556106091518,
+      "loss": 3.2872,
+      "step": 61500
+    },
+    {
+      "epoch": 17.928979258914005,
+      "grad_norm": 0.3738497197628021,
+      "learning_rate": 0.00038508073447974346,
+      "loss": 3.2867,
+      "step": 61550
+    },
+    {
+      "epoch": 17.943544628291775,
+      "grad_norm": 0.3807421624660492,
+      "learning_rate": 0.0003849058583503351,
+      "loss": 3.2681,
+      "step": 61600
+    },
+    {
+      "epoch": 17.958109997669542,
+      "grad_norm": 0.3870691657066345,
+      "learning_rate": 0.00038473098222092684,
+      "loss": 3.2799,
+      "step": 61650
+    },
+    {
+      "epoch": 17.97267536704731,
+      "grad_norm": 0.3574218153953552,
+      "learning_rate": 0.0003845561060915185,
+      "loss": 3.2848,
+      "step": 61700
+    },
+    {
+      "epoch": 17.987240736425075,
+      "grad_norm": 0.37810570001602173,
+      "learning_rate": 0.00038438122996211016,
+      "loss": 3.2866,
+      "step": 61750
+    },
+    {
+      "epoch": 18.001747844325333,
+      "grad_norm": 0.42944374680519104,
+      "learning_rate": 0.0003842063538327018,
+      "loss": 3.272,
+      "step": 61800
+    },
+    {
+      "epoch": 18.0163132137031,
+      "grad_norm": 0.3770248293876648,
+      "learning_rate": 0.0003840314777032935,
+      "loss": 3.1656,
+      "step": 61850
+    },
+    {
+      "epoch": 18.030878583080867,
+      "grad_norm": 0.38981911540031433,
+      "learning_rate": 0.0003838566015738851,
+      "loss": 3.1825,
+      "step": 61900
+    },
+    {
+      "epoch": 18.045443952458633,
+      "grad_norm": 0.41429778933525085,
+      "learning_rate": 0.00038368172544447676,
+      "loss": 3.185,
+      "step": 61950
+    },
+    {
+      "epoch": 18.0600093218364,
+      "grad_norm": 0.3987773060798645,
+      "learning_rate": 0.00038350684931506845,
+      "loss": 3.1734,
+      "step": 62000
+    },
+    {
+      "epoch": 18.0600093218364,
+      "eval_accuracy": 0.37259674367284124,
+      "eval_loss": 3.5511462688446045,
+      "eval_runtime": 180.2134,
+      "eval_samples_per_second": 92.363,
+      "eval_steps_per_second": 5.776,
+      "step": 62000
+    },
+    {
+      "epoch": 18.07457469121417,
+      "grad_norm": 0.3764216899871826,
+      "learning_rate": 0.0003833319731856601,
+      "loss": 3.1908,
+      "step": 62050
+    },
+    {
+      "epoch": 18.089140060591937,
+      "grad_norm": 0.5347649455070496,
+      "learning_rate": 0.00038315709705625183,
+      "loss": 3.1914,
+      "step": 62100
+    },
+    {
+      "epoch": 18.103705429969704,
+      "grad_norm": 0.3660869300365448,
+      "learning_rate": 0.00038298222092684347,
+      "loss": 3.2015,
+      "step": 62150
+    },
+    {
+      "epoch": 18.11827079934747,
+      "grad_norm": 0.3871309161186218,
+      "learning_rate": 0.0003828073447974351,
+      "loss": 3.1985,
+      "step": 62200
+    },
+    {
+      "epoch": 18.132836168725238,
+      "grad_norm": 0.41517186164855957,
+      "learning_rate": 0.0003826324686680268,
+      "loss": 3.2121,
+      "step": 62250
+    },
+    {
+      "epoch": 18.147401538103008,
+      "grad_norm": 0.3585100471973419,
+      "learning_rate": 0.00038245759253861843,
+      "loss": 3.2091,
+      "step": 62300
+    },
+    {
+      "epoch": 18.161966907480775,
+      "grad_norm": 0.3985573351383209,
+      "learning_rate": 0.0003822827164092101,
+      "loss": 3.2183,
+      "step": 62350
+    },
+    {
+      "epoch": 18.17653227685854,
+      "grad_norm": 0.40456002950668335,
+      "learning_rate": 0.00038210784027980176,
+      "loss": 3.2072,
+      "step": 62400
+    },
+    {
+      "epoch": 18.191097646236308,
+      "grad_norm": 0.3861040472984314,
+      "learning_rate": 0.0003819329641503934,
+      "loss": 3.2051,
+      "step": 62450
+    },
+    {
+      "epoch": 18.205663015614075,
+      "grad_norm": 0.4029352366924286,
+      "learning_rate": 0.0003817580880209851,
+      "loss": 3.213,
+      "step": 62500
+    },
+    {
+      "epoch": 18.22022838499184,
+      "grad_norm": 0.36406001448631287,
+      "learning_rate": 0.0003815832118915767,
+      "loss": 3.2182,
+      "step": 62550
+    },
+    {
+      "epoch": 18.234793754369612,
+      "grad_norm": 0.38128963112831116,
+      "learning_rate": 0.00038140833576216847,
+      "loss": 3.2251,
+      "step": 62600
+    },
+    {
+      "epoch": 18.24935912374738,
+      "grad_norm": 0.3720012307167053,
+      "learning_rate": 0.0003812334596327601,
+      "loss": 3.2148,
+      "step": 62650
+    },
+    {
+      "epoch": 18.263924493125145,
+      "grad_norm": 0.38644638657569885,
+      "learning_rate": 0.0003810585835033518,
+      "loss": 3.2211,
+      "step": 62700
+    },
+    {
+      "epoch": 18.278489862502912,
+      "grad_norm": 0.36919474601745605,
+      "learning_rate": 0.00038088370737394343,
+      "loss": 3.2195,
+      "step": 62750
+    },
+    {
+      "epoch": 18.29305523188068,
+      "grad_norm": 0.3620702028274536,
+      "learning_rate": 0.00038070883124453507,
+      "loss": 3.2203,
+      "step": 62800
+    },
+    {
+      "epoch": 18.30762060125845,
+      "grad_norm": 0.3722609281539917,
+      "learning_rate": 0.00038053395511512676,
+      "loss": 3.2249,
+      "step": 62850
+    },
+    {
+      "epoch": 18.322185970636216,
+      "grad_norm": 0.3975137770175934,
+      "learning_rate": 0.0003803590789857184,
+      "loss": 3.2353,
+      "step": 62900
+    },
+    {
+      "epoch": 18.336751340013983,
+      "grad_norm": 0.37322407960891724,
+      "learning_rate": 0.0003801842028563101,
+      "loss": 3.2471,
+      "step": 62950
+    },
+    {
+      "epoch": 18.35131670939175,
+      "grad_norm": 0.3992110788822174,
+      "learning_rate": 0.0003800093267269017,
+      "loss": 3.2302,
+      "step": 63000
+    },
+    {
+      "epoch": 18.35131670939175,
+      "eval_accuracy": 0.3729018369673139,
+      "eval_loss": 3.544276714324951,
+      "eval_runtime": 180.3309,
+      "eval_samples_per_second": 92.303,
+      "eval_steps_per_second": 5.773,
+      "step": 63000
+    },
+    {
+      "epoch": 18.365882078769516,
+      "grad_norm": 0.37217405438423157,
+      "learning_rate": 0.00037983445059749335,
+      "loss": 3.2365,
+      "step": 63050
+    },
+    {
+      "epoch": 18.380447448147287,
+      "grad_norm": 0.38584333658218384,
+      "learning_rate": 0.0003796595744680851,
+      "loss": 3.2501,
+      "step": 63100
+    },
+    {
+      "epoch": 18.395012817525053,
+      "grad_norm": 0.3655035197734833,
+      "learning_rate": 0.00037948469833867674,
+      "loss": 3.2413,
+      "step": 63150
+    },
+    {
+      "epoch": 18.40957818690282,
+      "grad_norm": 0.3870564103126526,
+      "learning_rate": 0.0003793098222092684,
+      "loss": 3.2297,
+      "step": 63200
+    },
+    {
+      "epoch": 18.424143556280587,
+      "grad_norm": 0.4136461615562439,
+      "learning_rate": 0.00037913494607986006,
+      "loss": 3.2377,
+      "step": 63250
+    },
+    {
+      "epoch": 18.438708925658354,
+      "grad_norm": 0.39346110820770264,
+      "learning_rate": 0.00037896006995045175,
+      "loss": 3.2505,
+      "step": 63300
+    },
+    {
+      "epoch": 18.45327429503612,
+      "grad_norm": 0.3884267508983612,
+      "learning_rate": 0.0003787851938210434,
+      "loss": 3.254,
+      "step": 63350
+    },
+    {
+      "epoch": 18.46783966441389,
+      "grad_norm": 0.3669024705886841,
+      "learning_rate": 0.000378610317691635,
+      "loss": 3.2379,
+      "step": 63400
+    },
+    {
+      "epoch": 18.482405033791657,
+      "grad_norm": 0.38779205083847046,
+      "learning_rate": 0.0003784354415622267,
+      "loss": 3.2469,
+      "step": 63450
+    },
+    {
+      "epoch": 18.496970403169424,
+      "grad_norm": 0.3925683796405792,
+      "learning_rate": 0.00037826056543281835,
+      "loss": 3.2475,
+      "step": 63500
+    },
+    {
+      "epoch": 18.51153577254719,
+      "grad_norm": 0.3939642906188965,
+      "learning_rate": 0.0003780856893034101,
+      "loss": 3.2455,
+      "step": 63550
+    },
+    {
+      "epoch": 18.526101141924958,
+      "grad_norm": 0.36159586906433105,
+      "learning_rate": 0.00037791081317400173,
+      "loss": 3.2528,
+      "step": 63600
+    },
+    {
+      "epoch": 18.540666511302728,
+      "grad_norm": 0.40784958004951477,
+      "learning_rate": 0.00037773593704459337,
+      "loss": 3.2448,
+      "step": 63650
+    },
+    {
+      "epoch": 18.555231880680495,
+      "grad_norm": 0.3873050808906555,
+      "learning_rate": 0.00037756106091518506,
+      "loss": 3.2595,
+      "step": 63700
+    },
+    {
+      "epoch": 18.56979725005826,
+      "grad_norm": 0.4017407298088074,
+      "learning_rate": 0.0003773861847857767,
+      "loss": 3.2543,
+      "step": 63750
+    },
+    {
+      "epoch": 18.58436261943603,
+      "grad_norm": 0.4253888428211212,
+      "learning_rate": 0.0003772113086563684,
+      "loss": 3.2584,
+      "step": 63800
+    },
+    {
+      "epoch": 18.598927988813795,
+      "grad_norm": 0.3869839012622833,
+      "learning_rate": 0.00037703643252696,
+      "loss": 3.2514,
+      "step": 63850
+    },
+    {
+      "epoch": 18.613493358191565,
+      "grad_norm": 0.4242589771747589,
+      "learning_rate": 0.0003768615563975517,
+      "loss": 3.2537,
+      "step": 63900
+    },
+    {
+      "epoch": 18.628058727569332,
+      "grad_norm": 0.4035606384277344,
+      "learning_rate": 0.00037668668026814335,
+      "loss": 3.2601,
+      "step": 63950
+    },
+    {
+      "epoch": 18.6426240969471,
+      "grad_norm": 0.3770582675933838,
+      "learning_rate": 0.000376511804138735,
+      "loss": 3.2384,
+      "step": 64000
+    },
+    {
+      "epoch": 18.6426240969471,
+      "eval_accuracy": 0.37323761594573923,
+      "eval_loss": 3.538996934890747,
+      "eval_runtime": 180.1189,
+      "eval_samples_per_second": 92.411,
+      "eval_steps_per_second": 5.78,
+      "step": 64000
+    },
+    {
+      "epoch": 18.657189466324866,
+      "grad_norm": 0.3780357837677002,
+      "learning_rate": 0.00037633692800932673,
+      "loss": 3.2646,
+      "step": 64050
+    },
+    {
+      "epoch": 18.671754835702632,
+      "grad_norm": 0.4115695655345917,
+      "learning_rate": 0.00037616205187991837,
+      "loss": 3.2593,
+      "step": 64100
+    },
+    {
+      "epoch": 18.6863202050804,
+      "grad_norm": 0.37910208106040955,
+      "learning_rate": 0.00037598717575051006,
+      "loss": 3.2559,
+      "step": 64150
+    },
+    {
+      "epoch": 18.70088557445817,
+      "grad_norm": 0.40785324573516846,
+      "learning_rate": 0.0003758122996211017,
+      "loss": 3.2566,
+      "step": 64200
+    },
+    {
+      "epoch": 18.715450943835936,
+      "grad_norm": 0.3921259343624115,
+      "learning_rate": 0.00037563742349169333,
+      "loss": 3.2651,
+      "step": 64250
+    },
+    {
+      "epoch": 18.730016313213703,
+      "grad_norm": 0.3851292133331299,
+      "learning_rate": 0.000375462547362285,
+      "loss": 3.2619,
+      "step": 64300
+    },
+    {
+      "epoch": 18.74458168259147,
+      "grad_norm": 0.39476004242897034,
+      "learning_rate": 0.00037528767123287665,
+      "loss": 3.259,
+      "step": 64350
+    },
+    {
+      "epoch": 18.759147051969236,
+      "grad_norm": 0.3848125636577606,
+      "learning_rate": 0.00037511279510346834,
+      "loss": 3.2541,
+      "step": 64400
+    },
+    {
+      "epoch": 18.773712421347007,
+      "grad_norm": 0.3958471417427063,
+      "learning_rate": 0.00037493791897406,
+      "loss": 3.2738,
+      "step": 64450
+    },
+    {
+      "epoch": 18.788277790724774,
+      "grad_norm": 0.3779659867286682,
+      "learning_rate": 0.0003747630428446516,
+      "loss": 3.2601,
+      "step": 64500
+    },
+    {
+      "epoch": 18.80284316010254,
+      "grad_norm": 0.37562546133995056,
+      "learning_rate": 0.00037458816671524336,
+      "loss": 3.2631,
+      "step": 64550
+    },
+    {
+      "epoch": 18.817408529480307,
+      "grad_norm": 0.3678816556930542,
+      "learning_rate": 0.000374413290585835,
+      "loss": 3.2564,
+      "step": 64600
+    },
+    {
+      "epoch": 18.831973898858074,
+      "grad_norm": 0.3767731487751007,
+      "learning_rate": 0.0003742384144564267,
+      "loss": 3.2594,
+      "step": 64650
+    },
+    {
+      "epoch": 18.846539268235844,
+      "grad_norm": 0.3676002025604248,
+      "learning_rate": 0.0003740635383270183,
+      "loss": 3.2694,
+      "step": 64700
+    },
+    {
+      "epoch": 18.86110463761361,
+      "grad_norm": 0.3892149329185486,
+      "learning_rate": 0.00037388866219761,
+      "loss": 3.2799,
+      "step": 64750
+    },
+    {
+      "epoch": 18.875670006991378,
+      "grad_norm": 0.39518481492996216,
+      "learning_rate": 0.00037371378606820165,
+      "loss": 3.2705,
+      "step": 64800
+    },
+    {
+      "epoch": 18.890235376369144,
+      "grad_norm": 0.416162371635437,
+      "learning_rate": 0.0003735389099387933,
+      "loss": 3.2622,
+      "step": 64850
+    },
+    {
+      "epoch": 18.90480074574691,
+      "grad_norm": 0.3584899306297302,
+      "learning_rate": 0.000373364033809385,
+      "loss": 3.2664,
+      "step": 64900
+    },
+    {
+      "epoch": 18.919366115124678,
+      "grad_norm": 0.3977811634540558,
+      "learning_rate": 0.0003731891576799766,
+      "loss": 3.2779,
+      "step": 64950
+    },
+    {
+      "epoch": 18.93393148450245,
+      "grad_norm": 0.37734439969062805,
+      "learning_rate": 0.00037301428155056836,
+      "loss": 3.2662,
+      "step": 65000
+    },
+    {
+      "epoch": 18.93393148450245,
+      "eval_accuracy": 0.3737098933114027,
+      "eval_loss": 3.5310399532318115,
+      "eval_runtime": 180.1554,
+      "eval_samples_per_second": 92.392,
+      "eval_steps_per_second": 5.778,
+      "step": 65000
+    },
+    {
+      "epoch": 18.948496853880215,
+      "grad_norm": 0.38468220829963684,
+      "learning_rate": 0.00037283940542116,
+      "loss": 3.2678,
+      "step": 65050
+    },
+    {
+      "epoch": 18.96306222325798,
+      "grad_norm": 0.40911436080932617,
+      "learning_rate": 0.00037266452929175163,
+      "loss": 3.28,
+      "step": 65100
+    },
+    {
+      "epoch": 18.97762759263575,
+      "grad_norm": 0.3881330192089081,
+      "learning_rate": 0.0003724896531623433,
+      "loss": 3.2711,
+      "step": 65150
+    },
+    {
+      "epoch": 18.992192962013515,
+      "grad_norm": 0.3778620958328247,
+      "learning_rate": 0.00037231477703293496,
+      "loss": 3.284,
+      "step": 65200
+    },
+    {
+      "epoch": 19.006700069913773,
+      "grad_norm": 0.421502023935318,
+      "learning_rate": 0.00037213990090352665,
+      "loss": 3.2223,
+      "step": 65250
+    },
+    {
+      "epoch": 19.02126543929154,
+      "grad_norm": 0.38426366448402405,
+      "learning_rate": 0.0003719650247741183,
+      "loss": 3.1653,
+      "step": 65300
+    },
+    {
+      "epoch": 19.035830808669306,
+      "grad_norm": 0.3845437169075012,
+      "learning_rate": 0.00037179014864471,
+      "loss": 3.186,
+      "step": 65350
+    },
+    {
+      "epoch": 19.050396178047077,
+      "grad_norm": 0.38343438506126404,
+      "learning_rate": 0.0003716152725153016,
+      "loss": 3.1675,
+      "step": 65400
+    },
+    {
+      "epoch": 19.064961547424844,
+      "grad_norm": 0.41841766238212585,
+      "learning_rate": 0.00037144039638589325,
+      "loss": 3.168,
+      "step": 65450
+    },
+    {
+      "epoch": 19.07952691680261,
+      "grad_norm": 0.4013581871986389,
+      "learning_rate": 0.000371265520256485,
+      "loss": 3.1991,
+      "step": 65500
+    },
+    {
+      "epoch": 19.094092286180377,
+      "grad_norm": 0.3751412630081177,
+      "learning_rate": 0.00037109064412707663,
+      "loss": 3.194,
+      "step": 65550
+    },
+    {
+      "epoch": 19.108657655558144,
+      "grad_norm": 0.3733639717102051,
+      "learning_rate": 0.0003709157679976683,
+      "loss": 3.1903,
+      "step": 65600
+    },
+    {
+      "epoch": 19.123223024935914,
+      "grad_norm": 0.37872931361198425,
+      "learning_rate": 0.00037074089186825995,
+      "loss": 3.1894,
+      "step": 65650
+    },
+    {
+      "epoch": 19.13778839431368,
+      "grad_norm": 0.3800518810749054,
+      "learning_rate": 0.0003705660157388516,
+      "loss": 3.2034,
+      "step": 65700
+    },
+    {
+      "epoch": 19.152353763691448,
+      "grad_norm": 0.38501694798469543,
+      "learning_rate": 0.0003703911396094433,
+      "loss": 3.2032,
+      "step": 65750
+    },
+    {
+      "epoch": 19.166919133069214,
+      "grad_norm": 0.3732318580150604,
+      "learning_rate": 0.0003702162634800349,
+      "loss": 3.199,
+      "step": 65800
+    },
+    {
+      "epoch": 19.18148450244698,
+      "grad_norm": 0.39963558316230774,
+      "learning_rate": 0.0003700413873506266,
+      "loss": 3.1866,
+      "step": 65850
+    },
+    {
+      "epoch": 19.196049871824748,
+      "grad_norm": 0.39880403876304626,
+      "learning_rate": 0.00036986651122121824,
+      "loss": 3.2026,
+      "step": 65900
+    },
+    {
+      "epoch": 19.210615241202518,
+      "grad_norm": 0.3877936601638794,
+      "learning_rate": 0.00036969163509181,
+      "loss": 3.1999,
+      "step": 65950
+    },
+    {
+      "epoch": 19.225180610580285,
+      "grad_norm": 0.40352022647857666,
+      "learning_rate": 0.0003695167589624016,
+      "loss": 3.1971,
+      "step": 66000
+    },
+    {
+      "epoch": 19.225180610580285,
+      "eval_accuracy": 0.37293699029873867,
+      "eval_loss": 3.546077013015747,
+      "eval_runtime": 180.0981,
+      "eval_samples_per_second": 92.422,
+      "eval_steps_per_second": 5.78,
+      "step": 66000
+    },
+    {
+      "epoch": 19.23974597995805,
+      "grad_norm": 0.39953070878982544,
+      "learning_rate": 0.00036934188283299326,
+      "loss": 3.2065,
+      "step": 66050
+    },
+    {
+      "epoch": 19.25431134933582,
+      "grad_norm": 0.4177154004573822,
+      "learning_rate": 0.00036916700670358495,
+      "loss": 3.2163,
+      "step": 66100
+    },
+    {
+      "epoch": 19.268876718713585,
+      "grad_norm": 0.3941349387168884,
+      "learning_rate": 0.0003689921305741766,
+      "loss": 3.2171,
+      "step": 66150
+    },
+    {
+      "epoch": 19.283442088091356,
+      "grad_norm": 0.38486939668655396,
+      "learning_rate": 0.0003688172544447683,
+      "loss": 3.2216,
+      "step": 66200
+    },
+    {
+      "epoch": 19.298007457469122,
+      "grad_norm": 0.40340861678123474,
+      "learning_rate": 0.0003686423783153599,
+      "loss": 3.209,
+      "step": 66250
+    },
+    {
+      "epoch": 19.31257282684689,
+      "grad_norm": 0.4014577567577362,
+      "learning_rate": 0.00036846750218595155,
+      "loss": 3.2142,
+      "step": 66300
+    },
+    {
+      "epoch": 19.327138196224656,
+      "grad_norm": 0.37454017996788025,
+      "learning_rate": 0.00036829262605654324,
+      "loss": 3.2138,
+      "step": 66350
+    },
+    {
+      "epoch": 19.341703565602423,
+      "grad_norm": 0.40584367513656616,
+      "learning_rate": 0.0003681177499271349,
+      "loss": 3.2271,
+      "step": 66400
+    },
+    {
+      "epoch": 19.356268934980193,
+      "grad_norm": 0.46146562695503235,
+      "learning_rate": 0.0003679428737977266,
+      "loss": 3.224,
+      "step": 66450
+    },
+    {
+      "epoch": 19.37083430435796,
+      "grad_norm": 0.4199482798576355,
+      "learning_rate": 0.00036776799766831826,
+      "loss": 3.2247,
+      "step": 66500
+    },
+    {
+      "epoch": 19.385399673735726,
+      "grad_norm": 0.4005260467529297,
+      "learning_rate": 0.0003675931215389099,
+      "loss": 3.2315,
+      "step": 66550
+    },
+    {
+      "epoch": 19.399965043113493,
+      "grad_norm": 0.3830159902572632,
+      "learning_rate": 0.0003674182454095016,
+      "loss": 3.2393,
+      "step": 66600
+    },
+    {
+      "epoch": 19.41453041249126,
+      "grad_norm": 0.3859393298625946,
+      "learning_rate": 0.0003672433692800932,
+      "loss": 3.224,
+      "step": 66650
+    },
+    {
+      "epoch": 19.429095781869027,
+      "grad_norm": 0.3998680114746094,
+      "learning_rate": 0.0003670684931506849,
+      "loss": 3.2305,
+      "step": 66700
+    },
+    {
+      "epoch": 19.443661151246797,
+      "grad_norm": 0.4194350838661194,
+      "learning_rate": 0.00036689361702127655,
+      "loss": 3.2375,
+      "step": 66750
+    },
+    {
+      "epoch": 19.458226520624564,
+      "grad_norm": 0.41730183362960815,
+      "learning_rate": 0.00036671874089186824,
+      "loss": 3.2294,
+      "step": 66800
+    },
+    {
+      "epoch": 19.47279189000233,
+      "grad_norm": 0.4091123640537262,
+      "learning_rate": 0.00036654386476245987,
+      "loss": 3.2356,
+      "step": 66850
+    },
+    {
+      "epoch": 19.487357259380097,
+      "grad_norm": 0.37244871258735657,
+      "learning_rate": 0.0003663689886330515,
+      "loss": 3.2232,
+      "step": 66900
+    },
+    {
+      "epoch": 19.501922628757864,
+      "grad_norm": 0.38801804184913635,
+      "learning_rate": 0.00036619411250364325,
+      "loss": 3.2317,
+      "step": 66950
+    },
+    {
+      "epoch": 19.516487998135634,
+      "grad_norm": 0.3837634027004242,
+      "learning_rate": 0.0003660192363742349,
+      "loss": 3.233,
+      "step": 67000
+    },
+    {
+      "epoch": 19.516487998135634,
+      "eval_accuracy": 0.3734149110085773,
+      "eval_loss": 3.5416438579559326,
+      "eval_runtime": 180.1033,
+      "eval_samples_per_second": 92.419,
+      "eval_steps_per_second": 5.78,
+      "step": 67000
+    },
+    {
+      "epoch": 19.5310533675134,
+      "grad_norm": 0.38826125860214233,
+      "learning_rate": 0.0003658443602448266,
+      "loss": 3.2486,
+      "step": 67050
+    },
+    {
+      "epoch": 19.545618736891168,
+      "grad_norm": 0.37006881833076477,
+      "learning_rate": 0.0003656694841154182,
+      "loss": 3.2385,
+      "step": 67100
+    },
+    {
+      "epoch": 19.560184106268935,
+      "grad_norm": 0.3804363012313843,
+      "learning_rate": 0.00036549460798600985,
+      "loss": 3.2371,
+      "step": 67150
+    },
+    {
+      "epoch": 19.5747494756467,
+      "grad_norm": 0.3897872567176819,
+      "learning_rate": 0.00036531973185660154,
+      "loss": 3.24,
+      "step": 67200
+    },
+    {
+      "epoch": 19.589314845024468,
+      "grad_norm": 0.360208123922348,
+      "learning_rate": 0.0003651448557271932,
+      "loss": 3.2373,
+      "step": 67250
+    },
+    {
+      "epoch": 19.60388021440224,
+      "grad_norm": 0.3896685242652893,
+      "learning_rate": 0.00036496997959778487,
+      "loss": 3.2419,
+      "step": 67300
+    },
+    {
+      "epoch": 19.618445583780005,
+      "grad_norm": 0.3791717290878296,
+      "learning_rate": 0.0003647951034683765,
+      "loss": 3.2349,
+      "step": 67350
+    },
+    {
+      "epoch": 19.633010953157772,
+      "grad_norm": 0.40495508909225464,
+      "learning_rate": 0.00036462022733896825,
+      "loss": 3.25,
+      "step": 67400
+    },
+    {
+      "epoch": 19.64757632253554,
+      "grad_norm": 0.3995152711868286,
+      "learning_rate": 0.0003644453512095599,
+      "loss": 3.2518,
+      "step": 67450
+    },
+    {
+      "epoch": 19.662141691913305,
+      "grad_norm": 0.379250168800354,
+      "learning_rate": 0.0003642704750801515,
+      "loss": 3.2473,
+      "step": 67500
+    },
+    {
+      "epoch": 19.676707061291076,
+      "grad_norm": 0.37855687737464905,
+      "learning_rate": 0.0003640955989507432,
+      "loss": 3.2479,
+      "step": 67550
+    },
+    {
+      "epoch": 19.691272430668842,
+      "grad_norm": 0.37955552339553833,
+      "learning_rate": 0.00036392072282133485,
+      "loss": 3.2494,
+      "step": 67600
+    },
+    {
+      "epoch": 19.70583780004661,
+      "grad_norm": 0.4204575717449188,
+      "learning_rate": 0.00036374584669192654,
+      "loss": 3.2508,
+      "step": 67650
+    },
+    {
+      "epoch": 19.720403169424376,
+      "grad_norm": 0.3943386971950531,
+      "learning_rate": 0.0003635709705625182,
+      "loss": 3.2509,
+      "step": 67700
+    },
+    {
+      "epoch": 19.734968538802143,
+      "grad_norm": 0.41143128275871277,
+      "learning_rate": 0.0003633960944331098,
+      "loss": 3.2468,
+      "step": 67750
+    },
+    {
+      "epoch": 19.749533908179913,
+      "grad_norm": 0.4092387855052948,
+      "learning_rate": 0.0003632212183037015,
+      "loss": 3.2534,
+      "step": 67800
+    },
+    {
+      "epoch": 19.76409927755768,
+      "grad_norm": 0.42127180099487305,
+      "learning_rate": 0.00036304634217429314,
+      "loss": 3.2494,
+      "step": 67850
+    },
+    {
+      "epoch": 19.778664646935447,
+      "grad_norm": 0.3770343065261841,
+      "learning_rate": 0.0003628714660448849,
+      "loss": 3.2586,
+      "step": 67900
+    },
+    {
+      "epoch": 19.793230016313213,
+      "grad_norm": 0.41852423548698425,
+      "learning_rate": 0.0003626965899154765,
+      "loss": 3.2537,
+      "step": 67950
+    },
+    {
+      "epoch": 19.80779538569098,
+      "grad_norm": 0.4298582375049591,
+      "learning_rate": 0.0003625217137860682,
+      "loss": 3.2554,
+      "step": 68000
+    },
+    {
+      "epoch": 19.80779538569098,
+      "eval_accuracy": 0.37402227592543497,
+      "eval_loss": 3.5344889163970947,
+      "eval_runtime": 180.1298,
+      "eval_samples_per_second": 92.406,
+      "eval_steps_per_second": 5.779,
+      "step": 68000
+    },
+    {
+      "epoch": 19.822360755068747,
+      "grad_norm": 0.3977230191230774,
+      "learning_rate": 0.00036234683765665985,
+      "loss": 3.2665,
+      "step": 68050
+    },
+    {
+      "epoch": 19.836926124446517,
+      "grad_norm": 0.38390326499938965,
+      "learning_rate": 0.0003621719615272515,
+      "loss": 3.2562,
+      "step": 68100
+    },
+    {
+      "epoch": 19.851491493824284,
+      "grad_norm": 0.3879960775375366,
+      "learning_rate": 0.00036199708539784317,
+      "loss": 3.2422,
+      "step": 68150
+    },
+    {
+      "epoch": 19.86605686320205,
+      "grad_norm": 0.4085167646408081,
+      "learning_rate": 0.0003618222092684348,
+      "loss": 3.2567,
+      "step": 68200
+    },
+    {
+      "epoch": 19.880622232579817,
+      "grad_norm": 0.4254435896873474,
+      "learning_rate": 0.0003616473331390265,
+      "loss": 3.2562,
+      "step": 68250
+    },
+    {
+      "epoch": 19.895187601957584,
+      "grad_norm": 0.38896510004997253,
+      "learning_rate": 0.00036147245700961813,
+      "loss": 3.2491,
+      "step": 68300
+    },
+    {
+      "epoch": 19.909752971335354,
+      "grad_norm": 0.3670238256454468,
+      "learning_rate": 0.00036129758088020977,
+      "loss": 3.2599,
+      "step": 68350
+    },
+    {
+      "epoch": 19.92431834071312,
+      "grad_norm": 0.38788822293281555,
+      "learning_rate": 0.0003611227047508015,
+      "loss": 3.2658,
+      "step": 68400
+    },
+    {
+      "epoch": 19.938883710090888,
+      "grad_norm": 0.3823579251766205,
+      "learning_rate": 0.00036094782862139315,
+      "loss": 3.2518,
+      "step": 68450
+    },
+    {
+      "epoch": 19.953449079468655,
+      "grad_norm": 0.419264018535614,
+      "learning_rate": 0.00036077295249198484,
+      "loss": 3.265,
+      "step": 68500
+    },
+    {
+      "epoch": 19.96801444884642,
+      "grad_norm": 0.41140833497047424,
+      "learning_rate": 0.0003605980763625765,
+      "loss": 3.2592,
+      "step": 68550
+    },
+    {
+      "epoch": 19.982579818224192,
+      "grad_norm": 0.4099409878253937,
+      "learning_rate": 0.0003604232002331681,
+      "loss": 3.2693,
+      "step": 68600
+    },
+    {
+      "epoch": 19.99714518760196,
+      "grad_norm": 0.37093162536621094,
+      "learning_rate": 0.0003602483241037598,
+      "loss": 3.2544,
+      "step": 68650
+    },
+    {
+      "epoch": 20.011652295502213,
+      "grad_norm": 0.42578721046447754,
+      "learning_rate": 0.00036007344797435144,
+      "loss": 3.1794,
+      "step": 68700
+    },
+    {
+      "epoch": 20.026217664879983,
+      "grad_norm": 0.3819316625595093,
+      "learning_rate": 0.00035989857184494313,
+      "loss": 3.1559,
+      "step": 68750
+    },
+    {
+      "epoch": 20.04078303425775,
+      "grad_norm": 0.40451663732528687,
+      "learning_rate": 0.00035972369571553477,
+      "loss": 3.1716,
+      "step": 68800
+    },
+    {
+      "epoch": 20.055348403635517,
+      "grad_norm": 0.39993008971214294,
+      "learning_rate": 0.0003595488195861265,
+      "loss": 3.1791,
+      "step": 68850
+    },
+    {
+      "epoch": 20.069913773013283,
+      "grad_norm": 0.38017529249191284,
+      "learning_rate": 0.00035937394345671815,
+      "loss": 3.1703,
+      "step": 68900
+    },
+    {
+      "epoch": 20.08447914239105,
+      "grad_norm": 0.3801606297492981,
+      "learning_rate": 0.0003591990673273098,
+      "loss": 3.167,
+      "step": 68950
+    },
+    {
+      "epoch": 20.099044511768817,
+      "grad_norm": 0.4289296865463257,
+      "learning_rate": 0.0003590241911979015,
+      "loss": 3.1718,
+      "step": 69000
+    },
+    {
+      "epoch": 20.099044511768817,
+      "eval_accuracy": 0.37306749263279054,
+      "eval_loss": 3.5509676933288574,
+      "eval_runtime": 180.195,
+      "eval_samples_per_second": 92.372,
+      "eval_steps_per_second": 5.777,
+      "step": 69000
+    },
+    {
+      "epoch": 20.113609881146587,
+      "grad_norm": 0.4497009217739105,
+      "learning_rate": 0.0003588493150684931,
+      "loss": 3.1754,
+      "step": 69050
+    },
+    {
+      "epoch": 20.128175250524354,
+      "grad_norm": 0.39954957365989685,
+      "learning_rate": 0.0003586744389390848,
+      "loss": 3.1792,
+      "step": 69100
+    },
+    {
+      "epoch": 20.14274061990212,
+      "grad_norm": 0.4464070200920105,
+      "learning_rate": 0.00035849956280967644,
+      "loss": 3.1865,
+      "step": 69150
+    },
+    {
+      "epoch": 20.157305989279887,
+      "grad_norm": 0.42317837476730347,
+      "learning_rate": 0.0003583246866802681,
+      "loss": 3.1917,
+      "step": 69200
+    },
+    {
+      "epoch": 20.171871358657654,
+      "grad_norm": 0.3977357745170593,
+      "learning_rate": 0.00035814981055085976,
+      "loss": 3.1804,
+      "step": 69250
+    },
+    {
+      "epoch": 20.186436728035424,
+      "grad_norm": 0.39570218324661255,
+      "learning_rate": 0.0003579749344214514,
+      "loss": 3.2007,
+      "step": 69300
+    },
+    {
+      "epoch": 20.20100209741319,
+      "grad_norm": 0.3908149302005768,
+      "learning_rate": 0.00035780005829204315,
+      "loss": 3.1905,
+      "step": 69350
+    },
+    {
+      "epoch": 20.215567466790958,
+      "grad_norm": 0.41018813848495483,
+      "learning_rate": 0.0003576251821626348,
+      "loss": 3.1864,
+      "step": 69400
+    },
+    {
+      "epoch": 20.230132836168725,
+      "grad_norm": 0.42226818203926086,
+      "learning_rate": 0.00035745030603322647,
+      "loss": 3.2023,
+      "step": 69450
+    },
+    {
+      "epoch": 20.24469820554649,
+      "grad_norm": 0.40277135372161865,
+      "learning_rate": 0.0003572754299038181,
+      "loss": 3.1942,
+      "step": 69500
+    },
+    {
+      "epoch": 20.25926357492426,
+      "grad_norm": 0.4245125353336334,
+      "learning_rate": 0.00035710055377440974,
+      "loss": 3.1983,
+      "step": 69550
+    },
+    {
+      "epoch": 20.27382894430203,
+      "grad_norm": 0.3877376914024353,
+      "learning_rate": 0.00035692567764500143,
+      "loss": 3.1959,
+      "step": 69600
+    },
+    {
+      "epoch": 20.288394313679795,
+      "grad_norm": 0.40770015120506287,
+      "learning_rate": 0.00035675080151559307,
+      "loss": 3.2,
+      "step": 69650
+    },
+    {
+      "epoch": 20.302959683057562,
+      "grad_norm": 0.3683469891548157,
+      "learning_rate": 0.00035657592538618476,
+      "loss": 3.2096,
+      "step": 69700
+    },
+    {
+      "epoch": 20.31752505243533,
+      "grad_norm": 0.3850116431713104,
+      "learning_rate": 0.0003564010492567764,
+      "loss": 3.2068,
+      "step": 69750
+    },
+    {
+      "epoch": 20.332090421813096,
+      "grad_norm": 0.39738941192626953,
+      "learning_rate": 0.00035622617312736803,
+      "loss": 3.2022,
+      "step": 69800
+    },
+    {
+      "epoch": 20.346655791190866,
+      "grad_norm": 0.39597588777542114,
+      "learning_rate": 0.0003560512969979598,
+      "loss": 3.2073,
+      "step": 69850
+    },
+    {
+      "epoch": 20.361221160568633,
+      "grad_norm": 0.38561490178108215,
+      "learning_rate": 0.0003558764208685514,
+      "loss": 3.21,
+      "step": 69900
+    },
+    {
+      "epoch": 20.3757865299464,
+      "grad_norm": 0.39177340269088745,
+      "learning_rate": 0.0003557015447391431,
+      "loss": 3.2072,
+      "step": 69950
+    },
+    {
+      "epoch": 20.390351899324166,
+      "grad_norm": 0.37735575437545776,
+      "learning_rate": 0.00035552666860973474,
+      "loss": 3.2171,
+      "step": 70000
+    },
+    {
+      "epoch": 20.390351899324166,
+      "eval_accuracy": 0.3732674786419998,
+      "eval_loss": 3.5471997261047363,
+      "eval_runtime": 180.1445,
+      "eval_samples_per_second": 92.398,
+      "eval_steps_per_second": 5.779,
+      "step": 70000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171650,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 5
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.46313608822784e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}