diff --git "a/last_to_hit_frequency_2128/checkpoint-70000/trainer_state.json" "b/last_to_hit_frequency_2128/checkpoint-70000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last_to_hit_frequency_2128/checkpoint-70000/trainer_state.json"
@@ -0,0 +1,10473 @@
+{
+  "best_global_step": 65000,
+  "best_metric": 3.5359609127044678,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_2128/checkpoint-30000",
+  "epoch": 20.390351899324166,
+  "eval_steps": 1000,
+  "global_step": 70000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 0.8308652639389038,
+      "learning_rate": 0.000294,
+      "loss": 8.4387,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.6382876038551331,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7184,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.4452700912952423,
+      "learning_rate": 0.0005998286213931798,
+      "loss": 6.3602,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.4627819061279297,
+      "learning_rate": 0.0005996537452637714,
+      "loss": 6.1529,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.4908621311187744,
+      "learning_rate": 0.0005994788691343632,
+      "loss": 5.9753,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.46045973896980286,
+      "learning_rate": 0.0005993039930049548,
+      "loss": 5.8546,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.4518718123435974,
+      "learning_rate": 0.0005991291168755465,
+      "loss": 5.7234,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.41379204392433167,
+      "learning_rate": 0.0005989542407461382,
+      "loss": 5.6087,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.42392534017562866,
+      "learning_rate": 0.0005987793646167297,
+      "loss": 5.5138,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.44991201162338257,
+      "learning_rate": 0.0005986044884873214,
+      "loss": 5.4074,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.4449133574962616,
+      "learning_rate": 0.0005984296123579131,
+      "loss": 5.3554,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.43911343812942505,
+      "learning_rate": 0.0005982547362285047,
+      "loss": 5.2571,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.44633105397224426,
+      "learning_rate": 0.0005980798600990964,
+      "loss": 5.194,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.5142127275466919,
+      "learning_rate": 0.0005979049839696881,
+      "loss": 5.1364,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.42377275228500366,
+      "learning_rate": 0.0005977301078402798,
+      "loss": 5.0645,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.49829915165901184,
+      "learning_rate": 0.0005975552317108715,
+      "loss": 5.0326,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4879177212715149,
+      "learning_rate": 0.0005973803555814631,
+      "loss": 4.9787,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.40835854411125183,
+      "learning_rate": 0.0005972054794520547,
+      "loss": 4.9266,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.40832623839378357,
+      "learning_rate": 0.0005970306033226464,
+      "loss": 4.8749,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.41372886300086975,
+      "learning_rate": 0.0005968557271932381,
+      "loss": 4.8229,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.2537010050443267,
+      "eval_loss": 4.7555251121521,
+      "eval_runtime": 182.9052,
+      "eval_samples_per_second": 91.003,
+      "eval_steps_per_second": 5.691,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.4627732038497925,
+      "learning_rate": 0.0005966808510638297,
+      "loss": 4.7732,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.5072389245033264,
+      "learning_rate": 0.0005965059749344214,
+      "loss": 4.7445,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.4509411156177521,
+      "learning_rate": 0.0005963310988050131,
+      "loss": 4.7006,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.4881671965122223,
+      "learning_rate": 0.0005961562226756047,
+      "loss": 4.6717,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.43225303292274475,
+      "learning_rate": 0.0005959813465461965,
+      "loss": 4.6353,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.40277963876724243,
+      "learning_rate": 0.000595806470416788,
+      "loss": 4.6007,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.5126720070838928,
+      "learning_rate": 0.0005956315942873797,
+      "loss": 4.5957,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.4341893196105957,
+      "learning_rate": 0.0005954567181579714,
+      "loss": 4.5667,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.466202974319458,
+      "learning_rate": 0.000595281842028563,
+      "loss": 4.5251,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.47109341621398926,
+      "learning_rate": 0.0005951069658991547,
+      "loss": 4.504,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.39960241317749023,
+      "learning_rate": 0.0005949320897697464,
+      "loss": 4.4896,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.3836056590080261,
+      "learning_rate": 0.0005947572136403381,
+      "loss": 4.465,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.4436993896961212,
+      "learning_rate": 0.0005945823375109297,
+      "loss": 4.4483,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.49260610342025757,
+      "learning_rate": 0.0005944074613815215,
+      "loss": 4.4329,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.40900883078575134,
+      "learning_rate": 0.000594232585252113,
+      "loss": 4.4216,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.38395267724990845,
+      "learning_rate": 0.0005940577091227047,
+      "loss": 4.3901,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.4002784490585327,
+      "learning_rate": 0.0005938828329932964,
+      "loss": 4.3719,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.39246320724487305,
+      "learning_rate": 0.000593707956863888,
+      "loss": 4.3556,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.429328054189682,
+      "learning_rate": 0.0005935330807344797,
+      "loss": 4.3495,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.37213221192359924,
+      "learning_rate": 0.0005933582046050714,
+      "loss": 4.3272,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.29963923746663224,
+      "eval_loss": 4.282708644866943,
+      "eval_runtime": 182.1675,
+      "eval_samples_per_second": 91.372,
+      "eval_steps_per_second": 5.715,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.4040358066558838,
+      "learning_rate": 0.000593183328475663,
+      "loss": 4.3313,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.4094908535480499,
+      "learning_rate": 0.0005930084523462546,
+      "loss": 4.3198,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.39454561471939087,
+      "learning_rate": 0.0005928335762168463,
+      "loss": 4.3,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.3934589624404907,
+      "learning_rate": 0.000592658700087438,
+      "loss": 4.3025,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.386088103055954,
+      "learning_rate": 0.0005924838239580297,
+      "loss": 4.2816,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.4144304096698761,
+      "learning_rate": 0.0005923089478286214,
+      "loss": 4.2738,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.4177938401699066,
+      "learning_rate": 0.000592134071699213,
+      "loss": 4.2481,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.36143994331359863,
+      "learning_rate": 0.0005919591955698047,
+      "loss": 4.2475,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.3758913278579712,
+      "learning_rate": 0.0005917843194403964,
+      "loss": 4.233,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.3990520238876343,
+      "learning_rate": 0.000591609443310988,
+      "loss": 4.2258,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.341074138879776,
+      "learning_rate": 0.0005914345671815796,
+      "loss": 4.2438,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.3899039924144745,
+      "learning_rate": 0.0005912596910521713,
+      "loss": 4.2077,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.3767816424369812,
+      "learning_rate": 0.0005910848149227629,
+      "loss": 4.1915,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3586917221546173,
+      "learning_rate": 0.0005909099387933547,
+      "loss": 4.192,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.3622867465019226,
+      "learning_rate": 0.0005907350626639463,
+      "loss": 4.1811,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.34509238600730896,
+      "learning_rate": 0.000590560186534538,
+      "loss": 4.1775,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.3676803708076477,
+      "learning_rate": 0.0005903853104051297,
+      "loss": 4.1742,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.34232082962989807,
+      "learning_rate": 0.0005902104342757214,
+      "loss": 4.1572,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.3571970760822296,
+      "learning_rate": 0.000590035558146313,
+      "loss": 4.1584,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.35053205490112305,
+      "learning_rate": 0.0005898606820169046,
+      "loss": 4.142,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.31524614092253395,
+      "eval_loss": 4.096119403839111,
+      "eval_runtime": 182.324,
+      "eval_samples_per_second": 91.294,
+      "eval_steps_per_second": 5.71,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.4020876884460449,
+      "learning_rate": 0.0005896858058874963,
+      "loss": 4.1395,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.3728208839893341,
+      "learning_rate": 0.0005895109297580879,
+      "loss": 4.14,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.3634420335292816,
+      "learning_rate": 0.0005893360536286797,
+      "loss": 4.1265,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.3639736473560333,
+      "learning_rate": 0.0005891611774992713,
+      "loss": 4.1264,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.3608086407184601,
+      "learning_rate": 0.000588986301369863,
+      "loss": 4.1242,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.37832555174827576,
+      "learning_rate": 0.0005888114252404547,
+      "loss": 4.1139,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.3500097990036011,
+      "learning_rate": 0.0005886365491110463,
+      "loss": 4.0998,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.34508016705513,
+      "learning_rate": 0.000588461672981638,
+      "loss": 4.0876,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.346635103225708,
+      "learning_rate": 0.0005882867968522296,
+      "loss": 4.0698,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.351591020822525,
+      "learning_rate": 0.0005881119207228212,
+      "loss": 4.0152,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.342751145362854,
+      "learning_rate": 0.0005879370445934129,
+      "loss": 4.0189,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.34400904178619385,
+      "learning_rate": 0.0005877621684640046,
+      "loss": 4.0082,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3556966483592987,
+      "learning_rate": 0.0005875872923345963,
+      "loss": 4.0164,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.3546448349952698,
+      "learning_rate": 0.000587412416205188,
+      "loss": 4.0009,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.37118959426879883,
+      "learning_rate": 0.0005872375400757797,
+      "loss": 4.0167,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.3513905704021454,
+      "learning_rate": 0.0005870626639463713,
+      "loss": 4.0167,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.3534930348396301,
+      "learning_rate": 0.0005868877878169629,
+      "loss": 3.9929,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.3811454176902771,
+      "learning_rate": 0.0005867129116875546,
+      "loss": 3.9999,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.34738337993621826,
+      "learning_rate": 0.0005865380355581462,
+      "loss": 4.0029,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.3603108525276184,
+      "learning_rate": 0.0005863631594287379,
+      "loss": 3.9925,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.32528376909551887,
+      "eval_loss": 3.9905245304107666,
+      "eval_runtime": 182.0264,
+      "eval_samples_per_second": 91.443,
+      "eval_steps_per_second": 5.719,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.3429189622402191,
+      "learning_rate": 0.0005861882832993296,
+      "loss": 3.9933,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.38075006008148193,
+      "learning_rate": 0.0005860134071699212,
+      "loss": 3.9855,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.3338114321231842,
+      "learning_rate": 0.000585838531040513,
+      "loss": 3.9794,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.36873266100883484,
+      "learning_rate": 0.0005856636549111046,
+      "loss": 3.9787,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.32824212312698364,
+      "learning_rate": 0.0005854887787816963,
+      "loss": 3.9766,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.3516254723072052,
+      "learning_rate": 0.0005853139026522879,
+      "loss": 3.9651,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.32511937618255615,
+      "learning_rate": 0.0005851390265228796,
+      "loss": 3.9725,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.32222479581832886,
+      "learning_rate": 0.0005849641503934712,
+      "loss": 3.9755,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.3308519423007965,
+      "learning_rate": 0.0005847892742640629,
+      "loss": 3.9635,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.32887038588523865,
+      "learning_rate": 0.0005846143981346546,
+      "loss": 3.9673,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.33978450298309326,
+      "learning_rate": 0.0005844395220052462,
+      "loss": 3.9637,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.3525462746620178,
+      "learning_rate": 0.000584264645875838,
+      "loss": 3.9552,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.3444075882434845,
+      "learning_rate": 0.0005840897697464296,
+      "loss": 3.953,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.34169191122055054,
+      "learning_rate": 0.0005839148936170212,
+      "loss": 3.9486,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.3524395227432251,
+      "learning_rate": 0.0005837400174876129,
+      "loss": 3.941,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.3321269154548645,
+      "learning_rate": 0.0005835651413582045,
+      "loss": 3.9375,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.3419478237628937,
+      "learning_rate": 0.0005833902652287962,
+      "loss": 3.9489,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.33724281191825867,
+      "learning_rate": 0.0005832153890993879,
+      "loss": 3.939,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.32965749502182007,
+      "learning_rate": 0.0005830405129699796,
+      "loss": 3.9303,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.32706567645072937,
+      "learning_rate": 0.0005828656368405712,
+      "loss": 3.9195,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.33166874275109504,
+      "eval_loss": 3.9142563343048096,
+      "eval_runtime": 182.2224,
+      "eval_samples_per_second": 91.344,
+      "eval_steps_per_second": 5.713,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.33142364025115967,
+      "learning_rate": 0.0005826907607111629,
+      "loss": 3.9196,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.3274194896221161,
+      "learning_rate": 0.0005825158845817546,
+      "loss": 3.9284,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.35101616382598877,
+      "learning_rate": 0.0005823410084523462,
+      "loss": 3.9313,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.3553934395313263,
+      "learning_rate": 0.0005821661323229379,
+      "loss": 3.9222,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.32745224237442017,
+      "learning_rate": 0.0005819912561935295,
+      "loss": 3.9211,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.32173994183540344,
+      "learning_rate": 0.0005818163800641212,
+      "loss": 3.9237,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.3147367835044861,
+      "learning_rate": 0.0005816415039347129,
+      "loss": 3.9025,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.3226154148578644,
+      "learning_rate": 0.0005814666278053045,
+      "loss": 3.9197,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.3392418324947357,
+      "learning_rate": 0.0005812917516758962,
+      "loss": 3.9199,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.3240615427494049,
+      "learning_rate": 0.0005811168755464879,
+      "loss": 3.9066,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.3571517765522003,
+      "learning_rate": 0.0005809419994170794,
+      "loss": 3.9046,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.3363195061683655,
+      "learning_rate": 0.0005807671232876712,
+      "loss": 3.895,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.35087713599205017,
+      "learning_rate": 0.0005805922471582628,
+      "loss": 3.895,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.3502371907234192,
+      "learning_rate": 0.0005804173710288545,
+      "loss": 3.8907,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.37237074971199036,
+      "learning_rate": 0.0005802424948994462,
+      "loss": 3.8942,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.3460238575935364,
+      "learning_rate": 0.0005800676187700379,
+      "loss": 3.8973,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.3247397243976593,
+      "learning_rate": 0.0005798927426406295,
+      "loss": 3.8874,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.37104010581970215,
+      "learning_rate": 0.0005797178665112212,
+      "loss": 3.8913,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.32479041814804077,
+      "learning_rate": 0.0005795429903818129,
+      "loss": 3.8934,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.34869810938835144,
+      "learning_rate": 0.0005793681142524044,
+      "loss": 3.8842,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.3369487966450319,
+      "eval_loss": 3.8599915504455566,
+      "eval_runtime": 182.1572,
+      "eval_samples_per_second": 91.377,
+      "eval_steps_per_second": 5.715,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.32061490416526794,
+      "learning_rate": 0.0005791932381229961,
+      "loss": 3.8863,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.3404031991958618,
+      "learning_rate": 0.0005790183619935878,
+      "loss": 3.882,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.30672210454940796,
+      "learning_rate": 0.0005788434858641795,
+      "loss": 3.8718,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.3539854884147644,
+      "learning_rate": 0.0005786686097347712,
+      "loss": 3.8862,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.33336907625198364,
+      "learning_rate": 0.0005784937336053628,
+      "loss": 3.8618,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.3471635580062866,
+      "learning_rate": 0.0005783188574759545,
+      "loss": 3.8604,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.32666853070259094,
+      "learning_rate": 0.0005781439813465462,
+      "loss": 3.8604,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.3473672866821289,
+      "learning_rate": 0.0005779691052171379,
+      "loss": 3.8667,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.3199038803577423,
+      "learning_rate": 0.0005777942290877294,
+      "loss": 3.8611,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.32697010040283203,
+      "learning_rate": 0.0005776193529583211,
+      "loss": 3.858,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.37832486629486084,
+      "learning_rate": 0.0005774444768289128,
+      "loss": 3.8601,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.3379972279071808,
+      "learning_rate": 0.0005772696006995045,
+      "loss": 3.856,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.3384763300418854,
+      "learning_rate": 0.0005770947245700962,
+      "loss": 3.848,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.3093826472759247,
+      "learning_rate": 0.0005769198484406878,
+      "loss": 3.8563,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.32603582739830017,
+      "learning_rate": 0.0005767449723112795,
+      "loss": 3.8538,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.36787310242652893,
+      "learning_rate": 0.0005765700961818712,
+      "loss": 3.8543,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.32396772503852844,
+      "learning_rate": 0.0005763952200524627,
+      "loss": 3.8532,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.3264414072036743,
+      "learning_rate": 0.0005762203439230544,
+      "loss": 3.7844,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.3082588315010071,
+      "learning_rate": 0.0005760454677936461,
+      "loss": 3.722,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.3626100718975067,
+      "learning_rate": 0.0005758705916642378,
+      "loss": 3.7443,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34116120036282,
+      "eval_loss": 3.816786766052246,
+      "eval_runtime": 182.2244,
+      "eval_samples_per_second": 91.343,
+      "eval_steps_per_second": 5.713,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.33928582072257996,
+      "learning_rate": 0.0005756957155348294,
+      "loss": 3.7456,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.33408093452453613,
+      "learning_rate": 0.0005755208394054211,
+      "loss": 3.7558,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.35014262795448303,
+      "learning_rate": 0.0005753459632760128,
+      "loss": 3.7546,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.33521801233291626,
+      "learning_rate": 0.0005751710871466045,
+      "loss": 3.7488,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.3408453166484833,
+      "learning_rate": 0.0005749962110171962,
+      "loss": 3.7638,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.32431185245513916,
+      "learning_rate": 0.0005748213348877877,
+      "loss": 3.7519,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.32259050011634827,
+      "learning_rate": 0.0005746464587583794,
+      "loss": 3.7608,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.3296469748020172,
+      "learning_rate": 0.0005744715826289711,
+      "loss": 3.7617,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.3369705379009247,
+      "learning_rate": 0.0005742967064995627,
+      "loss": 3.747,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.335363507270813,
+      "learning_rate": 0.0005741218303701544,
+      "loss": 3.7517,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.3429674208164215,
+      "learning_rate": 0.0005739469542407461,
+      "loss": 3.7613,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.3400017023086548,
+      "learning_rate": 0.0005737720781113378,
+      "loss": 3.7465,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.33040422201156616,
+      "learning_rate": 0.0005735972019819295,
+      "loss": 3.765,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.325589656829834,
+      "learning_rate": 0.000573422325852521,
+      "loss": 3.7555,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.31000620126724243,
+      "learning_rate": 0.0005732474497231127,
+      "loss": 3.7614,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.3232748806476593,
+      "learning_rate": 0.0005730725735937044,
+      "loss": 3.7535,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.3055737018585205,
+      "learning_rate": 0.0005728976974642961,
+      "loss": 3.7598,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.32002055644989014,
+      "learning_rate": 0.0005727228213348877,
+      "loss": 3.7501,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.3241938054561615,
+      "learning_rate": 0.0005725479452054794,
+      "loss": 3.7547,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.3343994915485382,
+      "learning_rate": 0.0005723730690760711,
+      "loss": 3.7503,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.3443282921418196,
+      "eval_loss": 3.7861814498901367,
+      "eval_runtime": 182.4203,
+      "eval_samples_per_second": 91.245,
+      "eval_steps_per_second": 5.707,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.32609114050865173,
+      "learning_rate": 0.0005721981929466627,
+      "loss": 3.7511,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.3273298144340515,
+      "learning_rate": 0.0005720233168172545,
+      "loss": 3.7491,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.31795287132263184,
+      "learning_rate": 0.000571848440687846,
+      "loss": 3.7475,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.3376888334751129,
+      "learning_rate": 0.0005716735645584377,
+      "loss": 3.7563,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.32242295145988464,
+      "learning_rate": 0.0005714986884290294,
+      "loss": 3.7462,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.31965371966362,
+      "learning_rate": 0.000571323812299621,
+      "loss": 3.7578,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.3355007469654083,
+      "learning_rate": 0.0005711489361702127,
+      "loss": 3.7568,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.32753318548202515,
+      "learning_rate": 0.0005709740600408044,
+      "loss": 3.7353,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.3319459855556488,
+      "learning_rate": 0.0005707991839113961,
+      "loss": 3.743,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.3193652927875519,
+      "learning_rate": 0.0005706243077819877,
+      "loss": 3.7468,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.32112497091293335,
+      "learning_rate": 0.0005704494316525793,
+      "loss": 3.7436,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.3209002614021301,
+      "learning_rate": 0.000570274555523171,
+      "loss": 3.7432,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.3239600956439972,
+      "learning_rate": 0.0005700996793937627,
+      "loss": 3.7495,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.34214961528778076,
+      "learning_rate": 0.0005699248032643544,
+      "loss": 3.7486,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.3178744316101074,
+      "learning_rate": 0.000569749927134946,
+      "loss": 3.7443,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.3296307921409607,
+      "learning_rate": 0.0005695750510055377,
+      "loss": 3.7494,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.33302944898605347,
+      "learning_rate": 0.0005694001748761294,
+      "loss": 3.7445,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.33363667130470276,
+      "learning_rate": 0.000569225298746721,
+      "loss": 3.747,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.33573073148727417,
+      "learning_rate": 0.0005690504226173127,
+      "loss": 3.731,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3124948740005493,
+      "learning_rate": 0.0005688755464879043,
+      "loss": 3.7376,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.3468023107143004,
+      "eval_loss": 3.756934881210327,
+      "eval_runtime": 182.4097,
+      "eval_samples_per_second": 91.251,
+      "eval_steps_per_second": 5.707,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.3117513954639435,
+      "learning_rate": 0.000568700670358496,
+      "loss": 3.7412,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.3153388798236847,
+      "learning_rate": 0.0005685257942290877,
+      "loss": 3.7424,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.31582581996917725,
+      "learning_rate": 0.0005683509180996793,
+      "loss": 3.7352,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.31198346614837646,
+      "learning_rate": 0.000568176041970271,
+      "loss": 3.7397,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.33701658248901367,
+      "learning_rate": 0.0005680011658408627,
+      "loss": 3.7386,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.3240450918674469,
+      "learning_rate": 0.0005678262897114544,
+      "loss": 3.7343,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.31347861886024475,
+      "learning_rate": 0.000567651413582046,
+      "loss": 3.7271,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.33607959747314453,
+      "learning_rate": 0.0005674765374526377,
+      "loss": 3.7378,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.33370694518089294,
+      "learning_rate": 0.0005673016613232293,
+      "loss": 3.7344,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.29530900716781616,
+      "learning_rate": 0.0005671267851938209,
+      "loss": 3.7352,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.3362729251384735,
+      "learning_rate": 0.0005669519090644127,
+      "loss": 3.7499,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.3185634911060333,
+      "learning_rate": 0.0005667770329350043,
+      "loss": 3.727,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.3251460790634155,
+      "learning_rate": 0.000566602156805596,
+      "loss": 3.7348,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.32407787442207336,
+      "learning_rate": 0.0005664272806761877,
+      "loss": 3.7312,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.31047409772872925,
+      "learning_rate": 0.0005662524045467793,
+      "loss": 3.7414,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.3150789439678192,
+      "learning_rate": 0.000566077528417371,
+      "loss": 3.7292,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.32285672426223755,
+      "learning_rate": 0.0005659026522879626,
+      "loss": 3.7278,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.3205214738845825,
+      "learning_rate": 0.0005657277761585543,
+      "loss": 3.7117,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.3268585503101349,
+      "learning_rate": 0.0005655529000291459,
+      "loss": 3.7298,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.3318754732608795,
+      "learning_rate": 0.0005653780238997376,
+      "loss": 3.7244,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.3494827816278579,
+      "eval_loss": 3.7275166511535645,
+      "eval_runtime": 182.3792,
+      "eval_samples_per_second": 91.266,
+      "eval_steps_per_second": 5.708,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.32303711771965027,
+      "learning_rate": 0.0005652031477703293,
+      "loss": 3.7313,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.3396250009536743,
+      "learning_rate": 0.000565028271640921,
+      "loss": 3.728,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.31802433729171753,
+      "learning_rate": 0.0005648533955115127,
+      "loss": 3.7244,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.3270646333694458,
+      "learning_rate": 0.0005646785193821043,
+      "loss": 3.7336,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.3221674859523773,
+      "learning_rate": 0.000564503643252696,
+      "loss": 3.7288,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.35366907715797424,
+      "learning_rate": 0.0005643287671232876,
+      "loss": 3.7151,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.33569812774658203,
+      "learning_rate": 0.0005641538909938792,
+      "loss": 3.6103,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.3463501036167145,
+      "learning_rate": 0.0005639790148644709,
+      "loss": 3.6142,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.3205231726169586,
+      "learning_rate": 0.0005638041387350626,
+      "loss": 3.6093,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.31053611636161804,
+      "learning_rate": 0.0005636292626056543,
+      "loss": 3.6248,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.32655248045921326,
+      "learning_rate": 0.000563454386476246,
+      "loss": 3.6235,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.3263218104839325,
+      "learning_rate": 0.0005632795103468376,
+      "loss": 3.6202,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.32272443175315857,
+      "learning_rate": 0.0005631046342174293,
+      "loss": 3.6317,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.3152412474155426,
+      "learning_rate": 0.000562929758088021,
+      "loss": 3.6225,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.3140038251876831,
+      "learning_rate": 0.0005627548819586126,
+      "loss": 3.6281,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.3572128713130951,
+      "learning_rate": 0.0005625800058292042,
+      "loss": 3.629,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.3352822959423065,
+      "learning_rate": 0.0005624051296997959,
+      "loss": 3.623,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.3205251097679138,
+      "learning_rate": 0.0005622302535703876,
+      "loss": 3.6395,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.3109528720378876,
+      "learning_rate": 0.0005620553774409792,
+      "loss": 3.6292,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.3360290825366974,
+      "learning_rate": 0.000561880501311571,
+      "loss": 3.6353,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.35141351075380384,
+      "eval_loss": 3.7175660133361816,
+      "eval_runtime": 182.3999,
+      "eval_samples_per_second": 91.256,
+      "eval_steps_per_second": 5.707,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.32028627395629883,
+      "learning_rate": 0.0005617056251821626,
+      "loss": 3.6444,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.31713923811912537,
+      "learning_rate": 0.0005615307490527543,
+      "loss": 3.639,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.3299584686756134,
+      "learning_rate": 0.000561355872923346,
+      "loss": 3.6466,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.3305450677871704,
+      "learning_rate": 0.0005611809967939375,
+      "loss": 3.6363,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.3444271385669708,
+      "learning_rate": 0.0005610061206645292,
+      "loss": 3.6381,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.31552445888519287,
+      "learning_rate": 0.0005608312445351209,
+      "loss": 3.6396,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.309539258480072,
+      "learning_rate": 0.0005606563684057126,
+      "loss": 3.6437,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.322343647480011,
+      "learning_rate": 0.0005604814922763042,
+      "loss": 3.6493,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.3462202847003937,
+      "learning_rate": 0.0005603066161468959,
+      "loss": 3.6308,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.3419882357120514,
+      "learning_rate": 0.0005601317400174876,
+      "loss": 3.6382,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.346147358417511,
+      "learning_rate": 0.0005599568638880793,
+      "loss": 3.632,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.32104918360710144,
+      "learning_rate": 0.0005597819877586709,
+      "loss": 3.6416,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.3208399713039398,
+      "learning_rate": 0.0005596071116292625,
+      "loss": 3.6589,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.3355486989021301,
+      "learning_rate": 0.0005594322354998542,
+      "loss": 3.6351,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.329441100358963,
+      "learning_rate": 0.0005592573593704459,
+      "loss": 3.6544,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.331617534160614,
+      "learning_rate": 0.0005590824832410375,
+      "loss": 3.6444,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.35520729422569275,
+      "learning_rate": 0.0005589076071116292,
+      "loss": 3.6517,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.32801005244255066,
+      "learning_rate": 0.0005587327309822209,
+      "loss": 3.6411,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.3370635509490967,
+      "learning_rate": 0.0005585578548528126,
+      "loss": 3.6359,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.31257134675979614,
+      "learning_rate": 0.0005583829787234043,
+      "loss": 3.6428,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.3529537910046269,
+      "eval_loss": 3.6976659297943115,
+      "eval_runtime": 182.5172,
+      "eval_samples_per_second": 91.197,
+      "eval_steps_per_second": 5.704,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.3163146674633026,
+      "learning_rate": 0.0005582081025939958,
+      "loss": 3.6404,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.31714287400245667,
+      "learning_rate": 0.0005580332264645875,
+      "loss": 3.6287,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.31056082248687744,
+      "learning_rate": 0.0005578583503351792,
+      "loss": 3.6548,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.31519201397895813,
+      "learning_rate": 0.0005576834742057709,
+      "loss": 3.6466,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.3620156943798065,
+      "learning_rate": 0.0005575085980763625,
+      "loss": 3.6459,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.3160246014595032,
+      "learning_rate": 0.0005573337219469542,
+      "loss": 3.6571,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.3447693884372711,
+      "learning_rate": 0.0005571588458175459,
+      "loss": 3.639,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.31839776039123535,
+      "learning_rate": 0.0005569839696881374,
+      "loss": 3.6516,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.3184966742992401,
+      "learning_rate": 0.0005568090935587292,
+      "loss": 3.6529,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.3189146816730499,
+      "learning_rate": 0.0005566342174293208,
+      "loss": 3.6469,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.34218892455101013,
+      "learning_rate": 0.0005564593412999125,
+      "loss": 3.6535,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.3211210370063782,
+      "learning_rate": 0.0005562844651705042,
+      "loss": 3.6398,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.31546565890312195,
+      "learning_rate": 0.0005561095890410958,
+      "loss": 3.6409,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.32176557183265686,
+      "learning_rate": 0.0005559347129116875,
+      "loss": 3.6457,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.323650598526001,
+      "learning_rate": 0.0005557598367822792,
+      "loss": 3.6463,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.31617245078086853,
+      "learning_rate": 0.0005555849606528709,
+      "loss": 3.6484,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.3181900084018707,
+      "learning_rate": 0.0005554100845234624,
+      "loss": 3.659,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.3386443257331848,
+      "learning_rate": 0.0005552352083940541,
+      "loss": 3.6516,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.33526474237442017,
+      "learning_rate": 0.0005550603322646458,
+      "loss": 3.6433,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.3211400806903839,
+      "learning_rate": 0.0005548854561352375,
+      "loss": 3.646,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.3546315102000507,
+      "eval_loss": 3.6797258853912354,
+      "eval_runtime": 182.657,
+      "eval_samples_per_second": 91.127,
+      "eval_steps_per_second": 5.699,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.31162765622138977,
+      "learning_rate": 0.0005547105800058292,
+      "loss": 3.6343,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.292121559381485,
+      "learning_rate": 0.0005545357038764208,
+      "loss": 3.6399,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.354305237531662,
+      "learning_rate": 0.0005543608277470125,
+      "loss": 3.6457,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.3242047131061554,
+      "learning_rate": 0.0005541859516176042,
+      "loss": 3.6389,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.3040805757045746,
+      "learning_rate": 0.0005540110754881958,
+      "loss": 3.6281,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.33070269227027893,
+      "learning_rate": 0.0005538361993587874,
+      "loss": 3.6411,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.3205200135707855,
+      "learning_rate": 0.0005536613232293791,
+      "loss": 3.6402,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.3389040231704712,
+      "learning_rate": 0.0005534864470999708,
+      "loss": 3.6405,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.34000879526138306,
+      "learning_rate": 0.0005533115709705625,
+      "loss": 3.6363,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.31868523359298706,
+      "learning_rate": 0.0005531366948411541,
+      "loss": 3.6466,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.31396111845970154,
+      "learning_rate": 0.0005529618187117458,
+      "loss": 3.6298,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.31641459465026855,
+      "learning_rate": 0.0005527869425823375,
+      "loss": 3.6324,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.3213881254196167,
+      "learning_rate": 0.0005526120664529292,
+      "loss": 3.6464,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.32177630066871643,
+      "learning_rate": 0.0005524371903235207,
+      "loss": 3.6374,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.32364439964294434,
+      "learning_rate": 0.0005522623141941124,
+      "loss": 3.6032,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.3256928026676178,
+      "learning_rate": 0.0005520874380647041,
+      "loss": 3.5313,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.332454651594162,
+      "learning_rate": 0.0005519125619352957,
+      "loss": 3.5456,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.34020113945007324,
+      "learning_rate": 0.0005517376858058875,
+      "loss": 3.5473,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.3192497491836548,
+      "learning_rate": 0.0005515628096764791,
+      "loss": 3.5316,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.31511300802230835,
+      "learning_rate": 0.0005513879335470708,
+      "loss": 3.5479,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.3563426191818444,
+      "eval_loss": 3.672767162322998,
+      "eval_runtime": 182.5621,
+      "eval_samples_per_second": 91.174,
+      "eval_steps_per_second": 5.702,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.3340079188346863,
+      "learning_rate": 0.0005512130574176625,
+      "loss": 3.5413,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.3118899464607239,
+      "learning_rate": 0.000551038181288254,
+      "loss": 3.5422,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.3274664580821991,
+      "learning_rate": 0.0005508633051588457,
+      "loss": 3.5457,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.3153150677680969,
+      "learning_rate": 0.0005506884290294374,
+      "loss": 3.56,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.3385670781135559,
+      "learning_rate": 0.0005505135529000291,
+      "loss": 3.5466,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.3047159016132355,
+      "learning_rate": 0.0005503386767706207,
+      "loss": 3.5501,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.3238605260848999,
+      "learning_rate": 0.0005501638006412124,
+      "loss": 3.5547,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.3106607496738434,
+      "learning_rate": 0.0005499889245118041,
+      "loss": 3.5582,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.31510302424430847,
+      "learning_rate": 0.0005498140483823958,
+      "loss": 3.554,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.3922960162162781,
+      "learning_rate": 0.0005496391722529875,
+      "loss": 3.5483,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.333943635225296,
+      "learning_rate": 0.000549464296123579,
+      "loss": 3.56,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.3157419264316559,
+      "learning_rate": 0.0005492894199941707,
+      "loss": 3.5645,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.32612183690071106,
+      "learning_rate": 0.0005491145438647624,
+      "loss": 3.5556,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.33562448620796204,
+      "learning_rate": 0.000548939667735354,
+      "loss": 3.5716,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": 0.32943233847618103,
+      "learning_rate": 0.0005487647916059457,
+      "loss": 3.5683,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.32873958349227905,
+      "learning_rate": 0.0005485899154765374,
+      "loss": 3.5584,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.3185548782348633,
+      "learning_rate": 0.0005484150393471291,
+      "loss": 3.5823,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.32610881328582764,
+      "learning_rate": 0.0005482401632177208,
+      "loss": 3.5761,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.31527790427207947,
+      "learning_rate": 0.0005480652870883124,
+      "loss": 3.5687,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.3269599378108978,
+      "learning_rate": 0.000547890410958904,
+      "loss": 3.5782,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.3571946465826318,
+      "eval_loss": 3.6619949340820312,
+      "eval_runtime": 182.5907,
+      "eval_samples_per_second": 91.16,
+      "eval_steps_per_second": 5.701,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.31807681918144226,
+      "learning_rate": 0.0005477155348294957,
+      "loss": 3.5722,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.3249761462211609,
+      "learning_rate": 0.0005475406587000874,
+      "loss": 3.58,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.33852142095565796,
+      "learning_rate": 0.000547365782570679,
+      "loss": 3.5796,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.32763510942459106,
+      "learning_rate": 0.0005471909064412707,
+      "loss": 3.5777,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.31176745891571045,
+      "learning_rate": 0.0005470160303118624,
+      "loss": 3.5721,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.35347744822502136,
+      "learning_rate": 0.000546841154182454,
+      "loss": 3.5759,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": 0.33067938685417175,
+      "learning_rate": 0.0005466662780530458,
+      "loss": 3.5752,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.3238064646720886,
+      "learning_rate": 0.0005464914019236374,
+      "loss": 3.5855,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.33181995153427124,
+      "learning_rate": 0.000546316525794229,
+      "loss": 3.5676,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.32865026593208313,
+      "learning_rate": 0.0005461416496648207,
+      "loss": 3.5663,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.30539754033088684,
+      "learning_rate": 0.0005459667735354123,
+      "loss": 3.572,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.33434492349624634,
+      "learning_rate": 0.000545791897406004,
+      "loss": 3.5709,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.32164353132247925,
+      "learning_rate": 0.0005456170212765957,
+      "loss": 3.5783,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.3319690525531769,
+      "learning_rate": 0.0005454421451471874,
+      "loss": 3.58,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.3449385464191437,
+      "learning_rate": 0.000545267269017779,
+      "loss": 3.58,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.32032108306884766,
+      "learning_rate": 0.0005450923928883708,
+      "loss": 3.5837,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3191685974597931,
+      "learning_rate": 0.0005449175167589623,
+      "loss": 3.5717,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.32119137048721313,
+      "learning_rate": 0.000544742640629554,
+      "loss": 3.5932,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.33280646800994873,
+      "learning_rate": 0.0005445677645001457,
+      "loss": 3.5691,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.32261455059051514,
+      "learning_rate": 0.0005443928883707373,
+      "loss": 3.5819,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.3582682928119667,
+      "eval_loss": 3.6463451385498047,
+      "eval_runtime": 182.103,
+      "eval_samples_per_second": 91.404,
+      "eval_steps_per_second": 5.717,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.3126870095729828,
+      "learning_rate": 0.000544218012241329,
+      "loss": 3.5687,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.3362468183040619,
+      "learning_rate": 0.0005440431361119207,
+      "loss": 3.5788,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.30104732513427734,
+      "learning_rate": 0.0005438682599825123,
+      "loss": 3.5875,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.3225014805793762,
+      "learning_rate": 0.000543693383853104,
+      "loss": 3.5542,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.3089386522769928,
+      "learning_rate": 0.0005435185077236957,
+      "loss": 3.5733,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.32772549986839294,
+      "learning_rate": 0.0005433436315942873,
+      "loss": 3.5776,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.3394605815410614,
+      "learning_rate": 0.000543168755464879,
+      "loss": 3.5714,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.31668463349342346,
+      "learning_rate": 0.0005429938793354706,
+      "loss": 3.5802,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.30830904841423035,
+      "learning_rate": 0.0005428190032060623,
+      "loss": 3.5706,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.3307313621044159,
+      "learning_rate": 0.000542644127076654,
+      "loss": 3.5669,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.3045443892478943,
+      "learning_rate": 0.0005424692509472457,
+      "loss": 3.5737,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.3446267247200012,
+      "learning_rate": 0.0005422943748178373,
+      "loss": 3.5815,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.33735302090644836,
+      "learning_rate": 0.000542119498688429,
+      "loss": 3.5774,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.3310108184814453,
+      "learning_rate": 0.0005419446225590207,
+      "loss": 3.5777,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.3099808692932129,
+      "learning_rate": 0.0005417697464296122,
+      "loss": 3.5811,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.3206506371498108,
+      "learning_rate": 0.000541594870300204,
+      "loss": 3.5822,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.31750303506851196,
+      "learning_rate": 0.0005414199941707956,
+      "loss": 3.5802,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.33419302105903625,
+      "learning_rate": 0.0005412451180413873,
+      "loss": 3.572,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.35304707288742065,
+      "learning_rate": 0.000541070241911979,
+      "loss": 3.5736,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.3392590284347534,
+      "learning_rate": 0.0005408953657825706,
+      "loss": 3.5792,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.35947126567865034,
+      "eval_loss": 3.634124279022217,
+      "eval_runtime": 181.9213,
+      "eval_samples_per_second": 91.496,
+      "eval_steps_per_second": 5.722,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.3133297264575958,
+      "learning_rate": 0.0005407204896531623,
+      "loss": 3.574,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.31224194169044495,
+      "learning_rate": 0.000540545613523754,
+      "loss": 3.5708,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.3288334310054779,
+      "learning_rate": 0.0005403707373943456,
+      "loss": 3.5705,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.343101441860199,
+      "learning_rate": 0.0005401958612649372,
+      "loss": 3.5059,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.31737393140792847,
+      "learning_rate": 0.000540020985135529,
+      "loss": 3.4676,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.3343667685985565,
+      "learning_rate": 0.0005398461090061206,
+      "loss": 3.4729,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.3518417477607727,
+      "learning_rate": 0.0005396712328767123,
+      "loss": 3.4788,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.32810088992118835,
+      "learning_rate": 0.000539496356747304,
+      "loss": 3.4653,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.3590436279773712,
+      "learning_rate": 0.0005393214806178956,
+      "loss": 3.4829,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.3379361629486084,
+      "learning_rate": 0.0005391466044884873,
+      "loss": 3.4851,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.3168104588985443,
+      "learning_rate": 0.000538971728359079,
+      "loss": 3.4889,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.33108997344970703,
+      "learning_rate": 0.0005387968522296705,
+      "loss": 3.492,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.3419332802295685,
+      "learning_rate": 0.0005386219761002622,
+      "loss": 3.4919,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.34570637345314026,
+      "learning_rate": 0.0005384470999708539,
+      "loss": 3.4986,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.3261895477771759,
+      "learning_rate": 0.0005382722238414456,
+      "loss": 3.5012,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.34492751955986023,
+      "learning_rate": 0.0005380973477120373,
+      "loss": 3.4965,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.3237833082675934,
+      "learning_rate": 0.000537922471582629,
+      "loss": 3.4965,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.31047049164772034,
+      "learning_rate": 0.0005377475954532206,
+      "loss": 3.5017,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.32261335849761963,
+      "learning_rate": 0.0005375727193238123,
+      "loss": 3.5174,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.3363330662250519,
+      "learning_rate": 0.000537397843194404,
+      "loss": 3.512,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.3601516413607749,
+      "eval_loss": 3.6384644508361816,
+      "eval_runtime": 181.9458,
+      "eval_samples_per_second": 91.483,
+      "eval_steps_per_second": 5.721,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.3440595269203186,
+      "learning_rate": 0.0005372229670649955,
+      "loss": 3.505,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.3164835572242737,
+      "learning_rate": 0.0005370480909355872,
+      "loss": 3.5013,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.3304733335971832,
+      "learning_rate": 0.0005368732148061789,
+      "loss": 3.5155,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.3306984305381775,
+      "learning_rate": 0.0005366983386767705,
+      "loss": 3.5053,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.32221606373786926,
+      "learning_rate": 0.0005365234625473623,
+      "loss": 3.5078,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.30464252829551697,
+      "learning_rate": 0.0005363485864179539,
+      "loss": 3.5094,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.32405513525009155,
+      "learning_rate": 0.0005361737102885456,
+      "loss": 3.507,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.33651819825172424,
+      "learning_rate": 0.0005359988341591373,
+      "loss": 3.5145,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.357702374458313,
+      "learning_rate": 0.000535823958029729,
+      "loss": 3.5132,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.3228895962238312,
+      "learning_rate": 0.0005356490819003205,
+      "loss": 3.4973,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.3350990414619446,
+      "learning_rate": 0.0005354742057709122,
+      "loss": 3.5208,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.34133604168891907,
+      "learning_rate": 0.0005352993296415039,
+      "loss": 3.5177,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.32490041851997375,
+      "learning_rate": 0.0005351244535120955,
+      "loss": 3.5159,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.32596027851104736,
+      "learning_rate": 0.0005349495773826873,
+      "loss": 3.5274,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.3423188626766205,
+      "learning_rate": 0.0005347747012532789,
+      "loss": 3.53,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.31081074476242065,
+      "learning_rate": 0.0005345998251238706,
+      "loss": 3.5269,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.34136995673179626,
+      "learning_rate": 0.0005344249489944623,
+      "loss": 3.5268,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.34362757205963135,
+      "learning_rate": 0.0005342500728650538,
+      "loss": 3.5227,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.30831918120384216,
+      "learning_rate": 0.0005340751967356455,
+      "loss": 3.5222,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.3135395646095276,
+      "learning_rate": 0.0005339003206062372,
+      "loss": 3.5208,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.3609769804464003,
+      "eval_loss": 3.626232385635376,
+      "eval_runtime": 181.8603,
+      "eval_samples_per_second": 91.526,
+      "eval_steps_per_second": 5.724,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.3581465482711792,
+      "learning_rate": 0.0005337254444768288,
+      "loss": 3.531,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.32384639978408813,
+      "learning_rate": 0.0005335505683474205,
+      "loss": 3.5197,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.3450806736946106,
+      "learning_rate": 0.0005333756922180122,
+      "loss": 3.5301,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.32282331585884094,
+      "learning_rate": 0.0005332008160886039,
+      "loss": 3.5306,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.3486621677875519,
+      "learning_rate": 0.0005330259399591956,
+      "loss": 3.5208,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.3094702363014221,
+      "learning_rate": 0.0005328510638297873,
+      "loss": 3.5239,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.3274450898170471,
+      "learning_rate": 0.0005326761877003788,
+      "loss": 3.5336,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.3350226879119873,
+      "learning_rate": 0.0005325013115709705,
+      "loss": 3.5326,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.3588801622390747,
+      "learning_rate": 0.0005323264354415622,
+      "loss": 3.5271,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.3390669524669647,
+      "learning_rate": 0.0005321515593121538,
+      "loss": 3.5297,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.322145938873291,
+      "learning_rate": 0.0005319766831827455,
+      "loss": 3.5217,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.35364869236946106,
+      "learning_rate": 0.0005318018070533372,
+      "loss": 3.5149,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.32203230261802673,
+      "learning_rate": 0.0005316269309239288,
+      "loss": 3.5356,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.352469265460968,
+      "learning_rate": 0.0005314520547945206,
+      "loss": 3.532,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": NaN,
+      "learning_rate": 0.0005312771786651121,
+      "loss": 3.5222,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.3287372589111328,
+      "learning_rate": 0.0005311023025357038,
+      "loss": 3.5235,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3262624442577362,
+      "learning_rate": 0.0005309274264062955,
+      "loss": 3.5397,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.3266109228134155,
+      "learning_rate": 0.0005307525502768872,
+      "loss": 3.5228,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.34884291887283325,
+      "learning_rate": 0.0005305776741474788,
+      "loss": 3.5194,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.3074500858783722,
+      "learning_rate": 0.0005304027980180705,
+      "loss": 3.5273,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.3620989478102355,
+      "eval_loss": 3.6148364543914795,
+      "eval_runtime": 181.8652,
+      "eval_samples_per_second": 91.524,
+      "eval_steps_per_second": 5.724,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.3382808566093445,
+      "learning_rate": 0.0005302279218886622,
+      "loss": 3.5279,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.3046127259731293,
+      "learning_rate": 0.0005300530457592538,
+      "loss": 3.5388,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.3430224657058716,
+      "learning_rate": 0.0005298781696298456,
+      "loss": 3.5282,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.36001190543174744,
+      "learning_rate": 0.0005297032935004371,
+      "loss": 3.5291,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.3140873312950134,
+      "learning_rate": 0.0005295284173710288,
+      "loss": 3.5389,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.34070631861686707,
+      "learning_rate": 0.0005293535412416205,
+      "loss": 3.5397,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.3694857954978943,
+      "learning_rate": 0.0005291786651122121,
+      "loss": 3.5365,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.32443273067474365,
+      "learning_rate": 0.0005290037889828038,
+      "loss": 3.5392,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.3110935389995575,
+      "learning_rate": 0.0005288289128533955,
+      "loss": 3.5326,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.3232935965061188,
+      "learning_rate": 0.0005286540367239872,
+      "loss": 3.5381,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.3413400948047638,
+      "learning_rate": 0.0005284791605945788,
+      "loss": 3.5416,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.3248980641365051,
+      "learning_rate": 0.0005283042844651704,
+      "loss": 3.5249,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.3668578565120697,
+      "learning_rate": 0.0005281294083357621,
+      "loss": 3.4226,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.32745733857154846,
+      "learning_rate": 0.0005279545322063538,
+      "loss": 3.4291,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.3670319616794586,
+      "learning_rate": 0.0005277796560769455,
+      "loss": 3.4327,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.3462134897708893,
+      "learning_rate": 0.0005276047799475371,
+      "loss": 3.4398,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.3434312641620636,
+      "learning_rate": 0.0005274299038181288,
+      "loss": 3.4347,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.3447525203227997,
+      "learning_rate": 0.0005272550276887205,
+      "loss": 3.448,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.3339570462703705,
+      "learning_rate": 0.0005270801515593121,
+      "loss": 3.4478,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.32857051491737366,
+      "learning_rate": 0.0005269052754299037,
+      "loss": 3.4475,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.3623171571183439,
+      "eval_loss": 3.619291067123413,
+      "eval_runtime": 182.0455,
+      "eval_samples_per_second": 91.433,
+      "eval_steps_per_second": 5.718,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.33486610651016235,
+      "learning_rate": 0.0005267303993004954,
+      "loss": 3.435,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.3245387077331543,
+      "learning_rate": 0.000526555523171087,
+      "loss": 3.4504,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.325870543718338,
+      "learning_rate": 0.0005263806470416788,
+      "loss": 3.4532,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.35105100274086,
+      "learning_rate": 0.0005262057709122704,
+      "loss": 3.4521,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.3488394320011139,
+      "learning_rate": 0.0005260308947828621,
+      "loss": 3.4584,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.3601958453655243,
+      "learning_rate": 0.0005258560186534538,
+      "loss": 3.4632,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.320527583360672,
+      "learning_rate": 0.0005256811425240455,
+      "loss": 3.4616,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.3193604648113251,
+      "learning_rate": 0.0005255062663946371,
+      "loss": 3.4545,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.32400959730148315,
+      "learning_rate": 0.0005253313902652287,
+      "loss": 3.46,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.36129894852638245,
+      "learning_rate": 0.0005251565141358204,
+      "loss": 3.4582,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.34856846928596497,
+      "learning_rate": 0.000524981638006412,
+      "loss": 3.4597,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.35143759846687317,
+      "learning_rate": 0.0005248067618770038,
+      "loss": 3.4663,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.3181470036506653,
+      "learning_rate": 0.0005246318857475954,
+      "loss": 3.4571,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.3355952799320221,
+      "learning_rate": 0.0005244570096181871,
+      "loss": 3.4644,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.3471963405609131,
+      "learning_rate": 0.0005242821334887788,
+      "loss": 3.4665,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.322955846786499,
+      "learning_rate": 0.0005241072573593704,
+      "loss": 3.4713,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.35527798533439636,
+      "learning_rate": 0.000523932381229962,
+      "loss": 3.4744,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.3321806490421295,
+      "learning_rate": 0.0005237575051005537,
+      "loss": 3.4808,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.33331242203712463,
+      "learning_rate": 0.0005235826289711454,
+      "loss": 3.4621,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.3406297266483307,
+      "learning_rate": 0.000523407752841737,
+      "loss": 3.4806,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.3629372195595958,
+      "eval_loss": 3.6126840114593506,
+      "eval_runtime": 179.7323,
+      "eval_samples_per_second": 92.61,
+      "eval_steps_per_second": 5.792,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.3514094352722168,
+      "learning_rate": 0.0005232328767123287,
+      "loss": 3.4762,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.33424749970436096,
+      "learning_rate": 0.0005230580005829204,
+      "loss": 3.4723,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.3223506808280945,
+      "learning_rate": 0.0005228831244535121,
+      "loss": 3.4735,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.3629089891910553,
+      "learning_rate": 0.0005227082483241038,
+      "loss": 3.4835,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.3067444860935211,
+      "learning_rate": 0.0005225333721946954,
+      "loss": 3.4759,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.31871816515922546,
+      "learning_rate": 0.000522358496065287,
+      "loss": 3.4852,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.3342389464378357,
+      "learning_rate": 0.0005221836199358787,
+      "loss": 3.4748,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.334839403629303,
+      "learning_rate": 0.0005220087438064703,
+      "loss": 3.48,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.3808937072753906,
+      "learning_rate": 0.000521833867677062,
+      "loss": 3.4808,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.3652092516422272,
+      "learning_rate": 0.0005216589915476537,
+      "loss": 3.5021,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.32643789052963257,
+      "learning_rate": 0.0005214841154182454,
+      "loss": 3.4911,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.3469211459159851,
+      "learning_rate": 0.0005213092392888371,
+      "loss": 3.4757,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.3310937285423279,
+      "learning_rate": 0.0005211343631594287,
+      "loss": 3.4829,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.3375169634819031,
+      "learning_rate": 0.0005209594870300204,
+      "loss": 3.4889,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.3277340531349182,
+      "learning_rate": 0.000520784610900612,
+      "loss": 3.4734,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.35384461283683777,
+      "learning_rate": 0.0005206097347712037,
+      "loss": 3.5004,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.33254358172416687,
+      "learning_rate": 0.0005204348586417953,
+      "loss": 3.4922,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.3284110426902771,
+      "learning_rate": 0.000520259982512387,
+      "loss": 3.4888,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.32339197397232056,
+      "learning_rate": 0.0005200851063829787,
+      "loss": 3.4905,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.33628493547439575,
+      "learning_rate": 0.0005199102302535703,
+      "loss": 3.4748,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.36394831872432204,
+      "eval_loss": 3.600461006164551,
+      "eval_runtime": 179.7946,
+      "eval_samples_per_second": 92.578,
+      "eval_steps_per_second": 5.79,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.3240261375904083,
+      "learning_rate": 0.0005197353541241621,
+      "loss": 3.4955,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.3188318610191345,
+      "learning_rate": 0.0005195604779947537,
+      "loss": 3.483,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.3339631259441376,
+      "learning_rate": 0.0005193856018653454,
+      "loss": 3.4883,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.3179808557033539,
+      "learning_rate": 0.000519210725735937,
+      "loss": 3.4943,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.3453110456466675,
+      "learning_rate": 0.0005190358496065286,
+      "loss": 3.496,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.32360783219337463,
+      "learning_rate": 0.0005188609734771203,
+      "loss": 3.4991,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.3246710002422333,
+      "learning_rate": 0.000518686097347712,
+      "loss": 3.4903,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.344545841217041,
+      "learning_rate": 0.0005185112212183037,
+      "loss": 3.4918,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.32257169485092163,
+      "learning_rate": 0.0005183363450888953,
+      "loss": 3.487,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.3380378484725952,
+      "learning_rate": 0.000518161468959487,
+      "loss": 3.4881,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.34541237354278564,
+      "learning_rate": 0.0005179865928300787,
+      "loss": 3.4939,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.3542953431606293,
+      "learning_rate": 0.0005178117167006703,
+      "loss": 3.4869,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.3760510981082916,
+      "learning_rate": 0.000517636840571262,
+      "loss": 3.5067,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.33901602029800415,
+      "learning_rate": 0.0005174619644418536,
+      "loss": 3.4951,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.33704662322998047,
+      "learning_rate": 0.0005172870883124453,
+      "loss": 3.4922,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.32309016585350037,
+      "learning_rate": 0.000517112212183037,
+      "loss": 3.4908,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.3241852819919586,
+      "learning_rate": 0.0005169373360536286,
+      "loss": 3.4964,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.3242267370223999,
+      "learning_rate": 0.0005167624599242203,
+      "loss": 3.4817,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.3280220925807953,
+      "learning_rate": 0.000516587583794812,
+      "loss": 3.4932,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.3425884544849396,
+      "learning_rate": 0.0005164127076654037,
+      "loss": 3.4908,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.3643506421361469,
+      "eval_loss": 3.5942769050598145,
+      "eval_runtime": 179.5597,
+      "eval_samples_per_second": 92.699,
+      "eval_steps_per_second": 5.798,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.3431827127933502,
+      "learning_rate": 0.0005162378315359953,
+      "loss": 3.45,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.36127012968063354,
+      "learning_rate": 0.0005160629554065869,
+      "loss": 3.3704,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.36049118638038635,
+      "learning_rate": 0.0005158880792771786,
+      "loss": 3.3914,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.3440174162387848,
+      "learning_rate": 0.0005157132031477703,
+      "loss": 3.3868,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.38178542256355286,
+      "learning_rate": 0.000515538327018362,
+      "loss": 3.3956,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.32825422286987305,
+      "learning_rate": 0.0005153634508889536,
+      "loss": 3.3839,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.34752145409584045,
+      "learning_rate": 0.0005151885747595453,
+      "loss": 3.3979,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.33364078402519226,
+      "learning_rate": 0.000515013698630137,
+      "loss": 3.404,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.3302494287490845,
+      "learning_rate": 0.0005148388225007285,
+      "loss": 3.4225,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.33415162563323975,
+      "learning_rate": 0.0005146639463713203,
+      "loss": 3.401,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.33528947830200195,
+      "learning_rate": 0.0005144890702419119,
+      "loss": 3.4015,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.3421080410480499,
+      "learning_rate": 0.0005143141941125036,
+      "loss": 3.4205,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.3325115442276001,
+      "learning_rate": 0.0005141393179830953,
+      "loss": 3.4281,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.31258365511894226,
+      "learning_rate": 0.0005139644418536869,
+      "loss": 3.426,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.31508442759513855,
+      "learning_rate": 0.0005137895657242786,
+      "loss": 3.4143,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.3417088985443115,
+      "learning_rate": 0.0005136146895948703,
+      "loss": 3.4357,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.3098302185535431,
+      "learning_rate": 0.000513439813465462,
+      "loss": 3.4303,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.31606337428092957,
+      "learning_rate": 0.0005132649373360535,
+      "loss": 3.4353,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.33023601770401,
+      "learning_rate": 0.0005130900612066452,
+      "loss": 3.4206,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.33378899097442627,
+      "learning_rate": 0.0005129151850772369,
+      "loss": 3.4153,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.364236481986269,
+      "eval_loss": 3.6024866104125977,
+      "eval_runtime": 179.4604,
+      "eval_samples_per_second": 92.75,
+      "eval_steps_per_second": 5.801,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.3245193064212799,
+      "learning_rate": 0.0005127403089478286,
+      "loss": 3.427,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.35483458638191223,
+      "learning_rate": 0.0005125654328184203,
+      "loss": 3.4314,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.3492553234100342,
+      "learning_rate": 0.0005123905566890119,
+      "loss": 3.4336,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.35173410177230835,
+      "learning_rate": 0.0005122156805596036,
+      "loss": 3.4165,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.3420160412788391,
+      "learning_rate": 0.0005120408044301953,
+      "loss": 3.4402,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.3215605318546295,
+      "learning_rate": 0.0005118659283007868,
+      "loss": 3.447,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.3140503764152527,
+      "learning_rate": 0.0005116910521713785,
+      "loss": 3.4381,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.32911545038223267,
+      "learning_rate": 0.0005115161760419702,
+      "loss": 3.4403,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.3454091548919678,
+      "learning_rate": 0.0005113412999125619,
+      "loss": 3.4435,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.3304098844528198,
+      "learning_rate": 0.0005111664237831536,
+      "loss": 3.4443,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.32890447974205017,
+      "learning_rate": 0.0005109915476537452,
+      "loss": 3.4347,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.3333839476108551,
+      "learning_rate": 0.0005108166715243369,
+      "loss": 3.4441,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.3388593792915344,
+      "learning_rate": 0.0005106417953949286,
+      "loss": 3.4551,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.3506496846675873,
+      "learning_rate": 0.0005104669192655203,
+      "loss": 3.4465,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.35972943902015686,
+      "learning_rate": 0.0005102920431361118,
+      "loss": 3.455,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.3275600075721741,
+      "learning_rate": 0.0005101171670067035,
+      "loss": 3.4599,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.3396972417831421,
+      "learning_rate": 0.0005099422908772952,
+      "loss": 3.4512,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.3468742072582245,
+      "learning_rate": 0.0005097674147478868,
+      "loss": 3.4439,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.3341714143753052,
+      "learning_rate": 0.0005095925386184786,
+      "loss": 3.454,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.33167895674705505,
+      "learning_rate": 0.0005094176624890702,
+      "loss": 3.4552,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.3649543623932247,
+      "eval_loss": 3.5933761596679688,
+      "eval_runtime": 179.6882,
+      "eval_samples_per_second": 92.633,
+      "eval_steps_per_second": 5.793,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.38421186804771423,
+      "learning_rate": 0.0005092427863596619,
+      "loss": 3.4496,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.3296069800853729,
+      "learning_rate": 0.0005090679102302536,
+      "loss": 3.4484,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.33456072211265564,
+      "learning_rate": 0.0005088930341008451,
+      "loss": 3.4478,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.34444794058799744,
+      "learning_rate": 0.0005087181579714368,
+      "loss": 3.4491,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.3780238628387451,
+      "learning_rate": 0.0005085432818420285,
+      "loss": 3.451,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.33494746685028076,
+      "learning_rate": 0.0005083684057126202,
+      "loss": 3.4543,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.40819284319877625,
+      "learning_rate": 0.0005081935295832118,
+      "loss": 3.4655,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.3251825273036957,
+      "learning_rate": 0.0005080186534538035,
+      "loss": 3.4459,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.3159500062465668,
+      "learning_rate": 0.0005078437773243952,
+      "loss": 3.4561,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.3354164958000183,
+      "learning_rate": 0.0005076689011949869,
+      "loss": 3.4639,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.3452058732509613,
+      "learning_rate": 0.0005074940250655786,
+      "loss": 3.4627,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.3344949781894684,
+      "learning_rate": 0.0005073191489361701,
+      "loss": 3.4492,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.35478341579437256,
+      "learning_rate": 0.0005071442728067618,
+      "loss": 3.4452,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.3661314845085144,
+      "learning_rate": 0.0005069693966773535,
+      "loss": 3.4577,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.34170424938201904,
+      "learning_rate": 0.0005067945205479451,
+      "loss": 3.4678,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.31290966272354126,
+      "learning_rate": 0.0005066196444185368,
+      "loss": 3.4606,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.35089555382728577,
+      "learning_rate": 0.0005064447682891285,
+      "loss": 3.4679,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.33421048521995544,
+      "learning_rate": 0.0005062698921597202,
+      "loss": 3.4708,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.35330483317375183,
+      "learning_rate": 0.0005060950160303119,
+      "loss": 3.4581,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.3339422941207886,
+      "learning_rate": 0.0005059201399009035,
+      "loss": 3.4603,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.3656727130788616,
+      "eval_loss": 3.585218906402588,
+      "eval_runtime": 179.5703,
+      "eval_samples_per_second": 92.694,
+      "eval_steps_per_second": 5.797,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.335092157125473,
+      "learning_rate": 0.0005057452637714951,
+      "loss": 3.4618,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.35167837142944336,
+      "learning_rate": 0.0005055703876420868,
+      "loss": 3.4618,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.3454788327217102,
+      "learning_rate": 0.0005053955115126785,
+      "loss": 3.4624,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.35379868745803833,
+      "learning_rate": 0.0005052206353832701,
+      "loss": 3.4515,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.35463815927505493,
+      "learning_rate": 0.0005050457592538618,
+      "loss": 3.4616,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.36919155716896057,
+      "learning_rate": 0.0005048708831244535,
+      "loss": 3.465,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.32335364818573,
+      "learning_rate": 0.0005046960069950451,
+      "loss": 3.4751,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.32492557168006897,
+      "learning_rate": 0.0005045211308656369,
+      "loss": 3.4596,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.32239827513694763,
+      "learning_rate": 0.0005043462547362284,
+      "loss": 3.4709,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.3487697243690491,
+      "learning_rate": 0.0005041713786068201,
+      "loss": 3.3878,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.339937299489975,
+      "learning_rate": 0.0005039965024774118,
+      "loss": 3.3456,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.34511151909828186,
+      "learning_rate": 0.0005038216263480034,
+      "loss": 3.3591,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.33415067195892334,
+      "learning_rate": 0.0005036467502185951,
+      "loss": 3.351,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.33265748620033264,
+      "learning_rate": 0.0005034718740891868,
+      "loss": 3.3549,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.3668820261955261,
+      "learning_rate": 0.0005032969979597785,
+      "loss": 3.3766,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.35505983233451843,
+      "learning_rate": 0.0005031221218303701,
+      "loss": 3.3705,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.3510807752609253,
+      "learning_rate": 0.0005029472457009618,
+      "loss": 3.3713,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.3338639736175537,
+      "learning_rate": 0.0005027723695715534,
+      "loss": 3.3701,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.327267587184906,
+      "learning_rate": 0.0005025974934421451,
+      "loss": 3.3841,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.3316822052001953,
+      "learning_rate": 0.0005024226173127368,
+      "loss": 3.3745,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.36602271798739533,
+      "eval_loss": 3.592015266418457,
+      "eval_runtime": 179.543,
+      "eval_samples_per_second": 92.708,
+      "eval_steps_per_second": 5.798,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.34737464785575867,
+      "learning_rate": 0.0005022477411833284,
+      "loss": 3.3776,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.35966166853904724,
+      "learning_rate": 0.0005020728650539201,
+      "loss": 3.3925,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.3718971312046051,
+      "learning_rate": 0.0005018979889245118,
+      "loss": 3.3919,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.36448919773101807,
+      "learning_rate": 0.0005017231127951034,
+      "loss": 3.383,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.3384815752506256,
+      "learning_rate": 0.0005015482366656951,
+      "loss": 3.395,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.35150644183158875,
+      "learning_rate": 0.0005013733605362868,
+      "loss": 3.392,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.3527531027793884,
+      "learning_rate": 0.0005011984844068784,
+      "loss": 3.3917,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.35610780119895935,
+      "learning_rate": 0.0005010236082774701,
+      "loss": 3.4048,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.3675138056278229,
+      "learning_rate": 0.0005008487321480617,
+      "loss": 3.4005,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.33330920338630676,
+      "learning_rate": 0.0005006738560186534,
+      "loss": 3.4086,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.3794533312320709,
+      "learning_rate": 0.0005004989798892451,
+      "loss": 3.4062,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.33697935938835144,
+      "learning_rate": 0.0005003241037598368,
+      "loss": 3.4017,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.3670228123664856,
+      "learning_rate": 0.0005001492276304284,
+      "loss": 3.4173,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.3506808578968048,
+      "learning_rate": 0.0004999743515010201,
+      "loss": 3.405,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.31867876648902893,
+      "learning_rate": 0.0004997994753716117,
+      "loss": 3.423,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.34136125445365906,
+      "learning_rate": 0.0004996245992422033,
+      "loss": 3.4096,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.36181262135505676,
+      "learning_rate": 0.0004994497231127951,
+      "loss": 3.4094,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.36319324374198914,
+      "learning_rate": 0.0004992748469833867,
+      "loss": 3.4074,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.31755128502845764,
+      "learning_rate": 0.0004990999708539784,
+      "loss": 3.4134,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.34557104110717773,
+      "learning_rate": 0.0004989250947245701,
+      "loss": 3.406,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.36604270483134926,
+      "eval_loss": 3.5855937004089355,
+      "eval_runtime": 179.6608,
+      "eval_samples_per_second": 92.647,
+      "eval_steps_per_second": 5.794,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.3253464102745056,
+      "learning_rate": 0.0004987502185951617,
+      "loss": 3.4097,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.3373265564441681,
+      "learning_rate": 0.0004985753424657534,
+      "loss": 3.4196,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.3448173701763153,
+      "learning_rate": 0.000498400466336345,
+      "loss": 3.423,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.33007627725601196,
+      "learning_rate": 0.0004982255902069367,
+      "loss": 3.4089,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.3380190432071686,
+      "learning_rate": 0.0004980507140775283,
+      "loss": 3.4091,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.33369770646095276,
+      "learning_rate": 0.0004978758379481201,
+      "loss": 3.4299,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.3573598265647888,
+      "learning_rate": 0.0004977009618187117,
+      "loss": 3.4095,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.3663587272167206,
+      "learning_rate": 0.0004975260856893034,
+      "loss": 3.4357,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.3414006233215332,
+      "learning_rate": 0.0004973512095598951,
+      "loss": 3.4199,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.34500792622566223,
+      "learning_rate": 0.0004971763334304867,
+      "loss": 3.4219,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.32384201884269714,
+      "learning_rate": 0.0004970014573010784,
+      "loss": 3.4209,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.351113498210907,
+      "learning_rate": 0.00049682658117167,
+      "loss": 3.425,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.3571644127368927,
+      "learning_rate": 0.0004966517050422616,
+      "loss": 3.4208,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.34789296984672546,
+      "learning_rate": 0.0004964768289128533,
+      "loss": 3.4295,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.34940293431282043,
+      "learning_rate": 0.000496301952783445,
+      "loss": 3.4267,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.34850820899009705,
+      "learning_rate": 0.0004961270766540367,
+      "loss": 3.4153,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.35262537002563477,
+      "learning_rate": 0.0004959522005246284,
+      "loss": 3.4276,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.33390718698501587,
+      "learning_rate": 0.00049577732439522,
+      "loss": 3.4387,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.33877861499786377,
+      "learning_rate": 0.0004956024482658117,
+      "loss": 3.4404,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.3310054838657379,
+      "learning_rate": 0.0004954275721364034,
+      "loss": 3.429,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.36692130297762826,
+      "eval_loss": 3.57623028755188,
+      "eval_runtime": 179.5297,
+      "eval_samples_per_second": 92.714,
+      "eval_steps_per_second": 5.798,
+      "step": 30000
+    },
+    {
+      "epoch": 8.753320904218132,
+      "grad_norm": 0.3291216790676117,
+      "learning_rate": 0.000495252696006995,
+      "loss": 3.4293,
+      "step": 30050
+    },
+    {
+      "epoch": 8.767886273595899,
+      "grad_norm": 0.336401104927063,
+      "learning_rate": 0.0004950778198775866,
+      "loss": 3.4227,
+      "step": 30100
+    },
+    {
+      "epoch": 8.782451642973665,
+      "grad_norm": 0.3546486794948578,
+      "learning_rate": 0.0004949029437481783,
+      "loss": 3.4305,
+      "step": 30150
+    },
+    {
+      "epoch": 8.797017012351434,
+      "grad_norm": 0.3768438398838043,
+      "learning_rate": 0.00049472806761877,
+      "loss": 3.4396,
+      "step": 30200
+    },
+    {
+      "epoch": 8.8115823817292,
+      "grad_norm": 0.36031797528266907,
+      "learning_rate": 0.0004945531914893616,
+      "loss": 3.4374,
+      "step": 30250
+    },
+    {
+      "epoch": 8.826147751106967,
+      "grad_norm": 0.338821679353714,
+      "learning_rate": 0.0004943783153599534,
+      "loss": 3.434,
+      "step": 30300
+    },
+    {
+      "epoch": 8.840713120484736,
+      "grad_norm": 0.3513728976249695,
+      "learning_rate": 0.000494203439230545,
+      "loss": 3.4468,
+      "step": 30350
+    },
+    {
+      "epoch": 8.855278489862503,
+      "grad_norm": 0.3318538963794708,
+      "learning_rate": 0.0004940285631011367,
+      "loss": 3.4349,
+      "step": 30400
+    },
+    {
+      "epoch": 8.86984385924027,
+      "grad_norm": 0.35798031091690063,
+      "learning_rate": 0.0004938536869717284,
+      "loss": 3.4249,
+      "step": 30450
+    },
+    {
+      "epoch": 8.884409228618038,
+      "grad_norm": 0.32275164127349854,
+      "learning_rate": 0.0004936788108423199,
+      "loss": 3.4337,
+      "step": 30500
+    },
+    {
+      "epoch": 8.898974597995805,
+      "grad_norm": 0.3375926613807678,
+      "learning_rate": 0.0004935039347129116,
+      "loss": 3.4416,
+      "step": 30550
+    },
+    {
+      "epoch": 8.913539967373573,
+      "grad_norm": 0.3426482677459717,
+      "learning_rate": 0.0004933290585835033,
+      "loss": 3.4399,
+      "step": 30600
+    },
+    {
+      "epoch": 8.92810533675134,
+      "grad_norm": 0.3388661742210388,
+      "learning_rate": 0.000493154182454095,
+      "loss": 3.4365,
+      "step": 30650
+    },
+    {
+      "epoch": 8.942670706129107,
+      "grad_norm": 0.33085867762565613,
+      "learning_rate": 0.0004929793063246866,
+      "loss": 3.4426,
+      "step": 30700
+    },
+    {
+      "epoch": 8.957236075506875,
+      "grad_norm": 0.34728479385375977,
+      "learning_rate": 0.0004928044301952783,
+      "loss": 3.4563,
+      "step": 30750
+    },
+    {
+      "epoch": 8.971801444884642,
+      "grad_norm": 0.37290677428245544,
+      "learning_rate": 0.00049262955406587,
+      "loss": 3.4409,
+      "step": 30800
+    },
+    {
+      "epoch": 8.986366814262409,
+      "grad_norm": 0.3528863787651062,
+      "learning_rate": 0.0004924546779364617,
+      "loss": 3.4321,
+      "step": 30850
+    },
+    {
+      "epoch": 9.000873922162667,
+      "grad_norm": 0.33041954040527344,
+      "learning_rate": 0.0004922798018070533,
+      "loss": 3.4225,
+      "step": 30900
+    },
+    {
+      "epoch": 9.015439291540433,
+      "grad_norm": 0.3403935730457306,
+      "learning_rate": 0.0004921049256776449,
+      "loss": 3.3243,
+      "step": 30950
+    },
+    {
+      "epoch": 9.0300046609182,
+      "grad_norm": 0.350462406873703,
+      "learning_rate": 0.0004919300495482366,
+      "loss": 3.317,
+      "step": 31000
+    },
+    {
+      "epoch": 9.0300046609182,
+      "eval_accuracy": 0.3672527318782519,
+      "eval_loss": 3.5818264484405518,
+      "eval_runtime": 179.6159,
+      "eval_samples_per_second": 92.67,
+      "eval_steps_per_second": 5.796,
+      "step": 31000
+    },
+    {
+      "epoch": 9.044570030295969,
+      "grad_norm": 0.3548542857170105,
+      "learning_rate": 0.0004917551734188283,
+      "loss": 3.3339,
+      "step": 31050
+    },
+    {
+      "epoch": 9.059135399673735,
+      "grad_norm": 0.34578046202659607,
+      "learning_rate": 0.0004915802972894199,
+      "loss": 3.3305,
+      "step": 31100
+    },
+    {
+      "epoch": 9.073700769051504,
+      "grad_norm": 0.335111141204834,
+      "learning_rate": 0.0004914054211600116,
+      "loss": 3.337,
+      "step": 31150
+    },
+    {
+      "epoch": 9.08826613842927,
+      "grad_norm": 0.32308679819107056,
+      "learning_rate": 0.0004912305450306033,
+      "loss": 3.3521,
+      "step": 31200
+    },
+    {
+      "epoch": 9.102831507807037,
+      "grad_norm": 0.359343558549881,
+      "learning_rate": 0.000491055668901195,
+      "loss": 3.3517,
+      "step": 31250
+    },
+    {
+      "epoch": 9.117396877184806,
+      "grad_norm": 0.3571661710739136,
+      "learning_rate": 0.0004908807927717865,
+      "loss": 3.3586,
+      "step": 31300
+    },
+    {
+      "epoch": 9.131962246562573,
+      "grad_norm": 0.3565060496330261,
+      "learning_rate": 0.0004907059166423783,
+      "loss": 3.3453,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14652761594034,
+      "grad_norm": 0.39377450942993164,
+      "learning_rate": 0.0004905310405129699,
+      "loss": 3.3478,
+      "step": 31400
+    },
+    {
+      "epoch": 9.161092985318108,
+      "grad_norm": 0.3553165793418884,
+      "learning_rate": 0.0004903561643835616,
+      "loss": 3.3566,
+      "step": 31450
+    },
+    {
+      "epoch": 9.175658354695875,
+      "grad_norm": 0.34331372380256653,
+      "learning_rate": 0.0004901812882541533,
+      "loss": 3.3631,
+      "step": 31500
+    },
+    {
+      "epoch": 9.190223724073643,
+      "grad_norm": 0.3618679344654083,
+      "learning_rate": 0.0004900064121247449,
+      "loss": 3.3571,
+      "step": 31550
+    },
+    {
+      "epoch": 9.20478909345141,
+      "grad_norm": 0.37382376194000244,
+      "learning_rate": 0.0004898315359953366,
+      "loss": 3.3748,
+      "step": 31600
+    },
+    {
+      "epoch": 9.219354462829177,
+      "grad_norm": 0.35998404026031494,
+      "learning_rate": 0.0004896566598659283,
+      "loss": 3.3669,
+      "step": 31650
+    },
+    {
+      "epoch": 9.233919832206945,
+      "grad_norm": 0.33387935161590576,
+      "learning_rate": 0.0004894817837365199,
+      "loss": 3.3586,
+      "step": 31700
+    },
+    {
+      "epoch": 9.248485201584712,
+      "grad_norm": 0.3467971682548523,
+      "learning_rate": 0.0004893069076071115,
+      "loss": 3.37,
+      "step": 31750
+    },
+    {
+      "epoch": 9.263050570962479,
+      "grad_norm": 0.33971235156059265,
+      "learning_rate": 0.0004891320314777032,
+      "loss": 3.3652,
+      "step": 31800
+    },
+    {
+      "epoch": 9.277615940340247,
+      "grad_norm": 0.33422431349754333,
+      "learning_rate": 0.0004889571553482949,
+      "loss": 3.3646,
+      "step": 31850
+    },
+    {
+      "epoch": 9.292181309718014,
+      "grad_norm": 0.3441632390022278,
+      "learning_rate": 0.0004887822792188866,
+      "loss": 3.3774,
+      "step": 31900
+    },
+    {
+      "epoch": 9.306746679095783,
+      "grad_norm": 0.34111490845680237,
+      "learning_rate": 0.0004886074030894782,
+      "loss": 3.37,
+      "step": 31950
+    },
+    {
+      "epoch": 9.32131204847355,
+      "grad_norm": 0.35753685235977173,
+      "learning_rate": 0.0004884325269600699,
+      "loss": 3.3833,
+      "step": 32000
+    },
+    {
+      "epoch": 9.32131204847355,
+      "eval_accuracy": 0.36719994309627957,
+      "eval_loss": 3.580444812774658,
+      "eval_runtime": 179.6886,
+      "eval_samples_per_second": 92.632,
+      "eval_steps_per_second": 5.793,
+      "step": 32000
+    },
+    {
+      "epoch": 9.335877417851316,
+      "grad_norm": 0.3657474219799042,
+      "learning_rate": 0.0004882576508306615,
+      "loss": 3.3801,
+      "step": 32050
+    },
+    {
+      "epoch": 9.350442787229085,
+      "grad_norm": 0.360665500164032,
+      "learning_rate": 0.00048808277470125327,
+      "loss": 3.3826,
+      "step": 32100
+    },
+    {
+      "epoch": 9.365008156606851,
+      "grad_norm": 0.3531738817691803,
+      "learning_rate": 0.0004879078985718449,
+      "loss": 3.379,
+      "step": 32150
+    },
+    {
+      "epoch": 9.379573525984618,
+      "grad_norm": 0.3487440347671509,
+      "learning_rate": 0.0004877330224424366,
+      "loss": 3.3753,
+      "step": 32200
+    },
+    {
+      "epoch": 9.394138895362387,
+      "grad_norm": 0.3362681269645691,
+      "learning_rate": 0.00048755814631302823,
+      "loss": 3.3867,
+      "step": 32250
+    },
+    {
+      "epoch": 9.408704264740154,
+      "grad_norm": 0.38282108306884766,
+      "learning_rate": 0.00048738327018361987,
+      "loss": 3.3906,
+      "step": 32300
+    },
+    {
+      "epoch": 9.423269634117922,
+      "grad_norm": 0.3605978786945343,
+      "learning_rate": 0.00048720839405421156,
+      "loss": 3.3758,
+      "step": 32350
+    },
+    {
+      "epoch": 9.437835003495689,
+      "grad_norm": 0.33404749631881714,
+      "learning_rate": 0.0004870335179248032,
+      "loss": 3.3921,
+      "step": 32400
+    },
+    {
+      "epoch": 9.452400372873456,
+      "grad_norm": 0.3609205186367035,
+      "learning_rate": 0.0004868586417953949,
+      "loss": 3.3738,
+      "step": 32450
+    },
+    {
+      "epoch": 9.466965742251224,
+      "grad_norm": 0.3603569567203522,
+      "learning_rate": 0.0004866837656659865,
+      "loss": 3.3808,
+      "step": 32500
+    },
+    {
+      "epoch": 9.48153111162899,
+      "grad_norm": 0.34669244289398193,
+      "learning_rate": 0.00048650888953657816,
+      "loss": 3.4071,
+      "step": 32550
+    },
+    {
+      "epoch": 9.496096481006758,
+      "grad_norm": 0.3603197932243347,
+      "learning_rate": 0.0004863340134071699,
+      "loss": 3.3825,
+      "step": 32600
+    },
+    {
+      "epoch": 9.510661850384526,
+      "grad_norm": 0.33809515833854675,
+      "learning_rate": 0.00048615913727776154,
+      "loss": 3.3967,
+      "step": 32650
+    },
+    {
+      "epoch": 9.525227219762293,
+      "grad_norm": 0.33064547181129456,
+      "learning_rate": 0.00048598426114835323,
+      "loss": 3.4032,
+      "step": 32700
+    },
+    {
+      "epoch": 9.53979258914006,
+      "grad_norm": 0.343467116355896,
+      "learning_rate": 0.00048580938501894486,
+      "loss": 3.4028,
+      "step": 32750
+    },
+    {
+      "epoch": 9.554357958517828,
+      "grad_norm": 0.3574856221675873,
+      "learning_rate": 0.00048563450888953655,
+      "loss": 3.4011,
+      "step": 32800
+    },
+    {
+      "epoch": 9.568923327895595,
+      "grad_norm": 0.373276025056839,
+      "learning_rate": 0.0004854596327601282,
+      "loss": 3.3909,
+      "step": 32850
+    },
+    {
+      "epoch": 9.583488697273363,
+      "grad_norm": 0.372328519821167,
+      "learning_rate": 0.0004852847566307198,
+      "loss": 3.399,
+      "step": 32900
+    },
+    {
+      "epoch": 9.59805406665113,
+      "grad_norm": 0.3499378561973572,
+      "learning_rate": 0.0004851098805013115,
+      "loss": 3.4031,
+      "step": 32950
+    },
+    {
+      "epoch": 9.612619436028897,
+      "grad_norm": 0.36343520879745483,
+      "learning_rate": 0.00048493500437190315,
+      "loss": 3.3999,
+      "step": 33000
+    },
+    {
+      "epoch": 9.612619436028897,
+      "eval_accuracy": 0.3681005267709079,
+      "eval_loss": 3.5709691047668457,
+      "eval_runtime": 179.7111,
+      "eval_samples_per_second": 92.621,
+      "eval_steps_per_second": 5.793,
+      "step": 33000
+    },
+    {
+      "epoch": 9.627184805406666,
+      "grad_norm": 0.33371296525001526,
+      "learning_rate": 0.0004847601282424949,
+      "loss": 3.4052,
+      "step": 33050
+    },
+    {
+      "epoch": 9.641750174784432,
+      "grad_norm": 0.3303377032279968,
+      "learning_rate": 0.00048458525211308653,
+      "loss": 3.4074,
+      "step": 33100
+    },
+    {
+      "epoch": 9.6563155441622,
+      "grad_norm": 0.35207274556159973,
+      "learning_rate": 0.00048441037598367817,
+      "loss": 3.4158,
+      "step": 33150
+    },
+    {
+      "epoch": 9.670880913539968,
+      "grad_norm": 0.365450918674469,
+      "learning_rate": 0.00048423549985426986,
+      "loss": 3.4032,
+      "step": 33200
+    },
+    {
+      "epoch": 9.685446282917734,
+      "grad_norm": 0.33346548676490784,
+      "learning_rate": 0.0004840606237248615,
+      "loss": 3.4133,
+      "step": 33250
+    },
+    {
+      "epoch": 9.700011652295503,
+      "grad_norm": 0.3496672511100769,
+      "learning_rate": 0.0004838857475954532,
+      "loss": 3.4026,
+      "step": 33300
+    },
+    {
+      "epoch": 9.71457702167327,
+      "grad_norm": 0.3341895341873169,
+      "learning_rate": 0.0004837108714660448,
+      "loss": 3.4104,
+      "step": 33350
+    },
+    {
+      "epoch": 9.729142391051036,
+      "grad_norm": 0.35880976915359497,
+      "learning_rate": 0.0004835359953366365,
+      "loss": 3.419,
+      "step": 33400
+    },
+    {
+      "epoch": 9.743707760428805,
+      "grad_norm": 0.36180543899536133,
+      "learning_rate": 0.00048336111920722815,
+      "loss": 3.3987,
+      "step": 33450
+    },
+    {
+      "epoch": 9.758273129806572,
+      "grad_norm": 0.33391448855400085,
+      "learning_rate": 0.0004831862430778198,
+      "loss": 3.4069,
+      "step": 33500
+    },
+    {
+      "epoch": 9.772838499184338,
+      "grad_norm": 0.335786908864975,
+      "learning_rate": 0.00048301136694841153,
+      "loss": 3.4064,
+      "step": 33550
+    },
+    {
+      "epoch": 9.787403868562107,
+      "grad_norm": 0.3425864279270172,
+      "learning_rate": 0.00048283649081900317,
+      "loss": 3.4015,
+      "step": 33600
+    },
+    {
+      "epoch": 9.801969237939874,
+      "grad_norm": 0.3565329313278198,
+      "learning_rate": 0.00048266161468959486,
+      "loss": 3.4078,
+      "step": 33650
+    },
+    {
+      "epoch": 9.816534607317642,
+      "grad_norm": 0.3550211489200592,
+      "learning_rate": 0.0004824867385601865,
+      "loss": 3.3977,
+      "step": 33700
+    },
+    {
+      "epoch": 9.831099976695409,
+      "grad_norm": 0.33708900213241577,
+      "learning_rate": 0.00048231186243077813,
+      "loss": 3.4104,
+      "step": 33750
+    },
+    {
+      "epoch": 9.845665346073176,
+      "grad_norm": 0.3294453024864197,
+      "learning_rate": 0.0004821369863013698,
+      "loss": 3.4108,
+      "step": 33800
+    },
+    {
+      "epoch": 9.860230715450944,
+      "grad_norm": 0.373319536447525,
+      "learning_rate": 0.00048196211017196146,
+      "loss": 3.4074,
+      "step": 33850
+    },
+    {
+      "epoch": 9.874796084828711,
+      "grad_norm": 0.34141167998313904,
+      "learning_rate": 0.00048178723404255315,
+      "loss": 3.4147,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88936145420648,
+      "grad_norm": 0.35247910022735596,
+      "learning_rate": 0.0004816123579131448,
+      "loss": 3.4067,
+      "step": 33950
+    },
+    {
+      "epoch": 9.903926823584246,
+      "grad_norm": 0.3380388021469116,
+      "learning_rate": 0.0004814374817837364,
+      "loss": 3.4035,
+      "step": 34000
+    },
+    {
+      "epoch": 9.903926823584246,
+      "eval_accuracy": 0.3680470325709136,
+      "eval_loss": 3.5645980834960938,
+      "eval_runtime": 179.6385,
+      "eval_samples_per_second": 92.658,
+      "eval_steps_per_second": 5.795,
+      "step": 34000
+    },
+    {
+      "epoch": 9.918492192962013,
+      "grad_norm": 0.33066990971565247,
+      "learning_rate": 0.00048126260565432816,
+      "loss": 3.4149,
+      "step": 34050
+    },
+    {
+      "epoch": 9.933057562339782,
+      "grad_norm": 0.36766499280929565,
+      "learning_rate": 0.0004810877295249198,
+      "loss": 3.4174,
+      "step": 34100
+    },
+    {
+      "epoch": 9.947622931717548,
+      "grad_norm": 0.32511964440345764,
+      "learning_rate": 0.0004809128533955115,
+      "loss": 3.4018,
+      "step": 34150
+    },
+    {
+      "epoch": 9.962188301095315,
+      "grad_norm": 0.3310585618019104,
+      "learning_rate": 0.0004807379772661031,
+      "loss": 3.4235,
+      "step": 34200
+    },
+    {
+      "epoch": 9.976753670473084,
+      "grad_norm": 0.34995537996292114,
+      "learning_rate": 0.0004805631011366948,
+      "loss": 3.405,
+      "step": 34250
+    },
+    {
+      "epoch": 9.99131903985085,
+      "grad_norm": 0.3458772301673889,
+      "learning_rate": 0.00048038822500728645,
+      "loss": 3.4196,
+      "step": 34300
+    },
+    {
+      "epoch": 10.005826147751106,
+      "grad_norm": 0.3567534387111664,
+      "learning_rate": 0.0004802133488778781,
+      "loss": 3.3754,
+      "step": 34350
+    },
+    {
+      "epoch": 10.020391517128875,
+      "grad_norm": 0.34907981753349304,
+      "learning_rate": 0.0004800384727484698,
+      "loss": 3.3042,
+      "step": 34400
+    },
+    {
+      "epoch": 10.034956886506642,
+      "grad_norm": 0.36751100420951843,
+      "learning_rate": 0.0004798635966190614,
+      "loss": 3.3046,
+      "step": 34450
+    },
+    {
+      "epoch": 10.049522255884408,
+      "grad_norm": 0.3411542773246765,
+      "learning_rate": 0.00047968872048965316,
+      "loss": 3.3093,
+      "step": 34500
+    },
+    {
+      "epoch": 10.064087625262177,
+      "grad_norm": 0.35095176100730896,
+      "learning_rate": 0.0004795138443602448,
+      "loss": 3.3179,
+      "step": 34550
+    },
+    {
+      "epoch": 10.078652994639944,
+      "grad_norm": 0.34343641996383667,
+      "learning_rate": 0.00047933896823083643,
+      "loss": 3.3087,
+      "step": 34600
+    },
+    {
+      "epoch": 10.093218364017712,
+      "grad_norm": 0.39370444416999817,
+      "learning_rate": 0.0004791640921014281,
+      "loss": 3.3124,
+      "step": 34650
+    },
+    {
+      "epoch": 10.107783733395479,
+      "grad_norm": 0.36577534675598145,
+      "learning_rate": 0.00047898921597201976,
+      "loss": 3.3235,
+      "step": 34700
+    },
+    {
+      "epoch": 10.122349102773246,
+      "grad_norm": 0.38264644145965576,
+      "learning_rate": 0.00047881433984261145,
+      "loss": 3.3297,
+      "step": 34750
+    },
+    {
+      "epoch": 10.136914472151014,
+      "grad_norm": 0.3537810742855072,
+      "learning_rate": 0.0004786394637132031,
+      "loss": 3.3233,
+      "step": 34800
+    },
+    {
+      "epoch": 10.151479841528781,
+      "grad_norm": 0.36147990822792053,
+      "learning_rate": 0.0004784645875837948,
+      "loss": 3.3265,
+      "step": 34850
+    },
+    {
+      "epoch": 10.166045210906548,
+      "grad_norm": 0.36201244592666626,
+      "learning_rate": 0.0004782897114543864,
+      "loss": 3.328,
+      "step": 34900
+    },
+    {
+      "epoch": 10.180610580284316,
+      "grad_norm": 0.3463992178440094,
+      "learning_rate": 0.00047811483532497805,
+      "loss": 3.3514,
+      "step": 34950
+    },
+    {
+      "epoch": 10.195175949662083,
+      "grad_norm": 0.3376941382884979,
+      "learning_rate": 0.0004779399591955698,
+      "loss": 3.3507,
+      "step": 35000
+    },
+    {
+      "epoch": 10.195175949662083,
+      "eval_accuracy": 0.3683125048864894,
+      "eval_loss": 3.574247121810913,
+      "eval_runtime": 179.5052,
+      "eval_samples_per_second": 92.727,
+      "eval_steps_per_second": 5.799,
+      "step": 35000
+    },
+    {
+      "epoch": 10.209741319039852,
+      "grad_norm": 0.3373648226261139,
+      "learning_rate": 0.00047776508306616143,
+      "loss": 3.339,
+      "step": 35050
+    },
+    {
+      "epoch": 10.224306688417618,
+      "grad_norm": 0.34261074662208557,
+      "learning_rate": 0.0004775902069367531,
+      "loss": 3.344,
+      "step": 35100
+    },
+    {
+      "epoch": 10.238872057795385,
+      "grad_norm": 0.3796720802783966,
+      "learning_rate": 0.00047741533080734476,
+      "loss": 3.3424,
+      "step": 35150
+    },
+    {
+      "epoch": 10.253437427173154,
+      "grad_norm": 0.36016082763671875,
+      "learning_rate": 0.0004772404546779364,
+      "loss": 3.3428,
+      "step": 35200
+    },
+    {
+      "epoch": 10.26800279655092,
+      "grad_norm": 0.350201815366745,
+      "learning_rate": 0.0004770655785485281,
+      "loss": 3.3538,
+      "step": 35250
+    },
+    {
+      "epoch": 10.282568165928687,
+      "grad_norm": 0.3484781086444855,
+      "learning_rate": 0.0004768907024191197,
+      "loss": 3.3387,
+      "step": 35300
+    },
+    {
+      "epoch": 10.297133535306456,
+      "grad_norm": 0.33689433336257935,
+      "learning_rate": 0.0004767158262897114,
+      "loss": 3.3516,
+      "step": 35350
+    },
+    {
+      "epoch": 10.311698904684222,
+      "grad_norm": 0.3487689793109894,
+      "learning_rate": 0.00047654095016030305,
+      "loss": 3.346,
+      "step": 35400
+    },
+    {
+      "epoch": 10.326264274061991,
+      "grad_norm": 0.34821704030036926,
+      "learning_rate": 0.0004763660740308948,
+      "loss": 3.3566,
+      "step": 35450
+    },
+    {
+      "epoch": 10.340829643439758,
+      "grad_norm": 0.3616722524166107,
+      "learning_rate": 0.0004761911979014864,
+      "loss": 3.3588,
+      "step": 35500
+    },
+    {
+      "epoch": 10.355395012817525,
+      "grad_norm": 0.390299528837204,
+      "learning_rate": 0.00047601632177207806,
+      "loss": 3.3664,
+      "step": 35550
+    },
+    {
+      "epoch": 10.369960382195293,
+      "grad_norm": 0.35546815395355225,
+      "learning_rate": 0.00047584144564266975,
+      "loss": 3.3594,
+      "step": 35600
+    },
+    {
+      "epoch": 10.38452575157306,
+      "grad_norm": 0.35586267709732056,
+      "learning_rate": 0.0004756665695132614,
+      "loss": 3.3715,
+      "step": 35650
+    },
+    {
+      "epoch": 10.399091120950827,
+      "grad_norm": 0.3343265950679779,
+      "learning_rate": 0.0004754916933838531,
+      "loss": 3.3525,
+      "step": 35700
+    },
+    {
+      "epoch": 10.413656490328595,
+      "grad_norm": 0.350169837474823,
+      "learning_rate": 0.0004753168172544447,
+      "loss": 3.3714,
+      "step": 35750
+    },
+    {
+      "epoch": 10.428221859706362,
+      "grad_norm": 0.347483366727829,
+      "learning_rate": 0.00047514194112503635,
+      "loss": 3.3678,
+      "step": 35800
+    },
+    {
+      "epoch": 10.44278722908413,
+      "grad_norm": 0.3664875328540802,
+      "learning_rate": 0.00047496706499562804,
+      "loss": 3.3738,
+      "step": 35850
+    },
+    {
+      "epoch": 10.457352598461897,
+      "grad_norm": 0.35261833667755127,
+      "learning_rate": 0.0004747921888662197,
+      "loss": 3.374,
+      "step": 35900
+    },
+    {
+      "epoch": 10.471917967839664,
+      "grad_norm": 0.3594660758972168,
+      "learning_rate": 0.0004746173127368114,
+      "loss": 3.3644,
+      "step": 35950
+    },
+    {
+      "epoch": 10.486483337217432,
+      "grad_norm": 0.3424801528453827,
+      "learning_rate": 0.00047444243660740306,
+      "loss": 3.3689,
+      "step": 36000
+    },
+    {
+      "epoch": 10.486483337217432,
+      "eval_accuracy": 0.36841914057746694,
+      "eval_loss": 3.5697360038757324,
+      "eval_runtime": 179.5198,
+      "eval_samples_per_second": 92.72,
+      "eval_steps_per_second": 5.799,
+      "step": 36000
+    },
+    {
+      "epoch": 10.5010487065952,
+      "grad_norm": 0.3455258905887604,
+      "learning_rate": 0.0004742675604779947,
+      "loss": 3.3751,
+      "step": 36050
+    },
+    {
+      "epoch": 10.515614075972966,
+      "grad_norm": 0.3722718358039856,
+      "learning_rate": 0.0004740926843485864,
+      "loss": 3.3761,
+      "step": 36100
+    },
+    {
+      "epoch": 10.530179445350734,
+      "grad_norm": 0.3568621575832367,
+      "learning_rate": 0.000473917808219178,
+      "loss": 3.3765,
+      "step": 36150
+    },
+    {
+      "epoch": 10.544744814728501,
+      "grad_norm": 0.3658199906349182,
+      "learning_rate": 0.0004737429320897697,
+      "loss": 3.3724,
+      "step": 36200
+    },
+    {
+      "epoch": 10.55931018410627,
+      "grad_norm": 0.34567561745643616,
+      "learning_rate": 0.00047356805596036135,
+      "loss": 3.3713,
+      "step": 36250
+    },
+    {
+      "epoch": 10.573875553484037,
+      "grad_norm": 0.3556523621082306,
+      "learning_rate": 0.00047339317983095304,
+      "loss": 3.3746,
+      "step": 36300
+    },
+    {
+      "epoch": 10.588440922861803,
+      "grad_norm": 0.3559434115886688,
+      "learning_rate": 0.0004732183037015447,
+      "loss": 3.3639,
+      "step": 36350
+    },
+    {
+      "epoch": 10.603006292239572,
+      "grad_norm": 0.36187922954559326,
+      "learning_rate": 0.0004730434275721363,
+      "loss": 3.3817,
+      "step": 36400
+    },
+    {
+      "epoch": 10.617571661617339,
+      "grad_norm": 0.3516775369644165,
+      "learning_rate": 0.00047286855144272806,
+      "loss": 3.3719,
+      "step": 36450
+    },
+    {
+      "epoch": 10.632137030995105,
+      "grad_norm": 0.34939226508140564,
+      "learning_rate": 0.0004726936753133197,
+      "loss": 3.3685,
+      "step": 36500
+    },
+    {
+      "epoch": 10.646702400372874,
+      "grad_norm": 0.35714349150657654,
+      "learning_rate": 0.0004725187991839114,
+      "loss": 3.3735,
+      "step": 36550
+    },
+    {
+      "epoch": 10.66126776975064,
+      "grad_norm": 0.38368478417396545,
+      "learning_rate": 0.000472343923054503,
+      "loss": 3.3903,
+      "step": 36600
+    },
+    {
+      "epoch": 10.675833139128407,
+      "grad_norm": 0.3668145537376404,
+      "learning_rate": 0.00047216904692509465,
+      "loss": 3.389,
+      "step": 36650
+    },
+    {
+      "epoch": 10.690398508506176,
+      "grad_norm": 0.3444463908672333,
+      "learning_rate": 0.00047199417079568634,
+      "loss": 3.3861,
+      "step": 36700
+    },
+    {
+      "epoch": 10.704963877883943,
+      "grad_norm": 0.36496496200561523,
+      "learning_rate": 0.000471819294666278,
+      "loss": 3.3841,
+      "step": 36750
+    },
+    {
+      "epoch": 10.719529247261711,
+      "grad_norm": 0.37470269203186035,
+      "learning_rate": 0.00047164441853686967,
+      "loss": 3.384,
+      "step": 36800
+    },
+    {
+      "epoch": 10.734094616639478,
+      "grad_norm": 0.379410982131958,
+      "learning_rate": 0.0004714695424074613,
+      "loss": 3.376,
+      "step": 36850
+    },
+    {
+      "epoch": 10.748659986017245,
+      "grad_norm": 0.34845808148384094,
+      "learning_rate": 0.00047129466627805305,
+      "loss": 3.386,
+      "step": 36900
+    },
+    {
+      "epoch": 10.763225355395013,
+      "grad_norm": 0.33667710423469543,
+      "learning_rate": 0.0004711197901486447,
+      "loss": 3.3802,
+      "step": 36950
+    },
+    {
+      "epoch": 10.77779072477278,
+      "grad_norm": 0.33371374011039734,
+      "learning_rate": 0.0004709449140192363,
+      "loss": 3.379,
+      "step": 37000
+    },
+    {
+      "epoch": 10.77779072477278,
+      "eval_accuracy": 0.36916417957826586,
+      "eval_loss": 3.5598151683807373,
+      "eval_runtime": 179.634,
+      "eval_samples_per_second": 92.661,
+      "eval_steps_per_second": 5.795,
+      "step": 37000
+    },
+    {
+      "epoch": 10.792356094150549,
+      "grad_norm": 0.3444075584411621,
+      "learning_rate": 0.000470770037889828,
+      "loss": 3.3856,
+      "step": 37050
+    },
+    {
+      "epoch": 10.806921463528315,
+      "grad_norm": 0.34622007608413696,
+      "learning_rate": 0.00047059516176041965,
+      "loss": 3.3919,
+      "step": 37100
+    },
+    {
+      "epoch": 10.821486832906082,
+      "grad_norm": 0.39845308661460876,
+      "learning_rate": 0.00047042028563101134,
+      "loss": 3.3956,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83605220228385,
+      "grad_norm": 0.36669787764549255,
+      "learning_rate": 0.000470245409501603,
+      "loss": 3.3887,
+      "step": 37200
+    },
+    {
+      "epoch": 10.850617571661617,
+      "grad_norm": 0.34656545519828796,
+      "learning_rate": 0.0004700705333721946,
+      "loss": 3.4005,
+      "step": 37250
+    },
+    {
+      "epoch": 10.865182941039384,
+      "grad_norm": 0.35354891419410706,
+      "learning_rate": 0.0004698956572427863,
+      "loss": 3.3945,
+      "step": 37300
+    },
+    {
+      "epoch": 10.879748310417153,
+      "grad_norm": 0.33106672763824463,
+      "learning_rate": 0.00046972078111337794,
+      "loss": 3.3908,
+      "step": 37350
+    },
+    {
+      "epoch": 10.89431367979492,
+      "grad_norm": 0.3322337567806244,
+      "learning_rate": 0.0004695459049839697,
+      "loss": 3.3792,
+      "step": 37400
+    },
+    {
+      "epoch": 10.908879049172686,
+      "grad_norm": 0.35176587104797363,
+      "learning_rate": 0.0004693710288545613,
+      "loss": 3.3877,
+      "step": 37450
+    },
+    {
+      "epoch": 10.923444418550455,
+      "grad_norm": 0.34353724122047424,
+      "learning_rate": 0.000469196152725153,
+      "loss": 3.3928,
+      "step": 37500
+    },
+    {
+      "epoch": 10.938009787928221,
+      "grad_norm": 0.36018845438957214,
+      "learning_rate": 0.00046902127659574465,
+      "loss": 3.3907,
+      "step": 37550
+    },
+    {
+      "epoch": 10.95257515730599,
+      "grad_norm": 0.3528198003768921,
+      "learning_rate": 0.0004688464004663363,
+      "loss": 3.3962,
+      "step": 37600
+    },
+    {
+      "epoch": 10.967140526683757,
+      "grad_norm": 0.34510186314582825,
+      "learning_rate": 0.000468671524336928,
+      "loss": 3.3842,
+      "step": 37650
+    },
+    {
+      "epoch": 10.981705896061523,
+      "grad_norm": 0.3542734384536743,
+      "learning_rate": 0.0004684966482075196,
+      "loss": 3.4001,
+      "step": 37700
+    },
+    {
+      "epoch": 10.996271265439292,
+      "grad_norm": 0.3544139266014099,
+      "learning_rate": 0.0004683217720781113,
+      "loss": 3.3989,
+      "step": 37750
+    },
+    {
+      "epoch": 11.010778373339548,
+      "grad_norm": 0.35023751854896545,
+      "learning_rate": 0.00046814689594870294,
+      "loss": 3.3096,
+      "step": 37800
+    },
+    {
+      "epoch": 11.025343742717315,
+      "grad_norm": 0.36776822805404663,
+      "learning_rate": 0.0004679720198192946,
+      "loss": 3.2977,
+      "step": 37850
+    },
+    {
+      "epoch": 11.039909112095083,
+      "grad_norm": 0.3768557608127594,
+      "learning_rate": 0.0004677971436898863,
+      "loss": 3.2838,
+      "step": 37900
+    },
+    {
+      "epoch": 11.05447448147285,
+      "grad_norm": 0.34867051243782043,
+      "learning_rate": 0.00046762226756047795,
+      "loss": 3.2998,
+      "step": 37950
+    },
+    {
+      "epoch": 11.069039850850617,
+      "grad_norm": 0.3468160331249237,
+      "learning_rate": 0.00046744739143106964,
+      "loss": 3.2905,
+      "step": 38000
+    },
+    {
+      "epoch": 11.069039850850617,
+      "eval_accuracy": 0.368822286976984,
+      "eval_loss": 3.57218337059021,
+      "eval_runtime": 179.7316,
+      "eval_samples_per_second": 92.61,
+      "eval_steps_per_second": 5.792,
+      "step": 38000
+    },
+    {
+      "epoch": 11.083605220228385,
+      "grad_norm": 0.35249587893486023,
+      "learning_rate": 0.0004672725153016613,
+      "loss": 3.3018,
+      "step": 38050
+    },
+    {
+      "epoch": 11.098170589606152,
+      "grad_norm": 0.3808625340461731,
+      "learning_rate": 0.00046709763917225297,
+      "loss": 3.297,
+      "step": 38100
+    },
+    {
+      "epoch": 11.11273595898392,
+      "grad_norm": 0.3879368007183075,
+      "learning_rate": 0.0004669227630428446,
+      "loss": 3.3146,
+      "step": 38150
+    },
+    {
+      "epoch": 11.127301328361687,
+      "grad_norm": 0.34145820140838623,
+      "learning_rate": 0.00046674788691343624,
+      "loss": 3.3006,
+      "step": 38200
+    },
+    {
+      "epoch": 11.141866697739454,
+      "grad_norm": 0.3693101108074188,
+      "learning_rate": 0.00046657301078402793,
+      "loss": 3.3096,
+      "step": 38250
+    },
+    {
+      "epoch": 11.156432067117223,
+      "grad_norm": 0.3697426915168762,
+      "learning_rate": 0.00046639813465461957,
+      "loss": 3.3111,
+      "step": 38300
+    },
+    {
+      "epoch": 11.17099743649499,
+      "grad_norm": 0.37008944153785706,
+      "learning_rate": 0.0004662232585252113,
+      "loss": 3.3214,
+      "step": 38350
+    },
+    {
+      "epoch": 11.185562805872756,
+      "grad_norm": 0.3367568254470825,
+      "learning_rate": 0.00046604838239580295,
+      "loss": 3.3141,
+      "step": 38400
+    },
+    {
+      "epoch": 11.200128175250525,
+      "grad_norm": 0.36272698640823364,
+      "learning_rate": 0.0004658735062663946,
+      "loss": 3.3173,
+      "step": 38450
+    },
+    {
+      "epoch": 11.214693544628291,
+      "grad_norm": 0.35574871301651,
+      "learning_rate": 0.0004656986301369863,
+      "loss": 3.3162,
+      "step": 38500
+    },
+    {
+      "epoch": 11.22925891400606,
+      "grad_norm": 0.3292505443096161,
+      "learning_rate": 0.0004655237540075779,
+      "loss": 3.3112,
+      "step": 38550
+    },
+    {
+      "epoch": 11.243824283383827,
+      "grad_norm": 0.36970898509025574,
+      "learning_rate": 0.0004653488778781696,
+      "loss": 3.3333,
+      "step": 38600
+    },
+    {
+      "epoch": 11.258389652761593,
+      "grad_norm": 0.3610363304615021,
+      "learning_rate": 0.00046517400174876124,
+      "loss": 3.3249,
+      "step": 38650
+    },
+    {
+      "epoch": 11.272955022139362,
+      "grad_norm": 0.38531872630119324,
+      "learning_rate": 0.0004649991256193529,
+      "loss": 3.3284,
+      "step": 38700
+    },
+    {
+      "epoch": 11.287520391517129,
+      "grad_norm": 0.37316545844078064,
+      "learning_rate": 0.00046482424948994457,
+      "loss": 3.3431,
+      "step": 38750
+    },
+    {
+      "epoch": 11.302085760894895,
+      "grad_norm": 0.4000358581542969,
+      "learning_rate": 0.0004646493733605362,
+      "loss": 3.3247,
+      "step": 38800
+    },
+    {
+      "epoch": 11.316651130272664,
+      "grad_norm": 0.3880660831928253,
+      "learning_rate": 0.00046447449723112795,
+      "loss": 3.337,
+      "step": 38850
+    },
+    {
+      "epoch": 11.33121649965043,
+      "grad_norm": 0.38193467259407043,
+      "learning_rate": 0.0004642996211017196,
+      "loss": 3.3412,
+      "step": 38900
+    },
+    {
+      "epoch": 11.3457818690282,
+      "grad_norm": 0.3892691433429718,
+      "learning_rate": 0.0004641247449723113,
+      "loss": 3.3442,
+      "step": 38950
+    },
+    {
+      "epoch": 11.360347238405966,
+      "grad_norm": 0.3644644618034363,
+      "learning_rate": 0.0004639498688429029,
+      "loss": 3.3332,
+      "step": 39000
+    },
+    {
+      "epoch": 11.360347238405966,
+      "eval_accuracy": 0.36906248181344165,
+      "eval_loss": 3.567659616470337,
+      "eval_runtime": 179.7805,
+      "eval_samples_per_second": 92.585,
+      "eval_steps_per_second": 5.79,
+      "step": 39000
+    },
+    {
+      "epoch": 11.374912607783733,
+      "grad_norm": 0.38239893317222595,
+      "learning_rate": 0.00046377499271349455,
+      "loss": 3.334,
+      "step": 39050
+    },
+    {
+      "epoch": 11.389477977161501,
+      "grad_norm": 0.34430813789367676,
+      "learning_rate": 0.00046360011658408624,
+      "loss": 3.3331,
+      "step": 39100
+    },
+    {
+      "epoch": 11.404043346539268,
+      "grad_norm": 0.3366773724555969,
+      "learning_rate": 0.00046342524045467787,
+      "loss": 3.3524,
+      "step": 39150
+    },
+    {
+      "epoch": 11.418608715917035,
+      "grad_norm": 0.3577640950679779,
+      "learning_rate": 0.00046325036432526956,
+      "loss": 3.3418,
+      "step": 39200
+    },
+    {
+      "epoch": 11.433174085294803,
+      "grad_norm": 0.3685474395751953,
+      "learning_rate": 0.0004630754881958612,
+      "loss": 3.348,
+      "step": 39250
+    },
+    {
+      "epoch": 11.44773945467257,
+      "grad_norm": 0.36754393577575684,
+      "learning_rate": 0.00046290061206645284,
+      "loss": 3.3386,
+      "step": 39300
+    },
+    {
+      "epoch": 11.462304824050339,
+      "grad_norm": 0.36672037839889526,
+      "learning_rate": 0.0004627257359370446,
+      "loss": 3.3531,
+      "step": 39350
+    },
+    {
+      "epoch": 11.476870193428105,
+      "grad_norm": 0.3429676592350006,
+      "learning_rate": 0.0004625508598076362,
+      "loss": 3.3398,
+      "step": 39400
+    },
+    {
+      "epoch": 11.491435562805872,
+      "grad_norm": 0.40767839550971985,
+      "learning_rate": 0.0004623759836782279,
+      "loss": 3.3367,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50600093218364,
+      "grad_norm": 0.3494288921356201,
+      "learning_rate": 0.00046220110754881954,
+      "loss": 3.3617,
+      "step": 39500
+    },
+    {
+      "epoch": 11.520566301561407,
+      "grad_norm": 0.35878896713256836,
+      "learning_rate": 0.00046202623141941123,
+      "loss": 3.354,
+      "step": 39550
+    },
+    {
+      "epoch": 11.535131670939174,
+      "grad_norm": 0.3389423191547394,
+      "learning_rate": 0.00046185135529000287,
+      "loss": 3.3341,
+      "step": 39600
+    },
+    {
+      "epoch": 11.549697040316943,
+      "grad_norm": 0.35176849365234375,
+      "learning_rate": 0.0004616764791605945,
+      "loss": 3.3581,
+      "step": 39650
+    },
+    {
+      "epoch": 11.56426240969471,
+      "grad_norm": 0.33921658992767334,
+      "learning_rate": 0.0004615016030311862,
+      "loss": 3.3701,
+      "step": 39700
+    },
+    {
+      "epoch": 11.578827779072478,
+      "grad_norm": 0.35132601857185364,
+      "learning_rate": 0.00046132672690177783,
+      "loss": 3.3563,
+      "step": 39750
+    },
+    {
+      "epoch": 11.593393148450245,
+      "grad_norm": 0.34590718150138855,
+      "learning_rate": 0.0004611518507723696,
+      "loss": 3.3562,
+      "step": 39800
+    },
+    {
+      "epoch": 11.607958517828012,
+      "grad_norm": 0.3639914393424988,
+      "learning_rate": 0.0004609769746429612,
+      "loss": 3.36,
+      "step": 39850
+    },
+    {
+      "epoch": 11.62252388720578,
+      "grad_norm": 0.3800894021987915,
+      "learning_rate": 0.00046080209851355285,
+      "loss": 3.3671,
+      "step": 39900
+    },
+    {
+      "epoch": 11.637089256583547,
+      "grad_norm": 0.3536291718482971,
+      "learning_rate": 0.00046062722238414454,
+      "loss": 3.3553,
+      "step": 39950
+    },
+    {
+      "epoch": 11.651654625961314,
+      "grad_norm": 0.35299673676490784,
+      "learning_rate": 0.0004604523462547362,
+      "loss": 3.3602,
+      "step": 40000
+    },
+    {
+      "epoch": 11.651654625961314,
+      "eval_accuracy": 0.3690838794934393,
+      "eval_loss": 3.560605764389038,
+      "eval_runtime": 179.7311,
+      "eval_samples_per_second": 92.611,
+      "eval_steps_per_second": 5.792,
+      "step": 40000
+    },
+    {
+      "epoch": 11.666219995339082,
+      "grad_norm": 0.37060144543647766,
+      "learning_rate": 0.00046027747012532787,
+      "loss": 3.3573,
+      "step": 40050
+    },
+    {
+      "epoch": 11.680785364716849,
+      "grad_norm": 0.4053195118904114,
+      "learning_rate": 0.0004601025939959195,
+      "loss": 3.3733,
+      "step": 40100
+    },
+    {
+      "epoch": 11.695350734094617,
+      "grad_norm": 0.3602805733680725,
+      "learning_rate": 0.0004599277178665112,
+      "loss": 3.3565,
+      "step": 40150
+    },
+    {
+      "epoch": 11.709916103472384,
+      "grad_norm": 0.3374777138233185,
+      "learning_rate": 0.00045975284173710283,
+      "loss": 3.357,
+      "step": 40200
+    },
+    {
+      "epoch": 11.724481472850151,
+      "grad_norm": 0.369104266166687,
+      "learning_rate": 0.00045957796560769446,
+      "loss": 3.371,
+      "step": 40250
+    },
+    {
+      "epoch": 11.73904684222792,
+      "grad_norm": 0.3783121407032013,
+      "learning_rate": 0.0004594030894782862,
+      "loss": 3.3587,
+      "step": 40300
+    },
+    {
+      "epoch": 11.753612211605686,
+      "grad_norm": 0.3741343021392822,
+      "learning_rate": 0.00045922821334887785,
+      "loss": 3.3759,
+      "step": 40350
+    },
+    {
+      "epoch": 11.768177580983453,
+      "grad_norm": 0.40091201663017273,
+      "learning_rate": 0.00045905333721946954,
+      "loss": 3.363,
+      "step": 40400
+    },
+    {
+      "epoch": 11.782742950361222,
+      "grad_norm": 0.3418520390987396,
+      "learning_rate": 0.00045887846109006117,
+      "loss": 3.3579,
+      "step": 40450
+    },
+    {
+      "epoch": 11.797308319738988,
+      "grad_norm": 0.328216016292572,
+      "learning_rate": 0.0004587035849606528,
+      "loss": 3.368,
+      "step": 40500
+    },
+    {
+      "epoch": 11.811873689116755,
+      "grad_norm": 0.3601894676685333,
+      "learning_rate": 0.0004585287088312445,
+      "loss": 3.3796,
+      "step": 40550
+    },
+    {
+      "epoch": 11.826439058494524,
+      "grad_norm": 0.3456202447414398,
+      "learning_rate": 0.00045835383270183613,
+      "loss": 3.3721,
+      "step": 40600
+    },
+    {
+      "epoch": 11.84100442787229,
+      "grad_norm": 0.3935994803905487,
+      "learning_rate": 0.0004581789565724278,
+      "loss": 3.3641,
+      "step": 40650
+    },
+    {
+      "epoch": 11.855569797250059,
+      "grad_norm": 0.38910239934921265,
+      "learning_rate": 0.00045800408044301946,
+      "loss": 3.3686,
+      "step": 40700
+    },
+    {
+      "epoch": 11.870135166627826,
+      "grad_norm": 0.37248024344444275,
+      "learning_rate": 0.0004578292043136111,
+      "loss": 3.3777,
+      "step": 40750
+    },
+    {
+      "epoch": 11.884700536005592,
+      "grad_norm": 0.36761152744293213,
+      "learning_rate": 0.00045765432818420284,
+      "loss": 3.359,
+      "step": 40800
+    },
+    {
+      "epoch": 11.899265905383361,
+      "grad_norm": 0.36878830194473267,
+      "learning_rate": 0.0004574794520547945,
+      "loss": 3.374,
+      "step": 40850
+    },
+    {
+      "epoch": 11.913831274761128,
+      "grad_norm": 0.36202317476272583,
+      "learning_rate": 0.00045730457592538617,
+      "loss": 3.3762,
+      "step": 40900
+    },
+    {
+      "epoch": 11.928396644138896,
+      "grad_norm": 0.37495362758636475,
+      "learning_rate": 0.0004571296997959778,
+      "loss": 3.3604,
+      "step": 40950
+    },
+    {
+      "epoch": 11.942962013516663,
+      "grad_norm": 0.3607545495033264,
+      "learning_rate": 0.0004569548236665695,
+      "loss": 3.3818,
+      "step": 41000
+    },
+    {
+      "epoch": 11.942962013516663,
+      "eval_accuracy": 0.37025463827045607,
+      "eval_loss": 3.5513434410095215,
+      "eval_runtime": 179.6988,
+      "eval_samples_per_second": 92.627,
+      "eval_steps_per_second": 5.793,
+      "step": 41000
+    },
+    {
+      "epoch": 11.95752738289443,
+      "grad_norm": 0.3468360900878906,
+      "learning_rate": 0.00045677994753716113,
+      "loss": 3.3849,
+      "step": 41050
+    },
+    {
+      "epoch": 11.972092752272198,
+      "grad_norm": 0.3485865294933319,
+      "learning_rate": 0.00045660507140775277,
+      "loss": 3.3785,
+      "step": 41100
+    },
+    {
+      "epoch": 11.986658121649965,
+      "grad_norm": 0.34709101915359497,
+      "learning_rate": 0.00045643019527834446,
+      "loss": 3.3663,
+      "step": 41150
+    },
+    {
+      "epoch": 12.001165229550221,
+      "grad_norm": 0.38278666138648987,
+      "learning_rate": 0.0004562553191489361,
+      "loss": 3.3766,
+      "step": 41200
+    },
+    {
+      "epoch": 12.01573059892799,
+      "grad_norm": 0.34984317421913147,
+      "learning_rate": 0.00045608044301952784,
+      "loss": 3.2617,
+      "step": 41250
+    },
+    {
+      "epoch": 12.030295968305756,
+      "grad_norm": 0.329671174287796,
+      "learning_rate": 0.0004559055668901195,
+      "loss": 3.265,
+      "step": 41300
+    },
+    {
+      "epoch": 12.044861337683523,
+      "grad_norm": 0.377273291349411,
+      "learning_rate": 0.0004557306907607111,
+      "loss": 3.274,
+      "step": 41350
+    },
+    {
+      "epoch": 12.059426707061291,
+      "grad_norm": 0.353500097990036,
+      "learning_rate": 0.0004555558146313028,
+      "loss": 3.2773,
+      "step": 41400
+    },
+    {
+      "epoch": 12.073992076439058,
+      "grad_norm": 0.33617493510246277,
+      "learning_rate": 0.00045538093850189444,
+      "loss": 3.2767,
+      "step": 41450
+    },
+    {
+      "epoch": 12.088557445816827,
+      "grad_norm": 0.37292763590812683,
+      "learning_rate": 0.00045520606237248613,
+      "loss": 3.2795,
+      "step": 41500
+    },
+    {
+      "epoch": 12.103122815194594,
+      "grad_norm": 0.3755471706390381,
+      "learning_rate": 0.00045503118624307776,
+      "loss": 3.2793,
+      "step": 41550
+    },
+    {
+      "epoch": 12.11768818457236,
+      "grad_norm": 0.4093469977378845,
+      "learning_rate": 0.00045485631011366945,
+      "loss": 3.2801,
+      "step": 41600
+    },
+    {
+      "epoch": 12.132253553950129,
+      "grad_norm": 0.3457607924938202,
+      "learning_rate": 0.0004546814339842611,
+      "loss": 3.2882,
+      "step": 41650
+    },
+    {
+      "epoch": 12.146818923327896,
+      "grad_norm": 0.37875697016716003,
+      "learning_rate": 0.0004545065578548527,
+      "loss": 3.2868,
+      "step": 41700
+    },
+    {
+      "epoch": 12.161384292705662,
+      "grad_norm": 0.3649895489215851,
+      "learning_rate": 0.00045433168172544447,
+      "loss": 3.2959,
+      "step": 41750
+    },
+    {
+      "epoch": 12.17594966208343,
+      "grad_norm": 0.391001433134079,
+      "learning_rate": 0.0004541568055960361,
+      "loss": 3.3045,
+      "step": 41800
+    },
+    {
+      "epoch": 12.190515031461198,
+      "grad_norm": 0.3820909857749939,
+      "learning_rate": 0.0004539819294666278,
+      "loss": 3.3038,
+      "step": 41850
+    },
+    {
+      "epoch": 12.205080400838966,
+      "grad_norm": 0.37390992045402527,
+      "learning_rate": 0.00045380705333721943,
+      "loss": 3.2909,
+      "step": 41900
+    },
+    {
+      "epoch": 12.219645770216733,
+      "grad_norm": 0.38024622201919556,
+      "learning_rate": 0.00045363217720781107,
+      "loss": 3.3109,
+      "step": 41950
+    },
+    {
+      "epoch": 12.2342111395945,
+      "grad_norm": 0.3541949391365051,
+      "learning_rate": 0.00045345730107840276,
+      "loss": 3.2924,
+      "step": 42000
+    },
+    {
+      "epoch": 12.2342111395945,
+      "eval_accuracy": 0.3695281752775673,
+      "eval_loss": 3.5650668144226074,
+      "eval_runtime": 179.7999,
+      "eval_samples_per_second": 92.575,
+      "eval_steps_per_second": 5.79,
+      "step": 42000
+    },
+    {
+      "epoch": 12.248776508972268,
+      "grad_norm": 0.35464340448379517,
+      "learning_rate": 0.0004532824249489944,
+      "loss": 3.3094,
+      "step": 42050
+    },
+    {
+      "epoch": 12.263341878350035,
+      "grad_norm": 0.3588162362575531,
+      "learning_rate": 0.0004531075488195861,
+      "loss": 3.3044,
+      "step": 42100
+    },
+    {
+      "epoch": 12.277907247727802,
+      "grad_norm": 0.35945770144462585,
+      "learning_rate": 0.0004529326726901777,
+      "loss": 3.3149,
+      "step": 42150
+    },
+    {
+      "epoch": 12.29247261710557,
+      "grad_norm": 0.3775635063648224,
+      "learning_rate": 0.00045275779656076947,
+      "loss": 3.3133,
+      "step": 42200
+    },
+    {
+      "epoch": 12.307037986483337,
+      "grad_norm": 0.36539244651794434,
+      "learning_rate": 0.0004525829204313611,
+      "loss": 3.3236,
+      "step": 42250
+    },
+    {
+      "epoch": 12.321603355861104,
+      "grad_norm": 0.36154550313949585,
+      "learning_rate": 0.00045240804430195274,
+      "loss": 3.3229,
+      "step": 42300
+    },
+    {
+      "epoch": 12.336168725238872,
+      "grad_norm": 0.36169642210006714,
+      "learning_rate": 0.00045223316817254443,
+      "loss": 3.3229,
+      "step": 42350
+    },
+    {
+      "epoch": 12.350734094616639,
+      "grad_norm": 0.3900752067565918,
+      "learning_rate": 0.00045205829204313607,
+      "loss": 3.3163,
+      "step": 42400
+    },
+    {
+      "epoch": 12.365299463994408,
+      "grad_norm": 0.36182570457458496,
+      "learning_rate": 0.00045188341591372776,
+      "loss": 3.3282,
+      "step": 42450
+    },
+    {
+      "epoch": 12.379864833372174,
+      "grad_norm": 0.36705368757247925,
+      "learning_rate": 0.0004517085397843194,
+      "loss": 3.3301,
+      "step": 42500
+    },
+    {
+      "epoch": 12.394430202749941,
+      "grad_norm": 0.35039055347442627,
+      "learning_rate": 0.00045153366365491103,
+      "loss": 3.3307,
+      "step": 42550
+    },
+    {
+      "epoch": 12.40899557212771,
+      "grad_norm": 0.38655978441238403,
+      "learning_rate": 0.0004513587875255027,
+      "loss": 3.3337,
+      "step": 42600
+    },
+    {
+      "epoch": 12.423560941505476,
+      "grad_norm": 0.34374991059303284,
+      "learning_rate": 0.00045118391139609436,
+      "loss": 3.3308,
+      "step": 42650
+    },
+    {
+      "epoch": 12.438126310883243,
+      "grad_norm": 0.366639643907547,
+      "learning_rate": 0.0004510090352666861,
+      "loss": 3.3421,
+      "step": 42700
+    },
+    {
+      "epoch": 12.452691680261012,
+      "grad_norm": 0.3672907054424286,
+      "learning_rate": 0.00045083415913727774,
+      "loss": 3.3184,
+      "step": 42750
+    },
+    {
+      "epoch": 12.467257049638778,
+      "grad_norm": 0.36934059858322144,
+      "learning_rate": 0.0004506592830078694,
+      "loss": 3.3262,
+      "step": 42800
+    },
+    {
+      "epoch": 12.481822419016547,
+      "grad_norm": 0.36116307973861694,
+      "learning_rate": 0.00045048440687846106,
+      "loss": 3.3253,
+      "step": 42850
+    },
+    {
+      "epoch": 12.496387788394314,
+      "grad_norm": 0.3898486793041229,
+      "learning_rate": 0.0004503095307490527,
+      "loss": 3.3426,
+      "step": 42900
+    },
+    {
+      "epoch": 12.51095315777208,
+      "grad_norm": 0.3628914952278137,
+      "learning_rate": 0.0004501346546196444,
+      "loss": 3.3299,
+      "step": 42950
+    },
+    {
+      "epoch": 12.525518527149849,
+      "grad_norm": 0.3678210973739624,
+      "learning_rate": 0.000449959778490236,
+      "loss": 3.3339,
+      "step": 43000
+    },
+    {
+      "epoch": 12.525518527149849,
+      "eval_accuracy": 0.3700859257935512,
+      "eval_loss": 3.558652877807617,
+      "eval_runtime": 179.6498,
+      "eval_samples_per_second": 92.652,
+      "eval_steps_per_second": 5.795,
+      "step": 43000
+    },
+    {
+      "epoch": 12.540083896527616,
+      "grad_norm": 0.3682061433792114,
+      "learning_rate": 0.0004497849023608277,
+      "loss": 3.3348,
+      "step": 43050
+    },
+    {
+      "epoch": 12.554649265905383,
+      "grad_norm": 0.3561127781867981,
+      "learning_rate": 0.00044961002623141935,
+      "loss": 3.332,
+      "step": 43100
+    },
+    {
+      "epoch": 12.569214635283151,
+      "grad_norm": 0.36310920119285583,
+      "learning_rate": 0.000449435150102011,
+      "loss": 3.3311,
+      "step": 43150
+    },
+    {
+      "epoch": 12.583780004660918,
+      "grad_norm": 0.378019779920578,
+      "learning_rate": 0.00044926027397260273,
+      "loss": 3.3486,
+      "step": 43200
+    },
+    {
+      "epoch": 12.598345374038686,
+      "grad_norm": 0.35761335492134094,
+      "learning_rate": 0.00044908539784319437,
+      "loss": 3.3456,
+      "step": 43250
+    },
+    {
+      "epoch": 12.612910743416453,
+      "grad_norm": 0.38079050183296204,
+      "learning_rate": 0.00044891052171378606,
+      "loss": 3.34,
+      "step": 43300
+    },
+    {
+      "epoch": 12.62747611279422,
+      "grad_norm": 0.36224061250686646,
+      "learning_rate": 0.0004487356455843777,
+      "loss": 3.3454,
+      "step": 43350
+    },
+    {
+      "epoch": 12.642041482171988,
+      "grad_norm": 0.3658839762210846,
+      "learning_rate": 0.00044856076945496933,
+      "loss": 3.3426,
+      "step": 43400
+    },
+    {
+      "epoch": 12.656606851549755,
+      "grad_norm": 0.3703918755054474,
+      "learning_rate": 0.000448385893325561,
+      "loss": 3.3413,
+      "step": 43450
+    },
+    {
+      "epoch": 12.671172220927522,
+      "grad_norm": 0.39403098821640015,
+      "learning_rate": 0.00044821101719615266,
+      "loss": 3.3435,
+      "step": 43500
+    },
+    {
+      "epoch": 12.68573759030529,
+      "grad_norm": 0.3617226481437683,
+      "learning_rate": 0.00044803614106674435,
+      "loss": 3.3455,
+      "step": 43550
+    },
+    {
+      "epoch": 12.700302959683057,
+      "grad_norm": 0.38985195755958557,
+      "learning_rate": 0.000447861264937336,
+      "loss": 3.347,
+      "step": 43600
+    },
+    {
+      "epoch": 12.714868329060826,
+      "grad_norm": 0.3814113438129425,
+      "learning_rate": 0.00044768638880792773,
+      "loss": 3.345,
+      "step": 43650
+    },
+    {
+      "epoch": 12.729433698438593,
+      "grad_norm": 0.33327075839042664,
+      "learning_rate": 0.00044751151267851937,
+      "loss": 3.3439,
+      "step": 43700
+    },
+    {
+      "epoch": 12.74399906781636,
+      "grad_norm": 0.39450183510780334,
+      "learning_rate": 0.000447336636549111,
+      "loss": 3.354,
+      "step": 43750
+    },
+    {
+      "epoch": 12.758564437194128,
+      "grad_norm": 0.34784385561943054,
+      "learning_rate": 0.0004471617604197027,
+      "loss": 3.3438,
+      "step": 43800
+    },
+    {
+      "epoch": 12.773129806571895,
+      "grad_norm": 0.3895801305770874,
+      "learning_rate": 0.00044698688429029433,
+      "loss": 3.3385,
+      "step": 43850
+    },
+    {
+      "epoch": 12.787695175949661,
+      "grad_norm": 0.37325412034988403,
+      "learning_rate": 0.000446812008160886,
+      "loss": 3.3382,
+      "step": 43900
+    },
+    {
+      "epoch": 12.80226054532743,
+      "grad_norm": 0.3635622262954712,
+      "learning_rate": 0.00044663713203147766,
+      "loss": 3.351,
+      "step": 43950
+    },
+    {
+      "epoch": 12.816825914705197,
+      "grad_norm": 0.34390226006507874,
+      "learning_rate": 0.0004464622559020693,
+      "loss": 3.352,
+      "step": 44000
+    },
+    {
+      "epoch": 12.816825914705197,
+      "eval_accuracy": 0.37086952764621406,
+      "eval_loss": 3.5511374473571777,
+      "eval_runtime": 179.8312,
+      "eval_samples_per_second": 92.559,
+      "eval_steps_per_second": 5.789,
+      "step": 44000
+    },
+    {
+      "epoch": 12.831391284082965,
+      "grad_norm": 0.3434717059135437,
+      "learning_rate": 0.000446287379772661,
+      "loss": 3.358,
+      "step": 44050
+    },
+    {
+      "epoch": 12.845956653460732,
+      "grad_norm": 0.3896610140800476,
+      "learning_rate": 0.0004461125036432526,
+      "loss": 3.349,
+      "step": 44100
+    },
+    {
+      "epoch": 12.860522022838499,
+      "grad_norm": 0.38952627778053284,
+      "learning_rate": 0.00044593762751384436,
+      "loss": 3.3503,
+      "step": 44150
+    },
+    {
+      "epoch": 12.875087392216267,
+      "grad_norm": 0.3934302031993866,
+      "learning_rate": 0.000445762751384436,
+      "loss": 3.365,
+      "step": 44200
+    },
+    {
+      "epoch": 12.889652761594034,
+      "grad_norm": 0.4013819694519043,
+      "learning_rate": 0.0004455878752550277,
+      "loss": 3.3597,
+      "step": 44250
+    },
+    {
+      "epoch": 12.9042181309718,
+      "grad_norm": 0.3661031126976013,
+      "learning_rate": 0.0004454129991256193,
+      "loss": 3.3674,
+      "step": 44300
+    },
+    {
+      "epoch": 12.91878350034957,
+      "grad_norm": 0.417216420173645,
+      "learning_rate": 0.00044523812299621096,
+      "loss": 3.3518,
+      "step": 44350
+    },
+    {
+      "epoch": 12.933348869727336,
+      "grad_norm": 0.3435131907463074,
+      "learning_rate": 0.00044506324686680265,
+      "loss": 3.3626,
+      "step": 44400
+    },
+    {
+      "epoch": 12.947914239105105,
+      "grad_norm": 0.3419662415981293,
+      "learning_rate": 0.0004448883707373943,
+      "loss": 3.3505,
+      "step": 44450
+    },
+    {
+      "epoch": 12.962479608482871,
+      "grad_norm": 0.370420902967453,
+      "learning_rate": 0.000444713494607986,
+      "loss": 3.3523,
+      "step": 44500
+    },
+    {
+      "epoch": 12.977044977860638,
+      "grad_norm": 0.37728220224380493,
+      "learning_rate": 0.0004445386184785776,
+      "loss": 3.344,
+      "step": 44550
+    },
+    {
+      "epoch": 12.991610347238407,
+      "grad_norm": 0.34244418144226074,
+      "learning_rate": 0.00044436374234916925,
+      "loss": 3.363,
+      "step": 44600
+    },
+    {
+      "epoch": 13.006117455138662,
+      "grad_norm": 0.37950703501701355,
+      "learning_rate": 0.000444188866219761,
+      "loss": 3.3071,
+      "step": 44650
+    },
+    {
+      "epoch": 13.02068282451643,
+      "grad_norm": 0.3414568305015564,
+      "learning_rate": 0.00044401399009035263,
+      "loss": 3.2523,
+      "step": 44700
+    },
+    {
+      "epoch": 13.035248193894198,
+      "grad_norm": 0.3659200966358185,
+      "learning_rate": 0.0004438391139609443,
+      "loss": 3.2501,
+      "step": 44750
+    },
+    {
+      "epoch": 13.049813563271965,
+      "grad_norm": 0.3586159348487854,
+      "learning_rate": 0.00044366423783153596,
+      "loss": 3.2587,
+      "step": 44800
+    },
+    {
+      "epoch": 13.064378932649731,
+      "grad_norm": 0.3674415051937103,
+      "learning_rate": 0.0004434893617021276,
+      "loss": 3.265,
+      "step": 44850
+    },
+    {
+      "epoch": 13.0789443020275,
+      "grad_norm": 0.36518362164497375,
+      "learning_rate": 0.0004433144855727193,
+      "loss": 3.2717,
+      "step": 44900
+    },
+    {
+      "epoch": 13.093509671405267,
+      "grad_norm": 0.35845455527305603,
+      "learning_rate": 0.0004431396094433109,
+      "loss": 3.2737,
+      "step": 44950
+    },
+    {
+      "epoch": 13.108075040783035,
+      "grad_norm": 0.3665563464164734,
+      "learning_rate": 0.0004429647333139026,
+      "loss": 3.2587,
+      "step": 45000
+    },
+    {
+      "epoch": 13.108075040783035,
+      "eval_accuracy": 0.37041758983351547,
+      "eval_loss": 3.561434268951416,
+      "eval_runtime": 179.821,
+      "eval_samples_per_second": 92.564,
+      "eval_steps_per_second": 5.789,
+      "step": 45000
+    },
+    {
+      "epoch": 13.122640410160802,
+      "grad_norm": 0.34525081515312195,
+      "learning_rate": 0.00044278985718449425,
+      "loss": 3.2747,
+      "step": 45050
+    },
+    {
+      "epoch": 13.137205779538569,
+      "grad_norm": 0.3698437809944153,
+      "learning_rate": 0.000442614981055086,
+      "loss": 3.2796,
+      "step": 45100
+    },
+    {
+      "epoch": 13.151771148916337,
+      "grad_norm": 0.34682697057724,
+      "learning_rate": 0.00044244010492567763,
+      "loss": 3.2792,
+      "step": 45150
+    },
+    {
+      "epoch": 13.166336518294104,
+      "grad_norm": 0.371136337518692,
+      "learning_rate": 0.00044226522879626927,
+      "loss": 3.2857,
+      "step": 45200
+    },
+    {
+      "epoch": 13.18090188767187,
+      "grad_norm": 0.3699894845485687,
+      "learning_rate": 0.00044209035266686096,
+      "loss": 3.2826,
+      "step": 45250
+    },
+    {
+      "epoch": 13.19546725704964,
+      "grad_norm": 0.35856735706329346,
+      "learning_rate": 0.0004419154765374526,
+      "loss": 3.2864,
+      "step": 45300
+    },
+    {
+      "epoch": 13.210032626427406,
+      "grad_norm": 0.37214043736457825,
+      "learning_rate": 0.0004417406004080443,
+      "loss": 3.2817,
+      "step": 45350
+    },
+    {
+      "epoch": 13.224597995805174,
+      "grad_norm": 0.35830315947532654,
+      "learning_rate": 0.0004415657242786359,
+      "loss": 3.2916,
+      "step": 45400
+    },
+    {
+      "epoch": 13.239163365182941,
+      "grad_norm": 0.38184452056884766,
+      "learning_rate": 0.00044139084814922755,
+      "loss": 3.2905,
+      "step": 45450
+    },
+    {
+      "epoch": 13.253728734560708,
+      "grad_norm": 0.36729663610458374,
+      "learning_rate": 0.00044121597201981924,
+      "loss": 3.2988,
+      "step": 45500
+    },
+    {
+      "epoch": 13.268294103938477,
+      "grad_norm": 0.360363632440567,
+      "learning_rate": 0.0004410410958904109,
+      "loss": 3.2856,
+      "step": 45550
+    },
+    {
+      "epoch": 13.282859473316243,
+      "grad_norm": 0.3617470860481262,
+      "learning_rate": 0.0004408662197610026,
+      "loss": 3.2866,
+      "step": 45600
+    },
+    {
+      "epoch": 13.29742484269401,
+      "grad_norm": 0.35599926114082336,
+      "learning_rate": 0.00044069134363159426,
+      "loss": 3.2948,
+      "step": 45650
+    },
+    {
+      "epoch": 13.311990212071779,
+      "grad_norm": 0.3696223199367523,
+      "learning_rate": 0.00044051646750218595,
+      "loss": 3.3045,
+      "step": 45700
+    },
+    {
+      "epoch": 13.326555581449545,
+      "grad_norm": 0.3685641288757324,
+      "learning_rate": 0.0004403415913727776,
+      "loss": 3.2931,
+      "step": 45750
+    },
+    {
+      "epoch": 13.341120950827314,
+      "grad_norm": 0.36321672797203064,
+      "learning_rate": 0.0004401667152433692,
+      "loss": 3.3018,
+      "step": 45800
+    },
+    {
+      "epoch": 13.35568632020508,
+      "grad_norm": 0.3663840591907501,
+      "learning_rate": 0.0004399918391139609,
+      "loss": 3.3055,
+      "step": 45850
+    },
+    {
+      "epoch": 13.370251689582847,
+      "grad_norm": 0.37315475940704346,
+      "learning_rate": 0.00043981696298455255,
+      "loss": 3.3086,
+      "step": 45900
+    },
+    {
+      "epoch": 13.384817058960616,
+      "grad_norm": 0.404310941696167,
+      "learning_rate": 0.00043964208685514424,
+      "loss": 3.3042,
+      "step": 45950
+    },
+    {
+      "epoch": 13.399382428338383,
+      "grad_norm": 0.3406934142112732,
+      "learning_rate": 0.0004394672107257359,
+      "loss": 3.3059,
+      "step": 46000
+    },
+    {
+      "epoch": 13.399382428338383,
+      "eval_accuracy": 0.37039325291175984,
+      "eval_loss": 3.558974266052246,
+      "eval_runtime": 179.7471,
+      "eval_samples_per_second": 92.602,
+      "eval_steps_per_second": 5.791,
+      "step": 46000
+    },
+    {
+      "epoch": 13.41394779771615,
+      "grad_norm": 0.3774344325065613,
+      "learning_rate": 0.0004392923345963275,
+      "loss": 3.3012,
+      "step": 46050
+    },
+    {
+      "epoch": 13.428513167093918,
+      "grad_norm": 0.36368629336357117,
+      "learning_rate": 0.00043911745846691926,
+      "loss": 3.3133,
+      "step": 46100
+    },
+    {
+      "epoch": 13.443078536471685,
+      "grad_norm": 0.34740936756134033,
+      "learning_rate": 0.0004389425823375109,
+      "loss": 3.3042,
+      "step": 46150
+    },
+    {
+      "epoch": 13.457643905849451,
+      "grad_norm": 0.3726678192615509,
+      "learning_rate": 0.0004387677062081026,
+      "loss": 3.319,
+      "step": 46200
+    },
+    {
+      "epoch": 13.47220927522722,
+      "grad_norm": 0.3507010340690613,
+      "learning_rate": 0.0004385928300786942,
+      "loss": 3.3134,
+      "step": 46250
+    },
+    {
+      "epoch": 13.486774644604987,
+      "grad_norm": 0.36555829644203186,
+      "learning_rate": 0.0004384179539492859,
+      "loss": 3.3205,
+      "step": 46300
+    },
+    {
+      "epoch": 13.501340013982755,
+      "grad_norm": 0.34969812631607056,
+      "learning_rate": 0.00043824307781987755,
+      "loss": 3.3253,
+      "step": 46350
+    },
+    {
+      "epoch": 13.515905383360522,
+      "grad_norm": 0.3921741545200348,
+      "learning_rate": 0.0004380682016904692,
+      "loss": 3.3118,
+      "step": 46400
+    },
+    {
+      "epoch": 13.530470752738289,
+      "grad_norm": 0.4136374592781067,
+      "learning_rate": 0.0004378933255610609,
+      "loss": 3.3229,
+      "step": 46450
+    },
+    {
+      "epoch": 13.545036122116057,
+      "grad_norm": 0.39142510294914246,
+      "learning_rate": 0.0004377184494316525,
+      "loss": 3.3253,
+      "step": 46500
+    },
+    {
+      "epoch": 13.559601491493824,
+      "grad_norm": 0.35085681080818176,
+      "learning_rate": 0.00043754357330224426,
+      "loss": 3.3399,
+      "step": 46550
+    },
+    {
+      "epoch": 13.574166860871593,
+      "grad_norm": 0.38441339135169983,
+      "learning_rate": 0.0004373686971728359,
+      "loss": 3.3177,
+      "step": 46600
+    },
+    {
+      "epoch": 13.58873223024936,
+      "grad_norm": 0.3715854287147522,
+      "learning_rate": 0.00043719382104342753,
+      "loss": 3.3232,
+      "step": 46650
+    },
+    {
+      "epoch": 13.603297599627126,
+      "grad_norm": 0.37551915645599365,
+      "learning_rate": 0.0004370189449140192,
+      "loss": 3.319,
+      "step": 46700
+    },
+    {
+      "epoch": 13.617862969004895,
+      "grad_norm": 0.41187357902526855,
+      "learning_rate": 0.00043684406878461085,
+      "loss": 3.3137,
+      "step": 46750
+    },
+    {
+      "epoch": 13.632428338382661,
+      "grad_norm": 0.37525227665901184,
+      "learning_rate": 0.00043666919265520254,
+      "loss": 3.319,
+      "step": 46800
+    },
+    {
+      "epoch": 13.646993707760428,
+      "grad_norm": 0.3764234483242035,
+      "learning_rate": 0.0004364943165257942,
+      "loss": 3.3301,
+      "step": 46850
+    },
+    {
+      "epoch": 13.661559077138197,
+      "grad_norm": 0.3498331904411316,
+      "learning_rate": 0.0004363194403963858,
+      "loss": 3.3309,
+      "step": 46900
+    },
+    {
+      "epoch": 13.676124446515963,
+      "grad_norm": 0.35479307174682617,
+      "learning_rate": 0.0004361445642669775,
+      "loss": 3.3297,
+      "step": 46950
+    },
+    {
+      "epoch": 13.69068981589373,
+      "grad_norm": 0.36635255813598633,
+      "learning_rate": 0.00043596968813756914,
+      "loss": 3.3337,
+      "step": 47000
+    },
+    {
+      "epoch": 13.69068981589373,
+      "eval_accuracy": 0.370927724633021,
+      "eval_loss": 3.551501989364624,
+      "eval_runtime": 179.6084,
+      "eval_samples_per_second": 92.674,
+      "eval_steps_per_second": 5.796,
+      "step": 47000
+    },
+    {
+      "epoch": 13.705255185271499,
+      "grad_norm": 0.3506696820259094,
+      "learning_rate": 0.0004357948120081609,
+      "loss": 3.3283,
+      "step": 47050
+    },
+    {
+      "epoch": 13.719820554649266,
+      "grad_norm": 0.33802589774131775,
+      "learning_rate": 0.0004356199358787525,
+      "loss": 3.3193,
+      "step": 47100
+    },
+    {
+      "epoch": 13.734385924027034,
+      "grad_norm": 0.4081648886203766,
+      "learning_rate": 0.0004354450597493442,
+      "loss": 3.3291,
+      "step": 47150
+    },
+    {
+      "epoch": 13.7489512934048,
+      "grad_norm": 0.35251572728157043,
+      "learning_rate": 0.00043527018361993585,
+      "loss": 3.3204,
+      "step": 47200
+    },
+    {
+      "epoch": 13.763516662782568,
+      "grad_norm": 0.34620070457458496,
+      "learning_rate": 0.0004350953074905275,
+      "loss": 3.3274,
+      "step": 47250
+    },
+    {
+      "epoch": 13.778082032160336,
+      "grad_norm": 0.36664706468582153,
+      "learning_rate": 0.0004349204313611192,
+      "loss": 3.3266,
+      "step": 47300
+    },
+    {
+      "epoch": 13.792647401538103,
+      "grad_norm": 0.35269954800605774,
+      "learning_rate": 0.0004347455552317108,
+      "loss": 3.3388,
+      "step": 47350
+    },
+    {
+      "epoch": 13.80721277091587,
+      "grad_norm": 0.39205700159072876,
+      "learning_rate": 0.0004345706791023025,
+      "loss": 3.3248,
+      "step": 47400
+    },
+    {
+      "epoch": 13.821778140293638,
+      "grad_norm": 0.39721786975860596,
+      "learning_rate": 0.00043439580297289414,
+      "loss": 3.3325,
+      "step": 47450
+    },
+    {
+      "epoch": 13.836343509671405,
+      "grad_norm": 0.39120006561279297,
+      "learning_rate": 0.0004342209268434858,
+      "loss": 3.3322,
+      "step": 47500
+    },
+    {
+      "epoch": 13.850908879049173,
+      "grad_norm": 0.34740549325942993,
+      "learning_rate": 0.0004340460507140775,
+      "loss": 3.3376,
+      "step": 47550
+    },
+    {
+      "epoch": 13.86547424842694,
+      "grad_norm": 0.3894191086292267,
+      "learning_rate": 0.00043387117458466916,
+      "loss": 3.3419,
+      "step": 47600
+    },
+    {
+      "epoch": 13.880039617804707,
+      "grad_norm": 0.36777186393737793,
+      "learning_rate": 0.00043369629845526085,
+      "loss": 3.3491,
+      "step": 47650
+    },
+    {
+      "epoch": 13.894604987182475,
+      "grad_norm": 0.3732227683067322,
+      "learning_rate": 0.0004335214223258525,
+      "loss": 3.3383,
+      "step": 47700
+    },
+    {
+      "epoch": 13.909170356560242,
+      "grad_norm": 0.3650364279747009,
+      "learning_rate": 0.0004333465461964442,
+      "loss": 3.3382,
+      "step": 47750
+    },
+    {
+      "epoch": 13.923735725938009,
+      "grad_norm": 0.36800310015678406,
+      "learning_rate": 0.0004331716700670358,
+      "loss": 3.3333,
+      "step": 47800
+    },
+    {
+      "epoch": 13.938301095315778,
+      "grad_norm": 0.4116499125957489,
+      "learning_rate": 0.00043299679393762745,
+      "loss": 3.3534,
+      "step": 47850
+    },
+    {
+      "epoch": 13.952866464693544,
+      "grad_norm": 0.3539418876171112,
+      "learning_rate": 0.00043282191780821914,
+      "loss": 3.3383,
+      "step": 47900
+    },
+    {
+      "epoch": 13.967431834071313,
+      "grad_norm": 0.35670432448387146,
+      "learning_rate": 0.00043264704167881077,
+      "loss": 3.347,
+      "step": 47950
+    },
+    {
+      "epoch": 13.98199720344908,
+      "grad_norm": 0.3599216639995575,
+      "learning_rate": 0.0004324721655494025,
+      "loss": 3.3343,
+      "step": 48000
+    },
+    {
+      "epoch": 13.98199720344908,
+      "eval_accuracy": 0.3713798975850602,
+      "eval_loss": 3.5442428588867188,
+      "eval_runtime": 179.6319,
+      "eval_samples_per_second": 92.662,
+      "eval_steps_per_second": 5.795,
+      "step": 48000
+    },
+    {
+      "epoch": 13.996562572826846,
+      "grad_norm": 0.3528788387775421,
+      "learning_rate": 0.00043229728941999415,
+      "loss": 3.345,
+      "step": 48050
+    },
+    {
+      "epoch": 14.011069680727104,
+      "grad_norm": 0.3962458670139313,
+      "learning_rate": 0.0004321224132905858,
+      "loss": 3.2571,
+      "step": 48100
+    },
+    {
+      "epoch": 14.02563505010487,
+      "grad_norm": 0.3570566475391388,
+      "learning_rate": 0.0004319475371611775,
+      "loss": 3.2367,
+      "step": 48150
+    },
+    {
+      "epoch": 14.040200419482638,
+      "grad_norm": 0.3566766679286957,
+      "learning_rate": 0.0004317726610317691,
+      "loss": 3.2429,
+      "step": 48200
+    },
+    {
+      "epoch": 14.054765788860406,
+      "grad_norm": 0.38148075342178345,
+      "learning_rate": 0.0004315977849023608,
+      "loss": 3.2336,
+      "step": 48250
+    },
+    {
+      "epoch": 14.069331158238173,
+      "grad_norm": 0.36465924978256226,
+      "learning_rate": 0.00043142290877295244,
+      "loss": 3.2448,
+      "step": 48300
+    },
+    {
+      "epoch": 14.08389652761594,
+      "grad_norm": 0.4034234583377838,
+      "learning_rate": 0.00043124803264354413,
+      "loss": 3.2529,
+      "step": 48350
+    },
+    {
+      "epoch": 14.098461896993708,
+      "grad_norm": 0.38046795129776,
+      "learning_rate": 0.00043107315651413577,
+      "loss": 3.2572,
+      "step": 48400
+    },
+    {
+      "epoch": 14.113027266371475,
+      "grad_norm": 0.37367624044418335,
+      "learning_rate": 0.0004308982803847274,
+      "loss": 3.2464,
+      "step": 48450
+    },
+    {
+      "epoch": 14.127592635749243,
+      "grad_norm": 0.3575690984725952,
+      "learning_rate": 0.00043072340425531915,
+      "loss": 3.2573,
+      "step": 48500
+    },
+    {
+      "epoch": 14.14215800512701,
+      "grad_norm": 0.37947431206703186,
+      "learning_rate": 0.0004305485281259108,
+      "loss": 3.2611,
+      "step": 48550
+    },
+    {
+      "epoch": 14.156723374504777,
+      "grad_norm": 0.4311124086380005,
+      "learning_rate": 0.0004303736519965025,
+      "loss": 3.2644,
+      "step": 48600
+    },
+    {
+      "epoch": 14.171288743882545,
+      "grad_norm": 0.40697988867759705,
+      "learning_rate": 0.0004301987758670941,
+      "loss": 3.271,
+      "step": 48650
+    },
+    {
+      "epoch": 14.185854113260312,
+      "grad_norm": 0.3614986538887024,
+      "learning_rate": 0.00043002389973768575,
+      "loss": 3.2703,
+      "step": 48700
+    },
+    {
+      "epoch": 14.200419482638079,
+      "grad_norm": 0.40103664994239807,
+      "learning_rate": 0.00042984902360827744,
+      "loss": 3.2759,
+      "step": 48750
+    },
+    {
+      "epoch": 14.214984852015847,
+      "grad_norm": 0.3614042401313782,
+      "learning_rate": 0.0004296741474788691,
+      "loss": 3.2763,
+      "step": 48800
+    },
+    {
+      "epoch": 14.229550221393614,
+      "grad_norm": 0.3694717586040497,
+      "learning_rate": 0.00042949927134946077,
+      "loss": 3.2836,
+      "step": 48850
+    },
+    {
+      "epoch": 14.244115590771383,
+      "grad_norm": 0.3669103682041168,
+      "learning_rate": 0.0004293243952200524,
+      "loss": 3.2802,
+      "step": 48900
+    },
+    {
+      "epoch": 14.25868096014915,
+      "grad_norm": 0.38352200388908386,
+      "learning_rate": 0.00042914951909064415,
+      "loss": 3.2663,
+      "step": 48950
+    },
+    {
+      "epoch": 14.273246329526916,
+      "grad_norm": 0.44323334097862244,
+      "learning_rate": 0.0004289746429612358,
+      "loss": 3.2804,
+      "step": 49000
+    },
+    {
+      "epoch": 14.273246329526916,
+      "eval_accuracy": 0.3708143874708354,
+      "eval_loss": 3.558420181274414,
+      "eval_runtime": 179.5689,
+      "eval_samples_per_second": 92.694,
+      "eval_steps_per_second": 5.797,
+      "step": 49000
+    },
+    {
+      "epoch": 14.287811698904685,
+      "grad_norm": 0.3794345259666443,
+      "learning_rate": 0.0004287997668318274,
+      "loss": 3.2769,
+      "step": 49050
+    },
+    {
+      "epoch": 14.302377068282452,
+      "grad_norm": 0.38060110807418823,
+      "learning_rate": 0.0004286248907024191,
+      "loss": 3.2725,
+      "step": 49100
+    },
+    {
+      "epoch": 14.316942437660218,
+      "grad_norm": 0.43410491943359375,
+      "learning_rate": 0.00042845001457301075,
+      "loss": 3.2889,
+      "step": 49150
+    },
+    {
+      "epoch": 14.331507807037987,
+      "grad_norm": 0.39754602313041687,
+      "learning_rate": 0.00042827513844360244,
+      "loss": 3.2825,
+      "step": 49200
+    },
+    {
+      "epoch": 14.346073176415754,
+      "grad_norm": 0.41671112179756165,
+      "learning_rate": 0.00042810026231419407,
+      "loss": 3.2873,
+      "step": 49250
+    },
+    {
+      "epoch": 14.360638545793522,
+      "grad_norm": 0.37623023986816406,
+      "learning_rate": 0.0004279253861847857,
+      "loss": 3.2924,
+      "step": 49300
+    },
+    {
+      "epoch": 14.375203915171289,
+      "grad_norm": 0.421653687953949,
+      "learning_rate": 0.0004277505100553774,
+      "loss": 3.2735,
+      "step": 49350
+    },
+    {
+      "epoch": 14.389769284549056,
+      "grad_norm": 0.3558456301689148,
+      "learning_rate": 0.00042757563392596904,
+      "loss": 3.3055,
+      "step": 49400
+    },
+    {
+      "epoch": 14.404334653926824,
+      "grad_norm": 0.3729119300842285,
+      "learning_rate": 0.0004274007577965608,
+      "loss": 3.3128,
+      "step": 49450
+    },
+    {
+      "epoch": 14.418900023304591,
+      "grad_norm": 0.3821575939655304,
+      "learning_rate": 0.0004272258816671524,
+      "loss": 3.2866,
+      "step": 49500
+    },
+    {
+      "epoch": 14.433465392682358,
+      "grad_norm": 0.38078463077545166,
+      "learning_rate": 0.00042705100553774405,
+      "loss": 3.2985,
+      "step": 49550
+    },
+    {
+      "epoch": 14.448030762060126,
+      "grad_norm": 0.38333752751350403,
+      "learning_rate": 0.00042687612940833574,
+      "loss": 3.2894,
+      "step": 49600
+    },
+    {
+      "epoch": 14.462596131437893,
+      "grad_norm": 0.343722939491272,
+      "learning_rate": 0.0004267012532789274,
+      "loss": 3.307,
+      "step": 49650
+    },
+    {
+      "epoch": 14.477161500815662,
+      "grad_norm": 0.34256860613822937,
+      "learning_rate": 0.00042652637714951907,
+      "loss": 3.3008,
+      "step": 49700
+    },
+    {
+      "epoch": 14.491726870193428,
+      "grad_norm": 0.37949851155281067,
+      "learning_rate": 0.0004263515010201107,
+      "loss": 3.3107,
+      "step": 49750
+    },
+    {
+      "epoch": 14.506292239571195,
+      "grad_norm": 0.3749626874923706,
+      "learning_rate": 0.0004261766248907024,
+      "loss": 3.2993,
+      "step": 49800
+    },
+    {
+      "epoch": 14.520857608948964,
+      "grad_norm": 0.36548712849617004,
+      "learning_rate": 0.00042600174876129403,
+      "loss": 3.3102,
+      "step": 49850
+    },
+    {
+      "epoch": 14.53542297832673,
+      "grad_norm": 0.404715895652771,
+      "learning_rate": 0.00042582687263188567,
+      "loss": 3.3081,
+      "step": 49900
+    },
+    {
+      "epoch": 14.549988347704497,
+      "grad_norm": 0.37166038155555725,
+      "learning_rate": 0.0004256519965024774,
+      "loss": 3.3047,
+      "step": 49950
+    },
+    {
+      "epoch": 14.564553717082266,
+      "grad_norm": 0.3786543011665344,
+      "learning_rate": 0.00042547712037306905,
+      "loss": 3.293,
+      "step": 50000
+    },
+    {
+      "epoch": 14.564553717082266,
+      "eval_accuracy": 0.37125997652133685,
+      "eval_loss": 3.553053140640259,
+      "eval_runtime": 179.5811,
+      "eval_samples_per_second": 92.688,
+      "eval_steps_per_second": 5.797,
+      "step": 50000
+    },
+    {
+      "epoch": 14.579119086460032,
+      "grad_norm": 0.35522982478141785,
+      "learning_rate": 0.00042530224424366074,
+      "loss": 3.3022,
+      "step": 50050
+    },
+    {
+      "epoch": 14.5936844558378,
+      "grad_norm": 0.3993748426437378,
+      "learning_rate": 0.0004251273681142524,
+      "loss": 3.312,
+      "step": 50100
+    },
+    {
+      "epoch": 14.608249825215568,
+      "grad_norm": 0.39016029238700867,
+      "learning_rate": 0.000424952491984844,
+      "loss": 3.3076,
+      "step": 50150
+    },
+    {
+      "epoch": 14.622815194593334,
+      "grad_norm": 0.38183167576789856,
+      "learning_rate": 0.0004247776158554357,
+      "loss": 3.2985,
+      "step": 50200
+    },
+    {
+      "epoch": 14.637380563971103,
+      "grad_norm": 0.3808605968952179,
+      "learning_rate": 0.00042460273972602734,
+      "loss": 3.3138,
+      "step": 50250
+    },
+    {
+      "epoch": 14.65194593334887,
+      "grad_norm": 0.366777241230011,
+      "learning_rate": 0.00042442786359661903,
+      "loss": 3.3074,
+      "step": 50300
+    },
+    {
+      "epoch": 14.666511302726637,
+      "grad_norm": 0.3863094449043274,
+      "learning_rate": 0.00042425298746721066,
+      "loss": 3.3151,
+      "step": 50350
+    },
+    {
+      "epoch": 14.681076672104405,
+      "grad_norm": 0.35356074571609497,
+      "learning_rate": 0.0004240781113378024,
+      "loss": 3.3114,
+      "step": 50400
+    },
+    {
+      "epoch": 14.695642041482172,
+      "grad_norm": 0.38444754481315613,
+      "learning_rate": 0.00042390323520839405,
+      "loss": 3.3152,
+      "step": 50450
+    },
+    {
+      "epoch": 14.71020741085994,
+      "grad_norm": 0.3628937602043152,
+      "learning_rate": 0.0004237283590789857,
+      "loss": 3.3073,
+      "step": 50500
+    },
+    {
+      "epoch": 14.724772780237707,
+      "grad_norm": 0.3597457706928253,
+      "learning_rate": 0.00042355348294957737,
+      "loss": 3.3091,
+      "step": 50550
+    },
+    {
+      "epoch": 14.739338149615474,
+      "grad_norm": 0.40730124711990356,
+      "learning_rate": 0.000423378606820169,
+      "loss": 3.303,
+      "step": 50600
+    },
+    {
+      "epoch": 14.753903518993242,
+      "grad_norm": 0.3871900737285614,
+      "learning_rate": 0.0004232037306907607,
+      "loss": 3.323,
+      "step": 50650
+    },
+    {
+      "epoch": 14.76846888837101,
+      "grad_norm": 0.3685663938522339,
+      "learning_rate": 0.00042302885456135233,
+      "loss": 3.3253,
+      "step": 50700
+    },
+    {
+      "epoch": 14.783034257748776,
+      "grad_norm": 0.358916699886322,
+      "learning_rate": 0.00042285397843194397,
+      "loss": 3.3162,
+      "step": 50750
+    },
+    {
+      "epoch": 14.797599627126544,
+      "grad_norm": 0.37842485308647156,
+      "learning_rate": 0.00042267910230253566,
+      "loss": 3.335,
+      "step": 50800
+    },
+    {
+      "epoch": 14.812164996504311,
+      "grad_norm": 0.36957690119743347,
+      "learning_rate": 0.0004225042261731273,
+      "loss": 3.3302,
+      "step": 50850
+    },
+    {
+      "epoch": 14.826730365882078,
+      "grad_norm": 0.3704380989074707,
+      "learning_rate": 0.00042232935004371904,
+      "loss": 3.324,
+      "step": 50900
+    },
+    {
+      "epoch": 14.841295735259846,
+      "grad_norm": 0.3660496175289154,
+      "learning_rate": 0.0004221544739143107,
+      "loss": 3.3219,
+      "step": 50950
+    },
+    {
+      "epoch": 14.855861104637613,
+      "grad_norm": 0.3719576299190521,
+      "learning_rate": 0.00042197959778490237,
+      "loss": 3.3208,
+      "step": 51000
+    },
+    {
+      "epoch": 14.855861104637613,
+      "eval_accuracy": 0.3716361994663513,
+      "eval_loss": 3.5443522930145264,
+      "eval_runtime": 179.6673,
+      "eval_samples_per_second": 92.643,
+      "eval_steps_per_second": 5.794,
+      "step": 51000
+    },
+    {
+      "epoch": 14.870426474015382,
+      "grad_norm": 0.39968937635421753,
+      "learning_rate": 0.000421804721655494,
+      "loss": 3.332,
+      "step": 51050
+    },
+    {
+      "epoch": 14.884991843393149,
+      "grad_norm": 0.3981848359107971,
+      "learning_rate": 0.00042162984552608564,
+      "loss": 3.3107,
+      "step": 51100
+    },
+    {
+      "epoch": 14.899557212770915,
+      "grad_norm": 0.3961758613586426,
+      "learning_rate": 0.00042145496939667733,
+      "loss": 3.3288,
+      "step": 51150
+    },
+    {
+      "epoch": 14.914122582148684,
+      "grad_norm": 0.3636086881160736,
+      "learning_rate": 0.00042128009326726897,
+      "loss": 3.3134,
+      "step": 51200
+    },
+    {
+      "epoch": 14.92868795152645,
+      "grad_norm": 0.36392343044281006,
+      "learning_rate": 0.00042110521713786066,
+      "loss": 3.3325,
+      "step": 51250
+    },
+    {
+      "epoch": 14.943253320904217,
+      "grad_norm": 0.36686888337135315,
+      "learning_rate": 0.0004209303410084523,
+      "loss": 3.3204,
+      "step": 51300
+    },
+    {
+      "epoch": 14.957818690281986,
+      "grad_norm": 0.37597978115081787,
+      "learning_rate": 0.00042075546487904393,
+      "loss": 3.3212,
+      "step": 51350
+    },
+    {
+      "epoch": 14.972384059659753,
+      "grad_norm": 0.3588141202926636,
+      "learning_rate": 0.0004205805887496357,
+      "loss": 3.3291,
+      "step": 51400
+    },
+    {
+      "epoch": 14.986949429037521,
+      "grad_norm": 0.37139445543289185,
+      "learning_rate": 0.0004204057126202273,
+      "loss": 3.3212,
+      "step": 51450
+    },
+    {
+      "epoch": 15.001456536937777,
+      "grad_norm": 0.393344908952713,
+      "learning_rate": 0.000420230836490819,
+      "loss": 3.3173,
+      "step": 51500
+    },
+    {
+      "epoch": 15.016021906315544,
+      "grad_norm": 0.4330257475376129,
+      "learning_rate": 0.00042005596036141064,
+      "loss": 3.2146,
+      "step": 51550
+    },
+    {
+      "epoch": 15.030587275693312,
+      "grad_norm": 0.3898090422153473,
+      "learning_rate": 0.0004198810842320023,
+      "loss": 3.2189,
+      "step": 51600
+    },
+    {
+      "epoch": 15.045152645071079,
+      "grad_norm": 0.4010067880153656,
+      "learning_rate": 0.00041970620810259396,
+      "loss": 3.2251,
+      "step": 51650
+    },
+    {
+      "epoch": 15.059718014448846,
+      "grad_norm": 0.40854838490486145,
+      "learning_rate": 0.0004195313319731856,
+      "loss": 3.2156,
+      "step": 51700
+    },
+    {
+      "epoch": 15.074283383826614,
+      "grad_norm": 0.36628204584121704,
+      "learning_rate": 0.0004193564558437773,
+      "loss": 3.234,
+      "step": 51750
+    },
+    {
+      "epoch": 15.088848753204381,
+      "grad_norm": 0.38783887028694153,
+      "learning_rate": 0.0004191815797143689,
+      "loss": 3.2315,
+      "step": 51800
+    },
+    {
+      "epoch": 15.103414122582148,
+      "grad_norm": 0.3718164265155792,
+      "learning_rate": 0.00041900670358496067,
+      "loss": 3.2369,
+      "step": 51850
+    },
+    {
+      "epoch": 15.117979491959916,
+      "grad_norm": 0.42094212770462036,
+      "learning_rate": 0.0004188318274555523,
+      "loss": 3.2398,
+      "step": 51900
+    },
+    {
+      "epoch": 15.132544861337683,
+      "grad_norm": 0.36034852266311646,
+      "learning_rate": 0.00041865695132614394,
+      "loss": 3.2404,
+      "step": 51950
+    },
+    {
+      "epoch": 15.147110230715452,
+      "grad_norm": 0.3888159692287445,
+      "learning_rate": 0.00041848207519673563,
+      "loss": 3.2449,
+      "step": 52000
+    },
+    {
+      "epoch": 15.147110230715452,
+      "eval_accuracy": 0.3711453460927778,
+      "eval_loss": 3.5572781562805176,
+      "eval_runtime": 179.6315,
+      "eval_samples_per_second": 92.662,
+      "eval_steps_per_second": 5.795,
+      "step": 52000
+    },
+    {
+      "epoch": 15.161675600093218,
+      "grad_norm": 0.3690231740474701,
+      "learning_rate": 0.00041830719906732727,
+      "loss": 3.2507,
+      "step": 52050
+    },
+    {
+      "epoch": 15.176240969470985,
+      "grad_norm": 0.37178748846054077,
+      "learning_rate": 0.00041813232293791896,
+      "loss": 3.2635,
+      "step": 52100
+    },
+    {
+      "epoch": 15.190806338848754,
+      "grad_norm": 0.40822193026542664,
+      "learning_rate": 0.0004179574468085106,
+      "loss": 3.2505,
+      "step": 52150
+    },
+    {
+      "epoch": 15.20537170822652,
+      "grad_norm": 0.40897294878959656,
+      "learning_rate": 0.00041778257067910223,
+      "loss": 3.2578,
+      "step": 52200
+    },
+    {
+      "epoch": 15.219937077604287,
+      "grad_norm": 0.416759729385376,
+      "learning_rate": 0.0004176076945496939,
+      "loss": 3.2576,
+      "step": 52250
+    },
+    {
+      "epoch": 15.234502446982056,
+      "grad_norm": 0.3542684018611908,
+      "learning_rate": 0.00041743281842028556,
+      "loss": 3.248,
+      "step": 52300
+    },
+    {
+      "epoch": 15.249067816359823,
+      "grad_norm": 0.3839828670024872,
+      "learning_rate": 0.0004172579422908773,
+      "loss": 3.2648,
+      "step": 52350
+    },
+    {
+      "epoch": 15.263633185737591,
+      "grad_norm": 0.36714503169059753,
+      "learning_rate": 0.00041708306616146894,
+      "loss": 3.257,
+      "step": 52400
+    },
+    {
+      "epoch": 15.278198555115358,
+      "grad_norm": 0.38585343956947327,
+      "learning_rate": 0.00041690819003206063,
+      "loss": 3.263,
+      "step": 52450
+    },
+    {
+      "epoch": 15.292763924493125,
+      "grad_norm": 0.3717619776725769,
+      "learning_rate": 0.00041673331390265227,
+      "loss": 3.2767,
+      "step": 52500
+    },
+    {
+      "epoch": 15.307329293870893,
+      "grad_norm": 0.3753516972064972,
+      "learning_rate": 0.0004165584377732439,
+      "loss": 3.2765,
+      "step": 52550
+    },
+    {
+      "epoch": 15.32189466324866,
+      "grad_norm": 0.3641180098056793,
+      "learning_rate": 0.0004163835616438356,
+      "loss": 3.2676,
+      "step": 52600
+    },
+    {
+      "epoch": 15.336460032626427,
+      "grad_norm": 0.37075987458229065,
+      "learning_rate": 0.00041620868551442723,
+      "loss": 3.2739,
+      "step": 52650
+    },
+    {
+      "epoch": 15.351025402004195,
+      "grad_norm": 0.3723219633102417,
+      "learning_rate": 0.0004160338093850189,
+      "loss": 3.2761,
+      "step": 52700
+    },
+    {
+      "epoch": 15.365590771381962,
+      "grad_norm": 0.3730946183204651,
+      "learning_rate": 0.00041585893325561056,
+      "loss": 3.2739,
+      "step": 52750
+    },
+    {
+      "epoch": 15.38015614075973,
+      "grad_norm": 0.3803166449069977,
+      "learning_rate": 0.0004156840571262022,
+      "loss": 3.2923,
+      "step": 52800
+    },
+    {
+      "epoch": 15.394721510137497,
+      "grad_norm": 0.3986593782901764,
+      "learning_rate": 0.00041550918099679394,
+      "loss": 3.2797,
+      "step": 52850
+    },
+    {
+      "epoch": 15.409286879515264,
+      "grad_norm": 0.3798179626464844,
+      "learning_rate": 0.0004153343048673856,
+      "loss": 3.2717,
+      "step": 52900
+    },
+    {
+      "epoch": 15.423852248893033,
+      "grad_norm": 0.4205482304096222,
+      "learning_rate": 0.00041515942873797726,
+      "loss": 3.2857,
+      "step": 52950
+    },
+    {
+      "epoch": 15.4384176182708,
+      "grad_norm": 0.35909244418144226,
+      "learning_rate": 0.0004149845526085689,
+      "loss": 3.2765,
+      "step": 53000
+    },
+    {
+      "epoch": 15.4384176182708,
+      "eval_accuracy": 0.3717464798171086,
+      "eval_loss": 3.5521633625030518,
+      "eval_runtime": 193.9407,
+      "eval_samples_per_second": 85.825,
+      "eval_steps_per_second": 5.368,
+      "step": 53000
+    },
+    {
+      "epoch": 15.452982987648566,
+      "grad_norm": 0.361979216337204,
+      "learning_rate": 0.0004148096764791606,
+      "loss": 3.2751,
+      "step": 53050
+    },
+    {
+      "epoch": 15.467548357026335,
+      "grad_norm": 0.36735597252845764,
+      "learning_rate": 0.0004146348003497522,
+      "loss": 3.3015,
+      "step": 53100
+    },
+    {
+      "epoch": 15.482113726404101,
+      "grad_norm": 0.3767015039920807,
+      "learning_rate": 0.00041445992422034386,
+      "loss": 3.2938,
+      "step": 53150
+    },
+    {
+      "epoch": 15.49667909578187,
+      "grad_norm": 0.38670143485069275,
+      "learning_rate": 0.00041428504809093555,
+      "loss": 3.2936,
+      "step": 53200
+    },
+    {
+      "epoch": 15.511244465159637,
+      "grad_norm": 0.39119359850883484,
+      "learning_rate": 0.0004141101719615272,
+      "loss": 3.2893,
+      "step": 53250
+    },
+    {
+      "epoch": 15.525809834537403,
+      "grad_norm": 0.36352699995040894,
+      "learning_rate": 0.00041393529583211893,
+      "loss": 3.2955,
+      "step": 53300
+    },
+    {
+      "epoch": 15.540375203915172,
+      "grad_norm": 0.38741451501846313,
+      "learning_rate": 0.00041376041970271057,
+      "loss": 3.299,
+      "step": 53350
+    },
+    {
+      "epoch": 15.554940573292939,
+      "grad_norm": 0.3951430916786194,
+      "learning_rate": 0.0004135855435733022,
+      "loss": 3.2996,
+      "step": 53400
+    },
+    {
+      "epoch": 15.569505942670705,
+      "grad_norm": 0.36441171169281006,
+      "learning_rate": 0.0004134106674438939,
+      "loss": 3.2938,
+      "step": 53450
+    },
+    {
+      "epoch": 15.584071312048474,
+      "grad_norm": 0.3774093985557556,
+      "learning_rate": 0.00041323579131448553,
+      "loss": 3.2882,
+      "step": 53500
+    },
+    {
+      "epoch": 15.59863668142624,
+      "grad_norm": 0.3849200904369354,
+      "learning_rate": 0.0004130609151850772,
+      "loss": 3.3071,
+      "step": 53550
+    },
+    {
+      "epoch": 15.61320205080401,
+      "grad_norm": 0.3753909468650818,
+      "learning_rate": 0.00041288603905566886,
+      "loss": 3.2861,
+      "step": 53600
+    },
+    {
+      "epoch": 15.627767420181776,
+      "grad_norm": 0.3853233754634857,
+      "learning_rate": 0.0004127111629262605,
+      "loss": 3.3072,
+      "step": 53650
+    },
+    {
+      "epoch": 15.642332789559543,
+      "grad_norm": 0.3988652229309082,
+      "learning_rate": 0.0004125362867968522,
+      "loss": 3.3094,
+      "step": 53700
+    },
+    {
+      "epoch": 15.656898158937311,
+      "grad_norm": 0.3708445429801941,
+      "learning_rate": 0.0004123614106674438,
+      "loss": 3.2967,
+      "step": 53750
+    },
+    {
+      "epoch": 15.671463528315078,
+      "grad_norm": 0.36685454845428467,
+      "learning_rate": 0.00041218653453803557,
+      "loss": 3.2913,
+      "step": 53800
+    },
+    {
+      "epoch": 15.686028897692845,
+      "grad_norm": 0.38278666138648987,
+      "learning_rate": 0.0004120116584086272,
+      "loss": 3.2861,
+      "step": 53850
+    },
+    {
+      "epoch": 15.700594267070613,
+      "grad_norm": 0.384741872549057,
+      "learning_rate": 0.0004118367822792189,
+      "loss": 3.304,
+      "step": 53900
+    },
+    {
+      "epoch": 15.71515963644838,
+      "grad_norm": 0.3768286108970642,
+      "learning_rate": 0.00041166190614981053,
+      "loss": 3.2982,
+      "step": 53950
+    },
+    {
+      "epoch": 15.729725005826147,
+      "grad_norm": 0.3943612575531006,
+      "learning_rate": 0.00041148703002040217,
+      "loss": 3.2996,
+      "step": 54000
+    },
+    {
+      "epoch": 15.729725005826147,
+      "eval_accuracy": 0.3718921486386314,
+      "eval_loss": 3.544917345046997,
+      "eval_runtime": 220.4654,
+      "eval_samples_per_second": 75.499,
+      "eval_steps_per_second": 4.722,
+      "step": 54000
+    },
+    {
+      "epoch": 15.744290375203915,
+      "grad_norm": 0.3631158769130707,
+      "learning_rate": 0.00041131215389099386,
+      "loss": 3.2903,
+      "step": 54050
+    },
+    {
+      "epoch": 15.758855744581682,
+      "grad_norm": 0.40076887607574463,
+      "learning_rate": 0.0004111372777615855,
+      "loss": 3.3015,
+      "step": 54100
+    },
+    {
+      "epoch": 15.77342111395945,
+      "grad_norm": 0.3838764429092407,
+      "learning_rate": 0.0004109624016321772,
+      "loss": 3.2982,
+      "step": 54150
+    },
+    {
+      "epoch": 15.787986483337217,
+      "grad_norm": 0.3836144804954529,
+      "learning_rate": 0.0004107875255027688,
+      "loss": 3.3118,
+      "step": 54200
+    },
+    {
+      "epoch": 15.802551852714984,
+      "grad_norm": 0.39159563183784485,
+      "learning_rate": 0.00041061264937336045,
+      "loss": 3.2957,
+      "step": 54250
+    },
+    {
+      "epoch": 15.817117222092753,
+      "grad_norm": 0.3700462281703949,
+      "learning_rate": 0.0004104377732439522,
+      "loss": 3.3081,
+      "step": 54300
+    },
+    {
+      "epoch": 15.83168259147052,
+      "grad_norm": 0.37243711948394775,
+      "learning_rate": 0.00041026289711454384,
+      "loss": 3.3134,
+      "step": 54350
+    },
+    {
+      "epoch": 15.846247960848288,
+      "grad_norm": 0.38975927233695984,
+      "learning_rate": 0.0004100880209851355,
+      "loss": 3.3046,
+      "step": 54400
+    },
+    {
+      "epoch": 15.860813330226055,
+      "grad_norm": 0.39330175518989563,
+      "learning_rate": 0.00040991314485572716,
+      "loss": 3.3103,
+      "step": 54450
+    },
+    {
+      "epoch": 15.875378699603822,
+      "grad_norm": 0.36677080392837524,
+      "learning_rate": 0.00040973826872631885,
+      "loss": 3.3041,
+      "step": 54500
+    },
+    {
+      "epoch": 15.88994406898159,
+      "grad_norm": 0.38371795415878296,
+      "learning_rate": 0.0004095633925969105,
+      "loss": 3.3192,
+      "step": 54550
+    },
+    {
+      "epoch": 15.904509438359357,
+      "grad_norm": 0.37720179557800293,
+      "learning_rate": 0.0004093885164675021,
+      "loss": 3.3182,
+      "step": 54600
+    },
+    {
+      "epoch": 15.919074807737124,
+      "grad_norm": 0.372707724571228,
+      "learning_rate": 0.0004092136403380938,
+      "loss": 3.3182,
+      "step": 54650
+    },
+    {
+      "epoch": 15.933640177114892,
+      "grad_norm": 0.38466477394104004,
+      "learning_rate": 0.00040903876420868545,
+      "loss": 3.3171,
+      "step": 54700
+    },
+    {
+      "epoch": 15.948205546492659,
+      "grad_norm": 0.4322209656238556,
+      "learning_rate": 0.00040886388807927714,
+      "loss": 3.3102,
+      "step": 54750
+    },
+    {
+      "epoch": 15.962770915870426,
+      "grad_norm": 0.3643110394477844,
+      "learning_rate": 0.00040868901194986883,
+      "loss": 3.3154,
+      "step": 54800
+    },
+    {
+      "epoch": 15.977336285248194,
+      "grad_norm": 0.3549572229385376,
+      "learning_rate": 0.00040851413582046047,
+      "loss": 3.3118,
+      "step": 54850
+    },
+    {
+      "epoch": 15.991901654625961,
+      "grad_norm": 0.35710573196411133,
+      "learning_rate": 0.00040833925969105216,
+      "loss": 3.3088,
+      "step": 54900
+    },
+    {
+      "epoch": 16.006408762526217,
+      "grad_norm": 0.3736666738986969,
+      "learning_rate": 0.0004081643835616438,
+      "loss": 3.2679,
+      "step": 54950
+    },
+    {
+      "epoch": 16.020974131903984,
+      "grad_norm": 0.39405354857444763,
+      "learning_rate": 0.0004079895074322355,
+      "loss": 3.195,
+      "step": 55000
+    },
+    {
+      "epoch": 16.020974131903984,
+      "eval_accuracy": 0.37159434466371843,
+      "eval_loss": 3.5555102825164795,
+      "eval_runtime": 179.8603,
+      "eval_samples_per_second": 92.544,
+      "eval_steps_per_second": 5.788,
+      "step": 55000
+    },
+    {
+      "epoch": 16.035539501281754,
+      "grad_norm": 0.38940897583961487,
+      "learning_rate": 0.0004078146313028271,
+      "loss": 3.2109,
+      "step": 55050
+    },
+    {
+      "epoch": 16.05010487065952,
+      "grad_norm": 0.3882853388786316,
+      "learning_rate": 0.0004076397551734188,
+      "loss": 3.2045,
+      "step": 55100
+    },
+    {
+      "epoch": 16.064670240037287,
+      "grad_norm": 0.39605289697647095,
+      "learning_rate": 0.00040746487904401045,
+      "loss": 3.2356,
+      "step": 55150
+    },
+    {
+      "epoch": 16.079235609415054,
+      "grad_norm": 0.3754449486732483,
+      "learning_rate": 0.0004072900029146021,
+      "loss": 3.2203,
+      "step": 55200
+    },
+    {
+      "epoch": 16.09380097879282,
+      "grad_norm": 0.4028746783733368,
+      "learning_rate": 0.0004071151267851938,
+      "loss": 3.2177,
+      "step": 55250
+    },
+    {
+      "epoch": 16.10836634817059,
+      "grad_norm": 0.36337772011756897,
+      "learning_rate": 0.00040694025065578546,
+      "loss": 3.2235,
+      "step": 55300
+    },
+    {
+      "epoch": 16.122931717548358,
+      "grad_norm": 0.3819507360458374,
+      "learning_rate": 0.00040676537452637716,
+      "loss": 3.2269,
+      "step": 55350
+    },
+    {
+      "epoch": 16.137497086926125,
+      "grad_norm": 0.38409850001335144,
+      "learning_rate": 0.0004065904983969688,
+      "loss": 3.2293,
+      "step": 55400
+    },
+    {
+      "epoch": 16.15206245630389,
+      "grad_norm": 0.4041096866130829,
+      "learning_rate": 0.00040641562226756043,
+      "loss": 3.2422,
+      "step": 55450
+    },
+    {
+      "epoch": 16.16662782568166,
+      "grad_norm": 0.3929169774055481,
+      "learning_rate": 0.0004062407461381521,
+      "loss": 3.2401,
+      "step": 55500
+    },
+    {
+      "epoch": 16.181193195059425,
+      "grad_norm": 0.379218190908432,
+      "learning_rate": 0.00040606587000874375,
+      "loss": 3.2338,
+      "step": 55550
+    },
+    {
+      "epoch": 16.195758564437195,
+      "grad_norm": 0.39579394459724426,
+      "learning_rate": 0.00040589099387933544,
+      "loss": 3.2271,
+      "step": 55600
+    },
+    {
+      "epoch": 16.210323933814962,
+      "grad_norm": 0.38522908091545105,
+      "learning_rate": 0.0004057161177499271,
+      "loss": 3.2319,
+      "step": 55650
+    },
+    {
+      "epoch": 16.22488930319273,
+      "grad_norm": 0.3886246085166931,
+      "learning_rate": 0.0004055412416205187,
+      "loss": 3.25,
+      "step": 55700
+    },
+    {
+      "epoch": 16.239454672570496,
+      "grad_norm": 0.387268990278244,
+      "learning_rate": 0.0004053663654911104,
+      "loss": 3.2485,
+      "step": 55750
+    },
+    {
+      "epoch": 16.254020041948262,
+      "grad_norm": 0.3706577718257904,
+      "learning_rate": 0.0004051914893617021,
+      "loss": 3.2549,
+      "step": 55800
+    },
+    {
+      "epoch": 16.268585411326033,
+      "grad_norm": 0.36555173993110657,
+      "learning_rate": 0.0004050166132322938,
+      "loss": 3.2546,
+      "step": 55850
+    },
+    {
+      "epoch": 16.2831507807038,
+      "grad_norm": 0.4174744486808777,
+      "learning_rate": 0.0004048417371028854,
+      "loss": 3.2461,
+      "step": 55900
+    },
+    {
+      "epoch": 16.297716150081566,
+      "grad_norm": 0.3815324604511261,
+      "learning_rate": 0.0004046668609734771,
+      "loss": 3.2584,
+      "step": 55950
+    },
+    {
+      "epoch": 16.312281519459333,
+      "grad_norm": 0.3781425654888153,
+      "learning_rate": 0.00040449198484406875,
+      "loss": 3.2685,
+      "step": 56000
+    },
+    {
+      "epoch": 16.312281519459333,
+      "eval_accuracy": 0.3717649382553484,
+      "eval_loss": 3.551072835922241,
+      "eval_runtime": 179.583,
+      "eval_samples_per_second": 92.687,
+      "eval_steps_per_second": 5.797,
+      "step": 56000
+    },
+    {
+      "epoch": 16.3268468888371,
+      "grad_norm": 0.3846278190612793,
+      "learning_rate": 0.0004043171087146604,
+      "loss": 3.2671,
+      "step": 56050
+    },
+    {
+      "epoch": 16.34141225821487,
+      "grad_norm": 0.3843114674091339,
+      "learning_rate": 0.0004041422325852521,
+      "loss": 3.2572,
+      "step": 56100
+    },
+    {
+      "epoch": 16.355977627592637,
+      "grad_norm": 0.3832460343837738,
+      "learning_rate": 0.0004039673564558437,
+      "loss": 3.2482,
+      "step": 56150
+    },
+    {
+      "epoch": 16.370542996970403,
+      "grad_norm": 0.39614608883857727,
+      "learning_rate": 0.0004037924803264354,
+      "loss": 3.2568,
+      "step": 56200
+    },
+    {
+      "epoch": 16.38510836634817,
+      "grad_norm": 0.4128139615058899,
+      "learning_rate": 0.00040361760419702704,
+      "loss": 3.2602,
+      "step": 56250
+    },
+    {
+      "epoch": 16.399673735725937,
+      "grad_norm": 0.41927552223205566,
+      "learning_rate": 0.00040344272806761873,
+      "loss": 3.2699,
+      "step": 56300
+    },
+    {
+      "epoch": 16.414239105103704,
+      "grad_norm": 0.4142034351825714,
+      "learning_rate": 0.0004032678519382104,
+      "loss": 3.2651,
+      "step": 56350
+    },
+    {
+      "epoch": 16.428804474481474,
+      "grad_norm": 0.4234794080257416,
+      "learning_rate": 0.00040309297580880206,
+      "loss": 3.2693,
+      "step": 56400
+    },
+    {
+      "epoch": 16.44336984385924,
+      "grad_norm": 0.379566490650177,
+      "learning_rate": 0.00040291809967939375,
+      "loss": 3.259,
+      "step": 56450
+    },
+    {
+      "epoch": 16.457935213237008,
+      "grad_norm": 0.3937167525291443,
+      "learning_rate": 0.0004027432235499854,
+      "loss": 3.2841,
+      "step": 56500
+    },
+    {
+      "epoch": 16.472500582614774,
+      "grad_norm": 0.386248379945755,
+      "learning_rate": 0.0004025683474205771,
+      "loss": 3.277,
+      "step": 56550
+    },
+    {
+      "epoch": 16.48706595199254,
+      "grad_norm": 0.38750800490379333,
+      "learning_rate": 0.0004023934712911687,
+      "loss": 3.2801,
+      "step": 56600
+    },
+    {
+      "epoch": 16.50163132137031,
+      "grad_norm": 0.39586499333381653,
+      "learning_rate": 0.00040221859516176035,
+      "loss": 3.2722,
+      "step": 56650
+    },
+    {
+      "epoch": 16.516196690748078,
+      "grad_norm": 0.37789252400398254,
+      "learning_rate": 0.00040204371903235204,
+      "loss": 3.2748,
+      "step": 56700
+    },
+    {
+      "epoch": 16.530762060125845,
+      "grad_norm": 0.3938862085342407,
+      "learning_rate": 0.0004018688429029437,
+      "loss": 3.277,
+      "step": 56750
+    },
+    {
+      "epoch": 16.54532742950361,
+      "grad_norm": 0.3977769613265991,
+      "learning_rate": 0.0004016939667735354,
+      "loss": 3.291,
+      "step": 56800
+    },
+    {
+      "epoch": 16.55989279888138,
+      "grad_norm": 0.3525155782699585,
+      "learning_rate": 0.00040151909064412705,
+      "loss": 3.2735,
+      "step": 56850
+    },
+    {
+      "epoch": 16.57445816825915,
+      "grad_norm": 0.362099826335907,
+      "learning_rate": 0.0004013442145147187,
+      "loss": 3.2917,
+      "step": 56900
+    },
+    {
+      "epoch": 16.589023537636916,
+      "grad_norm": 0.37509220838546753,
+      "learning_rate": 0.0004011693383853104,
+      "loss": 3.2782,
+      "step": 56950
+    },
+    {
+      "epoch": 16.603588907014682,
+      "grad_norm": 0.3895016312599182,
+      "learning_rate": 0.000400994462255902,
+      "loss": 3.2776,
+      "step": 57000
+    },
+    {
+      "epoch": 16.603588907014682,
+      "eval_accuracy": 0.3718954405894003,
+      "eval_loss": 3.5490047931671143,
+      "eval_runtime": 185.9028,
+      "eval_samples_per_second": 89.536,
+      "eval_steps_per_second": 5.6,
+      "step": 57000
+    },
+    {
+      "epoch": 16.61815427639245,
+      "grad_norm": 0.3511142432689667,
+      "learning_rate": 0.0004008195861264937,
+      "loss": 3.2816,
+      "step": 57050
+    },
+    {
+      "epoch": 16.632719645770216,
+      "grad_norm": 0.42816150188446045,
+      "learning_rate": 0.00040064470999708534,
+      "loss": 3.2748,
+      "step": 57100
+    },
+    {
+      "epoch": 16.647285015147983,
+      "grad_norm": 0.370182067155838,
+      "learning_rate": 0.00040046983386767703,
+      "loss": 3.2864,
+      "step": 57150
+    },
+    {
+      "epoch": 16.661850384525753,
+      "grad_norm": 0.39222970604896545,
+      "learning_rate": 0.00040029495773826867,
+      "loss": 3.2856,
+      "step": 57200
+    },
+    {
+      "epoch": 16.67641575390352,
+      "grad_norm": 0.3937409818172455,
+      "learning_rate": 0.0004001200816088603,
+      "loss": 3.2803,
+      "step": 57250
+    },
+    {
+      "epoch": 16.690981123281286,
+      "grad_norm": 0.38916105031967163,
+      "learning_rate": 0.00039994520547945205,
+      "loss": 3.2942,
+      "step": 57300
+    },
+    {
+      "epoch": 16.705546492659053,
+      "grad_norm": 0.37478119134902954,
+      "learning_rate": 0.0003997703293500437,
+      "loss": 3.3025,
+      "step": 57350
+    },
+    {
+      "epoch": 16.72011186203682,
+      "grad_norm": 0.3683931827545166,
+      "learning_rate": 0.0003995954532206354,
+      "loss": 3.2842,
+      "step": 57400
+    },
+    {
+      "epoch": 16.73467723141459,
+      "grad_norm": 0.4007303714752197,
+      "learning_rate": 0.000399420577091227,
+      "loss": 3.2829,
+      "step": 57450
+    },
+    {
+      "epoch": 16.749242600792357,
+      "grad_norm": 0.3843965232372284,
+      "learning_rate": 0.00039924570096181865,
+      "loss": 3.2901,
+      "step": 57500
+    },
+    {
+      "epoch": 16.763807970170124,
+      "grad_norm": 0.3941800594329834,
+      "learning_rate": 0.00039907082483241034,
+      "loss": 3.2858,
+      "step": 57550
+    },
+    {
+      "epoch": 16.77837333954789,
+      "grad_norm": 0.37438079714775085,
+      "learning_rate": 0.000398895948703002,
+      "loss": 3.2916,
+      "step": 57600
+    },
+    {
+      "epoch": 16.792938708925657,
+      "grad_norm": 0.3703000545501709,
+      "learning_rate": 0.00039872107257359367,
+      "loss": 3.3016,
+      "step": 57650
+    },
+    {
+      "epoch": 16.807504078303424,
+      "grad_norm": 0.3948332369327545,
+      "learning_rate": 0.0003985461964441853,
+      "loss": 3.3057,
+      "step": 57700
+    },
+    {
+      "epoch": 16.822069447681194,
+      "grad_norm": 0.38669082522392273,
+      "learning_rate": 0.00039837132031477694,
+      "loss": 3.2897,
+      "step": 57750
+    },
+    {
+      "epoch": 16.83663481705896,
+      "grad_norm": 0.3628772497177124,
+      "learning_rate": 0.0003981964441853687,
+      "loss": 3.2891,
+      "step": 57800
+    },
+    {
+      "epoch": 16.851200186436728,
+      "grad_norm": 0.39237385988235474,
+      "learning_rate": 0.0003980215680559603,
+      "loss": 3.3021,
+      "step": 57850
+    },
+    {
+      "epoch": 16.865765555814495,
+      "grad_norm": 0.3908953070640564,
+      "learning_rate": 0.000397846691926552,
+      "loss": 3.2962,
+      "step": 57900
+    },
+    {
+      "epoch": 16.88033092519226,
+      "grad_norm": 0.3867229223251343,
+      "learning_rate": 0.00039767181579714365,
+      "loss": 3.2861,
+      "step": 57950
+    },
+    {
+      "epoch": 16.89489629457003,
+      "grad_norm": 0.3902886211872101,
+      "learning_rate": 0.00039749693966773534,
+      "loss": 3.3034,
+      "step": 58000
+    },
+    {
+      "epoch": 16.89489629457003,
+      "eval_accuracy": 0.3726240198363548,
+      "eval_loss": 3.5381805896759033,
+      "eval_runtime": 441.5045,
+      "eval_samples_per_second": 37.701,
+      "eval_steps_per_second": 2.358,
+      "step": 58000
+    },
+    {
+      "epoch": 16.9094616639478,
+      "grad_norm": 0.38796380162239075,
+      "learning_rate": 0.00039732206353832697,
+      "loss": 3.2994,
+      "step": 58050
+    },
+    {
+      "epoch": 16.924027033325565,
+      "grad_norm": 0.35192742943763733,
+      "learning_rate": 0.0003971471874089186,
+      "loss": 3.2959,
+      "step": 58100
+    },
+    {
+      "epoch": 16.938592402703332,
+      "grad_norm": 0.372641384601593,
+      "learning_rate": 0.0003969723112795103,
+      "loss": 3.3026,
+      "step": 58150
+    },
+    {
+      "epoch": 16.9531577720811,
+      "grad_norm": 0.37450307607650757,
+      "learning_rate": 0.00039679743515010194,
+      "loss": 3.3098,
+      "step": 58200
+    },
+    {
+      "epoch": 16.96772314145887,
+      "grad_norm": 0.38844752311706543,
+      "learning_rate": 0.0003966225590206937,
+      "loss": 3.3031,
+      "step": 58250
+    },
+    {
+      "epoch": 16.982288510836636,
+      "grad_norm": 0.37731024622917175,
+      "learning_rate": 0.0003964476828912853,
+      "loss": 3.3081,
+      "step": 58300
+    },
+    {
+      "epoch": 16.996853880214402,
+      "grad_norm": 0.37375837564468384,
+      "learning_rate": 0.00039627280676187695,
+      "loss": 3.2973,
+      "step": 58350
+    },
+    {
+      "epoch": 17.01136098811466,
+      "grad_norm": 0.4000365436077118,
+      "learning_rate": 0.00039609793063246864,
+      "loss": 3.2205,
+      "step": 58400
+    },
+    {
+      "epoch": 17.025926357492427,
+      "grad_norm": 0.3670046031475067,
+      "learning_rate": 0.0003959230545030603,
+      "loss": 3.1889,
+      "step": 58450
+    },
+    {
+      "epoch": 17.040491726870194,
+      "grad_norm": 0.39600327610969543,
+      "learning_rate": 0.00039574817837365197,
+      "loss": 3.2056,
+      "step": 58500
+    },
+    {
+      "epoch": 17.05505709624796,
+      "grad_norm": 0.38830217719078064,
+      "learning_rate": 0.0003955733022442436,
+      "loss": 3.2047,
+      "step": 58550
+    },
+    {
+      "epoch": 17.069622465625727,
+      "grad_norm": 0.394195556640625,
+      "learning_rate": 0.0003953984261148353,
+      "loss": 3.2156,
+      "step": 58600
+    },
+    {
+      "epoch": 17.084187835003497,
+      "grad_norm": 0.3784361183643341,
+      "learning_rate": 0.00039522354998542693,
+      "loss": 3.2088,
+      "step": 58650
+    },
+    {
+      "epoch": 17.098753204381264,
+      "grad_norm": 0.4057703912258148,
+      "learning_rate": 0.00039504867385601857,
+      "loss": 3.2158,
+      "step": 58700
+    },
+    {
+      "epoch": 17.11331857375903,
+      "grad_norm": 0.37357842922210693,
+      "learning_rate": 0.0003948737977266103,
+      "loss": 3.2155,
+      "step": 58750
+    },
+    {
+      "epoch": 17.127883943136798,
+      "grad_norm": 0.3923245370388031,
+      "learning_rate": 0.00039469892159720195,
+      "loss": 3.2217,
+      "step": 58800
+    },
+    {
+      "epoch": 17.142449312514564,
+      "grad_norm": 0.4075673520565033,
+      "learning_rate": 0.00039452404546779364,
+      "loss": 3.2176,
+      "step": 58850
+    },
+    {
+      "epoch": 17.15701468189233,
+      "grad_norm": 0.37942689657211304,
+      "learning_rate": 0.0003943491693383853,
+      "loss": 3.2212,
+      "step": 58900
+    },
+    {
+      "epoch": 17.1715800512701,
+      "grad_norm": 0.39208337664604187,
+      "learning_rate": 0.0003941742932089769,
+      "loss": 3.2204,
+      "step": 58950
+    },
+    {
+      "epoch": 17.18614542064787,
+      "grad_norm": 0.3947177529335022,
+      "learning_rate": 0.0003939994170795686,
+      "loss": 3.2318,
+      "step": 59000
+    },
+    {
+      "epoch": 17.18614542064787,
+      "eval_accuracy": 0.37187474832742445,
+      "eval_loss": 3.5549113750457764,
+      "eval_runtime": 179.6498,
+      "eval_samples_per_second": 92.653,
+      "eval_steps_per_second": 5.795,
+      "step": 59000
+    },
+    {
+      "epoch": 17.200710790025635,
+      "grad_norm": 0.4263794720172882,
+      "learning_rate": 0.00039382454095016024,
+      "loss": 3.2376,
+      "step": 59050
+    },
+    {
+      "epoch": 17.215276159403402,
+      "grad_norm": 0.37080931663513184,
+      "learning_rate": 0.00039364966482075193,
+      "loss": 3.2378,
+      "step": 59100
+    },
+    {
+      "epoch": 17.22984152878117,
+      "grad_norm": 0.39057183265686035,
+      "learning_rate": 0.00039347478869134356,
+      "loss": 3.2199,
+      "step": 59150
+    },
+    {
+      "epoch": 17.24440689815894,
+      "grad_norm": 0.41177624464035034,
+      "learning_rate": 0.0003932999125619353,
+      "loss": 3.2296,
+      "step": 59200
+    },
+    {
+      "epoch": 17.258972267536706,
+      "grad_norm": 0.4065467417240143,
+      "learning_rate": 0.00039312503643252695,
+      "loss": 3.2364,
+      "step": 59250
+    },
+    {
+      "epoch": 17.273537636914472,
+      "grad_norm": 0.37535977363586426,
+      "learning_rate": 0.0003929501603031186,
+      "loss": 3.2309,
+      "step": 59300
+    },
+    {
+      "epoch": 17.28810300629224,
+      "grad_norm": 0.4139235019683838,
+      "learning_rate": 0.00039277528417371027,
+      "loss": 3.2482,
+      "step": 59350
+    },
+    {
+      "epoch": 17.302668375670006,
+      "grad_norm": 0.3840341866016388,
+      "learning_rate": 0.0003926004080443019,
+      "loss": 3.2422,
+      "step": 59400
+    },
+    {
+      "epoch": 17.317233745047773,
+      "grad_norm": 0.3817002475261688,
+      "learning_rate": 0.0003924255319148936,
+      "loss": 3.2387,
+      "step": 59450
+    },
+    {
+      "epoch": 17.331799114425543,
+      "grad_norm": 0.3794045150279999,
+      "learning_rate": 0.00039225065578548523,
+      "loss": 3.2531,
+      "step": 59500
+    },
+    {
+      "epoch": 17.34636448380331,
+      "grad_norm": 0.3869137465953827,
+      "learning_rate": 0.00039207577965607687,
+      "loss": 3.2437,
+      "step": 59550
+    },
+    {
+      "epoch": 17.360929853181077,
+      "grad_norm": 0.39294636249542236,
+      "learning_rate": 0.00039190090352666856,
+      "loss": 3.247,
+      "step": 59600
+    },
+    {
+      "epoch": 17.375495222558843,
+      "grad_norm": 0.37759485840797424,
+      "learning_rate": 0.0003917260273972602,
+      "loss": 3.2565,
+      "step": 59650
+    },
+    {
+      "epoch": 17.39006059193661,
+      "grad_norm": 0.379200279712677,
+      "learning_rate": 0.00039155115126785194,
+      "loss": 3.245,
+      "step": 59700
+    },
+    {
+      "epoch": 17.40462596131438,
+      "grad_norm": 0.40147554874420166,
+      "learning_rate": 0.0003913762751384436,
+      "loss": 3.2482,
+      "step": 59750
+    },
+    {
+      "epoch": 17.419191330692147,
+      "grad_norm": 0.38646212220191956,
+      "learning_rate": 0.00039120139900903527,
+      "loss": 3.254,
+      "step": 59800
+    },
+    {
+      "epoch": 17.433756700069914,
+      "grad_norm": 0.3718118965625763,
+      "learning_rate": 0.0003910265228796269,
+      "loss": 3.2701,
+      "step": 59850
+    },
+    {
+      "epoch": 17.44832206944768,
+      "grad_norm": 0.4207517206668854,
+      "learning_rate": 0.00039085164675021854,
+      "loss": 3.2703,
+      "step": 59900
+    },
+    {
+      "epoch": 17.462887438825447,
+      "grad_norm": 0.41934168338775635,
+      "learning_rate": 0.00039067677062081023,
+      "loss": 3.2493,
+      "step": 59950
+    },
+    {
+      "epoch": 17.477452808203218,
+      "grad_norm": 0.377540647983551,
+      "learning_rate": 0.00039050189449140187,
+      "loss": 3.2693,
+      "step": 60000
+    },
+    {
+      "epoch": 17.477452808203218,
+      "eval_accuracy": 0.37255406588251616,
+      "eval_loss": 3.5508663654327393,
+      "eval_runtime": 179.5955,
+      "eval_samples_per_second": 92.681,
+      "eval_steps_per_second": 5.796,
+      "step": 60000
+    },
+    {
+      "epoch": 17.492018177580984,
+      "grad_norm": 0.39113548398017883,
+      "learning_rate": 0.00039032701836199356,
+      "loss": 3.2626,
+      "step": 60050
+    },
+    {
+      "epoch": 17.50658354695875,
+      "grad_norm": 0.4056381285190582,
+      "learning_rate": 0.0003901521422325852,
+      "loss": 3.263,
+      "step": 60100
+    },
+    {
+      "epoch": 17.521148916336518,
+      "grad_norm": 0.3636176884174347,
+      "learning_rate": 0.00038997726610317683,
+      "loss": 3.2644,
+      "step": 60150
+    },
+    {
+      "epoch": 17.535714285714285,
+      "grad_norm": 0.3688655495643616,
+      "learning_rate": 0.0003898023899737686,
+      "loss": 3.2731,
+      "step": 60200
+    },
+    {
+      "epoch": 17.55027965509205,
+      "grad_norm": 0.39269739389419556,
+      "learning_rate": 0.0003896275138443602,
+      "loss": 3.2603,
+      "step": 60250
+    },
+    {
+      "epoch": 17.56484502446982,
+      "grad_norm": 0.3798394799232483,
+      "learning_rate": 0.0003894526377149519,
+      "loss": 3.2662,
+      "step": 60300
+    },
+    {
+      "epoch": 17.57941039384759,
+      "grad_norm": 0.3824335038661957,
+      "learning_rate": 0.00038927776158554354,
+      "loss": 3.2774,
+      "step": 60350
+    },
+    {
+      "epoch": 17.593975763225355,
+      "grad_norm": 0.38308337330818176,
+      "learning_rate": 0.0003891028854561352,
+      "loss": 3.2772,
+      "step": 60400
+    },
+    {
+      "epoch": 17.608541132603122,
+      "grad_norm": 0.3748604655265808,
+      "learning_rate": 0.00038892800932672686,
+      "loss": 3.2586,
+      "step": 60450
+    },
+    {
+      "epoch": 17.62310650198089,
+      "grad_norm": 0.40975135564804077,
+      "learning_rate": 0.0003887531331973185,
+      "loss": 3.2655,
+      "step": 60500
+    },
+    {
+      "epoch": 17.63767187135866,
+      "grad_norm": 0.38740789890289307,
+      "learning_rate": 0.0003885782570679102,
+      "loss": 3.2761,
+      "step": 60550
+    },
+    {
+      "epoch": 17.652237240736426,
+      "grad_norm": 0.3646203577518463,
+      "learning_rate": 0.0003884033809385018,
+      "loss": 3.2751,
+      "step": 60600
+    },
+    {
+      "epoch": 17.666802610114193,
+      "grad_norm": 0.37517863512039185,
+      "learning_rate": 0.00038822850480909357,
+      "loss": 3.2792,
+      "step": 60650
+    },
+    {
+      "epoch": 17.68136797949196,
+      "grad_norm": 0.3650130033493042,
+      "learning_rate": 0.0003880536286796852,
+      "loss": 3.2693,
+      "step": 60700
+    },
+    {
+      "epoch": 17.695933348869726,
+      "grad_norm": 0.4000101089477539,
+      "learning_rate": 0.00038787875255027684,
+      "loss": 3.2749,
+      "step": 60750
+    },
+    {
+      "epoch": 17.710498718247496,
+      "grad_norm": 0.35344168543815613,
+      "learning_rate": 0.00038770387642086853,
+      "loss": 3.2679,
+      "step": 60800
+    },
+    {
+      "epoch": 17.725064087625263,
+      "grad_norm": 0.40958935022354126,
+      "learning_rate": 0.00038752900029146017,
+      "loss": 3.2857,
+      "step": 60850
+    },
+    {
+      "epoch": 17.73962945700303,
+      "grad_norm": 0.377948135137558,
+      "learning_rate": 0.00038735412416205186,
+      "loss": 3.2691,
+      "step": 60900
+    },
+    {
+      "epoch": 17.754194826380797,
+      "grad_norm": 0.4192025065422058,
+      "learning_rate": 0.0003871792480326435,
+      "loss": 3.2764,
+      "step": 60950
+    },
+    {
+      "epoch": 17.768760195758563,
+      "grad_norm": 0.3829701244831085,
+      "learning_rate": 0.00038700437190323513,
+      "loss": 3.2845,
+      "step": 61000
+    },
+    {
+      "epoch": 17.768760195758563,
+      "eval_accuracy": 0.3727979053787536,
+      "eval_loss": 3.5411858558654785,
+      "eval_runtime": 179.8099,
+      "eval_samples_per_second": 92.57,
+      "eval_steps_per_second": 5.789,
+      "step": 61000
+    },
+    {
+      "epoch": 17.78332556513633,
+      "grad_norm": 0.34948283433914185,
+      "learning_rate": 0.0003868294957738268,
+      "loss": 3.2742,
+      "step": 61050
+    },
+    {
+      "epoch": 17.7978909345141,
+      "grad_norm": 0.3973924219608307,
+      "learning_rate": 0.00038665461964441846,
+      "loss": 3.2799,
+      "step": 61100
+    },
+    {
+      "epoch": 17.812456303891867,
+      "grad_norm": 0.39604613184928894,
+      "learning_rate": 0.0003864797435150102,
+      "loss": 3.2907,
+      "step": 61150
+    },
+    {
+      "epoch": 17.827021673269634,
+      "grad_norm": 0.3890770971775055,
+      "learning_rate": 0.00038630486738560184,
+      "loss": 3.2818,
+      "step": 61200
+    },
+    {
+      "epoch": 17.8415870426474,
+      "grad_norm": 0.39360764622688293,
+      "learning_rate": 0.00038612999125619353,
+      "loss": 3.2866,
+      "step": 61250
+    },
+    {
+      "epoch": 17.856152412025168,
+      "grad_norm": 0.3879394233226776,
+      "learning_rate": 0.00038595511512678517,
+      "loss": 3.2828,
+      "step": 61300
+    },
+    {
+      "epoch": 17.870717781402938,
+      "grad_norm": 0.3946910500526428,
+      "learning_rate": 0.0003857802389973768,
+      "loss": 3.2841,
+      "step": 61350
+    },
+    {
+      "epoch": 17.885283150780705,
+      "grad_norm": 0.3722352981567383,
+      "learning_rate": 0.0003856053628679685,
+      "loss": 3.2906,
+      "step": 61400
+    },
+    {
+      "epoch": 17.89984852015847,
+      "grad_norm": 0.37943729758262634,
+      "learning_rate": 0.00038543048673856013,
+      "loss": 3.2828,
+      "step": 61450
+    },
+    {
+      "epoch": 17.914413889536238,
+      "grad_norm": 0.3946760594844818,
+      "learning_rate": 0.0003852556106091518,
+      "loss": 3.2874,
+      "step": 61500
+    },
+    {
+      "epoch": 17.928979258914005,
+      "grad_norm": 0.38303500413894653,
+      "learning_rate": 0.00038508073447974346,
+      "loss": 3.2743,
+      "step": 61550
+    },
+    {
+      "epoch": 17.943544628291775,
+      "grad_norm": 0.35602617263793945,
+      "learning_rate": 0.0003849058583503351,
+      "loss": 3.2839,
+      "step": 61600
+    },
+    {
+      "epoch": 17.958109997669542,
+      "grad_norm": 0.364681601524353,
+      "learning_rate": 0.00038473098222092684,
+      "loss": 3.2894,
+      "step": 61650
+    },
+    {
+      "epoch": 17.97267536704731,
+      "grad_norm": 0.3929082155227661,
+      "learning_rate": 0.0003845561060915185,
+      "loss": 3.2816,
+      "step": 61700
+    },
+    {
+      "epoch": 17.987240736425075,
+      "grad_norm": 0.37357085943222046,
+      "learning_rate": 0.00038438122996211016,
+      "loss": 3.295,
+      "step": 61750
+    },
+    {
+      "epoch": 18.001747844325333,
+      "grad_norm": 0.4023423492908478,
+      "learning_rate": 0.0003842063538327018,
+      "loss": 3.2778,
+      "step": 61800
+    },
+    {
+      "epoch": 18.0163132137031,
+      "grad_norm": 0.3764852285385132,
+      "learning_rate": 0.0003840314777032935,
+      "loss": 3.175,
+      "step": 61850
+    },
+    {
+      "epoch": 18.030878583080867,
+      "grad_norm": 0.4034103453159332,
+      "learning_rate": 0.0003838566015738851,
+      "loss": 3.1882,
+      "step": 61900
+    },
+    {
+      "epoch": 18.045443952458633,
+      "grad_norm": 0.3662955164909363,
+      "learning_rate": 0.00038368172544447676,
+      "loss": 3.1941,
+      "step": 61950
+    },
+    {
+      "epoch": 18.0600093218364,
+      "grad_norm": 0.3883397579193115,
+      "learning_rate": 0.00038350684931506845,
+      "loss": 3.2022,
+      "step": 62000
+    },
+    {
+      "epoch": 18.0600093218364,
+      "eval_accuracy": 0.3724488410275824,
+      "eval_loss": 3.5511727333068848,
+      "eval_runtime": 179.5651,
+      "eval_samples_per_second": 92.696,
+      "eval_steps_per_second": 5.797,
+      "step": 62000
+    },
+    {
+      "epoch": 18.07457469121417,
+      "grad_norm": 0.39406126737594604,
+      "learning_rate": 0.0003833319731856601,
+      "loss": 3.1855,
+      "step": 62050
+    },
+    {
+      "epoch": 18.089140060591937,
+      "grad_norm": 0.39789289236068726,
+      "learning_rate": 0.00038315709705625183,
+      "loss": 3.1935,
+      "step": 62100
+    },
+    {
+      "epoch": 18.103705429969704,
+      "grad_norm": 0.38348227739334106,
+      "learning_rate": 0.00038298222092684347,
+      "loss": 3.1978,
+      "step": 62150
+    },
+    {
+      "epoch": 18.11827079934747,
+      "grad_norm": 0.3813340663909912,
+      "learning_rate": 0.0003828073447974351,
+      "loss": 3.2017,
+      "step": 62200
+    },
+    {
+      "epoch": 18.132836168725238,
+      "grad_norm": 0.4284285008907318,
+      "learning_rate": 0.0003826324686680268,
+      "loss": 3.2196,
+      "step": 62250
+    },
+    {
+      "epoch": 18.147401538103008,
+      "grad_norm": 0.39629238843917847,
+      "learning_rate": 0.00038245759253861843,
+      "loss": 3.2077,
+      "step": 62300
+    },
+    {
+      "epoch": 18.161966907480775,
+      "grad_norm": 0.40169140696525574,
+      "learning_rate": 0.0003822827164092101,
+      "loss": 3.2137,
+      "step": 62350
+    },
+    {
+      "epoch": 18.17653227685854,
+      "grad_norm": 0.3855275511741638,
+      "learning_rate": 0.00038210784027980176,
+      "loss": 3.2136,
+      "step": 62400
+    },
+    {
+      "epoch": 18.191097646236308,
+      "grad_norm": 0.3961770236492157,
+      "learning_rate": 0.0003819329641503934,
+      "loss": 3.2086,
+      "step": 62450
+    },
+    {
+      "epoch": 18.205663015614075,
+      "grad_norm": 0.37826651334762573,
+      "learning_rate": 0.0003817580880209851,
+      "loss": 3.2079,
+      "step": 62500
+    },
+    {
+      "epoch": 18.22022838499184,
+      "grad_norm": 0.4296334385871887,
+      "learning_rate": 0.0003815832118915767,
+      "loss": 3.2111,
+      "step": 62550
+    },
+    {
+      "epoch": 18.234793754369612,
+      "grad_norm": 0.40578290820121765,
+      "learning_rate": 0.00038140833576216847,
+      "loss": 3.2258,
+      "step": 62600
+    },
+    {
+      "epoch": 18.24935912374738,
+      "grad_norm": 0.4326179027557373,
+      "learning_rate": 0.0003812334596327601,
+      "loss": 3.2318,
+      "step": 62650
+    },
+    {
+      "epoch": 18.263924493125145,
+      "grad_norm": 0.40364038944244385,
+      "learning_rate": 0.0003810585835033518,
+      "loss": 3.2235,
+      "step": 62700
+    },
+    {
+      "epoch": 18.278489862502912,
+      "grad_norm": 0.41583460569381714,
+      "learning_rate": 0.00038088370737394343,
+      "loss": 3.2385,
+      "step": 62750
+    },
+    {
+      "epoch": 18.29305523188068,
+      "grad_norm": 0.3775072693824768,
+      "learning_rate": 0.00038070883124453507,
+      "loss": 3.2353,
+      "step": 62800
+    },
+    {
+      "epoch": 18.30762060125845,
+      "grad_norm": 0.4191801846027374,
+      "learning_rate": 0.00038053395511512676,
+      "loss": 3.2385,
+      "step": 62850
+    },
+    {
+      "epoch": 18.322185970636216,
+      "grad_norm": 0.3996080458164215,
+      "learning_rate": 0.0003803590789857184,
+      "loss": 3.22,
+      "step": 62900
+    },
+    {
+      "epoch": 18.336751340013983,
+      "grad_norm": 0.3744161128997803,
+      "learning_rate": 0.0003801842028563101,
+      "loss": 3.242,
+      "step": 62950
+    },
+    {
+      "epoch": 18.35131670939175,
+      "grad_norm": 0.3929766118526459,
+      "learning_rate": 0.0003800093267269017,
+      "loss": 3.2434,
+      "step": 63000
+    },
+    {
+      "epoch": 18.35131670939175,
+      "eval_accuracy": 0.3726039154227306,
+      "eval_loss": 3.551494836807251,
+      "eval_runtime": 179.6728,
+      "eval_samples_per_second": 92.641,
+      "eval_steps_per_second": 5.794,
+      "step": 63000
+    },
+    {
+      "epoch": 18.365882078769516,
+      "grad_norm": 0.37301939725875854,
+      "learning_rate": 0.00037983445059749335,
+      "loss": 3.2367,
+      "step": 63050
+    },
+    {
+      "epoch": 18.380447448147287,
+      "grad_norm": 0.3646920323371887,
+      "learning_rate": 0.0003796595744680851,
+      "loss": 3.2411,
+      "step": 63100
+    },
+    {
+      "epoch": 18.395012817525053,
+      "grad_norm": 0.41786086559295654,
+      "learning_rate": 0.00037948469833867674,
+      "loss": 3.238,
+      "step": 63150
+    },
+    {
+      "epoch": 18.40957818690282,
+      "grad_norm": 0.39375847578048706,
+      "learning_rate": 0.0003793098222092684,
+      "loss": 3.2392,
+      "step": 63200
+    },
+    {
+      "epoch": 18.424143556280587,
+      "grad_norm": 0.3906821310520172,
+      "learning_rate": 0.00037913494607986006,
+      "loss": 3.2303,
+      "step": 63250
+    },
+    {
+      "epoch": 18.438708925658354,
+      "grad_norm": 0.38584890961647034,
+      "learning_rate": 0.00037896006995045175,
+      "loss": 3.2507,
+      "step": 63300
+    },
+    {
+      "epoch": 18.45327429503612,
+      "grad_norm": 0.37560147047042847,
+      "learning_rate": 0.0003787851938210434,
+      "loss": 3.2401,
+      "step": 63350
+    },
+    {
+      "epoch": 18.46783966441389,
+      "grad_norm": 0.39870715141296387,
+      "learning_rate": 0.000378610317691635,
+      "loss": 3.2559,
+      "step": 63400
+    },
+    {
+      "epoch": 18.482405033791657,
+      "grad_norm": 0.416790634393692,
+      "learning_rate": 0.0003784354415622267,
+      "loss": 3.252,
+      "step": 63450
+    },
+    {
+      "epoch": 18.496970403169424,
+      "grad_norm": 0.3930261731147766,
+      "learning_rate": 0.00037826056543281835,
+      "loss": 3.2556,
+      "step": 63500
+    },
+    {
+      "epoch": 18.51153577254719,
+      "grad_norm": 0.38571596145629883,
+      "learning_rate": 0.0003780856893034101,
+      "loss": 3.2523,
+      "step": 63550
+    },
+    {
+      "epoch": 18.526101141924958,
+      "grad_norm": 0.3790442943572998,
+      "learning_rate": 0.00037791081317400173,
+      "loss": 3.2502,
+      "step": 63600
+    },
+    {
+      "epoch": 18.540666511302728,
+      "grad_norm": 0.40154215693473816,
+      "learning_rate": 0.00037773593704459337,
+      "loss": 3.2554,
+      "step": 63650
+    },
+    {
+      "epoch": 18.555231880680495,
+      "grad_norm": 0.3869607746601105,
+      "learning_rate": 0.00037756106091518506,
+      "loss": 3.2674,
+      "step": 63700
+    },
+    {
+      "epoch": 18.56979725005826,
+      "grad_norm": 0.36808493733406067,
+      "learning_rate": 0.0003773861847857767,
+      "loss": 3.2566,
+      "step": 63750
+    },
+    {
+      "epoch": 18.58436261943603,
+      "grad_norm": 0.4031069278717041,
+      "learning_rate": 0.0003772113086563684,
+      "loss": 3.2647,
+      "step": 63800
+    },
+    {
+      "epoch": 18.598927988813795,
+      "grad_norm": 0.39664480090141296,
+      "learning_rate": 0.00037703643252696,
+      "loss": 3.2611,
+      "step": 63850
+    },
+    {
+      "epoch": 18.613493358191565,
+      "grad_norm": 0.4211257994174957,
+      "learning_rate": 0.0003768615563975517,
+      "loss": 3.2466,
+      "step": 63900
+    },
+    {
+      "epoch": 18.628058727569332,
+      "grad_norm": 0.37485969066619873,
+      "learning_rate": 0.00037668668026814335,
+      "loss": 3.2698,
+      "step": 63950
+    },
+    {
+      "epoch": 18.6426240969471,
+      "grad_norm": 0.3820188343524933,
+      "learning_rate": 0.000376511804138735,
+      "loss": 3.2583,
+      "step": 64000
+    },
+    {
+      "epoch": 18.6426240969471,
+      "eval_accuracy": 0.37283764392732077,
+      "eval_loss": 3.542705535888672,
+      "eval_runtime": 179.7091,
+      "eval_samples_per_second": 92.622,
+      "eval_steps_per_second": 5.793,
+      "step": 64000
+    },
+    {
+      "epoch": 18.657189466324866,
+      "grad_norm": 0.3915201425552368,
+      "learning_rate": 0.00037633692800932673,
+      "loss": 3.2704,
+      "step": 64050
+    },
+    {
+      "epoch": 18.671754835702632,
+      "grad_norm": 0.36770007014274597,
+      "learning_rate": 0.00037616205187991837,
+      "loss": 3.2531,
+      "step": 64100
+    },
+    {
+      "epoch": 18.6863202050804,
+      "grad_norm": 0.4022904336452484,
+      "learning_rate": 0.00037598717575051006,
+      "loss": 3.2499,
+      "step": 64150
+    },
+    {
+      "epoch": 18.70088557445817,
+      "grad_norm": 0.36411207914352417,
+      "learning_rate": 0.0003758122996211017,
+      "loss": 3.2623,
+      "step": 64200
+    },
+    {
+      "epoch": 18.715450943835936,
+      "grad_norm": 0.37535756826400757,
+      "learning_rate": 0.00037563742349169333,
+      "loss": 3.2548,
+      "step": 64250
+    },
+    {
+      "epoch": 18.730016313213703,
+      "grad_norm": 0.3946349322795868,
+      "learning_rate": 0.000375462547362285,
+      "loss": 3.2668,
+      "step": 64300
+    },
+    {
+      "epoch": 18.74458168259147,
+      "grad_norm": 0.4044114053249359,
+      "learning_rate": 0.00037528767123287665,
+      "loss": 3.2717,
+      "step": 64350
+    },
+    {
+      "epoch": 18.759147051969236,
+      "grad_norm": 0.3657906949520111,
+      "learning_rate": 0.00037511279510346834,
+      "loss": 3.2569,
+      "step": 64400
+    },
+    {
+      "epoch": 18.773712421347007,
+      "grad_norm": 0.3859136402606964,
+      "learning_rate": 0.00037493791897406,
+      "loss": 3.2739,
+      "step": 64450
+    },
+    {
+      "epoch": 18.788277790724774,
+      "grad_norm": 0.38982921838760376,
+      "learning_rate": 0.0003747630428446516,
+      "loss": 3.2765,
+      "step": 64500
+    },
+    {
+      "epoch": 18.80284316010254,
+      "grad_norm": 0.3761852979660034,
+      "learning_rate": 0.00037458816671524336,
+      "loss": 3.2637,
+      "step": 64550
+    },
+    {
+      "epoch": 18.817408529480307,
+      "grad_norm": 0.3764474093914032,
+      "learning_rate": 0.000374413290585835,
+      "loss": 3.2715,
+      "step": 64600
+    },
+    {
+      "epoch": 18.831973898858074,
+      "grad_norm": 0.37012961506843567,
+      "learning_rate": 0.0003742384144564267,
+      "loss": 3.2756,
+      "step": 64650
+    },
+    {
+      "epoch": 18.846539268235844,
+      "grad_norm": 0.4159339964389801,
+      "learning_rate": 0.0003740635383270183,
+      "loss": 3.2728,
+      "step": 64700
+    },
+    {
+      "epoch": 18.86110463761361,
+      "grad_norm": 0.3688717484474182,
+      "learning_rate": 0.00037388866219761,
+      "loss": 3.2715,
+      "step": 64750
+    },
+    {
+      "epoch": 18.875670006991378,
+      "grad_norm": 0.4111153185367584,
+      "learning_rate": 0.00037371378606820165,
+      "loss": 3.283,
+      "step": 64800
+    },
+    {
+      "epoch": 18.890235376369144,
+      "grad_norm": 0.4147163927555084,
+      "learning_rate": 0.0003735389099387933,
+      "loss": 3.2711,
+      "step": 64850
+    },
+    {
+      "epoch": 18.90480074574691,
+      "grad_norm": 0.36633679270744324,
+      "learning_rate": 0.000373364033809385,
+      "loss": 3.2735,
+      "step": 64900
+    },
+    {
+      "epoch": 18.919366115124678,
+      "grad_norm": 0.3624868094921112,
+      "learning_rate": 0.0003731891576799766,
+      "loss": 3.2772,
+      "step": 64950
+    },
+    {
+      "epoch": 18.93393148450245,
+      "grad_norm": 0.38773536682128906,
+      "learning_rate": 0.00037301428155056836,
+      "loss": 3.2754,
+      "step": 65000
+    },
+    {
+      "epoch": 18.93393148450245,
+      "eval_accuracy": 0.3732179818107963,
+      "eval_loss": 3.5359609127044678,
+      "eval_runtime": 179.6247,
+      "eval_samples_per_second": 92.665,
+      "eval_steps_per_second": 5.795,
+      "step": 65000
+    },
+    {
+      "epoch": 18.948496853880215,
+      "grad_norm": 0.37369510531425476,
+      "learning_rate": 0.00037283940542116,
+      "loss": 3.2842,
+      "step": 65050
+    },
+    {
+      "epoch": 18.96306222325798,
+      "grad_norm": 0.4039534032344818,
+      "learning_rate": 0.00037266452929175163,
+      "loss": 3.2827,
+      "step": 65100
+    },
+    {
+      "epoch": 18.97762759263575,
+      "grad_norm": 0.36381030082702637,
+      "learning_rate": 0.0003724896531623433,
+      "loss": 3.2838,
+      "step": 65150
+    },
+    {
+      "epoch": 18.992192962013515,
+      "grad_norm": 0.38101911544799805,
+      "learning_rate": 0.00037231477703293496,
+      "loss": 3.277,
+      "step": 65200
+    },
+    {
+      "epoch": 19.006700069913773,
+      "grad_norm": 0.3940986692905426,
+      "learning_rate": 0.00037213990090352665,
+      "loss": 3.2168,
+      "step": 65250
+    },
+    {
+      "epoch": 19.02126543929154,
+      "grad_norm": 0.44007158279418945,
+      "learning_rate": 0.0003719650247741183,
+      "loss": 3.1807,
+      "step": 65300
+    },
+    {
+      "epoch": 19.035830808669306,
+      "grad_norm": 0.3865497410297394,
+      "learning_rate": 0.00037179014864471,
+      "loss": 3.1847,
+      "step": 65350
+    },
+    {
+      "epoch": 19.050396178047077,
+      "grad_norm": 0.40062960982322693,
+      "learning_rate": 0.0003716152725153016,
+      "loss": 3.1729,
+      "step": 65400
+    },
+    {
+      "epoch": 19.064961547424844,
+      "grad_norm": 0.3954075276851654,
+      "learning_rate": 0.00037144039638589325,
+      "loss": 3.1875,
+      "step": 65450
+    },
+    {
+      "epoch": 19.07952691680261,
+      "grad_norm": 0.37583020329475403,
+      "learning_rate": 0.000371265520256485,
+      "loss": 3.1852,
+      "step": 65500
+    },
+    {
+      "epoch": 19.094092286180377,
+      "grad_norm": 0.38721004128456116,
+      "learning_rate": 0.00037109064412707663,
+      "loss": 3.1877,
+      "step": 65550
+    },
+    {
+      "epoch": 19.108657655558144,
+      "grad_norm": 0.4319014847278595,
+      "learning_rate": 0.0003709157679976683,
+      "loss": 3.2013,
+      "step": 65600
+    },
+    {
+      "epoch": 19.123223024935914,
+      "grad_norm": 0.36834290623664856,
+      "learning_rate": 0.00037074089186825995,
+      "loss": 3.1977,
+      "step": 65650
+    },
+    {
+      "epoch": 19.13778839431368,
+      "grad_norm": 0.4034636318683624,
+      "learning_rate": 0.0003705660157388516,
+      "loss": 3.1942,
+      "step": 65700
+    },
+    {
+      "epoch": 19.152353763691448,
+      "grad_norm": 0.3813159763813019,
+      "learning_rate": 0.0003703911396094433,
+      "loss": 3.1803,
+      "step": 65750
+    },
+    {
+      "epoch": 19.166919133069214,
+      "grad_norm": 0.3532137870788574,
+      "learning_rate": 0.0003702162634800349,
+      "loss": 3.2012,
+      "step": 65800
+    },
+    {
+      "epoch": 19.18148450244698,
+      "grad_norm": 0.38538220524787903,
+      "learning_rate": 0.0003700413873506266,
+      "loss": 3.2056,
+      "step": 65850
+    },
+    {
+      "epoch": 19.196049871824748,
+      "grad_norm": 0.36290931701660156,
+      "learning_rate": 0.00036986651122121824,
+      "loss": 3.214,
+      "step": 65900
+    },
+    {
+      "epoch": 19.210615241202518,
+      "grad_norm": 0.4123310446739197,
+      "learning_rate": 0.00036969163509181,
+      "loss": 3.2063,
+      "step": 65950
+    },
+    {
+      "epoch": 19.225180610580285,
+      "grad_norm": 0.41633448004722595,
+      "learning_rate": 0.0003695167589624016,
+      "loss": 3.1981,
+      "step": 66000
+    },
+    {
+      "epoch": 19.225180610580285,
+      "eval_accuracy": 0.3726028572956977,
+      "eval_loss": 3.5563158988952637,
+      "eval_runtime": 179.6893,
+      "eval_samples_per_second": 92.632,
+      "eval_steps_per_second": 5.793,
+      "step": 66000
+    },
+    {
+      "epoch": 19.23974597995805,
+      "grad_norm": 0.4212048053741455,
+      "learning_rate": 0.00036934188283299326,
+      "loss": 3.221,
+      "step": 66050
+    },
+    {
+      "epoch": 19.25431134933582,
+      "grad_norm": 0.4007203280925751,
+      "learning_rate": 0.00036916700670358495,
+      "loss": 3.2141,
+      "step": 66100
+    },
+    {
+      "epoch": 19.268876718713585,
+      "grad_norm": 0.4050043523311615,
+      "learning_rate": 0.0003689921305741766,
+      "loss": 3.2165,
+      "step": 66150
+    },
+    {
+      "epoch": 19.283442088091356,
+      "grad_norm": 0.41472339630126953,
+      "learning_rate": 0.0003688172544447683,
+      "loss": 3.2188,
+      "step": 66200
+    },
+    {
+      "epoch": 19.298007457469122,
+      "grad_norm": 0.3750508725643158,
+      "learning_rate": 0.0003686423783153599,
+      "loss": 3.2196,
+      "step": 66250
+    },
+    {
+      "epoch": 19.31257282684689,
+      "grad_norm": 0.3770619034767151,
+      "learning_rate": 0.00036846750218595155,
+      "loss": 3.2164,
+      "step": 66300
+    },
+    {
+      "epoch": 19.327138196224656,
+      "grad_norm": 0.36972129344940186,
+      "learning_rate": 0.00036829262605654324,
+      "loss": 3.2416,
+      "step": 66350
+    },
+    {
+      "epoch": 19.341703565602423,
+      "grad_norm": 0.3639586567878723,
+      "learning_rate": 0.0003681177499271349,
+      "loss": 3.2269,
+      "step": 66400
+    },
+    {
+      "epoch": 19.356268934980193,
+      "grad_norm": 0.410324364900589,
+      "learning_rate": 0.0003679428737977266,
+      "loss": 3.2258,
+      "step": 66450
+    },
+    {
+      "epoch": 19.37083430435796,
+      "grad_norm": 0.4071573317050934,
+      "learning_rate": 0.00036776799766831826,
+      "loss": 3.2323,
+      "step": 66500
+    },
+    {
+      "epoch": 19.385399673735726,
+      "grad_norm": 0.3902466595172882,
+      "learning_rate": 0.0003675931215389099,
+      "loss": 3.2259,
+      "step": 66550
+    },
+    {
+      "epoch": 19.399965043113493,
+      "grad_norm": 0.37968191504478455,
+      "learning_rate": 0.0003674182454095016,
+      "loss": 3.2274,
+      "step": 66600
+    },
+    {
+      "epoch": 19.41453041249126,
+      "grad_norm": 0.39837968349456787,
+      "learning_rate": 0.0003672433692800932,
+      "loss": 3.2345,
+      "step": 66650
+    },
+    {
+      "epoch": 19.429095781869027,
+      "grad_norm": 0.38949036598205566,
+      "learning_rate": 0.0003670684931506849,
+      "loss": 3.2259,
+      "step": 66700
+    },
+    {
+      "epoch": 19.443661151246797,
+      "grad_norm": 0.4259556531906128,
+      "learning_rate": 0.00036689361702127655,
+      "loss": 3.2506,
+      "step": 66750
+    },
+    {
+      "epoch": 19.458226520624564,
+      "grad_norm": 0.3879312574863434,
+      "learning_rate": 0.00036671874089186824,
+      "loss": 3.2416,
+      "step": 66800
+    },
+    {
+      "epoch": 19.47279189000233,
+      "grad_norm": 0.3887031078338623,
+      "learning_rate": 0.00036654386476245987,
+      "loss": 3.2328,
+      "step": 66850
+    },
+    {
+      "epoch": 19.487357259380097,
+      "grad_norm": 0.41111478209495544,
+      "learning_rate": 0.0003663689886330515,
+      "loss": 3.2329,
+      "step": 66900
+    },
+    {
+      "epoch": 19.501922628757864,
+      "grad_norm": 0.406820684671402,
+      "learning_rate": 0.00036619411250364325,
+      "loss": 3.2492,
+      "step": 66950
+    },
+    {
+      "epoch": 19.516487998135634,
+      "grad_norm": 0.3900870382785797,
+      "learning_rate": 0.0003660192363742349,
+      "loss": 3.2406,
+      "step": 67000
+    },
+    {
+      "epoch": 19.516487998135634,
+      "eval_accuracy": 0.37308265912026145,
+      "eval_loss": 3.544171094894409,
+      "eval_runtime": 179.6633,
+      "eval_samples_per_second": 92.646,
+      "eval_steps_per_second": 5.794,
+      "step": 67000
+    },
+    {
+      "epoch": 19.5310533675134,
+      "grad_norm": 0.3849544823169708,
+      "learning_rate": 0.0003658443602448266,
+      "loss": 3.2406,
+      "step": 67050
+    },
+    {
+      "epoch": 19.545618736891168,
+      "grad_norm": 0.4000382423400879,
+      "learning_rate": 0.0003656694841154182,
+      "loss": 3.2455,
+      "step": 67100
+    },
+    {
+      "epoch": 19.560184106268935,
+      "grad_norm": 0.39659613370895386,
+      "learning_rate": 0.00036549460798600985,
+      "loss": 3.2367,
+      "step": 67150
+    },
+    {
+      "epoch": 19.5747494756467,
+      "grad_norm": 0.36925145983695984,
+      "learning_rate": 0.00036531973185660154,
+      "loss": 3.2491,
+      "step": 67200
+    },
+    {
+      "epoch": 19.589314845024468,
+      "grad_norm": 0.3747584819793701,
+      "learning_rate": 0.0003651448557271932,
+      "loss": 3.2431,
+      "step": 67250
+    },
+    {
+      "epoch": 19.60388021440224,
+      "grad_norm": 0.371640682220459,
+      "learning_rate": 0.00036496997959778487,
+      "loss": 3.2545,
+      "step": 67300
+    },
+    {
+      "epoch": 19.618445583780005,
+      "grad_norm": 0.38793015480041504,
+      "learning_rate": 0.0003647951034683765,
+      "loss": 3.2471,
+      "step": 67350
+    },
+    {
+      "epoch": 19.633010953157772,
+      "grad_norm": 0.4079042375087738,
+      "learning_rate": 0.00036462022733896825,
+      "loss": 3.2497,
+      "step": 67400
+    },
+    {
+      "epoch": 19.64757632253554,
+      "grad_norm": 0.39877283573150635,
+      "learning_rate": 0.0003644453512095599,
+      "loss": 3.2512,
+      "step": 67450
+    },
+    {
+      "epoch": 19.662141691913305,
+      "grad_norm": 0.40305206179618835,
+      "learning_rate": 0.0003642704750801515,
+      "loss": 3.2577,
+      "step": 67500
+    },
+    {
+      "epoch": 19.676707061291076,
+      "grad_norm": 0.3949699103832245,
+      "learning_rate": 0.0003640955989507432,
+      "loss": 3.2556,
+      "step": 67550
+    },
+    {
+      "epoch": 19.691272430668842,
+      "grad_norm": 0.3933976888656616,
+      "learning_rate": 0.00036392072282133485,
+      "loss": 3.2607,
+      "step": 67600
+    },
+    {
+      "epoch": 19.70583780004661,
+      "grad_norm": 0.4123631417751312,
+      "learning_rate": 0.00036374584669192654,
+      "loss": 3.2436,
+      "step": 67650
+    },
+    {
+      "epoch": 19.720403169424376,
+      "grad_norm": 0.36142420768737793,
+      "learning_rate": 0.0003635709705625182,
+      "loss": 3.2589,
+      "step": 67700
+    },
+    {
+      "epoch": 19.734968538802143,
+      "grad_norm": 0.4008404612541199,
+      "learning_rate": 0.0003633960944331098,
+      "loss": 3.2537,
+      "step": 67750
+    },
+    {
+      "epoch": 19.749533908179913,
+      "grad_norm": 0.36675870418548584,
+      "learning_rate": 0.0003632212183037015,
+      "loss": 3.256,
+      "step": 67800
+    },
+    {
+      "epoch": 19.76409927755768,
+      "grad_norm": 0.3671972155570984,
+      "learning_rate": 0.00036304634217429314,
+      "loss": 3.242,
+      "step": 67850
+    },
+    {
+      "epoch": 19.778664646935447,
+      "grad_norm": 0.40335404872894287,
+      "learning_rate": 0.0003628714660448849,
+      "loss": 3.2533,
+      "step": 67900
+    },
+    {
+      "epoch": 19.793230016313213,
+      "grad_norm": 0.4161832630634308,
+      "learning_rate": 0.0003626965899154765,
+      "loss": 3.2499,
+      "step": 67950
+    },
+    {
+      "epoch": 19.80779538569098,
+      "grad_norm": 0.4160614311695099,
+      "learning_rate": 0.0003625217137860682,
+      "loss": 3.2603,
+      "step": 68000
+    },
+    {
+      "epoch": 19.80779538569098,
+      "eval_accuracy": 0.3735364780476851,
+      "eval_loss": 3.539116859436035,
+      "eval_runtime": 179.5961,
+      "eval_samples_per_second": 92.68,
+      "eval_steps_per_second": 5.796,
+      "step": 68000
+    },
+    {
+      "epoch": 19.822360755068747,
+      "grad_norm": 0.3749236464500427,
+      "learning_rate": 0.00036234683765665985,
+      "loss": 3.2614,
+      "step": 68050
+    },
+    {
+      "epoch": 19.836926124446517,
+      "grad_norm": 0.39177680015563965,
+      "learning_rate": 0.0003621719615272515,
+      "loss": 3.257,
+      "step": 68100
+    },
+    {
+      "epoch": 19.851491493824284,
+      "grad_norm": 0.3916817307472229,
+      "learning_rate": 0.00036199708539784317,
+      "loss": 3.264,
+      "step": 68150
+    },
+    {
+      "epoch": 19.86605686320205,
+      "grad_norm": 0.43153202533721924,
+      "learning_rate": 0.0003618222092684348,
+      "loss": 3.2513,
+      "step": 68200
+    },
+    {
+      "epoch": 19.880622232579817,
+      "grad_norm": 0.40334609150886536,
+      "learning_rate": 0.0003616473331390265,
+      "loss": 3.2527,
+      "step": 68250
+    },
+    {
+      "epoch": 19.895187601957584,
+      "grad_norm": 0.4108611047267914,
+      "learning_rate": 0.00036147245700961813,
+      "loss": 3.2539,
+      "step": 68300
+    },
+    {
+      "epoch": 19.909752971335354,
+      "grad_norm": 0.39317360520362854,
+      "learning_rate": 0.00036129758088020977,
+      "loss": 3.2706,
+      "step": 68350
+    },
+    {
+      "epoch": 19.92431834071312,
+      "grad_norm": 0.3866609036922455,
+      "learning_rate": 0.0003611227047508015,
+      "loss": 3.2614,
+      "step": 68400
+    },
+    {
+      "epoch": 19.938883710090888,
+      "grad_norm": 0.4029618501663208,
+      "learning_rate": 0.00036094782862139315,
+      "loss": 3.2586,
+      "step": 68450
+    },
+    {
+      "epoch": 19.953449079468655,
+      "grad_norm": 0.3921782672405243,
+      "learning_rate": 0.00036077295249198484,
+      "loss": 3.2668,
+      "step": 68500
+    },
+    {
+      "epoch": 19.96801444884642,
+      "grad_norm": 0.37679243087768555,
+      "learning_rate": 0.0003605980763625765,
+      "loss": 3.2669,
+      "step": 68550
+    },
+    {
+      "epoch": 19.982579818224192,
+      "grad_norm": 0.39157822728157043,
+      "learning_rate": 0.0003604232002331681,
+      "loss": 3.2711,
+      "step": 68600
+    },
+    {
+      "epoch": 19.99714518760196,
+      "grad_norm": 0.37484461069107056,
+      "learning_rate": 0.0003602483241037598,
+      "loss": 3.2748,
+      "step": 68650
+    },
+    {
+      "epoch": 20.011652295502213,
+      "grad_norm": 0.3911686837673187,
+      "learning_rate": 0.00036007344797435144,
+      "loss": 3.184,
+      "step": 68700
+    },
+    {
+      "epoch": 20.026217664879983,
+      "grad_norm": 0.37566766142845154,
+      "learning_rate": 0.00035989857184494313,
+      "loss": 3.1588,
+      "step": 68750
+    },
+    {
+      "epoch": 20.04078303425775,
+      "grad_norm": 0.3786637485027313,
+      "learning_rate": 0.00035972369571553477,
+      "loss": 3.1597,
+      "step": 68800
+    },
+    {
+      "epoch": 20.055348403635517,
+      "grad_norm": 0.4000002145767212,
+      "learning_rate": 0.0003595488195861265,
+      "loss": 3.1753,
+      "step": 68850
+    },
+    {
+      "epoch": 20.069913773013283,
+      "grad_norm": 0.41424959897994995,
+      "learning_rate": 0.00035937394345671815,
+      "loss": 3.1716,
+      "step": 68900
+    },
+    {
+      "epoch": 20.08447914239105,
+      "grad_norm": 0.4069176912307739,
+      "learning_rate": 0.0003591990673273098,
+      "loss": 3.1743,
+      "step": 68950
+    },
+    {
+      "epoch": 20.099044511768817,
+      "grad_norm": 0.39178600907325745,
+      "learning_rate": 0.0003590241911979015,
+      "loss": 3.1759,
+      "step": 69000
+    },
+    {
+      "epoch": 20.099044511768817,
+      "eval_accuracy": 0.3731403858283871,
+      "eval_loss": 3.5491995811462402,
+      "eval_runtime": 179.7147,
+      "eval_samples_per_second": 92.619,
+      "eval_steps_per_second": 5.793,
+      "step": 69000
+    },
+    {
+      "epoch": 20.113609881146587,
+      "grad_norm": 0.42330384254455566,
+      "learning_rate": 0.0003588493150684931,
+      "loss": 3.1797,
+      "step": 69050
+    },
+    {
+      "epoch": 20.128175250524354,
+      "grad_norm": 0.3965478241443634,
+      "learning_rate": 0.0003586744389390848,
+      "loss": 3.1863,
+      "step": 69100
+    },
+    {
+      "epoch": 20.14274061990212,
+      "grad_norm": 0.41020357608795166,
+      "learning_rate": 0.00035849956280967644,
+      "loss": 3.1812,
+      "step": 69150
+    },
+    {
+      "epoch": 20.157305989279887,
+      "grad_norm": 0.42909374833106995,
+      "learning_rate": 0.0003583246866802681,
+      "loss": 3.2021,
+      "step": 69200
+    },
+    {
+      "epoch": 20.171871358657654,
+      "grad_norm": 0.38205522298812866,
+      "learning_rate": 0.00035814981055085976,
+      "loss": 3.2022,
+      "step": 69250
+    },
+    {
+      "epoch": 20.186436728035424,
+      "grad_norm": 0.3973395526409149,
+      "learning_rate": 0.0003579749344214514,
+      "loss": 3.2015,
+      "step": 69300
+    },
+    {
+      "epoch": 20.20100209741319,
+      "grad_norm": 0.41039976477622986,
+      "learning_rate": 0.00035780005829204315,
+      "loss": 3.1961,
+      "step": 69350
+    },
+    {
+      "epoch": 20.215567466790958,
+      "grad_norm": 0.3577198088169098,
+      "learning_rate": 0.0003576251821626348,
+      "loss": 3.2056,
+      "step": 69400
+    },
+    {
+      "epoch": 20.230132836168725,
+      "grad_norm": 0.43166354298591614,
+      "learning_rate": 0.00035745030603322647,
+      "loss": 3.2002,
+      "step": 69450
+    },
+    {
+      "epoch": 20.24469820554649,
+      "grad_norm": 0.3968643546104431,
+      "learning_rate": 0.0003572754299038181,
+      "loss": 3.2117,
+      "step": 69500
+    },
+    {
+      "epoch": 20.25926357492426,
+      "grad_norm": 0.3748406171798706,
+      "learning_rate": 0.00035710055377440974,
+      "loss": 3.199,
+      "step": 69550
+    },
+    {
+      "epoch": 20.27382894430203,
+      "grad_norm": 0.41351601481437683,
+      "learning_rate": 0.00035692567764500143,
+      "loss": 3.1976,
+      "step": 69600
+    },
+    {
+      "epoch": 20.288394313679795,
+      "grad_norm": 0.40381181240081787,
+      "learning_rate": 0.00035675080151559307,
+      "loss": 3.1988,
+      "step": 69650
+    },
+    {
+      "epoch": 20.302959683057562,
+      "grad_norm": 0.38123536109924316,
+      "learning_rate": 0.00035657592538618476,
+      "loss": 3.2122,
+      "step": 69700
+    },
+    {
+      "epoch": 20.31752505243533,
+      "grad_norm": 0.40208685398101807,
+      "learning_rate": 0.0003564010492567764,
+      "loss": 3.2187,
+      "step": 69750
+    },
+    {
+      "epoch": 20.332090421813096,
+      "grad_norm": 0.40056926012039185,
+      "learning_rate": 0.00035622617312736803,
+      "loss": 3.2165,
+      "step": 69800
+    },
+    {
+      "epoch": 20.346655791190866,
+      "grad_norm": 0.40889421105384827,
+      "learning_rate": 0.0003560512969979598,
+      "loss": 3.2228,
+      "step": 69850
+    },
+    {
+      "epoch": 20.361221160568633,
+      "grad_norm": 0.3698402941226959,
+      "learning_rate": 0.0003558764208685514,
+      "loss": 3.2188,
+      "step": 69900
+    },
+    {
+      "epoch": 20.3757865299464,
+      "grad_norm": 0.4034403860569,
+      "learning_rate": 0.0003557015447391431,
+      "loss": 3.2169,
+      "step": 69950
+    },
+    {
+      "epoch": 20.390351899324166,
+      "grad_norm": 0.37178969383239746,
+      "learning_rate": 0.00035552666860973474,
+      "loss": 3.2213,
+      "step": 70000
+    },
+    {
+      "epoch": 20.390351899324166,
+      "eval_accuracy": 0.3731590794059675,
+      "eval_loss": 3.5475502014160156,
+      "eval_runtime": 179.6766,
+      "eval_samples_per_second": 92.639,
+      "eval_steps_per_second": 5.794,
+      "step": 70000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171650,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 5
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.46313608822784e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}