diff --git "a/last_to_hit_frequency_3591/checkpoint-100000/trainer_state.json" "b/last_to_hit_frequency_3591/checkpoint-100000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last_to_hit_frequency_3591/checkpoint-100000/trainer_state.json"
@@ -0,0 +1,14943 @@
+{
+  "best_global_step": 72000,
+  "best_metric": 3.527845621109009,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_3591/checkpoint-30000",
+  "epoch": 29.129340480074575,
+  "eval_steps": 1000,
+  "global_step": 100000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 1.7487624883651733,
+      "learning_rate": 0.000294,
+      "loss": 8.4085,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.8792372941970825,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7474,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.49055325984954834,
+      "learning_rate": 0.0005998286213931798,
+      "loss": 6.369,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.47406330704689026,
+      "learning_rate": 0.0005996537452637714,
+      "loss": 6.152,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.5468786358833313,
+      "learning_rate": 0.0005994788691343632,
+      "loss": 6.0121,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.5636491775512695,
+      "learning_rate": 0.0005993039930049548,
+      "loss": 5.8836,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.47112441062927246,
+      "learning_rate": 0.0005991291168755465,
+      "loss": 5.7826,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.614771842956543,
+      "learning_rate": 0.0005989542407461382,
+      "loss": 5.6472,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.49487683176994324,
+      "learning_rate": 0.0005987793646167297,
+      "loss": 5.5373,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.4133830964565277,
+      "learning_rate": 0.0005986044884873214,
+      "loss": 5.4464,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.43713563680648804,
+      "learning_rate": 0.0005984296123579131,
+      "loss": 5.3584,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.4758622348308563,
+      "learning_rate": 0.0005982547362285047,
+      "loss": 5.2639,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.5036144852638245,
+      "learning_rate": 0.0005980798600990964,
+      "loss": 5.2065,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.4153907597064972,
+      "learning_rate": 0.0005979049839696881,
+      "loss": 5.1452,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.49257349967956543,
+      "learning_rate": 0.0005977301078402798,
+      "loss": 5.0858,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.39646583795547485,
+      "learning_rate": 0.0005975552317108715,
+      "loss": 5.0535,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4354263246059418,
+      "learning_rate": 0.0005973803555814631,
+      "loss": 4.9802,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.45780736207962036,
+      "learning_rate": 0.0005972054794520547,
+      "loss": 4.9416,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.4360142946243286,
+      "learning_rate": 0.0005970306033226464,
+      "loss": 4.8913,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.49000921845436096,
+      "learning_rate": 0.0005968557271932381,
+      "loss": 4.8516,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.25072978433607523,
+      "eval_loss": 4.782108783721924,
+      "eval_runtime": 182.688,
+      "eval_samples_per_second": 91.112,
+      "eval_steps_per_second": 5.698,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.4772098958492279,
+      "learning_rate": 0.0005966808510638297,
+      "loss": 4.7909,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.4113984704017639,
+      "learning_rate": 0.0005965059749344214,
+      "loss": 4.7556,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.4781721830368042,
+      "learning_rate": 0.0005963310988050131,
+      "loss": 4.7197,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.5447441935539246,
+      "learning_rate": 0.0005961562226756047,
+      "loss": 4.6751,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.41747575998306274,
+      "learning_rate": 0.0005959813465461965,
+      "loss": 4.6392,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.44658181071281433,
+      "learning_rate": 0.000595806470416788,
+      "loss": 4.6204,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.4609386622905731,
+      "learning_rate": 0.0005956315942873797,
+      "loss": 4.5836,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.4060616195201874,
+      "learning_rate": 0.0005954567181579714,
+      "loss": 4.5615,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.4222695827484131,
+      "learning_rate": 0.000595281842028563,
+      "loss": 4.5376,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.4395248591899872,
+      "learning_rate": 0.0005951069658991547,
+      "loss": 4.5107,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.4157288372516632,
+      "learning_rate": 0.0005949320897697464,
+      "loss": 4.5016,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.4212968349456787,
+      "learning_rate": 0.0005947572136403381,
+      "loss": 4.4616,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.49588635563850403,
+      "learning_rate": 0.0005945823375109297,
+      "loss": 4.4497,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.44670262932777405,
+      "learning_rate": 0.0005944074613815215,
+      "loss": 4.4344,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.36793598532676697,
+      "learning_rate": 0.000594232585252113,
+      "loss": 4.4265,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.4189833700656891,
+      "learning_rate": 0.0005940577091227047,
+      "loss": 4.4012,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.4342043995857239,
+      "learning_rate": 0.0005938828329932964,
+      "loss": 4.3671,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.4385491609573364,
+      "learning_rate": 0.000593707956863888,
+      "loss": 4.3797,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.3876365125179291,
+      "learning_rate": 0.0005935330807344797,
+      "loss": 4.3617,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.41458868980407715,
+      "learning_rate": 0.0005933582046050714,
+      "loss": 4.342,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.2995414195009285,
+      "eval_loss": 4.284348011016846,
+      "eval_runtime": 182.6917,
+      "eval_samples_per_second": 91.11,
+      "eval_steps_per_second": 5.698,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.41804736852645874,
+      "learning_rate": 0.000593183328475663,
+      "loss": 4.3264,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.36194130778312683,
+      "learning_rate": 0.0005930084523462546,
+      "loss": 4.3039,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.3918125629425049,
+      "learning_rate": 0.0005928335762168463,
+      "loss": 4.2908,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.40795278549194336,
+      "learning_rate": 0.000592658700087438,
+      "loss": 4.3007,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.38976508378982544,
+      "learning_rate": 0.0005924838239580297,
+      "loss": 4.2836,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.40438467264175415,
+      "learning_rate": 0.0005923089478286214,
+      "loss": 4.2674,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.41360440850257874,
+      "learning_rate": 0.000592134071699213,
+      "loss": 4.2654,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.377794474363327,
+      "learning_rate": 0.0005919591955698047,
+      "loss": 4.247,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.391387015581131,
+      "learning_rate": 0.0005917843194403964,
+      "loss": 4.2377,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.355221688747406,
+      "learning_rate": 0.000591609443310988,
+      "loss": 4.2396,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.4049054682254791,
+      "learning_rate": 0.0005914345671815796,
+      "loss": 4.225,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.3985639214515686,
+      "learning_rate": 0.0005912596910521713,
+      "loss": 4.204,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.3753962814807892,
+      "learning_rate": 0.0005910848149227629,
+      "loss": 4.207,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3719504773616791,
+      "learning_rate": 0.0005909099387933547,
+      "loss": 4.1872,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.3820745348930359,
+      "learning_rate": 0.0005907350626639463,
+      "loss": 4.1728,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.39611145853996277,
+      "learning_rate": 0.000590560186534538,
+      "loss": 4.175,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.3846186697483063,
+      "learning_rate": 0.0005903853104051297,
+      "loss": 4.171,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.37356239557266235,
+      "learning_rate": 0.0005902104342757214,
+      "loss": 4.1733,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.3760608434677124,
+      "learning_rate": 0.000590035558146313,
+      "loss": 4.1446,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.36522382497787476,
+      "learning_rate": 0.0005898606820169046,
+      "loss": 4.1349,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.3161861104367184,
+      "eval_loss": 4.093681335449219,
+      "eval_runtime": 182.834,
+      "eval_samples_per_second": 91.039,
+      "eval_steps_per_second": 5.694,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.3807234764099121,
+      "learning_rate": 0.0005896858058874963,
+      "loss": 4.1488,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.37441378831863403,
+      "learning_rate": 0.0005895109297580879,
+      "loss": 4.1366,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.3428540825843811,
+      "learning_rate": 0.0005893360536286797,
+      "loss": 4.1134,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.37319910526275635,
+      "learning_rate": 0.0005891611774992713,
+      "loss": 4.1224,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.3176666796207428,
+      "learning_rate": 0.000588986301369863,
+      "loss": 4.1011,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.33169302344322205,
+      "learning_rate": 0.0005888114252404547,
+      "loss": 4.109,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.37893936038017273,
+      "learning_rate": 0.0005886365491110463,
+      "loss": 4.0925,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.37214741110801697,
+      "learning_rate": 0.000588461672981638,
+      "loss": 4.0892,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.36634066700935364,
+      "learning_rate": 0.0005882867968522296,
+      "loss": 4.0616,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.35482263565063477,
+      "learning_rate": 0.0005881119207228212,
+      "loss": 4.0124,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.3779667913913727,
+      "learning_rate": 0.0005879370445934129,
+      "loss": 4.016,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.3397766947746277,
+      "learning_rate": 0.0005877621684640046,
+      "loss": 4.0033,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3722945749759674,
+      "learning_rate": 0.0005875872923345963,
+      "loss": 4.0129,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.34330493211746216,
+      "learning_rate": 0.000587412416205188,
+      "loss": 4.003,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.3765234649181366,
+      "learning_rate": 0.0005872375400757797,
+      "loss": 4.0107,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.35264670848846436,
+      "learning_rate": 0.0005870626639463713,
+      "loss": 4.0043,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.3567696213722229,
+      "learning_rate": 0.0005868877878169629,
+      "loss": 4.0016,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.37721458077430725,
+      "learning_rate": 0.0005867129116875546,
+      "loss": 4.0014,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.34036508202552795,
+      "learning_rate": 0.0005865380355581462,
+      "loss": 3.9731,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.33936476707458496,
+      "learning_rate": 0.0005863631594287379,
+      "loss": 3.9879,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.3259847194699489,
+      "eval_loss": 3.9844980239868164,
+      "eval_runtime": 182.8731,
+      "eval_samples_per_second": 91.019,
+      "eval_steps_per_second": 5.692,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.35038861632347107,
+      "learning_rate": 0.0005861882832993296,
+      "loss": 3.9796,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.3499089777469635,
+      "learning_rate": 0.0005860134071699212,
+      "loss": 4.0012,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.35973456501960754,
+      "learning_rate": 0.000585838531040513,
+      "loss": 3.9843,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.364793598651886,
+      "learning_rate": 0.0005856636549111046,
+      "loss": 3.9738,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.3340921998023987,
+      "learning_rate": 0.0005854887787816963,
+      "loss": 3.9804,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.33135804533958435,
+      "learning_rate": 0.0005853139026522879,
+      "loss": 3.9741,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.348160058259964,
+      "learning_rate": 0.0005851390265228796,
+      "loss": 3.9693,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.3392653167247772,
+      "learning_rate": 0.0005849641503934712,
+      "loss": 3.9645,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.33709239959716797,
+      "learning_rate": 0.0005847892742640629,
+      "loss": 3.9534,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.3292732238769531,
+      "learning_rate": 0.0005846143981346546,
+      "loss": 3.9565,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.346902459859848,
+      "learning_rate": 0.0005844395220052462,
+      "loss": 3.945,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.3570997416973114,
+      "learning_rate": 0.000584264645875838,
+      "loss": 3.9458,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.3431386351585388,
+      "learning_rate": 0.0005840897697464296,
+      "loss": 3.9343,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.32554003596305847,
+      "learning_rate": 0.0005839148936170212,
+      "loss": 3.9334,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.33119750022888184,
+      "learning_rate": 0.0005837400174876129,
+      "loss": 3.9402,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.32726046442985535,
+      "learning_rate": 0.0005835651413582045,
+      "loss": 3.9414,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.37814900279045105,
+      "learning_rate": 0.0005833902652287962,
+      "loss": 3.9381,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.35007208585739136,
+      "learning_rate": 0.0005832153890993879,
+      "loss": 3.941,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.47103646397590637,
+      "learning_rate": 0.0005830405129699796,
+      "loss": 3.9365,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.3166019916534424,
+      "learning_rate": 0.0005828656368405712,
+      "loss": 3.9229,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.33251971202484953,
+      "eval_loss": 3.9095144271850586,
+      "eval_runtime": 182.8302,
+      "eval_samples_per_second": 91.041,
+      "eval_steps_per_second": 5.694,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.32818934321403503,
+      "learning_rate": 0.0005826907607111629,
+      "loss": 3.9321,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.3326047658920288,
+      "learning_rate": 0.0005825158845817546,
+      "loss": 3.9226,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.3376672863960266,
+      "learning_rate": 0.0005823410084523462,
+      "loss": 3.93,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.3359002470970154,
+      "learning_rate": 0.0005821661323229379,
+      "loss": 3.9337,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.33936458826065063,
+      "learning_rate": 0.0005819912561935295,
+      "loss": 3.9042,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.34475040435791016,
+      "learning_rate": 0.0005818163800641212,
+      "loss": 3.9182,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.3286258578300476,
+      "learning_rate": 0.0005816415039347129,
+      "loss": 3.9125,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.3313068747520447,
+      "learning_rate": 0.0005814666278053045,
+      "loss": 3.8986,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.3305990695953369,
+      "learning_rate": 0.0005812917516758962,
+      "loss": 3.9158,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.3188944458961487,
+      "learning_rate": 0.0005811168755464879,
+      "loss": 3.8946,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.353261798620224,
+      "learning_rate": 0.0005809419994170794,
+      "loss": 3.9022,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.3191840648651123,
+      "learning_rate": 0.0005807671232876712,
+      "loss": 3.8972,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.3437453508377075,
+      "learning_rate": 0.0005805922471582628,
+      "loss": 3.8846,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.3136034309864044,
+      "learning_rate": 0.0005804173710288545,
+      "loss": 3.8851,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.31763720512390137,
+      "learning_rate": 0.0005802424948994462,
+      "loss": 3.8838,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.352448970079422,
+      "learning_rate": 0.0005800676187700379,
+      "loss": 3.8774,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.32752853631973267,
+      "learning_rate": 0.0005798927426406295,
+      "loss": 3.8735,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.31873783469200134,
+      "learning_rate": 0.0005797178665112212,
+      "loss": 3.8819,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.3134726881980896,
+      "learning_rate": 0.0005795429903818129,
+      "loss": 3.8784,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.32978641986846924,
+      "learning_rate": 0.0005793681142524044,
+      "loss": 3.8805,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.33774250948934204,
+      "eval_loss": 3.851607084274292,
+      "eval_runtime": 183.0051,
+      "eval_samples_per_second": 90.954,
+      "eval_steps_per_second": 5.688,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.31962209939956665,
+      "learning_rate": 0.0005791932381229961,
+      "loss": 3.8609,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.353483647108078,
+      "learning_rate": 0.0005790183619935878,
+      "loss": 3.866,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.34389597177505493,
+      "learning_rate": 0.0005788434858641795,
+      "loss": 3.8668,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.306185245513916,
+      "learning_rate": 0.0005786686097347712,
+      "loss": 3.8713,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.3174114525318146,
+      "learning_rate": 0.0005784937336053628,
+      "loss": 3.854,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.3356582522392273,
+      "learning_rate": 0.0005783188574759545,
+      "loss": 3.8665,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.3357125222682953,
+      "learning_rate": 0.0005781439813465462,
+      "loss": 3.8615,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.3359164297580719,
+      "learning_rate": 0.0005779691052171379,
+      "loss": 3.8584,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.3328222632408142,
+      "learning_rate": 0.0005777942290877294,
+      "loss": 3.8494,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.3154282569885254,
+      "learning_rate": 0.0005776193529583211,
+      "loss": 3.8592,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.35024163126945496,
+      "learning_rate": 0.0005774444768289128,
+      "loss": 3.8619,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.3441106379032135,
+      "learning_rate": 0.0005772696006995045,
+      "loss": 3.8442,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.3300905227661133,
+      "learning_rate": 0.0005770947245700962,
+      "loss": 3.8414,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.32087442278862,
+      "learning_rate": 0.0005769198484406878,
+      "loss": 3.8356,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.3275013864040375,
+      "learning_rate": 0.0005767449723112795,
+      "loss": 3.837,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.3164341151714325,
+      "learning_rate": 0.0005765700961818712,
+      "loss": 3.836,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.3237488269805908,
+      "learning_rate": 0.0005763952200524627,
+      "loss": 3.8226,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.3478614091873169,
+      "learning_rate": 0.0005762203439230544,
+      "loss": 3.7605,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.3280088007450104,
+      "learning_rate": 0.0005760454677936461,
+      "loss": 3.7386,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.3092538118362427,
+      "learning_rate": 0.0005758705916642378,
+      "loss": 3.7313,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34221756385061836,
+      "eval_loss": 3.8092734813690186,
+      "eval_runtime": 182.6845,
+      "eval_samples_per_second": 91.113,
+      "eval_steps_per_second": 5.698,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.3326481282711029,
+      "learning_rate": 0.0005756957155348294,
+      "loss": 3.7507,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.3384568989276886,
+      "learning_rate": 0.0005755208394054211,
+      "loss": 3.7393,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.33243587613105774,
+      "learning_rate": 0.0005753459632760128,
+      "loss": 3.7399,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.31076061725616455,
+      "learning_rate": 0.0005751710871466045,
+      "loss": 3.7378,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.3274184465408325,
+      "learning_rate": 0.0005749962110171962,
+      "loss": 3.7429,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.32674604654312134,
+      "learning_rate": 0.0005748213348877877,
+      "loss": 3.7583,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.32339632511138916,
+      "learning_rate": 0.0005746464587583794,
+      "loss": 3.7516,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.3199286162853241,
+      "learning_rate": 0.0005744715826289711,
+      "loss": 3.7587,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.3267718553543091,
+      "learning_rate": 0.0005742967064995627,
+      "loss": 3.749,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.32316339015960693,
+      "learning_rate": 0.0005741218303701544,
+      "loss": 3.747,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.3192504048347473,
+      "learning_rate": 0.0005739469542407461,
+      "loss": 3.7527,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.3137800097465515,
+      "learning_rate": 0.0005737720781113378,
+      "loss": 3.7386,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.3079003393650055,
+      "learning_rate": 0.0005735972019819295,
+      "loss": 3.7412,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.317622572183609,
+      "learning_rate": 0.000573422325852521,
+      "loss": 3.7524,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.3800288140773773,
+      "learning_rate": 0.0005732474497231127,
+      "loss": 3.7621,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.3083147704601288,
+      "learning_rate": 0.0005730725735937044,
+      "loss": 3.7586,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.3407652974128723,
+      "learning_rate": 0.0005728976974642961,
+      "loss": 3.7421,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.3287487030029297,
+      "learning_rate": 0.0005727228213348877,
+      "loss": 3.7566,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.3335205912590027,
+      "learning_rate": 0.0005725479452054794,
+      "loss": 3.7335,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.3264673352241516,
+      "learning_rate": 0.0005723730690760711,
+      "loss": 3.7565,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.34492789746043634,
+      "eval_loss": 3.7804434299468994,
+      "eval_runtime": 183.2368,
+      "eval_samples_per_second": 90.839,
+      "eval_steps_per_second": 5.681,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.3331637382507324,
+      "learning_rate": 0.0005721981929466627,
+      "loss": 3.7361,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.332265704870224,
+      "learning_rate": 0.0005720233168172545,
+      "loss": 3.7488,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.3338277041912079,
+      "learning_rate": 0.000571848440687846,
+      "loss": 3.749,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.3343018889427185,
+      "learning_rate": 0.0005716735645584377,
+      "loss": 3.7482,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.3533099591732025,
+      "learning_rate": 0.0005714986884290294,
+      "loss": 3.7509,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.3301255404949188,
+      "learning_rate": 0.000571323812299621,
+      "loss": 3.7334,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.31493982672691345,
+      "learning_rate": 0.0005711489361702127,
+      "loss": 3.7405,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.3294617235660553,
+      "learning_rate": 0.0005709740600408044,
+      "loss": 3.7471,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.316875696182251,
+      "learning_rate": 0.0005707991839113961,
+      "loss": 3.756,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.310937762260437,
+      "learning_rate": 0.0005706243077819877,
+      "loss": 3.7412,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.31915387511253357,
+      "learning_rate": 0.0005704494316525793,
+      "loss": 3.7455,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.32014259696006775,
+      "learning_rate": 0.000570274555523171,
+      "loss": 3.7348,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.32315295934677124,
+      "learning_rate": 0.0005700996793937627,
+      "loss": 3.7426,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.33680716156959534,
+      "learning_rate": 0.0005699248032643544,
+      "loss": 3.7458,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.30035462975502014,
+      "learning_rate": 0.000569749927134946,
+      "loss": 3.7355,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.3163704574108124,
+      "learning_rate": 0.0005695750510055377,
+      "loss": 3.745,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.3217114806175232,
+      "learning_rate": 0.0005694001748761294,
+      "loss": 3.7532,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.3296523988246918,
+      "learning_rate": 0.000569225298746721,
+      "loss": 3.7536,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.3380911648273468,
+      "learning_rate": 0.0005690504226173127,
+      "loss": 3.7383,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3250406086444855,
+      "learning_rate": 0.0005688755464879043,
+      "loss": 3.7287,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.3478768975009979,
+      "eval_loss": 3.7482099533081055,
+      "eval_runtime": 182.7889,
+      "eval_samples_per_second": 91.061,
+      "eval_steps_per_second": 5.695,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.2994716465473175,
+      "learning_rate": 0.000568700670358496,
+      "loss": 3.7238,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.34974968433380127,
+      "learning_rate": 0.0005685257942290877,
+      "loss": 3.7324,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.3160337209701538,
+      "learning_rate": 0.0005683509180996793,
+      "loss": 3.7271,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.3240397274494171,
+      "learning_rate": 0.000568176041970271,
+      "loss": 3.7258,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.32082828879356384,
+      "learning_rate": 0.0005680011658408627,
+      "loss": 3.7358,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.31649598479270935,
+      "learning_rate": 0.0005678262897114544,
+      "loss": 3.7392,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.31169888377189636,
+      "learning_rate": 0.000567651413582046,
+      "loss": 3.735,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.3002908229827881,
+      "learning_rate": 0.0005674765374526377,
+      "loss": 3.7288,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.30253180861473083,
+      "learning_rate": 0.0005673016613232293,
+      "loss": 3.7204,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.31831657886505127,
+      "learning_rate": 0.0005671267851938209,
+      "loss": 3.7258,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.31231117248535156,
+      "learning_rate": 0.0005669519090644127,
+      "loss": 3.721,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.3149794936180115,
+      "learning_rate": 0.0005667770329350043,
+      "loss": 3.7337,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.3181709945201874,
+      "learning_rate": 0.000566602156805596,
+      "loss": 3.7265,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.31284210085868835,
+      "learning_rate": 0.0005664272806761877,
+      "loss": 3.7288,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.3245266377925873,
+      "learning_rate": 0.0005662524045467793,
+      "loss": 3.7222,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.319336473941803,
+      "learning_rate": 0.000566077528417371,
+      "loss": 3.7298,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.3112030625343323,
+      "learning_rate": 0.0005659026522879626,
+      "loss": 3.7303,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.3266408443450928,
+      "learning_rate": 0.0005657277761585543,
+      "loss": 3.7114,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.3135480284690857,
+      "learning_rate": 0.0005655529000291459,
+      "loss": 3.7203,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.3140251040458679,
+      "learning_rate": 0.0005653780238997376,
+      "loss": 3.7117,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.3501354108677876,
+      "eval_loss": 3.7232017517089844,
+      "eval_runtime": 182.9143,
+      "eval_samples_per_second": 90.999,
+      "eval_steps_per_second": 5.691,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.30501222610473633,
+      "learning_rate": 0.0005652031477703293,
+      "loss": 3.7205,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.31396257877349854,
+      "learning_rate": 0.000565028271640921,
+      "loss": 3.7231,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.29565396904945374,
+      "learning_rate": 0.0005648533955115127,
+      "loss": 3.7167,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.29209980368614197,
+      "learning_rate": 0.0005646785193821043,
+      "loss": 3.7011,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.341580867767334,
+      "learning_rate": 0.000564503643252696,
+      "loss": 3.7199,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.306345134973526,
+      "learning_rate": 0.0005643287671232876,
+      "loss": 3.7135,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.3232329487800598,
+      "learning_rate": 0.0005641538909938792,
+      "loss": 3.602,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.3200088441371918,
+      "learning_rate": 0.0005639790148644709,
+      "loss": 3.6044,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.31947940587997437,
+      "learning_rate": 0.0005638041387350626,
+      "loss": 3.612,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.3416725695133209,
+      "learning_rate": 0.0005636292626056543,
+      "loss": 3.6212,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.3318784832954407,
+      "learning_rate": 0.000563454386476246,
+      "loss": 3.6079,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.3215178847312927,
+      "learning_rate": 0.0005632795103468376,
+      "loss": 3.623,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.3258126974105835,
+      "learning_rate": 0.0005631046342174293,
+      "loss": 3.6237,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.3248828947544098,
+      "learning_rate": 0.000562929758088021,
+      "loss": 3.6206,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.3171083331108093,
+      "learning_rate": 0.0005627548819586126,
+      "loss": 3.6301,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.3476962745189667,
+      "learning_rate": 0.0005625800058292042,
+      "loss": 3.6342,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.32831183075904846,
+      "learning_rate": 0.0005624051296997959,
+      "loss": 3.6253,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.3208300769329071,
+      "learning_rate": 0.0005622302535703876,
+      "loss": 3.6201,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.34289947152137756,
+      "learning_rate": 0.0005620553774409792,
+      "loss": 3.6407,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.34401383996009827,
+      "learning_rate": 0.000561880501311571,
+      "loss": 3.6328,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.35195491908561366,
+      "eval_loss": 3.711780548095703,
+      "eval_runtime": 183.0966,
+      "eval_samples_per_second": 90.908,
+      "eval_steps_per_second": 5.686,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.3176616132259369,
+      "learning_rate": 0.0005617056251821626,
+      "loss": 3.6359,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.3091390132904053,
+      "learning_rate": 0.0005615307490527543,
+      "loss": 3.6274,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.31460776925086975,
+      "learning_rate": 0.000561355872923346,
+      "loss": 3.6288,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.33371245861053467,
+      "learning_rate": 0.0005611809967939375,
+      "loss": 3.6434,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.3317493796348572,
+      "learning_rate": 0.0005610061206645292,
+      "loss": 3.6385,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.3349264860153198,
+      "learning_rate": 0.0005608312445351209,
+      "loss": 3.6397,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.32256776094436646,
+      "learning_rate": 0.0005606563684057126,
+      "loss": 3.6336,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.32920846343040466,
+      "learning_rate": 0.0005604814922763042,
+      "loss": 3.633,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.32023611664772034,
+      "learning_rate": 0.0005603066161468959,
+      "loss": 3.6273,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.3239540755748749,
+      "learning_rate": 0.0005601317400174876,
+      "loss": 3.6354,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.3216831386089325,
+      "learning_rate": 0.0005599568638880793,
+      "loss": 3.6423,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.3342783749103546,
+      "learning_rate": 0.0005597819877586709,
+      "loss": 3.6189,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.30918747186660767,
+      "learning_rate": 0.0005596071116292625,
+      "loss": 3.6395,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.3271181583404541,
+      "learning_rate": 0.0005594322354998542,
+      "loss": 3.6403,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.30378925800323486,
+      "learning_rate": 0.0005592573593704459,
+      "loss": 3.6356,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.3317507803440094,
+      "learning_rate": 0.0005590824832410375,
+      "loss": 3.6269,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.33540189266204834,
+      "learning_rate": 0.0005589076071116292,
+      "loss": 3.64,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.2987757921218872,
+      "learning_rate": 0.0005587327309822209,
+      "loss": 3.6381,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.3357625901699066,
+      "learning_rate": 0.0005585578548528126,
+      "loss": 3.6504,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.3251186013221741,
+      "learning_rate": 0.0005583829787234043,
+      "loss": 3.6393,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.3538467326506846,
+      "eval_loss": 3.695300817489624,
+      "eval_runtime": 183.2049,
+      "eval_samples_per_second": 90.855,
+      "eval_steps_per_second": 5.682,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.3250819742679596,
+      "learning_rate": 0.0005582081025939958,
+      "loss": 3.628,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.3002704381942749,
+      "learning_rate": 0.0005580332264645875,
+      "loss": 3.6323,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.3166797161102295,
+      "learning_rate": 0.0005578583503351792,
+      "loss": 3.6524,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.32995665073394775,
+      "learning_rate": 0.0005576834742057709,
+      "loss": 3.6375,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.3259824812412262,
+      "learning_rate": 0.0005575085980763625,
+      "loss": 3.6469,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.35942357778549194,
+      "learning_rate": 0.0005573337219469542,
+      "loss": 3.6256,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.35430946946144104,
+      "learning_rate": 0.0005571588458175459,
+      "loss": 3.6376,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.31567761301994324,
+      "learning_rate": 0.0005569839696881374,
+      "loss": 3.6417,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.32618406414985657,
+      "learning_rate": 0.0005568090935587292,
+      "loss": 3.6446,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.31533339619636536,
+      "learning_rate": 0.0005566342174293208,
+      "loss": 3.6602,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.3274492621421814,
+      "learning_rate": 0.0005564593412999125,
+      "loss": 3.6288,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.3271532952785492,
+      "learning_rate": 0.0005562844651705042,
+      "loss": 3.6392,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.3347550630569458,
+      "learning_rate": 0.0005561095890410958,
+      "loss": 3.6288,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.32583537697792053,
+      "learning_rate": 0.0005559347129116875,
+      "loss": 3.6302,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.30043119192123413,
+      "learning_rate": 0.0005557598367822792,
+      "loss": 3.6391,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.328071653842926,
+      "learning_rate": 0.0005555849606528709,
+      "loss": 3.6402,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.3114347755908966,
+      "learning_rate": 0.0005554100845234624,
+      "loss": 3.6368,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.3095006048679352,
+      "learning_rate": 0.0005552352083940541,
+      "loss": 3.642,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.341740220785141,
+      "learning_rate": 0.0005550603322646458,
+      "loss": 3.6366,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.3182758390903473,
+      "learning_rate": 0.0005548854561352375,
+      "loss": 3.6428,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.35534045531206226,
+      "eval_loss": 3.67643141746521,
+      "eval_runtime": 182.9247,
+      "eval_samples_per_second": 90.994,
+      "eval_steps_per_second": 5.691,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.3386680781841278,
+      "learning_rate": 0.0005547105800058292,
+      "loss": 3.6498,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.317020446062088,
+      "learning_rate": 0.0005545357038764208,
+      "loss": 3.6364,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.3221585154533386,
+      "learning_rate": 0.0005543608277470125,
+      "loss": 3.639,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.2992505133152008,
+      "learning_rate": 0.0005541859516176042,
+      "loss": 3.6415,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.30075782537460327,
+      "learning_rate": 0.0005540110754881958,
+      "loss": 3.6458,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.3180374205112457,
+      "learning_rate": 0.0005538361993587874,
+      "loss": 3.6236,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.3106091022491455,
+      "learning_rate": 0.0005536613232293791,
+      "loss": 3.6353,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.29708361625671387,
+      "learning_rate": 0.0005534864470999708,
+      "loss": 3.6366,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.34170088171958923,
+      "learning_rate": 0.0005533115709705625,
+      "loss": 3.6262,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.3265573978424072,
+      "learning_rate": 0.0005531366948411541,
+      "loss": 3.635,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.31741398572921753,
+      "learning_rate": 0.0005529618187117458,
+      "loss": 3.631,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.32067936658859253,
+      "learning_rate": 0.0005527869425823375,
+      "loss": 3.6362,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.31947606801986694,
+      "learning_rate": 0.0005526120664529292,
+      "loss": 3.6329,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.2974706292152405,
+      "learning_rate": 0.0005524371903235207,
+      "loss": 3.636,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.3165639042854309,
+      "learning_rate": 0.0005522623141941124,
+      "loss": 3.5888,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.3143453299999237,
+      "learning_rate": 0.0005520874380647041,
+      "loss": 3.5185,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.3318156599998474,
+      "learning_rate": 0.0005519125619352957,
+      "loss": 3.52,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.30892449617385864,
+      "learning_rate": 0.0005517376858058875,
+      "loss": 3.5368,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.3253442645072937,
+      "learning_rate": 0.0005515628096764791,
+      "loss": 3.5317,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.31239765882492065,
+      "learning_rate": 0.0005513879335470708,
+      "loss": 3.5319,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.35649757600732224,
+      "eval_loss": 3.668315887451172,
+      "eval_runtime": 182.8782,
+      "eval_samples_per_second": 91.017,
+      "eval_steps_per_second": 5.692,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.32942360639572144,
+      "learning_rate": 0.0005512130574176625,
+      "loss": 3.5415,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.3166195750236511,
+      "learning_rate": 0.000551038181288254,
+      "loss": 3.5305,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.3233526349067688,
+      "learning_rate": 0.0005508633051588457,
+      "loss": 3.5415,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.32190340757369995,
+      "learning_rate": 0.0005506884290294374,
+      "loss": 3.5477,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.34432554244995117,
+      "learning_rate": 0.0005505135529000291,
+      "loss": 3.5525,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.3212747275829315,
+      "learning_rate": 0.0005503386767706207,
+      "loss": 3.5454,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.3225768804550171,
+      "learning_rate": 0.0005501638006412124,
+      "loss": 3.5577,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.3135558068752289,
+      "learning_rate": 0.0005499889245118041,
+      "loss": 3.5534,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.342723548412323,
+      "learning_rate": 0.0005498140483823958,
+      "loss": 3.5381,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.3262888193130493,
+      "learning_rate": 0.0005496391722529875,
+      "loss": 3.5649,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.3289225399494171,
+      "learning_rate": 0.000549464296123579,
+      "loss": 3.5588,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.30956584215164185,
+      "learning_rate": 0.0005492894199941707,
+      "loss": 3.5632,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.33964231610298157,
+      "learning_rate": 0.0005491145438647624,
+      "loss": 3.5687,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.33205127716064453,
+      "learning_rate": 0.000548939667735354,
+      "loss": 3.5543,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": null,
+      "learning_rate": 0.0005487647916059457,
+      "loss": 3.5658,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.3393228054046631,
+      "learning_rate": 0.0005485899154765374,
+      "loss": 3.5577,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.31260740756988525,
+      "learning_rate": 0.0005484150393471291,
+      "loss": 3.5722,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.331304132938385,
+      "learning_rate": 0.0005482401632177208,
+      "loss": 3.5601,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.3720129728317261,
+      "learning_rate": 0.0005480652870883124,
+      "loss": 3.5754,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.31915804743766785,
+      "learning_rate": 0.000547890410958904,
+      "loss": 3.5417,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.3573578332850318,
+      "eval_loss": 3.6594953536987305,
+      "eval_runtime": 183.0024,
+      "eval_samples_per_second": 90.955,
+      "eval_steps_per_second": 5.688,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.3377160429954529,
+      "learning_rate": 0.0005477155348294957,
+      "loss": 3.5704,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.31284624338150024,
+      "learning_rate": 0.0005475406587000874,
+      "loss": 3.562,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.331798255443573,
+      "learning_rate": 0.000547365782570679,
+      "loss": 3.5583,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.31809282302856445,
+      "learning_rate": 0.0005471909064412707,
+      "loss": 3.5653,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.3349161446094513,
+      "learning_rate": 0.0005470160303118624,
+      "loss": 3.5674,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.32030266523361206,
+      "learning_rate": 0.000546841154182454,
+      "loss": 3.5776,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": null,
+      "learning_rate": 0.0005466662780530458,
+      "loss": 3.5804,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.3176333010196686,
+      "learning_rate": 0.0005464914019236374,
+      "loss": 3.5782,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.3277718126773834,
+      "learning_rate": 0.000546316525794229,
+      "loss": 3.5864,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.3107328712940216,
+      "learning_rate": 0.0005461416496648207,
+      "loss": 3.5818,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.32739174365997314,
+      "learning_rate": 0.0005459667735354123,
+      "loss": 3.5574,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.3131246268749237,
+      "learning_rate": 0.000545791897406004,
+      "loss": 3.5691,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.33344247937202454,
+      "learning_rate": 0.0005456170212765957,
+      "loss": 3.5742,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.3450835347175598,
+      "learning_rate": 0.0005454421451471874,
+      "loss": 3.5739,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.31020715832710266,
+      "learning_rate": 0.000545267269017779,
+      "loss": 3.5733,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.3077200651168823,
+      "learning_rate": 0.0005450923928883708,
+      "loss": 3.5576,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3353123068809509,
+      "learning_rate": 0.0005449175167589623,
+      "loss": 3.5723,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.32540038228034973,
+      "learning_rate": 0.000544742640629554,
+      "loss": 3.5793,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.3153678774833679,
+      "learning_rate": 0.0005445677645001457,
+      "loss": 3.566,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.3324628472328186,
+      "learning_rate": 0.0005443928883707373,
+      "loss": 3.5679,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.35883380292619155,
+      "eval_loss": 3.6458113193511963,
+      "eval_runtime": 182.9054,
+      "eval_samples_per_second": 91.003,
+      "eval_steps_per_second": 5.691,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.30943557620048523,
+      "learning_rate": 0.000544218012241329,
+      "loss": 3.5781,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.33318641781806946,
+      "learning_rate": 0.0005440431361119207,
+      "loss": 3.5782,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.32952025532722473,
+      "learning_rate": 0.0005438682599825123,
+      "loss": 3.5731,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.33214232325553894,
+      "learning_rate": 0.000543693383853104,
+      "loss": 3.5754,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.3278939723968506,
+      "learning_rate": 0.0005435185077236957,
+      "loss": 3.5669,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.35436153411865234,
+      "learning_rate": 0.0005433436315942873,
+      "loss": 3.5705,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.32666391134262085,
+      "learning_rate": 0.000543168755464879,
+      "loss": 3.567,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.3220253586769104,
+      "learning_rate": 0.0005429938793354706,
+      "loss": 3.5843,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.3143666982650757,
+      "learning_rate": 0.0005428190032060623,
+      "loss": 3.5715,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.30706289410591125,
+      "learning_rate": 0.000542644127076654,
+      "loss": 3.5738,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.3221777677536011,
+      "learning_rate": 0.0005424692509472457,
+      "loss": 3.5769,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.32023727893829346,
+      "learning_rate": 0.0005422943748178373,
+      "loss": 3.5685,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.3428962230682373,
+      "learning_rate": 0.000542119498688429,
+      "loss": 3.5825,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.3187549412250519,
+      "learning_rate": 0.0005419446225590207,
+      "loss": 3.5684,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.32895606756210327,
+      "learning_rate": 0.0005417697464296122,
+      "loss": 3.5723,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.31578466296195984,
+      "learning_rate": 0.000541594870300204,
+      "loss": 3.5769,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.3406703472137451,
+      "learning_rate": 0.0005414199941707956,
+      "loss": 3.5636,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.32653164863586426,
+      "learning_rate": 0.0005412451180413873,
+      "loss": 3.5667,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.31869933009147644,
+      "learning_rate": 0.000541070241911979,
+      "loss": 3.5796,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.3086072504520416,
+      "learning_rate": 0.0005408953657825706,
+      "loss": 3.5802,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.36002831077661235,
+      "eval_loss": 3.6297221183776855,
+      "eval_runtime": 182.8739,
+      "eval_samples_per_second": 91.019,
+      "eval_steps_per_second": 5.692,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.3254891335964203,
+      "learning_rate": 0.0005407204896531623,
+      "loss": 3.5614,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.3315959870815277,
+      "learning_rate": 0.000540545613523754,
+      "loss": 3.5736,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.308977335691452,
+      "learning_rate": 0.0005403707373943456,
+      "loss": 3.5687,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.33981844782829285,
+      "learning_rate": 0.0005401958612649372,
+      "loss": 3.5112,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.32200807332992554,
+      "learning_rate": 0.000540020985135529,
+      "loss": 3.4613,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.30297327041625977,
+      "learning_rate": 0.0005398461090061206,
+      "loss": 3.4576,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.32989853620529175,
+      "learning_rate": 0.0005396712328767123,
+      "loss": 3.4718,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.31648069620132446,
+      "learning_rate": 0.000539496356747304,
+      "loss": 3.4785,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.3199455440044403,
+      "learning_rate": 0.0005393214806178956,
+      "loss": 3.4711,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.33137065172195435,
+      "learning_rate": 0.0005391466044884873,
+      "loss": 3.491,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.33032289147377014,
+      "learning_rate": 0.000538971728359079,
+      "loss": 3.4838,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.32575368881225586,
+      "learning_rate": 0.0005387968522296705,
+      "loss": 3.492,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.3349052965641022,
+      "learning_rate": 0.0005386219761002622,
+      "loss": 3.4838,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.3152545094490051,
+      "learning_rate": 0.0005384470999708539,
+      "loss": 3.4819,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.321737676858902,
+      "learning_rate": 0.0005382722238414456,
+      "loss": 3.486,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.36409512162208557,
+      "learning_rate": 0.0005380973477120373,
+      "loss": 3.4959,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.32220658659935,
+      "learning_rate": 0.000537922471582629,
+      "loss": 3.4938,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.3427848219871521,
+      "learning_rate": 0.0005377475954532206,
+      "loss": 3.4992,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.33268028497695923,
+      "learning_rate": 0.0005375727193238123,
+      "loss": 3.4945,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.2996683120727539,
+      "learning_rate": 0.000537397843194404,
+      "loss": 3.4929,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.36062627011984466,
+      "eval_loss": 3.6342105865478516,
+      "eval_runtime": 182.824,
+      "eval_samples_per_second": 91.044,
+      "eval_steps_per_second": 5.694,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.30999842286109924,
+      "learning_rate": 0.0005372229670649955,
+      "loss": 3.5096,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.32671719789505005,
+      "learning_rate": 0.0005370480909355872,
+      "loss": 3.5092,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.32220521569252014,
+      "learning_rate": 0.0005368732148061789,
+      "loss": 3.5055,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.3446556031703949,
+      "learning_rate": 0.0005366983386767705,
+      "loss": 3.5103,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.333375483751297,
+      "learning_rate": 0.0005365234625473623,
+      "loss": 3.5227,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.32386448979377747,
+      "learning_rate": 0.0005363485864179539,
+      "loss": 3.5153,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.31273576617240906,
+      "learning_rate": 0.0005361737102885456,
+      "loss": 3.5114,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.32386016845703125,
+      "learning_rate": 0.0005359988341591373,
+      "loss": 3.5176,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.3084457516670227,
+      "learning_rate": 0.000535823958029729,
+      "loss": 3.5144,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.34510570764541626,
+      "learning_rate": 0.0005356490819003205,
+      "loss": 3.5283,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.33156850934028625,
+      "learning_rate": 0.0005354742057709122,
+      "loss": 3.5198,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.32253149151802063,
+      "learning_rate": 0.0005352993296415039,
+      "loss": 3.5129,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.343431293964386,
+      "learning_rate": 0.0005351244535120955,
+      "loss": 3.5214,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.320697158575058,
+      "learning_rate": 0.0005349495773826873,
+      "loss": 3.5089,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.3079582154750824,
+      "learning_rate": 0.0005347747012532789,
+      "loss": 3.523,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.33627447485923767,
+      "learning_rate": 0.0005345998251238706,
+      "loss": 3.5175,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.3293239176273346,
+      "learning_rate": 0.0005344249489944623,
+      "loss": 3.5214,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.321121484041214,
+      "learning_rate": 0.0005342500728650538,
+      "loss": 3.5227,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.32323557138442993,
+      "learning_rate": 0.0005340751967356455,
+      "loss": 3.5089,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.3353577256202698,
+      "learning_rate": 0.0005339003206062372,
+      "loss": 3.5125,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.36119366134879455,
+      "eval_loss": 3.62491774559021,
+      "eval_runtime": 182.9044,
+      "eval_samples_per_second": 91.004,
+      "eval_steps_per_second": 5.691,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.3373064696788788,
+      "learning_rate": 0.0005337254444768288,
+      "loss": 3.5161,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.3205850422382355,
+      "learning_rate": 0.0005335505683474205,
+      "loss": 3.5198,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.33948105573654175,
+      "learning_rate": 0.0005333756922180122,
+      "loss": 3.5155,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.30776116251945496,
+      "learning_rate": 0.0005332008160886039,
+      "loss": 3.5019,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.42596328258514404,
+      "learning_rate": 0.0005330259399591956,
+      "loss": 3.5311,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.34155476093292236,
+      "learning_rate": 0.0005328510638297873,
+      "loss": 3.5215,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.33613601326942444,
+      "learning_rate": 0.0005326761877003788,
+      "loss": 3.5089,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.32420244812965393,
+      "learning_rate": 0.0005325013115709705,
+      "loss": 3.5318,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.30287185311317444,
+      "learning_rate": 0.0005323264354415622,
+      "loss": 3.5207,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.3400566577911377,
+      "learning_rate": 0.0005321515593121538,
+      "loss": 3.5213,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.31341907382011414,
+      "learning_rate": 0.0005319766831827455,
+      "loss": 3.5249,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.3233737647533417,
+      "learning_rate": 0.0005318018070533372,
+      "loss": 3.5391,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.3121044635772705,
+      "learning_rate": 0.0005316269309239288,
+      "loss": 3.5232,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.32169246673583984,
+      "learning_rate": 0.0005314520547945206,
+      "loss": 3.5159,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": 0.3152818977832794,
+      "learning_rate": 0.0005312771786651121,
+      "loss": 3.5194,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.31117457151412964,
+      "learning_rate": 0.0005311023025357038,
+      "loss": 3.5284,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3282223641872406,
+      "learning_rate": 0.0005309274264062955,
+      "loss": 3.526,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.3373311161994934,
+      "learning_rate": 0.0005307525502768872,
+      "loss": 3.5291,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.3154096007347107,
+      "learning_rate": 0.0005305776741474788,
+      "loss": 3.53,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.31639188528060913,
+      "learning_rate": 0.0005304027980180705,
+      "loss": 3.5162,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.3622192215829698,
+      "eval_loss": 3.612488269805908,
+      "eval_runtime": 182.8981,
+      "eval_samples_per_second": 91.007,
+      "eval_steps_per_second": 5.692,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.33335795998573303,
+      "learning_rate": 0.0005302279218886622,
+      "loss": 3.5267,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.32977914810180664,
+      "learning_rate": 0.0005300530457592538,
+      "loss": 3.5278,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.319991797208786,
+      "learning_rate": 0.0005298781696298456,
+      "loss": 3.5243,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.31696072220802307,
+      "learning_rate": 0.0005297032935004371,
+      "loss": 3.5255,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.31803423166275024,
+      "learning_rate": 0.0005295284173710288,
+      "loss": 3.5163,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.30268293619155884,
+      "learning_rate": 0.0005293535412416205,
+      "loss": 3.5247,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.3222343325614929,
+      "learning_rate": 0.0005291786651122121,
+      "loss": 3.5226,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.3210581839084625,
+      "learning_rate": 0.0005290037889828038,
+      "loss": 3.5315,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.30163270235061646,
+      "learning_rate": 0.0005288289128533955,
+      "loss": 3.531,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.3174237906932831,
+      "learning_rate": 0.0005286540367239872,
+      "loss": 3.5308,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.3105069398880005,
+      "learning_rate": 0.0005284791605945788,
+      "loss": 3.5229,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.35065528750419617,
+      "learning_rate": 0.0005283042844651704,
+      "loss": 3.5306,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.3209701180458069,
+      "learning_rate": 0.0005281294083357621,
+      "loss": 3.4202,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.35732001066207886,
+      "learning_rate": 0.0005279545322063538,
+      "loss": 3.4097,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.3293222188949585,
+      "learning_rate": 0.0005277796560769455,
+      "loss": 3.4211,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.3226439952850342,
+      "learning_rate": 0.0005276047799475371,
+      "loss": 3.4286,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.33143454790115356,
+      "learning_rate": 0.0005274299038181288,
+      "loss": 3.4274,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.31629180908203125,
+      "learning_rate": 0.0005272550276887205,
+      "loss": 3.4214,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.3248637020587921,
+      "learning_rate": 0.0005270801515593121,
+      "loss": 3.448,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.333915114402771,
+      "learning_rate": 0.0005269052754299037,
+      "loss": 3.4304,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.36297096205497675,
+      "eval_loss": 3.6168880462646484,
+      "eval_runtime": 183.1564,
+      "eval_samples_per_second": 90.879,
+      "eval_steps_per_second": 5.684,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.339853435754776,
+      "learning_rate": 0.0005267303993004954,
+      "loss": 3.432,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.3413807153701782,
+      "learning_rate": 0.000526555523171087,
+      "loss": 3.4503,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.3138124942779541,
+      "learning_rate": 0.0005263806470416788,
+      "loss": 3.4536,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.3247540593147278,
+      "learning_rate": 0.0005262057709122704,
+      "loss": 3.4454,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.30432644486427307,
+      "learning_rate": 0.0005260308947828621,
+      "loss": 3.4577,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.3186846077442169,
+      "learning_rate": 0.0005258560186534538,
+      "loss": 3.4504,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.3390951454639435,
+      "learning_rate": 0.0005256811425240455,
+      "loss": 3.4427,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.3416147828102112,
+      "learning_rate": 0.0005255062663946371,
+      "loss": 3.4451,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.330108106136322,
+      "learning_rate": 0.0005253313902652287,
+      "loss": 3.4634,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.32614409923553467,
+      "learning_rate": 0.0005251565141358204,
+      "loss": 3.4689,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.3183272182941437,
+      "learning_rate": 0.000524981638006412,
+      "loss": 3.4614,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.3260941803455353,
+      "learning_rate": 0.0005248067618770038,
+      "loss": 3.4687,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.34689396619796753,
+      "learning_rate": 0.0005246318857475954,
+      "loss": 3.4607,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.3259231150150299,
+      "learning_rate": 0.0005244570096181871,
+      "loss": 3.4606,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.33066433668136597,
+      "learning_rate": 0.0005242821334887788,
+      "loss": 3.4666,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.3282213807106018,
+      "learning_rate": 0.0005241072573593704,
+      "loss": 3.4566,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.32075440883636475,
+      "learning_rate": 0.000523932381229962,
+      "loss": 3.4641,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.3442796468734741,
+      "learning_rate": 0.0005237575051005537,
+      "loss": 3.4588,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.3072233498096466,
+      "learning_rate": 0.0005235826289711454,
+      "loss": 3.462,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.35822081565856934,
+      "learning_rate": 0.000523407752841737,
+      "loss": 3.4737,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.3631809414861629,
+      "eval_loss": 3.6075851917266846,
+      "eval_runtime": 183.2532,
+      "eval_samples_per_second": 90.831,
+      "eval_steps_per_second": 5.681,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.3440370559692383,
+      "learning_rate": 0.0005232328767123287,
+      "loss": 3.4619,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.3295625150203705,
+      "learning_rate": 0.0005230580005829204,
+      "loss": 3.4756,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.32679641246795654,
+      "learning_rate": 0.0005228831244535121,
+      "loss": 3.4795,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.3422777056694031,
+      "learning_rate": 0.0005227082483241038,
+      "loss": 3.4777,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.3188982903957367,
+      "learning_rate": 0.0005225333721946954,
+      "loss": 3.4789,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.33180058002471924,
+      "learning_rate": 0.000522358496065287,
+      "loss": 3.4764,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.35259467363357544,
+      "learning_rate": 0.0005221836199358787,
+      "loss": 3.4783,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.31127235293388367,
+      "learning_rate": 0.0005220087438064703,
+      "loss": 3.4812,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.3426322340965271,
+      "learning_rate": 0.000521833867677062,
+      "loss": 3.4858,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.33666297793388367,
+      "learning_rate": 0.0005216589915476537,
+      "loss": 3.4842,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.33777403831481934,
+      "learning_rate": 0.0005214841154182454,
+      "loss": 3.4867,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.33145034313201904,
+      "learning_rate": 0.0005213092392888371,
+      "loss": 3.4843,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.3483746647834778,
+      "learning_rate": 0.0005211343631594287,
+      "loss": 3.4842,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.34439757466316223,
+      "learning_rate": 0.0005209594870300204,
+      "loss": 3.4747,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.330509752035141,
+      "learning_rate": 0.000520784610900612,
+      "loss": 3.4757,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.3516590893268585,
+      "learning_rate": 0.0005206097347712037,
+      "loss": 3.4689,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.35408666729927063,
+      "learning_rate": 0.0005204348586417953,
+      "loss": 3.4825,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.35224005579948425,
+      "learning_rate": 0.000520259982512387,
+      "loss": 3.4933,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.33899161219596863,
+      "learning_rate": 0.0005200851063829787,
+      "loss": 3.4857,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.3293270170688629,
+      "learning_rate": 0.0005199102302535703,
+      "loss": 3.4775,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.3640601274807935,
+      "eval_loss": 3.599479913711548,
+      "eval_runtime": 183.341,
+      "eval_samples_per_second": 90.787,
+      "eval_steps_per_second": 5.678,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.32254496216773987,
+      "learning_rate": 0.0005197353541241621,
+      "loss": 3.4873,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.32425615191459656,
+      "learning_rate": 0.0005195604779947537,
+      "loss": 3.4951,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.3163357675075531,
+      "learning_rate": 0.0005193856018653454,
+      "loss": 3.4845,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.3210630416870117,
+      "learning_rate": 0.000519210725735937,
+      "loss": 3.4869,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.33914175629615784,
+      "learning_rate": 0.0005190358496065286,
+      "loss": 3.4848,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.3026992082595825,
+      "learning_rate": 0.0005188609734771203,
+      "loss": 3.4891,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.3406318724155426,
+      "learning_rate": 0.000518686097347712,
+      "loss": 3.4859,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.3310200273990631,
+      "learning_rate": 0.0005185112212183037,
+      "loss": 3.5022,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.30199411511421204,
+      "learning_rate": 0.0005183363450888953,
+      "loss": 3.4872,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.3239602744579315,
+      "learning_rate": 0.000518161468959487,
+      "loss": 3.4867,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.32365167140960693,
+      "learning_rate": 0.0005179865928300787,
+      "loss": 3.4923,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.31722819805145264,
+      "learning_rate": 0.0005178117167006703,
+      "loss": 3.4779,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.3375761806964874,
+      "learning_rate": 0.000517636840571262,
+      "loss": 3.4864,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.30887332558631897,
+      "learning_rate": 0.0005174619644418536,
+      "loss": 3.4974,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.34449508786201477,
+      "learning_rate": 0.0005172870883124453,
+      "loss": 3.4916,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.3328838348388672,
+      "learning_rate": 0.000517112212183037,
+      "loss": 3.4975,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.33597683906555176,
+      "learning_rate": 0.0005169373360536286,
+      "loss": 3.4857,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.3501330316066742,
+      "learning_rate": 0.0005167624599242203,
+      "loss": 3.4972,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.3559366762638092,
+      "learning_rate": 0.000516587583794812,
+      "loss": 3.4917,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.3154732286930084,
+      "learning_rate": 0.0005164127076654037,
+      "loss": 3.49,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.36516716349649847,
+      "eval_loss": 3.586782932281494,
+      "eval_runtime": 182.9453,
+      "eval_samples_per_second": 90.983,
+      "eval_steps_per_second": 5.69,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.3098287880420685,
+      "learning_rate": 0.0005162378315359953,
+      "loss": 3.4496,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.34997889399528503,
+      "learning_rate": 0.0005160629554065869,
+      "loss": 3.3794,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.33267155289649963,
+      "learning_rate": 0.0005158880792771786,
+      "loss": 3.3792,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.3368155360221863,
+      "learning_rate": 0.0005157132031477703,
+      "loss": 3.3876,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.33117562532424927,
+      "learning_rate": 0.000515538327018362,
+      "loss": 3.4,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.3448326587677002,
+      "learning_rate": 0.0005153634508889536,
+      "loss": 3.4042,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.3393695652484894,
+      "learning_rate": 0.0005151885747595453,
+      "loss": 3.4025,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.32103875279426575,
+      "learning_rate": 0.000515013698630137,
+      "loss": 3.3923,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.32782039046287537,
+      "learning_rate": 0.0005148388225007285,
+      "loss": 3.4004,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.321836918592453,
+      "learning_rate": 0.0005146639463713203,
+      "loss": 3.4083,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.3271026015281677,
+      "learning_rate": 0.0005144890702419119,
+      "loss": 3.4261,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.32715457677841187,
+      "learning_rate": 0.0005143141941125036,
+      "loss": 3.4129,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.3357333838939667,
+      "learning_rate": 0.0005141393179830953,
+      "loss": 3.4146,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.3631840646266937,
+      "learning_rate": 0.0005139644418536869,
+      "loss": 3.4181,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.3274785578250885,
+      "learning_rate": 0.0005137895657242786,
+      "loss": 3.4125,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.3449844419956207,
+      "learning_rate": 0.0005136146895948703,
+      "loss": 3.4166,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.32263556122779846,
+      "learning_rate": 0.000513439813465462,
+      "loss": 3.4285,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.3413465619087219,
+      "learning_rate": 0.0005132649373360535,
+      "loss": 3.4285,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.3473232388496399,
+      "learning_rate": 0.0005130900612066452,
+      "loss": 3.4347,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.32374686002731323,
+      "learning_rate": 0.0005129151850772369,
+      "loss": 3.4352,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.3647126391510529,
+      "eval_loss": 3.598836898803711,
+      "eval_runtime": 183.3183,
+      "eval_samples_per_second": 90.798,
+      "eval_steps_per_second": 5.679,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.3352644443511963,
+      "learning_rate": 0.0005127403089478286,
+      "loss": 3.4219,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.36998191475868225,
+      "learning_rate": 0.0005125654328184203,
+      "loss": 3.4354,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.33111071586608887,
+      "learning_rate": 0.0005123905566890119,
+      "loss": 3.4273,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.3304993212223053,
+      "learning_rate": 0.0005122156805596036,
+      "loss": 3.4285,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.34959882497787476,
+      "learning_rate": 0.0005120408044301953,
+      "loss": 3.4227,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.36705702543258667,
+      "learning_rate": 0.0005118659283007868,
+      "loss": 3.4247,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.3186073899269104,
+      "learning_rate": 0.0005116910521713785,
+      "loss": 3.4439,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.3507457971572876,
+      "learning_rate": 0.0005115161760419702,
+      "loss": 3.4373,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.3345440626144409,
+      "learning_rate": 0.0005113412999125619,
+      "loss": 3.4429,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.3337697684764862,
+      "learning_rate": 0.0005111664237831536,
+      "loss": 3.4366,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.3243204653263092,
+      "learning_rate": 0.0005109915476537452,
+      "loss": 3.451,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.34631916880607605,
+      "learning_rate": 0.0005108166715243369,
+      "loss": 3.4431,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.31301483511924744,
+      "learning_rate": 0.0005106417953949286,
+      "loss": 3.436,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.3258545994758606,
+      "learning_rate": 0.0005104669192655203,
+      "loss": 3.4431,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.3321167230606079,
+      "learning_rate": 0.0005102920431361118,
+      "loss": 3.4506,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.3389933407306671,
+      "learning_rate": 0.0005101171670067035,
+      "loss": 3.4348,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.34453973174095154,
+      "learning_rate": 0.0005099422908772952,
+      "loss": 3.43,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.3527055084705353,
+      "learning_rate": 0.0005097674147478868,
+      "loss": 3.4518,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.33843597769737244,
+      "learning_rate": 0.0005095925386184786,
+      "loss": 3.4435,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.3532460927963257,
+      "learning_rate": 0.0005094176624890702,
+      "loss": 3.4508,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.36540406638218725,
+      "eval_loss": 3.589508056640625,
+      "eval_runtime": 183.429,
+      "eval_samples_per_second": 90.744,
+      "eval_steps_per_second": 5.675,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.33581236004829407,
+      "learning_rate": 0.0005092427863596619,
+      "loss": 3.4393,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.31811484694480896,
+      "learning_rate": 0.0005090679102302536,
+      "loss": 3.4466,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.34038013219833374,
+      "learning_rate": 0.0005088930341008451,
+      "loss": 3.4549,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.314603716135025,
+      "learning_rate": 0.0005087181579714368,
+      "loss": 3.4585,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.3383300006389618,
+      "learning_rate": 0.0005085432818420285,
+      "loss": 3.4489,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.32058727741241455,
+      "learning_rate": 0.0005083684057126202,
+      "loss": 3.4562,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.3361349105834961,
+      "learning_rate": 0.0005081935295832118,
+      "loss": 3.4513,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.3190141022205353,
+      "learning_rate": 0.0005080186534538035,
+      "loss": 3.4377,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.32522907853126526,
+      "learning_rate": 0.0005078437773243952,
+      "loss": 3.4548,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.3320629596710205,
+      "learning_rate": 0.0005076689011949869,
+      "loss": 3.4482,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.3172106146812439,
+      "learning_rate": 0.0005074940250655786,
+      "loss": 3.4546,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.339695006608963,
+      "learning_rate": 0.0005073191489361701,
+      "loss": 3.456,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.3172706365585327,
+      "learning_rate": 0.0005071442728067618,
+      "loss": 3.4525,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.3277393877506256,
+      "learning_rate": 0.0005069693966773535,
+      "loss": 3.4459,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.3323872685432434,
+      "learning_rate": 0.0005067945205479451,
+      "loss": 3.4475,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.339382529258728,
+      "learning_rate": 0.0005066196444185368,
+      "loss": 3.4398,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.33565738797187805,
+      "learning_rate": 0.0005064447682891285,
+      "loss": 3.4522,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.33060815930366516,
+      "learning_rate": 0.0005062698921597202,
+      "loss": 3.4555,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.3253958225250244,
+      "learning_rate": 0.0005060950160303119,
+      "loss": 3.4451,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.3636493384838104,
+      "learning_rate": 0.0005059201399009035,
+      "loss": 3.4523,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.36607174453991753,
+      "eval_loss": 3.5809550285339355,
+      "eval_runtime": 183.0826,
+      "eval_samples_per_second": 90.915,
+      "eval_steps_per_second": 5.686,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.3568457365036011,
+      "learning_rate": 0.0005057452637714951,
+      "loss": 3.4527,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.3288189172744751,
+      "learning_rate": 0.0005055703876420868,
+      "loss": 3.4552,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.29840394854545593,
+      "learning_rate": 0.0005053955115126785,
+      "loss": 3.4563,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.37738728523254395,
+      "learning_rate": 0.0005052206353832701,
+      "loss": 3.4605,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.3255780339241028,
+      "learning_rate": 0.0005050457592538618,
+      "loss": 3.4607,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.3199935853481293,
+      "learning_rate": 0.0005048708831244535,
+      "loss": 3.4621,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.34419822692871094,
+      "learning_rate": 0.0005046960069950451,
+      "loss": 3.4547,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.31108030676841736,
+      "learning_rate": 0.0005045211308656369,
+      "loss": 3.4622,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.3241368532180786,
+      "learning_rate": 0.0005043462547362284,
+      "loss": 3.4599,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.34238871932029724,
+      "learning_rate": 0.0005041713786068201,
+      "loss": 3.3724,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.3241780400276184,
+      "learning_rate": 0.0005039965024774118,
+      "loss": 3.3516,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.33774638175964355,
+      "learning_rate": 0.0005038216263480034,
+      "loss": 3.3319,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.33839425444602966,
+      "learning_rate": 0.0005036467502185951,
+      "loss": 3.3643,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.33138301968574524,
+      "learning_rate": 0.0005034718740891868,
+      "loss": 3.3576,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.331544429063797,
+      "learning_rate": 0.0005032969979597785,
+      "loss": 3.3717,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.3336641192436218,
+      "learning_rate": 0.0005031221218303701,
+      "loss": 3.3637,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.3461131453514099,
+      "learning_rate": 0.0005029472457009618,
+      "loss": 3.367,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.3179863691329956,
+      "learning_rate": 0.0005027723695715534,
+      "loss": 3.3618,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.3158666491508484,
+      "learning_rate": 0.0005025974934421451,
+      "loss": 3.3798,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.34379681944847107,
+      "learning_rate": 0.0005024226173127368,
+      "loss": 3.3762,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.3662919525324213,
+      "eval_loss": 3.588055372238159,
+      "eval_runtime": 182.9945,
+      "eval_samples_per_second": 90.959,
+      "eval_steps_per_second": 5.689,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.3444662392139435,
+      "learning_rate": 0.0005022477411833284,
+      "loss": 3.3854,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.3341796398162842,
+      "learning_rate": 0.0005020728650539201,
+      "loss": 3.3786,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.3741398751735687,
+      "learning_rate": 0.0005018979889245118,
+      "loss": 3.3783,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.329504132270813,
+      "learning_rate": 0.0005017231127951034,
+      "loss": 3.3892,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.37047260999679565,
+      "learning_rate": 0.0005015482366656951,
+      "loss": 3.3807,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.35633140802383423,
+      "learning_rate": 0.0005013733605362868,
+      "loss": 3.392,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.39560022950172424,
+      "learning_rate": 0.0005011984844068784,
+      "loss": 3.3931,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.3520383834838867,
+      "learning_rate": 0.0005010236082774701,
+      "loss": 3.404,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.3606109023094177,
+      "learning_rate": 0.0005008487321480617,
+      "loss": 3.3849,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.35454607009887695,
+      "learning_rate": 0.0005006738560186534,
+      "loss": 3.3959,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.3578483462333679,
+      "learning_rate": 0.0005004989798892451,
+      "loss": 3.4044,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.36204418540000916,
+      "learning_rate": 0.0005003241037598368,
+      "loss": 3.3994,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.35746678709983826,
+      "learning_rate": 0.0005001492276304284,
+      "loss": 3.3966,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.32434630393981934,
+      "learning_rate": 0.0004999743515010201,
+      "loss": 3.408,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.3382759094238281,
+      "learning_rate": 0.0004997994753716117,
+      "loss": 3.3965,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.3254185616970062,
+      "learning_rate": 0.0004996245992422033,
+      "loss": 3.4115,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.3328564763069153,
+      "learning_rate": 0.0004994497231127951,
+      "loss": 3.3962,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.3341105580329895,
+      "learning_rate": 0.0004992748469833867,
+      "loss": 3.4067,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.3595927655696869,
+      "learning_rate": 0.0004990999708539784,
+      "loss": 3.4093,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.32767578959465027,
+      "learning_rate": 0.0004989250947245701,
+      "loss": 3.4033,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.3668947322321366,
+      "eval_loss": 3.5828919410705566,
+      "eval_runtime": 183.1656,
+      "eval_samples_per_second": 90.874,
+      "eval_steps_per_second": 5.683,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.335256963968277,
+      "learning_rate": 0.0004987502185951617,
+      "loss": 3.4103,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.328800767660141,
+      "learning_rate": 0.0004985753424657534,
+      "loss": 3.4104,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.3295251429080963,
+      "learning_rate": 0.000498400466336345,
+      "loss": 3.4066,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.3609812259674072,
+      "learning_rate": 0.0004982255902069367,
+      "loss": 3.4191,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.353671669960022,
+      "learning_rate": 0.0004980507140775283,
+      "loss": 3.4129,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.32933372259140015,
+      "learning_rate": 0.0004978758379481201,
+      "loss": 3.423,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.32520967721939087,
+      "learning_rate": 0.0004977009618187117,
+      "loss": 3.4124,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.3547916114330292,
+      "learning_rate": 0.0004975260856893034,
+      "loss": 3.4198,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.34057968854904175,
+      "learning_rate": 0.0004973512095598951,
+      "loss": 3.4191,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.33685600757598877,
+      "learning_rate": 0.0004971763334304867,
+      "loss": 3.4209,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.34290191531181335,
+      "learning_rate": 0.0004970014573010784,
+      "loss": 3.4257,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.311635822057724,
+      "learning_rate": 0.00049682658117167,
+      "loss": 3.4108,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.3206999599933624,
+      "learning_rate": 0.0004966517050422616,
+      "loss": 3.4277,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.378736674785614,
+      "learning_rate": 0.0004964768289128533,
+      "loss": 3.4192,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.35404956340789795,
+      "learning_rate": 0.000496301952783445,
+      "loss": 3.4304,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.33619192242622375,
+      "learning_rate": 0.0004961270766540367,
+      "loss": 3.4157,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.3306516110897064,
+      "learning_rate": 0.0004959522005246284,
+      "loss": 3.4257,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.314596563577652,
+      "learning_rate": 0.00049577732439522,
+      "loss": 3.4326,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.3481888771057129,
+      "learning_rate": 0.0004956024482658117,
+      "loss": 3.4345,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.33885663747787476,
+      "learning_rate": 0.0004954275721364034,
+      "loss": 3.4225,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.3673473754628571,
+      "eval_loss": 3.574705123901367,
+      "eval_runtime": 183.0508,
+      "eval_samples_per_second": 90.931,
+      "eval_steps_per_second": 5.687,
+      "step": 30000
+    },
+    {
+      "epoch": 8.753320904218132,
+      "grad_norm": 0.37235426902770996,
+      "learning_rate": 0.000495252696006995,
+      "loss": 3.4281,
+      "step": 30050
+    },
+    {
+      "epoch": 8.767886273595899,
+      "grad_norm": 0.3520287573337555,
+      "learning_rate": 0.0004950778198775866,
+      "loss": 3.4421,
+      "step": 30100
+    },
+    {
+      "epoch": 8.782451642973665,
+      "grad_norm": 0.33238449692726135,
+      "learning_rate": 0.0004949029437481783,
+      "loss": 3.4196,
+      "step": 30150
+    },
+    {
+      "epoch": 8.797017012351434,
+      "grad_norm": 0.3224245607852936,
+      "learning_rate": 0.00049472806761877,
+      "loss": 3.4336,
+      "step": 30200
+    },
+    {
+      "epoch": 8.8115823817292,
+      "grad_norm": 0.32464221119880676,
+      "learning_rate": 0.0004945531914893616,
+      "loss": 3.4363,
+      "step": 30250
+    },
+    {
+      "epoch": 8.826147751106967,
+      "grad_norm": 0.3441407382488251,
+      "learning_rate": 0.0004943783153599534,
+      "loss": 3.4194,
+      "step": 30300
+    },
+    {
+      "epoch": 8.840713120484736,
+      "grad_norm": 0.33133164048194885,
+      "learning_rate": 0.000494203439230545,
+      "loss": 3.4289,
+      "step": 30350
+    },
+    {
+      "epoch": 8.855278489862503,
+      "grad_norm": 0.38068103790283203,
+      "learning_rate": 0.0004940285631011367,
+      "loss": 3.4221,
+      "step": 30400
+    },
+    {
+      "epoch": 8.86984385924027,
+      "grad_norm": 0.3354259133338928,
+      "learning_rate": 0.0004938536869717284,
+      "loss": 3.4192,
+      "step": 30450
+    },
+    {
+      "epoch": 8.884409228618038,
+      "grad_norm": 0.3407922685146332,
+      "learning_rate": 0.0004936788108423199,
+      "loss": 3.4377,
+      "step": 30500
+    },
+    {
+      "epoch": 8.898974597995805,
+      "grad_norm": 0.31988245248794556,
+      "learning_rate": 0.0004935039347129116,
+      "loss": 3.4418,
+      "step": 30550
+    },
+    {
+      "epoch": 8.913539967373573,
+      "grad_norm": 0.33597180247306824,
+      "learning_rate": 0.0004933290585835033,
+      "loss": 3.4441,
+      "step": 30600
+    },
+    {
+      "epoch": 8.92810533675134,
+      "grad_norm": 0.32772478461265564,
+      "learning_rate": 0.000493154182454095,
+      "loss": 3.4356,
+      "step": 30650
+    },
+    {
+      "epoch": 8.942670706129107,
+      "grad_norm": 0.3514624536037445,
+      "learning_rate": 0.0004929793063246866,
+      "loss": 3.4297,
+      "step": 30700
+    },
+    {
+      "epoch": 8.957236075506875,
+      "grad_norm": 0.38539716601371765,
+      "learning_rate": 0.0004928044301952783,
+      "loss": 3.4521,
+      "step": 30750
+    },
+    {
+      "epoch": 8.971801444884642,
+      "grad_norm": 0.33038049936294556,
+      "learning_rate": 0.00049262955406587,
+      "loss": 3.4149,
+      "step": 30800
+    },
+    {
+      "epoch": 8.986366814262409,
+      "grad_norm": 0.3251229226589203,
+      "learning_rate": 0.0004924546779364617,
+      "loss": 3.4331,
+      "step": 30850
+    },
+    {
+      "epoch": 9.000873922162667,
+      "grad_norm": 0.3207502067089081,
+      "learning_rate": 0.0004922798018070533,
+      "loss": 3.4394,
+      "step": 30900
+    },
+    {
+      "epoch": 9.015439291540433,
+      "grad_norm": 0.3693355321884155,
+      "learning_rate": 0.0004921049256776449,
+      "loss": 3.3291,
+      "step": 30950
+    },
+    {
+      "epoch": 9.0300046609182,
+      "grad_norm": 0.3489963114261627,
+      "learning_rate": 0.0004919300495482366,
+      "loss": 3.3258,
+      "step": 31000
+    },
+    {
+      "epoch": 9.0300046609182,
+      "eval_accuracy": 0.3673397334342865,
+      "eval_loss": 3.5807602405548096,
+      "eval_runtime": 182.9792,
+      "eval_samples_per_second": 90.967,
+      "eval_steps_per_second": 5.689,
+      "step": 31000
+    },
+    {
+      "epoch": 9.044570030295969,
+      "grad_norm": 0.3296785056591034,
+      "learning_rate": 0.0004917551734188283,
+      "loss": 3.3114,
+      "step": 31050
+    },
+    {
+      "epoch": 9.059135399673735,
+      "grad_norm": 0.3842584788799286,
+      "learning_rate": 0.0004915802972894199,
+      "loss": 3.3355,
+      "step": 31100
+    },
+    {
+      "epoch": 9.073700769051504,
+      "grad_norm": 0.3418821394443512,
+      "learning_rate": 0.0004914054211600116,
+      "loss": 3.3305,
+      "step": 31150
+    },
+    {
+      "epoch": 9.08826613842927,
+      "grad_norm": 0.3381640911102295,
+      "learning_rate": 0.0004912305450306033,
+      "loss": 3.3356,
+      "step": 31200
+    },
+    {
+      "epoch": 9.102831507807037,
+      "grad_norm": 0.33703309297561646,
+      "learning_rate": 0.000491055668901195,
+      "loss": 3.3429,
+      "step": 31250
+    },
+    {
+      "epoch": 9.117396877184806,
+      "grad_norm": 0.33812689781188965,
+      "learning_rate": 0.0004908807927717865,
+      "loss": 3.3502,
+      "step": 31300
+    },
+    {
+      "epoch": 9.131962246562573,
+      "grad_norm": 0.3623238801956177,
+      "learning_rate": 0.0004907059166423783,
+      "loss": 3.3456,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14652761594034,
+      "grad_norm": 0.337973415851593,
+      "learning_rate": 0.0004905310405129699,
+      "loss": 3.3457,
+      "step": 31400
+    },
+    {
+      "epoch": 9.161092985318108,
+      "grad_norm": 0.3395603597164154,
+      "learning_rate": 0.0004903561643835616,
+      "loss": 3.3571,
+      "step": 31450
+    },
+    {
+      "epoch": 9.175658354695875,
+      "grad_norm": 0.35803207755088806,
+      "learning_rate": 0.0004901812882541533,
+      "loss": 3.3575,
+      "step": 31500
+    },
+    {
+      "epoch": 9.190223724073643,
+      "grad_norm": 0.3479365110397339,
+      "learning_rate": 0.0004900064121247449,
+      "loss": 3.3484,
+      "step": 31550
+    },
+    {
+      "epoch": 9.20478909345141,
+      "grad_norm": 0.3644011616706848,
+      "learning_rate": 0.0004898315359953366,
+      "loss": 3.3551,
+      "step": 31600
+    },
+    {
+      "epoch": 9.219354462829177,
+      "grad_norm": 0.36317089200019836,
+      "learning_rate": 0.0004896566598659283,
+      "loss": 3.3621,
+      "step": 31650
+    },
+    {
+      "epoch": 9.233919832206945,
+      "grad_norm": 0.33678680658340454,
+      "learning_rate": 0.0004894817837365199,
+      "loss": 3.3595,
+      "step": 31700
+    },
+    {
+      "epoch": 9.248485201584712,
+      "grad_norm": 0.3474813997745514,
+      "learning_rate": 0.0004893069076071115,
+      "loss": 3.3684,
+      "step": 31750
+    },
+    {
+      "epoch": 9.263050570962479,
+      "grad_norm": 0.35735565423965454,
+      "learning_rate": 0.0004891320314777032,
+      "loss": 3.3718,
+      "step": 31800
+    },
+    {
+      "epoch": 9.277615940340247,
+      "grad_norm": 0.3313765227794647,
+      "learning_rate": 0.0004889571553482949,
+      "loss": 3.3739,
+      "step": 31850
+    },
+    {
+      "epoch": 9.292181309718014,
+      "grad_norm": 0.33610573410987854,
+      "learning_rate": 0.0004887822792188866,
+      "loss": 3.375,
+      "step": 31900
+    },
+    {
+      "epoch": 9.306746679095783,
+      "grad_norm": 0.32763785123825073,
+      "learning_rate": 0.0004886074030894782,
+      "loss": 3.3646,
+      "step": 31950
+    },
+    {
+      "epoch": 9.32131204847355,
+      "grad_norm": 0.3415217995643616,
+      "learning_rate": 0.0004884325269600699,
+      "loss": 3.3753,
+      "step": 32000
+    },
+    {
+      "epoch": 9.32131204847355,
+      "eval_accuracy": 0.3678076607221482,
+      "eval_loss": 3.5773885250091553,
+      "eval_runtime": 182.9174,
+      "eval_samples_per_second": 90.997,
+      "eval_steps_per_second": 5.691,
+      "step": 32000
+    },
+    {
+      "epoch": 9.335877417851316,
+      "grad_norm": 0.3389699459075928,
+      "learning_rate": 0.0004882576508306615,
+      "loss": 3.3737,
+      "step": 32050
+    },
+    {
+      "epoch": 9.350442787229085,
+      "grad_norm": 0.3345525860786438,
+      "learning_rate": 0.00048808277470125327,
+      "loss": 3.3696,
+      "step": 32100
+    },
+    {
+      "epoch": 9.365008156606851,
+      "grad_norm": 0.3717873692512512,
+      "learning_rate": 0.0004879078985718449,
+      "loss": 3.3813,
+      "step": 32150
+    },
+    {
+      "epoch": 9.379573525984618,
+      "grad_norm": 0.3748685419559479,
+      "learning_rate": 0.0004877330224424366,
+      "loss": 3.3806,
+      "step": 32200
+    },
+    {
+      "epoch": 9.394138895362387,
+      "grad_norm": 0.34190633893013,
+      "learning_rate": 0.00048755814631302823,
+      "loss": 3.3887,
+      "step": 32250
+    },
+    {
+      "epoch": 9.408704264740154,
+      "grad_norm": 0.3739558756351471,
+      "learning_rate": 0.00048738327018361987,
+      "loss": 3.3818,
+      "step": 32300
+    },
+    {
+      "epoch": 9.423269634117922,
+      "grad_norm": 0.3366689085960388,
+      "learning_rate": 0.00048720839405421156,
+      "loss": 3.3763,
+      "step": 32350
+    },
+    {
+      "epoch": 9.437835003495689,
+      "grad_norm": 0.335202157497406,
+      "learning_rate": 0.0004870335179248032,
+      "loss": 3.3885,
+      "step": 32400
+    },
+    {
+      "epoch": 9.452400372873456,
+      "grad_norm": 0.35205066204071045,
+      "learning_rate": 0.0004868586417953949,
+      "loss": 3.3892,
+      "step": 32450
+    },
+    {
+      "epoch": 9.466965742251224,
+      "grad_norm": 0.3260645270347595,
+      "learning_rate": 0.0004866837656659865,
+      "loss": 3.368,
+      "step": 32500
+    },
+    {
+      "epoch": 9.48153111162899,
+      "grad_norm": 0.35463911294937134,
+      "learning_rate": 0.00048650888953657816,
+      "loss": 3.4004,
+      "step": 32550
+    },
+    {
+      "epoch": 9.496096481006758,
+      "grad_norm": 0.3363436162471771,
+      "learning_rate": 0.0004863340134071699,
+      "loss": 3.3733,
+      "step": 32600
+    },
+    {
+      "epoch": 9.510661850384526,
+      "grad_norm": 0.32455047965049744,
+      "learning_rate": 0.00048615913727776154,
+      "loss": 3.3817,
+      "step": 32650
+    },
+    {
+      "epoch": 9.525227219762293,
+      "grad_norm": 0.383531779050827,
+      "learning_rate": 0.00048598426114835323,
+      "loss": 3.3819,
+      "step": 32700
+    },
+    {
+      "epoch": 9.53979258914006,
+      "grad_norm": 0.3713645935058594,
+      "learning_rate": 0.00048580938501894486,
+      "loss": 3.3917,
+      "step": 32750
+    },
+    {
+      "epoch": 9.554357958517828,
+      "grad_norm": 0.3357818126678467,
+      "learning_rate": 0.00048563450888953655,
+      "loss": 3.3873,
+      "step": 32800
+    },
+    {
+      "epoch": 9.568923327895595,
+      "grad_norm": 0.3960472047328949,
+      "learning_rate": 0.0004854596327601282,
+      "loss": 3.4062,
+      "step": 32850
+    },
+    {
+      "epoch": 9.583488697273363,
+      "grad_norm": 0.33174315094947815,
+      "learning_rate": 0.0004852847566307198,
+      "loss": 3.3952,
+      "step": 32900
+    },
+    {
+      "epoch": 9.59805406665113,
+      "grad_norm": 0.33647504448890686,
+      "learning_rate": 0.0004851098805013115,
+      "loss": 3.3959,
+      "step": 32950
+    },
+    {
+      "epoch": 9.612619436028897,
+      "grad_norm": 0.32862532138824463,
+      "learning_rate": 0.00048493500437190315,
+      "loss": 3.397,
+      "step": 33000
+    },
+    {
+      "epoch": 9.612619436028897,
+      "eval_accuracy": 0.3678878432373044,
+      "eval_loss": 3.5697684288024902,
+      "eval_runtime": 182.9346,
+      "eval_samples_per_second": 90.989,
+      "eval_steps_per_second": 5.691,
+      "step": 33000
+    },
+    {
+      "epoch": 9.627184805406666,
+      "grad_norm": 0.33516356348991394,
+      "learning_rate": 0.0004847601282424949,
+      "loss": 3.3865,
+      "step": 33050
+    },
+    {
+      "epoch": 9.641750174784432,
+      "grad_norm": 0.3567980229854584,
+      "learning_rate": 0.00048458525211308653,
+      "loss": 3.401,
+      "step": 33100
+    },
+    {
+      "epoch": 9.6563155441622,
+      "grad_norm": 0.37242189049720764,
+      "learning_rate": 0.00048441037598367817,
+      "loss": 3.3982,
+      "step": 33150
+    },
+    {
+      "epoch": 9.670880913539968,
+      "grad_norm": 0.33064374327659607,
+      "learning_rate": 0.00048423549985426986,
+      "loss": 3.393,
+      "step": 33200
+    },
+    {
+      "epoch": 9.685446282917734,
+      "grad_norm": 0.3263695538043976,
+      "learning_rate": 0.0004840606237248615,
+      "loss": 3.396,
+      "step": 33250
+    },
+    {
+      "epoch": 9.700011652295503,
+      "grad_norm": 0.3375985622406006,
+      "learning_rate": 0.0004838857475954532,
+      "loss": 3.3941,
+      "step": 33300
+    },
+    {
+      "epoch": 9.71457702167327,
+      "grad_norm": 0.3303999900817871,
+      "learning_rate": 0.0004837108714660448,
+      "loss": 3.4037,
+      "step": 33350
+    },
+    {
+      "epoch": 9.729142391051036,
+      "grad_norm": 0.34881582856178284,
+      "learning_rate": 0.0004835359953366365,
+      "loss": 3.3999,
+      "step": 33400
+    },
+    {
+      "epoch": 9.743707760428805,
+      "grad_norm": 0.33368274569511414,
+      "learning_rate": 0.00048336111920722815,
+      "loss": 3.4048,
+      "step": 33450
+    },
+    {
+      "epoch": 9.758273129806572,
+      "grad_norm": 0.3425474166870117,
+      "learning_rate": 0.0004831862430778198,
+      "loss": 3.4066,
+      "step": 33500
+    },
+    {
+      "epoch": 9.772838499184338,
+      "grad_norm": 0.35753095149993896,
+      "learning_rate": 0.00048301136694841153,
+      "loss": 3.409,
+      "step": 33550
+    },
+    {
+      "epoch": 9.787403868562107,
+      "grad_norm": 0.37741026282310486,
+      "learning_rate": 0.00048283649081900317,
+      "loss": 3.4035,
+      "step": 33600
+    },
+    {
+      "epoch": 9.801969237939874,
+      "grad_norm": 0.33492302894592285,
+      "learning_rate": 0.00048266161468959486,
+      "loss": 3.4129,
+      "step": 33650
+    },
+    {
+      "epoch": 9.816534607317642,
+      "grad_norm": 0.3427729308605194,
+      "learning_rate": 0.0004824867385601865,
+      "loss": 3.4104,
+      "step": 33700
+    },
+    {
+      "epoch": 9.831099976695409,
+      "grad_norm": 0.32642075419425964,
+      "learning_rate": 0.00048231186243077813,
+      "loss": 3.4067,
+      "step": 33750
+    },
+    {
+      "epoch": 9.845665346073176,
+      "grad_norm": 0.34097906947135925,
+      "learning_rate": 0.0004821369863013698,
+      "loss": 3.4071,
+      "step": 33800
+    },
+    {
+      "epoch": 9.860230715450944,
+      "grad_norm": 0.3331773579120636,
+      "learning_rate": 0.00048196211017196146,
+      "loss": 3.3962,
+      "step": 33850
+    },
+    {
+      "epoch": 9.874796084828711,
+      "grad_norm": 0.33215972781181335,
+      "learning_rate": 0.00048178723404255315,
+      "loss": 3.4052,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88936145420648,
+      "grad_norm": 0.3272652328014374,
+      "learning_rate": 0.0004816123579131448,
+      "loss": 3.4032,
+      "step": 33950
+    },
+    {
+      "epoch": 9.903926823584246,
+      "grad_norm": 0.31748270988464355,
+      "learning_rate": 0.0004814374817837364,
+      "loss": 3.4104,
+      "step": 34000
+    },
+    {
+      "epoch": 9.903926823584246,
+      "eval_accuracy": 0.36873904765039955,
+      "eval_loss": 3.559356212615967,
+      "eval_runtime": 182.8824,
+      "eval_samples_per_second": 91.015,
+      "eval_steps_per_second": 5.692,
+      "step": 34000
+    },
+    {
+      "epoch": 9.918492192962013,
+      "grad_norm": 0.33125990629196167,
+      "learning_rate": 0.00048126260565432816,
+      "loss": 3.4106,
+      "step": 34050
+    },
+    {
+      "epoch": 9.933057562339782,
+      "grad_norm": 0.32902291417121887,
+      "learning_rate": 0.0004810877295249198,
+      "loss": 3.4107,
+      "step": 34100
+    },
+    {
+      "epoch": 9.947622931717548,
+      "grad_norm": 0.3390238881111145,
+      "learning_rate": 0.0004809128533955115,
+      "loss": 3.4169,
+      "step": 34150
+    },
+    {
+      "epoch": 9.962188301095315,
+      "grad_norm": 0.3341410458087921,
+      "learning_rate": 0.0004807379772661031,
+      "loss": 3.4056,
+      "step": 34200
+    },
+    {
+      "epoch": 9.976753670473084,
+      "grad_norm": 0.3485892415046692,
+      "learning_rate": 0.0004805631011366948,
+      "loss": 3.4186,
+      "step": 34250
+    },
+    {
+      "epoch": 9.99131903985085,
+      "grad_norm": 0.3646714687347412,
+      "learning_rate": 0.00048038822500728645,
+      "loss": 3.4096,
+      "step": 34300
+    },
+    {
+      "epoch": 10.005826147751106,
+      "grad_norm": 0.35494521260261536,
+      "learning_rate": 0.0004802133488778781,
+      "loss": 3.367,
+      "step": 34350
+    },
+    {
+      "epoch": 10.020391517128875,
+      "grad_norm": 0.37137651443481445,
+      "learning_rate": 0.0004800384727484698,
+      "loss": 3.2986,
+      "step": 34400
+    },
+    {
+      "epoch": 10.034956886506642,
+      "grad_norm": 0.34275004267692566,
+      "learning_rate": 0.0004798635966190614,
+      "loss": 3.3048,
+      "step": 34450
+    },
+    {
+      "epoch": 10.049522255884408,
+      "grad_norm": 0.3744957447052002,
+      "learning_rate": 0.00047968872048965316,
+      "loss": 3.3039,
+      "step": 34500
+    },
+    {
+      "epoch": 10.064087625262177,
+      "grad_norm": 0.3762577772140503,
+      "learning_rate": 0.0004795138443602448,
+      "loss": 3.3204,
+      "step": 34550
+    },
+    {
+      "epoch": 10.078652994639944,
+      "grad_norm": 0.3589562475681305,
+      "learning_rate": 0.00047933896823083643,
+      "loss": 3.3209,
+      "step": 34600
+    },
+    {
+      "epoch": 10.093218364017712,
+      "grad_norm": 0.3450121581554413,
+      "learning_rate": 0.0004791640921014281,
+      "loss": 3.3213,
+      "step": 34650
+    },
+    {
+      "epoch": 10.107783733395479,
+      "grad_norm": 0.35901615023612976,
+      "learning_rate": 0.00047898921597201976,
+      "loss": 3.3242,
+      "step": 34700
+    },
+    {
+      "epoch": 10.122349102773246,
+      "grad_norm": 0.3458753228187561,
+      "learning_rate": 0.00047881433984261145,
+      "loss": 3.3272,
+      "step": 34750
+    },
+    {
+      "epoch": 10.136914472151014,
+      "grad_norm": 0.3470238149166107,
+      "learning_rate": 0.0004786394637132031,
+      "loss": 3.3357,
+      "step": 34800
+    },
+    {
+      "epoch": 10.151479841528781,
+      "grad_norm": 0.3423948585987091,
+      "learning_rate": 0.0004784645875837948,
+      "loss": 3.3348,
+      "step": 34850
+    },
+    {
+      "epoch": 10.166045210906548,
+      "grad_norm": 0.36468738317489624,
+      "learning_rate": 0.0004782897114543864,
+      "loss": 3.334,
+      "step": 34900
+    },
+    {
+      "epoch": 10.180610580284316,
+      "grad_norm": 0.36260437965393066,
+      "learning_rate": 0.00047811483532497805,
+      "loss": 3.3367,
+      "step": 34950
+    },
+    {
+      "epoch": 10.195175949662083,
+      "grad_norm": 0.34456831216812134,
+      "learning_rate": 0.0004779399591955698,
+      "loss": 3.3356,
+      "step": 35000
+    },
+    {
+      "epoch": 10.195175949662083,
+      "eval_accuracy": 0.36854717394844216,
+      "eval_loss": 3.571223497390747,
+      "eval_runtime": 182.8992,
+      "eval_samples_per_second": 91.006,
+      "eval_steps_per_second": 5.692,
+      "step": 35000
+    },
+    {
+      "epoch": 10.209741319039852,
+      "grad_norm": 0.397850900888443,
+      "learning_rate": 0.00047776508306616143,
+      "loss": 3.3288,
+      "step": 35050
+    },
+    {
+      "epoch": 10.224306688417618,
+      "grad_norm": 0.3417603671550751,
+      "learning_rate": 0.0004775902069367531,
+      "loss": 3.3384,
+      "step": 35100
+    },
+    {
+      "epoch": 10.238872057795385,
+      "grad_norm": 0.37064996361732483,
+      "learning_rate": 0.00047741533080734476,
+      "loss": 3.3495,
+      "step": 35150
+    },
+    {
+      "epoch": 10.253437427173154,
+      "grad_norm": 0.3486710488796234,
+      "learning_rate": 0.0004772404546779364,
+      "loss": 3.3439,
+      "step": 35200
+    },
+    {
+      "epoch": 10.26800279655092,
+      "grad_norm": 0.35850661993026733,
+      "learning_rate": 0.0004770655785485281,
+      "loss": 3.3368,
+      "step": 35250
+    },
+    {
+      "epoch": 10.282568165928687,
+      "grad_norm": 0.3511759638786316,
+      "learning_rate": 0.0004768907024191197,
+      "loss": 3.3436,
+      "step": 35300
+    },
+    {
+      "epoch": 10.297133535306456,
+      "grad_norm": 0.34070059657096863,
+      "learning_rate": 0.0004767158262897114,
+      "loss": 3.3356,
+      "step": 35350
+    },
+    {
+      "epoch": 10.311698904684222,
+      "grad_norm": 0.3479948937892914,
+      "learning_rate": 0.00047654095016030305,
+      "loss": 3.3556,
+      "step": 35400
+    },
+    {
+      "epoch": 10.326264274061991,
+      "grad_norm": 0.33916714787483215,
+      "learning_rate": 0.0004763660740308948,
+      "loss": 3.368,
+      "step": 35450
+    },
+    {
+      "epoch": 10.340829643439758,
+      "grad_norm": 0.3525298833847046,
+      "learning_rate": 0.0004761911979014864,
+      "loss": 3.3581,
+      "step": 35500
+    },
+    {
+      "epoch": 10.355395012817525,
+      "grad_norm": 0.3512914776802063,
+      "learning_rate": 0.00047601632177207806,
+      "loss": 3.3598,
+      "step": 35550
+    },
+    {
+      "epoch": 10.369960382195293,
+      "grad_norm": 0.3474045991897583,
+      "learning_rate": 0.00047584144564266975,
+      "loss": 3.3461,
+      "step": 35600
+    },
+    {
+      "epoch": 10.38452575157306,
+      "grad_norm": 0.3418622612953186,
+      "learning_rate": 0.0004756665695132614,
+      "loss": 3.3611,
+      "step": 35650
+    },
+    {
+      "epoch": 10.399091120950827,
+      "grad_norm": 0.34689825773239136,
+      "learning_rate": 0.0004754916933838531,
+      "loss": 3.3457,
+      "step": 35700
+    },
+    {
+      "epoch": 10.413656490328595,
+      "grad_norm": 0.32606783509254456,
+      "learning_rate": 0.0004753168172544447,
+      "loss": 3.3576,
+      "step": 35750
+    },
+    {
+      "epoch": 10.428221859706362,
+      "grad_norm": 0.3652861714363098,
+      "learning_rate": 0.00047514194112503635,
+      "loss": 3.3603,
+      "step": 35800
+    },
+    {
+      "epoch": 10.44278722908413,
+      "grad_norm": 0.3363873362541199,
+      "learning_rate": 0.00047496706499562804,
+      "loss": 3.356,
+      "step": 35850
+    },
+    {
+      "epoch": 10.457352598461897,
+      "grad_norm": 0.3396660387516022,
+      "learning_rate": 0.0004747921888662197,
+      "loss": 3.3657,
+      "step": 35900
+    },
+    {
+      "epoch": 10.471917967839664,
+      "grad_norm": 0.3386980891227722,
+      "learning_rate": 0.0004746173127368114,
+      "loss": 3.3529,
+      "step": 35950
+    },
+    {
+      "epoch": 10.486483337217432,
+      "grad_norm": 0.35521814227104187,
+      "learning_rate": 0.00047444243660740306,
+      "loss": 3.3582,
+      "step": 36000
+    },
+    {
+      "epoch": 10.486483337217432,
+      "eval_accuracy": 0.36882287482533554,
+      "eval_loss": 3.565991163253784,
+      "eval_runtime": 182.8426,
+      "eval_samples_per_second": 91.035,
+      "eval_steps_per_second": 5.693,
+      "step": 36000
+    },
+    {
+      "epoch": 10.5010487065952,
+      "grad_norm": 0.36773911118507385,
+      "learning_rate": 0.0004742675604779947,
+      "loss": 3.3744,
+      "step": 36050
+    },
+    {
+      "epoch": 10.515614075972966,
+      "grad_norm": 0.3470185101032257,
+      "learning_rate": 0.0004740926843485864,
+      "loss": 3.3555,
+      "step": 36100
+    },
+    {
+      "epoch": 10.530179445350734,
+      "grad_norm": 0.3499913513660431,
+      "learning_rate": 0.000473917808219178,
+      "loss": 3.3665,
+      "step": 36150
+    },
+    {
+      "epoch": 10.544744814728501,
+      "grad_norm": 0.32940107583999634,
+      "learning_rate": 0.0004737429320897697,
+      "loss": 3.3777,
+      "step": 36200
+    },
+    {
+      "epoch": 10.55931018410627,
+      "grad_norm": 0.3400062620639801,
+      "learning_rate": 0.00047356805596036135,
+      "loss": 3.3633,
+      "step": 36250
+    },
+    {
+      "epoch": 10.573875553484037,
+      "grad_norm": 0.35213765501976013,
+      "learning_rate": 0.00047339317983095304,
+      "loss": 3.3725,
+      "step": 36300
+    },
+    {
+      "epoch": 10.588440922861803,
+      "grad_norm": 0.3383064270019531,
+      "learning_rate": 0.0004732183037015447,
+      "loss": 3.3674,
+      "step": 36350
+    },
+    {
+      "epoch": 10.603006292239572,
+      "grad_norm": 0.3423946797847748,
+      "learning_rate": 0.0004730434275721363,
+      "loss": 3.3744,
+      "step": 36400
+    },
+    {
+      "epoch": 10.617571661617339,
+      "grad_norm": 0.32633453607559204,
+      "learning_rate": 0.00047286855144272806,
+      "loss": 3.3741,
+      "step": 36450
+    },
+    {
+      "epoch": 10.632137030995105,
+      "grad_norm": 0.3443063199520111,
+      "learning_rate": 0.0004726936753133197,
+      "loss": 3.3679,
+      "step": 36500
+    },
+    {
+      "epoch": 10.646702400372874,
+      "grad_norm": 0.36050254106521606,
+      "learning_rate": 0.0004725187991839114,
+      "loss": 3.3732,
+      "step": 36550
+    },
+    {
+      "epoch": 10.66126776975064,
+      "grad_norm": 0.35534441471099854,
+      "learning_rate": 0.000472343923054503,
+      "loss": 3.3749,
+      "step": 36600
+    },
+    {
+      "epoch": 10.675833139128407,
+      "grad_norm": 0.369785338640213,
+      "learning_rate": 0.00047216904692509465,
+      "loss": 3.3807,
+      "step": 36650
+    },
+    {
+      "epoch": 10.690398508506176,
+      "grad_norm": 0.35582253336906433,
+      "learning_rate": 0.00047199417079568634,
+      "loss": 3.3754,
+      "step": 36700
+    },
+    {
+      "epoch": 10.704963877883943,
+      "grad_norm": 0.36503931879997253,
+      "learning_rate": 0.000471819294666278,
+      "loss": 3.373,
+      "step": 36750
+    },
+    {
+      "epoch": 10.719529247261711,
+      "grad_norm": 0.33128494024276733,
+      "learning_rate": 0.00047164441853686967,
+      "loss": 3.3887,
+      "step": 36800
+    },
+    {
+      "epoch": 10.734094616639478,
+      "grad_norm": 0.32897520065307617,
+      "learning_rate": 0.0004714695424074613,
+      "loss": 3.3912,
+      "step": 36850
+    },
+    {
+      "epoch": 10.748659986017245,
+      "grad_norm": 0.3286604583263397,
+      "learning_rate": 0.00047129466627805305,
+      "loss": 3.3706,
+      "step": 36900
+    },
+    {
+      "epoch": 10.763225355395013,
+      "grad_norm": 0.3394903242588043,
+      "learning_rate": 0.0004711197901486447,
+      "loss": 3.3752,
+      "step": 36950
+    },
+    {
+      "epoch": 10.77779072477278,
+      "grad_norm": 0.3878060579299927,
+      "learning_rate": 0.0004709449140192363,
+      "loss": 3.3867,
+      "step": 37000
+    },
+    {
+      "epoch": 10.77779072477278,
+      "eval_accuracy": 0.3696466855052468,
+      "eval_loss": 3.55853533744812,
+      "eval_runtime": 182.8784,
+      "eval_samples_per_second": 91.017,
+      "eval_steps_per_second": 5.692,
+      "step": 37000
+    },
+    {
+      "epoch": 10.792356094150549,
+      "grad_norm": 0.3377073407173157,
+      "learning_rate": 0.000470770037889828,
+      "loss": 3.3732,
+      "step": 37050
+    },
+    {
+      "epoch": 10.806921463528315,
+      "grad_norm": 0.3298020660877228,
+      "learning_rate": 0.00047059516176041965,
+      "loss": 3.3806,
+      "step": 37100
+    },
+    {
+      "epoch": 10.821486832906082,
+      "grad_norm": 0.33295738697052,
+      "learning_rate": 0.00047042028563101134,
+      "loss": 3.389,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83605220228385,
+      "grad_norm": 0.332865446805954,
+      "learning_rate": 0.000470245409501603,
+      "loss": 3.3811,
+      "step": 37200
+    },
+    {
+      "epoch": 10.850617571661617,
+      "grad_norm": 0.3256121575832367,
+      "learning_rate": 0.0004700705333721946,
+      "loss": 3.3895,
+      "step": 37250
+    },
+    {
+      "epoch": 10.865182941039384,
+      "grad_norm": 0.3128420114517212,
+      "learning_rate": 0.0004698956572427863,
+      "loss": 3.3926,
+      "step": 37300
+    },
+    {
+      "epoch": 10.879748310417153,
+      "grad_norm": 0.3602403402328491,
+      "learning_rate": 0.00046972078111337794,
+      "loss": 3.3802,
+      "step": 37350
+    },
+    {
+      "epoch": 10.89431367979492,
+      "grad_norm": 0.3453914523124695,
+      "learning_rate": 0.0004695459049839697,
+      "loss": 3.3903,
+      "step": 37400
+    },
+    {
+      "epoch": 10.908879049172686,
+      "grad_norm": 0.3409225046634674,
+      "learning_rate": 0.0004693710288545613,
+      "loss": 3.386,
+      "step": 37450
+    },
+    {
+      "epoch": 10.923444418550455,
+      "grad_norm": 0.33663007616996765,
+      "learning_rate": 0.000469196152725153,
+      "loss": 3.3895,
+      "step": 37500
+    },
+    {
+      "epoch": 10.938009787928221,
+      "grad_norm": 0.3527715504169464,
+      "learning_rate": 0.00046902127659574465,
+      "loss": 3.3847,
+      "step": 37550
+    },
+    {
+      "epoch": 10.95257515730599,
+      "grad_norm": 0.33517366647720337,
+      "learning_rate": 0.0004688464004663363,
+      "loss": 3.3846,
+      "step": 37600
+    },
+    {
+      "epoch": 10.967140526683757,
+      "grad_norm": 0.3335331082344055,
+      "learning_rate": 0.000468671524336928,
+      "loss": 3.3903,
+      "step": 37650
+    },
+    {
+      "epoch": 10.981705896061523,
+      "grad_norm": 0.36070242524147034,
+      "learning_rate": 0.0004684966482075196,
+      "loss": 3.3784,
+      "step": 37700
+    },
+    {
+      "epoch": 10.996271265439292,
+      "grad_norm": 0.359713077545166,
+      "learning_rate": 0.0004683217720781113,
+      "loss": 3.3875,
+      "step": 37750
+    },
+    {
+      "epoch": 11.010778373339548,
+      "grad_norm": 0.3132033050060272,
+      "learning_rate": 0.00046814689594870294,
+      "loss": 3.2974,
+      "step": 37800
+    },
+    {
+      "epoch": 11.025343742717315,
+      "grad_norm": 0.3545316755771637,
+      "learning_rate": 0.0004679720198192946,
+      "loss": 3.2811,
+      "step": 37850
+    },
+    {
+      "epoch": 11.039909112095083,
+      "grad_norm": 0.3404482901096344,
+      "learning_rate": 0.0004677971436898863,
+      "loss": 3.287,
+      "step": 37900
+    },
+    {
+      "epoch": 11.05447448147285,
+      "grad_norm": 0.36774441599845886,
+      "learning_rate": 0.00046762226756047795,
+      "loss": 3.2756,
+      "step": 37950
+    },
+    {
+      "epoch": 11.069039850850617,
+      "grad_norm": 0.3585566580295563,
+      "learning_rate": 0.00046744739143106964,
+      "loss": 3.2826,
+      "step": 38000
+    },
+    {
+      "epoch": 11.069039850850617,
+      "eval_accuracy": 0.36930385234660246,
+      "eval_loss": 3.566277027130127,
+      "eval_runtime": 182.9741,
+      "eval_samples_per_second": 90.969,
+      "eval_steps_per_second": 5.689,
+      "step": 38000
+    },
+    {
+      "epoch": 11.083605220228385,
+      "grad_norm": 0.31277865171432495,
+      "learning_rate": 0.0004672725153016613,
+      "loss": 3.2838,
+      "step": 38050
+    },
+    {
+      "epoch": 11.098170589606152,
+      "grad_norm": 0.3467193841934204,
+      "learning_rate": 0.00046709763917225297,
+      "loss": 3.2891,
+      "step": 38100
+    },
+    {
+      "epoch": 11.11273595898392,
+      "grad_norm": 0.358010470867157,
+      "learning_rate": 0.0004669227630428446,
+      "loss": 3.3082,
+      "step": 38150
+    },
+    {
+      "epoch": 11.127301328361687,
+      "grad_norm": 0.37875378131866455,
+      "learning_rate": 0.00046674788691343624,
+      "loss": 3.3049,
+      "step": 38200
+    },
+    {
+      "epoch": 11.141866697739454,
+      "grad_norm": 0.3386188745498657,
+      "learning_rate": 0.00046657301078402793,
+      "loss": 3.3061,
+      "step": 38250
+    },
+    {
+      "epoch": 11.156432067117223,
+      "grad_norm": 0.3643662929534912,
+      "learning_rate": 0.00046639813465461957,
+      "loss": 3.3008,
+      "step": 38300
+    },
+    {
+      "epoch": 11.17099743649499,
+      "grad_norm": 0.3499019145965576,
+      "learning_rate": 0.0004662232585252113,
+      "loss": 3.3057,
+      "step": 38350
+    },
+    {
+      "epoch": 11.185562805872756,
+      "grad_norm": 0.32544127106666565,
+      "learning_rate": 0.00046604838239580295,
+      "loss": 3.3152,
+      "step": 38400
+    },
+    {
+      "epoch": 11.200128175250525,
+      "grad_norm": 0.3370313048362732,
+      "learning_rate": 0.0004658735062663946,
+      "loss": 3.3052,
+      "step": 38450
+    },
+    {
+      "epoch": 11.214693544628291,
+      "grad_norm": 0.3501981496810913,
+      "learning_rate": 0.0004656986301369863,
+      "loss": 3.3075,
+      "step": 38500
+    },
+    {
+      "epoch": 11.22925891400606,
+      "grad_norm": 0.3423289954662323,
+      "learning_rate": 0.0004655237540075779,
+      "loss": 3.3127,
+      "step": 38550
+    },
+    {
+      "epoch": 11.243824283383827,
+      "grad_norm": 0.3272441625595093,
+      "learning_rate": 0.0004653488778781696,
+      "loss": 3.3347,
+      "step": 38600
+    },
+    {
+      "epoch": 11.258389652761593,
+      "grad_norm": 0.3582228422164917,
+      "learning_rate": 0.00046517400174876124,
+      "loss": 3.3142,
+      "step": 38650
+    },
+    {
+      "epoch": 11.272955022139362,
+      "grad_norm": 0.33023253083229065,
+      "learning_rate": 0.0004649991256193529,
+      "loss": 3.3241,
+      "step": 38700
+    },
+    {
+      "epoch": 11.287520391517129,
+      "grad_norm": 0.342748761177063,
+      "learning_rate": 0.00046482424948994457,
+      "loss": 3.3166,
+      "step": 38750
+    },
+    {
+      "epoch": 11.302085760894895,
+      "grad_norm": 0.34566137194633484,
+      "learning_rate": 0.0004646493733605362,
+      "loss": 3.3259,
+      "step": 38800
+    },
+    {
+      "epoch": 11.316651130272664,
+      "grad_norm": 0.35357779264450073,
+      "learning_rate": 0.00046447449723112795,
+      "loss": 3.3259,
+      "step": 38850
+    },
+    {
+      "epoch": 11.33121649965043,
+      "grad_norm": 0.3932008743286133,
+      "learning_rate": 0.0004642996211017196,
+      "loss": 3.3338,
+      "step": 38900
+    },
+    {
+      "epoch": 11.3457818690282,
+      "grad_norm": 0.3381226062774658,
+      "learning_rate": 0.0004641247449723113,
+      "loss": 3.3374,
+      "step": 38950
+    },
+    {
+      "epoch": 11.360347238405966,
+      "grad_norm": 0.384560763835907,
+      "learning_rate": 0.0004639498688429029,
+      "loss": 3.3467,
+      "step": 39000
+    },
+    {
+      "epoch": 11.360347238405966,
+      "eval_accuracy": 0.36921779134793037,
+      "eval_loss": 3.5652575492858887,
+      "eval_runtime": 182.9963,
+      "eval_samples_per_second": 90.958,
+      "eval_steps_per_second": 5.689,
+      "step": 39000
+    },
+    {
+      "epoch": 11.374912607783733,
+      "grad_norm": 0.3471319377422333,
+      "learning_rate": 0.00046377499271349455,
+      "loss": 3.3447,
+      "step": 39050
+    },
+    {
+      "epoch": 11.389477977161501,
+      "grad_norm": 0.34540602564811707,
+      "learning_rate": 0.00046360011658408624,
+      "loss": 3.3322,
+      "step": 39100
+    },
+    {
+      "epoch": 11.404043346539268,
+      "grad_norm": 0.35746103525161743,
+      "learning_rate": 0.00046342524045467787,
+      "loss": 3.3347,
+      "step": 39150
+    },
+    {
+      "epoch": 11.418608715917035,
+      "grad_norm": 0.35586029291152954,
+      "learning_rate": 0.00046325036432526956,
+      "loss": 3.3522,
+      "step": 39200
+    },
+    {
+      "epoch": 11.433174085294803,
+      "grad_norm": 0.34494882822036743,
+      "learning_rate": 0.0004630754881958612,
+      "loss": 3.3351,
+      "step": 39250
+    },
+    {
+      "epoch": 11.44773945467257,
+      "grad_norm": 0.36397963762283325,
+      "learning_rate": 0.00046290061206645284,
+      "loss": 3.3338,
+      "step": 39300
+    },
+    {
+      "epoch": 11.462304824050339,
+      "grad_norm": 0.3657397925853729,
+      "learning_rate": 0.0004627257359370446,
+      "loss": 3.3472,
+      "step": 39350
+    },
+    {
+      "epoch": 11.476870193428105,
+      "grad_norm": 0.3489623963832855,
+      "learning_rate": 0.0004625508598076362,
+      "loss": 3.3422,
+      "step": 39400
+    },
+    {
+      "epoch": 11.491435562805872,
+      "grad_norm": 0.34582746028900146,
+      "learning_rate": 0.0004623759836782279,
+      "loss": 3.3421,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50600093218364,
+      "grad_norm": 0.3677220046520233,
+      "learning_rate": 0.00046220110754881954,
+      "loss": 3.3496,
+      "step": 39500
+    },
+    {
+      "epoch": 11.520566301561407,
+      "grad_norm": 0.38130640983581543,
+      "learning_rate": 0.00046202623141941123,
+      "loss": 3.3369,
+      "step": 39550
+    },
+    {
+      "epoch": 11.535131670939174,
+      "grad_norm": 0.4037802815437317,
+      "learning_rate": 0.00046185135529000287,
+      "loss": 3.353,
+      "step": 39600
+    },
+    {
+      "epoch": 11.549697040316943,
+      "grad_norm": 0.3462783396244049,
+      "learning_rate": 0.0004616764791605945,
+      "loss": 3.3579,
+      "step": 39650
+    },
+    {
+      "epoch": 11.56426240969471,
+      "grad_norm": 0.35526663064956665,
+      "learning_rate": 0.0004615016030311862,
+      "loss": 3.3736,
+      "step": 39700
+    },
+    {
+      "epoch": 11.578827779072478,
+      "grad_norm": 0.35431742668151855,
+      "learning_rate": 0.00046132672690177783,
+      "loss": 3.3481,
+      "step": 39750
+    },
+    {
+      "epoch": 11.593393148450245,
+      "grad_norm": 0.35547706484794617,
+      "learning_rate": 0.0004611518507723696,
+      "loss": 3.3512,
+      "step": 39800
+    },
+    {
+      "epoch": 11.607958517828012,
+      "grad_norm": 0.33214691281318665,
+      "learning_rate": 0.0004609769746429612,
+      "loss": 3.3563,
+      "step": 39850
+    },
+    {
+      "epoch": 11.62252388720578,
+      "grad_norm": 0.3578963875770569,
+      "learning_rate": 0.00046080209851355285,
+      "loss": 3.3504,
+      "step": 39900
+    },
+    {
+      "epoch": 11.637089256583547,
+      "grad_norm": 0.3476736545562744,
+      "learning_rate": 0.00046062722238414454,
+      "loss": 3.3642,
+      "step": 39950
+    },
+    {
+      "epoch": 11.651654625961314,
+      "grad_norm": 0.34914782643318176,
+      "learning_rate": 0.0004604523462547362,
+      "loss": 3.3574,
+      "step": 40000
+    },
+    {
+      "epoch": 11.651654625961314,
+      "eval_accuracy": 0.36975038195446647,
+      "eval_loss": 3.5585873126983643,
+      "eval_runtime": 182.9354,
+      "eval_samples_per_second": 90.988,
+      "eval_steps_per_second": 5.691,
+      "step": 40000
+    },
+    {
+      "epoch": 11.666219995339082,
+      "grad_norm": 0.3997795581817627,
+      "learning_rate": 0.00046027747012532787,
+      "loss": 3.3593,
+      "step": 40050
+    },
+    {
+      "epoch": 11.680785364716849,
+      "grad_norm": 0.34242531657218933,
+      "learning_rate": 0.0004601025939959195,
+      "loss": 3.351,
+      "step": 40100
+    },
+    {
+      "epoch": 11.695350734094617,
+      "grad_norm": 0.35713550448417664,
+      "learning_rate": 0.0004599277178665112,
+      "loss": 3.3586,
+      "step": 40150
+    },
+    {
+      "epoch": 11.709916103472384,
+      "grad_norm": 0.37306341528892517,
+      "learning_rate": 0.00045975284173710283,
+      "loss": 3.3586,
+      "step": 40200
+    },
+    {
+      "epoch": 11.724481472850151,
+      "grad_norm": 0.3383656144142151,
+      "learning_rate": 0.00045957796560769446,
+      "loss": 3.3614,
+      "step": 40250
+    },
+    {
+      "epoch": 11.73904684222792,
+      "grad_norm": 0.35948076844215393,
+      "learning_rate": 0.0004594030894782862,
+      "loss": 3.3551,
+      "step": 40300
+    },
+    {
+      "epoch": 11.753612211605686,
+      "grad_norm": 0.360495388507843,
+      "learning_rate": 0.00045922821334887785,
+      "loss": 3.3622,
+      "step": 40350
+    },
+    {
+      "epoch": 11.768177580983453,
+      "grad_norm": 0.34619754552841187,
+      "learning_rate": 0.00045905333721946954,
+      "loss": 3.3622,
+      "step": 40400
+    },
+    {
+      "epoch": 11.782742950361222,
+      "grad_norm": 0.34578973054885864,
+      "learning_rate": 0.00045887846109006117,
+      "loss": 3.3726,
+      "step": 40450
+    },
+    {
+      "epoch": 11.797308319738988,
+      "grad_norm": 0.3434268534183502,
+      "learning_rate": 0.0004587035849606528,
+      "loss": 3.3633,
+      "step": 40500
+    },
+    {
+      "epoch": 11.811873689116755,
+      "grad_norm": 0.35702431201934814,
+      "learning_rate": 0.0004585287088312445,
+      "loss": 3.3524,
+      "step": 40550
+    },
+    {
+      "epoch": 11.826439058494524,
+      "grad_norm": 0.34746211767196655,
+      "learning_rate": 0.00045835383270183613,
+      "loss": 3.377,
+      "step": 40600
+    },
+    {
+      "epoch": 11.84100442787229,
+      "grad_norm": 0.3523807227611542,
+      "learning_rate": 0.0004581789565724278,
+      "loss": 3.3717,
+      "step": 40650
+    },
+    {
+      "epoch": 11.855569797250059,
+      "grad_norm": 0.33209025859832764,
+      "learning_rate": 0.00045800408044301946,
+      "loss": 3.361,
+      "step": 40700
+    },
+    {
+      "epoch": 11.870135166627826,
+      "grad_norm": 0.3397273123264313,
+      "learning_rate": 0.0004578292043136111,
+      "loss": 3.361,
+      "step": 40750
+    },
+    {
+      "epoch": 11.884700536005592,
+      "grad_norm": 0.335334450006485,
+      "learning_rate": 0.00045765432818420284,
+      "loss": 3.3646,
+      "step": 40800
+    },
+    {
+      "epoch": 11.899265905383361,
+      "grad_norm": 0.34581536054611206,
+      "learning_rate": 0.0004574794520547945,
+      "loss": 3.3717,
+      "step": 40850
+    },
+    {
+      "epoch": 11.913831274761128,
+      "grad_norm": 0.36878204345703125,
+      "learning_rate": 0.00045730457592538617,
+      "loss": 3.3697,
+      "step": 40900
+    },
+    {
+      "epoch": 11.928396644138896,
+      "grad_norm": 0.33850565552711487,
+      "learning_rate": 0.0004571296997959778,
+      "loss": 3.3665,
+      "step": 40950
+    },
+    {
+      "epoch": 11.942962013516663,
+      "grad_norm": 0.3630038797855377,
+      "learning_rate": 0.0004569548236665695,
+      "loss": 3.3856,
+      "step": 41000
+    },
+    {
+      "epoch": 11.942962013516663,
+      "eval_accuracy": 0.37081274149545096,
+      "eval_loss": 3.5459952354431152,
+      "eval_runtime": 183.1108,
+      "eval_samples_per_second": 90.901,
+      "eval_steps_per_second": 5.685,
+      "step": 41000
+    },
+    {
+      "epoch": 11.95752738289443,
+      "grad_norm": 0.33172306418418884,
+      "learning_rate": 0.00045677994753716113,
+      "loss": 3.3737,
+      "step": 41050
+    },
+    {
+      "epoch": 11.972092752272198,
+      "grad_norm": 0.34910187125205994,
+      "learning_rate": 0.00045660507140775277,
+      "loss": 3.3773,
+      "step": 41100
+    },
+    {
+      "epoch": 11.986658121649965,
+      "grad_norm": 0.3395317494869232,
+      "learning_rate": 0.00045643019527834446,
+      "loss": 3.3751,
+      "step": 41150
+    },
+    {
+      "epoch": 12.001165229550221,
+      "grad_norm": 0.34665730595588684,
+      "learning_rate": 0.0004562553191489361,
+      "loss": 3.3681,
+      "step": 41200
+    },
+    {
+      "epoch": 12.01573059892799,
+      "grad_norm": 0.3783624768257141,
+      "learning_rate": 0.00045608044301952784,
+      "loss": 3.2697,
+      "step": 41250
+    },
+    {
+      "epoch": 12.030295968305756,
+      "grad_norm": 0.3724413514137268,
+      "learning_rate": 0.0004559055668901195,
+      "loss": 3.2685,
+      "step": 41300
+    },
+    {
+      "epoch": 12.044861337683523,
+      "grad_norm": 0.34241390228271484,
+      "learning_rate": 0.0004557306907607111,
+      "loss": 3.2785,
+      "step": 41350
+    },
+    {
+      "epoch": 12.059426707061291,
+      "grad_norm": 0.3715740442276001,
+      "learning_rate": 0.0004555558146313028,
+      "loss": 3.2753,
+      "step": 41400
+    },
+    {
+      "epoch": 12.073992076439058,
+      "grad_norm": 0.3630043864250183,
+      "learning_rate": 0.00045538093850189444,
+      "loss": 3.2737,
+      "step": 41450
+    },
+    {
+      "epoch": 12.088557445816827,
+      "grad_norm": 0.3790743947029114,
+      "learning_rate": 0.00045520606237248613,
+      "loss": 3.2885,
+      "step": 41500
+    },
+    {
+      "epoch": 12.103122815194594,
+      "grad_norm": 0.37035778164863586,
+      "learning_rate": 0.00045503118624307776,
+      "loss": 3.2849,
+      "step": 41550
+    },
+    {
+      "epoch": 12.11768818457236,
+      "grad_norm": 0.3581169545650482,
+      "learning_rate": 0.00045485631011366945,
+      "loss": 3.2934,
+      "step": 41600
+    },
+    {
+      "epoch": 12.132253553950129,
+      "grad_norm": 0.367798775434494,
+      "learning_rate": 0.0004546814339842611,
+      "loss": 3.2741,
+      "step": 41650
+    },
+    {
+      "epoch": 12.146818923327896,
+      "grad_norm": 0.3496311902999878,
+      "learning_rate": 0.0004545065578548527,
+      "loss": 3.2908,
+      "step": 41700
+    },
+    {
+      "epoch": 12.161384292705662,
+      "grad_norm": 0.3402174413204193,
+      "learning_rate": 0.00045433168172544447,
+      "loss": 3.2904,
+      "step": 41750
+    },
+    {
+      "epoch": 12.17594966208343,
+      "grad_norm": 0.3760812282562256,
+      "learning_rate": 0.0004541568055960361,
+      "loss": 3.2939,
+      "step": 41800
+    },
+    {
+      "epoch": 12.190515031461198,
+      "grad_norm": 0.36442825198173523,
+      "learning_rate": 0.0004539819294666278,
+      "loss": 3.2946,
+      "step": 41850
+    },
+    {
+      "epoch": 12.205080400838966,
+      "grad_norm": 0.39214181900024414,
+      "learning_rate": 0.00045380705333721943,
+      "loss": 3.3044,
+      "step": 41900
+    },
+    {
+      "epoch": 12.219645770216733,
+      "grad_norm": 0.3448289930820465,
+      "learning_rate": 0.00045363217720781107,
+      "loss": 3.2939,
+      "step": 41950
+    },
+    {
+      "epoch": 12.2342111395945,
+      "grad_norm": 0.3863697946071625,
+      "learning_rate": 0.00045345730107840276,
+      "loss": 3.3073,
+      "step": 42000
+    },
+    {
+      "epoch": 12.2342111395945,
+      "eval_accuracy": 0.36998775511883647,
+      "eval_loss": 3.563772439956665,
+      "eval_runtime": 183.0531,
+      "eval_samples_per_second": 90.93,
+      "eval_steps_per_second": 5.687,
+      "step": 42000
+    },
+    {
+      "epoch": 12.248776508972268,
+      "grad_norm": 0.3787136375904083,
+      "learning_rate": 0.0004532824249489944,
+      "loss": 3.3172,
+      "step": 42050
+    },
+    {
+      "epoch": 12.263341878350035,
+      "grad_norm": 0.3432846665382385,
+      "learning_rate": 0.0004531075488195861,
+      "loss": 3.3017,
+      "step": 42100
+    },
+    {
+      "epoch": 12.277907247727802,
+      "grad_norm": 0.3608255982398987,
+      "learning_rate": 0.0004529326726901777,
+      "loss": 3.306,
+      "step": 42150
+    },
+    {
+      "epoch": 12.29247261710557,
+      "grad_norm": 0.3482573628425598,
+      "learning_rate": 0.00045275779656076947,
+      "loss": 3.3011,
+      "step": 42200
+    },
+    {
+      "epoch": 12.307037986483337,
+      "grad_norm": 0.35142436623573303,
+      "learning_rate": 0.0004525829204313611,
+      "loss": 3.3088,
+      "step": 42250
+    },
+    {
+      "epoch": 12.321603355861104,
+      "grad_norm": 0.37184178829193115,
+      "learning_rate": 0.00045240804430195274,
+      "loss": 3.3109,
+      "step": 42300
+    },
+    {
+      "epoch": 12.336168725238872,
+      "grad_norm": 0.3423960208892822,
+      "learning_rate": 0.00045223316817254443,
+      "loss": 3.3097,
+      "step": 42350
+    },
+    {
+      "epoch": 12.350734094616639,
+      "grad_norm": 0.399601548910141,
+      "learning_rate": 0.00045205829204313607,
+      "loss": 3.3125,
+      "step": 42400
+    },
+    {
+      "epoch": 12.365299463994408,
+      "grad_norm": 0.3392581641674042,
+      "learning_rate": 0.00045188341591372776,
+      "loss": 3.3176,
+      "step": 42450
+    },
+    {
+      "epoch": 12.379864833372174,
+      "grad_norm": 0.3549819886684418,
+      "learning_rate": 0.0004517085397843194,
+      "loss": 3.3176,
+      "step": 42500
+    },
+    {
+      "epoch": 12.394430202749941,
+      "grad_norm": 0.37998539209365845,
+      "learning_rate": 0.00045153366365491103,
+      "loss": 3.3305,
+      "step": 42550
+    },
+    {
+      "epoch": 12.40899557212771,
+      "grad_norm": 0.3557809293270111,
+      "learning_rate": 0.0004513587875255027,
+      "loss": 3.3238,
+      "step": 42600
+    },
+    {
+      "epoch": 12.423560941505476,
+      "grad_norm": 0.3527306914329529,
+      "learning_rate": 0.00045118391139609436,
+      "loss": 3.3299,
+      "step": 42650
+    },
+    {
+      "epoch": 12.438126310883243,
+      "grad_norm": 0.3596922755241394,
+      "learning_rate": 0.0004510090352666861,
+      "loss": 3.3251,
+      "step": 42700
+    },
+    {
+      "epoch": 12.452691680261012,
+      "grad_norm": 0.3422442674636841,
+      "learning_rate": 0.00045083415913727774,
+      "loss": 3.321,
+      "step": 42750
+    },
+    {
+      "epoch": 12.467257049638778,
+      "grad_norm": 0.4016224443912506,
+      "learning_rate": 0.0004506592830078694,
+      "loss": 3.3283,
+      "step": 42800
+    },
+    {
+      "epoch": 12.481822419016547,
+      "grad_norm": 0.34260863065719604,
+      "learning_rate": 0.00045048440687846106,
+      "loss": 3.3187,
+      "step": 42850
+    },
+    {
+      "epoch": 12.496387788394314,
+      "grad_norm": 0.343815416097641,
+      "learning_rate": 0.0004503095307490527,
+      "loss": 3.3066,
+      "step": 42900
+    },
+    {
+      "epoch": 12.51095315777208,
+      "grad_norm": 0.37220168113708496,
+      "learning_rate": 0.0004501346546196444,
+      "loss": 3.3269,
+      "step": 42950
+    },
+    {
+      "epoch": 12.525518527149849,
+      "grad_norm": 0.3391141891479492,
+      "learning_rate": 0.000449959778490236,
+      "loss": 3.338,
+      "step": 43000
+    },
+    {
+      "epoch": 12.525518527149849,
+      "eval_accuracy": 0.37058042382690454,
+      "eval_loss": 3.5549614429473877,
+      "eval_runtime": 182.7229,
+      "eval_samples_per_second": 91.094,
+      "eval_steps_per_second": 5.697,
+      "step": 43000
+    },
+    {
+      "epoch": 12.540083896527616,
+      "grad_norm": 0.37905097007751465,
+      "learning_rate": 0.0004497849023608277,
+      "loss": 3.3293,
+      "step": 43050
+    },
+    {
+      "epoch": 12.554649265905383,
+      "grad_norm": 0.34309253096580505,
+      "learning_rate": 0.00044961002623141935,
+      "loss": 3.3142,
+      "step": 43100
+    },
+    {
+      "epoch": 12.569214635283151,
+      "grad_norm": 0.35698094964027405,
+      "learning_rate": 0.000449435150102011,
+      "loss": 3.3401,
+      "step": 43150
+    },
+    {
+      "epoch": 12.583780004660918,
+      "grad_norm": 0.3387773036956787,
+      "learning_rate": 0.00044926027397260273,
+      "loss": 3.3299,
+      "step": 43200
+    },
+    {
+      "epoch": 12.598345374038686,
+      "grad_norm": 0.37266799807548523,
+      "learning_rate": 0.00044908539784319437,
+      "loss": 3.3354,
+      "step": 43250
+    },
+    {
+      "epoch": 12.612910743416453,
+      "grad_norm": 0.3330574333667755,
+      "learning_rate": 0.00044891052171378606,
+      "loss": 3.3432,
+      "step": 43300
+    },
+    {
+      "epoch": 12.62747611279422,
+      "grad_norm": 0.34195274114608765,
+      "learning_rate": 0.0004487356455843777,
+      "loss": 3.341,
+      "step": 43350
+    },
+    {
+      "epoch": 12.642041482171988,
+      "grad_norm": 0.36545178294181824,
+      "learning_rate": 0.00044856076945496933,
+      "loss": 3.3416,
+      "step": 43400
+    },
+    {
+      "epoch": 12.656606851549755,
+      "grad_norm": 0.3684341311454773,
+      "learning_rate": 0.000448385893325561,
+      "loss": 3.3324,
+      "step": 43450
+    },
+    {
+      "epoch": 12.671172220927522,
+      "grad_norm": 0.34405165910720825,
+      "learning_rate": 0.00044821101719615266,
+      "loss": 3.3484,
+      "step": 43500
+    },
+    {
+      "epoch": 12.68573759030529,
+      "grad_norm": 0.34472227096557617,
+      "learning_rate": 0.00044803614106674435,
+      "loss": 3.3525,
+      "step": 43550
+    },
+    {
+      "epoch": 12.700302959683057,
+      "grad_norm": 0.39018353819847107,
+      "learning_rate": 0.000447861264937336,
+      "loss": 3.3332,
+      "step": 43600
+    },
+    {
+      "epoch": 12.714868329060826,
+      "grad_norm": 0.3903914988040924,
+      "learning_rate": 0.00044768638880792773,
+      "loss": 3.3509,
+      "step": 43650
+    },
+    {
+      "epoch": 12.729433698438593,
+      "grad_norm": 0.36807510256767273,
+      "learning_rate": 0.00044751151267851937,
+      "loss": 3.3308,
+      "step": 43700
+    },
+    {
+      "epoch": 12.74399906781636,
+      "grad_norm": 0.3552221953868866,
+      "learning_rate": 0.000447336636549111,
+      "loss": 3.3335,
+      "step": 43750
+    },
+    {
+      "epoch": 12.758564437194128,
+      "grad_norm": 0.36349716782569885,
+      "learning_rate": 0.0004471617604197027,
+      "loss": 3.3353,
+      "step": 43800
+    },
+    {
+      "epoch": 12.773129806571895,
+      "grad_norm": 0.4054705500602722,
+      "learning_rate": 0.00044698688429029433,
+      "loss": 3.3506,
+      "step": 43850
+    },
+    {
+      "epoch": 12.787695175949661,
+      "grad_norm": 0.3815021812915802,
+      "learning_rate": 0.000446812008160886,
+      "loss": 3.3482,
+      "step": 43900
+    },
+    {
+      "epoch": 12.80226054532743,
+      "grad_norm": 0.3345995843410492,
+      "learning_rate": 0.00044663713203147766,
+      "loss": 3.3389,
+      "step": 43950
+    },
+    {
+      "epoch": 12.816825914705197,
+      "grad_norm": 0.3273864984512329,
+      "learning_rate": 0.0004464622559020693,
+      "loss": 3.3361,
+      "step": 44000
+    },
+    {
+      "epoch": 12.816825914705197,
+      "eval_accuracy": 0.3706991691939247,
+      "eval_loss": 3.5476930141448975,
+      "eval_runtime": 182.941,
+      "eval_samples_per_second": 90.986,
+      "eval_steps_per_second": 5.69,
+      "step": 44000
+    },
+    {
+      "epoch": 12.831391284082965,
+      "grad_norm": 0.35794728994369507,
+      "learning_rate": 0.000446287379772661,
+      "loss": 3.3331,
+      "step": 44050
+    },
+    {
+      "epoch": 12.845956653460732,
+      "grad_norm": 0.38504233956336975,
+      "learning_rate": 0.0004461125036432526,
+      "loss": 3.3451,
+      "step": 44100
+    },
+    {
+      "epoch": 12.860522022838499,
+      "grad_norm": 0.3586675822734833,
+      "learning_rate": 0.00044593762751384436,
+      "loss": 3.3517,
+      "step": 44150
+    },
+    {
+      "epoch": 12.875087392216267,
+      "grad_norm": 0.39704591035842896,
+      "learning_rate": 0.000445762751384436,
+      "loss": 3.3519,
+      "step": 44200
+    },
+    {
+      "epoch": 12.889652761594034,
+      "grad_norm": 0.3523213267326355,
+      "learning_rate": 0.0004455878752550277,
+      "loss": 3.3524,
+      "step": 44250
+    },
+    {
+      "epoch": 12.9042181309718,
+      "grad_norm": 0.3636520802974701,
+      "learning_rate": 0.0004454129991256193,
+      "loss": 3.352,
+      "step": 44300
+    },
+    {
+      "epoch": 12.91878350034957,
+      "grad_norm": 0.34113213419914246,
+      "learning_rate": 0.00044523812299621096,
+      "loss": 3.3493,
+      "step": 44350
+    },
+    {
+      "epoch": 12.933348869727336,
+      "grad_norm": 0.33921322226524353,
+      "learning_rate": 0.00044506324686680265,
+      "loss": 3.3483,
+      "step": 44400
+    },
+    {
+      "epoch": 12.947914239105105,
+      "grad_norm": 0.3314392566680908,
+      "learning_rate": 0.0004448883707373943,
+      "loss": 3.3575,
+      "step": 44450
+    },
+    {
+      "epoch": 12.962479608482871,
+      "grad_norm": 0.33687737584114075,
+      "learning_rate": 0.000444713494607986,
+      "loss": 3.3469,
+      "step": 44500
+    },
+    {
+      "epoch": 12.977044977860638,
+      "grad_norm": 0.35737890005111694,
+      "learning_rate": 0.0004445386184785776,
+      "loss": 3.3468,
+      "step": 44550
+    },
+    {
+      "epoch": 12.991610347238407,
+      "grad_norm": 0.3497537672519684,
+      "learning_rate": 0.00044436374234916925,
+      "loss": 3.3558,
+      "step": 44600
+    },
+    {
+      "epoch": 13.006117455138662,
+      "grad_norm": 0.33811378479003906,
+      "learning_rate": 0.000444188866219761,
+      "loss": 3.3249,
+      "step": 44650
+    },
+    {
+      "epoch": 13.02068282451643,
+      "grad_norm": 0.3885464370250702,
+      "learning_rate": 0.00044401399009035263,
+      "loss": 3.246,
+      "step": 44700
+    },
+    {
+      "epoch": 13.035248193894198,
+      "grad_norm": 0.3625794053077698,
+      "learning_rate": 0.0004438391139609443,
+      "loss": 3.2451,
+      "step": 44750
+    },
+    {
+      "epoch": 13.049813563271965,
+      "grad_norm": 0.3693816363811493,
+      "learning_rate": 0.00044366423783153596,
+      "loss": 3.2544,
+      "step": 44800
+    },
+    {
+      "epoch": 13.064378932649731,
+      "grad_norm": 0.36462539434432983,
+      "learning_rate": 0.0004434893617021276,
+      "loss": 3.2496,
+      "step": 44850
+    },
+    {
+      "epoch": 13.0789443020275,
+      "grad_norm": 0.36766380071640015,
+      "learning_rate": 0.0004433144855727193,
+      "loss": 3.2578,
+      "step": 44900
+    },
+    {
+      "epoch": 13.093509671405267,
+      "grad_norm": 0.3557598888874054,
+      "learning_rate": 0.0004431396094433109,
+      "loss": 3.2583,
+      "step": 44950
+    },
+    {
+      "epoch": 13.108075040783035,
+      "grad_norm": 0.373970091342926,
+      "learning_rate": 0.0004429647333139026,
+      "loss": 3.2665,
+      "step": 45000
+    },
+    {
+      "epoch": 13.108075040783035,
+      "eval_accuracy": 0.370416296567142,
+      "eval_loss": 3.5617780685424805,
+      "eval_runtime": 183.1028,
+      "eval_samples_per_second": 90.905,
+      "eval_steps_per_second": 5.685,
+      "step": 45000
+    },
+    {
+      "epoch": 13.122640410160802,
+      "grad_norm": 0.3416595458984375,
+      "learning_rate": 0.00044278985718449425,
+      "loss": 3.2524,
+      "step": 45050
+    },
+    {
+      "epoch": 13.137205779538569,
+      "grad_norm": 0.3591947853565216,
+      "learning_rate": 0.000442614981055086,
+      "loss": 3.2659,
+      "step": 45100
+    },
+    {
+      "epoch": 13.151771148916337,
+      "grad_norm": 0.5230109095573425,
+      "learning_rate": 0.00044244010492567763,
+      "loss": 3.2759,
+      "step": 45150
+    },
+    {
+      "epoch": 13.166336518294104,
+      "grad_norm": 0.33014577627182007,
+      "learning_rate": 0.00044226522879626927,
+      "loss": 3.279,
+      "step": 45200
+    },
+    {
+      "epoch": 13.18090188767187,
+      "grad_norm": 0.3678034543991089,
+      "learning_rate": 0.00044209035266686096,
+      "loss": 3.2777,
+      "step": 45250
+    },
+    {
+      "epoch": 13.19546725704964,
+      "grad_norm": 0.35158881545066833,
+      "learning_rate": 0.0004419154765374526,
+      "loss": 3.2766,
+      "step": 45300
+    },
+    {
+      "epoch": 13.210032626427406,
+      "grad_norm": 0.3579759895801544,
+      "learning_rate": 0.0004417406004080443,
+      "loss": 3.2681,
+      "step": 45350
+    },
+    {
+      "epoch": 13.224597995805174,
+      "grad_norm": 0.36432188749313354,
+      "learning_rate": 0.0004415657242786359,
+      "loss": 3.2759,
+      "step": 45400
+    },
+    {
+      "epoch": 13.239163365182941,
+      "grad_norm": 0.3609389662742615,
+      "learning_rate": 0.00044139084814922755,
+      "loss": 3.2998,
+      "step": 45450
+    },
+    {
+      "epoch": 13.253728734560708,
+      "grad_norm": 0.3791234791278839,
+      "learning_rate": 0.00044121597201981924,
+      "loss": 3.2908,
+      "step": 45500
+    },
+    {
+      "epoch": 13.268294103938477,
+      "grad_norm": 0.3537718951702118,
+      "learning_rate": 0.0004410410958904109,
+      "loss": 3.2842,
+      "step": 45550
+    },
+    {
+      "epoch": 13.282859473316243,
+      "grad_norm": 0.3571259081363678,
+      "learning_rate": 0.0004408662197610026,
+      "loss": 3.306,
+      "step": 45600
+    },
+    {
+      "epoch": 13.29742484269401,
+      "grad_norm": 0.3450362980365753,
+      "learning_rate": 0.00044069134363159426,
+      "loss": 3.2932,
+      "step": 45650
+    },
+    {
+      "epoch": 13.311990212071779,
+      "grad_norm": 0.37204447388648987,
+      "learning_rate": 0.00044051646750218595,
+      "loss": 3.2995,
+      "step": 45700
+    },
+    {
+      "epoch": 13.326555581449545,
+      "grad_norm": 0.3619859516620636,
+      "learning_rate": 0.0004403415913727776,
+      "loss": 3.3131,
+      "step": 45750
+    },
+    {
+      "epoch": 13.341120950827314,
+      "grad_norm": 0.40078550577163696,
+      "learning_rate": 0.0004401667152433692,
+      "loss": 3.3,
+      "step": 45800
+    },
+    {
+      "epoch": 13.35568632020508,
+      "grad_norm": 0.3776214122772217,
+      "learning_rate": 0.0004399918391139609,
+      "loss": 3.3003,
+      "step": 45850
+    },
+    {
+      "epoch": 13.370251689582847,
+      "grad_norm": 0.34433192014694214,
+      "learning_rate": 0.00043981696298455255,
+      "loss": 3.2994,
+      "step": 45900
+    },
+    {
+      "epoch": 13.384817058960616,
+      "grad_norm": 0.3606876730918884,
+      "learning_rate": 0.00043964208685514424,
+      "loss": 3.3046,
+      "step": 45950
+    },
+    {
+      "epoch": 13.399382428338383,
+      "grad_norm": 0.3861118257045746,
+      "learning_rate": 0.0004394672107257359,
+      "loss": 3.2949,
+      "step": 46000
+    },
+    {
+      "epoch": 13.399382428338383,
+      "eval_accuracy": 0.37078323150820136,
+      "eval_loss": 3.5553009510040283,
+      "eval_runtime": 182.8517,
+      "eval_samples_per_second": 91.03,
+      "eval_steps_per_second": 5.693,
+      "step": 46000
+    },
+    {
+      "epoch": 13.41394779771615,
+      "grad_norm": 0.4186341464519501,
+      "learning_rate": 0.0004392923345963275,
+      "loss": 3.315,
+      "step": 46050
+    },
+    {
+      "epoch": 13.428513167093918,
+      "grad_norm": 0.33551856875419617,
+      "learning_rate": 0.00043911745846691926,
+      "loss": 3.3034,
+      "step": 46100
+    },
+    {
+      "epoch": 13.443078536471685,
+      "grad_norm": 0.3242250978946686,
+      "learning_rate": 0.0004389425823375109,
+      "loss": 3.3139,
+      "step": 46150
+    },
+    {
+      "epoch": 13.457643905849451,
+      "grad_norm": 0.3546141982078552,
+      "learning_rate": 0.0004387677062081026,
+      "loss": 3.3164,
+      "step": 46200
+    },
+    {
+      "epoch": 13.47220927522722,
+      "grad_norm": 0.3523365557193756,
+      "learning_rate": 0.0004385928300786942,
+      "loss": 3.3058,
+      "step": 46250
+    },
+    {
+      "epoch": 13.486774644604987,
+      "grad_norm": 0.3621370792388916,
+      "learning_rate": 0.0004384179539492859,
+      "loss": 3.3242,
+      "step": 46300
+    },
+    {
+      "epoch": 13.501340013982755,
+      "grad_norm": 0.36154744029045105,
+      "learning_rate": 0.00043824307781987755,
+      "loss": 3.3023,
+      "step": 46350
+    },
+    {
+      "epoch": 13.515905383360522,
+      "grad_norm": 0.34886816143989563,
+      "learning_rate": 0.0004380682016904692,
+      "loss": 3.3159,
+      "step": 46400
+    },
+    {
+      "epoch": 13.530470752738289,
+      "grad_norm": 0.3677714467048645,
+      "learning_rate": 0.0004378933255610609,
+      "loss": 3.3182,
+      "step": 46450
+    },
+    {
+      "epoch": 13.545036122116057,
+      "grad_norm": 0.351188987493515,
+      "learning_rate": 0.0004377184494316525,
+      "loss": 3.3209,
+      "step": 46500
+    },
+    {
+      "epoch": 13.559601491493824,
+      "grad_norm": 0.3655342161655426,
+      "learning_rate": 0.00043754357330224426,
+      "loss": 3.3248,
+      "step": 46550
+    },
+    {
+      "epoch": 13.574166860871593,
+      "grad_norm": 0.3502582907676697,
+      "learning_rate": 0.0004373686971728359,
+      "loss": 3.3134,
+      "step": 46600
+    },
+    {
+      "epoch": 13.58873223024936,
+      "grad_norm": 0.36190810799598694,
+      "learning_rate": 0.00043719382104342753,
+      "loss": 3.3175,
+      "step": 46650
+    },
+    {
+      "epoch": 13.603297599627126,
+      "grad_norm": 0.35235005617141724,
+      "learning_rate": 0.0004370189449140192,
+      "loss": 3.3199,
+      "step": 46700
+    },
+    {
+      "epoch": 13.617862969004895,
+      "grad_norm": 0.3493868410587311,
+      "learning_rate": 0.00043684406878461085,
+      "loss": 3.3251,
+      "step": 46750
+    },
+    {
+      "epoch": 13.632428338382661,
+      "grad_norm": 0.3571072816848755,
+      "learning_rate": 0.00043666919265520254,
+      "loss": 3.3171,
+      "step": 46800
+    },
+    {
+      "epoch": 13.646993707760428,
+      "grad_norm": 0.3887588083744049,
+      "learning_rate": 0.0004364943165257942,
+      "loss": 3.3209,
+      "step": 46850
+    },
+    {
+      "epoch": 13.661559077138197,
+      "grad_norm": 0.3646533489227295,
+      "learning_rate": 0.0004363194403963858,
+      "loss": 3.3227,
+      "step": 46900
+    },
+    {
+      "epoch": 13.676124446515963,
+      "grad_norm": 0.35989436507225037,
+      "learning_rate": 0.0004361445642669775,
+      "loss": 3.3222,
+      "step": 46950
+    },
+    {
+      "epoch": 13.69068981589373,
+      "grad_norm": 0.34057942032814026,
+      "learning_rate": 0.00043596968813756914,
+      "loss": 3.3255,
+      "step": 47000
+    },
+    {
+      "epoch": 13.69068981589373,
+      "eval_accuracy": 0.3710363590083939,
+      "eval_loss": 3.5472984313964844,
+      "eval_runtime": 182.8591,
+      "eval_samples_per_second": 91.026,
+      "eval_steps_per_second": 5.693,
+      "step": 47000
+    },
+    {
+      "epoch": 13.705255185271499,
+      "grad_norm": 0.3698166310787201,
+      "learning_rate": 0.0004357948120081609,
+      "loss": 3.3216,
+      "step": 47050
+    },
+    {
+      "epoch": 13.719820554649266,
+      "grad_norm": 0.3621658682823181,
+      "learning_rate": 0.0004356199358787525,
+      "loss": 3.3184,
+      "step": 47100
+    },
+    {
+      "epoch": 13.734385924027034,
+      "grad_norm": 0.36228302121162415,
+      "learning_rate": 0.0004354450597493442,
+      "loss": 3.3329,
+      "step": 47150
+    },
+    {
+      "epoch": 13.7489512934048,
+      "grad_norm": 0.33166074752807617,
+      "learning_rate": 0.00043527018361993585,
+      "loss": 3.3176,
+      "step": 47200
+    },
+    {
+      "epoch": 13.763516662782568,
+      "grad_norm": 0.3457214832305908,
+      "learning_rate": 0.0004350953074905275,
+      "loss": 3.3255,
+      "step": 47250
+    },
+    {
+      "epoch": 13.778082032160336,
+      "grad_norm": 0.38725757598876953,
+      "learning_rate": 0.0004349204313611192,
+      "loss": 3.3119,
+      "step": 47300
+    },
+    {
+      "epoch": 13.792647401538103,
+      "grad_norm": 0.3344460427761078,
+      "learning_rate": 0.0004347455552317108,
+      "loss": 3.3217,
+      "step": 47350
+    },
+    {
+      "epoch": 13.80721277091587,
+      "grad_norm": 0.3496969938278198,
+      "learning_rate": 0.0004345706791023025,
+      "loss": 3.3271,
+      "step": 47400
+    },
+    {
+      "epoch": 13.821778140293638,
+      "grad_norm": 0.3360762596130371,
+      "learning_rate": 0.00043439580297289414,
+      "loss": 3.3266,
+      "step": 47450
+    },
+    {
+      "epoch": 13.836343509671405,
+      "grad_norm": 0.34943684935569763,
+      "learning_rate": 0.0004342209268434858,
+      "loss": 3.3241,
+      "step": 47500
+    },
+    {
+      "epoch": 13.850908879049173,
+      "grad_norm": 0.3448510468006134,
+      "learning_rate": 0.0004340460507140775,
+      "loss": 3.3468,
+      "step": 47550
+    },
+    {
+      "epoch": 13.86547424842694,
+      "grad_norm": 0.35688409209251404,
+      "learning_rate": 0.00043387117458466916,
+      "loss": 3.3362,
+      "step": 47600
+    },
+    {
+      "epoch": 13.880039617804707,
+      "grad_norm": 0.36410847306251526,
+      "learning_rate": 0.00043369629845526085,
+      "loss": 3.3353,
+      "step": 47650
+    },
+    {
+      "epoch": 13.894604987182475,
+      "grad_norm": 0.33943018317222595,
+      "learning_rate": 0.0004335214223258525,
+      "loss": 3.3441,
+      "step": 47700
+    },
+    {
+      "epoch": 13.909170356560242,
+      "grad_norm": 0.34998592734336853,
+      "learning_rate": 0.0004333465461964442,
+      "loss": 3.3327,
+      "step": 47750
+    },
+    {
+      "epoch": 13.923735725938009,
+      "grad_norm": 0.36426639556884766,
+      "learning_rate": 0.0004331716700670358,
+      "loss": 3.3365,
+      "step": 47800
+    },
+    {
+      "epoch": 13.938301095315778,
+      "grad_norm": 0.3527946472167969,
+      "learning_rate": 0.00043299679393762745,
+      "loss": 3.3339,
+      "step": 47850
+    },
+    {
+      "epoch": 13.952866464693544,
+      "grad_norm": 0.36141130328178406,
+      "learning_rate": 0.00043282191780821914,
+      "loss": 3.3355,
+      "step": 47900
+    },
+    {
+      "epoch": 13.967431834071313,
+      "grad_norm": 0.35111555457115173,
+      "learning_rate": 0.00043264704167881077,
+      "loss": 3.3369,
+      "step": 47950
+    },
+    {
+      "epoch": 13.98199720344908,
+      "grad_norm": 0.3544767498970032,
+      "learning_rate": 0.0004324721655494025,
+      "loss": 3.3499,
+      "step": 48000
+    },
+    {
+      "epoch": 13.98199720344908,
+      "eval_accuracy": 0.37179233198853223,
+      "eval_loss": 3.53971004486084,
+      "eval_runtime": 182.9517,
+      "eval_samples_per_second": 90.98,
+      "eval_steps_per_second": 5.69,
+      "step": 48000
+    },
+    {
+      "epoch": 13.996562572826846,
+      "grad_norm": 0.3594323992729187,
+      "learning_rate": 0.00043229728941999415,
+      "loss": 3.3321,
+      "step": 48050
+    },
+    {
+      "epoch": 14.011069680727104,
+      "grad_norm": 0.37229645252227783,
+      "learning_rate": 0.0004321224132905858,
+      "loss": 3.2537,
+      "step": 48100
+    },
+    {
+      "epoch": 14.02563505010487,
+      "grad_norm": 0.3777659237384796,
+      "learning_rate": 0.0004319475371611775,
+      "loss": 3.2282,
+      "step": 48150
+    },
+    {
+      "epoch": 14.040200419482638,
+      "grad_norm": 0.34410324692726135,
+      "learning_rate": 0.0004317726610317691,
+      "loss": 3.237,
+      "step": 48200
+    },
+    {
+      "epoch": 14.054765788860406,
+      "grad_norm": 0.38047492504119873,
+      "learning_rate": 0.0004315977849023608,
+      "loss": 3.2391,
+      "step": 48250
+    },
+    {
+      "epoch": 14.069331158238173,
+      "grad_norm": 0.3687790334224701,
+      "learning_rate": 0.00043142290877295244,
+      "loss": 3.2302,
+      "step": 48300
+    },
+    {
+      "epoch": 14.08389652761594,
+      "grad_norm": 0.35645702481269836,
+      "learning_rate": 0.00043124803264354413,
+      "loss": 3.2453,
+      "step": 48350
+    },
+    {
+      "epoch": 14.098461896993708,
+      "grad_norm": 0.3644995093345642,
+      "learning_rate": 0.00043107315651413577,
+      "loss": 3.2361,
+      "step": 48400
+    },
+    {
+      "epoch": 14.113027266371475,
+      "grad_norm": 0.3667345643043518,
+      "learning_rate": 0.0004308982803847274,
+      "loss": 3.2444,
+      "step": 48450
+    },
+    {
+      "epoch": 14.127592635749243,
+      "grad_norm": 0.3460175693035126,
+      "learning_rate": 0.00043072340425531915,
+      "loss": 3.2458,
+      "step": 48500
+    },
+    {
+      "epoch": 14.14215800512701,
+      "grad_norm": 0.38901305198669434,
+      "learning_rate": 0.0004305485281259108,
+      "loss": 3.2477,
+      "step": 48550
+    },
+    {
+      "epoch": 14.156723374504777,
+      "grad_norm": 0.3598061203956604,
+      "learning_rate": 0.0004303736519965025,
+      "loss": 3.2543,
+      "step": 48600
+    },
+    {
+      "epoch": 14.171288743882545,
+      "grad_norm": 0.37196290493011475,
+      "learning_rate": 0.0004301987758670941,
+      "loss": 3.2641,
+      "step": 48650
+    },
+    {
+      "epoch": 14.185854113260312,
+      "grad_norm": 0.348818302154541,
+      "learning_rate": 0.00043002389973768575,
+      "loss": 3.2655,
+      "step": 48700
+    },
+    {
+      "epoch": 14.200419482638079,
+      "grad_norm": 0.3512094020843506,
+      "learning_rate": 0.00042984902360827744,
+      "loss": 3.2692,
+      "step": 48750
+    },
+    {
+      "epoch": 14.214984852015847,
+      "grad_norm": 0.36507096886634827,
+      "learning_rate": 0.0004296741474788691,
+      "loss": 3.2659,
+      "step": 48800
+    },
+    {
+      "epoch": 14.229550221393614,
+      "grad_norm": 0.3447706997394562,
+      "learning_rate": 0.00042949927134946077,
+      "loss": 3.2631,
+      "step": 48850
+    },
+    {
+      "epoch": 14.244115590771383,
+      "grad_norm": 0.34184086322784424,
+      "learning_rate": 0.0004293243952200524,
+      "loss": 3.2789,
+      "step": 48900
+    },
+    {
+      "epoch": 14.25868096014915,
+      "grad_norm": 0.352250337600708,
+      "learning_rate": 0.00042914951909064415,
+      "loss": 3.2707,
+      "step": 48950
+    },
+    {
+      "epoch": 14.273246329526916,
+      "grad_norm": 0.3569357693195343,
+      "learning_rate": 0.0004289746429612358,
+      "loss": 3.2776,
+      "step": 49000
+    },
+    {
+      "epoch": 14.273246329526916,
+      "eval_accuracy": 0.37109361543783825,
+      "eval_loss": 3.5551700592041016,
+      "eval_runtime": 182.8453,
+      "eval_samples_per_second": 91.033,
+      "eval_steps_per_second": 5.693,
+      "step": 49000
+    },
+    {
+      "epoch": 14.287811698904685,
+      "grad_norm": 0.3431905508041382,
+      "learning_rate": 0.0004287997668318274,
+      "loss": 3.2826,
+      "step": 49050
+    },
+    {
+      "epoch": 14.302377068282452,
+      "grad_norm": 0.3872802257537842,
+      "learning_rate": 0.0004286248907024191,
+      "loss": 3.2845,
+      "step": 49100
+    },
+    {
+      "epoch": 14.316942437660218,
+      "grad_norm": 0.36308524012565613,
+      "learning_rate": 0.00042845001457301075,
+      "loss": 3.2846,
+      "step": 49150
+    },
+    {
+      "epoch": 14.331507807037987,
+      "grad_norm": 0.4181770384311676,
+      "learning_rate": 0.00042827513844360244,
+      "loss": 3.2949,
+      "step": 49200
+    },
+    {
+      "epoch": 14.346073176415754,
+      "grad_norm": 0.33394357562065125,
+      "learning_rate": 0.00042810026231419407,
+      "loss": 3.2837,
+      "step": 49250
+    },
+    {
+      "epoch": 14.360638545793522,
+      "grad_norm": 0.3773725628852844,
+      "learning_rate": 0.0004279253861847857,
+      "loss": 3.28,
+      "step": 49300
+    },
+    {
+      "epoch": 14.375203915171289,
+      "grad_norm": 0.3761826455593109,
+      "learning_rate": 0.0004277505100553774,
+      "loss": 3.2864,
+      "step": 49350
+    },
+    {
+      "epoch": 14.389769284549056,
+      "grad_norm": 0.36056873202323914,
+      "learning_rate": 0.00042757563392596904,
+      "loss": 3.2878,
+      "step": 49400
+    },
+    {
+      "epoch": 14.404334653926824,
+      "grad_norm": 0.3693695366382599,
+      "learning_rate": 0.0004274007577965608,
+      "loss": 3.2889,
+      "step": 49450
+    },
+    {
+      "epoch": 14.418900023304591,
+      "grad_norm": 0.34379875659942627,
+      "learning_rate": 0.0004272258816671524,
+      "loss": 3.2927,
+      "step": 49500
+    },
+    {
+      "epoch": 14.433465392682358,
+      "grad_norm": 0.3781634569168091,
+      "learning_rate": 0.00042705100553774405,
+      "loss": 3.3048,
+      "step": 49550
+    },
+    {
+      "epoch": 14.448030762060126,
+      "grad_norm": 0.3523657023906708,
+      "learning_rate": 0.00042687612940833574,
+      "loss": 3.2899,
+      "step": 49600
+    },
+    {
+      "epoch": 14.462596131437893,
+      "grad_norm": 0.365022212266922,
+      "learning_rate": 0.0004267012532789274,
+      "loss": 3.2877,
+      "step": 49650
+    },
+    {
+      "epoch": 14.477161500815662,
+      "grad_norm": 0.3355906009674072,
+      "learning_rate": 0.00042652637714951907,
+      "loss": 3.2973,
+      "step": 49700
+    },
+    {
+      "epoch": 14.491726870193428,
+      "grad_norm": 0.37081584334373474,
+      "learning_rate": 0.0004263515010201107,
+      "loss": 3.3001,
+      "step": 49750
+    },
+    {
+      "epoch": 14.506292239571195,
+      "grad_norm": 0.3539626896381378,
+      "learning_rate": 0.0004261766248907024,
+      "loss": 3.2871,
+      "step": 49800
+    },
+    {
+      "epoch": 14.520857608948964,
+      "grad_norm": 0.36710795760154724,
+      "learning_rate": 0.00042600174876129403,
+      "loss": 3.2974,
+      "step": 49850
+    },
+    {
+      "epoch": 14.53542297832673,
+      "grad_norm": 0.34463170170783997,
+      "learning_rate": 0.00042582687263188567,
+      "loss": 3.2937,
+      "step": 49900
+    },
+    {
+      "epoch": 14.549988347704497,
+      "grad_norm": 0.3455180525779724,
+      "learning_rate": 0.0004256519965024774,
+      "loss": 3.2996,
+      "step": 49950
+    },
+    {
+      "epoch": 14.564553717082266,
+      "grad_norm": 0.34833067655563354,
+      "learning_rate": 0.00042547712037306905,
+      "loss": 3.2981,
+      "step": 50000
+    },
+    {
+      "epoch": 14.564553717082266,
+      "eval_accuracy": 0.37169604242854265,
+      "eval_loss": 3.54777193069458,
+      "eval_runtime": 182.9961,
+      "eval_samples_per_second": 90.958,
+      "eval_steps_per_second": 5.689,
+      "step": 50000
+    },
+    {
+      "epoch": 14.579119086460032,
+      "grad_norm": 0.3737424314022064,
+      "learning_rate": 0.00042530224424366074,
+      "loss": 3.3015,
+      "step": 50050
+    },
+    {
+      "epoch": 14.5936844558378,
+      "grad_norm": 0.3670142889022827,
+      "learning_rate": 0.0004251273681142524,
+      "loss": 3.3052,
+      "step": 50100
+    },
+    {
+      "epoch": 14.608249825215568,
+      "grad_norm": 0.38956716656684875,
+      "learning_rate": 0.000424952491984844,
+      "loss": 3.3031,
+      "step": 50150
+    },
+    {
+      "epoch": 14.622815194593334,
+      "grad_norm": 0.3635333180427551,
+      "learning_rate": 0.0004247776158554357,
+      "loss": 3.3088,
+      "step": 50200
+    },
+    {
+      "epoch": 14.637380563971103,
+      "grad_norm": 0.3675437569618225,
+      "learning_rate": 0.00042460273972602734,
+      "loss": 3.3087,
+      "step": 50250
+    },
+    {
+      "epoch": 14.65194593334887,
+      "grad_norm": 0.3805491328239441,
+      "learning_rate": 0.00042442786359661903,
+      "loss": 3.3056,
+      "step": 50300
+    },
+    {
+      "epoch": 14.666511302726637,
+      "grad_norm": 0.36096301674842834,
+      "learning_rate": 0.00042425298746721066,
+      "loss": 3.3121,
+      "step": 50350
+    },
+    {
+      "epoch": 14.681076672104405,
+      "grad_norm": 0.35494565963745117,
+      "learning_rate": 0.0004240781113378024,
+      "loss": 3.3134,
+      "step": 50400
+    },
+    {
+      "epoch": 14.695642041482172,
+      "grad_norm": 0.3403199017047882,
+      "learning_rate": 0.00042390323520839405,
+      "loss": 3.316,
+      "step": 50450
+    },
+    {
+      "epoch": 14.71020741085994,
+      "grad_norm": 0.3592222332954407,
+      "learning_rate": 0.0004237283590789857,
+      "loss": 3.3016,
+      "step": 50500
+    },
+    {
+      "epoch": 14.724772780237707,
+      "grad_norm": 0.4019494652748108,
+      "learning_rate": 0.00042355348294957737,
+      "loss": 3.3111,
+      "step": 50550
+    },
+    {
+      "epoch": 14.739338149615474,
+      "grad_norm": 0.3454861044883728,
+      "learning_rate": 0.000423378606820169,
+      "loss": 3.3105,
+      "step": 50600
+    },
+    {
+      "epoch": 14.753903518993242,
+      "grad_norm": 0.35580456256866455,
+      "learning_rate": 0.0004232037306907607,
+      "loss": 3.3183,
+      "step": 50650
+    },
+    {
+      "epoch": 14.76846888837101,
+      "grad_norm": 0.3536444306373596,
+      "learning_rate": 0.00042302885456135233,
+      "loss": 3.3263,
+      "step": 50700
+    },
+    {
+      "epoch": 14.783034257748776,
+      "grad_norm": 0.36378782987594604,
+      "learning_rate": 0.00042285397843194397,
+      "loss": 3.3226,
+      "step": 50750
+    },
+    {
+      "epoch": 14.797599627126544,
+      "grad_norm": 0.3512897193431854,
+      "learning_rate": 0.00042267910230253566,
+      "loss": 3.3141,
+      "step": 50800
+    },
+    {
+      "epoch": 14.812164996504311,
+      "grad_norm": 0.3477229177951813,
+      "learning_rate": 0.0004225042261731273,
+      "loss": 3.3043,
+      "step": 50850
+    },
+    {
+      "epoch": 14.826730365882078,
+      "grad_norm": 0.3499688506126404,
+      "learning_rate": 0.00042232935004371904,
+      "loss": 3.3226,
+      "step": 50900
+    },
+    {
+      "epoch": 14.841295735259846,
+      "grad_norm": 0.34953367710113525,
+      "learning_rate": 0.0004221544739143107,
+      "loss": 3.3144,
+      "step": 50950
+    },
+    {
+      "epoch": 14.855861104637613,
+      "grad_norm": 0.38450056314468384,
+      "learning_rate": 0.00042197959778490237,
+      "loss": 3.3145,
+      "step": 51000
+    },
+    {
+      "epoch": 14.855861104637613,
+      "eval_accuracy": 0.37208954811509365,
+      "eval_loss": 3.5404186248779297,
+      "eval_runtime": 230.8513,
+      "eval_samples_per_second": 72.103,
+      "eval_steps_per_second": 4.509,
+      "step": 51000
+    },
+    {
+      "epoch": 14.870426474015382,
+      "grad_norm": 0.37415948510169983,
+      "learning_rate": 0.000421804721655494,
+      "loss": 3.3218,
+      "step": 51050
+    },
+    {
+      "epoch": 14.884991843393149,
+      "grad_norm": 0.36808714270591736,
+      "learning_rate": 0.00042162984552608564,
+      "loss": 3.3253,
+      "step": 51100
+    },
+    {
+      "epoch": 14.899557212770915,
+      "grad_norm": 0.37905970215797424,
+      "learning_rate": 0.00042145496939667733,
+      "loss": 3.3039,
+      "step": 51150
+    },
+    {
+      "epoch": 14.914122582148684,
+      "grad_norm": 0.3744428753852844,
+      "learning_rate": 0.00042128009326726897,
+      "loss": 3.3124,
+      "step": 51200
+    },
+    {
+      "epoch": 14.92868795152645,
+      "grad_norm": 0.38499951362609863,
+      "learning_rate": 0.00042110521713786066,
+      "loss": 3.3305,
+      "step": 51250
+    },
+    {
+      "epoch": 14.943253320904217,
+      "grad_norm": 0.35487884283065796,
+      "learning_rate": 0.0004209303410084523,
+      "loss": 3.3269,
+      "step": 51300
+    },
+    {
+      "epoch": 14.957818690281986,
+      "grad_norm": 0.35819604992866516,
+      "learning_rate": 0.00042075546487904393,
+      "loss": 3.3218,
+      "step": 51350
+    },
+    {
+      "epoch": 14.972384059659753,
+      "grad_norm": 0.37901991605758667,
+      "learning_rate": 0.0004205805887496357,
+      "loss": 3.3316,
+      "step": 51400
+    },
+    {
+      "epoch": 14.986949429037521,
+      "grad_norm": 0.3591151833534241,
+      "learning_rate": 0.0004204057126202273,
+      "loss": 3.3292,
+      "step": 51450
+    },
+    {
+      "epoch": 15.001456536937777,
+      "grad_norm": 0.3722686171531677,
+      "learning_rate": 0.000420230836490819,
+      "loss": 3.3101,
+      "step": 51500
+    },
+    {
+      "epoch": 15.016021906315544,
+      "grad_norm": 0.3850182592868805,
+      "learning_rate": 0.00042005596036141064,
+      "loss": 3.2084,
+      "step": 51550
+    },
+    {
+      "epoch": 15.030587275693312,
+      "grad_norm": 0.3716738522052765,
+      "learning_rate": 0.0004198810842320023,
+      "loss": 3.2187,
+      "step": 51600
+    },
+    {
+      "epoch": 15.045152645071079,
+      "grad_norm": 0.3463042974472046,
+      "learning_rate": 0.00041970620810259396,
+      "loss": 3.2243,
+      "step": 51650
+    },
+    {
+      "epoch": 15.059718014448846,
+      "grad_norm": 0.3639398515224457,
+      "learning_rate": 0.0004195313319731856,
+      "loss": 3.2268,
+      "step": 51700
+    },
+    {
+      "epoch": 15.074283383826614,
+      "grad_norm": 0.3452468812465668,
+      "learning_rate": 0.0004193564558437773,
+      "loss": 3.222,
+      "step": 51750
+    },
+    {
+      "epoch": 15.088848753204381,
+      "grad_norm": 0.36396175622940063,
+      "learning_rate": 0.0004191815797143689,
+      "loss": 3.2334,
+      "step": 51800
+    },
+    {
+      "epoch": 15.103414122582148,
+      "grad_norm": 0.37959036231040955,
+      "learning_rate": 0.00041900670358496067,
+      "loss": 3.2208,
+      "step": 51850
+    },
+    {
+      "epoch": 15.117979491959916,
+      "grad_norm": 0.3539676070213318,
+      "learning_rate": 0.0004188318274555523,
+      "loss": 3.2322,
+      "step": 51900
+    },
+    {
+      "epoch": 15.132544861337683,
+      "grad_norm": 0.3693428039550781,
+      "learning_rate": 0.00041865695132614394,
+      "loss": 3.2495,
+      "step": 51950
+    },
+    {
+      "epoch": 15.147110230715452,
+      "grad_norm": 0.3708992004394531,
+      "learning_rate": 0.00041848207519673563,
+      "loss": 3.2624,
+      "step": 52000
+    },
+    {
+      "epoch": 15.147110230715452,
+      "eval_accuracy": 0.37109831822465095,
+      "eval_loss": 3.5561776161193848,
+      "eval_runtime": 182.9851,
+      "eval_samples_per_second": 90.964,
+      "eval_steps_per_second": 5.689,
+      "step": 52000
+    },
+    {
+      "epoch": 15.161675600093218,
+      "grad_norm": 0.3789459764957428,
+      "learning_rate": 0.00041830719906732727,
+      "loss": 3.2452,
+      "step": 52050
+    },
+    {
+      "epoch": 15.176240969470985,
+      "grad_norm": 0.3604375720024109,
+      "learning_rate": 0.00041813232293791896,
+      "loss": 3.2586,
+      "step": 52100
+    },
+    {
+      "epoch": 15.190806338848754,
+      "grad_norm": 0.3920636475086212,
+      "learning_rate": 0.0004179574468085106,
+      "loss": 3.2532,
+      "step": 52150
+    },
+    {
+      "epoch": 15.20537170822652,
+      "grad_norm": 0.36862772703170776,
+      "learning_rate": 0.00041778257067910223,
+      "loss": 3.2501,
+      "step": 52200
+    },
+    {
+      "epoch": 15.219937077604287,
+      "grad_norm": 0.3847063481807709,
+      "learning_rate": 0.0004176076945496939,
+      "loss": 3.2635,
+      "step": 52250
+    },
+    {
+      "epoch": 15.234502446982056,
+      "grad_norm": 0.37286511063575745,
+      "learning_rate": 0.00041743281842028556,
+      "loss": 3.2518,
+      "step": 52300
+    },
+    {
+      "epoch": 15.249067816359823,
+      "grad_norm": 0.35213422775268555,
+      "learning_rate": 0.0004172579422908773,
+      "loss": 3.2495,
+      "step": 52350
+    },
+    {
+      "epoch": 15.263633185737591,
+      "grad_norm": 0.37196075916290283,
+      "learning_rate": 0.00041708306616146894,
+      "loss": 3.2574,
+      "step": 52400
+    },
+    {
+      "epoch": 15.278198555115358,
+      "grad_norm": 0.36524370312690735,
+      "learning_rate": 0.00041690819003206063,
+      "loss": 3.2571,
+      "step": 52450
+    },
+    {
+      "epoch": 15.292763924493125,
+      "grad_norm": 0.3636349141597748,
+      "learning_rate": 0.00041673331390265227,
+      "loss": 3.2618,
+      "step": 52500
+    },
+    {
+      "epoch": 15.307329293870893,
+      "grad_norm": 0.37588223814964294,
+      "learning_rate": 0.0004165584377732439,
+      "loss": 3.2782,
+      "step": 52550
+    },
+    {
+      "epoch": 15.32189466324866,
+      "grad_norm": 0.4110542833805084,
+      "learning_rate": 0.0004163835616438356,
+      "loss": 3.2642,
+      "step": 52600
+    },
+    {
+      "epoch": 15.336460032626427,
+      "grad_norm": 0.3869507908821106,
+      "learning_rate": 0.00041620868551442723,
+      "loss": 3.269,
+      "step": 52650
+    },
+    {
+      "epoch": 15.351025402004195,
+      "grad_norm": 0.3692317306995392,
+      "learning_rate": 0.0004160338093850189,
+      "loss": 3.2689,
+      "step": 52700
+    },
+    {
+      "epoch": 15.365590771381962,
+      "grad_norm": 0.3578970730304718,
+      "learning_rate": 0.00041585893325561056,
+      "loss": 3.2703,
+      "step": 52750
+    },
+    {
+      "epoch": 15.38015614075973,
+      "grad_norm": 0.37819942831993103,
+      "learning_rate": 0.0004156840571262022,
+      "loss": 3.2626,
+      "step": 52800
+    },
+    {
+      "epoch": 15.394721510137497,
+      "grad_norm": 0.3595389425754547,
+      "learning_rate": 0.00041550918099679394,
+      "loss": 3.2812,
+      "step": 52850
+    },
+    {
+      "epoch": 15.409286879515264,
+      "grad_norm": 0.36239922046661377,
+      "learning_rate": 0.0004153343048673856,
+      "loss": 3.271,
+      "step": 52900
+    },
+    {
+      "epoch": 15.423852248893033,
+      "grad_norm": 0.3499103784561157,
+      "learning_rate": 0.00041515942873797726,
+      "loss": 3.2892,
+      "step": 52950
+    },
+    {
+      "epoch": 15.4384176182708,
+      "grad_norm": 0.4039655327796936,
+      "learning_rate": 0.0004149845526085689,
+      "loss": 3.2688,
+      "step": 53000
+    },
+    {
+      "epoch": 15.4384176182708,
+      "eval_accuracy": 0.371650190257119,
+      "eval_loss": 3.551593780517578,
+      "eval_runtime": 182.8196,
+      "eval_samples_per_second": 91.046,
+      "eval_steps_per_second": 5.694,
+      "step": 53000
+    },
+    {
+      "epoch": 15.452982987648566,
+      "grad_norm": 0.3695296347141266,
+      "learning_rate": 0.0004148096764791606,
+      "loss": 3.2855,
+      "step": 53050
+    },
+    {
+      "epoch": 15.467548357026335,
+      "grad_norm": 0.3560178875923157,
+      "learning_rate": 0.0004146348003497522,
+      "loss": 3.2919,
+      "step": 53100
+    },
+    {
+      "epoch": 15.482113726404101,
+      "grad_norm": 0.37681275606155396,
+      "learning_rate": 0.00041445992422034386,
+      "loss": 3.2786,
+      "step": 53150
+    },
+    {
+      "epoch": 15.49667909578187,
+      "grad_norm": 0.352400004863739,
+      "learning_rate": 0.00041428504809093555,
+      "loss": 3.2802,
+      "step": 53200
+    },
+    {
+      "epoch": 15.511244465159637,
+      "grad_norm": 0.3676430881023407,
+      "learning_rate": 0.0004141101719615272,
+      "loss": 3.27,
+      "step": 53250
+    },
+    {
+      "epoch": 15.525809834537403,
+      "grad_norm": 0.3506219983100891,
+      "learning_rate": 0.00041393529583211893,
+      "loss": 3.286,
+      "step": 53300
+    },
+    {
+      "epoch": 15.540375203915172,
+      "grad_norm": 0.37572669982910156,
+      "learning_rate": 0.00041376041970271057,
+      "loss": 3.2762,
+      "step": 53350
+    },
+    {
+      "epoch": 15.554940573292939,
+      "grad_norm": 0.37286514043807983,
+      "learning_rate": 0.0004135855435733022,
+      "loss": 3.2919,
+      "step": 53400
+    },
+    {
+      "epoch": 15.569505942670705,
+      "grad_norm": 0.3498823046684265,
+      "learning_rate": 0.0004134106674438939,
+      "loss": 3.281,
+      "step": 53450
+    },
+    {
+      "epoch": 15.584071312048474,
+      "grad_norm": 0.38120484352111816,
+      "learning_rate": 0.00041323579131448553,
+      "loss": 3.2949,
+      "step": 53500
+    },
+    {
+      "epoch": 15.59863668142624,
+      "grad_norm": 0.35132765769958496,
+      "learning_rate": 0.0004130609151850772,
+      "loss": 3.299,
+      "step": 53550
+    },
+    {
+      "epoch": 15.61320205080401,
+      "grad_norm": 0.39883989095687866,
+      "learning_rate": 0.00041288603905566886,
+      "loss": 3.2928,
+      "step": 53600
+    },
+    {
+      "epoch": 15.627767420181776,
+      "grad_norm": 0.3505145013332367,
+      "learning_rate": 0.0004127111629262605,
+      "loss": 3.2905,
+      "step": 53650
+    },
+    {
+      "epoch": 15.642332789559543,
+      "grad_norm": 0.35722294449806213,
+      "learning_rate": 0.0004125362867968522,
+      "loss": 3.3067,
+      "step": 53700
+    },
+    {
+      "epoch": 15.656898158937311,
+      "grad_norm": 0.34419217705726624,
+      "learning_rate": 0.0004123614106674438,
+      "loss": 3.3027,
+      "step": 53750
+    },
+    {
+      "epoch": 15.671463528315078,
+      "grad_norm": 0.3667759895324707,
+      "learning_rate": 0.00041218653453803557,
+      "loss": 3.293,
+      "step": 53800
+    },
+    {
+      "epoch": 15.686028897692845,
+      "grad_norm": 0.35497573018074036,
+      "learning_rate": 0.0004120116584086272,
+      "loss": 3.2875,
+      "step": 53850
+    },
+    {
+      "epoch": 15.700594267070613,
+      "grad_norm": 0.3888442814350128,
+      "learning_rate": 0.0004118367822792189,
+      "loss": 3.2854,
+      "step": 53900
+    },
+    {
+      "epoch": 15.71515963644838,
+      "grad_norm": 0.3837001621723175,
+      "learning_rate": 0.00041166190614981053,
+      "loss": 3.3003,
+      "step": 53950
+    },
+    {
+      "epoch": 15.729725005826147,
+      "grad_norm": 0.3705950081348419,
+      "learning_rate": 0.00041148703002040217,
+      "loss": 3.2895,
+      "step": 54000
+    },
+    {
+      "epoch": 15.729725005826147,
+      "eval_accuracy": 0.3725364304319686,
+      "eval_loss": 3.5416035652160645,
+      "eval_runtime": 194.42,
+      "eval_samples_per_second": 85.614,
+      "eval_steps_per_second": 5.354,
+      "step": 54000
+    },
+    {
+      "epoch": 15.744290375203915,
+      "grad_norm": 0.3562714159488678,
+      "learning_rate": 0.00041131215389099386,
+      "loss": 3.3019,
+      "step": 54050
+    },
+    {
+      "epoch": 15.758855744581682,
+      "grad_norm": 0.3780875504016876,
+      "learning_rate": 0.0004111372777615855,
+      "loss": 3.2982,
+      "step": 54100
+    },
+    {
+      "epoch": 15.77342111395945,
+      "grad_norm": 0.34928643703460693,
+      "learning_rate": 0.0004109624016321772,
+      "loss": 3.293,
+      "step": 54150
+    },
+    {
+      "epoch": 15.787986483337217,
+      "grad_norm": 0.3641933500766754,
+      "learning_rate": 0.0004107875255027688,
+      "loss": 3.2979,
+      "step": 54200
+    },
+    {
+      "epoch": 15.802551852714984,
+      "grad_norm": 0.34777817130088806,
+      "learning_rate": 0.00041061264937336045,
+      "loss": 3.3047,
+      "step": 54250
+    },
+    {
+      "epoch": 15.817117222092753,
+      "grad_norm": 0.3781992793083191,
+      "learning_rate": 0.0004104377732439522,
+      "loss": 3.3023,
+      "step": 54300
+    },
+    {
+      "epoch": 15.83168259147052,
+      "grad_norm": 0.35115933418273926,
+      "learning_rate": 0.00041026289711454384,
+      "loss": 3.3009,
+      "step": 54350
+    },
+    {
+      "epoch": 15.846247960848288,
+      "grad_norm": 0.3789314925670624,
+      "learning_rate": 0.0004100880209851355,
+      "loss": 3.3064,
+      "step": 54400
+    },
+    {
+      "epoch": 15.860813330226055,
+      "grad_norm": 0.35568493604660034,
+      "learning_rate": 0.00040991314485572716,
+      "loss": 3.2978,
+      "step": 54450
+    },
+    {
+      "epoch": 15.875378699603822,
+      "grad_norm": 0.34552639722824097,
+      "learning_rate": 0.00040973826872631885,
+      "loss": 3.3108,
+      "step": 54500
+    },
+    {
+      "epoch": 15.88994406898159,
+      "grad_norm": 0.34846824407577515,
+      "learning_rate": 0.0004095633925969105,
+      "loss": 3.3053,
+      "step": 54550
+    },
+    {
+      "epoch": 15.904509438359357,
+      "grad_norm": 0.37127169966697693,
+      "learning_rate": 0.0004093885164675021,
+      "loss": 3.3031,
+      "step": 54600
+    },
+    {
+      "epoch": 15.919074807737124,
+      "grad_norm": 0.37105128169059753,
+      "learning_rate": 0.0004092136403380938,
+      "loss": 3.3211,
+      "step": 54650
+    },
+    {
+      "epoch": 15.933640177114892,
+      "grad_norm": 0.34885090589523315,
+      "learning_rate": 0.00040903876420868545,
+      "loss": 3.3023,
+      "step": 54700
+    },
+    {
+      "epoch": 15.948205546492659,
+      "grad_norm": 0.3694795072078705,
+      "learning_rate": 0.00040886388807927714,
+      "loss": 3.3139,
+      "step": 54750
+    },
+    {
+      "epoch": 15.962770915870426,
+      "grad_norm": 0.37064146995544434,
+      "learning_rate": 0.00040868901194986883,
+      "loss": 3.2984,
+      "step": 54800
+    },
+    {
+      "epoch": 15.977336285248194,
+      "grad_norm": 0.3710354268550873,
+      "learning_rate": 0.00040851413582046047,
+      "loss": 3.314,
+      "step": 54850
+    },
+    {
+      "epoch": 15.991901654625961,
+      "grad_norm": 0.3885522484779358,
+      "learning_rate": 0.00040833925969105216,
+      "loss": 3.3173,
+      "step": 54900
+    },
+    {
+      "epoch": 16.006408762526217,
+      "grad_norm": 0.3504789173603058,
+      "learning_rate": 0.0004081643835616438,
+      "loss": 3.2611,
+      "step": 54950
+    },
+    {
+      "epoch": 16.020974131903984,
+      "grad_norm": 0.380199670791626,
+      "learning_rate": 0.0004079895074322355,
+      "loss": 3.2005,
+      "step": 55000
+    },
+    {
+      "epoch": 16.020974131903984,
+      "eval_accuracy": 0.37211976352036513,
+      "eval_loss": 3.5526411533355713,
+      "eval_runtime": 244.6938,
+      "eval_samples_per_second": 68.024,
+      "eval_steps_per_second": 4.254,
+      "step": 55000
+    },
+    {
+      "epoch": 16.035539501281754,
+      "grad_norm": 0.38104501366615295,
+      "learning_rate": 0.0004078146313028271,
+      "loss": 3.2048,
+      "step": 55050
+    },
+    {
+      "epoch": 16.05010487065952,
+      "grad_norm": 0.3903481364250183,
+      "learning_rate": 0.0004076397551734188,
+      "loss": 3.2114,
+      "step": 55100
+    },
+    {
+      "epoch": 16.064670240037287,
+      "grad_norm": 0.39314690232276917,
+      "learning_rate": 0.00040746487904401045,
+      "loss": 3.2053,
+      "step": 55150
+    },
+    {
+      "epoch": 16.079235609415054,
+      "grad_norm": 0.38397568464279175,
+      "learning_rate": 0.0004072900029146021,
+      "loss": 3.2069,
+      "step": 55200
+    },
+    {
+      "epoch": 16.09380097879282,
+      "grad_norm": 0.3606434762477875,
+      "learning_rate": 0.0004071151267851938,
+      "loss": 3.2255,
+      "step": 55250
+    },
+    {
+      "epoch": 16.10836634817059,
+      "grad_norm": 0.41309845447540283,
+      "learning_rate": 0.00040694025065578546,
+      "loss": 3.2299,
+      "step": 55300
+    },
+    {
+      "epoch": 16.122931717548358,
+      "grad_norm": 0.3519381582736969,
+      "learning_rate": 0.00040676537452637716,
+      "loss": 3.2235,
+      "step": 55350
+    },
+    {
+      "epoch": 16.137497086926125,
+      "grad_norm": 0.3964149057865143,
+      "learning_rate": 0.0004065904983969688,
+      "loss": 3.2243,
+      "step": 55400
+    },
+    {
+      "epoch": 16.15206245630389,
+      "grad_norm": 0.38055652379989624,
+      "learning_rate": 0.00040641562226756043,
+      "loss": 3.2306,
+      "step": 55450
+    },
+    {
+      "epoch": 16.16662782568166,
+      "grad_norm": 0.3741556406021118,
+      "learning_rate": 0.0004062407461381521,
+      "loss": 3.2467,
+      "step": 55500
+    },
+    {
+      "epoch": 16.181193195059425,
+      "grad_norm": 0.36357757449150085,
+      "learning_rate": 0.00040606587000874375,
+      "loss": 3.2398,
+      "step": 55550
+    },
+    {
+      "epoch": 16.195758564437195,
+      "grad_norm": 0.3765939772129059,
+      "learning_rate": 0.00040589099387933544,
+      "loss": 3.2399,
+      "step": 55600
+    },
+    {
+      "epoch": 16.210323933814962,
+      "grad_norm": 0.4116554856300354,
+      "learning_rate": 0.0004057161177499271,
+      "loss": 3.2463,
+      "step": 55650
+    },
+    {
+      "epoch": 16.22488930319273,
+      "grad_norm": 0.35418593883514404,
+      "learning_rate": 0.0004055412416205187,
+      "loss": 3.2358,
+      "step": 55700
+    },
+    {
+      "epoch": 16.239454672570496,
+      "grad_norm": 0.3857363164424896,
+      "learning_rate": 0.0004053663654911104,
+      "loss": 3.2392,
+      "step": 55750
+    },
+    {
+      "epoch": 16.254020041948262,
+      "grad_norm": 0.38058850169181824,
+      "learning_rate": 0.0004051914893617021,
+      "loss": 3.2453,
+      "step": 55800
+    },
+    {
+      "epoch": 16.268585411326033,
+      "grad_norm": 0.389410138130188,
+      "learning_rate": 0.0004050166132322938,
+      "loss": 3.2443,
+      "step": 55850
+    },
+    {
+      "epoch": 16.2831507807038,
+      "grad_norm": 0.3780372440814972,
+      "learning_rate": 0.0004048417371028854,
+      "loss": 3.2311,
+      "step": 55900
+    },
+    {
+      "epoch": 16.297716150081566,
+      "grad_norm": 0.3871661126613617,
+      "learning_rate": 0.0004046668609734771,
+      "loss": 3.2533,
+      "step": 55950
+    },
+    {
+      "epoch": 16.312281519459333,
+      "grad_norm": 0.3906373083591461,
+      "learning_rate": 0.00040449198484406875,
+      "loss": 3.2617,
+      "step": 56000
+    },
+    {
+      "epoch": 16.312281519459333,
+      "eval_accuracy": 0.37166923654371037,
+      "eval_loss": 3.552480697631836,
+      "eval_runtime": 183.2175,
+      "eval_samples_per_second": 90.848,
+      "eval_steps_per_second": 5.682,
+      "step": 56000
+    },
+    {
+      "epoch": 16.3268468888371,
+      "grad_norm": 0.38734468817710876,
+      "learning_rate": 0.0004043171087146604,
+      "loss": 3.2497,
+      "step": 56050
+    },
+    {
+      "epoch": 16.34141225821487,
+      "grad_norm": 0.3852713406085968,
+      "learning_rate": 0.0004041422325852521,
+      "loss": 3.2587,
+      "step": 56100
+    },
+    {
+      "epoch": 16.355977627592637,
+      "grad_norm": 0.3770441710948944,
+      "learning_rate": 0.0004039673564558437,
+      "loss": 3.2505,
+      "step": 56150
+    },
+    {
+      "epoch": 16.370542996970403,
+      "grad_norm": 0.36964693665504456,
+      "learning_rate": 0.0004037924803264354,
+      "loss": 3.2582,
+      "step": 56200
+    },
+    {
+      "epoch": 16.38510836634817,
+      "grad_norm": 0.3863416016101837,
+      "learning_rate": 0.00040361760419702704,
+      "loss": 3.2596,
+      "step": 56250
+    },
+    {
+      "epoch": 16.399673735725937,
+      "grad_norm": 0.4353122115135193,
+      "learning_rate": 0.00040344272806761873,
+      "loss": 3.2657,
+      "step": 56300
+    },
+    {
+      "epoch": 16.414239105103704,
+      "grad_norm": 0.3620751202106476,
+      "learning_rate": 0.0004032678519382104,
+      "loss": 3.2607,
+      "step": 56350
+    },
+    {
+      "epoch": 16.428804474481474,
+      "grad_norm": 0.3591553568840027,
+      "learning_rate": 0.00040309297580880206,
+      "loss": 3.2657,
+      "step": 56400
+    },
+    {
+      "epoch": 16.44336984385924,
+      "grad_norm": 0.3680947721004486,
+      "learning_rate": 0.00040291809967939375,
+      "loss": 3.27,
+      "step": 56450
+    },
+    {
+      "epoch": 16.457935213237008,
+      "grad_norm": 0.3693728446960449,
+      "learning_rate": 0.0004027432235499854,
+      "loss": 3.2616,
+      "step": 56500
+    },
+    {
+      "epoch": 16.472500582614774,
+      "grad_norm": 0.3734280467033386,
+      "learning_rate": 0.0004025683474205771,
+      "loss": 3.261,
+      "step": 56550
+    },
+    {
+      "epoch": 16.48706595199254,
+      "grad_norm": 0.38935521245002747,
+      "learning_rate": 0.0004023934712911687,
+      "loss": 3.2739,
+      "step": 56600
+    },
+    {
+      "epoch": 16.50163132137031,
+      "grad_norm": 0.34762194752693176,
+      "learning_rate": 0.00040221859516176035,
+      "loss": 3.2653,
+      "step": 56650
+    },
+    {
+      "epoch": 16.516196690748078,
+      "grad_norm": 0.36241579055786133,
+      "learning_rate": 0.00040204371903235204,
+      "loss": 3.2751,
+      "step": 56700
+    },
+    {
+      "epoch": 16.530762060125845,
+      "grad_norm": 0.37560877203941345,
+      "learning_rate": 0.0004018688429029437,
+      "loss": 3.2705,
+      "step": 56750
+    },
+    {
+      "epoch": 16.54532742950361,
+      "grad_norm": 0.37584882974624634,
+      "learning_rate": 0.0004016939667735354,
+      "loss": 3.2786,
+      "step": 56800
+    },
+    {
+      "epoch": 16.55989279888138,
+      "grad_norm": 0.38627269864082336,
+      "learning_rate": 0.00040151909064412705,
+      "loss": 3.2784,
+      "step": 56850
+    },
+    {
+      "epoch": 16.57445816825915,
+      "grad_norm": 0.37627264857292175,
+      "learning_rate": 0.0004013442145147187,
+      "loss": 3.2719,
+      "step": 56900
+    },
+    {
+      "epoch": 16.589023537636916,
+      "grad_norm": 0.4065730571746826,
+      "learning_rate": 0.0004011693383853104,
+      "loss": 3.2775,
+      "step": 56950
+    },
+    {
+      "epoch": 16.603588907014682,
+      "grad_norm": 0.382291704416275,
+      "learning_rate": 0.000400994462255902,
+      "loss": 3.2742,
+      "step": 57000
+    },
+    {
+      "epoch": 16.603588907014682,
+      "eval_accuracy": 0.3722539105141968,
+      "eval_loss": 3.5430514812469482,
+      "eval_runtime": 183.267,
+      "eval_samples_per_second": 90.824,
+      "eval_steps_per_second": 5.68,
+      "step": 57000
+    },
+    {
+      "epoch": 16.61815427639245,
+      "grad_norm": 0.3411348760128021,
+      "learning_rate": 0.0004008195861264937,
+      "loss": 3.2873,
+      "step": 57050
+    },
+    {
+      "epoch": 16.632719645770216,
+      "grad_norm": 0.3512347638607025,
+      "learning_rate": 0.00040064470999708534,
+      "loss": 3.2737,
+      "step": 57100
+    },
+    {
+      "epoch": 16.647285015147983,
+      "grad_norm": 0.3661727011203766,
+      "learning_rate": 0.00040046983386767703,
+      "loss": 3.2778,
+      "step": 57150
+    },
+    {
+      "epoch": 16.661850384525753,
+      "grad_norm": 0.36524951457977295,
+      "learning_rate": 0.00040029495773826867,
+      "loss": 3.2854,
+      "step": 57200
+    },
+    {
+      "epoch": 16.67641575390352,
+      "grad_norm": 0.408110111951828,
+      "learning_rate": 0.0004001200816088603,
+      "loss": 3.2824,
+      "step": 57250
+    },
+    {
+      "epoch": 16.690981123281286,
+      "grad_norm": 0.37179291248321533,
+      "learning_rate": 0.00039994520547945205,
+      "loss": 3.2841,
+      "step": 57300
+    },
+    {
+      "epoch": 16.705546492659053,
+      "grad_norm": 0.39645498991012573,
+      "learning_rate": 0.0003997703293500437,
+      "loss": 3.2906,
+      "step": 57350
+    },
+    {
+      "epoch": 16.72011186203682,
+      "grad_norm": 0.5533550977706909,
+      "learning_rate": 0.0003995954532206354,
+      "loss": 3.2817,
+      "step": 57400
+    },
+    {
+      "epoch": 16.73467723141459,
+      "grad_norm": 0.3692794740200043,
+      "learning_rate": 0.000399420577091227,
+      "loss": 3.2839,
+      "step": 57450
+    },
+    {
+      "epoch": 16.749242600792357,
+      "grad_norm": 0.3895152807235718,
+      "learning_rate": 0.00039924570096181865,
+      "loss": 3.2873,
+      "step": 57500
+    },
+    {
+      "epoch": 16.763807970170124,
+      "grad_norm": 0.36398881673812866,
+      "learning_rate": 0.00039907082483241034,
+      "loss": 3.2989,
+      "step": 57550
+    },
+    {
+      "epoch": 16.77837333954789,
+      "grad_norm": 0.3857356309890747,
+      "learning_rate": 0.000398895948703002,
+      "loss": 3.2908,
+      "step": 57600
+    },
+    {
+      "epoch": 16.792938708925657,
+      "grad_norm": 0.37029045820236206,
+      "learning_rate": 0.00039872107257359367,
+      "loss": 3.2951,
+      "step": 57650
+    },
+    {
+      "epoch": 16.807504078303424,
+      "grad_norm": 0.3431582450866699,
+      "learning_rate": 0.0003985461964441853,
+      "loss": 3.3015,
+      "step": 57700
+    },
+    {
+      "epoch": 16.822069447681194,
+      "grad_norm": 0.3670969605445862,
+      "learning_rate": 0.00039837132031477694,
+      "loss": 3.2909,
+      "step": 57750
+    },
+    {
+      "epoch": 16.83663481705896,
+      "grad_norm": 0.4164856970310211,
+      "learning_rate": 0.0003981964441853687,
+      "loss": 3.2918,
+      "step": 57800
+    },
+    {
+      "epoch": 16.851200186436728,
+      "grad_norm": 0.3781265318393707,
+      "learning_rate": 0.0003980215680559603,
+      "loss": 3.2909,
+      "step": 57850
+    },
+    {
+      "epoch": 16.865765555814495,
+      "grad_norm": 0.36362481117248535,
+      "learning_rate": 0.000397846691926552,
+      "loss": 3.2974,
+      "step": 57900
+    },
+    {
+      "epoch": 16.88033092519226,
+      "grad_norm": 0.37053555250167847,
+      "learning_rate": 0.00039767181579714365,
+      "loss": 3.2951,
+      "step": 57950
+    },
+    {
+      "epoch": 16.89489629457003,
+      "grad_norm": 0.3637015223503113,
+      "learning_rate": 0.00039749693966773534,
+      "loss": 3.288,
+      "step": 58000
+    },
+    {
+      "epoch": 16.89489629457003,
+      "eval_accuracy": 0.3729018369673139,
+      "eval_loss": 3.5355286598205566,
+      "eval_runtime": 183.1123,
+      "eval_samples_per_second": 90.901,
+      "eval_steps_per_second": 5.685,
+      "step": 58000
+    },
+    {
+      "epoch": 16.9094616639478,
+      "grad_norm": 0.3660704791545868,
+      "learning_rate": 0.00039732206353832697,
+      "loss": 3.2817,
+      "step": 58050
+    },
+    {
+      "epoch": 16.924027033325565,
+      "grad_norm": 0.36517202854156494,
+      "learning_rate": 0.0003971471874089186,
+      "loss": 3.3008,
+      "step": 58100
+    },
+    {
+      "epoch": 16.938592402703332,
+      "grad_norm": 0.3487236797809601,
+      "learning_rate": 0.0003969723112795103,
+      "loss": 3.2899,
+      "step": 58150
+    },
+    {
+      "epoch": 16.9531577720811,
+      "grad_norm": 0.3602113127708435,
+      "learning_rate": 0.00039679743515010194,
+      "loss": 3.2924,
+      "step": 58200
+    },
+    {
+      "epoch": 16.96772314145887,
+      "grad_norm": 0.33480265736579895,
+      "learning_rate": 0.0003966225590206937,
+      "loss": 3.2941,
+      "step": 58250
+    },
+    {
+      "epoch": 16.982288510836636,
+      "grad_norm": 0.4005352258682251,
+      "learning_rate": 0.0003964476828912853,
+      "loss": 3.3045,
+      "step": 58300
+    },
+    {
+      "epoch": 16.996853880214402,
+      "grad_norm": 0.35570263862609863,
+      "learning_rate": 0.00039627280676187695,
+      "loss": 3.2877,
+      "step": 58350
+    },
+    {
+      "epoch": 17.01136098811466,
+      "grad_norm": 0.3764328062534332,
+      "learning_rate": 0.00039609793063246864,
+      "loss": 3.2242,
+      "step": 58400
+    },
+    {
+      "epoch": 17.025926357492427,
+      "grad_norm": 0.39979997277259827,
+      "learning_rate": 0.0003959230545030603,
+      "loss": 3.189,
+      "step": 58450
+    },
+    {
+      "epoch": 17.040491726870194,
+      "grad_norm": 0.37430837750434875,
+      "learning_rate": 0.00039574817837365197,
+      "loss": 3.1966,
+      "step": 58500
+    },
+    {
+      "epoch": 17.05505709624796,
+      "grad_norm": 0.4127182066440582,
+      "learning_rate": 0.0003955733022442436,
+      "loss": 3.1846,
+      "step": 58550
+    },
+    {
+      "epoch": 17.069622465625727,
+      "grad_norm": 0.38014090061187744,
+      "learning_rate": 0.0003953984261148353,
+      "loss": 3.189,
+      "step": 58600
+    },
+    {
+      "epoch": 17.084187835003497,
+      "grad_norm": 0.36704063415527344,
+      "learning_rate": 0.00039522354998542693,
+      "loss": 3.1994,
+      "step": 58650
+    },
+    {
+      "epoch": 17.098753204381264,
+      "grad_norm": 0.36234790086746216,
+      "learning_rate": 0.00039504867385601857,
+      "loss": 3.2003,
+      "step": 58700
+    },
+    {
+      "epoch": 17.11331857375903,
+      "grad_norm": 0.42512327432632446,
+      "learning_rate": 0.0003948737977266103,
+      "loss": 3.197,
+      "step": 58750
+    },
+    {
+      "epoch": 17.127883943136798,
+      "grad_norm": 0.4088022708892822,
+      "learning_rate": 0.00039469892159720195,
+      "loss": 3.2194,
+      "step": 58800
+    },
+    {
+      "epoch": 17.142449312514564,
+      "grad_norm": 0.36195436120033264,
+      "learning_rate": 0.00039452404546779364,
+      "loss": 3.206,
+      "step": 58850
+    },
+    {
+      "epoch": 17.15701468189233,
+      "grad_norm": 0.3801659643650055,
+      "learning_rate": 0.0003943491693383853,
+      "loss": 3.2166,
+      "step": 58900
+    },
+    {
+      "epoch": 17.1715800512701,
+      "grad_norm": 0.4113653898239136,
+      "learning_rate": 0.0003941742932089769,
+      "loss": 3.2178,
+      "step": 58950
+    },
+    {
+      "epoch": 17.18614542064787,
+      "grad_norm": 0.38333582878112793,
+      "learning_rate": 0.0003939994170795686,
+      "loss": 3.2174,
+      "step": 59000
+    },
+    {
+      "epoch": 17.18614542064787,
+      "eval_accuracy": 0.37211094579509135,
+      "eval_loss": 3.5520923137664795,
+      "eval_runtime": 183.3352,
+      "eval_samples_per_second": 90.79,
+      "eval_steps_per_second": 5.678,
+      "step": 59000
+    },
+    {
+      "epoch": 17.200710790025635,
+      "grad_norm": 0.36782196164131165,
+      "learning_rate": 0.00039382454095016024,
+      "loss": 3.227,
+      "step": 59050
+    },
+    {
+      "epoch": 17.215276159403402,
+      "grad_norm": 0.3619454503059387,
+      "learning_rate": 0.00039364966482075193,
+      "loss": 3.2329,
+      "step": 59100
+    },
+    {
+      "epoch": 17.22984152878117,
+      "grad_norm": 0.38505318760871887,
+      "learning_rate": 0.00039347478869134356,
+      "loss": 3.2246,
+      "step": 59150
+    },
+    {
+      "epoch": 17.24440689815894,
+      "grad_norm": 0.3795805871486664,
+      "learning_rate": 0.0003932999125619353,
+      "loss": 3.2382,
+      "step": 59200
+    },
+    {
+      "epoch": 17.258972267536706,
+      "grad_norm": 0.39559125900268555,
+      "learning_rate": 0.00039312503643252695,
+      "loss": 3.231,
+      "step": 59250
+    },
+    {
+      "epoch": 17.273537636914472,
+      "grad_norm": 0.38150516152381897,
+      "learning_rate": 0.0003929501603031186,
+      "loss": 3.2364,
+      "step": 59300
+    },
+    {
+      "epoch": 17.28810300629224,
+      "grad_norm": 0.38355526328086853,
+      "learning_rate": 0.00039277528417371027,
+      "loss": 3.2434,
+      "step": 59350
+    },
+    {
+      "epoch": 17.302668375670006,
+      "grad_norm": 0.40026649832725525,
+      "learning_rate": 0.0003926004080443019,
+      "loss": 3.2365,
+      "step": 59400
+    },
+    {
+      "epoch": 17.317233745047773,
+      "grad_norm": 0.3706173300743103,
+      "learning_rate": 0.0003924255319148936,
+      "loss": 3.2459,
+      "step": 59450
+    },
+    {
+      "epoch": 17.331799114425543,
+      "grad_norm": 0.38115477561950684,
+      "learning_rate": 0.00039225065578548523,
+      "loss": 3.2438,
+      "step": 59500
+    },
+    {
+      "epoch": 17.34636448380331,
+      "grad_norm": 0.39307278394699097,
+      "learning_rate": 0.00039207577965607687,
+      "loss": 3.2399,
+      "step": 59550
+    },
+    {
+      "epoch": 17.360929853181077,
+      "grad_norm": 0.4148417115211487,
+      "learning_rate": 0.00039190090352666856,
+      "loss": 3.24,
+      "step": 59600
+    },
+    {
+      "epoch": 17.375495222558843,
+      "grad_norm": 0.3889954388141632,
+      "learning_rate": 0.0003917260273972602,
+      "loss": 3.2453,
+      "step": 59650
+    },
+    {
+      "epoch": 17.39006059193661,
+      "grad_norm": 0.40655389428138733,
+      "learning_rate": 0.00039155115126785194,
+      "loss": 3.2551,
+      "step": 59700
+    },
+    {
+      "epoch": 17.40462596131438,
+      "grad_norm": 0.37711861729621887,
+      "learning_rate": 0.0003913762751384436,
+      "loss": 3.2578,
+      "step": 59750
+    },
+    {
+      "epoch": 17.419191330692147,
+      "grad_norm": 0.3447685241699219,
+      "learning_rate": 0.00039120139900903527,
+      "loss": 3.2389,
+      "step": 59800
+    },
+    {
+      "epoch": 17.433756700069914,
+      "grad_norm": 0.3808634877204895,
+      "learning_rate": 0.0003910265228796269,
+      "loss": 3.2547,
+      "step": 59850
+    },
+    {
+      "epoch": 17.44832206944768,
+      "grad_norm": 0.3830925524234772,
+      "learning_rate": 0.00039085164675021854,
+      "loss": 3.2616,
+      "step": 59900
+    },
+    {
+      "epoch": 17.462887438825447,
+      "grad_norm": 0.38864314556121826,
+      "learning_rate": 0.00039067677062081023,
+      "loss": 3.2602,
+      "step": 59950
+    },
+    {
+      "epoch": 17.477452808203218,
+      "grad_norm": 0.3750167787075043,
+      "learning_rate": 0.00039050189449140187,
+      "loss": 3.2585,
+      "step": 60000
+    },
+    {
+      "epoch": 17.477452808203218,
+      "eval_accuracy": 0.3726959724745888,
+      "eval_loss": 3.546113967895508,
+      "eval_runtime": 183.2239,
+      "eval_samples_per_second": 90.845,
+      "eval_steps_per_second": 5.682,
+      "step": 60000
+    },
+    {
+      "epoch": 17.492018177580984,
+      "grad_norm": 0.3868557810783386,
+      "learning_rate": 0.00039032701836199356,
+      "loss": 3.2677,
+      "step": 60050
+    },
+    {
+      "epoch": 17.50658354695875,
+      "grad_norm": 0.369974821805954,
+      "learning_rate": 0.0003901521422325852,
+      "loss": 3.2681,
+      "step": 60100
+    },
+    {
+      "epoch": 17.521148916336518,
+      "grad_norm": 0.4058365821838379,
+      "learning_rate": 0.00038997726610317683,
+      "loss": 3.2593,
+      "step": 60150
+    },
+    {
+      "epoch": 17.535714285714285,
+      "grad_norm": 0.3893314599990845,
+      "learning_rate": 0.0003898023899737686,
+      "loss": 3.262,
+      "step": 60200
+    },
+    {
+      "epoch": 17.55027965509205,
+      "grad_norm": 0.41334789991378784,
+      "learning_rate": 0.0003896275138443602,
+      "loss": 3.2686,
+      "step": 60250
+    },
+    {
+      "epoch": 17.56484502446982,
+      "grad_norm": 0.4167849123477936,
+      "learning_rate": 0.0003894526377149519,
+      "loss": 3.2655,
+      "step": 60300
+    },
+    {
+      "epoch": 17.57941039384759,
+      "grad_norm": 0.3605276644229889,
+      "learning_rate": 0.00038927776158554354,
+      "loss": 3.2709,
+      "step": 60350
+    },
+    {
+      "epoch": 17.593975763225355,
+      "grad_norm": 0.3882104158401489,
+      "learning_rate": 0.0003891028854561352,
+      "loss": 3.2607,
+      "step": 60400
+    },
+    {
+      "epoch": 17.608541132603122,
+      "grad_norm": 0.35230183601379395,
+      "learning_rate": 0.00038892800932672686,
+      "loss": 3.2739,
+      "step": 60450
+    },
+    {
+      "epoch": 17.62310650198089,
+      "grad_norm": 0.41730719804763794,
+      "learning_rate": 0.0003887531331973185,
+      "loss": 3.2713,
+      "step": 60500
+    },
+    {
+      "epoch": 17.63767187135866,
+      "grad_norm": 0.3690003752708435,
+      "learning_rate": 0.0003885782570679102,
+      "loss": 3.2633,
+      "step": 60550
+    },
+    {
+      "epoch": 17.652237240736426,
+      "grad_norm": 0.35980063676834106,
+      "learning_rate": 0.0003884033809385018,
+      "loss": 3.2724,
+      "step": 60600
+    },
+    {
+      "epoch": 17.666802610114193,
+      "grad_norm": 0.3813663125038147,
+      "learning_rate": 0.00038822850480909357,
+      "loss": 3.2703,
+      "step": 60650
+    },
+    {
+      "epoch": 17.68136797949196,
+      "grad_norm": 0.35949084162712097,
+      "learning_rate": 0.0003880536286796852,
+      "loss": 3.2721,
+      "step": 60700
+    },
+    {
+      "epoch": 17.695933348869726,
+      "grad_norm": 0.4059002697467804,
+      "learning_rate": 0.00038787875255027684,
+      "loss": 3.2704,
+      "step": 60750
+    },
+    {
+      "epoch": 17.710498718247496,
+      "grad_norm": 0.3678555488586426,
+      "learning_rate": 0.00038770387642086853,
+      "loss": 3.2785,
+      "step": 60800
+    },
+    {
+      "epoch": 17.725064087625263,
+      "grad_norm": 0.4052152931690216,
+      "learning_rate": 0.00038752900029146017,
+      "loss": 3.2585,
+      "step": 60850
+    },
+    {
+      "epoch": 17.73962945700303,
+      "grad_norm": 0.36793798208236694,
+      "learning_rate": 0.00038735412416205186,
+      "loss": 3.271,
+      "step": 60900
+    },
+    {
+      "epoch": 17.754194826380797,
+      "grad_norm": 0.3850816786289215,
+      "learning_rate": 0.0003871792480326435,
+      "loss": 3.2786,
+      "step": 60950
+    },
+    {
+      "epoch": 17.768760195758563,
+      "grad_norm": 0.3746644854545593,
+      "learning_rate": 0.00038700437190323513,
+      "loss": 3.279,
+      "step": 61000
+    },
+    {
+      "epoch": 17.768760195758563,
+      "eval_accuracy": 0.3732691246173842,
+      "eval_loss": 3.5366263389587402,
+      "eval_runtime": 183.1426,
+      "eval_samples_per_second": 90.885,
+      "eval_steps_per_second": 5.684,
+      "step": 61000
+    },
+    {
+      "epoch": 17.78332556513633,
+      "grad_norm": 0.3772430717945099,
+      "learning_rate": 0.0003868294957738268,
+      "loss": 3.2763,
+      "step": 61050
+    },
+    {
+      "epoch": 17.7978909345141,
+      "grad_norm": 0.3624294698238373,
+      "learning_rate": 0.00038665461964441846,
+      "loss": 3.2655,
+      "step": 61100
+    },
+    {
+      "epoch": 17.812456303891867,
+      "grad_norm": 0.3729947507381439,
+      "learning_rate": 0.0003864797435150102,
+      "loss": 3.288,
+      "step": 61150
+    },
+    {
+      "epoch": 17.827021673269634,
+      "grad_norm": 0.37783336639404297,
+      "learning_rate": 0.00038630486738560184,
+      "loss": 3.2851,
+      "step": 61200
+    },
+    {
+      "epoch": 17.8415870426474,
+      "grad_norm": 0.35818716883659363,
+      "learning_rate": 0.00038612999125619353,
+      "loss": 3.2853,
+      "step": 61250
+    },
+    {
+      "epoch": 17.856152412025168,
+      "grad_norm": 0.39936739206314087,
+      "learning_rate": 0.00038595511512678517,
+      "loss": 3.2638,
+      "step": 61300
+    },
+    {
+      "epoch": 17.870717781402938,
+      "grad_norm": 0.38293829560279846,
+      "learning_rate": 0.0003857802389973768,
+      "loss": 3.2811,
+      "step": 61350
+    },
+    {
+      "epoch": 17.885283150780705,
+      "grad_norm": 0.3781496286392212,
+      "learning_rate": 0.0003856053628679685,
+      "loss": 3.294,
+      "step": 61400
+    },
+    {
+      "epoch": 17.89984852015847,
+      "grad_norm": 0.44559693336486816,
+      "learning_rate": 0.00038543048673856013,
+      "loss": 3.2725,
+      "step": 61450
+    },
+    {
+      "epoch": 17.914413889536238,
+      "grad_norm": 0.3721054196357727,
+      "learning_rate": 0.0003852556106091518,
+      "loss": 3.2932,
+      "step": 61500
+    },
+    {
+      "epoch": 17.928979258914005,
+      "grad_norm": 0.37438303232192993,
+      "learning_rate": 0.00038508073447974346,
+      "loss": 3.2904,
+      "step": 61550
+    },
+    {
+      "epoch": 17.943544628291775,
+      "grad_norm": 0.37687599658966064,
+      "learning_rate": 0.0003849058583503351,
+      "loss": 3.2774,
+      "step": 61600
+    },
+    {
+      "epoch": 17.958109997669542,
+      "grad_norm": 0.3619229197502136,
+      "learning_rate": 0.00038473098222092684,
+      "loss": 3.2736,
+      "step": 61650
+    },
+    {
+      "epoch": 17.97267536704731,
+      "grad_norm": 0.382866770029068,
+      "learning_rate": 0.0003845561060915185,
+      "loss": 3.2737,
+      "step": 61700
+    },
+    {
+      "epoch": 17.987240736425075,
+      "grad_norm": 0.43251529335975647,
+      "learning_rate": 0.00038438122996211016,
+      "loss": 3.2917,
+      "step": 61750
+    },
+    {
+      "epoch": 18.001747844325333,
+      "grad_norm": 0.41620609164237976,
+      "learning_rate": 0.0003842063538327018,
+      "loss": 3.27,
+      "step": 61800
+    },
+    {
+      "epoch": 18.0163132137031,
+      "grad_norm": 0.3748462200164795,
+      "learning_rate": 0.0003840314777032935,
+      "loss": 3.1763,
+      "step": 61850
+    },
+    {
+      "epoch": 18.030878583080867,
+      "grad_norm": 0.37598931789398193,
+      "learning_rate": 0.0003838566015738851,
+      "loss": 3.1863,
+      "step": 61900
+    },
+    {
+      "epoch": 18.045443952458633,
+      "grad_norm": 0.3749133348464966,
+      "learning_rate": 0.00038368172544447676,
+      "loss": 3.1954,
+      "step": 61950
+    },
+    {
+      "epoch": 18.0600093218364,
+      "grad_norm": 0.37899747490882874,
+      "learning_rate": 0.00038350684931506845,
+      "loss": 3.1786,
+      "step": 62000
+    },
+    {
+      "epoch": 18.0600093218364,
+      "eval_accuracy": 0.3728517522877588,
+      "eval_loss": 3.5506234169006348,
+      "eval_runtime": 183.1708,
+      "eval_samples_per_second": 90.871,
+      "eval_steps_per_second": 5.683,
+      "step": 62000
+    },
+    {
+      "epoch": 18.07457469121417,
+      "grad_norm": 0.37994885444641113,
+      "learning_rate": 0.0003833319731856601,
+      "loss": 3.1824,
+      "step": 62050
+    },
+    {
+      "epoch": 18.089140060591937,
+      "grad_norm": 0.3813993036746979,
+      "learning_rate": 0.00038315709705625183,
+      "loss": 3.1725,
+      "step": 62100
+    },
+    {
+      "epoch": 18.103705429969704,
+      "grad_norm": 0.38244864344596863,
+      "learning_rate": 0.00038298222092684347,
+      "loss": 3.1979,
+      "step": 62150
+    },
+    {
+      "epoch": 18.11827079934747,
+      "grad_norm": 0.38922345638275146,
+      "learning_rate": 0.0003828073447974351,
+      "loss": 3.2135,
+      "step": 62200
+    },
+    {
+      "epoch": 18.132836168725238,
+      "grad_norm": 0.35493120551109314,
+      "learning_rate": 0.0003826324686680268,
+      "loss": 3.2028,
+      "step": 62250
+    },
+    {
+      "epoch": 18.147401538103008,
+      "grad_norm": 0.4383052885532379,
+      "learning_rate": 0.00038245759253861843,
+      "loss": 3.2043,
+      "step": 62300
+    },
+    {
+      "epoch": 18.161966907480775,
+      "grad_norm": 0.38917025923728943,
+      "learning_rate": 0.0003822827164092101,
+      "loss": 3.2037,
+      "step": 62350
+    },
+    {
+      "epoch": 18.17653227685854,
+      "grad_norm": 0.40065309405326843,
+      "learning_rate": 0.00038210784027980176,
+      "loss": 3.2053,
+      "step": 62400
+    },
+    {
+      "epoch": 18.191097646236308,
+      "grad_norm": 0.37339338660240173,
+      "learning_rate": 0.0003819329641503934,
+      "loss": 3.2231,
+      "step": 62450
+    },
+    {
+      "epoch": 18.205663015614075,
+      "grad_norm": 0.4179166853427887,
+      "learning_rate": 0.0003817580880209851,
+      "loss": 3.2151,
+      "step": 62500
+    },
+    {
+      "epoch": 18.22022838499184,
+      "grad_norm": 0.3906039297580719,
+      "learning_rate": 0.0003815832118915767,
+      "loss": 3.2223,
+      "step": 62550
+    },
+    {
+      "epoch": 18.234793754369612,
+      "grad_norm": 0.3898009657859802,
+      "learning_rate": 0.00038140833576216847,
+      "loss": 3.2201,
+      "step": 62600
+    },
+    {
+      "epoch": 18.24935912374738,
+      "grad_norm": 0.3770427405834198,
+      "learning_rate": 0.0003812334596327601,
+      "loss": 3.2242,
+      "step": 62650
+    },
+    {
+      "epoch": 18.263924493125145,
+      "grad_norm": 0.39647871255874634,
+      "learning_rate": 0.0003810585835033518,
+      "loss": 3.2251,
+      "step": 62700
+    },
+    {
+      "epoch": 18.278489862502912,
+      "grad_norm": 0.40589639544487,
+      "learning_rate": 0.00038088370737394343,
+      "loss": 3.2276,
+      "step": 62750
+    },
+    {
+      "epoch": 18.29305523188068,
+      "grad_norm": 0.37991979718208313,
+      "learning_rate": 0.00038070883124453507,
+      "loss": 3.2265,
+      "step": 62800
+    },
+    {
+      "epoch": 18.30762060125845,
+      "grad_norm": 0.38895300030708313,
+      "learning_rate": 0.00038053395511512676,
+      "loss": 3.2223,
+      "step": 62850
+    },
+    {
+      "epoch": 18.322185970636216,
+      "grad_norm": 0.38880106806755066,
+      "learning_rate": 0.0003803590789857184,
+      "loss": 3.2355,
+      "step": 62900
+    },
+    {
+      "epoch": 18.336751340013983,
+      "grad_norm": 0.4023679196834564,
+      "learning_rate": 0.0003801842028563101,
+      "loss": 3.2323,
+      "step": 62950
+    },
+    {
+      "epoch": 18.35131670939175,
+      "grad_norm": 0.38894176483154297,
+      "learning_rate": 0.0003800093267269017,
+      "loss": 3.225,
+      "step": 63000
+    },
+    {
+      "epoch": 18.35131670939175,
+      "eval_accuracy": 0.37240310642582913,
+      "eval_loss": 3.552517890930176,
+      "eval_runtime": 183.0843,
+      "eval_samples_per_second": 90.914,
+      "eval_steps_per_second": 5.686,
+      "step": 63000
+    },
+    {
+      "epoch": 18.365882078769516,
+      "grad_norm": 0.4012887179851532,
+      "learning_rate": 0.00037983445059749335,
+      "loss": 3.2361,
+      "step": 63050
+    },
+    {
+      "epoch": 18.380447448147287,
+      "grad_norm": 0.44362279772758484,
+      "learning_rate": 0.0003796595744680851,
+      "loss": 3.2371,
+      "step": 63100
+    },
+    {
+      "epoch": 18.395012817525053,
+      "grad_norm": 0.4012256860733032,
+      "learning_rate": 0.00037948469833867674,
+      "loss": 3.2383,
+      "step": 63150
+    },
+    {
+      "epoch": 18.40957818690282,
+      "grad_norm": 0.3982083201408386,
+      "learning_rate": 0.0003793098222092684,
+      "loss": 3.241,
+      "step": 63200
+    },
+    {
+      "epoch": 18.424143556280587,
+      "grad_norm": 0.42237627506256104,
+      "learning_rate": 0.00037913494607986006,
+      "loss": 3.2331,
+      "step": 63250
+    },
+    {
+      "epoch": 18.438708925658354,
+      "grad_norm": 0.4271126985549927,
+      "learning_rate": 0.00037896006995045175,
+      "loss": 3.2448,
+      "step": 63300
+    },
+    {
+      "epoch": 18.45327429503612,
+      "grad_norm": 0.3965889811515808,
+      "learning_rate": 0.0003787851938210434,
+      "loss": 3.2412,
+      "step": 63350
+    },
+    {
+      "epoch": 18.46783966441389,
+      "grad_norm": 0.40629059076309204,
+      "learning_rate": 0.000378610317691635,
+      "loss": 3.2385,
+      "step": 63400
+    },
+    {
+      "epoch": 18.482405033791657,
+      "grad_norm": 0.38835418224334717,
+      "learning_rate": 0.0003784354415622267,
+      "loss": 3.2388,
+      "step": 63450
+    },
+    {
+      "epoch": 18.496970403169424,
+      "grad_norm": 0.4206826686859131,
+      "learning_rate": 0.00037826056543281835,
+      "loss": 3.2447,
+      "step": 63500
+    },
+    {
+      "epoch": 18.51153577254719,
+      "grad_norm": 0.37770959734916687,
+      "learning_rate": 0.0003780856893034101,
+      "loss": 3.2432,
+      "step": 63550
+    },
+    {
+      "epoch": 18.526101141924958,
+      "grad_norm": 0.37431496381759644,
+      "learning_rate": 0.00037791081317400173,
+      "loss": 3.2454,
+      "step": 63600
+    },
+    {
+      "epoch": 18.540666511302728,
+      "grad_norm": 0.36141154170036316,
+      "learning_rate": 0.00037773593704459337,
+      "loss": 3.2647,
+      "step": 63650
+    },
+    {
+      "epoch": 18.555231880680495,
+      "grad_norm": 0.37699899077415466,
+      "learning_rate": 0.00037756106091518506,
+      "loss": 3.254,
+      "step": 63700
+    },
+    {
+      "epoch": 18.56979725005826,
+      "grad_norm": 0.3868577778339386,
+      "learning_rate": 0.0003773861847857767,
+      "loss": 3.2491,
+      "step": 63750
+    },
+    {
+      "epoch": 18.58436261943603,
+      "grad_norm": 0.4043353796005249,
+      "learning_rate": 0.0003772113086563684,
+      "loss": 3.2591,
+      "step": 63800
+    },
+    {
+      "epoch": 18.598927988813795,
+      "grad_norm": 0.3505975008010864,
+      "learning_rate": 0.00037703643252696,
+      "loss": 3.2696,
+      "step": 63850
+    },
+    {
+      "epoch": 18.613493358191565,
+      "grad_norm": 0.39516180753707886,
+      "learning_rate": 0.0003768615563975517,
+      "loss": 3.2635,
+      "step": 63900
+    },
+    {
+      "epoch": 18.628058727569332,
+      "grad_norm": 0.3725748062133789,
+      "learning_rate": 0.00037668668026814335,
+      "loss": 3.2531,
+      "step": 63950
+    },
+    {
+      "epoch": 18.6426240969471,
+      "grad_norm": 0.40790900588035583,
+      "learning_rate": 0.000376511804138735,
+      "loss": 3.2611,
+      "step": 64000
+    },
+    {
+      "epoch": 18.6426240969471,
+      "eval_accuracy": 0.37318447445475594,
+      "eval_loss": 3.5376601219177246,
+      "eval_runtime": 183.0655,
+      "eval_samples_per_second": 90.924,
+      "eval_steps_per_second": 5.686,
+      "step": 64000
+    },
+    {
+      "epoch": 18.657189466324866,
+      "grad_norm": 0.36995676159858704,
+      "learning_rate": 0.00037633692800932673,
+      "loss": 3.2652,
+      "step": 64050
+    },
+    {
+      "epoch": 18.671754835702632,
+      "grad_norm": 0.3847509026527405,
+      "learning_rate": 0.00037616205187991837,
+      "loss": 3.2635,
+      "step": 64100
+    },
+    {
+      "epoch": 18.6863202050804,
+      "grad_norm": 0.3744068443775177,
+      "learning_rate": 0.00037598717575051006,
+      "loss": 3.2476,
+      "step": 64150
+    },
+    {
+      "epoch": 18.70088557445817,
+      "grad_norm": 0.3912813663482666,
+      "learning_rate": 0.0003758122996211017,
+      "loss": 3.2559,
+      "step": 64200
+    },
+    {
+      "epoch": 18.715450943835936,
+      "grad_norm": 0.37561044096946716,
+      "learning_rate": 0.00037563742349169333,
+      "loss": 3.2607,
+      "step": 64250
+    },
+    {
+      "epoch": 18.730016313213703,
+      "grad_norm": 0.37838783860206604,
+      "learning_rate": 0.000375462547362285,
+      "loss": 3.2648,
+      "step": 64300
+    },
+    {
+      "epoch": 18.74458168259147,
+      "grad_norm": 0.3800770044326782,
+      "learning_rate": 0.00037528767123287665,
+      "loss": 3.2628,
+      "step": 64350
+    },
+    {
+      "epoch": 18.759147051969236,
+      "grad_norm": 0.3997965157032013,
+      "learning_rate": 0.00037511279510346834,
+      "loss": 3.2526,
+      "step": 64400
+    },
+    {
+      "epoch": 18.773712421347007,
+      "grad_norm": 0.4012726843357086,
+      "learning_rate": 0.00037493791897406,
+      "loss": 3.2618,
+      "step": 64450
+    },
+    {
+      "epoch": 18.788277790724774,
+      "grad_norm": 0.3717740774154663,
+      "learning_rate": 0.0003747630428446516,
+      "loss": 3.2546,
+      "step": 64500
+    },
+    {
+      "epoch": 18.80284316010254,
+      "grad_norm": 0.37287068367004395,
+      "learning_rate": 0.00037458816671524336,
+      "loss": 3.2703,
+      "step": 64550
+    },
+    {
+      "epoch": 18.817408529480307,
+      "grad_norm": 0.35750555992126465,
+      "learning_rate": 0.000374413290585835,
+      "loss": 3.2715,
+      "step": 64600
+    },
+    {
+      "epoch": 18.831973898858074,
+      "grad_norm": 0.418002724647522,
+      "learning_rate": 0.0003742384144564267,
+      "loss": 3.2617,
+      "step": 64650
+    },
+    {
+      "epoch": 18.846539268235844,
+      "grad_norm": 0.3695155680179596,
+      "learning_rate": 0.0003740635383270183,
+      "loss": 3.2776,
+      "step": 64700
+    },
+    {
+      "epoch": 18.86110463761361,
+      "grad_norm": 0.3876601755619049,
+      "learning_rate": 0.00037388866219761,
+      "loss": 3.261,
+      "step": 64750
+    },
+    {
+      "epoch": 18.875670006991378,
+      "grad_norm": 0.38923588395118713,
+      "learning_rate": 0.00037371378606820165,
+      "loss": 3.278,
+      "step": 64800
+    },
+    {
+      "epoch": 18.890235376369144,
+      "grad_norm": 0.3957293629646301,
+      "learning_rate": 0.0003735389099387933,
+      "loss": 3.2707,
+      "step": 64850
+    },
+    {
+      "epoch": 18.90480074574691,
+      "grad_norm": 0.4016440212726593,
+      "learning_rate": 0.000373364033809385,
+      "loss": 3.2544,
+      "step": 64900
+    },
+    {
+      "epoch": 18.919366115124678,
+      "grad_norm": 0.35893428325653076,
+      "learning_rate": 0.0003731891576799766,
+      "loss": 3.2673,
+      "step": 64950
+    },
+    {
+      "epoch": 18.93393148450245,
+      "grad_norm": 0.3813352584838867,
+      "learning_rate": 0.00037301428155056836,
+      "loss": 3.2804,
+      "step": 65000
+    },
+    {
+      "epoch": 18.93393148450245,
+      "eval_accuracy": 0.37387366786215426,
+      "eval_loss": 3.531456470489502,
+      "eval_runtime": 183.037,
+      "eval_samples_per_second": 90.938,
+      "eval_steps_per_second": 5.687,
+      "step": 65000
+    },
+    {
+      "epoch": 18.948496853880215,
+      "grad_norm": 0.3757934272289276,
+      "learning_rate": 0.00037283940542116,
+      "loss": 3.2652,
+      "step": 65050
+    },
+    {
+      "epoch": 18.96306222325798,
+      "grad_norm": 0.3764069378376007,
+      "learning_rate": 0.00037266452929175163,
+      "loss": 3.278,
+      "step": 65100
+    },
+    {
+      "epoch": 18.97762759263575,
+      "grad_norm": 0.3668430745601654,
+      "learning_rate": 0.0003724896531623433,
+      "loss": 3.27,
+      "step": 65150
+    },
+    {
+      "epoch": 18.992192962013515,
+      "grad_norm": 0.4349370300769806,
+      "learning_rate": 0.00037231477703293496,
+      "loss": 3.2825,
+      "step": 65200
+    },
+    {
+      "epoch": 19.006700069913773,
+      "grad_norm": 0.42744481563568115,
+      "learning_rate": 0.00037213990090352665,
+      "loss": 3.2131,
+      "step": 65250
+    },
+    {
+      "epoch": 19.02126543929154,
+      "grad_norm": 0.3895418643951416,
+      "learning_rate": 0.0003719650247741183,
+      "loss": 3.1544,
+      "step": 65300
+    },
+    {
+      "epoch": 19.035830808669306,
+      "grad_norm": 0.401532381772995,
+      "learning_rate": 0.00037179014864471,
+      "loss": 3.1616,
+      "step": 65350
+    },
+    {
+      "epoch": 19.050396178047077,
+      "grad_norm": 0.38685935735702515,
+      "learning_rate": 0.0003716152725153016,
+      "loss": 3.1816,
+      "step": 65400
+    },
+    {
+      "epoch": 19.064961547424844,
+      "grad_norm": 0.42113545536994934,
+      "learning_rate": 0.00037144039638589325,
+      "loss": 3.1864,
+      "step": 65450
+    },
+    {
+      "epoch": 19.07952691680261,
+      "grad_norm": 0.39501887559890747,
+      "learning_rate": 0.000371265520256485,
+      "loss": 3.1961,
+      "step": 65500
+    },
+    {
+      "epoch": 19.094092286180377,
+      "grad_norm": 0.36792710423469543,
+      "learning_rate": 0.00037109064412707663,
+      "loss": 3.1827,
+      "step": 65550
+    },
+    {
+      "epoch": 19.108657655558144,
+      "grad_norm": 0.39077720046043396,
+      "learning_rate": 0.0003709157679976683,
+      "loss": 3.1918,
+      "step": 65600
+    },
+    {
+      "epoch": 19.123223024935914,
+      "grad_norm": 0.37719228863716125,
+      "learning_rate": 0.00037074089186825995,
+      "loss": 3.189,
+      "step": 65650
+    },
+    {
+      "epoch": 19.13778839431368,
+      "grad_norm": 0.38715848326683044,
+      "learning_rate": 0.0003705660157388516,
+      "loss": 3.19,
+      "step": 65700
+    },
+    {
+      "epoch": 19.152353763691448,
+      "grad_norm": 0.4000729024410248,
+      "learning_rate": 0.0003703911396094433,
+      "loss": 3.2001,
+      "step": 65750
+    },
+    {
+      "epoch": 19.166919133069214,
+      "grad_norm": 0.392840176820755,
+      "learning_rate": 0.0003702162634800349,
+      "loss": 3.1929,
+      "step": 65800
+    },
+    {
+      "epoch": 19.18148450244698,
+      "grad_norm": 0.41991594433784485,
+      "learning_rate": 0.0003700413873506266,
+      "loss": 3.2054,
+      "step": 65850
+    },
+    {
+      "epoch": 19.196049871824748,
+      "grad_norm": 0.40436795353889465,
+      "learning_rate": 0.00036986651122121824,
+      "loss": 3.1997,
+      "step": 65900
+    },
+    {
+      "epoch": 19.210615241202518,
+      "grad_norm": 0.37329864501953125,
+      "learning_rate": 0.00036969163509181,
+      "loss": 3.1967,
+      "step": 65950
+    },
+    {
+      "epoch": 19.225180610580285,
+      "grad_norm": 0.41763025522232056,
+      "learning_rate": 0.0003695167589624016,
+      "loss": 3.2131,
+      "step": 66000
+    },
+    {
+      "epoch": 19.225180610580285,
+      "eval_accuracy": 0.3733340230753992,
+      "eval_loss": 3.547522783279419,
+      "eval_runtime": 183.2911,
+      "eval_samples_per_second": 90.812,
+      "eval_steps_per_second": 5.679,
+      "step": 66000
+    },
+    {
+      "epoch": 19.23974597995805,
+      "grad_norm": 0.41672950983047485,
+      "learning_rate": 0.00036934188283299326,
+      "loss": 3.2026,
+      "step": 66050
+    },
+    {
+      "epoch": 19.25431134933582,
+      "grad_norm": 0.36991941928863525,
+      "learning_rate": 0.00036916700670358495,
+      "loss": 3.2116,
+      "step": 66100
+    },
+    {
+      "epoch": 19.268876718713585,
+      "grad_norm": 0.3786165118217468,
+      "learning_rate": 0.0003689921305741766,
+      "loss": 3.2133,
+      "step": 66150
+    },
+    {
+      "epoch": 19.283442088091356,
+      "grad_norm": 0.39697492122650146,
+      "learning_rate": 0.0003688172544447683,
+      "loss": 3.2199,
+      "step": 66200
+    },
+    {
+      "epoch": 19.298007457469122,
+      "grad_norm": 0.38866594433784485,
+      "learning_rate": 0.0003686423783153599,
+      "loss": 3.2099,
+      "step": 66250
+    },
+    {
+      "epoch": 19.31257282684689,
+      "grad_norm": 0.4164191484451294,
+      "learning_rate": 0.00036846750218595155,
+      "loss": 3.207,
+      "step": 66300
+    },
+    {
+      "epoch": 19.327138196224656,
+      "grad_norm": 0.3987915813922882,
+      "learning_rate": 0.00036829262605654324,
+      "loss": 3.2254,
+      "step": 66350
+    },
+    {
+      "epoch": 19.341703565602423,
+      "grad_norm": 0.40145906805992126,
+      "learning_rate": 0.0003681177499271349,
+      "loss": 3.2226,
+      "step": 66400
+    },
+    {
+      "epoch": 19.356268934980193,
+      "grad_norm": 0.3883962035179138,
+      "learning_rate": 0.0003679428737977266,
+      "loss": 3.2213,
+      "step": 66450
+    },
+    {
+      "epoch": 19.37083430435796,
+      "grad_norm": 0.3735280930995941,
+      "learning_rate": 0.00036776799766831826,
+      "loss": 3.2262,
+      "step": 66500
+    },
+    {
+      "epoch": 19.385399673735726,
+      "grad_norm": 0.3815906345844269,
+      "learning_rate": 0.0003675931215389099,
+      "loss": 3.2163,
+      "step": 66550
+    },
+    {
+      "epoch": 19.399965043113493,
+      "grad_norm": 0.4458494484424591,
+      "learning_rate": 0.0003674182454095016,
+      "loss": 3.2309,
+      "step": 66600
+    },
+    {
+      "epoch": 19.41453041249126,
+      "grad_norm": 0.37692028284072876,
+      "learning_rate": 0.0003672433692800932,
+      "loss": 3.2361,
+      "step": 66650
+    },
+    {
+      "epoch": 19.429095781869027,
+      "grad_norm": 0.3867899775505066,
+      "learning_rate": 0.0003670684931506849,
+      "loss": 3.2322,
+      "step": 66700
+    },
+    {
+      "epoch": 19.443661151246797,
+      "grad_norm": 0.3880974054336548,
+      "learning_rate": 0.00036689361702127655,
+      "loss": 3.2276,
+      "step": 66750
+    },
+    {
+      "epoch": 19.458226520624564,
+      "grad_norm": 0.397736519575119,
+      "learning_rate": 0.00036671874089186824,
+      "loss": 3.2354,
+      "step": 66800
+    },
+    {
+      "epoch": 19.47279189000233,
+      "grad_norm": 0.3957333266735077,
+      "learning_rate": 0.00036654386476245987,
+      "loss": 3.2387,
+      "step": 66850
+    },
+    {
+      "epoch": 19.487357259380097,
+      "grad_norm": 0.39339420199394226,
+      "learning_rate": 0.0003663689886330515,
+      "loss": 3.2303,
+      "step": 66900
+    },
+    {
+      "epoch": 19.501922628757864,
+      "grad_norm": 0.3895646035671234,
+      "learning_rate": 0.00036619411250364325,
+      "loss": 3.2333,
+      "step": 66950
+    },
+    {
+      "epoch": 19.516487998135634,
+      "grad_norm": 0.39041730761528015,
+      "learning_rate": 0.0003660192363742349,
+      "loss": 3.237,
+      "step": 67000
+    },
+    {
+      "epoch": 19.516487998135634,
+      "eval_accuracy": 0.37304538953477095,
+      "eval_loss": 3.542279005050659,
+      "eval_runtime": 183.1165,
+      "eval_samples_per_second": 90.898,
+      "eval_steps_per_second": 5.685,
+      "step": 67000
+    },
+    {
+      "epoch": 19.5310533675134,
+      "grad_norm": 0.4078094959259033,
+      "learning_rate": 0.0003658443602448266,
+      "loss": 3.2433,
+      "step": 67050
+    },
+    {
+      "epoch": 19.545618736891168,
+      "grad_norm": 0.3974609673023224,
+      "learning_rate": 0.0003656694841154182,
+      "loss": 3.2438,
+      "step": 67100
+    },
+    {
+      "epoch": 19.560184106268935,
+      "grad_norm": 0.3790847659111023,
+      "learning_rate": 0.00036549460798600985,
+      "loss": 3.2263,
+      "step": 67150
+    },
+    {
+      "epoch": 19.5747494756467,
+      "grad_norm": 0.3998972177505493,
+      "learning_rate": 0.00036531973185660154,
+      "loss": 3.2406,
+      "step": 67200
+    },
+    {
+      "epoch": 19.589314845024468,
+      "grad_norm": 0.3818765878677368,
+      "learning_rate": 0.0003651448557271932,
+      "loss": 3.2381,
+      "step": 67250
+    },
+    {
+      "epoch": 19.60388021440224,
+      "grad_norm": 0.4024520218372345,
+      "learning_rate": 0.00036496997959778487,
+      "loss": 3.2361,
+      "step": 67300
+    },
+    {
+      "epoch": 19.618445583780005,
+      "grad_norm": 0.42870834469795227,
+      "learning_rate": 0.0003647951034683765,
+      "loss": 3.2474,
+      "step": 67350
+    },
+    {
+      "epoch": 19.633010953157772,
+      "grad_norm": 0.37617725133895874,
+      "learning_rate": 0.00036462022733896825,
+      "loss": 3.2467,
+      "step": 67400
+    },
+    {
+      "epoch": 19.64757632253554,
+      "grad_norm": 0.3796943426132202,
+      "learning_rate": 0.0003644453512095599,
+      "loss": 3.2554,
+      "step": 67450
+    },
+    {
+      "epoch": 19.662141691913305,
+      "grad_norm": 0.36963871121406555,
+      "learning_rate": 0.0003642704750801515,
+      "loss": 3.2466,
+      "step": 67500
+    },
+    {
+      "epoch": 19.676707061291076,
+      "grad_norm": 0.37516793608665466,
+      "learning_rate": 0.0003640955989507432,
+      "loss": 3.2437,
+      "step": 67550
+    },
+    {
+      "epoch": 19.691272430668842,
+      "grad_norm": 0.3728578984737396,
+      "learning_rate": 0.00036392072282133485,
+      "loss": 3.2347,
+      "step": 67600
+    },
+    {
+      "epoch": 19.70583780004661,
+      "grad_norm": 0.3838897943496704,
+      "learning_rate": 0.00036374584669192654,
+      "loss": 3.2424,
+      "step": 67650
+    },
+    {
+      "epoch": 19.720403169424376,
+      "grad_norm": 0.3947892487049103,
+      "learning_rate": 0.0003635709705625182,
+      "loss": 3.252,
+      "step": 67700
+    },
+    {
+      "epoch": 19.734968538802143,
+      "grad_norm": 0.3661569058895111,
+      "learning_rate": 0.0003633960944331098,
+      "loss": 3.2519,
+      "step": 67750
+    },
+    {
+      "epoch": 19.749533908179913,
+      "grad_norm": 0.35730046033859253,
+      "learning_rate": 0.0003632212183037015,
+      "loss": 3.258,
+      "step": 67800
+    },
+    {
+      "epoch": 19.76409927755768,
+      "grad_norm": 0.41826891899108887,
+      "learning_rate": 0.00036304634217429314,
+      "loss": 3.246,
+      "step": 67850
+    },
+    {
+      "epoch": 19.778664646935447,
+      "grad_norm": 0.40090757608413696,
+      "learning_rate": 0.0003628714660448849,
+      "loss": 3.263,
+      "step": 67900
+    },
+    {
+      "epoch": 19.793230016313213,
+      "grad_norm": 0.4540848731994629,
+      "learning_rate": 0.0003626965899154765,
+      "loss": 3.2501,
+      "step": 67950
+    },
+    {
+      "epoch": 19.80779538569098,
+      "grad_norm": 0.41131654381752014,
+      "learning_rate": 0.0003625217137860682,
+      "loss": 3.2455,
+      "step": 68000
+    },
+    {
+      "epoch": 19.80779538569098,
+      "eval_accuracy": 0.3737583320155733,
+      "eval_loss": 3.5353798866271973,
+      "eval_runtime": 183.1485,
+      "eval_samples_per_second": 90.883,
+      "eval_steps_per_second": 5.684,
+      "step": 68000
+    },
+    {
+      "epoch": 19.822360755068747,
+      "grad_norm": 0.40951302647590637,
+      "learning_rate": 0.00036234683765665985,
+      "loss": 3.2595,
+      "step": 68050
+    },
+    {
+      "epoch": 19.836926124446517,
+      "grad_norm": 0.42168769240379333,
+      "learning_rate": 0.0003621719615272515,
+      "loss": 3.2715,
+      "step": 68100
+    },
+    {
+      "epoch": 19.851491493824284,
+      "grad_norm": 0.3935104310512543,
+      "learning_rate": 0.00036199708539784317,
+      "loss": 3.2587,
+      "step": 68150
+    },
+    {
+      "epoch": 19.86605686320205,
+      "grad_norm": 0.41649124026298523,
+      "learning_rate": 0.0003618222092684348,
+      "loss": 3.2611,
+      "step": 68200
+    },
+    {
+      "epoch": 19.880622232579817,
+      "grad_norm": 0.3767942786216736,
+      "learning_rate": 0.0003616473331390265,
+      "loss": 3.2503,
+      "step": 68250
+    },
+    {
+      "epoch": 19.895187601957584,
+      "grad_norm": 0.3804614543914795,
+      "learning_rate": 0.00036147245700961813,
+      "loss": 3.2593,
+      "step": 68300
+    },
+    {
+      "epoch": 19.909752971335354,
+      "grad_norm": 0.40164914727211,
+      "learning_rate": 0.00036129758088020977,
+      "loss": 3.2635,
+      "step": 68350
+    },
+    {
+      "epoch": 19.92431834071312,
+      "grad_norm": 0.37406373023986816,
+      "learning_rate": 0.0003611227047508015,
+      "loss": 3.2568,
+      "step": 68400
+    },
+    {
+      "epoch": 19.938883710090888,
+      "grad_norm": 0.38547801971435547,
+      "learning_rate": 0.00036094782862139315,
+      "loss": 3.2668,
+      "step": 68450
+    },
+    {
+      "epoch": 19.953449079468655,
+      "grad_norm": 0.40557870268821716,
+      "learning_rate": 0.00036077295249198484,
+      "loss": 3.2518,
+      "step": 68500
+    },
+    {
+      "epoch": 19.96801444884642,
+      "grad_norm": 0.39365437626838684,
+      "learning_rate": 0.0003605980763625765,
+      "loss": 3.2751,
+      "step": 68550
+    },
+    {
+      "epoch": 19.982579818224192,
+      "grad_norm": 0.3988928198814392,
+      "learning_rate": 0.0003604232002331681,
+      "loss": 3.2664,
+      "step": 68600
+    },
+    {
+      "epoch": 19.99714518760196,
+      "grad_norm": 0.3843889534473419,
+      "learning_rate": 0.0003602483241037598,
+      "loss": 3.2573,
+      "step": 68650
+    },
+    {
+      "epoch": 20.011652295502213,
+      "grad_norm": 0.4098789095878601,
+      "learning_rate": 0.00036007344797435144,
+      "loss": 3.1714,
+      "step": 68700
+    },
+    {
+      "epoch": 20.026217664879983,
+      "grad_norm": 0.37843582034111023,
+      "learning_rate": 0.00035989857184494313,
+      "loss": 3.1703,
+      "step": 68750
+    },
+    {
+      "epoch": 20.04078303425775,
+      "grad_norm": 0.4209931492805481,
+      "learning_rate": 0.00035972369571553477,
+      "loss": 3.163,
+      "step": 68800
+    },
+    {
+      "epoch": 20.055348403635517,
+      "grad_norm": 0.39720186591148376,
+      "learning_rate": 0.0003595488195861265,
+      "loss": 3.17,
+      "step": 68850
+    },
+    {
+      "epoch": 20.069913773013283,
+      "grad_norm": 0.41874927282333374,
+      "learning_rate": 0.00035937394345671815,
+      "loss": 3.1616,
+      "step": 68900
+    },
+    {
+      "epoch": 20.08447914239105,
+      "grad_norm": 0.3896577060222626,
+      "learning_rate": 0.0003591990673273098,
+      "loss": 3.1776,
+      "step": 68950
+    },
+    {
+      "epoch": 20.099044511768817,
+      "grad_norm": 0.4793306887149811,
+      "learning_rate": 0.0003590241911979015,
+      "loss": 3.1751,
+      "step": 69000
+    },
+    {
+      "epoch": 20.099044511768817,
+      "eval_accuracy": 0.3726623475488781,
+      "eval_loss": 3.5513575077056885,
+      "eval_runtime": 182.9863,
+      "eval_samples_per_second": 90.963,
+      "eval_steps_per_second": 5.689,
+      "step": 69000
+    },
+    {
+      "epoch": 20.113609881146587,
+      "grad_norm": 0.38327980041503906,
+      "learning_rate": 0.0003588493150684931,
+      "loss": 3.1769,
+      "step": 69050
+    },
+    {
+      "epoch": 20.128175250524354,
+      "grad_norm": 0.4346780776977539,
+      "learning_rate": 0.0003586744389390848,
+      "loss": 3.1907,
+      "step": 69100
+    },
+    {
+      "epoch": 20.14274061990212,
+      "grad_norm": 0.42224401235580444,
+      "learning_rate": 0.00035849956280967644,
+      "loss": 3.187,
+      "step": 69150
+    },
+    {
+      "epoch": 20.157305989279887,
+      "grad_norm": 0.37276560068130493,
+      "learning_rate": 0.0003583246866802681,
+      "loss": 3.1784,
+      "step": 69200
+    },
+    {
+      "epoch": 20.171871358657654,
+      "grad_norm": 0.40166395902633667,
+      "learning_rate": 0.00035814981055085976,
+      "loss": 3.1858,
+      "step": 69250
+    },
+    {
+      "epoch": 20.186436728035424,
+      "grad_norm": 0.3963995575904846,
+      "learning_rate": 0.0003579749344214514,
+      "loss": 3.1894,
+      "step": 69300
+    },
+    {
+      "epoch": 20.20100209741319,
+      "grad_norm": 0.39567843079566956,
+      "learning_rate": 0.00035780005829204315,
+      "loss": 3.2148,
+      "step": 69350
+    },
+    {
+      "epoch": 20.215567466790958,
+      "grad_norm": 0.40741077065467834,
+      "learning_rate": 0.0003576251821626348,
+      "loss": 3.1976,
+      "step": 69400
+    },
+    {
+      "epoch": 20.230132836168725,
+      "grad_norm": 0.3944525718688965,
+      "learning_rate": 0.00035745030603322647,
+      "loss": 3.1988,
+      "step": 69450
+    },
+    {
+      "epoch": 20.24469820554649,
+      "grad_norm": 0.3943832516670227,
+      "learning_rate": 0.0003572754299038181,
+      "loss": 3.2033,
+      "step": 69500
+    },
+    {
+      "epoch": 20.25926357492426,
+      "grad_norm": 0.39849257469177246,
+      "learning_rate": 0.00035710055377440974,
+      "loss": 3.1942,
+      "step": 69550
+    },
+    {
+      "epoch": 20.27382894430203,
+      "grad_norm": 0.3904484510421753,
+      "learning_rate": 0.00035692567764500143,
+      "loss": 3.1997,
+      "step": 69600
+    },
+    {
+      "epoch": 20.288394313679795,
+      "grad_norm": 0.4032699763774872,
+      "learning_rate": 0.00035675080151559307,
+      "loss": 3.2064,
+      "step": 69650
+    },
+    {
+      "epoch": 20.302959683057562,
+      "grad_norm": 0.4133860170841217,
+      "learning_rate": 0.00035657592538618476,
+      "loss": 3.2082,
+      "step": 69700
+    },
+    {
+      "epoch": 20.31752505243533,
+      "grad_norm": 0.3864280879497528,
+      "learning_rate": 0.0003564010492567764,
+      "loss": 3.207,
+      "step": 69750
+    },
+    {
+      "epoch": 20.332090421813096,
+      "grad_norm": 0.39940962195396423,
+      "learning_rate": 0.00035622617312736803,
+      "loss": 3.2138,
+      "step": 69800
+    },
+    {
+      "epoch": 20.346655791190866,
+      "grad_norm": 0.43951013684272766,
+      "learning_rate": 0.0003560512969979598,
+      "loss": 3.2098,
+      "step": 69850
+    },
+    {
+      "epoch": 20.361221160568633,
+      "grad_norm": 0.3883720934391022,
+      "learning_rate": 0.0003558764208685514,
+      "loss": 3.2001,
+      "step": 69900
+    },
+    {
+      "epoch": 20.3757865299464,
+      "grad_norm": 0.38097983598709106,
+      "learning_rate": 0.0003557015447391431,
+      "loss": 3.2136,
+      "step": 69950
+    },
+    {
+      "epoch": 20.390351899324166,
+      "grad_norm": 0.379999577999115,
+      "learning_rate": 0.00035552666860973474,
+      "loss": 3.2249,
+      "step": 70000
+    },
+    {
+      "epoch": 20.390351899324166,
+      "eval_accuracy": 0.37320645998310525,
+      "eval_loss": 3.547257423400879,
+      "eval_runtime": 183.2991,
+      "eval_samples_per_second": 90.808,
+      "eval_steps_per_second": 5.679,
+      "step": 70000
+    },
+    {
+      "epoch": 20.404917268701933,
+      "grad_norm": 0.4156287908554077,
+      "learning_rate": 0.00035535179248032643,
+      "loss": 3.2132,
+      "step": 70050
+    },
+    {
+      "epoch": 20.419482638079703,
+      "grad_norm": 0.43959635496139526,
+      "learning_rate": 0.00035517691635091807,
+      "loss": 3.2181,
+      "step": 70100
+    },
+    {
+      "epoch": 20.43404800745747,
+      "grad_norm": 0.4064927399158478,
+      "learning_rate": 0.0003550020402215097,
+      "loss": 3.2219,
+      "step": 70150
+    },
+    {
+      "epoch": 20.448613376835237,
+      "grad_norm": 0.40533578395843506,
+      "learning_rate": 0.0003548271640921014,
+      "loss": 3.2271,
+      "step": 70200
+    },
+    {
+      "epoch": 20.463178746213003,
+      "grad_norm": 0.4207044541835785,
+      "learning_rate": 0.00035465228796269303,
+      "loss": 3.2202,
+      "step": 70250
+    },
+    {
+      "epoch": 20.47774411559077,
+      "grad_norm": 0.4209517538547516,
+      "learning_rate": 0.0003544774118332848,
+      "loss": 3.2265,
+      "step": 70300
+    },
+    {
+      "epoch": 20.49230948496854,
+      "grad_norm": 0.40417852997779846,
+      "learning_rate": 0.0003543025357038764,
+      "loss": 3.2258,
+      "step": 70350
+    },
+    {
+      "epoch": 20.506874854346307,
+      "grad_norm": 0.4038628041744232,
+      "learning_rate": 0.00035412765957446805,
+      "loss": 3.2193,
+      "step": 70400
+    },
+    {
+      "epoch": 20.521440223724074,
+      "grad_norm": 0.3865199089050293,
+      "learning_rate": 0.00035395278344505974,
+      "loss": 3.2313,
+      "step": 70450
+    },
+    {
+      "epoch": 20.53600559310184,
+      "grad_norm": 0.39419683814048767,
+      "learning_rate": 0.0003537779073156514,
+      "loss": 3.2345,
+      "step": 70500
+    },
+    {
+      "epoch": 20.550570962479608,
+      "grad_norm": 0.3920558989048004,
+      "learning_rate": 0.00035360303118624306,
+      "loss": 3.2299,
+      "step": 70550
+    },
+    {
+      "epoch": 20.565136331857374,
+      "grad_norm": 0.41333022713661194,
+      "learning_rate": 0.0003534281550568347,
+      "loss": 3.2204,
+      "step": 70600
+    },
+    {
+      "epoch": 20.579701701235145,
+      "grad_norm": 0.3800913393497467,
+      "learning_rate": 0.0003532532789274264,
+      "loss": 3.2371,
+      "step": 70650
+    },
+    {
+      "epoch": 20.59426707061291,
+      "grad_norm": 0.3914024531841278,
+      "learning_rate": 0.000353078402798018,
+      "loss": 3.2452,
+      "step": 70700
+    },
+    {
+      "epoch": 20.608832439990678,
+      "grad_norm": 0.42056038975715637,
+      "learning_rate": 0.00035290352666860966,
+      "loss": 3.2355,
+      "step": 70750
+    },
+    {
+      "epoch": 20.623397809368445,
+      "grad_norm": 0.4147467613220215,
+      "learning_rate": 0.0003527286505392014,
+      "loss": 3.2157,
+      "step": 70800
+    },
+    {
+      "epoch": 20.63796317874621,
+      "grad_norm": 0.41056451201438904,
+      "learning_rate": 0.00035255377440979304,
+      "loss": 3.2341,
+      "step": 70850
+    },
+    {
+      "epoch": 20.652528548123982,
+      "grad_norm": 0.3810015320777893,
+      "learning_rate": 0.00035237889828038473,
+      "loss": 3.2236,
+      "step": 70900
+    },
+    {
+      "epoch": 20.66709391750175,
+      "grad_norm": 0.4104416072368622,
+      "learning_rate": 0.00035220402215097637,
+      "loss": 3.2287,
+      "step": 70950
+    },
+    {
+      "epoch": 20.681659286879515,
+      "grad_norm": 0.4050891101360321,
+      "learning_rate": 0.000352029146021568,
+      "loss": 3.233,
+      "step": 71000
+    },
+    {
+      "epoch": 20.681659286879515,
+      "eval_accuracy": 0.3739369203447848,
+      "eval_loss": 3.538299083709717,
+      "eval_runtime": 182.9222,
+      "eval_samples_per_second": 90.995,
+      "eval_steps_per_second": 5.691,
+      "step": 71000
+    },
+    {
+      "epoch": 20.696224656257282,
+      "grad_norm": 0.40285784006118774,
+      "learning_rate": 0.0003518542698921597,
+      "loss": 3.2479,
+      "step": 71050
+    },
+    {
+      "epoch": 20.71079002563505,
+      "grad_norm": 0.40842974185943604,
+      "learning_rate": 0.00035167939376275133,
+      "loss": 3.2222,
+      "step": 71100
+    },
+    {
+      "epoch": 20.72535539501282,
+      "grad_norm": 0.42046529054641724,
+      "learning_rate": 0.000351504517633343,
+      "loss": 3.23,
+      "step": 71150
+    },
+    {
+      "epoch": 20.739920764390586,
+      "grad_norm": 0.389822781085968,
+      "learning_rate": 0.00035132964150393466,
+      "loss": 3.2453,
+      "step": 71200
+    },
+    {
+      "epoch": 20.754486133768353,
+      "grad_norm": 0.3888729214668274,
+      "learning_rate": 0.0003511547653745263,
+      "loss": 3.2398,
+      "step": 71250
+    },
+    {
+      "epoch": 20.76905150314612,
+      "grad_norm": 0.42378807067871094,
+      "learning_rate": 0.00035097988924511804,
+      "loss": 3.2396,
+      "step": 71300
+    },
+    {
+      "epoch": 20.783616872523886,
+      "grad_norm": 0.3676110506057739,
+      "learning_rate": 0.0003508050131157097,
+      "loss": 3.2502,
+      "step": 71350
+    },
+    {
+      "epoch": 20.798182241901653,
+      "grad_norm": 0.4406678080558777,
+      "learning_rate": 0.00035063013698630137,
+      "loss": 3.2417,
+      "step": 71400
+    },
+    {
+      "epoch": 20.812747611279423,
+      "grad_norm": 0.3739905059337616,
+      "learning_rate": 0.000350455260856893,
+      "loss": 3.2365,
+      "step": 71450
+    },
+    {
+      "epoch": 20.82731298065719,
+      "grad_norm": 0.43861594796180725,
+      "learning_rate": 0.0003502803847274847,
+      "loss": 3.2463,
+      "step": 71500
+    },
+    {
+      "epoch": 20.841878350034957,
+      "grad_norm": 0.4056912064552307,
+      "learning_rate": 0.00035010550859807633,
+      "loss": 3.2361,
+      "step": 71550
+    },
+    {
+      "epoch": 20.856443719412724,
+      "grad_norm": 0.4023033082485199,
+      "learning_rate": 0.00034993063246866797,
+      "loss": 3.2484,
+      "step": 71600
+    },
+    {
+      "epoch": 20.87100908879049,
+      "grad_norm": 0.37357252836227417,
+      "learning_rate": 0.00034975575633925966,
+      "loss": 3.2357,
+      "step": 71650
+    },
+    {
+      "epoch": 20.88557445816826,
+      "grad_norm": 0.40037524700164795,
+      "learning_rate": 0.0003495808802098513,
+      "loss": 3.2396,
+      "step": 71700
+    },
+    {
+      "epoch": 20.900139827546028,
+      "grad_norm": 0.41211217641830444,
+      "learning_rate": 0.00034940600408044304,
+      "loss": 3.2631,
+      "step": 71750
+    },
+    {
+      "epoch": 20.914705196923794,
+      "grad_norm": 0.4259653091430664,
+      "learning_rate": 0.0003492311279510347,
+      "loss": 3.2562,
+      "step": 71800
+    },
+    {
+      "epoch": 20.92927056630156,
+      "grad_norm": 0.3834655284881592,
+      "learning_rate": 0.0003490562518216263,
+      "loss": 3.2472,
+      "step": 71850
+    },
+    {
+      "epoch": 20.943835935679328,
+      "grad_norm": 0.40790706872940063,
+      "learning_rate": 0.000348881375692218,
+      "loss": 3.254,
+      "step": 71900
+    },
+    {
+      "epoch": 20.958401305057095,
+      "grad_norm": 0.3871559202671051,
+      "learning_rate": 0.00034870649956280964,
+      "loss": 3.2523,
+      "step": 71950
+    },
+    {
+      "epoch": 20.972966674434865,
+      "grad_norm": 0.4063284397125244,
+      "learning_rate": 0.0003485316234334013,
+      "loss": 3.2472,
+      "step": 72000
+    },
+    {
+      "epoch": 20.972966674434865,
+      "eval_accuracy": 0.37428234003617616,
+      "eval_loss": 3.527845621109009,
+      "eval_runtime": 183.655,
+      "eval_samples_per_second": 90.632,
+      "eval_steps_per_second": 5.668,
+      "step": 72000
+    },
+    {
+      "epoch": 20.98753204381263,
+      "grad_norm": 0.3977331817150116,
+      "learning_rate": 0.00034835674730399296,
+      "loss": 3.2603,
+      "step": 72050
+    },
+    {
+      "epoch": 21.002039151712886,
+      "grad_norm": 0.42167672514915466,
+      "learning_rate": 0.00034818187117458465,
+      "loss": 3.2473,
+      "step": 72100
+    },
+    {
+      "epoch": 21.016604521090656,
+      "grad_norm": 0.38619017601013184,
+      "learning_rate": 0.0003480069950451763,
+      "loss": 3.1387,
+      "step": 72150
+    },
+    {
+      "epoch": 21.031169890468423,
+      "grad_norm": 0.40300482511520386,
+      "learning_rate": 0.0003478321189157679,
+      "loss": 3.1455,
+      "step": 72200
+    },
+    {
+      "epoch": 21.04573525984619,
+      "grad_norm": 0.40839800238609314,
+      "learning_rate": 0.00034765724278635967,
+      "loss": 3.1608,
+      "step": 72250
+    },
+    {
+      "epoch": 21.060300629223956,
+      "grad_norm": 0.38552382588386536,
+      "learning_rate": 0.0003474823666569513,
+      "loss": 3.1538,
+      "step": 72300
+    },
+    {
+      "epoch": 21.074865998601723,
+      "grad_norm": 0.4049597680568695,
+      "learning_rate": 0.000347307490527543,
+      "loss": 3.1621,
+      "step": 72350
+    },
+    {
+      "epoch": 21.089431367979493,
+      "grad_norm": 0.4319642186164856,
+      "learning_rate": 0.00034713261439813463,
+      "loss": 3.16,
+      "step": 72400
+    },
+    {
+      "epoch": 21.10399673735726,
+      "grad_norm": 0.3880416750907898,
+      "learning_rate": 0.00034695773826872627,
+      "loss": 3.155,
+      "step": 72450
+    },
+    {
+      "epoch": 21.118562106735027,
+      "grad_norm": 0.3752796947956085,
+      "learning_rate": 0.00034678286213931796,
+      "loss": 3.1564,
+      "step": 72500
+    },
+    {
+      "epoch": 21.133127476112794,
+      "grad_norm": 0.396068811416626,
+      "learning_rate": 0.0003466079860099096,
+      "loss": 3.1695,
+      "step": 72550
+    },
+    {
+      "epoch": 21.14769284549056,
+      "grad_norm": 0.3929023742675781,
+      "learning_rate": 0.0003464331098805013,
+      "loss": 3.1709,
+      "step": 72600
+    },
+    {
+      "epoch": 21.16225821486833,
+      "grad_norm": 0.44244226813316345,
+      "learning_rate": 0.0003462582337510929,
+      "loss": 3.1791,
+      "step": 72650
+    },
+    {
+      "epoch": 21.176823584246097,
+      "grad_norm": 0.40291813015937805,
+      "learning_rate": 0.00034608335762168467,
+      "loss": 3.1853,
+      "step": 72700
+    },
+    {
+      "epoch": 21.191388953623864,
+      "grad_norm": 0.39293503761291504,
+      "learning_rate": 0.0003459084814922763,
+      "loss": 3.1719,
+      "step": 72750
+    },
+    {
+      "epoch": 21.20595432300163,
+      "grad_norm": 0.3924226462841034,
+      "learning_rate": 0.00034573360536286794,
+      "loss": 3.1875,
+      "step": 72800
+    },
+    {
+      "epoch": 21.220519692379398,
+      "grad_norm": 0.3741477131843567,
+      "learning_rate": 0.00034555872923345963,
+      "loss": 3.2003,
+      "step": 72850
+    },
+    {
+      "epoch": 21.235085061757164,
+      "grad_norm": 0.40738731622695923,
+      "learning_rate": 0.00034538385310405127,
+      "loss": 3.1968,
+      "step": 72900
+    },
+    {
+      "epoch": 21.249650431134935,
+      "grad_norm": 0.3989109396934509,
+      "learning_rate": 0.00034520897697464296,
+      "loss": 3.1861,
+      "step": 72950
+    },
+    {
+      "epoch": 21.2642158005127,
+      "grad_norm": 0.3871462345123291,
+      "learning_rate": 0.0003450341008452346,
+      "loss": 3.1848,
+      "step": 73000
+    },
+    {
+      "epoch": 21.2642158005127,
+      "eval_accuracy": 0.3733212079813346,
+      "eval_loss": 3.550053596496582,
+      "eval_runtime": 183.2865,
+      "eval_samples_per_second": 90.814,
+      "eval_steps_per_second": 5.68,
+      "step": 73000
+    },
+    {
+      "epoch": 21.27878116989047,
+      "grad_norm": 0.3962025046348572,
+      "learning_rate": 0.00034485922471582623,
+      "loss": 3.1968,
+      "step": 73050
+    },
+    {
+      "epoch": 21.293346539268235,
+      "grad_norm": 0.3977092504501343,
+      "learning_rate": 0.0003446843485864179,
+      "loss": 3.1869,
+      "step": 73100
+    },
+    {
+      "epoch": 21.307911908646002,
+      "grad_norm": 0.37942761182785034,
+      "learning_rate": 0.00034450947245700955,
+      "loss": 3.192,
+      "step": 73150
+    },
+    {
+      "epoch": 21.322477278023772,
+      "grad_norm": 0.4310181438922882,
+      "learning_rate": 0.0003443345963276013,
+      "loss": 3.1912,
+      "step": 73200
+    },
+    {
+      "epoch": 21.33704264740154,
+      "grad_norm": 0.43790924549102783,
+      "learning_rate": 0.00034415972019819294,
+      "loss": 3.2091,
+      "step": 73250
+    },
+    {
+      "epoch": 21.351608016779306,
+      "grad_norm": 0.38981059193611145,
+      "learning_rate": 0.00034398484406878457,
+      "loss": 3.2036,
+      "step": 73300
+    },
+    {
+      "epoch": 21.366173386157072,
+      "grad_norm": 0.37896528840065,
+      "learning_rate": 0.00034380996793937626,
+      "loss": 3.1914,
+      "step": 73350
+    },
+    {
+      "epoch": 21.38073875553484,
+      "grad_norm": 0.4180900752544403,
+      "learning_rate": 0.0003436350918099679,
+      "loss": 3.2035,
+      "step": 73400
+    },
+    {
+      "epoch": 21.39530412491261,
+      "grad_norm": 0.40925297141075134,
+      "learning_rate": 0.0003434602156805596,
+      "loss": 3.2148,
+      "step": 73450
+    },
+    {
+      "epoch": 21.409869494290376,
+      "grad_norm": 0.42755791544914246,
+      "learning_rate": 0.0003432853395511512,
+      "loss": 3.2015,
+      "step": 73500
+    },
+    {
+      "epoch": 21.424434863668143,
+      "grad_norm": 0.4559558629989624,
+      "learning_rate": 0.0003431104634217429,
+      "loss": 3.224,
+      "step": 73550
+    },
+    {
+      "epoch": 21.43900023304591,
+      "grad_norm": 0.394946813583374,
+      "learning_rate": 0.00034293558729233455,
+      "loss": 3.21,
+      "step": 73600
+    },
+    {
+      "epoch": 21.453565602423676,
+      "grad_norm": 0.3784257769584656,
+      "learning_rate": 0.0003427607111629262,
+      "loss": 3.218,
+      "step": 73650
+    },
+    {
+      "epoch": 21.468130971801443,
+      "grad_norm": 0.4060593545436859,
+      "learning_rate": 0.00034258583503351793,
+      "loss": 3.2158,
+      "step": 73700
+    },
+    {
+      "epoch": 21.482696341179214,
+      "grad_norm": 0.41588184237480164,
+      "learning_rate": 0.00034241095890410957,
+      "loss": 3.2228,
+      "step": 73750
+    },
+    {
+      "epoch": 21.49726171055698,
+      "grad_norm": 0.3939662277698517,
+      "learning_rate": 0.00034223608277470126,
+      "loss": 3.2143,
+      "step": 73800
+    },
+    {
+      "epoch": 21.511827079934747,
+      "grad_norm": 0.4258659780025482,
+      "learning_rate": 0.0003420612066452929,
+      "loss": 3.2068,
+      "step": 73850
+    },
+    {
+      "epoch": 21.526392449312514,
+      "grad_norm": 0.387615829706192,
+      "learning_rate": 0.00034188633051588453,
+      "loss": 3.2207,
+      "step": 73900
+    },
+    {
+      "epoch": 21.54095781869028,
+      "grad_norm": 0.4352627694606781,
+      "learning_rate": 0.0003417114543864762,
+      "loss": 3.2228,
+      "step": 73950
+    },
+    {
+      "epoch": 21.55552318806805,
+      "grad_norm": 0.4175478219985962,
+      "learning_rate": 0.00034153657825706786,
+      "loss": 3.2036,
+      "step": 74000
+    },
+    {
+      "epoch": 21.55552318806805,
+      "eval_accuracy": 0.3738021855026015,
+      "eval_loss": 3.54341983795166,
+      "eval_runtime": 183.3969,
+      "eval_samples_per_second": 90.759,
+      "eval_steps_per_second": 5.676,
+      "step": 74000
+    },
+    {
+      "epoch": 21.570088557445818,
+      "grad_norm": 0.386900395154953,
+      "learning_rate": 0.00034136170212765955,
+      "loss": 3.2198,
+      "step": 74050
+    },
+    {
+      "epoch": 21.584653926823584,
+      "grad_norm": 0.3872682452201843,
+      "learning_rate": 0.0003411868259982512,
+      "loss": 3.2253,
+      "step": 74100
+    },
+    {
+      "epoch": 21.59921929620135,
+      "grad_norm": 0.4258601665496826,
+      "learning_rate": 0.00034101194986884293,
+      "loss": 3.2129,
+      "step": 74150
+    },
+    {
+      "epoch": 21.613784665579118,
+      "grad_norm": 0.3833194077014923,
+      "learning_rate": 0.00034083707373943456,
+      "loss": 3.2152,
+      "step": 74200
+    },
+    {
+      "epoch": 21.62835003495689,
+      "grad_norm": 0.4214939475059509,
+      "learning_rate": 0.0003406621976100262,
+      "loss": 3.2255,
+      "step": 74250
+    },
+    {
+      "epoch": 21.642915404334655,
+      "grad_norm": 0.4184681475162506,
+      "learning_rate": 0.0003404873214806179,
+      "loss": 3.2175,
+      "step": 74300
+    },
+    {
+      "epoch": 21.65748077371242,
+      "grad_norm": 0.42748090624809265,
+      "learning_rate": 0.00034031244535120953,
+      "loss": 3.2199,
+      "step": 74350
+    },
+    {
+      "epoch": 21.67204614309019,
+      "grad_norm": 0.4194568991661072,
+      "learning_rate": 0.0003401375692218012,
+      "loss": 3.2254,
+      "step": 74400
+    },
+    {
+      "epoch": 21.686611512467955,
+      "grad_norm": 0.4005660116672516,
+      "learning_rate": 0.00033996269309239285,
+      "loss": 3.2265,
+      "step": 74450
+    },
+    {
+      "epoch": 21.701176881845722,
+      "grad_norm": 0.3818527162075043,
+      "learning_rate": 0.0003397878169629845,
+      "loss": 3.2187,
+      "step": 74500
+    },
+    {
+      "epoch": 21.715742251223492,
+      "grad_norm": 0.4204731285572052,
+      "learning_rate": 0.0003396129408335762,
+      "loss": 3.2483,
+      "step": 74550
+    },
+    {
+      "epoch": 21.73030762060126,
+      "grad_norm": 0.4234389364719391,
+      "learning_rate": 0.0003394380647041678,
+      "loss": 3.236,
+      "step": 74600
+    },
+    {
+      "epoch": 21.744872989979026,
+      "grad_norm": 0.4275653064250946,
+      "learning_rate": 0.00033926318857475956,
+      "loss": 3.2234,
+      "step": 74650
+    },
+    {
+      "epoch": 21.759438359356793,
+      "grad_norm": 0.39097073674201965,
+      "learning_rate": 0.0003390883124453512,
+      "loss": 3.2285,
+      "step": 74700
+    },
+    {
+      "epoch": 21.77400372873456,
+      "grad_norm": 0.40437382459640503,
+      "learning_rate": 0.0003389134363159429,
+      "loss": 3.2288,
+      "step": 74750
+    },
+    {
+      "epoch": 21.78856909811233,
+      "grad_norm": 0.40619733929634094,
+      "learning_rate": 0.0003387385601865345,
+      "loss": 3.2254,
+      "step": 74800
+    },
+    {
+      "epoch": 21.803134467490096,
+      "grad_norm": 0.42823296785354614,
+      "learning_rate": 0.00033856368405712616,
+      "loss": 3.2394,
+      "step": 74850
+    },
+    {
+      "epoch": 21.817699836867863,
+      "grad_norm": 0.4197380542755127,
+      "learning_rate": 0.00033838880792771785,
+      "loss": 3.2329,
+      "step": 74900
+    },
+    {
+      "epoch": 21.83226520624563,
+      "grad_norm": 0.3925802707672119,
+      "learning_rate": 0.0003382139317983095,
+      "loss": 3.2434,
+      "step": 74950
+    },
+    {
+      "epoch": 21.846830575623397,
+      "grad_norm": 0.4069508910179138,
+      "learning_rate": 0.0003380390556689012,
+      "loss": 3.2344,
+      "step": 75000
+    },
+    {
+      "epoch": 21.846830575623397,
+      "eval_accuracy": 0.37450748595483324,
+      "eval_loss": 3.532540798187256,
+      "eval_runtime": 183.2285,
+      "eval_samples_per_second": 90.843,
+      "eval_steps_per_second": 5.681,
+      "step": 75000
+    },
+    {
+      "epoch": 21.861395945001163,
+      "grad_norm": 0.4070306718349457,
+      "learning_rate": 0.0003378641795394928,
+      "loss": 3.2438,
+      "step": 75050
+    },
+    {
+      "epoch": 21.875961314378934,
+      "grad_norm": 0.424502432346344,
+      "learning_rate": 0.00033768930341008445,
+      "loss": 3.2461,
+      "step": 75100
+    },
+    {
+      "epoch": 21.8905266837567,
+      "grad_norm": 0.4162021279335022,
+      "learning_rate": 0.0003375144272806762,
+      "loss": 3.2439,
+      "step": 75150
+    },
+    {
+      "epoch": 21.905092053134467,
+      "grad_norm": 0.4465700387954712,
+      "learning_rate": 0.00033733955115126783,
+      "loss": 3.2426,
+      "step": 75200
+    },
+    {
+      "epoch": 21.919657422512234,
+      "grad_norm": 0.37786081433296204,
+      "learning_rate": 0.0003371646750218595,
+      "loss": 3.2455,
+      "step": 75250
+    },
+    {
+      "epoch": 21.93422279189,
+      "grad_norm": 0.40799352526664734,
+      "learning_rate": 0.00033698979889245116,
+      "loss": 3.2295,
+      "step": 75300
+    },
+    {
+      "epoch": 21.94878816126777,
+      "grad_norm": 0.3989873230457306,
+      "learning_rate": 0.0003368149227630428,
+      "loss": 3.2471,
+      "step": 75350
+    },
+    {
+      "epoch": 21.963353530645538,
+      "grad_norm": 0.39666229486465454,
+      "learning_rate": 0.0003366400466336345,
+      "loss": 3.2369,
+      "step": 75400
+    },
+    {
+      "epoch": 21.977918900023305,
+      "grad_norm": 0.3877425193786621,
+      "learning_rate": 0.0003364651705042261,
+      "loss": 3.243,
+      "step": 75450
+    },
+    {
+      "epoch": 21.99248426940107,
+      "grad_norm": 0.40185490250587463,
+      "learning_rate": 0.0003362902943748178,
+      "loss": 3.2502,
+      "step": 75500
+    },
+    {
+      "epoch": 22.00699137730133,
+      "grad_norm": 0.41637974977493286,
+      "learning_rate": 0.00033611541824540945,
+      "loss": 3.1885,
+      "step": 75550
+    },
+    {
+      "epoch": 22.021556746679096,
+      "grad_norm": 0.3906157612800598,
+      "learning_rate": 0.0003359405421160012,
+      "loss": 3.1367,
+      "step": 75600
+    },
+    {
+      "epoch": 22.036122116056863,
+      "grad_norm": 0.4338746666908264,
+      "learning_rate": 0.0003357656659865928,
+      "loss": 3.1472,
+      "step": 75650
+    },
+    {
+      "epoch": 22.05068748543463,
+      "grad_norm": 0.39344149827957153,
+      "learning_rate": 0.00033559078985718446,
+      "loss": 3.1439,
+      "step": 75700
+    },
+    {
+      "epoch": 22.0652528548124,
+      "grad_norm": 0.3915230631828308,
+      "learning_rate": 0.00033541591372777615,
+      "loss": 3.1467,
+      "step": 75750
+    },
+    {
+      "epoch": 22.079818224190166,
+      "grad_norm": 0.4206335246562958,
+      "learning_rate": 0.0003352410375983678,
+      "loss": 3.1542,
+      "step": 75800
+    },
+    {
+      "epoch": 22.094383593567933,
+      "grad_norm": 0.40375348925590515,
+      "learning_rate": 0.0003350661614689595,
+      "loss": 3.1459,
+      "step": 75850
+    },
+    {
+      "epoch": 22.1089489629457,
+      "grad_norm": 0.405519962310791,
+      "learning_rate": 0.0003348912853395511,
+      "loss": 3.1644,
+      "step": 75900
+    },
+    {
+      "epoch": 22.123514332323467,
+      "grad_norm": 0.40984925627708435,
+      "learning_rate": 0.00033471640921014275,
+      "loss": 3.1715,
+      "step": 75950
+    },
+    {
+      "epoch": 22.138079701701233,
+      "grad_norm": 0.4426792860031128,
+      "learning_rate": 0.00033454153308073444,
+      "loss": 3.162,
+      "step": 76000
+    },
+    {
+      "epoch": 22.138079701701233,
+      "eval_accuracy": 0.37330615906353404,
+      "eval_loss": 3.5484302043914795,
+      "eval_runtime": 182.8859,
+      "eval_samples_per_second": 91.013,
+      "eval_steps_per_second": 5.692,
+      "step": 76000
+    },
+    {
+      "epoch": 22.152645071079004,
+      "grad_norm": 0.43078505992889404,
+      "learning_rate": 0.0003343666569513261,
+      "loss": 3.1689,
+      "step": 76050
+    },
+    {
+      "epoch": 22.16721044045677,
+      "grad_norm": 0.3985705077648163,
+      "learning_rate": 0.0003341917808219178,
+      "loss": 3.1696,
+      "step": 76100
+    },
+    {
+      "epoch": 22.181775809834537,
+      "grad_norm": 0.39892059564590454,
+      "learning_rate": 0.00033401690469250946,
+      "loss": 3.1683,
+      "step": 76150
+    },
+    {
+      "epoch": 22.196341179212304,
+      "grad_norm": 0.4003106355667114,
+      "learning_rate": 0.00033384202856310115,
+      "loss": 3.1748,
+      "step": 76200
+    },
+    {
+      "epoch": 22.21090654859007,
+      "grad_norm": 0.41936761140823364,
+      "learning_rate": 0.0003336671524336928,
+      "loss": 3.1732,
+      "step": 76250
+    },
+    {
+      "epoch": 22.22547191796784,
+      "grad_norm": 0.394231915473938,
+      "learning_rate": 0.0003334922763042844,
+      "loss": 3.1697,
+      "step": 76300
+    },
+    {
+      "epoch": 22.240037287345608,
+      "grad_norm": 0.43360409140586853,
+      "learning_rate": 0.0003333174001748761,
+      "loss": 3.169,
+      "step": 76350
+    },
+    {
+      "epoch": 22.254602656723375,
+      "grad_norm": 0.4110454320907593,
+      "learning_rate": 0.00033314252404546775,
+      "loss": 3.1752,
+      "step": 76400
+    },
+    {
+      "epoch": 22.26916802610114,
+      "grad_norm": 0.41084834933280945,
+      "learning_rate": 0.00033296764791605944,
+      "loss": 3.1829,
+      "step": 76450
+    },
+    {
+      "epoch": 22.283733395478908,
+      "grad_norm": 0.4034361243247986,
+      "learning_rate": 0.0003327927717866511,
+      "loss": 3.1763,
+      "step": 76500
+    },
+    {
+      "epoch": 22.29829876485668,
+      "grad_norm": 0.4124561548233032,
+      "learning_rate": 0.0003326178956572427,
+      "loss": 3.1874,
+      "step": 76550
+    },
+    {
+      "epoch": 22.312864134234445,
+      "grad_norm": 0.41186967492103577,
+      "learning_rate": 0.00033244301952783446,
+      "loss": 3.1828,
+      "step": 76600
+    },
+    {
+      "epoch": 22.327429503612212,
+      "grad_norm": 0.4203675389289856,
+      "learning_rate": 0.0003322681433984261,
+      "loss": 3.1781,
+      "step": 76650
+    },
+    {
+      "epoch": 22.34199487298998,
+      "grad_norm": 0.4065147638320923,
+      "learning_rate": 0.0003320932672690178,
+      "loss": 3.1785,
+      "step": 76700
+    },
+    {
+      "epoch": 22.356560242367745,
+      "grad_norm": 0.4006505608558655,
+      "learning_rate": 0.0003319183911396094,
+      "loss": 3.198,
+      "step": 76750
+    },
+    {
+      "epoch": 22.371125611745512,
+      "grad_norm": 0.42733311653137207,
+      "learning_rate": 0.0003317435150102011,
+      "loss": 3.1998,
+      "step": 76800
+    },
+    {
+      "epoch": 22.385690981123282,
+      "grad_norm": 0.4309224784374237,
+      "learning_rate": 0.00033156863888079275,
+      "loss": 3.1957,
+      "step": 76850
+    },
+    {
+      "epoch": 22.40025635050105,
+      "grad_norm": 0.3946070075035095,
+      "learning_rate": 0.0003313937627513844,
+      "loss": 3.2073,
+      "step": 76900
+    },
+    {
+      "epoch": 22.414821719878816,
+      "grad_norm": 0.4475387930870056,
+      "learning_rate": 0.00033121888662197607,
+      "loss": 3.2152,
+      "step": 76950
+    },
+    {
+      "epoch": 22.429387089256583,
+      "grad_norm": 0.4521602988243103,
+      "learning_rate": 0.0003310440104925677,
+      "loss": 3.198,
+      "step": 77000
+    },
+    {
+      "epoch": 22.429387089256583,
+      "eval_accuracy": 0.3736266539848182,
+      "eval_loss": 3.5453152656555176,
+      "eval_runtime": 182.8424,
+      "eval_samples_per_second": 91.035,
+      "eval_steps_per_second": 5.693,
+      "step": 77000
+    },
+    {
+      "epoch": 22.44395245863435,
+      "grad_norm": 0.4259926974773407,
+      "learning_rate": 0.00033086913436315945,
+      "loss": 3.1972,
+      "step": 77050
+    },
+    {
+      "epoch": 22.45851782801212,
+      "grad_norm": 0.39332616329193115,
+      "learning_rate": 0.0003306942582337511,
+      "loss": 3.2036,
+      "step": 77100
+    },
+    {
+      "epoch": 22.473083197389887,
+      "grad_norm": 0.4318368434906006,
+      "learning_rate": 0.0003305193821043427,
+      "loss": 3.1913,
+      "step": 77150
+    },
+    {
+      "epoch": 22.487648566767653,
+      "grad_norm": 0.3975502848625183,
+      "learning_rate": 0.0003303445059749344,
+      "loss": 3.1994,
+      "step": 77200
+    },
+    {
+      "epoch": 22.50221393614542,
+      "grad_norm": 0.39913398027420044,
+      "learning_rate": 0.00033016962984552605,
+      "loss": 3.2114,
+      "step": 77250
+    },
+    {
+      "epoch": 22.516779305523187,
+      "grad_norm": 0.3935025632381439,
+      "learning_rate": 0.00032999475371611774,
+      "loss": 3.1939,
+      "step": 77300
+    },
+    {
+      "epoch": 22.531344674900957,
+      "grad_norm": 0.40351587533950806,
+      "learning_rate": 0.0003298198775867094,
+      "loss": 3.2113,
+      "step": 77350
+    },
+    {
+      "epoch": 22.545910044278724,
+      "grad_norm": 0.4044545292854309,
+      "learning_rate": 0.000329645001457301,
+      "loss": 3.1963,
+      "step": 77400
+    },
+    {
+      "epoch": 22.56047541365649,
+      "grad_norm": 0.41852426528930664,
+      "learning_rate": 0.0003294701253278927,
+      "loss": 3.2051,
+      "step": 77450
+    },
+    {
+      "epoch": 22.575040783034257,
+      "grad_norm": 0.4577685594558716,
+      "learning_rate": 0.00032929524919848434,
+      "loss": 3.2122,
+      "step": 77500
+    },
+    {
+      "epoch": 22.589606152412024,
+      "grad_norm": 0.40526050329208374,
+      "learning_rate": 0.0003291203730690761,
+      "loss": 3.2019,
+      "step": 77550
+    },
+    {
+      "epoch": 22.60417152178979,
+      "grad_norm": 0.39129775762557983,
+      "learning_rate": 0.0003289454969396677,
+      "loss": 3.2085,
+      "step": 77600
+    },
+    {
+      "epoch": 22.61873689116756,
+      "grad_norm": 0.4263695776462555,
+      "learning_rate": 0.0003287706208102594,
+      "loss": 3.2036,
+      "step": 77650
+    },
+    {
+      "epoch": 22.633302260545328,
+      "grad_norm": 0.44772493839263916,
+      "learning_rate": 0.00032859574468085105,
+      "loss": 3.2119,
+      "step": 77700
+    },
+    {
+      "epoch": 22.647867629923095,
+      "grad_norm": 0.4202895164489746,
+      "learning_rate": 0.0003284208685514427,
+      "loss": 3.2207,
+      "step": 77750
+    },
+    {
+      "epoch": 22.66243299930086,
+      "grad_norm": 0.4418281316757202,
+      "learning_rate": 0.0003282459924220344,
+      "loss": 3.2265,
+      "step": 77800
+    },
+    {
+      "epoch": 22.67699836867863,
+      "grad_norm": 0.3960643410682678,
+      "learning_rate": 0.000328071116292626,
+      "loss": 3.2172,
+      "step": 77850
+    },
+    {
+      "epoch": 22.6915637380564,
+      "grad_norm": 0.4430347979068756,
+      "learning_rate": 0.0003278962401632177,
+      "loss": 3.2157,
+      "step": 77900
+    },
+    {
+      "epoch": 22.706129107434165,
+      "grad_norm": 0.3996141254901886,
+      "learning_rate": 0.00032772136403380934,
+      "loss": 3.2088,
+      "step": 77950
+    },
+    {
+      "epoch": 22.720694476811932,
+      "grad_norm": 0.40546032786369324,
+      "learning_rate": 0.000327546487904401,
+      "loss": 3.2291,
+      "step": 78000
+    },
+    {
+      "epoch": 22.720694476811932,
+      "eval_accuracy": 0.3741603027183871,
+      "eval_loss": 3.538947820663452,
+      "eval_runtime": 182.8782,
+      "eval_samples_per_second": 91.017,
+      "eval_steps_per_second": 5.692,
+      "step": 78000
+    },
+    {
+      "epoch": 22.7352598461897,
+      "grad_norm": 0.41795864701271057,
+      "learning_rate": 0.0003273716117749927,
+      "loss": 3.2355,
+      "step": 78050
+    },
+    {
+      "epoch": 22.749825215567466,
+      "grad_norm": 0.3912990391254425,
+      "learning_rate": 0.00032719673564558435,
+      "loss": 3.2291,
+      "step": 78100
+    },
+    {
+      "epoch": 22.764390584945236,
+      "grad_norm": 0.40984243154525757,
+      "learning_rate": 0.00032702185951617605,
+      "loss": 3.2333,
+      "step": 78150
+    },
+    {
+      "epoch": 22.778955954323003,
+      "grad_norm": 0.4171477258205414,
+      "learning_rate": 0.0003268469833867677,
+      "loss": 3.2243,
+      "step": 78200
+    },
+    {
+      "epoch": 22.79352132370077,
+      "grad_norm": 0.4628252685070038,
+      "learning_rate": 0.00032667210725735937,
+      "loss": 3.2146,
+      "step": 78250
+    },
+    {
+      "epoch": 22.808086693078536,
+      "grad_norm": 0.4350678324699402,
+      "learning_rate": 0.000326497231127951,
+      "loss": 3.2147,
+      "step": 78300
+    },
+    {
+      "epoch": 22.822652062456303,
+      "grad_norm": 0.42457160353660583,
+      "learning_rate": 0.00032632235499854264,
+      "loss": 3.2326,
+      "step": 78350
+    },
+    {
+      "epoch": 22.83721743183407,
+      "grad_norm": 0.4339698255062103,
+      "learning_rate": 0.00032614747886913433,
+      "loss": 3.2335,
+      "step": 78400
+    },
+    {
+      "epoch": 22.85178280121184,
+      "grad_norm": 0.41745054721832275,
+      "learning_rate": 0.00032597260273972597,
+      "loss": 3.2318,
+      "step": 78450
+    },
+    {
+      "epoch": 22.866348170589607,
+      "grad_norm": 0.43340957164764404,
+      "learning_rate": 0.0003257977266103177,
+      "loss": 3.2356,
+      "step": 78500
+    },
+    {
+      "epoch": 22.880913539967374,
+      "grad_norm": 0.41358107328414917,
+      "learning_rate": 0.00032562285048090935,
+      "loss": 3.2305,
+      "step": 78550
+    },
+    {
+      "epoch": 22.89547890934514,
+      "grad_norm": 0.4206470549106598,
+      "learning_rate": 0.000325447974351501,
+      "loss": 3.2347,
+      "step": 78600
+    },
+    {
+      "epoch": 22.910044278722907,
+      "grad_norm": 0.4107297658920288,
+      "learning_rate": 0.0003252730982220927,
+      "loss": 3.2458,
+      "step": 78650
+    },
+    {
+      "epoch": 22.924609648100677,
+      "grad_norm": 0.4097016453742981,
+      "learning_rate": 0.0003250982220926843,
+      "loss": 3.2243,
+      "step": 78700
+    },
+    {
+      "epoch": 22.939175017478444,
+      "grad_norm": 0.4124782383441925,
+      "learning_rate": 0.000324923345963276,
+      "loss": 3.217,
+      "step": 78750
+    },
+    {
+      "epoch": 22.95374038685621,
+      "grad_norm": 0.43206652998924255,
+      "learning_rate": 0.00032474846983386764,
+      "loss": 3.2337,
+      "step": 78800
+    },
+    {
+      "epoch": 22.968305756233978,
+      "grad_norm": 0.38564333319664,
+      "learning_rate": 0.00032457359370445933,
+      "loss": 3.2259,
+      "step": 78850
+    },
+    {
+      "epoch": 22.982871125611744,
+      "grad_norm": 0.43358278274536133,
+      "learning_rate": 0.00032439871757505097,
+      "loss": 3.2241,
+      "step": 78900
+    },
+    {
+      "epoch": 22.997436494989515,
+      "grad_norm": 0.4222619831562042,
+      "learning_rate": 0.0003242238414456426,
+      "loss": 3.2384,
+      "step": 78950
+    },
+    {
+      "epoch": 23.01194360288977,
+      "grad_norm": 0.4120100736618042,
+      "learning_rate": 0.00032404896531623435,
+      "loss": 3.16,
+      "step": 79000
+    },
+    {
+      "epoch": 23.01194360288977,
+      "eval_accuracy": 0.3737924272199652,
+      "eval_loss": 3.5487711429595947,
+      "eval_runtime": 183.134,
+      "eval_samples_per_second": 90.89,
+      "eval_steps_per_second": 5.684,
+      "step": 79000
+    },
+    {
+      "epoch": 23.026508972267536,
+      "grad_norm": 0.41206803917884827,
+      "learning_rate": 0.000323874089186826,
+      "loss": 3.1426,
+      "step": 79050
+    },
+    {
+      "epoch": 23.041074341645306,
+      "grad_norm": 0.421497642993927,
+      "learning_rate": 0.0003236992130574177,
+      "loss": 3.1405,
+      "step": 79100
+    },
+    {
+      "epoch": 23.055639711023073,
+      "grad_norm": 0.41028302907943726,
+      "learning_rate": 0.0003235243369280093,
+      "loss": 3.1416,
+      "step": 79150
+    },
+    {
+      "epoch": 23.07020508040084,
+      "grad_norm": 0.423358291387558,
+      "learning_rate": 0.00032334946079860095,
+      "loss": 3.132,
+      "step": 79200
+    },
+    {
+      "epoch": 23.084770449778606,
+      "grad_norm": 0.42327702045440674,
+      "learning_rate": 0.00032317458466919264,
+      "loss": 3.1533,
+      "step": 79250
+    },
+    {
+      "epoch": 23.099335819156373,
+      "grad_norm": 0.4391932189464569,
+      "learning_rate": 0.0003229997085397843,
+      "loss": 3.1389,
+      "step": 79300
+    },
+    {
+      "epoch": 23.11390118853414,
+      "grad_norm": 0.39794474840164185,
+      "learning_rate": 0.00032282483241037596,
+      "loss": 3.1481,
+      "step": 79350
+    },
+    {
+      "epoch": 23.12846655791191,
+      "grad_norm": 0.4148556590080261,
+      "learning_rate": 0.0003226499562809676,
+      "loss": 3.151,
+      "step": 79400
+    },
+    {
+      "epoch": 23.143031927289677,
+      "grad_norm": 0.4218699038028717,
+      "learning_rate": 0.00032247508015155924,
+      "loss": 3.1488,
+      "step": 79450
+    },
+    {
+      "epoch": 23.157597296667443,
+      "grad_norm": 0.440711110830307,
+      "learning_rate": 0.000322300204022151,
+      "loss": 3.161,
+      "step": 79500
+    },
+    {
+      "epoch": 23.17216266604521,
+      "grad_norm": 0.41110166907310486,
+      "learning_rate": 0.0003221253278927426,
+      "loss": 3.1612,
+      "step": 79550
+    },
+    {
+      "epoch": 23.186728035422977,
+      "grad_norm": 0.4470248520374298,
+      "learning_rate": 0.0003219504517633343,
+      "loss": 3.1722,
+      "step": 79600
+    },
+    {
+      "epoch": 23.201293404800747,
+      "grad_norm": 0.4257759749889374,
+      "learning_rate": 0.00032177557563392594,
+      "loss": 3.154,
+      "step": 79650
+    },
+    {
+      "epoch": 23.215858774178514,
+      "grad_norm": 0.44018658995628357,
+      "learning_rate": 0.00032160069950451763,
+      "loss": 3.1628,
+      "step": 79700
+    },
+    {
+      "epoch": 23.23042414355628,
+      "grad_norm": 0.40629279613494873,
+      "learning_rate": 0.00032142582337510927,
+      "loss": 3.1663,
+      "step": 79750
+    },
+    {
+      "epoch": 23.244989512934048,
+      "grad_norm": 0.41147536039352417,
+      "learning_rate": 0.0003212509472457009,
+      "loss": 3.1745,
+      "step": 79800
+    },
+    {
+      "epoch": 23.259554882311814,
+      "grad_norm": 0.43514347076416016,
+      "learning_rate": 0.0003210760711162926,
+      "loss": 3.1809,
+      "step": 79850
+    },
+    {
+      "epoch": 23.27412025168958,
+      "grad_norm": 0.42042967677116394,
+      "learning_rate": 0.00032090119498688423,
+      "loss": 3.1808,
+      "step": 79900
+    },
+    {
+      "epoch": 23.28868562106735,
+      "grad_norm": 0.41074612736701965,
+      "learning_rate": 0.0003207263188574759,
+      "loss": 3.1626,
+      "step": 79950
+    },
+    {
+      "epoch": 23.303250990445118,
+      "grad_norm": 0.41137275099754333,
+      "learning_rate": 0.0003205514427280676,
+      "loss": 3.1854,
+      "step": 80000
+    },
+    {
+      "epoch": 23.303250990445118,
+      "eval_accuracy": 0.3741075139364148,
+      "eval_loss": 3.5451858043670654,
+      "eval_runtime": 183.2227,
+      "eval_samples_per_second": 90.846,
+      "eval_steps_per_second": 5.682,
+      "step": 80000
+    },
+    {
+      "epoch": 23.317816359822885,
+      "grad_norm": 0.4325380027294159,
+      "learning_rate": 0.00032037656659865925,
+      "loss": 3.1423,
+      "step": 80050
+    },
+    {
+      "epoch": 23.33238172920065,
+      "grad_norm": 0.4023299813270569,
+      "learning_rate": 0.00032020169046925094,
+      "loss": 3.1404,
+      "step": 80100
+    },
+    {
+      "epoch": 23.34694709857842,
+      "grad_norm": 0.46395108103752136,
+      "learning_rate": 0.0003200268143398426,
+      "loss": 3.1341,
+      "step": 80150
+    },
+    {
+      "epoch": 23.36151246795619,
+      "grad_norm": 0.41562238335609436,
+      "learning_rate": 0.00031985193821043427,
+      "loss": 3.137,
+      "step": 80200
+    },
+    {
+      "epoch": 23.376077837333956,
+      "grad_norm": 0.42964160442352295,
+      "learning_rate": 0.0003196770620810259,
+      "loss": 3.1355,
+      "step": 80250
+    },
+    {
+      "epoch": 23.390643206711722,
+      "grad_norm": 0.4194352328777313,
+      "learning_rate": 0.0003195021859516176,
+      "loss": 3.15,
+      "step": 80300
+    },
+    {
+      "epoch": 23.40520857608949,
+      "grad_norm": 0.41986915469169617,
+      "learning_rate": 0.00031932730982220923,
+      "loss": 3.1585,
+      "step": 80350
+    },
+    {
+      "epoch": 23.419773945467256,
+      "grad_norm": 0.4277133345603943,
+      "learning_rate": 0.00031915243369280087,
+      "loss": 3.1664,
+      "step": 80400
+    },
+    {
+      "epoch": 23.434339314845026,
+      "grad_norm": 0.4289986491203308,
+      "learning_rate": 0.00031897755756339256,
+      "loss": 3.1489,
+      "step": 80450
+    },
+    {
+      "epoch": 23.448904684222793,
+      "grad_norm": 0.4255027770996094,
+      "learning_rate": 0.0003188026814339842,
+      "loss": 3.1757,
+      "step": 80500
+    },
+    {
+      "epoch": 23.46347005360056,
+      "grad_norm": 0.43643444776535034,
+      "learning_rate": 0.00031862780530457594,
+      "loss": 3.1501,
+      "step": 80550
+    },
+    {
+      "epoch": 23.478035422978326,
+      "grad_norm": 0.4336095452308655,
+      "learning_rate": 0.0003184529291751676,
+      "loss": 3.1631,
+      "step": 80600
+    },
+    {
+      "epoch": 23.492600792356093,
+      "grad_norm": 0.40593820810317993,
+      "learning_rate": 0.0003182780530457592,
+      "loss": 3.1595,
+      "step": 80650
+    },
+    {
+      "epoch": 23.50716616173386,
+      "grad_norm": 0.4076375961303711,
+      "learning_rate": 0.0003181031769163509,
+      "loss": 3.1733,
+      "step": 80700
+    },
+    {
+      "epoch": 23.52173153111163,
+      "grad_norm": 0.4589744508266449,
+      "learning_rate": 0.00031792830078694254,
+      "loss": 3.1649,
+      "step": 80750
+    },
+    {
+      "epoch": 23.536296900489397,
+      "grad_norm": 0.47728487849235535,
+      "learning_rate": 0.0003177534246575342,
+      "loss": 3.1545,
+      "step": 80800
+    },
+    {
+      "epoch": 23.550862269867164,
+      "grad_norm": 0.4282548725605011,
+      "learning_rate": 0.00031757854852812586,
+      "loss": 3.1784,
+      "step": 80850
+    },
+    {
+      "epoch": 23.56542763924493,
+      "grad_norm": 0.40563443303108215,
+      "learning_rate": 0.00031740367239871755,
+      "loss": 3.18,
+      "step": 80900
+    },
+    {
+      "epoch": 23.579993008622697,
+      "grad_norm": 0.4412521421909332,
+      "learning_rate": 0.0003172287962693092,
+      "loss": 3.181,
+      "step": 80950
+    },
+    {
+      "epoch": 23.594558378000468,
+      "grad_norm": 0.45530885457992554,
+      "learning_rate": 0.0003170539201399008,
+      "loss": 3.1779,
+      "step": 81000
+    },
+    {
+      "epoch": 23.594558378000468,
+      "eval_accuracy": 0.37350226527362285,
+      "eval_loss": 3.5499300956726074,
+      "eval_runtime": 179.6391,
+      "eval_samples_per_second": 92.658,
+      "eval_steps_per_second": 5.795,
+      "step": 81000
+    },
+    {
+      "epoch": 23.609123747378234,
+      "grad_norm": 0.38101035356521606,
+      "learning_rate": 0.00031687904401049257,
+      "loss": 3.1674,
+      "step": 81050
+    },
+    {
+      "epoch": 23.623689116756,
+      "grad_norm": 0.476788729429245,
+      "learning_rate": 0.0003167041678810842,
+      "loss": 3.1721,
+      "step": 81100
+    },
+    {
+      "epoch": 23.638254486133768,
+      "grad_norm": 0.4510898292064667,
+      "learning_rate": 0.0003165292917516759,
+      "loss": 3.1877,
+      "step": 81150
+    },
+    {
+      "epoch": 23.652819855511535,
+      "grad_norm": 0.4234790802001953,
+      "learning_rate": 0.00031635441562226753,
+      "loss": 3.1894,
+      "step": 81200
+    },
+    {
+      "epoch": 23.667385224889305,
+      "grad_norm": 0.4436555504798889,
+      "learning_rate": 0.00031617953949285917,
+      "loss": 3.1929,
+      "step": 81250
+    },
+    {
+      "epoch": 23.68195059426707,
+      "grad_norm": 0.4603196978569031,
+      "learning_rate": 0.00031600466336345086,
+      "loss": 3.1905,
+      "step": 81300
+    },
+    {
+      "epoch": 23.69651596364484,
+      "grad_norm": 0.4264376163482666,
+      "learning_rate": 0.0003158297872340425,
+      "loss": 3.1853,
+      "step": 81350
+    },
+    {
+      "epoch": 23.711081333022605,
+      "grad_norm": 0.46008777618408203,
+      "learning_rate": 0.0003156549111046342,
+      "loss": 3.1832,
+      "step": 81400
+    },
+    {
+      "epoch": 23.725646702400372,
+      "grad_norm": 0.3979647159576416,
+      "learning_rate": 0.0003154800349752258,
+      "loss": 3.2036,
+      "step": 81450
+    },
+    {
+      "epoch": 23.74021207177814,
+      "grad_norm": 0.41289305686950684,
+      "learning_rate": 0.00031530515884581757,
+      "loss": 3.2056,
+      "step": 81500
+    },
+    {
+      "epoch": 23.75477744115591,
+      "grad_norm": 0.3942318260669708,
+      "learning_rate": 0.0003151302827164092,
+      "loss": 3.1917,
+      "step": 81550
+    },
+    {
+      "epoch": 23.769342810533676,
+      "grad_norm": 0.408755362033844,
+      "learning_rate": 0.00031495540658700084,
+      "loss": 3.1889,
+      "step": 81600
+    },
+    {
+      "epoch": 23.783908179911442,
+      "grad_norm": 0.41167670488357544,
+      "learning_rate": 0.00031478053045759253,
+      "loss": 3.1915,
+      "step": 81650
+    },
+    {
+      "epoch": 23.79847354928921,
+      "grad_norm": 0.4455852508544922,
+      "learning_rate": 0.00031460565432818417,
+      "loss": 3.1898,
+      "step": 81700
+    },
+    {
+      "epoch": 23.813038918666976,
+      "grad_norm": 0.42409783601760864,
+      "learning_rate": 0.00031443077819877586,
+      "loss": 3.1875,
+      "step": 81750
+    },
+    {
+      "epoch": 23.827604288044746,
+      "grad_norm": 0.41861656308174133,
+      "learning_rate": 0.0003142559020693675,
+      "loss": 3.2021,
+      "step": 81800
+    },
+    {
+      "epoch": 23.842169657422513,
+      "grad_norm": 0.44218793511390686,
+      "learning_rate": 0.00031408102593995913,
+      "loss": 3.1987,
+      "step": 81850
+    },
+    {
+      "epoch": 23.85673502680028,
+      "grad_norm": 0.43261197209358215,
+      "learning_rate": 0.0003139061498105508,
+      "loss": 3.2028,
+      "step": 81900
+    },
+    {
+      "epoch": 23.871300396178047,
+      "grad_norm": 0.4348728656768799,
+      "learning_rate": 0.00031373127368114245,
+      "loss": 3.1951,
+      "step": 81950
+    },
+    {
+      "epoch": 23.885865765555813,
+      "grad_norm": 0.3952457010746002,
+      "learning_rate": 0.0003135563975517342,
+      "loss": 3.2077,
+      "step": 82000
+    },
+    {
+      "epoch": 23.885865765555813,
+      "eval_accuracy": 0.37416606363223265,
+      "eval_loss": 3.5419678688049316,
+      "eval_runtime": 179.7187,
+      "eval_samples_per_second": 92.617,
+      "eval_steps_per_second": 5.792,
+      "step": 82000
+    },
+    {
+      "epoch": 23.900431134933584,
+      "grad_norm": 0.4126368761062622,
+      "learning_rate": 0.00031338152142232584,
+      "loss": 3.2086,
+      "step": 82050
+    },
+    {
+      "epoch": 23.91499650431135,
+      "grad_norm": 0.42765992879867554,
+      "learning_rate": 0.00031320664529291747,
+      "loss": 3.2027,
+      "step": 82100
+    },
+    {
+      "epoch": 23.929561873689117,
+      "grad_norm": 0.44232431054115295,
+      "learning_rate": 0.00031303176916350916,
+      "loss": 3.2065,
+      "step": 82150
+    },
+    {
+      "epoch": 23.944127243066884,
+      "grad_norm": 0.41848278045654297,
+      "learning_rate": 0.0003128568930341008,
+      "loss": 3.1995,
+      "step": 82200
+    },
+    {
+      "epoch": 23.95869261244465,
+      "grad_norm": 0.431692898273468,
+      "learning_rate": 0.0003126820169046925,
+      "loss": 3.2061,
+      "step": 82250
+    },
+    {
+      "epoch": 23.973257981822417,
+      "grad_norm": 0.44336676597595215,
+      "learning_rate": 0.0003125071407752841,
+      "loss": 3.2129,
+      "step": 82300
+    },
+    {
+      "epoch": 23.987823351200188,
+      "grad_norm": 0.42227089405059814,
+      "learning_rate": 0.0003123322646458758,
+      "loss": 3.1983,
+      "step": 82350
+    },
+    {
+      "epoch": 24.002621766487998,
+      "grad_norm": 0.42413151264190674,
+      "learning_rate": 0.00031215738851646745,
+      "loss": 3.2515,
+      "step": 82400
+    },
+    {
+      "epoch": 24.017187135865765,
+      "grad_norm": 0.42558392882347107,
+      "learning_rate": 0.0003119825123870591,
+      "loss": 3.1148,
+      "step": 82450
+    },
+    {
+      "epoch": 24.03175250524353,
+      "grad_norm": 0.4119874835014343,
+      "learning_rate": 0.00031180763625765083,
+      "loss": 3.1196,
+      "step": 82500
+    },
+    {
+      "epoch": 24.0463178746213,
+      "grad_norm": 0.4366919994354248,
+      "learning_rate": 0.00031163276012824247,
+      "loss": 3.1312,
+      "step": 82550
+    },
+    {
+      "epoch": 24.06088324399907,
+      "grad_norm": 0.4563874304294586,
+      "learning_rate": 0.00031145788399883416,
+      "loss": 3.1297,
+      "step": 82600
+    },
+    {
+      "epoch": 24.075448613376835,
+      "grad_norm": 0.4409359395503998,
+      "learning_rate": 0.0003112830078694258,
+      "loss": 3.138,
+      "step": 82650
+    },
+    {
+      "epoch": 24.090013982754602,
+      "grad_norm": 0.419706255197525,
+      "learning_rate": 0.00031110813174001743,
+      "loss": 3.143,
+      "step": 82700
+    },
+    {
+      "epoch": 24.10457935213237,
+      "grad_norm": 0.41549739241600037,
+      "learning_rate": 0.0003109332556106091,
+      "loss": 3.1495,
+      "step": 82750
+    },
+    {
+      "epoch": 24.11914472151014,
+      "grad_norm": 0.4378606379032135,
+      "learning_rate": 0.00031075837948120076,
+      "loss": 3.1369,
+      "step": 82800
+    },
+    {
+      "epoch": 24.133710090887906,
+      "grad_norm": 0.45427650213241577,
+      "learning_rate": 0.00031058350335179245,
+      "loss": 3.159,
+      "step": 82850
+    },
+    {
+      "epoch": 24.148275460265673,
+      "grad_norm": 0.41194722056388855,
+      "learning_rate": 0.0003104086272223841,
+      "loss": 3.147,
+      "step": 82900
+    },
+    {
+      "epoch": 24.16284082964344,
+      "grad_norm": 0.4255869388580322,
+      "learning_rate": 0.00031023375109297583,
+      "loss": 3.1491,
+      "step": 82950
+    },
+    {
+      "epoch": 24.177406199021206,
+      "grad_norm": 0.4222257435321808,
+      "learning_rate": 0.00031005887496356746,
+      "loss": 3.1569,
+      "step": 83000
+    },
+    {
+      "epoch": 24.177406199021206,
+      "eval_accuracy": 0.3738667312516056,
+      "eval_loss": 3.5533342361450195,
+      "eval_runtime": 179.7382,
+      "eval_samples_per_second": 92.607,
+      "eval_steps_per_second": 5.792,
+      "step": 83000
+    },
+    {
+      "epoch": 24.191971568398973,
+      "grad_norm": 0.42919695377349854,
+      "learning_rate": 0.0003098839988341591,
+      "loss": 3.1545,
+      "step": 83050
+    },
+    {
+      "epoch": 24.206536937776743,
+      "grad_norm": 0.4294886291027069,
+      "learning_rate": 0.0003097091227047508,
+      "loss": 3.1609,
+      "step": 83100
+    },
+    {
+      "epoch": 24.22110230715451,
+      "grad_norm": 0.4393008053302765,
+      "learning_rate": 0.00030953424657534243,
+      "loss": 3.1681,
+      "step": 83150
+    },
+    {
+      "epoch": 24.235667676532277,
+      "grad_norm": 0.43206796050071716,
+      "learning_rate": 0.0003093593704459341,
+      "loss": 3.175,
+      "step": 83200
+    },
+    {
+      "epoch": 24.250233045910043,
+      "grad_norm": 0.42569947242736816,
+      "learning_rate": 0.00030918449431652575,
+      "loss": 3.1698,
+      "step": 83250
+    },
+    {
+      "epoch": 24.26479841528781,
+      "grad_norm": 0.4225282669067383,
+      "learning_rate": 0.0003090096181871174,
+      "loss": 3.1676,
+      "step": 83300
+    },
+    {
+      "epoch": 24.27936378466558,
+      "grad_norm": 0.43696901202201843,
+      "learning_rate": 0.0003088347420577091,
+      "loss": 3.168,
+      "step": 83350
+    },
+    {
+      "epoch": 24.293929154043347,
+      "grad_norm": 0.43535012006759644,
+      "learning_rate": 0.0003086598659283007,
+      "loss": 3.1767,
+      "step": 83400
+    },
+    {
+      "epoch": 24.308494523421114,
+      "grad_norm": 0.42330238223075867,
+      "learning_rate": 0.00030848498979889246,
+      "loss": 3.1706,
+      "step": 83450
+    },
+    {
+      "epoch": 24.32305989279888,
+      "grad_norm": 0.4490221440792084,
+      "learning_rate": 0.0003083101136694841,
+      "loss": 3.1757,
+      "step": 83500
+    },
+    {
+      "epoch": 24.337625262176648,
+      "grad_norm": 0.4481862783432007,
+      "learning_rate": 0.0003081352375400758,
+      "loss": 3.1745,
+      "step": 83550
+    },
+    {
+      "epoch": 24.352190631554418,
+      "grad_norm": 0.4269773066043854,
+      "learning_rate": 0.0003079603614106674,
+      "loss": 3.1707,
+      "step": 83600
+    },
+    {
+      "epoch": 24.366756000932185,
+      "grad_norm": 0.42694324254989624,
+      "learning_rate": 0.00030778548528125906,
+      "loss": 3.1766,
+      "step": 83650
+    },
+    {
+      "epoch": 24.38132137030995,
+      "grad_norm": 0.41363558173179626,
+      "learning_rate": 0.00030761060915185075,
+      "loss": 3.1951,
+      "step": 83700
+    },
+    {
+      "epoch": 24.395886739687718,
+      "grad_norm": 0.39224985241889954,
+      "learning_rate": 0.0003074357330224424,
+      "loss": 3.1796,
+      "step": 83750
+    },
+    {
+      "epoch": 24.410452109065485,
+      "grad_norm": 0.44722500443458557,
+      "learning_rate": 0.0003072608568930341,
+      "loss": 3.1833,
+      "step": 83800
+    },
+    {
+      "epoch": 24.42501747844325,
+      "grad_norm": 0.44555240869522095,
+      "learning_rate": 0.0003070859807636257,
+      "loss": 3.1924,
+      "step": 83850
+    },
+    {
+      "epoch": 24.439582847821022,
+      "grad_norm": 0.42261767387390137,
+      "learning_rate": 0.00030691110463421735,
+      "loss": 3.1853,
+      "step": 83900
+    },
+    {
+      "epoch": 24.45414821719879,
+      "grad_norm": 0.4288679361343384,
+      "learning_rate": 0.0003067362285048091,
+      "loss": 3.1812,
+      "step": 83950
+    },
+    {
+      "epoch": 24.468713586576555,
+      "grad_norm": 0.41109663248062134,
+      "learning_rate": 0.00030656135237540073,
+      "loss": 3.182,
+      "step": 84000
+    },
+    {
+      "epoch": 24.468713586576555,
+      "eval_accuracy": 0.37449290731571394,
+      "eval_loss": 3.5434653759002686,
+      "eval_runtime": 179.7152,
+      "eval_samples_per_second": 92.619,
+      "eval_steps_per_second": 5.792,
+      "step": 84000
+    },
+    {
+      "epoch": 24.483278955954322,
+      "grad_norm": 0.4189678430557251,
+      "learning_rate": 0.0003063864762459924,
+      "loss": 3.1902,
+      "step": 84050
+    },
+    {
+      "epoch": 24.49784432533209,
+      "grad_norm": 0.42589837312698364,
+      "learning_rate": 0.00030621160011658406,
+      "loss": 3.1965,
+      "step": 84100
+    },
+    {
+      "epoch": 24.51240969470986,
+      "grad_norm": 0.4201832115650177,
+      "learning_rate": 0.0003060367239871757,
+      "loss": 3.1925,
+      "step": 84150
+    },
+    {
+      "epoch": 24.526975064087626,
+      "grad_norm": 0.4573235809803009,
+      "learning_rate": 0.0003058618478577674,
+      "loss": 3.1754,
+      "step": 84200
+    },
+    {
+      "epoch": 24.541540433465393,
+      "grad_norm": 0.44319775700569153,
+      "learning_rate": 0.000305686971728359,
+      "loss": 3.1957,
+      "step": 84250
+    },
+    {
+      "epoch": 24.55610580284316,
+      "grad_norm": 0.43211740255355835,
+      "learning_rate": 0.0003055120955989507,
+      "loss": 3.1839,
+      "step": 84300
+    },
+    {
+      "epoch": 24.570671172220926,
+      "grad_norm": 0.4124164283275604,
+      "learning_rate": 0.00030533721946954235,
+      "loss": 3.1993,
+      "step": 84350
+    },
+    {
+      "epoch": 24.585236541598697,
+      "grad_norm": 0.44055572152137756,
+      "learning_rate": 0.0003051623433401341,
+      "loss": 3.1992,
+      "step": 84400
+    },
+    {
+      "epoch": 24.599801910976463,
+      "grad_norm": 0.39688199758529663,
+      "learning_rate": 0.00030498746721072573,
+      "loss": 3.1891,
+      "step": 84450
+    },
+    {
+      "epoch": 24.61436728035423,
+      "grad_norm": 0.4425677955150604,
+      "learning_rate": 0.00030481259108131736,
+      "loss": 3.2038,
+      "step": 84500
+    },
+    {
+      "epoch": 24.628932649731997,
+      "grad_norm": 0.4194990396499634,
+      "learning_rate": 0.00030463771495190905,
+      "loss": 3.1793,
+      "step": 84550
+    },
+    {
+      "epoch": 24.643498019109764,
+      "grad_norm": 0.4422365427017212,
+      "learning_rate": 0.0003044628388225007,
+      "loss": 3.1917,
+      "step": 84600
+    },
+    {
+      "epoch": 24.65806338848753,
+      "grad_norm": 0.43441274762153625,
+      "learning_rate": 0.0003042879626930924,
+      "loss": 3.205,
+      "step": 84650
+    },
+    {
+      "epoch": 24.6726287578653,
+      "grad_norm": 0.41350895166397095,
+      "learning_rate": 0.000304113086563684,
+      "loss": 3.2026,
+      "step": 84700
+    },
+    {
+      "epoch": 24.687194127243067,
+      "grad_norm": 0.45683303475379944,
+      "learning_rate": 0.00030393821043427565,
+      "loss": 3.2147,
+      "step": 84750
+    },
+    {
+      "epoch": 24.701759496620834,
+      "grad_norm": 0.4254341721534729,
+      "learning_rate": 0.00030376333430486734,
+      "loss": 3.1934,
+      "step": 84800
+    },
+    {
+      "epoch": 24.7163248659986,
+      "grad_norm": 0.446032851934433,
+      "learning_rate": 0.000303588458175459,
+      "loss": 3.1964,
+      "step": 84850
+    },
+    {
+      "epoch": 24.730890235376368,
+      "grad_norm": 0.4153694808483124,
+      "learning_rate": 0.0003034135820460507,
+      "loss": 3.2111,
+      "step": 84900
+    },
+    {
+      "epoch": 24.745455604754138,
+      "grad_norm": 0.4243052303791046,
+      "learning_rate": 0.00030323870591664236,
+      "loss": 3.2088,
+      "step": 84950
+    },
+    {
+      "epoch": 24.760020974131905,
+      "grad_norm": 0.4362107515335083,
+      "learning_rate": 0.00030306382978723405,
+      "loss": 3.2022,
+      "step": 85000
+    },
+    {
+      "epoch": 24.760020974131905,
+      "eval_accuracy": 0.3745567476466961,
+      "eval_loss": 3.536757707595825,
+      "eval_runtime": 179.8297,
+      "eval_samples_per_second": 92.56,
+      "eval_steps_per_second": 5.789,
+      "step": 85000
+    },
+    {
+      "epoch": 24.77458634350967,
+      "grad_norm": 0.4239880442619324,
+      "learning_rate": 0.0003028889536578257,
+      "loss": 3.194,
+      "step": 85050
+    },
+    {
+      "epoch": 24.78915171288744,
+      "grad_norm": 0.42055609822273254,
+      "learning_rate": 0.0003027140775284173,
+      "loss": 3.2129,
+      "step": 85100
+    },
+    {
+      "epoch": 24.803717082265205,
+      "grad_norm": 0.4157819449901581,
+      "learning_rate": 0.000302539201399009,
+      "loss": 3.1931,
+      "step": 85150
+    },
+    {
+      "epoch": 24.818282451642972,
+      "grad_norm": 0.38243451714515686,
+      "learning_rate": 0.00030236432526960065,
+      "loss": 3.2069,
+      "step": 85200
+    },
+    {
+      "epoch": 24.832847821020742,
+      "grad_norm": 0.4196301996707916,
+      "learning_rate": 0.00030218944914019234,
+      "loss": 3.2226,
+      "step": 85250
+    },
+    {
+      "epoch": 24.84741319039851,
+      "grad_norm": 0.43389689922332764,
+      "learning_rate": 0.000302014573010784,
+      "loss": 3.2144,
+      "step": 85300
+    },
+    {
+      "epoch": 24.861978559776276,
+      "grad_norm": 0.42272713780403137,
+      "learning_rate": 0.0003018396968813756,
+      "loss": 3.2223,
+      "step": 85350
+    },
+    {
+      "epoch": 24.876543929154042,
+      "grad_norm": 0.4422394335269928,
+      "learning_rate": 0.00030166482075196736,
+      "loss": 3.208,
+      "step": 85400
+    },
+    {
+      "epoch": 24.89110929853181,
+      "grad_norm": 0.4388560652732849,
+      "learning_rate": 0.000301489944622559,
+      "loss": 3.2095,
+      "step": 85450
+    },
+    {
+      "epoch": 24.90567466790958,
+      "grad_norm": 0.39322882890701294,
+      "learning_rate": 0.0003013150684931507,
+      "loss": 3.2127,
+      "step": 85500
+    },
+    {
+      "epoch": 24.920240037287346,
+      "grad_norm": 0.45943403244018555,
+      "learning_rate": 0.0003011401923637423,
+      "loss": 3.2097,
+      "step": 85550
+    },
+    {
+      "epoch": 24.934805406665113,
+      "grad_norm": 0.4043842852115631,
+      "learning_rate": 0.000300965316234334,
+      "loss": 3.2095,
+      "step": 85600
+    },
+    {
+      "epoch": 24.94937077604288,
+      "grad_norm": 0.41772520542144775,
+      "learning_rate": 0.00030079044010492565,
+      "loss": 3.2133,
+      "step": 85650
+    },
+    {
+      "epoch": 24.963936145420647,
+      "grad_norm": 0.44280874729156494,
+      "learning_rate": 0.0003006155639755173,
+      "loss": 3.2232,
+      "step": 85700
+    },
+    {
+      "epoch": 24.978501514798417,
+      "grad_norm": 0.4212973117828369,
+      "learning_rate": 0.00030044068784610897,
+      "loss": 3.2087,
+      "step": 85750
+    },
+    {
+      "epoch": 24.993066884176184,
+      "grad_norm": 0.4439827501773834,
+      "learning_rate": 0.0003002658117167006,
+      "loss": 3.2043,
+      "step": 85800
+    },
+    {
+      "epoch": 25.007573992076438,
+      "grad_norm": 0.41100841760635376,
+      "learning_rate": 0.00030009093558729235,
+      "loss": 3.1611,
+      "step": 85850
+    },
+    {
+      "epoch": 25.022139361454208,
+      "grad_norm": 0.40830469131469727,
+      "learning_rate": 0.000299916059457884,
+      "loss": 3.1028,
+      "step": 85900
+    },
+    {
+      "epoch": 25.036704730831975,
+      "grad_norm": 0.43471017479896545,
+      "learning_rate": 0.0002997411833284756,
+      "loss": 3.112,
+      "step": 85950
+    },
+    {
+      "epoch": 25.05127010020974,
+      "grad_norm": 0.4032357931137085,
+      "learning_rate": 0.0002995663071990673,
+      "loss": 3.1186,
+      "step": 86000
+    },
+    {
+      "epoch": 25.05127010020974,
+      "eval_accuracy": 0.37411198158388687,
+      "eval_loss": 3.550753116607666,
+      "eval_runtime": 179.7226,
+      "eval_samples_per_second": 92.615,
+      "eval_steps_per_second": 5.792,
+      "step": 86000
+    },
+    {
+      "epoch": 25.06583546958751,
+      "grad_norm": 0.41526585817337036,
+      "learning_rate": 0.00029939143106965895,
+      "loss": 3.127,
+      "step": 86050
+    },
+    {
+      "epoch": 25.080400838965275,
+      "grad_norm": 0.433913916349411,
+      "learning_rate": 0.00029921655494025064,
+      "loss": 3.1225,
+      "step": 86100
+    },
+    {
+      "epoch": 25.094966208343042,
+      "grad_norm": 0.4391394257545471,
+      "learning_rate": 0.0002990416788108423,
+      "loss": 3.1331,
+      "step": 86150
+    },
+    {
+      "epoch": 25.109531577720812,
+      "grad_norm": 0.4312891364097595,
+      "learning_rate": 0.00029886680268143397,
+      "loss": 3.1245,
+      "step": 86200
+    },
+    {
+      "epoch": 25.12409694709858,
+      "grad_norm": 0.4313848316669464,
+      "learning_rate": 0.0002986919265520256,
+      "loss": 3.1307,
+      "step": 86250
+    },
+    {
+      "epoch": 25.138662316476346,
+      "grad_norm": 0.4232284724712372,
+      "learning_rate": 0.0002985170504226173,
+      "loss": 3.1399,
+      "step": 86300
+    },
+    {
+      "epoch": 25.153227685854112,
+      "grad_norm": 0.46254777908325195,
+      "learning_rate": 0.00029834217429320893,
+      "loss": 3.146,
+      "step": 86350
+    },
+    {
+      "epoch": 25.16779305523188,
+      "grad_norm": 0.42061716318130493,
+      "learning_rate": 0.0002981672981638006,
+      "loss": 3.1409,
+      "step": 86400
+    },
+    {
+      "epoch": 25.18235842460965,
+      "grad_norm": 0.42561444640159607,
+      "learning_rate": 0.00029799242203439226,
+      "loss": 3.1381,
+      "step": 86450
+    },
+    {
+      "epoch": 25.196923793987416,
+      "grad_norm": 0.43058279156684875,
+      "learning_rate": 0.00029781754590498395,
+      "loss": 3.1516,
+      "step": 86500
+    },
+    {
+      "epoch": 25.211489163365183,
+      "grad_norm": 0.4757503867149353,
+      "learning_rate": 0.00029764266977557564,
+      "loss": 3.1511,
+      "step": 86550
+    },
+    {
+      "epoch": 25.22605453274295,
+      "grad_norm": 0.44378894567489624,
+      "learning_rate": 0.0002974677936461673,
+      "loss": 3.141,
+      "step": 86600
+    },
+    {
+      "epoch": 25.240619902120716,
+      "grad_norm": 0.4458007216453552,
+      "learning_rate": 0.0002972929175167589,
+      "loss": 3.1534,
+      "step": 86650
+    },
+    {
+      "epoch": 25.255185271498487,
+      "grad_norm": 0.43031567335128784,
+      "learning_rate": 0.0002971180413873506,
+      "loss": 3.1493,
+      "step": 86700
+    },
+    {
+      "epoch": 25.269750640876254,
+      "grad_norm": 0.4496525824069977,
+      "learning_rate": 0.00029694316525794224,
+      "loss": 3.1595,
+      "step": 86750
+    },
+    {
+      "epoch": 25.28431601025402,
+      "grad_norm": 0.45122572779655457,
+      "learning_rate": 0.00029676828912853393,
+      "loss": 3.1619,
+      "step": 86800
+    },
+    {
+      "epoch": 25.298881379631787,
+      "grad_norm": 0.4212110638618469,
+      "learning_rate": 0.0002965934129991256,
+      "loss": 3.1619,
+      "step": 86850
+    },
+    {
+      "epoch": 25.313446749009554,
+      "grad_norm": 0.4641132354736328,
+      "learning_rate": 0.00029641853686971726,
+      "loss": 3.1727,
+      "step": 86900
+    },
+    {
+      "epoch": 25.32801211838732,
+      "grad_norm": 0.45251330733299255,
+      "learning_rate": 0.0002962436607403089,
+      "loss": 3.169,
+      "step": 86950
+    },
+    {
+      "epoch": 25.34257748776509,
+      "grad_norm": 0.46525469422340393,
+      "learning_rate": 0.0002960687846109006,
+      "loss": 3.1696,
+      "step": 87000
+    },
+    {
+      "epoch": 25.34257748776509,
+      "eval_accuracy": 0.3741974547342073,
+      "eval_loss": 3.545832872390747,
+      "eval_runtime": 179.8303,
+      "eval_samples_per_second": 92.559,
+      "eval_steps_per_second": 5.789,
+      "step": 87000
+    },
+    {
+      "epoch": 25.357142857142858,
+      "grad_norm": 0.4239409267902374,
+      "learning_rate": 0.00029589390848149227,
+      "loss": 3.168,
+      "step": 87050
+    },
+    {
+      "epoch": 25.371708226520624,
+      "grad_norm": 0.46311140060424805,
+      "learning_rate": 0.0002957190323520839,
+      "loss": 3.1748,
+      "step": 87100
+    },
+    {
+      "epoch": 25.38627359589839,
+      "grad_norm": 0.4287487268447876,
+      "learning_rate": 0.0002955441562226756,
+      "loss": 3.1698,
+      "step": 87150
+    },
+    {
+      "epoch": 25.400838965276158,
+      "grad_norm": 0.4293224811553955,
+      "learning_rate": 0.00029536928009326723,
+      "loss": 3.1713,
+      "step": 87200
+    },
+    {
+      "epoch": 25.41540433465393,
+      "grad_norm": 0.4797385632991791,
+      "learning_rate": 0.00029519440396385887,
+      "loss": 3.178,
+      "step": 87250
+    },
+    {
+      "epoch": 25.429969704031695,
+      "grad_norm": 0.4255479872226715,
+      "learning_rate": 0.00029501952783445056,
+      "loss": 3.1895,
+      "step": 87300
+    },
+    {
+      "epoch": 25.44453507340946,
+      "grad_norm": 0.44172418117523193,
+      "learning_rate": 0.00029484465170504225,
+      "loss": 3.1718,
+      "step": 87350
+    },
+    {
+      "epoch": 25.45910044278723,
+      "grad_norm": 0.46718594431877136,
+      "learning_rate": 0.0002946697755756339,
+      "loss": 3.1744,
+      "step": 87400
+    },
+    {
+      "epoch": 25.473665812164995,
+      "grad_norm": 0.45305508375167847,
+      "learning_rate": 0.0002944948994462256,
+      "loss": 3.1695,
+      "step": 87450
+    },
+    {
+      "epoch": 25.488231181542766,
+      "grad_norm": 0.4728144109249115,
+      "learning_rate": 0.00029432002331681727,
+      "loss": 3.1797,
+      "step": 87500
+    },
+    {
+      "epoch": 25.502796550920532,
+      "grad_norm": 0.42346832156181335,
+      "learning_rate": 0.0002941451471874089,
+      "loss": 3.1763,
+      "step": 87550
+    },
+    {
+      "epoch": 25.5173619202983,
+      "grad_norm": 0.4842385947704315,
+      "learning_rate": 0.00029397027105800054,
+      "loss": 3.1913,
+      "step": 87600
+    },
+    {
+      "epoch": 25.531927289676066,
+      "grad_norm": 0.41806545853614807,
+      "learning_rate": 0.00029379539492859223,
+      "loss": 3.1802,
+      "step": 87650
+    },
+    {
+      "epoch": 25.546492659053833,
+      "grad_norm": 0.4152505397796631,
+      "learning_rate": 0.00029362051879918387,
+      "loss": 3.189,
+      "step": 87700
+    },
+    {
+      "epoch": 25.5610580284316,
+      "grad_norm": 0.47798603773117065,
+      "learning_rate": 0.00029344564266977556,
+      "loss": 3.195,
+      "step": 87750
+    },
+    {
+      "epoch": 25.57562339780937,
+      "grad_norm": 0.4499337673187256,
+      "learning_rate": 0.00029327076654036725,
+      "loss": 3.1853,
+      "step": 87800
+    },
+    {
+      "epoch": 25.590188767187136,
+      "grad_norm": 0.4044957458972931,
+      "learning_rate": 0.0002930958904109589,
+      "loss": 3.1807,
+      "step": 87850
+    },
+    {
+      "epoch": 25.604754136564903,
+      "grad_norm": 0.4229501485824585,
+      "learning_rate": 0.0002929210142815505,
+      "loss": 3.1816,
+      "step": 87900
+    },
+    {
+      "epoch": 25.61931950594267,
+      "grad_norm": 0.4179123342037201,
+      "learning_rate": 0.0002927461381521422,
+      "loss": 3.1858,
+      "step": 87950
+    },
+    {
+      "epoch": 25.633884875320437,
+      "grad_norm": 0.4255021810531616,
+      "learning_rate": 0.0002925712620227339,
+      "loss": 3.1863,
+      "step": 88000
+    },
+    {
+      "epoch": 25.633884875320437,
+      "eval_accuracy": 0.37432819220759983,
+      "eval_loss": 3.543217658996582,
+      "eval_runtime": 179.9205,
+      "eval_samples_per_second": 92.513,
+      "eval_steps_per_second": 5.786,
+      "step": 88000
+    },
+    {
+      "epoch": 25.648450244698207,
+      "grad_norm": 0.41656792163848877,
+      "learning_rate": 0.00029239638589332554,
+      "loss": 3.1984,
+      "step": 88050
+    },
+    {
+      "epoch": 25.663015614075974,
+      "grad_norm": 0.43181586265563965,
+      "learning_rate": 0.0002922215097639172,
+      "loss": 3.1834,
+      "step": 88100
+    },
+    {
+      "epoch": 25.67758098345374,
+      "grad_norm": 0.42854011058807373,
+      "learning_rate": 0.00029204663363450886,
+      "loss": 3.1853,
+      "step": 88150
+    },
+    {
+      "epoch": 25.692146352831507,
+      "grad_norm": 0.4229859709739685,
+      "learning_rate": 0.0002918717575051005,
+      "loss": 3.1944,
+      "step": 88200
+    },
+    {
+      "epoch": 25.706711722209274,
+      "grad_norm": 0.4565353989601135,
+      "learning_rate": 0.0002916968813756922,
+      "loss": 3.1926,
+      "step": 88250
+    },
+    {
+      "epoch": 25.721277091587044,
+      "grad_norm": 0.4147500991821289,
+      "learning_rate": 0.0002915220052462839,
+      "loss": 3.1754,
+      "step": 88300
+    },
+    {
+      "epoch": 25.73584246096481,
+      "grad_norm": 0.4348998963832855,
+      "learning_rate": 0.0002913471291168755,
+      "loss": 3.1911,
+      "step": 88350
+    },
+    {
+      "epoch": 25.750407830342578,
+      "grad_norm": 0.43094000220298767,
+      "learning_rate": 0.00029117225298746715,
+      "loss": 3.203,
+      "step": 88400
+    },
+    {
+      "epoch": 25.764973199720345,
+      "grad_norm": 0.467695415019989,
+      "learning_rate": 0.00029099737685805884,
+      "loss": 3.1814,
+      "step": 88450
+    },
+    {
+      "epoch": 25.77953856909811,
+      "grad_norm": 0.4421153962612152,
+      "learning_rate": 0.00029082250072865053,
+      "loss": 3.2027,
+      "step": 88500
+    },
+    {
+      "epoch": 25.794103938475878,
+      "grad_norm": 0.4013921916484833,
+      "learning_rate": 0.00029064762459924217,
+      "loss": 3.1853,
+      "step": 88550
+    },
+    {
+      "epoch": 25.80866930785365,
+      "grad_norm": 0.41378721594810486,
+      "learning_rate": 0.00029047274846983386,
+      "loss": 3.1945,
+      "step": 88600
+    },
+    {
+      "epoch": 25.823234677231415,
+      "grad_norm": 0.4195496141910553,
+      "learning_rate": 0.0002902978723404255,
+      "loss": 3.1907,
+      "step": 88650
+    },
+    {
+      "epoch": 25.837800046609182,
+      "grad_norm": 0.4184630811214447,
+      "learning_rate": 0.00029012299621101713,
+      "loss": 3.19,
+      "step": 88700
+    },
+    {
+      "epoch": 25.85236541598695,
+      "grad_norm": 0.44368013739585876,
+      "learning_rate": 0.0002899481200816088,
+      "loss": 3.2054,
+      "step": 88750
+    },
+    {
+      "epoch": 25.866930785364715,
+      "grad_norm": 0.41648930311203003,
+      "learning_rate": 0.0002897732439522005,
+      "loss": 3.19,
+      "step": 88800
+    },
+    {
+      "epoch": 25.881496154742486,
+      "grad_norm": 0.4401901662349701,
+      "learning_rate": 0.00028959836782279215,
+      "loss": 3.2152,
+      "step": 88850
+    },
+    {
+      "epoch": 25.896061524120253,
+      "grad_norm": 0.421739786863327,
+      "learning_rate": 0.00028942349169338384,
+      "loss": 3.1994,
+      "step": 88900
+    },
+    {
+      "epoch": 25.91062689349802,
+      "grad_norm": 0.45117563009262085,
+      "learning_rate": 0.00028924861556397553,
+      "loss": 3.1998,
+      "step": 88950
+    },
+    {
+      "epoch": 25.925192262875786,
+      "grad_norm": 0.4407546818256378,
+      "learning_rate": 0.00028907373943456717,
+      "loss": 3.1981,
+      "step": 89000
+    },
+    {
+      "epoch": 25.925192262875786,
+      "eval_accuracy": 0.37512190505190995,
+      "eval_loss": 3.5322093963623047,
+      "eval_runtime": 179.7889,
+      "eval_samples_per_second": 92.581,
+      "eval_steps_per_second": 5.79,
+      "step": 89000
+    },
+    {
+      "epoch": 25.939757632253553,
+      "grad_norm": 0.42473137378692627,
+      "learning_rate": 0.0002888988633051588,
+      "loss": 3.1987,
+      "step": 89050
+    },
+    {
+      "epoch": 25.954323001631323,
+      "grad_norm": 0.42266032099723816,
+      "learning_rate": 0.0002887239871757505,
+      "loss": 3.2087,
+      "step": 89100
+    },
+    {
+      "epoch": 25.96888837100909,
+      "grad_norm": 0.43083688616752625,
+      "learning_rate": 0.00028854911104634213,
+      "loss": 3.1976,
+      "step": 89150
+    },
+    {
+      "epoch": 25.983453740386857,
+      "grad_norm": 0.4433348476886749,
+      "learning_rate": 0.0002883742349169338,
+      "loss": 3.2047,
+      "step": 89200
+    },
+    {
+      "epoch": 25.998019109764623,
+      "grad_norm": 0.40691253542900085,
+      "learning_rate": 0.0002881993587875255,
+      "loss": 3.2031,
+      "step": 89250
+    },
+    {
+      "epoch": 26.01252621766488,
+      "grad_norm": 0.470239520072937,
+      "learning_rate": 0.00028802448265811715,
+      "loss": 3.1109,
+      "step": 89300
+    },
+    {
+      "epoch": 26.027091587042648,
+      "grad_norm": 0.43022575974464417,
+      "learning_rate": 0.0002878496065287088,
+      "loss": 3.1127,
+      "step": 89350
+    },
+    {
+      "epoch": 26.041656956420415,
+      "grad_norm": 0.4232766926288605,
+      "learning_rate": 0.0002876747303993005,
+      "loss": 3.1152,
+      "step": 89400
+    },
+    {
+      "epoch": 26.05622232579818,
+      "grad_norm": 0.449580579996109,
+      "learning_rate": 0.00028749985426989216,
+      "loss": 3.1077,
+      "step": 89450
+    },
+    {
+      "epoch": 26.070787695175948,
+      "grad_norm": 0.46129152178764343,
+      "learning_rate": 0.0002873249781404838,
+      "loss": 3.1017,
+      "step": 89500
+    },
+    {
+      "epoch": 26.08535306455372,
+      "grad_norm": 0.4587564468383789,
+      "learning_rate": 0.0002871501020110755,
+      "loss": 3.1181,
+      "step": 89550
+    },
+    {
+      "epoch": 26.099918433931485,
+      "grad_norm": 0.44854047894477844,
+      "learning_rate": 0.0002869752258816671,
+      "loss": 3.1179,
+      "step": 89600
+    },
+    {
+      "epoch": 26.114483803309252,
+      "grad_norm": 0.4396362006664276,
+      "learning_rate": 0.00028680034975225876,
+      "loss": 3.1191,
+      "step": 89650
+    },
+    {
+      "epoch": 26.12904917268702,
+      "grad_norm": 0.42307373881340027,
+      "learning_rate": 0.00028662547362285045,
+      "loss": 3.1332,
+      "step": 89700
+    },
+    {
+      "epoch": 26.143614542064785,
+      "grad_norm": 0.4569436311721802,
+      "learning_rate": 0.00028645059749344214,
+      "loss": 3.1235,
+      "step": 89750
+    },
+    {
+      "epoch": 26.158179911442556,
+      "grad_norm": 0.44630715250968933,
+      "learning_rate": 0.0002862757213640338,
+      "loss": 3.1303,
+      "step": 89800
+    },
+    {
+      "epoch": 26.172745280820322,
+      "grad_norm": 0.4254963994026184,
+      "learning_rate": 0.00028610084523462547,
+      "loss": 3.138,
+      "step": 89850
+    },
+    {
+      "epoch": 26.18731065019809,
+      "grad_norm": 0.4579327404499054,
+      "learning_rate": 0.0002859259691052171,
+      "loss": 3.1222,
+      "step": 89900
+    },
+    {
+      "epoch": 26.201876019575856,
+      "grad_norm": 0.4161672294139862,
+      "learning_rate": 0.0002857510929758088,
+      "loss": 3.1452,
+      "step": 89950
+    },
+    {
+      "epoch": 26.216441388953623,
+      "grad_norm": 0.4550322890281677,
+      "learning_rate": 0.00028557621684640043,
+      "loss": 3.1484,
+      "step": 90000
+    },
+    {
+      "epoch": 26.216441388953623,
+      "eval_accuracy": 0.37423554730739,
+      "eval_loss": 3.546004295349121,
+      "eval_runtime": 179.8845,
+      "eval_samples_per_second": 92.532,
+      "eval_steps_per_second": 5.787,
+      "step": 90000
+    },
+    {
+      "epoch": 26.23100675833139,
+      "grad_norm": 0.4519776403903961,
+      "learning_rate": 0.0002854013407169921,
+      "loss": 3.1474,
+      "step": 90050
+    },
+    {
+      "epoch": 26.24557212770916,
+      "grad_norm": 0.4274028539657593,
+      "learning_rate": 0.00028522646458758376,
+      "loss": 3.1445,
+      "step": 90100
+    },
+    {
+      "epoch": 26.260137497086927,
+      "grad_norm": 0.4484902620315552,
+      "learning_rate": 0.00028505158845817545,
+      "loss": 3.1477,
+      "step": 90150
+    },
+    {
+      "epoch": 26.274702866464693,
+      "grad_norm": 0.4268806576728821,
+      "learning_rate": 0.0002848767123287671,
+      "loss": 3.144,
+      "step": 90200
+    },
+    {
+      "epoch": 26.28926823584246,
+      "grad_norm": 0.4616459310054779,
+      "learning_rate": 0.0002847018361993588,
+      "loss": 3.1451,
+      "step": 90250
+    },
+    {
+      "epoch": 26.303833605220227,
+      "grad_norm": 0.4314412772655487,
+      "learning_rate": 0.0002845269600699504,
+      "loss": 3.1629,
+      "step": 90300
+    },
+    {
+      "epoch": 26.318398974597997,
+      "grad_norm": 0.4529995918273926,
+      "learning_rate": 0.0002843520839405421,
+      "loss": 3.1517,
+      "step": 90350
+    },
+    {
+      "epoch": 26.332964343975764,
+      "grad_norm": 0.4490029811859131,
+      "learning_rate": 0.0002841772078111338,
+      "loss": 3.1627,
+      "step": 90400
+    },
+    {
+      "epoch": 26.34752971335353,
+      "grad_norm": 0.46888235211372375,
+      "learning_rate": 0.00028400233168172543,
+      "loss": 3.1591,
+      "step": 90450
+    },
+    {
+      "epoch": 26.362095082731297,
+      "grad_norm": 0.43123456835746765,
+      "learning_rate": 0.00028382745555231707,
+      "loss": 3.1538,
+      "step": 90500
+    },
+    {
+      "epoch": 26.376660452109064,
+      "grad_norm": 0.4406403601169586,
+      "learning_rate": 0.00028365257942290876,
+      "loss": 3.1637,
+      "step": 90550
+    },
+    {
+      "epoch": 26.391225821486834,
+      "grad_norm": 0.44110873341560364,
+      "learning_rate": 0.0002834777032935004,
+      "loss": 3.1546,
+      "step": 90600
+    },
+    {
+      "epoch": 26.4057911908646,
+      "grad_norm": 0.45763951539993286,
+      "learning_rate": 0.0002833028271640921,
+      "loss": 3.167,
+      "step": 90650
+    },
+    {
+      "epoch": 26.420356560242368,
+      "grad_norm": 0.4703918695449829,
+      "learning_rate": 0.00028312795103468377,
+      "loss": 3.1606,
+      "step": 90700
+    },
+    {
+      "epoch": 26.434921929620135,
+      "grad_norm": 0.4339727759361267,
+      "learning_rate": 0.0002829530749052754,
+      "loss": 3.1597,
+      "step": 90750
+    },
+    {
+      "epoch": 26.4494872989979,
+      "grad_norm": 0.4744960367679596,
+      "learning_rate": 0.00028277819877586705,
+      "loss": 3.1638,
+      "step": 90800
+    },
+    {
+      "epoch": 26.46405266837567,
+      "grad_norm": 0.4523671269416809,
+      "learning_rate": 0.00028260332264645874,
+      "loss": 3.156,
+      "step": 90850
+    },
+    {
+      "epoch": 26.47861803775344,
+      "grad_norm": 0.4665144383907318,
+      "learning_rate": 0.0002824284465170504,
+      "loss": 3.1772,
+      "step": 90900
+    },
+    {
+      "epoch": 26.493183407131205,
+      "grad_norm": 0.5128541588783264,
+      "learning_rate": 0.00028225357038764206,
+      "loss": 3.167,
+      "step": 90950
+    },
+    {
+      "epoch": 26.507748776508972,
+      "grad_norm": 0.4322499930858612,
+      "learning_rate": 0.00028207869425823375,
+      "loss": 3.1698,
+      "step": 91000
+    },
+    {
+      "epoch": 26.507748776508972,
+      "eval_accuracy": 0.3743130257201289,
+      "eval_loss": 3.545729875564575,
+      "eval_runtime": 179.6993,
+      "eval_samples_per_second": 92.627,
+      "eval_steps_per_second": 5.793,
+      "step": 91000
+    },
+    {
+      "epoch": 26.52231414588674,
+      "grad_norm": 0.4499381482601166,
+      "learning_rate": 0.0002819038181288254,
+      "loss": 3.1626,
+      "step": 91050
+    },
+    {
+      "epoch": 26.536879515264506,
+      "grad_norm": 0.4228927195072174,
+      "learning_rate": 0.000281728941999417,
+      "loss": 3.161,
+      "step": 91100
+    },
+    {
+      "epoch": 26.551444884642276,
+      "grad_norm": 0.44955986738204956,
+      "learning_rate": 0.0002815540658700087,
+      "loss": 3.18,
+      "step": 91150
+    },
+    {
+      "epoch": 26.566010254020043,
+      "grad_norm": 0.4338165521621704,
+      "learning_rate": 0.0002813791897406004,
+      "loss": 3.1676,
+      "step": 91200
+    },
+    {
+      "epoch": 26.58057562339781,
+      "grad_norm": 0.4299832880496979,
+      "learning_rate": 0.00028120431361119204,
+      "loss": 3.1764,
+      "step": 91250
+    },
+    {
+      "epoch": 26.595140992775576,
+      "grad_norm": 0.4154253304004669,
+      "learning_rate": 0.00028102943748178373,
+      "loss": 3.1798,
+      "step": 91300
+    },
+    {
+      "epoch": 26.609706362153343,
+      "grad_norm": 0.44844770431518555,
+      "learning_rate": 0.00028085456135237537,
+      "loss": 3.1812,
+      "step": 91350
+    },
+    {
+      "epoch": 26.624271731531113,
+      "grad_norm": 0.45521214604377747,
+      "learning_rate": 0.00028067968522296706,
+      "loss": 3.1815,
+      "step": 91400
+    },
+    {
+      "epoch": 26.63883710090888,
+      "grad_norm": 0.41416242718696594,
+      "learning_rate": 0.0002805048090935587,
+      "loss": 3.1811,
+      "step": 91450
+    },
+    {
+      "epoch": 26.653402470286647,
+      "grad_norm": 0.4366213381290436,
+      "learning_rate": 0.0002803299329641504,
+      "loss": 3.1767,
+      "step": 91500
+    },
+    {
+      "epoch": 26.667967839664414,
+      "grad_norm": 0.4341444969177246,
+      "learning_rate": 0.000280155056834742,
+      "loss": 3.191,
+      "step": 91550
+    },
+    {
+      "epoch": 26.68253320904218,
+      "grad_norm": 0.4299844205379486,
+      "learning_rate": 0.0002799801807053337,
+      "loss": 3.1736,
+      "step": 91600
+    },
+    {
+      "epoch": 26.697098578419947,
+      "grad_norm": 0.4490987956523895,
+      "learning_rate": 0.00027980530457592535,
+      "loss": 3.1789,
+      "step": 91650
+    },
+    {
+      "epoch": 26.711663947797717,
+      "grad_norm": 0.4265900254249573,
+      "learning_rate": 0.00027963042844651704,
+      "loss": 3.169,
+      "step": 91700
+    },
+    {
+      "epoch": 26.726229317175484,
+      "grad_norm": 0.4504244923591614,
+      "learning_rate": 0.0002794555523171087,
+      "loss": 3.19,
+      "step": 91750
+    },
+    {
+      "epoch": 26.74079468655325,
+      "grad_norm": 0.441986620426178,
+      "learning_rate": 0.00027928067618770037,
+      "loss": 3.1873,
+      "step": 91800
+    },
+    {
+      "epoch": 26.755360055931018,
+      "grad_norm": 0.46930548548698425,
+      "learning_rate": 0.00027910580005829206,
+      "loss": 3.1911,
+      "step": 91850
+    },
+    {
+      "epoch": 26.769925425308784,
+      "grad_norm": 0.4317058026790619,
+      "learning_rate": 0.0002789309239288837,
+      "loss": 3.1826,
+      "step": 91900
+    },
+    {
+      "epoch": 26.784490794686555,
+      "grad_norm": 0.4412967562675476,
+      "learning_rate": 0.00027875604779947533,
+      "loss": 3.186,
+      "step": 91950
+    },
+    {
+      "epoch": 26.79905616406432,
+      "grad_norm": 0.4440974295139313,
+      "learning_rate": 0.000278581171670067,
+      "loss": 3.1977,
+      "step": 92000
+    },
+    {
+      "epoch": 26.79905616406432,
+      "eval_accuracy": 0.3749948122382973,
+      "eval_loss": 3.535423994064331,
+      "eval_runtime": 179.7826,
+      "eval_samples_per_second": 92.584,
+      "eval_steps_per_second": 5.79,
+      "step": 92000
+    },
+    {
+      "epoch": 26.813621533442088,
+      "grad_norm": 0.4377935826778412,
+      "learning_rate": 0.00027840629554065865,
+      "loss": 3.1886,
+      "step": 92050
+    },
+    {
+      "epoch": 26.828186902819855,
+      "grad_norm": 0.4575270414352417,
+      "learning_rate": 0.00027823141941125034,
+      "loss": 3.19,
+      "step": 92100
+    },
+    {
+      "epoch": 26.84275227219762,
+      "grad_norm": 0.45877814292907715,
+      "learning_rate": 0.00027805654328184204,
+      "loss": 3.191,
+      "step": 92150
+    },
+    {
+      "epoch": 26.857317641575392,
+      "grad_norm": 0.4479317367076874,
+      "learning_rate": 0.00027788166715243367,
+      "loss": 3.2019,
+      "step": 92200
+    },
+    {
+      "epoch": 26.87188301095316,
+      "grad_norm": 0.42838388681411743,
+      "learning_rate": 0.0002777067910230253,
+      "loss": 3.1834,
+      "step": 92250
+    },
+    {
+      "epoch": 26.886448380330926,
+      "grad_norm": 0.4574011564254761,
+      "learning_rate": 0.000277531914893617,
+      "loss": 3.1953,
+      "step": 92300
+    },
+    {
+      "epoch": 26.901013749708692,
+      "grad_norm": 0.43894481658935547,
+      "learning_rate": 0.00027735703876420863,
+      "loss": 3.1959,
+      "step": 92350
+    },
+    {
+      "epoch": 26.91557911908646,
+      "grad_norm": 0.42568439245224,
+      "learning_rate": 0.0002771821626348003,
+      "loss": 3.1886,
+      "step": 92400
+    },
+    {
+      "epoch": 26.930144488464226,
+      "grad_norm": 0.4158543348312378,
+      "learning_rate": 0.000277007286505392,
+      "loss": 3.1995,
+      "step": 92450
+    },
+    {
+      "epoch": 26.944709857841996,
+      "grad_norm": 0.4558238685131073,
+      "learning_rate": 0.00027683241037598365,
+      "loss": 3.1805,
+      "step": 92500
+    },
+    {
+      "epoch": 26.959275227219763,
+      "grad_norm": 0.4516817331314087,
+      "learning_rate": 0.0002766575342465753,
+      "loss": 3.1962,
+      "step": 92550
+    },
+    {
+      "epoch": 26.97384059659753,
+      "grad_norm": 0.42139333486557007,
+      "learning_rate": 0.000276482658117167,
+      "loss": 3.1903,
+      "step": 92600
+    },
+    {
+      "epoch": 26.988405965975296,
+      "grad_norm": 0.40730905532836914,
+      "learning_rate": 0.00027630778198775867,
+      "loss": 3.1924,
+      "step": 92650
+    },
+    {
+      "epoch": 27.002913073875554,
+      "grad_norm": 0.4469035863876343,
+      "learning_rate": 0.0002761329058583503,
+      "loss": 3.1777,
+      "step": 92700
+    },
+    {
+      "epoch": 27.01747844325332,
+      "grad_norm": 0.5051560997962952,
+      "learning_rate": 0.000275958029728942,
+      "loss": 3.0968,
+      "step": 92750
+    },
+    {
+      "epoch": 27.032043812631088,
+      "grad_norm": 0.4543512165546417,
+      "learning_rate": 0.00027578315359953363,
+      "loss": 3.0962,
+      "step": 92800
+    },
+    {
+      "epoch": 27.046609182008854,
+      "grad_norm": 0.4353955388069153,
+      "learning_rate": 0.00027560827747012527,
+      "loss": 3.1153,
+      "step": 92850
+    },
+    {
+      "epoch": 27.061174551386625,
+      "grad_norm": 0.48911988735198975,
+      "learning_rate": 0.00027543340134071696,
+      "loss": 3.1041,
+      "step": 92900
+    },
+    {
+      "epoch": 27.07573992076439,
+      "grad_norm": 0.4155479371547699,
+      "learning_rate": 0.00027525852521130865,
+      "loss": 3.0933,
+      "step": 92950
+    },
+    {
+      "epoch": 27.090305290142158,
+      "grad_norm": 0.43685251474380493,
+      "learning_rate": 0.0002750836490819003,
+      "loss": 3.1125,
+      "step": 93000
+    },
+    {
+      "epoch": 27.090305290142158,
+      "eval_accuracy": 0.3741177424977324,
+      "eval_loss": 3.550111770629883,
+      "eval_runtime": 179.8551,
+      "eval_samples_per_second": 92.547,
+      "eval_steps_per_second": 5.788,
+      "step": 93000
+    },
+    {
+      "epoch": 27.104870659519925,
+      "grad_norm": 0.4509027600288391,
+      "learning_rate": 0.000274908772952492,
+      "loss": 3.1063,
+      "step": 93050
+    },
+    {
+      "epoch": 27.11943602889769,
+      "grad_norm": 0.4444672465324402,
+      "learning_rate": 0.0002747338968230836,
+      "loss": 3.1204,
+      "step": 93100
+    },
+    {
+      "epoch": 27.134001398275462,
+      "grad_norm": 0.45079129934310913,
+      "learning_rate": 0.0002745590206936753,
+      "loss": 3.1147,
+      "step": 93150
+    },
+    {
+      "epoch": 27.14856676765323,
+      "grad_norm": 0.4966619610786438,
+      "learning_rate": 0.00027438414456426694,
+      "loss": 3.1341,
+      "step": 93200
+    },
+    {
+      "epoch": 27.163132137030995,
+      "grad_norm": 0.48015525937080383,
+      "learning_rate": 0.00027420926843485863,
+      "loss": 3.13,
+      "step": 93250
+    },
+    {
+      "epoch": 27.177697506408762,
+      "grad_norm": 0.483812540769577,
+      "learning_rate": 0.00027403439230545026,
+      "loss": 3.1373,
+      "step": 93300
+    },
+    {
+      "epoch": 27.19226287578653,
+      "grad_norm": 0.4629931151866913,
+      "learning_rate": 0.00027385951617604195,
+      "loss": 3.1216,
+      "step": 93350
+    },
+    {
+      "epoch": 27.206828245164296,
+      "grad_norm": 0.4691258668899536,
+      "learning_rate": 0.0002736846400466336,
+      "loss": 3.1361,
+      "step": 93400
+    },
+    {
+      "epoch": 27.221393614542066,
+      "grad_norm": 0.42496374249458313,
+      "learning_rate": 0.0002735097639172253,
+      "loss": 3.133,
+      "step": 93450
+    },
+    {
+      "epoch": 27.235958983919833,
+      "grad_norm": 0.46841350197792053,
+      "learning_rate": 0.0002733348877878169,
+      "loss": 3.1306,
+      "step": 93500
+    },
+    {
+      "epoch": 27.2505243532976,
+      "grad_norm": 0.43265077471733093,
+      "learning_rate": 0.0002731600116584086,
+      "loss": 3.1355,
+      "step": 93550
+    },
+    {
+      "epoch": 27.265089722675366,
+      "grad_norm": 0.47501933574676514,
+      "learning_rate": 0.0002729851355290003,
+      "loss": 3.1378,
+      "step": 93600
+    },
+    {
+      "epoch": 27.279655092053133,
+      "grad_norm": 0.469482958316803,
+      "learning_rate": 0.00027281025939959193,
+      "loss": 3.1349,
+      "step": 93650
+    },
+    {
+      "epoch": 27.294220461430903,
+      "grad_norm": 0.4316178262233734,
+      "learning_rate": 0.00027263538327018357,
+      "loss": 3.1486,
+      "step": 93700
+    },
+    {
+      "epoch": 27.30878583080867,
+      "grad_norm": 0.45734158158302307,
+      "learning_rate": 0.00027246050714077526,
+      "loss": 3.1382,
+      "step": 93750
+    },
+    {
+      "epoch": 27.323351200186437,
+      "grad_norm": 0.44604817032814026,
+      "learning_rate": 0.0002722856310113669,
+      "loss": 3.153,
+      "step": 93800
+    },
+    {
+      "epoch": 27.337916569564204,
+      "grad_norm": 0.4513714015483856,
+      "learning_rate": 0.0002721107548819586,
+      "loss": 3.1443,
+      "step": 93850
+    },
+    {
+      "epoch": 27.35248193894197,
+      "grad_norm": 0.4659077227115631,
+      "learning_rate": 0.0002719358787525503,
+      "loss": 3.135,
+      "step": 93900
+    },
+    {
+      "epoch": 27.36704730831974,
+      "grad_norm": 0.4294387698173523,
+      "learning_rate": 0.0002717610026231419,
+      "loss": 3.1445,
+      "step": 93950
+    },
+    {
+      "epoch": 27.381612677697508,
+      "grad_norm": 0.4388628900051117,
+      "learning_rate": 0.00027158612649373355,
+      "loss": 3.1546,
+      "step": 94000
+    },
+    {
+      "epoch": 27.381612677697508,
+      "eval_accuracy": 0.3744574012752782,
+      "eval_loss": 3.5467703342437744,
+      "eval_runtime": 179.9446,
+      "eval_samples_per_second": 92.501,
+      "eval_steps_per_second": 5.785,
+      "step": 94000
+    },
+    {
+      "epoch": 27.396178047075274,
+      "grad_norm": 0.4699251353740692,
+      "learning_rate": 0.00027141125036432524,
+      "loss": 3.1437,
+      "step": 94050
+    },
+    {
+      "epoch": 27.41074341645304,
+      "grad_norm": 0.44644948840141296,
+      "learning_rate": 0.00027123637423491693,
+      "loss": 3.1516,
+      "step": 94100
+    },
+    {
+      "epoch": 27.425308785830808,
+      "grad_norm": 0.4366833567619324,
+      "learning_rate": 0.00027106149810550857,
+      "loss": 3.157,
+      "step": 94150
+    },
+    {
+      "epoch": 27.439874155208575,
+      "grad_norm": 0.47586560249328613,
+      "learning_rate": 0.00027088662197610026,
+      "loss": 3.158,
+      "step": 94200
+    },
+    {
+      "epoch": 27.454439524586345,
+      "grad_norm": 0.449791818857193,
+      "learning_rate": 0.0002707117458466919,
+      "loss": 3.1498,
+      "step": 94250
+    },
+    {
+      "epoch": 27.46900489396411,
+      "grad_norm": 0.46725863218307495,
+      "learning_rate": 0.00027053686971728353,
+      "loss": 3.1569,
+      "step": 94300
+    },
+    {
+      "epoch": 27.48357026334188,
+      "grad_norm": 0.44953715801239014,
+      "learning_rate": 0.0002703619935878752,
+      "loss": 3.1548,
+      "step": 94350
+    },
+    {
+      "epoch": 27.498135632719645,
+      "grad_norm": 0.4506903290748596,
+      "learning_rate": 0.0002701871174584669,
+      "loss": 3.1679,
+      "step": 94400
+    },
+    {
+      "epoch": 27.512701002097412,
+      "grad_norm": 0.44679129123687744,
+      "learning_rate": 0.00027001224132905855,
+      "loss": 3.1668,
+      "step": 94450
+    },
+    {
+      "epoch": 27.527266371475182,
+      "grad_norm": 0.43930813670158386,
+      "learning_rate": 0.00026983736519965024,
+      "loss": 3.1537,
+      "step": 94500
+    },
+    {
+      "epoch": 27.54183174085295,
+      "grad_norm": 0.43353012204170227,
+      "learning_rate": 0.0002696624890702419,
+      "loss": 3.1702,
+      "step": 94550
+    },
+    {
+      "epoch": 27.556397110230716,
+      "grad_norm": 0.491400808095932,
+      "learning_rate": 0.00026948761294083356,
+      "loss": 3.1703,
+      "step": 94600
+    },
+    {
+      "epoch": 27.570962479608482,
+      "grad_norm": 0.4375465214252472,
+      "learning_rate": 0.0002693127368114252,
+      "loss": 3.1697,
+      "step": 94650
+    },
+    {
+      "epoch": 27.58552784898625,
+      "grad_norm": 0.42356833815574646,
+      "learning_rate": 0.0002691378606820169,
+      "loss": 3.1576,
+      "step": 94700
+    },
+    {
+      "epoch": 27.600093218364016,
+      "grad_norm": 0.4304982125759125,
+      "learning_rate": 0.0002689629845526085,
+      "loss": 3.1705,
+      "step": 94750
+    },
+    {
+      "epoch": 27.614658587741786,
+      "grad_norm": 0.4731658399105072,
+      "learning_rate": 0.0002687881084232002,
+      "loss": 3.1662,
+      "step": 94800
+    },
+    {
+      "epoch": 27.629223957119553,
+      "grad_norm": 0.44949278235435486,
+      "learning_rate": 0.00026861323229379185,
+      "loss": 3.165,
+      "step": 94850
+    },
+    {
+      "epoch": 27.64378932649732,
+      "grad_norm": 0.4739612638950348,
+      "learning_rate": 0.00026843835616438354,
+      "loss": 3.1635,
+      "step": 94900
+    },
+    {
+      "epoch": 27.658354695875087,
+      "grad_norm": 0.48769617080688477,
+      "learning_rate": 0.0002682634800349752,
+      "loss": 3.1613,
+      "step": 94950
+    },
+    {
+      "epoch": 27.672920065252853,
+      "grad_norm": 0.4974506199359894,
+      "learning_rate": 0.00026808860390556687,
+      "loss": 3.1704,
+      "step": 95000
+    },
+    {
+      "epoch": 27.672920065252853,
+      "eval_accuracy": 0.37493602740313875,
+      "eval_loss": 3.5381674766540527,
+      "eval_runtime": 179.802,
+      "eval_samples_per_second": 92.574,
+      "eval_steps_per_second": 5.79,
+      "step": 95000
+    },
+    {
+      "epoch": 27.687485434630624,
+      "grad_norm": 0.4498634934425354,
+      "learning_rate": 0.00026791372777615856,
+      "loss": 3.1687,
+      "step": 95050
+    },
+    {
+      "epoch": 27.70205080400839,
+      "grad_norm": 0.4463023841381073,
+      "learning_rate": 0.0002677388516467502,
+      "loss": 3.1677,
+      "step": 95100
+    },
+    {
+      "epoch": 27.716616173386157,
+      "grad_norm": 0.47234046459198,
+      "learning_rate": 0.00026756397551734183,
+      "loss": 3.1763,
+      "step": 95150
+    },
+    {
+      "epoch": 27.731181542763924,
+      "grad_norm": 0.4431990385055542,
+      "learning_rate": 0.0002673890993879335,
+      "loss": 3.1777,
+      "step": 95200
+    },
+    {
+      "epoch": 27.74574691214169,
+      "grad_norm": 0.43977609276771545,
+      "learning_rate": 0.00026721422325852516,
+      "loss": 3.1732,
+      "step": 95250
+    },
+    {
+      "epoch": 27.76031228151946,
+      "grad_norm": 0.43646240234375,
+      "learning_rate": 0.00026703934712911685,
+      "loss": 3.1871,
+      "step": 95300
+    },
+    {
+      "epoch": 27.774877650897228,
+      "grad_norm": 0.45789384841918945,
+      "learning_rate": 0.00026686447099970854,
+      "loss": 3.1791,
+      "step": 95350
+    },
+    {
+      "epoch": 27.789443020274994,
+      "grad_norm": 0.45078974962234497,
+      "learning_rate": 0.0002666895948703002,
+      "loss": 3.1786,
+      "step": 95400
+    },
+    {
+      "epoch": 27.80400838965276,
+      "grad_norm": 0.4329206347465515,
+      "learning_rate": 0.0002665147187408918,
+      "loss": 3.1785,
+      "step": 95450
+    },
+    {
+      "epoch": 27.818573759030528,
+      "grad_norm": 0.4348940849304199,
+      "learning_rate": 0.0002663398426114835,
+      "loss": 3.1867,
+      "step": 95500
+    },
+    {
+      "epoch": 27.833139128408295,
+      "grad_norm": 0.45594125986099243,
+      "learning_rate": 0.0002661649664820752,
+      "loss": 3.187,
+      "step": 95550
+    },
+    {
+      "epoch": 27.847704497786065,
+      "grad_norm": 0.4796610176563263,
+      "learning_rate": 0.00026599009035266683,
+      "loss": 3.1756,
+      "step": 95600
+    },
+    {
+      "epoch": 27.862269867163832,
+      "grad_norm": 0.4777950644493103,
+      "learning_rate": 0.0002658152142232585,
+      "loss": 3.1781,
+      "step": 95650
+    },
+    {
+      "epoch": 27.8768352365416,
+      "grad_norm": 0.46176958084106445,
+      "learning_rate": 0.00026564033809385016,
+      "loss": 3.1788,
+      "step": 95700
+    },
+    {
+      "epoch": 27.891400605919365,
+      "grad_norm": 0.434174120426178,
+      "learning_rate": 0.0002654654619644418,
+      "loss": 3.1862,
+      "step": 95750
+    },
+    {
+      "epoch": 27.905965975297132,
+      "grad_norm": 0.44971075654029846,
+      "learning_rate": 0.0002652905858350335,
+      "loss": 3.1892,
+      "step": 95800
+    },
+    {
+      "epoch": 27.920531344674902,
+      "grad_norm": 0.43806204199790955,
+      "learning_rate": 0.00026511570970562517,
+      "loss": 3.1733,
+      "step": 95850
+    },
+    {
+      "epoch": 27.93509671405267,
+      "grad_norm": 0.4868404269218445,
+      "learning_rate": 0.0002649408335762168,
+      "loss": 3.1825,
+      "step": 95900
+    },
+    {
+      "epoch": 27.949662083430436,
+      "grad_norm": 0.45844024419784546,
+      "learning_rate": 0.0002647659574468085,
+      "loss": 3.1906,
+      "step": 95950
+    },
+    {
+      "epoch": 27.964227452808203,
+      "grad_norm": 0.424188494682312,
+      "learning_rate": 0.0002645910813174002,
+      "loss": 3.1875,
+      "step": 96000
+    },
+    {
+      "epoch": 27.964227452808203,
+      "eval_accuracy": 0.3754160643670431,
+      "eval_loss": 3.5307044982910156,
+      "eval_runtime": 179.9575,
+      "eval_samples_per_second": 92.494,
+      "eval_steps_per_second": 5.785,
+      "step": 96000
+    },
+    {
+      "epoch": 27.97879282218597,
+      "grad_norm": 0.46329542994499207,
+      "learning_rate": 0.0002644162051879918,
+      "loss": 3.1886,
+      "step": 96050
+    },
+    {
+      "epoch": 27.99335819156374,
+      "grad_norm": 0.4756506681442261,
+      "learning_rate": 0.00026424132905858346,
+      "loss": 3.1786,
+      "step": 96100
+    },
+    {
+      "epoch": 28.007865299463994,
+      "grad_norm": 0.44213542342185974,
+      "learning_rate": 0.00026406645292917515,
+      "loss": 3.1369,
+      "step": 96150
+    },
+    {
+      "epoch": 28.02243066884176,
+      "grad_norm": 0.4677712917327881,
+      "learning_rate": 0.0002638915767997668,
+      "loss": 3.0854,
+      "step": 96200
+    },
+    {
+      "epoch": 28.03699603821953,
+      "grad_norm": 0.4562908709049225,
+      "learning_rate": 0.0002637167006703585,
+      "loss": 3.0957,
+      "step": 96250
+    },
+    {
+      "epoch": 28.051561407597298,
+      "grad_norm": 0.4334988594055176,
+      "learning_rate": 0.00026354182454095017,
+      "loss": 3.0935,
+      "step": 96300
+    },
+    {
+      "epoch": 28.066126776975064,
+      "grad_norm": 0.45454612374305725,
+      "learning_rate": 0.0002633669484115418,
+      "loss": 3.0987,
+      "step": 96350
+    },
+    {
+      "epoch": 28.08069214635283,
+      "grad_norm": 0.4465283453464508,
+      "learning_rate": 0.00026319207228213344,
+      "loss": 3.1039,
+      "step": 96400
+    },
+    {
+      "epoch": 28.095257515730598,
+      "grad_norm": 0.5014046430587769,
+      "learning_rate": 0.00026301719615272513,
+      "loss": 3.1036,
+      "step": 96450
+    },
+    {
+      "epoch": 28.109822885108365,
+      "grad_norm": 0.45270198583602905,
+      "learning_rate": 0.0002628423200233168,
+      "loss": 3.1143,
+      "step": 96500
+    },
+    {
+      "epoch": 28.124388254486135,
+      "grad_norm": 0.46294882893562317,
+      "learning_rate": 0.00026266744389390846,
+      "loss": 3.1056,
+      "step": 96550
+    },
+    {
+      "epoch": 28.1389536238639,
+      "grad_norm": 0.4628591239452362,
+      "learning_rate": 0.00026249256776450015,
+      "loss": 3.1132,
+      "step": 96600
+    },
+    {
+      "epoch": 28.15351899324167,
+      "grad_norm": 0.4873234033584595,
+      "learning_rate": 0.0002623176916350918,
+      "loss": 3.1179,
+      "step": 96650
+    },
+    {
+      "epoch": 28.168084362619435,
+      "grad_norm": 0.44500336050987244,
+      "learning_rate": 0.0002621428155056834,
+      "loss": 3.1162,
+      "step": 96700
+    },
+    {
+      "epoch": 28.182649731997202,
+      "grad_norm": 0.43908581137657166,
+      "learning_rate": 0.0002619679393762751,
+      "loss": 3.1182,
+      "step": 96750
+    },
+    {
+      "epoch": 28.197215101374972,
+      "grad_norm": 0.4487292170524597,
+      "learning_rate": 0.0002617930632468668,
+      "loss": 3.1227,
+      "step": 96800
+    },
+    {
+      "epoch": 28.21178047075274,
+      "grad_norm": 0.4873857796192169,
+      "learning_rate": 0.00026161818711745844,
+      "loss": 3.1296,
+      "step": 96850
+    },
+    {
+      "epoch": 28.226345840130506,
+      "grad_norm": 0.4577917754650116,
+      "learning_rate": 0.0002614433109880501,
+      "loss": 3.1267,
+      "step": 96900
+    },
+    {
+      "epoch": 28.240911209508273,
+      "grad_norm": 0.44587984681129456,
+      "learning_rate": 0.00026126843485864176,
+      "loss": 3.142,
+      "step": 96950
+    },
+    {
+      "epoch": 28.25547657888604,
+      "grad_norm": 0.44689640402793884,
+      "learning_rate": 0.00026109355872923345,
+      "loss": 3.1274,
+      "step": 97000
+    },
+    {
+      "epoch": 28.25547657888604,
+      "eval_accuracy": 0.3748532583552356,
+      "eval_loss": 3.5460498332977295,
+      "eval_runtime": 180.0191,
+      "eval_samples_per_second": 92.462,
+      "eval_steps_per_second": 5.783,
+      "step": 97000
+    },
+    {
+      "epoch": 28.27004194826381,
+      "grad_norm": 0.461459755897522,
+      "learning_rate": 0.0002609186825998251,
+      "loss": 3.1274,
+      "step": 97050
+    },
+    {
+      "epoch": 28.284607317641576,
+      "grad_norm": 0.4559226632118225,
+      "learning_rate": 0.0002607438064704168,
+      "loss": 3.1259,
+      "step": 97100
+    },
+    {
+      "epoch": 28.299172687019343,
+      "grad_norm": 0.4652230739593506,
+      "learning_rate": 0.0002605689303410084,
+      "loss": 3.1397,
+      "step": 97150
+    },
+    {
+      "epoch": 28.31373805639711,
+      "grad_norm": 0.4658327102661133,
+      "learning_rate": 0.00026039405421160005,
+      "loss": 3.1371,
+      "step": 97200
+    },
+    {
+      "epoch": 28.328303425774877,
+      "grad_norm": 0.44982901215553284,
+      "learning_rate": 0.00026021917808219174,
+      "loss": 3.1404,
+      "step": 97250
+    },
+    {
+      "epoch": 28.342868795152643,
+      "grad_norm": 0.45533254742622375,
+      "learning_rate": 0.00026004430195278343,
+      "loss": 3.1315,
+      "step": 97300
+    },
+    {
+      "epoch": 28.357434164530414,
+      "grad_norm": 0.4789443910121918,
+      "learning_rate": 0.00025986942582337507,
+      "loss": 3.1545,
+      "step": 97350
+    },
+    {
+      "epoch": 28.37199953390818,
+      "grad_norm": 0.4561361074447632,
+      "learning_rate": 0.00025969454969396676,
+      "loss": 3.1378,
+      "step": 97400
+    },
+    {
+      "epoch": 28.386564903285947,
+      "grad_norm": 0.48362991213798523,
+      "learning_rate": 0.00025951967356455845,
+      "loss": 3.1287,
+      "step": 97450
+    },
+    {
+      "epoch": 28.401130272663714,
+      "grad_norm": 0.4448145925998688,
+      "learning_rate": 0.0002593447974351501,
+      "loss": 3.1475,
+      "step": 97500
+    },
+    {
+      "epoch": 28.41569564204148,
+      "grad_norm": 0.4719396233558655,
+      "learning_rate": 0.0002591699213057417,
+      "loss": 3.1403,
+      "step": 97550
+    },
+    {
+      "epoch": 28.43026101141925,
+      "grad_norm": 0.4635832905769348,
+      "learning_rate": 0.0002589950451763334,
+      "loss": 3.1464,
+      "step": 97600
+    },
+    {
+      "epoch": 28.444826380797018,
+      "grad_norm": 0.44658058881759644,
+      "learning_rate": 0.00025882016904692505,
+      "loss": 3.1514,
+      "step": 97650
+    },
+    {
+      "epoch": 28.459391750174785,
+      "grad_norm": 0.4418274462223053,
+      "learning_rate": 0.00025864529291751674,
+      "loss": 3.1456,
+      "step": 97700
+    },
+    {
+      "epoch": 28.47395711955255,
+      "grad_norm": 0.4710671901702881,
+      "learning_rate": 0.00025847041678810843,
+      "loss": 3.1439,
+      "step": 97750
+    },
+    {
+      "epoch": 28.488522488930318,
+      "grad_norm": 0.4393328130245209,
+      "learning_rate": 0.00025829554065870007,
+      "loss": 3.1564,
+      "step": 97800
+    },
+    {
+      "epoch": 28.503087858308085,
+      "grad_norm": 0.4646104574203491,
+      "learning_rate": 0.0002581206645292917,
+      "loss": 3.1444,
+      "step": 97850
+    },
+    {
+      "epoch": 28.517653227685855,
+      "grad_norm": 0.46661272644996643,
+      "learning_rate": 0.0002579457883998834,
+      "loss": 3.1503,
+      "step": 97900
+    },
+    {
+      "epoch": 28.532218597063622,
+      "grad_norm": 0.4600259065628052,
+      "learning_rate": 0.0002577709122704751,
+      "loss": 3.1562,
+      "step": 97950
+    },
+    {
+      "epoch": 28.54678396644139,
+      "grad_norm": 0.4362897574901581,
+      "learning_rate": 0.0002575960361410667,
+      "loss": 3.1492,
+      "step": 98000
+    },
+    {
+      "epoch": 28.54678396644139,
+      "eval_accuracy": 0.37473874549634684,
+      "eval_loss": 3.5448083877563477,
+      "eval_runtime": 179.7838,
+      "eval_samples_per_second": 92.583,
+      "eval_steps_per_second": 5.79,
+      "step": 98000
+    },
+    {
+      "epoch": 28.561349335819155,
+      "grad_norm": 0.44857919216156006,
+      "learning_rate": 0.0002574211600116584,
+      "loss": 3.157,
+      "step": 98050
+    },
+    {
+      "epoch": 28.575914705196922,
+      "grad_norm": 0.4593052268028259,
+      "learning_rate": 0.00025724628388225005,
+      "loss": 3.1505,
+      "step": 98100
+    },
+    {
+      "epoch": 28.590480074574693,
+      "grad_norm": 0.42843860387802124,
+      "learning_rate": 0.0002570714077528417,
+      "loss": 3.1613,
+      "step": 98150
+    },
+    {
+      "epoch": 28.60504544395246,
+      "grad_norm": 0.44706371426582336,
+      "learning_rate": 0.0002568965316234334,
+      "loss": 3.1625,
+      "step": 98200
+    },
+    {
+      "epoch": 28.619610813330226,
+      "grad_norm": 0.47429358959198,
+      "learning_rate": 0.00025672165549402506,
+      "loss": 3.1648,
+      "step": 98250
+    },
+    {
+      "epoch": 28.634176182707993,
+      "grad_norm": 0.46534454822540283,
+      "learning_rate": 0.0002565467793646167,
+      "loss": 3.1579,
+      "step": 98300
+    },
+    {
+      "epoch": 28.64874155208576,
+      "grad_norm": 0.4708279073238373,
+      "learning_rate": 0.0002563719032352084,
+      "loss": 3.1612,
+      "step": 98350
+    },
+    {
+      "epoch": 28.66330692146353,
+      "grad_norm": 0.4480709135532379,
+      "learning_rate": 0.0002561970271058,
+      "loss": 3.1623,
+      "step": 98400
+    },
+    {
+      "epoch": 28.677872290841297,
+      "grad_norm": 0.5100323557853699,
+      "learning_rate": 0.0002560221509763917,
+      "loss": 3.1586,
+      "step": 98450
+    },
+    {
+      "epoch": 28.692437660219063,
+      "grad_norm": 0.46166589856147766,
+      "learning_rate": 0.00025584727484698335,
+      "loss": 3.1564,
+      "step": 98500
+    },
+    {
+      "epoch": 28.70700302959683,
+      "grad_norm": 0.4602072238922119,
+      "learning_rate": 0.00025567239871757504,
+      "loss": 3.1748,
+      "step": 98550
+    },
+    {
+      "epoch": 28.721568398974597,
+      "grad_norm": 0.44640272855758667,
+      "learning_rate": 0.0002554975225881667,
+      "loss": 3.1635,
+      "step": 98600
+    },
+    {
+      "epoch": 28.736133768352367,
+      "grad_norm": 0.45697900652885437,
+      "learning_rate": 0.00025532264645875837,
+      "loss": 3.1582,
+      "step": 98650
+    },
+    {
+      "epoch": 28.750699137730134,
+      "grad_norm": 0.45967525243759155,
+      "learning_rate": 0.00025514777032935,
+      "loss": 3.1723,
+      "step": 98700
+    },
+    {
+      "epoch": 28.7652645071079,
+      "grad_norm": 0.44456747174263,
+      "learning_rate": 0.0002549728941999417,
+      "loss": 3.1739,
+      "step": 98750
+    },
+    {
+      "epoch": 28.779829876485667,
+      "grad_norm": 0.449693888425827,
+      "learning_rate": 0.00025479801807053333,
+      "loss": 3.1562,
+      "step": 98800
+    },
+    {
+      "epoch": 28.794395245863434,
+      "grad_norm": 0.4497344195842743,
+      "learning_rate": 0.000254623141941125,
+      "loss": 3.1548,
+      "step": 98850
+    },
+    {
+      "epoch": 28.8089606152412,
+      "grad_norm": 0.45112860202789307,
+      "learning_rate": 0.0002544482658117167,
+      "loss": 3.1702,
+      "step": 98900
+    },
+    {
+      "epoch": 28.82352598461897,
+      "grad_norm": 0.4688662886619568,
+      "learning_rate": 0.00025427338968230835,
+      "loss": 3.1639,
+      "step": 98950
+    },
+    {
+      "epoch": 28.838091353996738,
+      "grad_norm": 0.4467281103134155,
+      "learning_rate": 0.0002540985135529,
+      "loss": 3.1687,
+      "step": 99000
+    },
+    {
+      "epoch": 28.838091353996738,
+      "eval_accuracy": 0.37525193710728055,
+      "eval_loss": 3.536275863647461,
+      "eval_runtime": 179.7842,
+      "eval_samples_per_second": 92.583,
+      "eval_steps_per_second": 5.79,
+      "step": 99000
+    },
+    {
+      "epoch": 28.852656723374505,
+      "grad_norm": 0.5201435685157776,
+      "learning_rate": 0.0002539236374234917,
+      "loss": 3.182,
+      "step": 99050
+    },
+    {
+      "epoch": 28.86722209275227,
+      "grad_norm": 0.48700618743896484,
+      "learning_rate": 0.0002537487612940833,
+      "loss": 3.1711,
+      "step": 99100
+    },
+    {
+      "epoch": 28.88178746213004,
+      "grad_norm": 0.427386611700058,
+      "learning_rate": 0.000253573885164675,
+      "loss": 3.1761,
+      "step": 99150
+    },
+    {
+      "epoch": 28.89635283150781,
+      "grad_norm": 0.4574876129627228,
+      "learning_rate": 0.0002533990090352667,
+      "loss": 3.1807,
+      "step": 99200
+    },
+    {
+      "epoch": 28.910918200885575,
+      "grad_norm": 0.4709291458129883,
+      "learning_rate": 0.00025322413290585833,
+      "loss": 3.1662,
+      "step": 99250
+    },
+    {
+      "epoch": 28.925483570263342,
+      "grad_norm": 0.5189015865325928,
+      "learning_rate": 0.00025304925677644997,
+      "loss": 3.1808,
+      "step": 99300
+    },
+    {
+      "epoch": 28.94004893964111,
+      "grad_norm": 0.4347054064273834,
+      "learning_rate": 0.00025287438064704166,
+      "loss": 3.1713,
+      "step": 99350
+    },
+    {
+      "epoch": 28.954614309018876,
+      "grad_norm": 0.4408940076828003,
+      "learning_rate": 0.00025269950451763335,
+      "loss": 3.1781,
+      "step": 99400
+    },
+    {
+      "epoch": 28.969179678396642,
+      "grad_norm": 0.44796082377433777,
+      "learning_rate": 0.000252524628388225,
+      "loss": 3.1815,
+      "step": 99450
+    },
+    {
+      "epoch": 28.983745047774413,
+      "grad_norm": 0.4597788453102112,
+      "learning_rate": 0.0002523497522588167,
+      "loss": 3.1787,
+      "step": 99500
+    },
+    {
+      "epoch": 28.99831041715218,
+      "grad_norm": 0.4454861581325531,
+      "learning_rate": 0.0002521748761294083,
+      "loss": 3.1821,
+      "step": 99550
+    },
+    {
+      "epoch": 29.012817525052434,
+      "grad_norm": 0.46412530541419983,
+      "learning_rate": 0.00025199999999999995,
+      "loss": 3.103,
+      "step": 99600
+    },
+    {
+      "epoch": 29.027382894430204,
+      "grad_norm": 0.46690133213996887,
+      "learning_rate": 0.00025182512387059164,
+      "loss": 3.0811,
+      "step": 99650
+    },
+    {
+      "epoch": 29.04194826380797,
+      "grad_norm": 0.4453943371772766,
+      "learning_rate": 0.0002516502477411833,
+      "loss": 3.0869,
+      "step": 99700
+    },
+    {
+      "epoch": 29.056513633185737,
+      "grad_norm": 0.45379212498664856,
+      "learning_rate": 0.00025147537161177496,
+      "loss": 3.1016,
+      "step": 99750
+    },
+    {
+      "epoch": 29.071079002563504,
+      "grad_norm": 0.4479667544364929,
+      "learning_rate": 0.00025130049548236665,
+      "loss": 3.0876,
+      "step": 99800
+    },
+    {
+      "epoch": 29.08564437194127,
+      "grad_norm": 0.44696569442749023,
+      "learning_rate": 0.0002511256193529583,
+      "loss": 3.0893,
+      "step": 99850
+    },
+    {
+      "epoch": 29.10020974131904,
+      "grad_norm": 0.4410049617290497,
+      "learning_rate": 0.00025095074322355,
+      "loss": 3.0996,
+      "step": 99900
+    },
+    {
+      "epoch": 29.114775110696808,
+      "grad_norm": 0.501434326171875,
+      "learning_rate": 0.0002507758670941416,
+      "loss": 3.0975,
+      "step": 99950
+    },
+    {
+      "epoch": 29.129340480074575,
+      "grad_norm": 0.4482145309448242,
+      "learning_rate": 0.0002506009909647333,
+      "loss": 3.0999,
+      "step": 100000
+    },
+    {
+      "epoch": 29.129340480074575,
+      "eval_accuracy": 0.37459178340845056,
+      "eval_loss": 3.550246000289917,
+      "eval_runtime": 180.3156,
+      "eval_samples_per_second": 92.31,
+      "eval_steps_per_second": 5.773,
+      "step": 100000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171650,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 20
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.090213187452928e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}