diff --git "a/last_to_hit_frequency_3591/checkpoint-30000/trainer_state.json" "b/last_to_hit_frequency_3591/checkpoint-30000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last_to_hit_frequency_3591/checkpoint-30000/trainer_state.json"
@@ -0,0 +1,4513 @@
+{
+  "best_global_step": 30000,
+  "best_metric": 3.574705123901367,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_3591/checkpoint-30000",
+  "epoch": 8.738755534840363,
+  "eval_steps": 1000,
+  "global_step": 30000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 1.7487624883651733,
+      "learning_rate": 0.000294,
+      "loss": 8.4085,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.8792372941970825,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7474,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.49055325984954834,
+      "learning_rate": 0.0005998286213931798,
+      "loss": 6.369,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.47406330704689026,
+      "learning_rate": 0.0005996537452637714,
+      "loss": 6.152,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.5468786358833313,
+      "learning_rate": 0.0005994788691343632,
+      "loss": 6.0121,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.5636491775512695,
+      "learning_rate": 0.0005993039930049548,
+      "loss": 5.8836,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.47112441062927246,
+      "learning_rate": 0.0005991291168755465,
+      "loss": 5.7826,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.614771842956543,
+      "learning_rate": 0.0005989542407461382,
+      "loss": 5.6472,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.49487683176994324,
+      "learning_rate": 0.0005987793646167297,
+      "loss": 5.5373,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.4133830964565277,
+      "learning_rate": 0.0005986044884873214,
+      "loss": 5.4464,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.43713563680648804,
+      "learning_rate": 0.0005984296123579131,
+      "loss": 5.3584,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.4758622348308563,
+      "learning_rate": 0.0005982547362285047,
+      "loss": 5.2639,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.5036144852638245,
+      "learning_rate": 0.0005980798600990964,
+      "loss": 5.2065,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.4153907597064972,
+      "learning_rate": 0.0005979049839696881,
+      "loss": 5.1452,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.49257349967956543,
+      "learning_rate": 0.0005977301078402798,
+      "loss": 5.0858,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.39646583795547485,
+      "learning_rate": 0.0005975552317108715,
+      "loss": 5.0535,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4354263246059418,
+      "learning_rate": 0.0005973803555814631,
+      "loss": 4.9802,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.45780736207962036,
+      "learning_rate": 0.0005972054794520547,
+      "loss": 4.9416,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.4360142946243286,
+      "learning_rate": 0.0005970306033226464,
+      "loss": 4.8913,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.49000921845436096,
+      "learning_rate": 0.0005968557271932381,
+      "loss": 4.8516,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.25072978433607523,
+      "eval_loss": 4.782108783721924,
+      "eval_runtime": 182.688,
+      "eval_samples_per_second": 91.112,
+      "eval_steps_per_second": 5.698,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.4772098958492279,
+      "learning_rate": 0.0005966808510638297,
+      "loss": 4.7909,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.4113984704017639,
+      "learning_rate": 0.0005965059749344214,
+      "loss": 4.7556,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.4781721830368042,
+      "learning_rate": 0.0005963310988050131,
+      "loss": 4.7197,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.5447441935539246,
+      "learning_rate": 0.0005961562226756047,
+      "loss": 4.6751,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.41747575998306274,
+      "learning_rate": 0.0005959813465461965,
+      "loss": 4.6392,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.44658181071281433,
+      "learning_rate": 0.000595806470416788,
+      "loss": 4.6204,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.4609386622905731,
+      "learning_rate": 0.0005956315942873797,
+      "loss": 4.5836,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.4060616195201874,
+      "learning_rate": 0.0005954567181579714,
+      "loss": 4.5615,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.4222695827484131,
+      "learning_rate": 0.000595281842028563,
+      "loss": 4.5376,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.4395248591899872,
+      "learning_rate": 0.0005951069658991547,
+      "loss": 4.5107,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.4157288372516632,
+      "learning_rate": 0.0005949320897697464,
+      "loss": 4.5016,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.4212968349456787,
+      "learning_rate": 0.0005947572136403381,
+      "loss": 4.4616,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.49588635563850403,
+      "learning_rate": 0.0005945823375109297,
+      "loss": 4.4497,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.44670262932777405,
+      "learning_rate": 0.0005944074613815215,
+      "loss": 4.4344,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.36793598532676697,
+      "learning_rate": 0.000594232585252113,
+      "loss": 4.4265,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.4189833700656891,
+      "learning_rate": 0.0005940577091227047,
+      "loss": 4.4012,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.4342043995857239,
+      "learning_rate": 0.0005938828329932964,
+      "loss": 4.3671,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.4385491609573364,
+      "learning_rate": 0.000593707956863888,
+      "loss": 4.3797,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.3876365125179291,
+      "learning_rate": 0.0005935330807344797,
+      "loss": 4.3617,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.41458868980407715,
+      "learning_rate": 0.0005933582046050714,
+      "loss": 4.342,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.2995414195009285,
+      "eval_loss": 4.284348011016846,
+      "eval_runtime": 182.6917,
+      "eval_samples_per_second": 91.11,
+      "eval_steps_per_second": 5.698,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.41804736852645874,
+      "learning_rate": 0.000593183328475663,
+      "loss": 4.3264,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.36194130778312683,
+      "learning_rate": 0.0005930084523462546,
+      "loss": 4.3039,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.3918125629425049,
+      "learning_rate": 0.0005928335762168463,
+      "loss": 4.2908,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.40795278549194336,
+      "learning_rate": 0.000592658700087438,
+      "loss": 4.3007,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.38976508378982544,
+      "learning_rate": 0.0005924838239580297,
+      "loss": 4.2836,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.40438467264175415,
+      "learning_rate": 0.0005923089478286214,
+      "loss": 4.2674,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.41360440850257874,
+      "learning_rate": 0.000592134071699213,
+      "loss": 4.2654,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.377794474363327,
+      "learning_rate": 0.0005919591955698047,
+      "loss": 4.247,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.391387015581131,
+      "learning_rate": 0.0005917843194403964,
+      "loss": 4.2377,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.355221688747406,
+      "learning_rate": 0.000591609443310988,
+      "loss": 4.2396,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.4049054682254791,
+      "learning_rate": 0.0005914345671815796,
+      "loss": 4.225,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.3985639214515686,
+      "learning_rate": 0.0005912596910521713,
+      "loss": 4.204,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.3753962814807892,
+      "learning_rate": 0.0005910848149227629,
+      "loss": 4.207,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3719504773616791,
+      "learning_rate": 0.0005909099387933547,
+      "loss": 4.1872,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.3820745348930359,
+      "learning_rate": 0.0005907350626639463,
+      "loss": 4.1728,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.39611145853996277,
+      "learning_rate": 0.000590560186534538,
+      "loss": 4.175,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.3846186697483063,
+      "learning_rate": 0.0005903853104051297,
+      "loss": 4.171,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.37356239557266235,
+      "learning_rate": 0.0005902104342757214,
+      "loss": 4.1733,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.3760608434677124,
+      "learning_rate": 0.000590035558146313,
+      "loss": 4.1446,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.36522382497787476,
+      "learning_rate": 0.0005898606820169046,
+      "loss": 4.1349,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.3161861104367184,
+      "eval_loss": 4.093681335449219,
+      "eval_runtime": 182.834,
+      "eval_samples_per_second": 91.039,
+      "eval_steps_per_second": 5.694,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.3807234764099121,
+      "learning_rate": 0.0005896858058874963,
+      "loss": 4.1488,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.37441378831863403,
+      "learning_rate": 0.0005895109297580879,
+      "loss": 4.1366,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.3428540825843811,
+      "learning_rate": 0.0005893360536286797,
+      "loss": 4.1134,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.37319910526275635,
+      "learning_rate": 0.0005891611774992713,
+      "loss": 4.1224,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.3176666796207428,
+      "learning_rate": 0.000588986301369863,
+      "loss": 4.1011,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.33169302344322205,
+      "learning_rate": 0.0005888114252404547,
+      "loss": 4.109,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.37893936038017273,
+      "learning_rate": 0.0005886365491110463,
+      "loss": 4.0925,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.37214741110801697,
+      "learning_rate": 0.000588461672981638,
+      "loss": 4.0892,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.36634066700935364,
+      "learning_rate": 0.0005882867968522296,
+      "loss": 4.0616,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.35482263565063477,
+      "learning_rate": 0.0005881119207228212,
+      "loss": 4.0124,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.3779667913913727,
+      "learning_rate": 0.0005879370445934129,
+      "loss": 4.016,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.3397766947746277,
+      "learning_rate": 0.0005877621684640046,
+      "loss": 4.0033,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3722945749759674,
+      "learning_rate": 0.0005875872923345963,
+      "loss": 4.0129,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.34330493211746216,
+      "learning_rate": 0.000587412416205188,
+      "loss": 4.003,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.3765234649181366,
+      "learning_rate": 0.0005872375400757797,
+      "loss": 4.0107,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.35264670848846436,
+      "learning_rate": 0.0005870626639463713,
+      "loss": 4.0043,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.3567696213722229,
+      "learning_rate": 0.0005868877878169629,
+      "loss": 4.0016,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.37721458077430725,
+      "learning_rate": 0.0005867129116875546,
+      "loss": 4.0014,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.34036508202552795,
+      "learning_rate": 0.0005865380355581462,
+      "loss": 3.9731,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.33936476707458496,
+      "learning_rate": 0.0005863631594287379,
+      "loss": 3.9879,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.3259847194699489,
+      "eval_loss": 3.9844980239868164,
+      "eval_runtime": 182.8731,
+      "eval_samples_per_second": 91.019,
+      "eval_steps_per_second": 5.692,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.35038861632347107,
+      "learning_rate": 0.0005861882832993296,
+      "loss": 3.9796,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.3499089777469635,
+      "learning_rate": 0.0005860134071699212,
+      "loss": 4.0012,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.35973456501960754,
+      "learning_rate": 0.000585838531040513,
+      "loss": 3.9843,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.364793598651886,
+      "learning_rate": 0.0005856636549111046,
+      "loss": 3.9738,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.3340921998023987,
+      "learning_rate": 0.0005854887787816963,
+      "loss": 3.9804,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.33135804533958435,
+      "learning_rate": 0.0005853139026522879,
+      "loss": 3.9741,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.348160058259964,
+      "learning_rate": 0.0005851390265228796,
+      "loss": 3.9693,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.3392653167247772,
+      "learning_rate": 0.0005849641503934712,
+      "loss": 3.9645,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.33709239959716797,
+      "learning_rate": 0.0005847892742640629,
+      "loss": 3.9534,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.3292732238769531,
+      "learning_rate": 0.0005846143981346546,
+      "loss": 3.9565,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.346902459859848,
+      "learning_rate": 0.0005844395220052462,
+      "loss": 3.945,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.3570997416973114,
+      "learning_rate": 0.000584264645875838,
+      "loss": 3.9458,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.3431386351585388,
+      "learning_rate": 0.0005840897697464296,
+      "loss": 3.9343,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.32554003596305847,
+      "learning_rate": 0.0005839148936170212,
+      "loss": 3.9334,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.33119750022888184,
+      "learning_rate": 0.0005837400174876129,
+      "loss": 3.9402,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.32726046442985535,
+      "learning_rate": 0.0005835651413582045,
+      "loss": 3.9414,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.37814900279045105,
+      "learning_rate": 0.0005833902652287962,
+      "loss": 3.9381,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.35007208585739136,
+      "learning_rate": 0.0005832153890993879,
+      "loss": 3.941,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.47103646397590637,
+      "learning_rate": 0.0005830405129699796,
+      "loss": 3.9365,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.3166019916534424,
+      "learning_rate": 0.0005828656368405712,
+      "loss": 3.9229,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.33251971202484953,
+      "eval_loss": 3.9095144271850586,
+      "eval_runtime": 182.8302,
+      "eval_samples_per_second": 91.041,
+      "eval_steps_per_second": 5.694,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.32818934321403503,
+      "learning_rate": 0.0005826907607111629,
+      "loss": 3.9321,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.3326047658920288,
+      "learning_rate": 0.0005825158845817546,
+      "loss": 3.9226,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.3376672863960266,
+      "learning_rate": 0.0005823410084523462,
+      "loss": 3.93,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.3359002470970154,
+      "learning_rate": 0.0005821661323229379,
+      "loss": 3.9337,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.33936458826065063,
+      "learning_rate": 0.0005819912561935295,
+      "loss": 3.9042,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.34475040435791016,
+      "learning_rate": 0.0005818163800641212,
+      "loss": 3.9182,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.3286258578300476,
+      "learning_rate": 0.0005816415039347129,
+      "loss": 3.9125,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.3313068747520447,
+      "learning_rate": 0.0005814666278053045,
+      "loss": 3.8986,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.3305990695953369,
+      "learning_rate": 0.0005812917516758962,
+      "loss": 3.9158,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.3188944458961487,
+      "learning_rate": 0.0005811168755464879,
+      "loss": 3.8946,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.353261798620224,
+      "learning_rate": 0.0005809419994170794,
+      "loss": 3.9022,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.3191840648651123,
+      "learning_rate": 0.0005807671232876712,
+      "loss": 3.8972,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.3437453508377075,
+      "learning_rate": 0.0005805922471582628,
+      "loss": 3.8846,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.3136034309864044,
+      "learning_rate": 0.0005804173710288545,
+      "loss": 3.8851,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.31763720512390137,
+      "learning_rate": 0.0005802424948994462,
+      "loss": 3.8838,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.352448970079422,
+      "learning_rate": 0.0005800676187700379,
+      "loss": 3.8774,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.32752853631973267,
+      "learning_rate": 0.0005798927426406295,
+      "loss": 3.8735,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.31873783469200134,
+      "learning_rate": 0.0005797178665112212,
+      "loss": 3.8819,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.3134726881980896,
+      "learning_rate": 0.0005795429903818129,
+      "loss": 3.8784,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.32978641986846924,
+      "learning_rate": 0.0005793681142524044,
+      "loss": 3.8805,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.33774250948934204,
+      "eval_loss": 3.851607084274292,
+      "eval_runtime": 183.0051,
+      "eval_samples_per_second": 90.954,
+      "eval_steps_per_second": 5.688,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.31962209939956665,
+      "learning_rate": 0.0005791932381229961,
+      "loss": 3.8609,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.353483647108078,
+      "learning_rate": 0.0005790183619935878,
+      "loss": 3.866,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.34389597177505493,
+      "learning_rate": 0.0005788434858641795,
+      "loss": 3.8668,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.306185245513916,
+      "learning_rate": 0.0005786686097347712,
+      "loss": 3.8713,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.3174114525318146,
+      "learning_rate": 0.0005784937336053628,
+      "loss": 3.854,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.3356582522392273,
+      "learning_rate": 0.0005783188574759545,
+      "loss": 3.8665,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.3357125222682953,
+      "learning_rate": 0.0005781439813465462,
+      "loss": 3.8615,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.3359164297580719,
+      "learning_rate": 0.0005779691052171379,
+      "loss": 3.8584,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.3328222632408142,
+      "learning_rate": 0.0005777942290877294,
+      "loss": 3.8494,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.3154282569885254,
+      "learning_rate": 0.0005776193529583211,
+      "loss": 3.8592,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.35024163126945496,
+      "learning_rate": 0.0005774444768289128,
+      "loss": 3.8619,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.3441106379032135,
+      "learning_rate": 0.0005772696006995045,
+      "loss": 3.8442,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.3300905227661133,
+      "learning_rate": 0.0005770947245700962,
+      "loss": 3.8414,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.32087442278862,
+      "learning_rate": 0.0005769198484406878,
+      "loss": 3.8356,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.3275013864040375,
+      "learning_rate": 0.0005767449723112795,
+      "loss": 3.837,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.3164341151714325,
+      "learning_rate": 0.0005765700961818712,
+      "loss": 3.836,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.3237488269805908,
+      "learning_rate": 0.0005763952200524627,
+      "loss": 3.8226,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.3478614091873169,
+      "learning_rate": 0.0005762203439230544,
+      "loss": 3.7605,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.3280088007450104,
+      "learning_rate": 0.0005760454677936461,
+      "loss": 3.7386,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.3092538118362427,
+      "learning_rate": 0.0005758705916642378,
+      "loss": 3.7313,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34221756385061836,
+      "eval_loss": 3.8092734813690186,
+      "eval_runtime": 182.6845,
+      "eval_samples_per_second": 91.113,
+      "eval_steps_per_second": 5.698,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.3326481282711029,
+      "learning_rate": 0.0005756957155348294,
+      "loss": 3.7507,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.3384568989276886,
+      "learning_rate": 0.0005755208394054211,
+      "loss": 3.7393,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.33243587613105774,
+      "learning_rate": 0.0005753459632760128,
+      "loss": 3.7399,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.31076061725616455,
+      "learning_rate": 0.0005751710871466045,
+      "loss": 3.7378,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.3274184465408325,
+      "learning_rate": 0.0005749962110171962,
+      "loss": 3.7429,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.32674604654312134,
+      "learning_rate": 0.0005748213348877877,
+      "loss": 3.7583,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.32339632511138916,
+      "learning_rate": 0.0005746464587583794,
+      "loss": 3.7516,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.3199286162853241,
+      "learning_rate": 0.0005744715826289711,
+      "loss": 3.7587,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.3267718553543091,
+      "learning_rate": 0.0005742967064995627,
+      "loss": 3.749,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.32316339015960693,
+      "learning_rate": 0.0005741218303701544,
+      "loss": 3.747,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.3192504048347473,
+      "learning_rate": 0.0005739469542407461,
+      "loss": 3.7527,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.3137800097465515,
+      "learning_rate": 0.0005737720781113378,
+      "loss": 3.7386,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.3079003393650055,
+      "learning_rate": 0.0005735972019819295,
+      "loss": 3.7412,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.317622572183609,
+      "learning_rate": 0.000573422325852521,
+      "loss": 3.7524,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.3800288140773773,
+      "learning_rate": 0.0005732474497231127,
+      "loss": 3.7621,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.3083147704601288,
+      "learning_rate": 0.0005730725735937044,
+      "loss": 3.7586,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.3407652974128723,
+      "learning_rate": 0.0005728976974642961,
+      "loss": 3.7421,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.3287487030029297,
+      "learning_rate": 0.0005727228213348877,
+      "loss": 3.7566,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.3335205912590027,
+      "learning_rate": 0.0005725479452054794,
+      "loss": 3.7335,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.3264673352241516,
+      "learning_rate": 0.0005723730690760711,
+      "loss": 3.7565,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.34492789746043634,
+      "eval_loss": 3.7804434299468994,
+      "eval_runtime": 183.2368,
+      "eval_samples_per_second": 90.839,
+      "eval_steps_per_second": 5.681,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.3331637382507324,
+      "learning_rate": 0.0005721981929466627,
+      "loss": 3.7361,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.332265704870224,
+      "learning_rate": 0.0005720233168172545,
+      "loss": 3.7488,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.3338277041912079,
+      "learning_rate": 0.000571848440687846,
+      "loss": 3.749,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.3343018889427185,
+      "learning_rate": 0.0005716735645584377,
+      "loss": 3.7482,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.3533099591732025,
+      "learning_rate": 0.0005714986884290294,
+      "loss": 3.7509,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.3301255404949188,
+      "learning_rate": 0.000571323812299621,
+      "loss": 3.7334,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.31493982672691345,
+      "learning_rate": 0.0005711489361702127,
+      "loss": 3.7405,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.3294617235660553,
+      "learning_rate": 0.0005709740600408044,
+      "loss": 3.7471,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.316875696182251,
+      "learning_rate": 0.0005707991839113961,
+      "loss": 3.756,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.310937762260437,
+      "learning_rate": 0.0005706243077819877,
+      "loss": 3.7412,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.31915387511253357,
+      "learning_rate": 0.0005704494316525793,
+      "loss": 3.7455,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.32014259696006775,
+      "learning_rate": 0.000570274555523171,
+      "loss": 3.7348,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.32315295934677124,
+      "learning_rate": 0.0005700996793937627,
+      "loss": 3.7426,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.33680716156959534,
+      "learning_rate": 0.0005699248032643544,
+      "loss": 3.7458,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.30035462975502014,
+      "learning_rate": 0.000569749927134946,
+      "loss": 3.7355,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.3163704574108124,
+      "learning_rate": 0.0005695750510055377,
+      "loss": 3.745,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.3217114806175232,
+      "learning_rate": 0.0005694001748761294,
+      "loss": 3.7532,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.3296523988246918,
+      "learning_rate": 0.000569225298746721,
+      "loss": 3.7536,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.3380911648273468,
+      "learning_rate": 0.0005690504226173127,
+      "loss": 3.7383,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3250406086444855,
+      "learning_rate": 0.0005688755464879043,
+      "loss": 3.7287,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.3478768975009979,
+      "eval_loss": 3.7482099533081055,
+      "eval_runtime": 182.7889,
+      "eval_samples_per_second": 91.061,
+      "eval_steps_per_second": 5.695,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.2994716465473175,
+      "learning_rate": 0.000568700670358496,
+      "loss": 3.7238,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.34974968433380127,
+      "learning_rate": 0.0005685257942290877,
+      "loss": 3.7324,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.3160337209701538,
+      "learning_rate": 0.0005683509180996793,
+      "loss": 3.7271,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.3240397274494171,
+      "learning_rate": 0.000568176041970271,
+      "loss": 3.7258,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.32082828879356384,
+      "learning_rate": 0.0005680011658408627,
+      "loss": 3.7358,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.31649598479270935,
+      "learning_rate": 0.0005678262897114544,
+      "loss": 3.7392,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.31169888377189636,
+      "learning_rate": 0.000567651413582046,
+      "loss": 3.735,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.3002908229827881,
+      "learning_rate": 0.0005674765374526377,
+      "loss": 3.7288,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.30253180861473083,
+      "learning_rate": 0.0005673016613232293,
+      "loss": 3.7204,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.31831657886505127,
+      "learning_rate": 0.0005671267851938209,
+      "loss": 3.7258,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.31231117248535156,
+      "learning_rate": 0.0005669519090644127,
+      "loss": 3.721,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.3149794936180115,
+      "learning_rate": 0.0005667770329350043,
+      "loss": 3.7337,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.3181709945201874,
+      "learning_rate": 0.000566602156805596,
+      "loss": 3.7265,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.31284210085868835,
+      "learning_rate": 0.0005664272806761877,
+      "loss": 3.7288,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.3245266377925873,
+      "learning_rate": 0.0005662524045467793,
+      "loss": 3.7222,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.319336473941803,
+      "learning_rate": 0.000566077528417371,
+      "loss": 3.7298,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.3112030625343323,
+      "learning_rate": 0.0005659026522879626,
+      "loss": 3.7303,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.3266408443450928,
+      "learning_rate": 0.0005657277761585543,
+      "loss": 3.7114,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.3135480284690857,
+      "learning_rate": 0.0005655529000291459,
+      "loss": 3.7203,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.3140251040458679,
+      "learning_rate": 0.0005653780238997376,
+      "loss": 3.7117,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.3501354108677876,
+      "eval_loss": 3.7232017517089844,
+      "eval_runtime": 182.9143,
+      "eval_samples_per_second": 90.999,
+      "eval_steps_per_second": 5.691,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.30501222610473633,
+      "learning_rate": 0.0005652031477703293,
+      "loss": 3.7205,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.31396257877349854,
+      "learning_rate": 0.000565028271640921,
+      "loss": 3.7231,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.29565396904945374,
+      "learning_rate": 0.0005648533955115127,
+      "loss": 3.7167,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.29209980368614197,
+      "learning_rate": 0.0005646785193821043,
+      "loss": 3.7011,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.341580867767334,
+      "learning_rate": 0.000564503643252696,
+      "loss": 3.7199,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.306345134973526,
+      "learning_rate": 0.0005643287671232876,
+      "loss": 3.7135,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.3232329487800598,
+      "learning_rate": 0.0005641538909938792,
+      "loss": 3.602,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.3200088441371918,
+      "learning_rate": 0.0005639790148644709,
+      "loss": 3.6044,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.31947940587997437,
+      "learning_rate": 0.0005638041387350626,
+      "loss": 3.612,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.3416725695133209,
+      "learning_rate": 0.0005636292626056543,
+      "loss": 3.6212,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.3318784832954407,
+      "learning_rate": 0.000563454386476246,
+      "loss": 3.6079,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.3215178847312927,
+      "learning_rate": 0.0005632795103468376,
+      "loss": 3.623,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.3258126974105835,
+      "learning_rate": 0.0005631046342174293,
+      "loss": 3.6237,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.3248828947544098,
+      "learning_rate": 0.000562929758088021,
+      "loss": 3.6206,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.3171083331108093,
+      "learning_rate": 0.0005627548819586126,
+      "loss": 3.6301,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.3476962745189667,
+      "learning_rate": 0.0005625800058292042,
+      "loss": 3.6342,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.32831183075904846,
+      "learning_rate": 0.0005624051296997959,
+      "loss": 3.6253,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.3208300769329071,
+      "learning_rate": 0.0005622302535703876,
+      "loss": 3.6201,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.34289947152137756,
+      "learning_rate": 0.0005620553774409792,
+      "loss": 3.6407,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.34401383996009827,
+      "learning_rate": 0.000561880501311571,
+      "loss": 3.6328,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.35195491908561366,
+      "eval_loss": 3.711780548095703,
+      "eval_runtime": 183.0966,
+      "eval_samples_per_second": 90.908,
+      "eval_steps_per_second": 5.686,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.3176616132259369,
+      "learning_rate": 0.0005617056251821626,
+      "loss": 3.6359,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.3091390132904053,
+      "learning_rate": 0.0005615307490527543,
+      "loss": 3.6274,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.31460776925086975,
+      "learning_rate": 0.000561355872923346,
+      "loss": 3.6288,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.33371245861053467,
+      "learning_rate": 0.0005611809967939375,
+      "loss": 3.6434,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.3317493796348572,
+      "learning_rate": 0.0005610061206645292,
+      "loss": 3.6385,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.3349264860153198,
+      "learning_rate": 0.0005608312445351209,
+      "loss": 3.6397,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.32256776094436646,
+      "learning_rate": 0.0005606563684057126,
+      "loss": 3.6336,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.32920846343040466,
+      "learning_rate": 0.0005604814922763042,
+      "loss": 3.633,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.32023611664772034,
+      "learning_rate": 0.0005603066161468959,
+      "loss": 3.6273,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.3239540755748749,
+      "learning_rate": 0.0005601317400174876,
+      "loss": 3.6354,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.3216831386089325,
+      "learning_rate": 0.0005599568638880793,
+      "loss": 3.6423,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.3342783749103546,
+      "learning_rate": 0.0005597819877586709,
+      "loss": 3.6189,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.30918747186660767,
+      "learning_rate": 0.0005596071116292625,
+      "loss": 3.6395,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.3271181583404541,
+      "learning_rate": 0.0005594322354998542,
+      "loss": 3.6403,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.30378925800323486,
+      "learning_rate": 0.0005592573593704459,
+      "loss": 3.6356,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.3317507803440094,
+      "learning_rate": 0.0005590824832410375,
+      "loss": 3.6269,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.33540189266204834,
+      "learning_rate": 0.0005589076071116292,
+      "loss": 3.64,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.2987757921218872,
+      "learning_rate": 0.0005587327309822209,
+      "loss": 3.6381,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.3357625901699066,
+      "learning_rate": 0.0005585578548528126,
+      "loss": 3.6504,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.3251186013221741,
+      "learning_rate": 0.0005583829787234043,
+      "loss": 3.6393,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.3538467326506846,
+      "eval_loss": 3.695300817489624,
+      "eval_runtime": 183.2049,
+      "eval_samples_per_second": 90.855,
+      "eval_steps_per_second": 5.682,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.3250819742679596,
+      "learning_rate": 0.0005582081025939958,
+      "loss": 3.628,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.3002704381942749,
+      "learning_rate": 0.0005580332264645875,
+      "loss": 3.6323,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.3166797161102295,
+      "learning_rate": 0.0005578583503351792,
+      "loss": 3.6524,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.32995665073394775,
+      "learning_rate": 0.0005576834742057709,
+      "loss": 3.6375,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.3259824812412262,
+      "learning_rate": 0.0005575085980763625,
+      "loss": 3.6469,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.35942357778549194,
+      "learning_rate": 0.0005573337219469542,
+      "loss": 3.6256,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.35430946946144104,
+      "learning_rate": 0.0005571588458175459,
+      "loss": 3.6376,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.31567761301994324,
+      "learning_rate": 0.0005569839696881374,
+      "loss": 3.6417,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.32618406414985657,
+      "learning_rate": 0.0005568090935587292,
+      "loss": 3.6446,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.31533339619636536,
+      "learning_rate": 0.0005566342174293208,
+      "loss": 3.6602,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.3274492621421814,
+      "learning_rate": 0.0005564593412999125,
+      "loss": 3.6288,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.3271532952785492,
+      "learning_rate": 0.0005562844651705042,
+      "loss": 3.6392,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.3347550630569458,
+      "learning_rate": 0.0005561095890410958,
+      "loss": 3.6288,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.32583537697792053,
+      "learning_rate": 0.0005559347129116875,
+      "loss": 3.6302,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.30043119192123413,
+      "learning_rate": 0.0005557598367822792,
+      "loss": 3.6391,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.328071653842926,
+      "learning_rate": 0.0005555849606528709,
+      "loss": 3.6402,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.3114347755908966,
+      "learning_rate": 0.0005554100845234624,
+      "loss": 3.6368,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.3095006048679352,
+      "learning_rate": 0.0005552352083940541,
+      "loss": 3.642,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.341740220785141,
+      "learning_rate": 0.0005550603322646458,
+      "loss": 3.6366,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.3182758390903473,
+      "learning_rate": 0.0005548854561352375,
+      "loss": 3.6428,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.35534045531206226,
+      "eval_loss": 3.67643141746521,
+      "eval_runtime": 182.9247,
+      "eval_samples_per_second": 90.994,
+      "eval_steps_per_second": 5.691,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.3386680781841278,
+      "learning_rate": 0.0005547105800058292,
+      "loss": 3.6498,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.317020446062088,
+      "learning_rate": 0.0005545357038764208,
+      "loss": 3.6364,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.3221585154533386,
+      "learning_rate": 0.0005543608277470125,
+      "loss": 3.639,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.2992505133152008,
+      "learning_rate": 0.0005541859516176042,
+      "loss": 3.6415,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.30075782537460327,
+      "learning_rate": 0.0005540110754881958,
+      "loss": 3.6458,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.3180374205112457,
+      "learning_rate": 0.0005538361993587874,
+      "loss": 3.6236,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.3106091022491455,
+      "learning_rate": 0.0005536613232293791,
+      "loss": 3.6353,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.29708361625671387,
+      "learning_rate": 0.0005534864470999708,
+      "loss": 3.6366,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.34170088171958923,
+      "learning_rate": 0.0005533115709705625,
+      "loss": 3.6262,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.3265573978424072,
+      "learning_rate": 0.0005531366948411541,
+      "loss": 3.635,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.31741398572921753,
+      "learning_rate": 0.0005529618187117458,
+      "loss": 3.631,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.32067936658859253,
+      "learning_rate": 0.0005527869425823375,
+      "loss": 3.6362,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.31947606801986694,
+      "learning_rate": 0.0005526120664529292,
+      "loss": 3.6329,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.2974706292152405,
+      "learning_rate": 0.0005524371903235207,
+      "loss": 3.636,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.3165639042854309,
+      "learning_rate": 0.0005522623141941124,
+      "loss": 3.5888,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.3143453299999237,
+      "learning_rate": 0.0005520874380647041,
+      "loss": 3.5185,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.3318156599998474,
+      "learning_rate": 0.0005519125619352957,
+      "loss": 3.52,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.30892449617385864,
+      "learning_rate": 0.0005517376858058875,
+      "loss": 3.5368,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.3253442645072937,
+      "learning_rate": 0.0005515628096764791,
+      "loss": 3.5317,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.31239765882492065,
+      "learning_rate": 0.0005513879335470708,
+      "loss": 3.5319,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.35649757600732224,
+      "eval_loss": 3.668315887451172,
+      "eval_runtime": 182.8782,
+      "eval_samples_per_second": 91.017,
+      "eval_steps_per_second": 5.692,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.32942360639572144,
+      "learning_rate": 0.0005512130574176625,
+      "loss": 3.5415,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.3166195750236511,
+      "learning_rate": 0.000551038181288254,
+      "loss": 3.5305,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.3233526349067688,
+      "learning_rate": 0.0005508633051588457,
+      "loss": 3.5415,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.32190340757369995,
+      "learning_rate": 0.0005506884290294374,
+      "loss": 3.5477,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.34432554244995117,
+      "learning_rate": 0.0005505135529000291,
+      "loss": 3.5525,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.3212747275829315,
+      "learning_rate": 0.0005503386767706207,
+      "loss": 3.5454,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.3225768804550171,
+      "learning_rate": 0.0005501638006412124,
+      "loss": 3.5577,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.3135558068752289,
+      "learning_rate": 0.0005499889245118041,
+      "loss": 3.5534,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.342723548412323,
+      "learning_rate": 0.0005498140483823958,
+      "loss": 3.5381,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.3262888193130493,
+      "learning_rate": 0.0005496391722529875,
+      "loss": 3.5649,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.3289225399494171,
+      "learning_rate": 0.000549464296123579,
+      "loss": 3.5588,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.30956584215164185,
+      "learning_rate": 0.0005492894199941707,
+      "loss": 3.5632,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.33964231610298157,
+      "learning_rate": 0.0005491145438647624,
+      "loss": 3.5687,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.33205127716064453,
+      "learning_rate": 0.000548939667735354,
+      "loss": 3.5543,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": Infinity,
+      "learning_rate": 0.0005487647916059457,
+      "loss": 3.5658,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.3393228054046631,
+      "learning_rate": 0.0005485899154765374,
+      "loss": 3.5577,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.31260740756988525,
+      "learning_rate": 0.0005484150393471291,
+      "loss": 3.5722,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.331304132938385,
+      "learning_rate": 0.0005482401632177208,
+      "loss": 3.5601,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.3720129728317261,
+      "learning_rate": 0.0005480652870883124,
+      "loss": 3.5754,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.31915804743766785,
+      "learning_rate": 0.000547890410958904,
+      "loss": 3.5417,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.3573578332850318,
+      "eval_loss": 3.6594953536987305,
+      "eval_runtime": 183.0024,
+      "eval_samples_per_second": 90.955,
+      "eval_steps_per_second": 5.688,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.3377160429954529,
+      "learning_rate": 0.0005477155348294957,
+      "loss": 3.5704,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.31284624338150024,
+      "learning_rate": 0.0005475406587000874,
+      "loss": 3.562,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.331798255443573,
+      "learning_rate": 0.000547365782570679,
+      "loss": 3.5583,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.31809282302856445,
+      "learning_rate": 0.0005471909064412707,
+      "loss": 3.5653,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.3349161446094513,
+      "learning_rate": 0.0005470160303118624,
+      "loss": 3.5674,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.32030266523361206,
+      "learning_rate": 0.000546841154182454,
+      "loss": 3.5776,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": NaN,
+      "learning_rate": 0.0005466662780530458,
+      "loss": 3.5804,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.3176333010196686,
+      "learning_rate": 0.0005464914019236374,
+      "loss": 3.5782,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.3277718126773834,
+      "learning_rate": 0.000546316525794229,
+      "loss": 3.5864,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.3107328712940216,
+      "learning_rate": 0.0005461416496648207,
+      "loss": 3.5818,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.32739174365997314,
+      "learning_rate": 0.0005459667735354123,
+      "loss": 3.5574,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.3131246268749237,
+      "learning_rate": 0.000545791897406004,
+      "loss": 3.5691,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.33344247937202454,
+      "learning_rate": 0.0005456170212765957,
+      "loss": 3.5742,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.3450835347175598,
+      "learning_rate": 0.0005454421451471874,
+      "loss": 3.5739,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.31020715832710266,
+      "learning_rate": 0.000545267269017779,
+      "loss": 3.5733,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.3077200651168823,
+      "learning_rate": 0.0005450923928883708,
+      "loss": 3.5576,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3353123068809509,
+      "learning_rate": 0.0005449175167589623,
+      "loss": 3.5723,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.32540038228034973,
+      "learning_rate": 0.000544742640629554,
+      "loss": 3.5793,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.3153678774833679,
+      "learning_rate": 0.0005445677645001457,
+      "loss": 3.566,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.3324628472328186,
+      "learning_rate": 0.0005443928883707373,
+      "loss": 3.5679,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.35883380292619155,
+      "eval_loss": 3.6458113193511963,
+      "eval_runtime": 182.9054,
+      "eval_samples_per_second": 91.003,
+      "eval_steps_per_second": 5.691,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.30943557620048523,
+      "learning_rate": 0.000544218012241329,
+      "loss": 3.5781,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.33318641781806946,
+      "learning_rate": 0.0005440431361119207,
+      "loss": 3.5782,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.32952025532722473,
+      "learning_rate": 0.0005438682599825123,
+      "loss": 3.5731,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.33214232325553894,
+      "learning_rate": 0.000543693383853104,
+      "loss": 3.5754,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.3278939723968506,
+      "learning_rate": 0.0005435185077236957,
+      "loss": 3.5669,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.35436153411865234,
+      "learning_rate": 0.0005433436315942873,
+      "loss": 3.5705,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.32666391134262085,
+      "learning_rate": 0.000543168755464879,
+      "loss": 3.567,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.3220253586769104,
+      "learning_rate": 0.0005429938793354706,
+      "loss": 3.5843,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.3143666982650757,
+      "learning_rate": 0.0005428190032060623,
+      "loss": 3.5715,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.30706289410591125,
+      "learning_rate": 0.000542644127076654,
+      "loss": 3.5738,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.3221777677536011,
+      "learning_rate": 0.0005424692509472457,
+      "loss": 3.5769,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.32023727893829346,
+      "learning_rate": 0.0005422943748178373,
+      "loss": 3.5685,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.3428962230682373,
+      "learning_rate": 0.000542119498688429,
+      "loss": 3.5825,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.3187549412250519,
+      "learning_rate": 0.0005419446225590207,
+      "loss": 3.5684,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.32895606756210327,
+      "learning_rate": 0.0005417697464296122,
+      "loss": 3.5723,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.31578466296195984,
+      "learning_rate": 0.000541594870300204,
+      "loss": 3.5769,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.3406703472137451,
+      "learning_rate": 0.0005414199941707956,
+      "loss": 3.5636,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.32653164863586426,
+      "learning_rate": 0.0005412451180413873,
+      "loss": 3.5667,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.31869933009147644,
+      "learning_rate": 0.000541070241911979,
+      "loss": 3.5796,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.3086072504520416,
+      "learning_rate": 0.0005408953657825706,
+      "loss": 3.5802,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.36002831077661235,
+      "eval_loss": 3.6297221183776855,
+      "eval_runtime": 182.8739,
+      "eval_samples_per_second": 91.019,
+      "eval_steps_per_second": 5.692,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.3254891335964203,
+      "learning_rate": 0.0005407204896531623,
+      "loss": 3.5614,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.3315959870815277,
+      "learning_rate": 0.000540545613523754,
+      "loss": 3.5736,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.308977335691452,
+      "learning_rate": 0.0005403707373943456,
+      "loss": 3.5687,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.33981844782829285,
+      "learning_rate": 0.0005401958612649372,
+      "loss": 3.5112,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.32200807332992554,
+      "learning_rate": 0.000540020985135529,
+      "loss": 3.4613,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.30297327041625977,
+      "learning_rate": 0.0005398461090061206,
+      "loss": 3.4576,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.32989853620529175,
+      "learning_rate": 0.0005396712328767123,
+      "loss": 3.4718,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.31648069620132446,
+      "learning_rate": 0.000539496356747304,
+      "loss": 3.4785,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.3199455440044403,
+      "learning_rate": 0.0005393214806178956,
+      "loss": 3.4711,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.33137065172195435,
+      "learning_rate": 0.0005391466044884873,
+      "loss": 3.491,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.33032289147377014,
+      "learning_rate": 0.000538971728359079,
+      "loss": 3.4838,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.32575368881225586,
+      "learning_rate": 0.0005387968522296705,
+      "loss": 3.492,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.3349052965641022,
+      "learning_rate": 0.0005386219761002622,
+      "loss": 3.4838,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.3152545094490051,
+      "learning_rate": 0.0005384470999708539,
+      "loss": 3.4819,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.321737676858902,
+      "learning_rate": 0.0005382722238414456,
+      "loss": 3.486,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.36409512162208557,
+      "learning_rate": 0.0005380973477120373,
+      "loss": 3.4959,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.32220658659935,
+      "learning_rate": 0.000537922471582629,
+      "loss": 3.4938,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.3427848219871521,
+      "learning_rate": 0.0005377475954532206,
+      "loss": 3.4992,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.33268028497695923,
+      "learning_rate": 0.0005375727193238123,
+      "loss": 3.4945,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.2996683120727539,
+      "learning_rate": 0.000537397843194404,
+      "loss": 3.4929,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.36062627011984466,
+      "eval_loss": 3.6342105865478516,
+      "eval_runtime": 182.824,
+      "eval_samples_per_second": 91.044,
+      "eval_steps_per_second": 5.694,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.30999842286109924,
+      "learning_rate": 0.0005372229670649955,
+      "loss": 3.5096,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.32671719789505005,
+      "learning_rate": 0.0005370480909355872,
+      "loss": 3.5092,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.32220521569252014,
+      "learning_rate": 0.0005368732148061789,
+      "loss": 3.5055,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.3446556031703949,
+      "learning_rate": 0.0005366983386767705,
+      "loss": 3.5103,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.333375483751297,
+      "learning_rate": 0.0005365234625473623,
+      "loss": 3.5227,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.32386448979377747,
+      "learning_rate": 0.0005363485864179539,
+      "loss": 3.5153,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.31273576617240906,
+      "learning_rate": 0.0005361737102885456,
+      "loss": 3.5114,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.32386016845703125,
+      "learning_rate": 0.0005359988341591373,
+      "loss": 3.5176,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.3084457516670227,
+      "learning_rate": 0.000535823958029729,
+      "loss": 3.5144,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.34510570764541626,
+      "learning_rate": 0.0005356490819003205,
+      "loss": 3.5283,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.33156850934028625,
+      "learning_rate": 0.0005354742057709122,
+      "loss": 3.5198,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.32253149151802063,
+      "learning_rate": 0.0005352993296415039,
+      "loss": 3.5129,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.343431293964386,
+      "learning_rate": 0.0005351244535120955,
+      "loss": 3.5214,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.320697158575058,
+      "learning_rate": 0.0005349495773826873,
+      "loss": 3.5089,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.3079582154750824,
+      "learning_rate": 0.0005347747012532789,
+      "loss": 3.523,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.33627447485923767,
+      "learning_rate": 0.0005345998251238706,
+      "loss": 3.5175,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.3293239176273346,
+      "learning_rate": 0.0005344249489944623,
+      "loss": 3.5214,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.321121484041214,
+      "learning_rate": 0.0005342500728650538,
+      "loss": 3.5227,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.32323557138442993,
+      "learning_rate": 0.0005340751967356455,
+      "loss": 3.5089,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.3353577256202698,
+      "learning_rate": 0.0005339003206062372,
+      "loss": 3.5125,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.36119366134879455,
+      "eval_loss": 3.62491774559021,
+      "eval_runtime": 182.9044,
+      "eval_samples_per_second": 91.004,
+      "eval_steps_per_second": 5.691,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.3373064696788788,
+      "learning_rate": 0.0005337254444768288,
+      "loss": 3.5161,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.3205850422382355,
+      "learning_rate": 0.0005335505683474205,
+      "loss": 3.5198,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.33948105573654175,
+      "learning_rate": 0.0005333756922180122,
+      "loss": 3.5155,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.30776116251945496,
+      "learning_rate": 0.0005332008160886039,
+      "loss": 3.5019,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.42596328258514404,
+      "learning_rate": 0.0005330259399591956,
+      "loss": 3.5311,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.34155476093292236,
+      "learning_rate": 0.0005328510638297873,
+      "loss": 3.5215,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.33613601326942444,
+      "learning_rate": 0.0005326761877003788,
+      "loss": 3.5089,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.32420244812965393,
+      "learning_rate": 0.0005325013115709705,
+      "loss": 3.5318,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.30287185311317444,
+      "learning_rate": 0.0005323264354415622,
+      "loss": 3.5207,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.3400566577911377,
+      "learning_rate": 0.0005321515593121538,
+      "loss": 3.5213,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.31341907382011414,
+      "learning_rate": 0.0005319766831827455,
+      "loss": 3.5249,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.3233737647533417,
+      "learning_rate": 0.0005318018070533372,
+      "loss": 3.5391,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.3121044635772705,
+      "learning_rate": 0.0005316269309239288,
+      "loss": 3.5232,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.32169246673583984,
+      "learning_rate": 0.0005314520547945206,
+      "loss": 3.5159,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": 0.3152818977832794,
+      "learning_rate": 0.0005312771786651121,
+      "loss": 3.5194,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.31117457151412964,
+      "learning_rate": 0.0005311023025357038,
+      "loss": 3.5284,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3282223641872406,
+      "learning_rate": 0.0005309274264062955,
+      "loss": 3.526,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.3373311161994934,
+      "learning_rate": 0.0005307525502768872,
+      "loss": 3.5291,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.3154096007347107,
+      "learning_rate": 0.0005305776741474788,
+      "loss": 3.53,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.31639188528060913,
+      "learning_rate": 0.0005304027980180705,
+      "loss": 3.5162,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.3622192215829698,
+      "eval_loss": 3.612488269805908,
+      "eval_runtime": 182.8981,
+      "eval_samples_per_second": 91.007,
+      "eval_steps_per_second": 5.692,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.33335795998573303,
+      "learning_rate": 0.0005302279218886622,
+      "loss": 3.5267,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.32977914810180664,
+      "learning_rate": 0.0005300530457592538,
+      "loss": 3.5278,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.319991797208786,
+      "learning_rate": 0.0005298781696298456,
+      "loss": 3.5243,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.31696072220802307,
+      "learning_rate": 0.0005297032935004371,
+      "loss": 3.5255,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.31803423166275024,
+      "learning_rate": 0.0005295284173710288,
+      "loss": 3.5163,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.30268293619155884,
+      "learning_rate": 0.0005293535412416205,
+      "loss": 3.5247,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.3222343325614929,
+      "learning_rate": 0.0005291786651122121,
+      "loss": 3.5226,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.3210581839084625,
+      "learning_rate": 0.0005290037889828038,
+      "loss": 3.5315,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.30163270235061646,
+      "learning_rate": 0.0005288289128533955,
+      "loss": 3.531,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.3174237906932831,
+      "learning_rate": 0.0005286540367239872,
+      "loss": 3.5308,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.3105069398880005,
+      "learning_rate": 0.0005284791605945788,
+      "loss": 3.5229,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.35065528750419617,
+      "learning_rate": 0.0005283042844651704,
+      "loss": 3.5306,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.3209701180458069,
+      "learning_rate": 0.0005281294083357621,
+      "loss": 3.4202,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.35732001066207886,
+      "learning_rate": 0.0005279545322063538,
+      "loss": 3.4097,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.3293222188949585,
+      "learning_rate": 0.0005277796560769455,
+      "loss": 3.4211,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.3226439952850342,
+      "learning_rate": 0.0005276047799475371,
+      "loss": 3.4286,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.33143454790115356,
+      "learning_rate": 0.0005274299038181288,
+      "loss": 3.4274,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.31629180908203125,
+      "learning_rate": 0.0005272550276887205,
+      "loss": 3.4214,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.3248637020587921,
+      "learning_rate": 0.0005270801515593121,
+      "loss": 3.448,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.333915114402771,
+      "learning_rate": 0.0005269052754299037,
+      "loss": 3.4304,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.36297096205497675,
+      "eval_loss": 3.6168880462646484,
+      "eval_runtime": 183.1564,
+      "eval_samples_per_second": 90.879,
+      "eval_steps_per_second": 5.684,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.339853435754776,
+      "learning_rate": 0.0005267303993004954,
+      "loss": 3.432,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.3413807153701782,
+      "learning_rate": 0.000526555523171087,
+      "loss": 3.4503,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.3138124942779541,
+      "learning_rate": 0.0005263806470416788,
+      "loss": 3.4536,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.3247540593147278,
+      "learning_rate": 0.0005262057709122704,
+      "loss": 3.4454,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.30432644486427307,
+      "learning_rate": 0.0005260308947828621,
+      "loss": 3.4577,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.3186846077442169,
+      "learning_rate": 0.0005258560186534538,
+      "loss": 3.4504,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.3390951454639435,
+      "learning_rate": 0.0005256811425240455,
+      "loss": 3.4427,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.3416147828102112,
+      "learning_rate": 0.0005255062663946371,
+      "loss": 3.4451,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.330108106136322,
+      "learning_rate": 0.0005253313902652287,
+      "loss": 3.4634,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.32614409923553467,
+      "learning_rate": 0.0005251565141358204,
+      "loss": 3.4689,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.3183272182941437,
+      "learning_rate": 0.000524981638006412,
+      "loss": 3.4614,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.3260941803455353,
+      "learning_rate": 0.0005248067618770038,
+      "loss": 3.4687,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.34689396619796753,
+      "learning_rate": 0.0005246318857475954,
+      "loss": 3.4607,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.3259231150150299,
+      "learning_rate": 0.0005244570096181871,
+      "loss": 3.4606,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.33066433668136597,
+      "learning_rate": 0.0005242821334887788,
+      "loss": 3.4666,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.3282213807106018,
+      "learning_rate": 0.0005241072573593704,
+      "loss": 3.4566,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.32075440883636475,
+      "learning_rate": 0.000523932381229962,
+      "loss": 3.4641,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.3442796468734741,
+      "learning_rate": 0.0005237575051005537,
+      "loss": 3.4588,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.3072233498096466,
+      "learning_rate": 0.0005235826289711454,
+      "loss": 3.462,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.35822081565856934,
+      "learning_rate": 0.000523407752841737,
+      "loss": 3.4737,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.3631809414861629,
+      "eval_loss": 3.6075851917266846,
+      "eval_runtime": 183.2532,
+      "eval_samples_per_second": 90.831,
+      "eval_steps_per_second": 5.681,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.3440370559692383,
+      "learning_rate": 0.0005232328767123287,
+      "loss": 3.4619,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.3295625150203705,
+      "learning_rate": 0.0005230580005829204,
+      "loss": 3.4756,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.32679641246795654,
+      "learning_rate": 0.0005228831244535121,
+      "loss": 3.4795,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.3422777056694031,
+      "learning_rate": 0.0005227082483241038,
+      "loss": 3.4777,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.3188982903957367,
+      "learning_rate": 0.0005225333721946954,
+      "loss": 3.4789,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.33180058002471924,
+      "learning_rate": 0.000522358496065287,
+      "loss": 3.4764,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.35259467363357544,
+      "learning_rate": 0.0005221836199358787,
+      "loss": 3.4783,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.31127235293388367,
+      "learning_rate": 0.0005220087438064703,
+      "loss": 3.4812,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.3426322340965271,
+      "learning_rate": 0.000521833867677062,
+      "loss": 3.4858,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.33666297793388367,
+      "learning_rate": 0.0005216589915476537,
+      "loss": 3.4842,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.33777403831481934,
+      "learning_rate": 0.0005214841154182454,
+      "loss": 3.4867,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.33145034313201904,
+      "learning_rate": 0.0005213092392888371,
+      "loss": 3.4843,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.3483746647834778,
+      "learning_rate": 0.0005211343631594287,
+      "loss": 3.4842,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.34439757466316223,
+      "learning_rate": 0.0005209594870300204,
+      "loss": 3.4747,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.330509752035141,
+      "learning_rate": 0.000520784610900612,
+      "loss": 3.4757,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.3516590893268585,
+      "learning_rate": 0.0005206097347712037,
+      "loss": 3.4689,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.35408666729927063,
+      "learning_rate": 0.0005204348586417953,
+      "loss": 3.4825,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.35224005579948425,
+      "learning_rate": 0.000520259982512387,
+      "loss": 3.4933,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.33899161219596863,
+      "learning_rate": 0.0005200851063829787,
+      "loss": 3.4857,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.3293270170688629,
+      "learning_rate": 0.0005199102302535703,
+      "loss": 3.4775,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.3640601274807935,
+      "eval_loss": 3.599479913711548,
+      "eval_runtime": 183.341,
+      "eval_samples_per_second": 90.787,
+      "eval_steps_per_second": 5.678,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.32254496216773987,
+      "learning_rate": 0.0005197353541241621,
+      "loss": 3.4873,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.32425615191459656,
+      "learning_rate": 0.0005195604779947537,
+      "loss": 3.4951,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.3163357675075531,
+      "learning_rate": 0.0005193856018653454,
+      "loss": 3.4845,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.3210630416870117,
+      "learning_rate": 0.000519210725735937,
+      "loss": 3.4869,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.33914175629615784,
+      "learning_rate": 0.0005190358496065286,
+      "loss": 3.4848,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.3026992082595825,
+      "learning_rate": 0.0005188609734771203,
+      "loss": 3.4891,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.3406318724155426,
+      "learning_rate": 0.000518686097347712,
+      "loss": 3.4859,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.3310200273990631,
+      "learning_rate": 0.0005185112212183037,
+      "loss": 3.5022,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.30199411511421204,
+      "learning_rate": 0.0005183363450888953,
+      "loss": 3.4872,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.3239602744579315,
+      "learning_rate": 0.000518161468959487,
+      "loss": 3.4867,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.32365167140960693,
+      "learning_rate": 0.0005179865928300787,
+      "loss": 3.4923,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.31722819805145264,
+      "learning_rate": 0.0005178117167006703,
+      "loss": 3.4779,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.3375761806964874,
+      "learning_rate": 0.000517636840571262,
+      "loss": 3.4864,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.30887332558631897,
+      "learning_rate": 0.0005174619644418536,
+      "loss": 3.4974,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.34449508786201477,
+      "learning_rate": 0.0005172870883124453,
+      "loss": 3.4916,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.3328838348388672,
+      "learning_rate": 0.000517112212183037,
+      "loss": 3.4975,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.33597683906555176,
+      "learning_rate": 0.0005169373360536286,
+      "loss": 3.4857,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.3501330316066742,
+      "learning_rate": 0.0005167624599242203,
+      "loss": 3.4972,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.3559366762638092,
+      "learning_rate": 0.000516587583794812,
+      "loss": 3.4917,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.3154732286930084,
+      "learning_rate": 0.0005164127076654037,
+      "loss": 3.49,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.36516716349649847,
+      "eval_loss": 3.586782932281494,
+      "eval_runtime": 182.9453,
+      "eval_samples_per_second": 90.983,
+      "eval_steps_per_second": 5.69,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.3098287880420685,
+      "learning_rate": 0.0005162378315359953,
+      "loss": 3.4496,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.34997889399528503,
+      "learning_rate": 0.0005160629554065869,
+      "loss": 3.3794,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.33267155289649963,
+      "learning_rate": 0.0005158880792771786,
+      "loss": 3.3792,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.3368155360221863,
+      "learning_rate": 0.0005157132031477703,
+      "loss": 3.3876,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.33117562532424927,
+      "learning_rate": 0.000515538327018362,
+      "loss": 3.4,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.3448326587677002,
+      "learning_rate": 0.0005153634508889536,
+      "loss": 3.4042,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.3393695652484894,
+      "learning_rate": 0.0005151885747595453,
+      "loss": 3.4025,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.32103875279426575,
+      "learning_rate": 0.000515013698630137,
+      "loss": 3.3923,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.32782039046287537,
+      "learning_rate": 0.0005148388225007285,
+      "loss": 3.4004,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.321836918592453,
+      "learning_rate": 0.0005146639463713203,
+      "loss": 3.4083,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.3271026015281677,
+      "learning_rate": 0.0005144890702419119,
+      "loss": 3.4261,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.32715457677841187,
+      "learning_rate": 0.0005143141941125036,
+      "loss": 3.4129,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.3357333838939667,
+      "learning_rate": 0.0005141393179830953,
+      "loss": 3.4146,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.3631840646266937,
+      "learning_rate": 0.0005139644418536869,
+      "loss": 3.4181,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.3274785578250885,
+      "learning_rate": 0.0005137895657242786,
+      "loss": 3.4125,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.3449844419956207,
+      "learning_rate": 0.0005136146895948703,
+      "loss": 3.4166,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.32263556122779846,
+      "learning_rate": 0.000513439813465462,
+      "loss": 3.4285,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.3413465619087219,
+      "learning_rate": 0.0005132649373360535,
+      "loss": 3.4285,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.3473232388496399,
+      "learning_rate": 0.0005130900612066452,
+      "loss": 3.4347,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.32374686002731323,
+      "learning_rate": 0.0005129151850772369,
+      "loss": 3.4352,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.3647126391510529,
+      "eval_loss": 3.598836898803711,
+      "eval_runtime": 183.3183,
+      "eval_samples_per_second": 90.798,
+      "eval_steps_per_second": 5.679,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.3352644443511963,
+      "learning_rate": 0.0005127403089478286,
+      "loss": 3.4219,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.36998191475868225,
+      "learning_rate": 0.0005125654328184203,
+      "loss": 3.4354,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.33111071586608887,
+      "learning_rate": 0.0005123905566890119,
+      "loss": 3.4273,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.3304993212223053,
+      "learning_rate": 0.0005122156805596036,
+      "loss": 3.4285,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.34959882497787476,
+      "learning_rate": 0.0005120408044301953,
+      "loss": 3.4227,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.36705702543258667,
+      "learning_rate": 0.0005118659283007868,
+      "loss": 3.4247,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.3186073899269104,
+      "learning_rate": 0.0005116910521713785,
+      "loss": 3.4439,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.3507457971572876,
+      "learning_rate": 0.0005115161760419702,
+      "loss": 3.4373,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.3345440626144409,
+      "learning_rate": 0.0005113412999125619,
+      "loss": 3.4429,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.3337697684764862,
+      "learning_rate": 0.0005111664237831536,
+      "loss": 3.4366,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.3243204653263092,
+      "learning_rate": 0.0005109915476537452,
+      "loss": 3.451,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.34631916880607605,
+      "learning_rate": 0.0005108166715243369,
+      "loss": 3.4431,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.31301483511924744,
+      "learning_rate": 0.0005106417953949286,
+      "loss": 3.436,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.3258545994758606,
+      "learning_rate": 0.0005104669192655203,
+      "loss": 3.4431,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.3321167230606079,
+      "learning_rate": 0.0005102920431361118,
+      "loss": 3.4506,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.3389933407306671,
+      "learning_rate": 0.0005101171670067035,
+      "loss": 3.4348,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.34453973174095154,
+      "learning_rate": 0.0005099422908772952,
+      "loss": 3.43,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.3527055084705353,
+      "learning_rate": 0.0005097674147478868,
+      "loss": 3.4518,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.33843597769737244,
+      "learning_rate": 0.0005095925386184786,
+      "loss": 3.4435,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.3532460927963257,
+      "learning_rate": 0.0005094176624890702,
+      "loss": 3.4508,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.36540406638218725,
+      "eval_loss": 3.589508056640625,
+      "eval_runtime": 183.429,
+      "eval_samples_per_second": 90.744,
+      "eval_steps_per_second": 5.675,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.33581236004829407,
+      "learning_rate": 0.0005092427863596619,
+      "loss": 3.4393,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.31811484694480896,
+      "learning_rate": 0.0005090679102302536,
+      "loss": 3.4466,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.34038013219833374,
+      "learning_rate": 0.0005088930341008451,
+      "loss": 3.4549,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.314603716135025,
+      "learning_rate": 0.0005087181579714368,
+      "loss": 3.4585,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.3383300006389618,
+      "learning_rate": 0.0005085432818420285,
+      "loss": 3.4489,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.32058727741241455,
+      "learning_rate": 0.0005083684057126202,
+      "loss": 3.4562,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.3361349105834961,
+      "learning_rate": 0.0005081935295832118,
+      "loss": 3.4513,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.3190141022205353,
+      "learning_rate": 0.0005080186534538035,
+      "loss": 3.4377,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.32522907853126526,
+      "learning_rate": 0.0005078437773243952,
+      "loss": 3.4548,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.3320629596710205,
+      "learning_rate": 0.0005076689011949869,
+      "loss": 3.4482,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.3172106146812439,
+      "learning_rate": 0.0005074940250655786,
+      "loss": 3.4546,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.339695006608963,
+      "learning_rate": 0.0005073191489361701,
+      "loss": 3.456,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.3172706365585327,
+      "learning_rate": 0.0005071442728067618,
+      "loss": 3.4525,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.3277393877506256,
+      "learning_rate": 0.0005069693966773535,
+      "loss": 3.4459,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.3323872685432434,
+      "learning_rate": 0.0005067945205479451,
+      "loss": 3.4475,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.339382529258728,
+      "learning_rate": 0.0005066196444185368,
+      "loss": 3.4398,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.33565738797187805,
+      "learning_rate": 0.0005064447682891285,
+      "loss": 3.4522,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.33060815930366516,
+      "learning_rate": 0.0005062698921597202,
+      "loss": 3.4555,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.3253958225250244,
+      "learning_rate": 0.0005060950160303119,
+      "loss": 3.4451,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.3636493384838104,
+      "learning_rate": 0.0005059201399009035,
+      "loss": 3.4523,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.36607174453991753,
+      "eval_loss": 3.5809550285339355,
+      "eval_runtime": 183.0826,
+      "eval_samples_per_second": 90.915,
+      "eval_steps_per_second": 5.686,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.3568457365036011,
+      "learning_rate": 0.0005057452637714951,
+      "loss": 3.4527,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.3288189172744751,
+      "learning_rate": 0.0005055703876420868,
+      "loss": 3.4552,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.29840394854545593,
+      "learning_rate": 0.0005053955115126785,
+      "loss": 3.4563,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.37738728523254395,
+      "learning_rate": 0.0005052206353832701,
+      "loss": 3.4605,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.3255780339241028,
+      "learning_rate": 0.0005050457592538618,
+      "loss": 3.4607,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.3199935853481293,
+      "learning_rate": 0.0005048708831244535,
+      "loss": 3.4621,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.34419822692871094,
+      "learning_rate": 0.0005046960069950451,
+      "loss": 3.4547,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.31108030676841736,
+      "learning_rate": 0.0005045211308656369,
+      "loss": 3.4622,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.3241368532180786,
+      "learning_rate": 0.0005043462547362284,
+      "loss": 3.4599,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.34238871932029724,
+      "learning_rate": 0.0005041713786068201,
+      "loss": 3.3724,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.3241780400276184,
+      "learning_rate": 0.0005039965024774118,
+      "loss": 3.3516,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.33774638175964355,
+      "learning_rate": 0.0005038216263480034,
+      "loss": 3.3319,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.33839425444602966,
+      "learning_rate": 0.0005036467502185951,
+      "loss": 3.3643,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.33138301968574524,
+      "learning_rate": 0.0005034718740891868,
+      "loss": 3.3576,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.331544429063797,
+      "learning_rate": 0.0005032969979597785,
+      "loss": 3.3717,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.3336641192436218,
+      "learning_rate": 0.0005031221218303701,
+      "loss": 3.3637,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.3461131453514099,
+      "learning_rate": 0.0005029472457009618,
+      "loss": 3.367,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.3179863691329956,
+      "learning_rate": 0.0005027723695715534,
+      "loss": 3.3618,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.3158666491508484,
+      "learning_rate": 0.0005025974934421451,
+      "loss": 3.3798,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.34379681944847107,
+      "learning_rate": 0.0005024226173127368,
+      "loss": 3.3762,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.3662919525324213,
+      "eval_loss": 3.588055372238159,
+      "eval_runtime": 182.9945,
+      "eval_samples_per_second": 90.959,
+      "eval_steps_per_second": 5.689,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.3444662392139435,
+      "learning_rate": 0.0005022477411833284,
+      "loss": 3.3854,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.3341796398162842,
+      "learning_rate": 0.0005020728650539201,
+      "loss": 3.3786,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.3741398751735687,
+      "learning_rate": 0.0005018979889245118,
+      "loss": 3.3783,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.329504132270813,
+      "learning_rate": 0.0005017231127951034,
+      "loss": 3.3892,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.37047260999679565,
+      "learning_rate": 0.0005015482366656951,
+      "loss": 3.3807,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.35633140802383423,
+      "learning_rate": 0.0005013733605362868,
+      "loss": 3.392,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.39560022950172424,
+      "learning_rate": 0.0005011984844068784,
+      "loss": 3.3931,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.3520383834838867,
+      "learning_rate": 0.0005010236082774701,
+      "loss": 3.404,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.3606109023094177,
+      "learning_rate": 0.0005008487321480617,
+      "loss": 3.3849,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.35454607009887695,
+      "learning_rate": 0.0005006738560186534,
+      "loss": 3.3959,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.3578483462333679,
+      "learning_rate": 0.0005004989798892451,
+      "loss": 3.4044,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.36204418540000916,
+      "learning_rate": 0.0005003241037598368,
+      "loss": 3.3994,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.35746678709983826,
+      "learning_rate": 0.0005001492276304284,
+      "loss": 3.3966,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.32434630393981934,
+      "learning_rate": 0.0004999743515010201,
+      "loss": 3.408,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.3382759094238281,
+      "learning_rate": 0.0004997994753716117,
+      "loss": 3.3965,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.3254185616970062,
+      "learning_rate": 0.0004996245992422033,
+      "loss": 3.4115,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.3328564763069153,
+      "learning_rate": 0.0004994497231127951,
+      "loss": 3.3962,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.3341105580329895,
+      "learning_rate": 0.0004992748469833867,
+      "loss": 3.4067,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.3595927655696869,
+      "learning_rate": 0.0004990999708539784,
+      "loss": 3.4093,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.32767578959465027,
+      "learning_rate": 0.0004989250947245701,
+      "loss": 3.4033,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.3668947322321366,
+      "eval_loss": 3.5828919410705566,
+      "eval_runtime": 183.1656,
+      "eval_samples_per_second": 90.874,
+      "eval_steps_per_second": 5.683,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.335256963968277,
+      "learning_rate": 0.0004987502185951617,
+      "loss": 3.4103,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.328800767660141,
+      "learning_rate": 0.0004985753424657534,
+      "loss": 3.4104,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.3295251429080963,
+      "learning_rate": 0.000498400466336345,
+      "loss": 3.4066,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.3609812259674072,
+      "learning_rate": 0.0004982255902069367,
+      "loss": 3.4191,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.353671669960022,
+      "learning_rate": 0.0004980507140775283,
+      "loss": 3.4129,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.32933372259140015,
+      "learning_rate": 0.0004978758379481201,
+      "loss": 3.423,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.32520967721939087,
+      "learning_rate": 0.0004977009618187117,
+      "loss": 3.4124,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.3547916114330292,
+      "learning_rate": 0.0004975260856893034,
+      "loss": 3.4198,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.34057968854904175,
+      "learning_rate": 0.0004973512095598951,
+      "loss": 3.4191,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.33685600757598877,
+      "learning_rate": 0.0004971763334304867,
+      "loss": 3.4209,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.34290191531181335,
+      "learning_rate": 0.0004970014573010784,
+      "loss": 3.4257,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.311635822057724,
+      "learning_rate": 0.00049682658117167,
+      "loss": 3.4108,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.3206999599933624,
+      "learning_rate": 0.0004966517050422616,
+      "loss": 3.4277,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.378736674785614,
+      "learning_rate": 0.0004964768289128533,
+      "loss": 3.4192,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.35404956340789795,
+      "learning_rate": 0.000496301952783445,
+      "loss": 3.4304,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.33619192242622375,
+      "learning_rate": 0.0004961270766540367,
+      "loss": 3.4157,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.3306516110897064,
+      "learning_rate": 0.0004959522005246284,
+      "loss": 3.4257,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.314596563577652,
+      "learning_rate": 0.00049577732439522,
+      "loss": 3.4326,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.3481888771057129,
+      "learning_rate": 0.0004956024482658117,
+      "loss": 3.4345,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.33885663747787476,
+      "learning_rate": 0.0004954275721364034,
+      "loss": 3.4225,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.3673473754628571,
+      "eval_loss": 3.574705123901367,
+      "eval_runtime": 183.0508,
+      "eval_samples_per_second": 90.931,
+      "eval_steps_per_second": 5.687,
+      "step": 30000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171650,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.27061160411136e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}