diff --git "a/cost_to_carry_frequency_2128/checkpoint-40000/trainer_state.json" "b/cost_to_carry_frequency_2128/checkpoint-40000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/cost_to_carry_frequency_2128/checkpoint-40000/trainer_state.json"
@@ -0,0 +1,6003 @@
+{
+  "best_global_step": 40000,
+  "best_metric": 3.5568654537200928,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_carry_frequency_2128/checkpoint-40000",
+  "epoch": 11.651654625961314,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 0.8238836526870728,
+      "learning_rate": 0.000294,
+      "loss": 8.4647,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.8511333465576172,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7154,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.5358927249908447,
+      "learning_rate": 0.0005998286213931798,
+      "loss": 6.3402,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.393213152885437,
+      "learning_rate": 0.0005996537452637714,
+      "loss": 6.128,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.44214680790901184,
+      "learning_rate": 0.0005994788691343632,
+      "loss": 5.9984,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.39625659584999084,
+      "learning_rate": 0.0005993039930049548,
+      "loss": 5.8476,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.47089600563049316,
+      "learning_rate": 0.0005991291168755465,
+      "loss": 5.717,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.49012529850006104,
+      "learning_rate": 0.0005989542407461382,
+      "loss": 5.6004,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.44449353218078613,
+      "learning_rate": 0.0005987793646167297,
+      "loss": 5.5036,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.4581308662891388,
+      "learning_rate": 0.0005986044884873214,
+      "loss": 5.4036,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.42073458433151245,
+      "learning_rate": 0.0005984296123579131,
+      "loss": 5.3157,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.40689051151275635,
+      "learning_rate": 0.0005982547362285047,
+      "loss": 5.2389,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.43460237979888916,
+      "learning_rate": 0.0005980798600990964,
+      "loss": 5.1757,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.45589685440063477,
+      "learning_rate": 0.0005979049839696881,
+      "loss": 5.1372,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.3859228491783142,
+      "learning_rate": 0.0005977301078402798,
+      "loss": 5.0648,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.4326412081718445,
+      "learning_rate": 0.0005975552317108715,
+      "loss": 5.0189,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4279559552669525,
+      "learning_rate": 0.0005973803555814631,
+      "loss": 4.9551,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.4713323414325714,
+      "learning_rate": 0.0005972054794520547,
+      "loss": 4.9013,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.43930691480636597,
+      "learning_rate": 0.0005970306033226464,
+      "loss": 4.8775,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.4358115792274475,
+      "learning_rate": 0.0005968557271932381,
+      "loss": 4.8034,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.2553606134660685,
+      "eval_loss": 4.747249603271484,
+      "eval_runtime": 181.1133,
+      "eval_samples_per_second": 91.882,
+      "eval_steps_per_second": 5.748,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.4385921061038971,
+      "learning_rate": 0.0005966808510638297,
+      "loss": 4.764,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.4408250153064728,
+      "learning_rate": 0.0005965059749344214,
+      "loss": 4.7228,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.42670053243637085,
+      "learning_rate": 0.0005963310988050131,
+      "loss": 4.6831,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.45188194513320923,
+      "learning_rate": 0.0005961562226756047,
+      "loss": 4.6466,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.45061755180358887,
+      "learning_rate": 0.0005959813465461965,
+      "loss": 4.6151,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.4137357175350189,
+      "learning_rate": 0.000595806470416788,
+      "loss": 4.594,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.4820528030395508,
+      "learning_rate": 0.0005956315942873797,
+      "loss": 4.5574,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.4236258864402771,
+      "learning_rate": 0.0005954567181579714,
+      "loss": 4.547,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.46075788140296936,
+      "learning_rate": 0.000595281842028563,
+      "loss": 4.5196,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.4334660470485687,
+      "learning_rate": 0.0005951069658991547,
+      "loss": 4.4987,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.4550553858280182,
+      "learning_rate": 0.0005949320897697464,
+      "loss": 4.4661,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.43172529339790344,
+      "learning_rate": 0.0005947572136403381,
+      "loss": 4.4598,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.43075188994407654,
+      "learning_rate": 0.0005945823375109297,
+      "loss": 4.4466,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.39069297909736633,
+      "learning_rate": 0.0005944074613815215,
+      "loss": 4.4363,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.4138490855693817,
+      "learning_rate": 0.000594232585252113,
+      "loss": 4.4134,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.39502963423728943,
+      "learning_rate": 0.0005940577091227047,
+      "loss": 4.3856,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.43347927927970886,
+      "learning_rate": 0.0005938828329932964,
+      "loss": 4.3706,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.3977086842060089,
+      "learning_rate": 0.000593707956863888,
+      "loss": 4.3561,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.41190075874328613,
+      "learning_rate": 0.0005935330807344797,
+      "loss": 4.3474,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.46395954489707947,
+      "learning_rate": 0.0005933582046050714,
+      "loss": 4.3307,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.3005255098722875,
+      "eval_loss": 4.274885654449463,
+      "eval_runtime": 181.1908,
+      "eval_samples_per_second": 91.842,
+      "eval_steps_per_second": 5.745,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.38975879549980164,
+      "learning_rate": 0.000593183328475663,
+      "loss": 4.3244,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.43265265226364136,
+      "learning_rate": 0.0005930084523462546,
+      "loss": 4.3044,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.39545658230781555,
+      "learning_rate": 0.0005928335762168463,
+      "loss": 4.3117,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.3943400979042053,
+      "learning_rate": 0.000592658700087438,
+      "loss": 4.2746,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.38592591881752014,
+      "learning_rate": 0.0005924838239580297,
+      "loss": 4.2631,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.4045238792896271,
+      "learning_rate": 0.0005923089478286214,
+      "loss": 4.252,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.3675093650817871,
+      "learning_rate": 0.000592134071699213,
+      "loss": 4.2618,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.3889940679073334,
+      "learning_rate": 0.0005919591955698047,
+      "loss": 4.2367,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.3890587091445923,
+      "learning_rate": 0.0005917843194403964,
+      "loss": 4.2284,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.3815591633319855,
+      "learning_rate": 0.000591609443310988,
+      "loss": 4.2156,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.37089216709136963,
+      "learning_rate": 0.0005914345671815796,
+      "loss": 4.212,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.3704032003879547,
+      "learning_rate": 0.0005912596910521713,
+      "loss": 4.2067,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.35531026124954224,
+      "learning_rate": 0.0005910848149227629,
+      "loss": 4.1983,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3560468554496765,
+      "learning_rate": 0.0005909099387933547,
+      "loss": 4.1943,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.34763428568840027,
+      "learning_rate": 0.0005907350626639463,
+      "loss": 4.1821,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.3786865174770355,
+      "learning_rate": 0.000590560186534538,
+      "loss": 4.182,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.36594879627227783,
+      "learning_rate": 0.0005903853104051297,
+      "loss": 4.1746,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.3726920485496521,
+      "learning_rate": 0.0005902104342757214,
+      "loss": 4.1519,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.3717030882835388,
+      "learning_rate": 0.000590035558146313,
+      "loss": 4.1468,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.3748721480369568,
+      "learning_rate": 0.0005898606820169046,
+      "loss": 4.133,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.31594671449609696,
+      "eval_loss": 4.093606948852539,
+      "eval_runtime": 181.8584,
+      "eval_samples_per_second": 91.505,
+      "eval_steps_per_second": 5.724,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.3633127510547638,
+      "learning_rate": 0.0005896858058874963,
+      "loss": 4.1293,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.3572661280632019,
+      "learning_rate": 0.0005895109297580879,
+      "loss": 4.1391,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.36137935519218445,
+      "learning_rate": 0.0005893360536286797,
+      "loss": 4.1105,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.32254934310913086,
+      "learning_rate": 0.0005891611774992713,
+      "loss": 4.1254,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.39124587178230286,
+      "learning_rate": 0.000588986301369863,
+      "loss": 4.1111,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.3515588343143463,
+      "learning_rate": 0.0005888114252404547,
+      "loss": 4.1108,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.34326955676078796,
+      "learning_rate": 0.0005886365491110463,
+      "loss": 4.093,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.355753630399704,
+      "learning_rate": 0.000588461672981638,
+      "loss": 4.0918,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.3507901728153229,
+      "learning_rate": 0.0005882867968522296,
+      "loss": 4.0691,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.3766329288482666,
+      "learning_rate": 0.0005881119207228212,
+      "loss": 4.0107,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.3579021692276001,
+      "learning_rate": 0.0005879370445934129,
+      "loss": 4.007,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.3645196855068207,
+      "learning_rate": 0.0005877621684640046,
+      "loss": 3.9979,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3481312096118927,
+      "learning_rate": 0.0005875872923345963,
+      "loss": 4.0211,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.3510258197784424,
+      "learning_rate": 0.000587412416205188,
+      "loss": 4.0166,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.3350948393344879,
+      "learning_rate": 0.0005872375400757797,
+      "loss": 4.0069,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.3625650405883789,
+      "learning_rate": 0.0005870626639463713,
+      "loss": 4.0113,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.3299196660518646,
+      "learning_rate": 0.0005868877878169629,
+      "loss": 4.0002,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.3324016034603119,
+      "learning_rate": 0.0005867129116875546,
+      "loss": 3.9859,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.3453924059867859,
+      "learning_rate": 0.0005865380355581462,
+      "loss": 3.9777,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.3689613342285156,
+      "learning_rate": 0.0005863631594287379,
+      "loss": 3.9994,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.3255163636932383,
+      "eval_loss": 3.986950635910034,
+      "eval_runtime": 181.1582,
+      "eval_samples_per_second": 91.859,
+      "eval_steps_per_second": 5.746,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.33577021956443787,
+      "learning_rate": 0.0005861882832993296,
+      "loss": 3.9865,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.36526042222976685,
+      "learning_rate": 0.0005860134071699212,
+      "loss": 3.9788,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.38221442699432373,
+      "learning_rate": 0.000585838531040513,
+      "loss": 3.9812,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.33103591203689575,
+      "learning_rate": 0.0005856636549111046,
+      "loss": 3.9718,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.35662996768951416,
+      "learning_rate": 0.0005854887787816963,
+      "loss": 3.9494,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.34380409121513367,
+      "learning_rate": 0.0005853139026522879,
+      "loss": 3.9743,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.37011435627937317,
+      "learning_rate": 0.0005851390265228796,
+      "loss": 3.9679,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.3393860161304474,
+      "learning_rate": 0.0005849641503934712,
+      "loss": 3.9668,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.37732088565826416,
+      "learning_rate": 0.0005847892742640629,
+      "loss": 3.954,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.34702256321907043,
+      "learning_rate": 0.0005846143981346546,
+      "loss": 3.945,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.347993940114975,
+      "learning_rate": 0.0005844395220052462,
+      "loss": 3.9482,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.36191609501838684,
+      "learning_rate": 0.000584264645875838,
+      "loss": 3.9631,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.3485741913318634,
+      "learning_rate": 0.0005840897697464296,
+      "loss": 3.9519,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.35182255506515503,
+      "learning_rate": 0.0005839148936170212,
+      "loss": 3.9376,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.3750380277633667,
+      "learning_rate": 0.0005837400174876129,
+      "loss": 3.9428,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.360505610704422,
+      "learning_rate": 0.0005835651413582045,
+      "loss": 3.9449,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.33969995379447937,
+      "learning_rate": 0.0005833902652287962,
+      "loss": 3.9371,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.35013002157211304,
+      "learning_rate": 0.0005832153890993879,
+      "loss": 3.9337,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.3549332916736603,
+      "learning_rate": 0.0005830405129699796,
+      "loss": 3.9288,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.350067675113678,
+      "learning_rate": 0.0005828656368405712,
+      "loss": 3.9189,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.3319192182183655,
+      "eval_loss": 3.9136765003204346,
+      "eval_runtime": 180.9327,
+      "eval_samples_per_second": 91.973,
+      "eval_steps_per_second": 5.754,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.3185882866382599,
+      "learning_rate": 0.0005826907607111629,
+      "loss": 3.9184,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.32343989610671997,
+      "learning_rate": 0.0005825158845817546,
+      "loss": 3.9226,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.3393322825431824,
+      "learning_rate": 0.0005823410084523462,
+      "loss": 3.9186,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.33320391178131104,
+      "learning_rate": 0.0005821661323229379,
+      "loss": 3.9163,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.32511457800865173,
+      "learning_rate": 0.0005819912561935295,
+      "loss": 3.9171,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.3350389897823334,
+      "learning_rate": 0.0005818163800641212,
+      "loss": 3.9138,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.33055445551872253,
+      "learning_rate": 0.0005816415039347129,
+      "loss": 3.9016,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.33436310291290283,
+      "learning_rate": 0.0005814666278053045,
+      "loss": 3.9029,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.3497842252254486,
+      "learning_rate": 0.0005812917516758962,
+      "loss": 3.9082,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.345002144575119,
+      "learning_rate": 0.0005811168755464879,
+      "loss": 3.9131,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.34684985876083374,
+      "learning_rate": 0.0005809419994170794,
+      "loss": 3.8984,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.31695857644081116,
+      "learning_rate": 0.0005807671232876712,
+      "loss": 3.883,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.36025357246398926,
+      "learning_rate": 0.0005805922471582628,
+      "loss": 3.9014,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.34583941102027893,
+      "learning_rate": 0.0005804173710288545,
+      "loss": 3.8942,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.33271774649620056,
+      "learning_rate": 0.0005802424948994462,
+      "loss": 3.8925,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.3309691548347473,
+      "learning_rate": 0.0005800676187700379,
+      "loss": 3.8858,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.3122190833091736,
+      "learning_rate": 0.0005798927426406295,
+      "loss": 3.8657,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.3420509397983551,
+      "learning_rate": 0.0005797178665112212,
+      "loss": 3.8769,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.32377851009368896,
+      "learning_rate": 0.0005795429903818129,
+      "loss": 3.8841,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.31988197565078735,
+      "learning_rate": 0.0005793681142524044,
+      "loss": 3.873,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.33766140756961416,
+      "eval_loss": 3.8517301082611084,
+      "eval_runtime": 180.5226,
+      "eval_samples_per_second": 92.182,
+      "eval_steps_per_second": 5.767,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.3411223888397217,
+      "learning_rate": 0.0005791932381229961,
+      "loss": 3.8641,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.34019729495048523,
+      "learning_rate": 0.0005790183619935878,
+      "loss": 3.861,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.3440145254135132,
+      "learning_rate": 0.0005788434858641795,
+      "loss": 3.872,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.33574172854423523,
+      "learning_rate": 0.0005786686097347712,
+      "loss": 3.8661,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.3292637765407562,
+      "learning_rate": 0.0005784937336053628,
+      "loss": 3.8686,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.3391307294368744,
+      "learning_rate": 0.0005783188574759545,
+      "loss": 3.8548,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.32597413659095764,
+      "learning_rate": 0.0005781439813465462,
+      "loss": 3.8698,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.3270658552646637,
+      "learning_rate": 0.0005779691052171379,
+      "loss": 3.8609,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.3589136302471161,
+      "learning_rate": 0.0005777942290877294,
+      "loss": 3.8593,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.33831045031547546,
+      "learning_rate": 0.0005776193529583211,
+      "loss": 3.8622,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.3330910801887512,
+      "learning_rate": 0.0005774444768289128,
+      "loss": 3.8481,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.3293820321559906,
+      "learning_rate": 0.0005772696006995045,
+      "loss": 3.8514,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.3244706094264984,
+      "learning_rate": 0.0005770947245700962,
+      "loss": 3.8415,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.3254324495792389,
+      "learning_rate": 0.0005769198484406878,
+      "loss": 3.8378,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.3237997889518738,
+      "learning_rate": 0.0005767449723112795,
+      "loss": 3.8317,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.3228047788143158,
+      "learning_rate": 0.0005765700961818712,
+      "loss": 3.8368,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.32064223289489746,
+      "learning_rate": 0.0005763952200524627,
+      "loss": 3.8394,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.3343140184879303,
+      "learning_rate": 0.0005762203439230544,
+      "loss": 3.777,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.3371165692806244,
+      "learning_rate": 0.0005760454677936461,
+      "loss": 3.7455,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.32197079062461853,
+      "learning_rate": 0.0005758705916642378,
+      "loss": 3.7513,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34149063138446517,
+      "eval_loss": 3.810184955596924,
+      "eval_runtime": 180.5526,
+      "eval_samples_per_second": 92.167,
+      "eval_steps_per_second": 5.766,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.33044517040252686,
+      "learning_rate": 0.0005756957155348294,
+      "loss": 3.7375,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.3351292014122009,
+      "learning_rate": 0.0005755208394054211,
+      "loss": 3.7394,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.3348188102245331,
+      "learning_rate": 0.0005753459632760128,
+      "loss": 3.7432,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.33414652943611145,
+      "learning_rate": 0.0005751710871466045,
+      "loss": 3.7535,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.34815260767936707,
+      "learning_rate": 0.0005749962110171962,
+      "loss": 3.748,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.3307594954967499,
+      "learning_rate": 0.0005748213348877877,
+      "loss": 3.7448,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.34988632798194885,
+      "learning_rate": 0.0005746464587583794,
+      "loss": 3.7542,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.3482917249202728,
+      "learning_rate": 0.0005744715826289711,
+      "loss": 3.7581,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.3216814696788788,
+      "learning_rate": 0.0005742967064995627,
+      "loss": 3.7627,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.3261340856552124,
+      "learning_rate": 0.0005741218303701544,
+      "loss": 3.7539,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.32120323181152344,
+      "learning_rate": 0.0005739469542407461,
+      "loss": 3.7448,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.3328768312931061,
+      "learning_rate": 0.0005737720781113378,
+      "loss": 3.7542,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.32364141941070557,
+      "learning_rate": 0.0005735972019819295,
+      "loss": 3.7494,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.3354133367538452,
+      "learning_rate": 0.000573422325852521,
+      "loss": 3.7409,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.3489600718021393,
+      "learning_rate": 0.0005732474497231127,
+      "loss": 3.7527,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.33028727769851685,
+      "learning_rate": 0.0005730725735937044,
+      "loss": 3.746,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.3276793658733368,
+      "learning_rate": 0.0005728976974642961,
+      "loss": 3.7565,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.3518573045730591,
+      "learning_rate": 0.0005727228213348877,
+      "loss": 3.7492,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.31932392716407776,
+      "learning_rate": 0.0005725479452054794,
+      "loss": 3.7511,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.306573748588562,
+      "learning_rate": 0.0005723730690760711,
+      "loss": 3.7592,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.34500433995162727,
+      "eval_loss": 3.7794785499572754,
+      "eval_runtime": 180.6748,
+      "eval_samples_per_second": 92.105,
+      "eval_steps_per_second": 5.762,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.3530043959617615,
+      "learning_rate": 0.0005721981929466627,
+      "loss": 3.7412,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.31992873549461365,
+      "learning_rate": 0.0005720233168172545,
+      "loss": 3.7336,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.3247889578342438,
+      "learning_rate": 0.000571848440687846,
+      "loss": 3.7426,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.32873016595840454,
+      "learning_rate": 0.0005716735645584377,
+      "loss": 3.7411,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.34048840403556824,
+      "learning_rate": 0.0005714986884290294,
+      "loss": 3.7446,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.3255440890789032,
+      "learning_rate": 0.000571323812299621,
+      "loss": 3.7486,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.3284962475299835,
+      "learning_rate": 0.0005711489361702127,
+      "loss": 3.7478,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.31856876611709595,
+      "learning_rate": 0.0005709740600408044,
+      "loss": 3.7488,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.34519293904304504,
+      "learning_rate": 0.0005707991839113961,
+      "loss": 3.7563,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.32581546902656555,
+      "learning_rate": 0.0005706243077819877,
+      "loss": 3.7507,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.3163212537765503,
+      "learning_rate": 0.0005704494316525793,
+      "loss": 3.738,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.3154032826423645,
+      "learning_rate": 0.000570274555523171,
+      "loss": 3.7477,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.3356831967830658,
+      "learning_rate": 0.0005700996793937627,
+      "loss": 3.7488,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.35420989990234375,
+      "learning_rate": 0.0005699248032643544,
+      "loss": 3.7355,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.34032773971557617,
+      "learning_rate": 0.000569749927134946,
+      "loss": 3.7363,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.3459492623806,
+      "learning_rate": 0.0005695750510055377,
+      "loss": 3.7408,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.31637832522392273,
+      "learning_rate": 0.0005694001748761294,
+      "loss": 3.739,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.34592801332473755,
+      "learning_rate": 0.000569225298746721,
+      "loss": 3.7549,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.34058457612991333,
+      "learning_rate": 0.0005690504226173127,
+      "loss": 3.7433,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3215969204902649,
+      "learning_rate": 0.0005688755464879043,
+      "loss": 3.7318,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.3473946354881625,
+      "eval_loss": 3.7510855197906494,
+      "eval_runtime": 180.685,
+      "eval_samples_per_second": 92.1,
+      "eval_steps_per_second": 5.761,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.31068679690361023,
+      "learning_rate": 0.000568700670358496,
+      "loss": 3.7287,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.3394843637943268,
+      "learning_rate": 0.0005685257942290877,
+      "loss": 3.7376,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.31130099296569824,
+      "learning_rate": 0.0005683509180996793,
+      "loss": 3.7278,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.32781389355659485,
+      "learning_rate": 0.000568176041970271,
+      "loss": 3.7263,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.3248126208782196,
+      "learning_rate": 0.0005680011658408627,
+      "loss": 3.7386,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.32771995663642883,
+      "learning_rate": 0.0005678262897114544,
+      "loss": 3.7293,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.3164122998714447,
+      "learning_rate": 0.000567651413582046,
+      "loss": 3.7312,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.3185890018939972,
+      "learning_rate": 0.0005674765374526377,
+      "loss": 3.7251,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.3235971927642822,
+      "learning_rate": 0.0005673016613232293,
+      "loss": 3.7237,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.3337843418121338,
+      "learning_rate": 0.0005671267851938209,
+      "loss": 3.7302,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.32673659920692444,
+      "learning_rate": 0.0005669519090644127,
+      "loss": 3.7265,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.31469056010246277,
+      "learning_rate": 0.0005667770329350043,
+      "loss": 3.7272,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.33078935742378235,
+      "learning_rate": 0.000566602156805596,
+      "loss": 3.7226,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.3159051239490509,
+      "learning_rate": 0.0005664272806761877,
+      "loss": 3.7284,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.32226207852363586,
+      "learning_rate": 0.0005662524045467793,
+      "loss": 3.7184,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.3349704444408417,
+      "learning_rate": 0.000566077528417371,
+      "loss": 3.7276,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.3188948333263397,
+      "learning_rate": 0.0005659026522879626,
+      "loss": 3.7167,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.3187043368816376,
+      "learning_rate": 0.0005657277761585543,
+      "loss": 3.7229,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.3272295296192169,
+      "learning_rate": 0.0005655529000291459,
+      "loss": 3.7178,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.33525437116622925,
+      "learning_rate": 0.0005653780238997376,
+      "loss": 3.7092,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.3501877039368612,
+      "eval_loss": 3.723299741744995,
+      "eval_runtime": 181.1596,
+      "eval_samples_per_second": 91.858,
+      "eval_steps_per_second": 5.746,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.3185238242149353,
+      "learning_rate": 0.0005652031477703293,
+      "loss": 3.721,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.3339935541152954,
+      "learning_rate": 0.000565028271640921,
+      "loss": 3.7196,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.3274375796318054,
+      "learning_rate": 0.0005648533955115127,
+      "loss": 3.7089,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.33105677366256714,
+      "learning_rate": 0.0005646785193821043,
+      "loss": 3.7093,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.3180982768535614,
+      "learning_rate": 0.000564503643252696,
+      "loss": 3.7031,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.342707097530365,
+      "learning_rate": 0.0005643287671232876,
+      "loss": 3.7172,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.32696056365966797,
+      "learning_rate": 0.0005641538909938792,
+      "loss": 3.6025,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.33608219027519226,
+      "learning_rate": 0.0005639790148644709,
+      "loss": 3.614,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.31899672746658325,
+      "learning_rate": 0.0005638041387350626,
+      "loss": 3.6131,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.3347124457359314,
+      "learning_rate": 0.0005636292626056543,
+      "loss": 3.6169,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.3240697383880615,
+      "learning_rate": 0.000563454386476246,
+      "loss": 3.6303,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.3579652011394501,
+      "learning_rate": 0.0005632795103468376,
+      "loss": 3.6228,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.3257977366447449,
+      "learning_rate": 0.0005631046342174293,
+      "loss": 3.6246,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.33490806818008423,
+      "learning_rate": 0.000562929758088021,
+      "loss": 3.6311,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.3260871469974518,
+      "learning_rate": 0.0005627548819586126,
+      "loss": 3.6349,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.34829798340797424,
+      "learning_rate": 0.0005625800058292042,
+      "loss": 3.6296,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.39637139439582825,
+      "learning_rate": 0.0005624051296997959,
+      "loss": 3.6221,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.361925333738327,
+      "learning_rate": 0.0005622302535703876,
+      "loss": 3.6232,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.3251033127307892,
+      "learning_rate": 0.0005620553774409792,
+      "loss": 3.6361,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.32738086581230164,
+      "learning_rate": 0.000561880501311571,
+      "loss": 3.6303,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.35179714921448696,
+      "eval_loss": 3.71109676361084,
+      "eval_runtime": 180.569,
+      "eval_samples_per_second": 92.159,
+      "eval_steps_per_second": 5.765,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.3152799606323242,
+      "learning_rate": 0.0005617056251821626,
+      "loss": 3.6392,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.31268447637557983,
+      "learning_rate": 0.0005615307490527543,
+      "loss": 3.6429,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.32059621810913086,
+      "learning_rate": 0.000561355872923346,
+      "loss": 3.6429,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.33950409293174744,
+      "learning_rate": 0.0005611809967939375,
+      "loss": 3.6292,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.34160423278808594,
+      "learning_rate": 0.0005610061206645292,
+      "loss": 3.6465,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.32576170563697815,
+      "learning_rate": 0.0005608312445351209,
+      "loss": 3.6273,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.34006011486053467,
+      "learning_rate": 0.0005606563684057126,
+      "loss": 3.6418,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.3233993947505951,
+      "learning_rate": 0.0005604814922763042,
+      "loss": 3.6329,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.33062100410461426,
+      "learning_rate": 0.0005603066161468959,
+      "loss": 3.6387,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.3202456533908844,
+      "learning_rate": 0.0005601317400174876,
+      "loss": 3.6509,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.325444757938385,
+      "learning_rate": 0.0005599568638880793,
+      "loss": 3.6396,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.32571959495544434,
+      "learning_rate": 0.0005597819877586709,
+      "loss": 3.6497,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.3135475218296051,
+      "learning_rate": 0.0005596071116292625,
+      "loss": 3.6388,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.34956151247024536,
+      "learning_rate": 0.0005594322354998542,
+      "loss": 3.6315,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.3408454656600952,
+      "learning_rate": 0.0005592573593704459,
+      "loss": 3.639,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.32927653193473816,
+      "learning_rate": 0.0005590824832410375,
+      "loss": 3.6389,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.331126868724823,
+      "learning_rate": 0.0005589076071116292,
+      "loss": 3.6399,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.32898014783859253,
+      "learning_rate": 0.0005587327309822209,
+      "loss": 3.6464,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.3311103582382202,
+      "learning_rate": 0.0005585578548528126,
+      "loss": 3.647,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.32251957058906555,
+      "learning_rate": 0.0005583829787234043,
+      "loss": 3.634,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.35319656458813503,
+      "eval_loss": 3.6934473514556885,
+      "eval_runtime": 180.9126,
+      "eval_samples_per_second": 91.984,
+      "eval_steps_per_second": 5.754,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.31824731826782227,
+      "learning_rate": 0.0005582081025939958,
+      "loss": 3.6325,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.323544979095459,
+      "learning_rate": 0.0005580332264645875,
+      "loss": 3.6328,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.3168325424194336,
+      "learning_rate": 0.0005578583503351792,
+      "loss": 3.6318,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.29865562915802,
+      "learning_rate": 0.0005576834742057709,
+      "loss": 3.6316,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.32099124789237976,
+      "learning_rate": 0.0005575085980763625,
+      "loss": 3.641,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.31370314955711365,
+      "learning_rate": 0.0005573337219469542,
+      "loss": 3.6376,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.34374183416366577,
+      "learning_rate": 0.0005571588458175459,
+      "loss": 3.6366,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.32212623953819275,
+      "learning_rate": 0.0005569839696881374,
+      "loss": 3.6539,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.3175927400588989,
+      "learning_rate": 0.0005568090935587292,
+      "loss": 3.6245,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.32453852891921997,
+      "learning_rate": 0.0005566342174293208,
+      "loss": 3.632,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.3191271424293518,
+      "learning_rate": 0.0005564593412999125,
+      "loss": 3.6513,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.3355797827243805,
+      "learning_rate": 0.0005562844651705042,
+      "loss": 3.6365,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.3287925124168396,
+      "learning_rate": 0.0005561095890410958,
+      "loss": 3.6323,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.3259026110172272,
+      "learning_rate": 0.0005559347129116875,
+      "loss": 3.6389,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.3568135201931,
+      "learning_rate": 0.0005557598367822792,
+      "loss": 3.6406,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.3320167362689972,
+      "learning_rate": 0.0005555849606528709,
+      "loss": 3.6365,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.31746959686279297,
+      "learning_rate": 0.0005554100845234624,
+      "loss": 3.635,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.32215115427970886,
+      "learning_rate": 0.0005552352083940541,
+      "loss": 3.6346,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.3303179442882538,
+      "learning_rate": 0.0005550603322646458,
+      "loss": 3.6321,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.3192589581012726,
+      "learning_rate": 0.0005548854561352375,
+      "loss": 3.6417,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.3552138394889382,
+      "eval_loss": 3.6750006675720215,
+      "eval_runtime": 180.7632,
+      "eval_samples_per_second": 92.06,
+      "eval_steps_per_second": 5.759,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.32486605644226074,
+      "learning_rate": 0.0005547105800058292,
+      "loss": 3.6201,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.32540053129196167,
+      "learning_rate": 0.0005545357038764208,
+      "loss": 3.6341,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.31797096133232117,
+      "learning_rate": 0.0005543608277470125,
+      "loss": 3.6461,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.3220556676387787,
+      "learning_rate": 0.0005541859516176042,
+      "loss": 3.6295,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.3416280746459961,
+      "learning_rate": 0.0005540110754881958,
+      "loss": 3.6403,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.31618985533714294,
+      "learning_rate": 0.0005538361993587874,
+      "loss": 3.6357,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.32899636030197144,
+      "learning_rate": 0.0005536613232293791,
+      "loss": 3.6354,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.3490321636199951,
+      "learning_rate": 0.0005534864470999708,
+      "loss": 3.6209,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.31270232796669006,
+      "learning_rate": 0.0005533115709705625,
+      "loss": 3.6345,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.3164960443973541,
+      "learning_rate": 0.0005531366948411541,
+      "loss": 3.6336,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.31384697556495667,
+      "learning_rate": 0.0005529618187117458,
+      "loss": 3.6306,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.3088199198246002,
+      "learning_rate": 0.0005527869425823375,
+      "loss": 3.6351,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.3234373927116394,
+      "learning_rate": 0.0005526120664529292,
+      "loss": 3.6327,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.3382308781147003,
+      "learning_rate": 0.0005524371903235207,
+      "loss": 3.6321,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.32010650634765625,
+      "learning_rate": 0.0005522623141941124,
+      "loss": 3.5813,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.3284594714641571,
+      "learning_rate": 0.0005520874380647041,
+      "loss": 3.5195,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.3116910457611084,
+      "learning_rate": 0.0005519125619352957,
+      "loss": 3.5323,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.30614858865737915,
+      "learning_rate": 0.0005517376858058875,
+      "loss": 3.526,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.30232539772987366,
+      "learning_rate": 0.0005515628096764791,
+      "loss": 3.5272,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.34007346630096436,
+      "learning_rate": 0.0005513879335470708,
+      "loss": 3.5387,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.35625963788539633,
+      "eval_loss": 3.6710259914398193,
+      "eval_runtime": 180.6469,
+      "eval_samples_per_second": 92.119,
+      "eval_steps_per_second": 5.763,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.31514182686805725,
+      "learning_rate": 0.0005512130574176625,
+      "loss": 3.5323,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.31488853693008423,
+      "learning_rate": 0.000551038181288254,
+      "loss": 3.5381,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.3174729347229004,
+      "learning_rate": 0.0005508633051588457,
+      "loss": 3.5473,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.3296089470386505,
+      "learning_rate": 0.0005506884290294374,
+      "loss": 3.532,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.3251158893108368,
+      "learning_rate": 0.0005505135529000291,
+      "loss": 3.5579,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.3330046236515045,
+      "learning_rate": 0.0005503386767706207,
+      "loss": 3.5448,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.329488605260849,
+      "learning_rate": 0.0005501638006412124,
+      "loss": 3.5567,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.32068127393722534,
+      "learning_rate": 0.0005499889245118041,
+      "loss": 3.5561,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.3509705662727356,
+      "learning_rate": 0.0005498140483823958,
+      "loss": 3.5629,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.33269885182380676,
+      "learning_rate": 0.0005496391722529875,
+      "loss": 3.5629,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.33287978172302246,
+      "learning_rate": 0.000549464296123579,
+      "loss": 3.5532,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.3189695477485657,
+      "learning_rate": 0.0005492894199941707,
+      "loss": 3.5571,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.33141258358955383,
+      "learning_rate": 0.0005491145438647624,
+      "loss": 3.5586,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.319828063249588,
+      "learning_rate": 0.000548939667735354,
+      "loss": 3.5612,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": 0.334999144077301,
+      "learning_rate": 0.0005487647916059457,
+      "loss": 3.553,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.3195878565311432,
+      "learning_rate": 0.0005485899154765374,
+      "loss": 3.5709,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.3178456723690033,
+      "learning_rate": 0.0005484150393471291,
+      "loss": 3.5663,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.3257606327533722,
+      "learning_rate": 0.0005482401632177208,
+      "loss": 3.5596,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.3173132836818695,
+      "learning_rate": 0.0005480652870883124,
+      "loss": 3.5624,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.3216117322444916,
+      "learning_rate": 0.000547890410958904,
+      "loss": 3.5645,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.35712233630397466,
+      "eval_loss": 3.6592366695404053,
+      "eval_runtime": 180.8389,
+      "eval_samples_per_second": 92.021,
+      "eval_steps_per_second": 5.757,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.3293343186378479,
+      "learning_rate": 0.0005477155348294957,
+      "loss": 3.5799,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.3351048231124878,
+      "learning_rate": 0.0005475406587000874,
+      "loss": 3.5695,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.3400932252407074,
+      "learning_rate": 0.000547365782570679,
+      "loss": 3.5654,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.3267037272453308,
+      "learning_rate": 0.0005471909064412707,
+      "loss": 3.5663,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.32394009828567505,
+      "learning_rate": 0.0005470160303118624,
+      "loss": 3.5739,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.3179683983325958,
+      "learning_rate": 0.000546841154182454,
+      "loss": 3.5629,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": 0.33053240180015564,
+      "learning_rate": 0.0005466662780530458,
+      "loss": 3.5767,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.34838345646858215,
+      "learning_rate": 0.0005464914019236374,
+      "loss": 3.5734,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.34467417001724243,
+      "learning_rate": 0.000546316525794229,
+      "loss": 3.5659,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.317242830991745,
+      "learning_rate": 0.0005461416496648207,
+      "loss": 3.5814,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.3329235911369324,
+      "learning_rate": 0.0005459667735354123,
+      "loss": 3.5601,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.3190889358520508,
+      "learning_rate": 0.000545791897406004,
+      "loss": 3.5674,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.34190186858177185,
+      "learning_rate": 0.0005456170212765957,
+      "loss": 3.5614,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.3235403895378113,
+      "learning_rate": 0.0005454421451471874,
+      "loss": 3.5748,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.32054969668388367,
+      "learning_rate": 0.000545267269017779,
+      "loss": 3.5636,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.33121681213378906,
+      "learning_rate": 0.0005450923928883708,
+      "loss": 3.5788,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3060116469860077,
+      "learning_rate": 0.0005449175167589623,
+      "loss": 3.5646,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.31824249029159546,
+      "learning_rate": 0.000544742640629554,
+      "loss": 3.5675,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.3265272080898285,
+      "learning_rate": 0.0005445677645001457,
+      "loss": 3.5737,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.3331158757209778,
+      "learning_rate": 0.0005443928883707373,
+      "loss": 3.5712,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.35869673739829394,
+      "eval_loss": 3.6470656394958496,
+      "eval_runtime": 180.6963,
+      "eval_samples_per_second": 92.094,
+      "eval_steps_per_second": 5.761,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.34401053190231323,
+      "learning_rate": 0.000544218012241329,
+      "loss": 3.5747,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.33307909965515137,
+      "learning_rate": 0.0005440431361119207,
+      "loss": 3.5658,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.34673160314559937,
+      "learning_rate": 0.0005438682599825123,
+      "loss": 3.5709,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.3231510818004608,
+      "learning_rate": 0.000543693383853104,
+      "loss": 3.5722,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.3428361117839813,
+      "learning_rate": 0.0005435185077236957,
+      "loss": 3.5723,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.328531414270401,
+      "learning_rate": 0.0005433436315942873,
+      "loss": 3.5885,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.32748499512672424,
+      "learning_rate": 0.000543168755464879,
+      "loss": 3.5731,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.32989397644996643,
+      "learning_rate": 0.0005429938793354706,
+      "loss": 3.5731,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.3433181345462799,
+      "learning_rate": 0.0005428190032060623,
+      "loss": 3.5764,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.3318285644054413,
+      "learning_rate": 0.000542644127076654,
+      "loss": 3.5753,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.3202374279499054,
+      "learning_rate": 0.0005424692509472457,
+      "loss": 3.5821,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.32062003016471863,
+      "learning_rate": 0.0005422943748178373,
+      "loss": 3.5817,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.33390170335769653,
+      "learning_rate": 0.000542119498688429,
+      "loss": 3.5706,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.31805121898651123,
+      "learning_rate": 0.0005419446225590207,
+      "loss": 3.5652,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.31974926590919495,
+      "learning_rate": 0.0005417697464296122,
+      "loss": 3.5732,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.3215559124946594,
+      "learning_rate": 0.000541594870300204,
+      "loss": 3.5705,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.31701287627220154,
+      "learning_rate": 0.0005414199941707956,
+      "loss": 3.5707,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.29899007081985474,
+      "learning_rate": 0.0005412451180413873,
+      "loss": 3.5722,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.3220870792865753,
+      "learning_rate": 0.000541070241911979,
+      "loss": 3.5804,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.31385698914527893,
+      "learning_rate": 0.0005408953657825706,
+      "loss": 3.5678,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.35995856319318836,
+      "eval_loss": 3.631695508956909,
+      "eval_runtime": 180.6546,
+      "eval_samples_per_second": 92.115,
+      "eval_steps_per_second": 5.762,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.30999556183815,
+      "learning_rate": 0.0005407204896531623,
+      "loss": 3.5824,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.31185469031333923,
+      "learning_rate": 0.000540545613523754,
+      "loss": 3.5701,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.31638017296791077,
+      "learning_rate": 0.0005403707373943456,
+      "loss": 3.5758,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.3234923779964447,
+      "learning_rate": 0.0005401958612649372,
+      "loss": 3.4972,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.3230464458465576,
+      "learning_rate": 0.000540020985135529,
+      "loss": 3.4543,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.32958754897117615,
+      "learning_rate": 0.0005398461090061206,
+      "loss": 3.4646,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.3155558705329895,
+      "learning_rate": 0.0005396712328767123,
+      "loss": 3.4774,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.3168497681617737,
+      "learning_rate": 0.000539496356747304,
+      "loss": 3.4642,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.3253559172153473,
+      "learning_rate": 0.0005393214806178956,
+      "loss": 3.4729,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.32378828525543213,
+      "learning_rate": 0.0005391466044884873,
+      "loss": 3.4818,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.3335542380809784,
+      "learning_rate": 0.000538971728359079,
+      "loss": 3.4832,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.3030463755130768,
+      "learning_rate": 0.0005387968522296705,
+      "loss": 3.4849,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.31400778889656067,
+      "learning_rate": 0.0005386219761002622,
+      "loss": 3.4826,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.3250598907470703,
+      "learning_rate": 0.0005384470999708539,
+      "loss": 3.4884,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.345520555973053,
+      "learning_rate": 0.0005382722238414456,
+      "loss": 3.4921,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.3516606390476227,
+      "learning_rate": 0.0005380973477120373,
+      "loss": 3.4985,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.352478563785553,
+      "learning_rate": 0.000537922471582629,
+      "loss": 3.4826,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.35319584608078003,
+      "learning_rate": 0.0005377475954532206,
+      "loss": 3.4955,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.3407022953033447,
+      "learning_rate": 0.0005375727193238123,
+      "loss": 3.5012,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.35426145792007446,
+      "learning_rate": 0.000537397843194404,
+      "loss": 3.5042,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.3603563969922683,
+      "eval_loss": 3.6395750045776367,
+      "eval_runtime": 180.8321,
+      "eval_samples_per_second": 92.025,
+      "eval_steps_per_second": 5.757,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.35406365990638733,
+      "learning_rate": 0.0005372229670649955,
+      "loss": 3.5002,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.31914931535720825,
+      "learning_rate": 0.0005370480909355872,
+      "loss": 3.5036,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.3213772773742676,
+      "learning_rate": 0.0005368732148061789,
+      "loss": 3.5054,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.34711068868637085,
+      "learning_rate": 0.0005366983386767705,
+      "loss": 3.5062,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.31905606389045715,
+      "learning_rate": 0.0005365234625473623,
+      "loss": 3.5136,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.34507957100868225,
+      "learning_rate": 0.0005363485864179539,
+      "loss": 3.5083,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.33473140001296997,
+      "learning_rate": 0.0005361737102885456,
+      "loss": 3.5181,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.32312649488449097,
+      "learning_rate": 0.0005359988341591373,
+      "loss": 3.5086,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.3478247821331024,
+      "learning_rate": 0.000535823958029729,
+      "loss": 3.5049,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.3395753502845764,
+      "learning_rate": 0.0005356490819003205,
+      "loss": 3.5144,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.35865122079849243,
+      "learning_rate": 0.0005354742057709122,
+      "loss": 3.5177,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.3185734748840332,
+      "learning_rate": 0.0005352993296415039,
+      "loss": 3.5112,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.35112476348876953,
+      "learning_rate": 0.0005351244535120955,
+      "loss": 3.5156,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.3340792953968048,
+      "learning_rate": 0.0005349495773826873,
+      "loss": 3.5167,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.3263596296310425,
+      "learning_rate": 0.0005347747012532789,
+      "loss": 3.4987,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.3156237304210663,
+      "learning_rate": 0.0005345998251238706,
+      "loss": 3.5255,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.34373199939727783,
+      "learning_rate": 0.0005344249489944623,
+      "loss": 3.5092,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.3335568606853485,
+      "learning_rate": 0.0005342500728650538,
+      "loss": 3.5306,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.34739938378334045,
+      "learning_rate": 0.0005340751967356455,
+      "loss": 3.5186,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.3066945970058441,
+      "learning_rate": 0.0005339003206062372,
+      "loss": 3.5238,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.3612617834596394,
+      "eval_loss": 3.6238057613372803,
+      "eval_runtime": 181.0108,
+      "eval_samples_per_second": 91.934,
+      "eval_steps_per_second": 5.751,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.35906171798706055,
+      "learning_rate": 0.0005337254444768288,
+      "loss": 3.5188,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.3499426245689392,
+      "learning_rate": 0.0005335505683474205,
+      "loss": 3.5172,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.32113078236579895,
+      "learning_rate": 0.0005333756922180122,
+      "loss": 3.5124,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.3309899866580963,
+      "learning_rate": 0.0005332008160886039,
+      "loss": 3.521,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.33870729804039,
+      "learning_rate": 0.0005330259399591956,
+      "loss": 3.5212,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.3483949899673462,
+      "learning_rate": 0.0005328510638297873,
+      "loss": 3.5272,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.31153854727745056,
+      "learning_rate": 0.0005326761877003788,
+      "loss": 3.5116,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.32981279492378235,
+      "learning_rate": 0.0005325013115709705,
+      "loss": 3.5192,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.336296409368515,
+      "learning_rate": 0.0005323264354415622,
+      "loss": 3.525,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.34458428621292114,
+      "learning_rate": 0.0005321515593121538,
+      "loss": 3.5289,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.3392544388771057,
+      "learning_rate": 0.0005319766831827455,
+      "loss": 3.5259,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.3356161117553711,
+      "learning_rate": 0.0005318018070533372,
+      "loss": 3.5266,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.34901565313339233,
+      "learning_rate": 0.0005316269309239288,
+      "loss": 3.5224,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.33175837993621826,
+      "learning_rate": 0.0005314520547945206,
+      "loss": 3.5364,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": 0.31978893280029297,
+      "learning_rate": 0.0005312771786651121,
+      "loss": 3.5287,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.3431372344493866,
+      "learning_rate": 0.0005311023025357038,
+      "loss": 3.5381,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3272165358066559,
+      "learning_rate": 0.0005309274264062955,
+      "loss": 3.5206,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.3220779597759247,
+      "learning_rate": 0.0005307525502768872,
+      "loss": 3.5213,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.32556435465812683,
+      "learning_rate": 0.0005305776741474788,
+      "loss": 3.5191,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.32084694504737854,
+      "learning_rate": 0.0005304027980180705,
+      "loss": 3.5195,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.36219539343034457,
+      "eval_loss": 3.6152045726776123,
+      "eval_runtime": 181.196,
+      "eval_samples_per_second": 91.84,
+      "eval_steps_per_second": 5.745,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.3214913606643677,
+      "learning_rate": 0.0005302279218886622,
+      "loss": 3.5319,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.3432092070579529,
+      "learning_rate": 0.0005300530457592538,
+      "loss": 3.5317,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.351404070854187,
+      "learning_rate": 0.0005298781696298456,
+      "loss": 3.5237,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.36213329434394836,
+      "learning_rate": 0.0005297032935004371,
+      "loss": 3.5253,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.3420363664627075,
+      "learning_rate": 0.0005295284173710288,
+      "loss": 3.5339,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.31901925802230835,
+      "learning_rate": 0.0005293535412416205,
+      "loss": 3.533,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.3374883234500885,
+      "learning_rate": 0.0005291786651122121,
+      "loss": 3.5282,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.33261144161224365,
+      "learning_rate": 0.0005290037889828038,
+      "loss": 3.5292,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.32766902446746826,
+      "learning_rate": 0.0005288289128533955,
+      "loss": 3.5325,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.2985290288925171,
+      "learning_rate": 0.0005286540367239872,
+      "loss": 3.5208,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.3742130994796753,
+      "learning_rate": 0.0005284791605945788,
+      "loss": 3.5248,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.3418431580066681,
+      "learning_rate": 0.0005283042844651704,
+      "loss": 3.5182,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.33612483739852905,
+      "learning_rate": 0.0005281294083357621,
+      "loss": 3.4176,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.3870946168899536,
+      "learning_rate": 0.0005279545322063538,
+      "loss": 3.4103,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.33531826734542847,
+      "learning_rate": 0.0005277796560769455,
+      "loss": 3.4213,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.3558143973350525,
+      "learning_rate": 0.0005276047799475371,
+      "loss": 3.4305,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.3327952027320862,
+      "learning_rate": 0.0005274299038181288,
+      "loss": 3.428,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.32903343439102173,
+      "learning_rate": 0.0005272550276887205,
+      "loss": 3.4273,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.3103554844856262,
+      "learning_rate": 0.0005270801515593121,
+      "loss": 3.4361,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.336482435464859,
+      "learning_rate": 0.0005269052754299037,
+      "loss": 3.4431,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.36242612057010065,
+      "eval_loss": 3.6179239749908447,
+      "eval_runtime": 181.3924,
+      "eval_samples_per_second": 91.74,
+      "eval_steps_per_second": 5.739,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.3301815092563629,
+      "learning_rate": 0.0005267303993004954,
+      "loss": 3.44,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.31713324785232544,
+      "learning_rate": 0.000526555523171087,
+      "loss": 3.4436,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.3336808383464813,
+      "learning_rate": 0.0005263806470416788,
+      "loss": 3.4506,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.3340149521827698,
+      "learning_rate": 0.0005262057709122704,
+      "loss": 3.4424,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.3524874150753021,
+      "learning_rate": 0.0005260308947828621,
+      "loss": 3.4413,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.33669140934944153,
+      "learning_rate": 0.0005258560186534538,
+      "loss": 3.4488,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.33581581711769104,
+      "learning_rate": 0.0005256811425240455,
+      "loss": 3.456,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.3273789584636688,
+      "learning_rate": 0.0005255062663946371,
+      "loss": 3.445,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.3533139228820801,
+      "learning_rate": 0.0005253313902652287,
+      "loss": 3.4585,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.35069921612739563,
+      "learning_rate": 0.0005251565141358204,
+      "loss": 3.4586,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.3373342752456665,
+      "learning_rate": 0.000524981638006412,
+      "loss": 3.4618,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.32910624146461487,
+      "learning_rate": 0.0005248067618770038,
+      "loss": 3.4705,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.3465712368488312,
+      "learning_rate": 0.0005246318857475954,
+      "loss": 3.4618,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.33147528767585754,
+      "learning_rate": 0.0005244570096181871,
+      "loss": 3.4512,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.3352925181388855,
+      "learning_rate": 0.0005242821334887788,
+      "loss": 3.4722,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.36607617139816284,
+      "learning_rate": 0.0005241072573593704,
+      "loss": 3.4672,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.3554653823375702,
+      "learning_rate": 0.000523932381229962,
+      "loss": 3.4626,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.33701369166374207,
+      "learning_rate": 0.0005237575051005537,
+      "loss": 3.4676,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.33805081248283386,
+      "learning_rate": 0.0005235826289711454,
+      "loss": 3.4543,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.335484117269516,
+      "learning_rate": 0.000523407752841737,
+      "loss": 3.4638,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.3632213177765383,
+      "eval_loss": 3.6108696460723877,
+      "eval_runtime": 181.2596,
+      "eval_samples_per_second": 91.808,
+      "eval_steps_per_second": 5.743,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.33982568979263306,
+      "learning_rate": 0.0005232328767123287,
+      "loss": 3.4719,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.34843680262565613,
+      "learning_rate": 0.0005230580005829204,
+      "loss": 3.4806,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.3565368056297302,
+      "learning_rate": 0.0005228831244535121,
+      "loss": 3.4761,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.3219994306564331,
+      "learning_rate": 0.0005227082483241038,
+      "loss": 3.4619,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.3399719297885895,
+      "learning_rate": 0.0005225333721946954,
+      "loss": 3.4699,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.33617985248565674,
+      "learning_rate": 0.000522358496065287,
+      "loss": 3.4852,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.33567848801612854,
+      "learning_rate": 0.0005221836199358787,
+      "loss": 3.4678,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.3187302350997925,
+      "learning_rate": 0.0005220087438064703,
+      "loss": 3.4789,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.3293958306312561,
+      "learning_rate": 0.000521833867677062,
+      "loss": 3.487,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.32739055156707764,
+      "learning_rate": 0.0005216589915476537,
+      "loss": 3.4803,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.3373546898365021,
+      "learning_rate": 0.0005214841154182454,
+      "loss": 3.4806,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.30911925435066223,
+      "learning_rate": 0.0005213092392888371,
+      "loss": 3.479,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.34881287813186646,
+      "learning_rate": 0.0005211343631594287,
+      "loss": 3.4737,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.32742807269096375,
+      "learning_rate": 0.0005209594870300204,
+      "loss": 3.4909,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.34914788603782654,
+      "learning_rate": 0.000520784610900612,
+      "loss": 3.4834,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.34026578068733215,
+      "learning_rate": 0.0005206097347712037,
+      "loss": 3.4829,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.33478906750679016,
+      "learning_rate": 0.0005204348586417953,
+      "loss": 3.4747,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.31324368715286255,
+      "learning_rate": 0.000520259982512387,
+      "loss": 3.4695,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.33750981092453003,
+      "learning_rate": 0.0005200851063829787,
+      "loss": 3.4852,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.3517208695411682,
+      "learning_rate": 0.0005199102302535703,
+      "loss": 3.505,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.36391443997925105,
+      "eval_loss": 3.6010148525238037,
+      "eval_runtime": 181.0814,
+      "eval_samples_per_second": 91.898,
+      "eval_steps_per_second": 5.749,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.32067185640335083,
+      "learning_rate": 0.0005197353541241621,
+      "loss": 3.4911,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.34073543548583984,
+      "learning_rate": 0.0005195604779947537,
+      "loss": 3.4761,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.3122085928916931,
+      "learning_rate": 0.0005193856018653454,
+      "loss": 3.484,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.33687078952789307,
+      "learning_rate": 0.000519210725735937,
+      "loss": 3.4858,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.34494155645370483,
+      "learning_rate": 0.0005190358496065286,
+      "loss": 3.4851,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.33593103289604187,
+      "learning_rate": 0.0005188609734771203,
+      "loss": 3.4894,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.3480103313922882,
+      "learning_rate": 0.000518686097347712,
+      "loss": 3.4892,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.3221217691898346,
+      "learning_rate": 0.0005185112212183037,
+      "loss": 3.4947,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.32625964283943176,
+      "learning_rate": 0.0005183363450888953,
+      "loss": 3.493,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.34171169996261597,
+      "learning_rate": 0.000518161468959487,
+      "loss": 3.4883,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.33914807438850403,
+      "learning_rate": 0.0005179865928300787,
+      "loss": 3.4836,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.3401309549808502,
+      "learning_rate": 0.0005178117167006703,
+      "loss": 3.4926,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.3442193269729614,
+      "learning_rate": 0.000517636840571262,
+      "loss": 3.501,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.3085883557796478,
+      "learning_rate": 0.0005174619644418536,
+      "loss": 3.4841,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.3226356506347656,
+      "learning_rate": 0.0005172870883124453,
+      "loss": 3.4904,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.32244643568992615,
+      "learning_rate": 0.000517112212183037,
+      "loss": 3.489,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.3091798722743988,
+      "learning_rate": 0.0005169373360536286,
+      "loss": 3.4939,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.34266242384910583,
+      "learning_rate": 0.0005167624599242203,
+      "loss": 3.4886,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.3311766982078552,
+      "learning_rate": 0.000516587583794812,
+      "loss": 3.4971,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.32126250863075256,
+      "learning_rate": 0.0005164127076654037,
+      "loss": 3.4849,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.36486815919608173,
+      "eval_loss": 3.5899109840393066,
+      "eval_runtime": 181.1825,
+      "eval_samples_per_second": 91.847,
+      "eval_steps_per_second": 5.746,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.34321558475494385,
+      "learning_rate": 0.0005162378315359953,
+      "loss": 3.456,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.32099732756614685,
+      "learning_rate": 0.0005160629554065869,
+      "loss": 3.3697,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.3492303490638733,
+      "learning_rate": 0.0005158880792771786,
+      "loss": 3.3825,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.3613590598106384,
+      "learning_rate": 0.0005157132031477703,
+      "loss": 3.3826,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.3345809876918793,
+      "learning_rate": 0.000515538327018362,
+      "loss": 3.3882,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.34374889731407166,
+      "learning_rate": 0.0005153634508889536,
+      "loss": 3.3903,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.34649938344955444,
+      "learning_rate": 0.0005151885747595453,
+      "loss": 3.3955,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.32787710428237915,
+      "learning_rate": 0.000515013698630137,
+      "loss": 3.3989,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.35148805379867554,
+      "learning_rate": 0.0005148388225007285,
+      "loss": 3.4054,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.3529978096485138,
+      "learning_rate": 0.0005146639463713203,
+      "loss": 3.4034,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.3426077961921692,
+      "learning_rate": 0.0005144890702419119,
+      "loss": 3.4015,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.36019691824913025,
+      "learning_rate": 0.0005143141941125036,
+      "loss": 3.4042,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.32822075486183167,
+      "learning_rate": 0.0005141393179830953,
+      "loss": 3.4107,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.3263060450553894,
+      "learning_rate": 0.0005139644418536869,
+      "loss": 3.4203,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.3379085063934326,
+      "learning_rate": 0.0005137895657242786,
+      "loss": 3.4229,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.3335205316543579,
+      "learning_rate": 0.0005136146895948703,
+      "loss": 3.4138,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.3487148880958557,
+      "learning_rate": 0.000513439813465462,
+      "loss": 3.4128,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.3310547471046448,
+      "learning_rate": 0.0005132649373360535,
+      "loss": 3.4229,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.34830957651138306,
+      "learning_rate": 0.0005130900612066452,
+      "loss": 3.4208,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.32062774896621704,
+      "learning_rate": 0.0005129151850772369,
+      "loss": 3.4216,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.3646655379617292,
+      "eval_loss": 3.5988800525665283,
+      "eval_runtime": 181.3926,
+      "eval_samples_per_second": 91.74,
+      "eval_steps_per_second": 5.739,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.3289690613746643,
+      "learning_rate": 0.0005127403089478286,
+      "loss": 3.4188,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.34852126240730286,
+      "learning_rate": 0.0005125654328184203,
+      "loss": 3.4199,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.3791807293891907,
+      "learning_rate": 0.0005123905566890119,
+      "loss": 3.4376,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.33741068840026855,
+      "learning_rate": 0.0005122156805596036,
+      "loss": 3.4319,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.3323241174221039,
+      "learning_rate": 0.0005120408044301953,
+      "loss": 3.4227,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.35564154386520386,
+      "learning_rate": 0.0005118659283007868,
+      "loss": 3.4385,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.34174442291259766,
+      "learning_rate": 0.0005116910521713785,
+      "loss": 3.4283,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.3545416295528412,
+      "learning_rate": 0.0005115161760419702,
+      "loss": 3.4331,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.3415866196155548,
+      "learning_rate": 0.0005113412999125619,
+      "loss": 3.4329,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.32280662655830383,
+      "learning_rate": 0.0005111664237831536,
+      "loss": 3.4482,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.34938618540763855,
+      "learning_rate": 0.0005109915476537452,
+      "loss": 3.4327,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.34118571877479553,
+      "learning_rate": 0.0005108166715243369,
+      "loss": 3.4423,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.3390498459339142,
+      "learning_rate": 0.0005106417953949286,
+      "loss": 3.4347,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.33087974786758423,
+      "learning_rate": 0.0005104669192655203,
+      "loss": 3.4475,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.32783931493759155,
+      "learning_rate": 0.0005102920431361118,
+      "loss": 3.4459,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.3460298478603363,
+      "learning_rate": 0.0005101171670067035,
+      "loss": 3.4426,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.3336711525917053,
+      "learning_rate": 0.0005099422908772952,
+      "loss": 3.4442,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.33612728118896484,
+      "learning_rate": 0.0005097674147478868,
+      "loss": 3.449,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.3177841603755951,
+      "learning_rate": 0.0005095925386184786,
+      "loss": 3.4447,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.3445686995983124,
+      "learning_rate": 0.0005094176624890702,
+      "loss": 3.4423,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.3656406600019215,
+      "eval_loss": 3.589552164077759,
+      "eval_runtime": 180.5358,
+      "eval_samples_per_second": 92.176,
+      "eval_steps_per_second": 5.766,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.3337860107421875,
+      "learning_rate": 0.0005092427863596619,
+      "loss": 3.434,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.35997340083122253,
+      "learning_rate": 0.0005090679102302536,
+      "loss": 3.4465,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.32720959186553955,
+      "learning_rate": 0.0005088930341008451,
+      "loss": 3.4555,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.34168654680252075,
+      "learning_rate": 0.0005087181579714368,
+      "loss": 3.4556,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.332603394985199,
+      "learning_rate": 0.0005085432818420285,
+      "loss": 3.4463,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.31847983598709106,
+      "learning_rate": 0.0005083684057126202,
+      "loss": 3.4566,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.3408845067024231,
+      "learning_rate": 0.0005081935295832118,
+      "loss": 3.4529,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.34977152943611145,
+      "learning_rate": 0.0005080186534538035,
+      "loss": 3.4471,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.31974101066589355,
+      "learning_rate": 0.0005078437773243952,
+      "loss": 3.451,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.3461310863494873,
+      "learning_rate": 0.0005076689011949869,
+      "loss": 3.4536,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.3448072075843811,
+      "learning_rate": 0.0005074940250655786,
+      "loss": 3.4542,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.3268807530403137,
+      "learning_rate": 0.0005073191489361701,
+      "loss": 3.4477,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.36917299032211304,
+      "learning_rate": 0.0005071442728067618,
+      "loss": 3.461,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.3446688950061798,
+      "learning_rate": 0.0005069693966773535,
+      "loss": 3.4474,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.3360874056816101,
+      "learning_rate": 0.0005067945205479451,
+      "loss": 3.4546,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.3311786651611328,
+      "learning_rate": 0.0005066196444185368,
+      "loss": 3.4458,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.33738234639167786,
+      "learning_rate": 0.0005064447682891285,
+      "loss": 3.4496,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.33856943249702454,
+      "learning_rate": 0.0005062698921597202,
+      "loss": 3.4499,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.37138065695762634,
+      "learning_rate": 0.0005060950160303119,
+      "loss": 3.4548,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.3353155553340912,
+      "learning_rate": 0.0005059201399009035,
+      "loss": 3.4548,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.3661931350796861,
+      "eval_loss": 3.581078052520752,
+      "eval_runtime": 180.3799,
+      "eval_samples_per_second": 92.255,
+      "eval_steps_per_second": 5.771,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.3436886966228485,
+      "learning_rate": 0.0005057452637714951,
+      "loss": 3.4584,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.35296398401260376,
+      "learning_rate": 0.0005055703876420868,
+      "loss": 3.4657,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.32724249362945557,
+      "learning_rate": 0.0005053955115126785,
+      "loss": 3.4646,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.33248910307884216,
+      "learning_rate": 0.0005052206353832701,
+      "loss": 3.4631,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.3228563368320465,
+      "learning_rate": 0.0005050457592538618,
+      "loss": 3.4495,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.3430377244949341,
+      "learning_rate": 0.0005048708831244535,
+      "loss": 3.4564,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.3293624222278595,
+      "learning_rate": 0.0005046960069950451,
+      "loss": 3.4625,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.33939313888549805,
+      "learning_rate": 0.0005045211308656369,
+      "loss": 3.4708,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.33478549122810364,
+      "learning_rate": 0.0005043462547362284,
+      "loss": 3.4623,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.34706467390060425,
+      "learning_rate": 0.0005041713786068201,
+      "loss": 3.3717,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.3538230359554291,
+      "learning_rate": 0.0005039965024774118,
+      "loss": 3.3461,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.3367486596107483,
+      "learning_rate": 0.0005038216263480034,
+      "loss": 3.3598,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.3318694829940796,
+      "learning_rate": 0.0005036467502185951,
+      "loss": 3.3606,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.370888352394104,
+      "learning_rate": 0.0005034718740891868,
+      "loss": 3.3628,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.3453328311443329,
+      "learning_rate": 0.0005032969979597785,
+      "loss": 3.3619,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.3351195156574249,
+      "learning_rate": 0.0005031221218303701,
+      "loss": 3.3658,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.32804664969444275,
+      "learning_rate": 0.0005029472457009618,
+      "loss": 3.3629,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.31457826495170593,
+      "learning_rate": 0.0005027723695715534,
+      "loss": 3.372,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.3434246778488159,
+      "learning_rate": 0.0005025974934421451,
+      "loss": 3.3817,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.31584593653678894,
+      "learning_rate": 0.0005024226173127368,
+      "loss": 3.3804,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.36580259235230084,
+      "eval_loss": 3.5929579734802246,
+      "eval_runtime": 180.492,
+      "eval_samples_per_second": 92.198,
+      "eval_steps_per_second": 5.768,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.34201693534851074,
+      "learning_rate": 0.0005022477411833284,
+      "loss": 3.3746,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.33196499943733215,
+      "learning_rate": 0.0005020728650539201,
+      "loss": 3.3854,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.3429182767868042,
+      "learning_rate": 0.0005018979889245118,
+      "loss": 3.3806,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.3488587737083435,
+      "learning_rate": 0.0005017231127951034,
+      "loss": 3.3733,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.3547254204750061,
+      "learning_rate": 0.0005015482366656951,
+      "loss": 3.384,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.33073434233665466,
+      "learning_rate": 0.0005013733605362868,
+      "loss": 3.3974,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.3444958031177521,
+      "learning_rate": 0.0005011984844068784,
+      "loss": 3.4023,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.3397659361362457,
+      "learning_rate": 0.0005010236082774701,
+      "loss": 3.3907,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.3528975248336792,
+      "learning_rate": 0.0005008487321480617,
+      "loss": 3.4025,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.325632780790329,
+      "learning_rate": 0.0005006738560186534,
+      "loss": 3.4043,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.3388330936431885,
+      "learning_rate": 0.0005004989798892451,
+      "loss": 3.4008,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.33034101128578186,
+      "learning_rate": 0.0005003241037598368,
+      "loss": 3.4041,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.3294675350189209,
+      "learning_rate": 0.0005001492276304284,
+      "loss": 3.3978,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.3379242718219757,
+      "learning_rate": 0.0004999743515010201,
+      "loss": 3.3931,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.3348020315170288,
+      "learning_rate": 0.0004997994753716117,
+      "loss": 3.4,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.33885735273361206,
+      "learning_rate": 0.0004996245992422033,
+      "loss": 3.4037,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.3465324342250824,
+      "learning_rate": 0.0004994497231127951,
+      "loss": 3.3976,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.35643112659454346,
+      "learning_rate": 0.0004992748469833867,
+      "loss": 3.4031,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.327167272567749,
+      "learning_rate": 0.0004990999708539784,
+      "loss": 3.4055,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.3230350613594055,
+      "learning_rate": 0.0004989250947245701,
+      "loss": 3.4071,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.3665124134611529,
+      "eval_loss": 3.5824544429779053,
+      "eval_runtime": 180.4921,
+      "eval_samples_per_second": 92.198,
+      "eval_steps_per_second": 5.768,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.3105214536190033,
+      "learning_rate": 0.0004987502185951617,
+      "loss": 3.4164,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.3333645164966583,
+      "learning_rate": 0.0004985753424657534,
+      "loss": 3.4057,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.3227234482765198,
+      "learning_rate": 0.000498400466336345,
+      "loss": 3.4038,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.37023699283599854,
+      "learning_rate": 0.0004982255902069367,
+      "loss": 3.4149,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.3446352183818817,
+      "learning_rate": 0.0004980507140775283,
+      "loss": 3.4098,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.3288004696369171,
+      "learning_rate": 0.0004978758379481201,
+      "loss": 3.4005,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.314487099647522,
+      "learning_rate": 0.0004977009618187117,
+      "loss": 3.4191,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.3452147841453552,
+      "learning_rate": 0.0004975260856893034,
+      "loss": 3.41,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.31154942512512207,
+      "learning_rate": 0.0004973512095598951,
+      "loss": 3.4108,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.3399568200111389,
+      "learning_rate": 0.0004971763334304867,
+      "loss": 3.4201,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.34255409240722656,
+      "learning_rate": 0.0004970014573010784,
+      "loss": 3.4154,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.3516691327095032,
+      "learning_rate": 0.00049682658117167,
+      "loss": 3.4224,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.33777916431427,
+      "learning_rate": 0.0004966517050422616,
+      "loss": 3.4208,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.3323017358779907,
+      "learning_rate": 0.0004964768289128533,
+      "loss": 3.4177,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.3520144820213318,
+      "learning_rate": 0.000496301952783445,
+      "loss": 3.4251,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.33961278200149536,
+      "learning_rate": 0.0004961270766540367,
+      "loss": 3.4323,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.33390894532203674,
+      "learning_rate": 0.0004959522005246284,
+      "loss": 3.4244,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.3280866742134094,
+      "learning_rate": 0.00049577732439522,
+      "loss": 3.4277,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.34025970101356506,
+      "learning_rate": 0.0004956024482658117,
+      "loss": 3.4221,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.3388609290122986,
+      "learning_rate": 0.0004954275721364034,
+      "loss": 3.4312,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.3672837382876871,
+      "eval_loss": 3.5726044178009033,
+      "eval_runtime": 180.5709,
+      "eval_samples_per_second": 92.158,
+      "eval_steps_per_second": 5.765,
+      "step": 30000
+    },
+    {
+      "epoch": 8.753320904218132,
+      "grad_norm": 0.3268741965293884,
+      "learning_rate": 0.000495252696006995,
+      "loss": 3.4261,
+      "step": 30050
+    },
+    {
+      "epoch": 8.767886273595899,
+      "grad_norm": 0.34607651829719543,
+      "learning_rate": 0.0004950778198775866,
+      "loss": 3.4155,
+      "step": 30100
+    },
+    {
+      "epoch": 8.782451642973665,
+      "grad_norm": 0.31372374296188354,
+      "learning_rate": 0.0004949029437481783,
+      "loss": 3.4126,
+      "step": 30150
+    },
+    {
+      "epoch": 8.797017012351434,
+      "grad_norm": 0.345786452293396,
+      "learning_rate": 0.00049472806761877,
+      "loss": 3.4428,
+      "step": 30200
+    },
+    {
+      "epoch": 8.8115823817292,
+      "grad_norm": 0.3313784599304199,
+      "learning_rate": 0.0004945531914893616,
+      "loss": 3.424,
+      "step": 30250
+    },
+    {
+      "epoch": 8.826147751106967,
+      "grad_norm": 0.3862437307834625,
+      "learning_rate": 0.0004943783153599534,
+      "loss": 3.4353,
+      "step": 30300
+    },
+    {
+      "epoch": 8.840713120484736,
+      "grad_norm": 0.3465390205383301,
+      "learning_rate": 0.000494203439230545,
+      "loss": 3.4368,
+      "step": 30350
+    },
+    {
+      "epoch": 8.855278489862503,
+      "grad_norm": 0.3554335832595825,
+      "learning_rate": 0.0004940285631011367,
+      "loss": 3.4241,
+      "step": 30400
+    },
+    {
+      "epoch": 8.86984385924027,
+      "grad_norm": 0.3209091126918793,
+      "learning_rate": 0.0004938536869717284,
+      "loss": 3.4351,
+      "step": 30450
+    },
+    {
+      "epoch": 8.884409228618038,
+      "grad_norm": 0.33575254678726196,
+      "learning_rate": 0.0004936788108423199,
+      "loss": 3.4228,
+      "step": 30500
+    },
+    {
+      "epoch": 8.898974597995805,
+      "grad_norm": 0.3382629156112671,
+      "learning_rate": 0.0004935039347129116,
+      "loss": 3.4385,
+      "step": 30550
+    },
+    {
+      "epoch": 8.913539967373573,
+      "grad_norm": 0.31963130831718445,
+      "learning_rate": 0.0004933290585835033,
+      "loss": 3.4193,
+      "step": 30600
+    },
+    {
+      "epoch": 8.92810533675134,
+      "grad_norm": 0.3423698842525482,
+      "learning_rate": 0.000493154182454095,
+      "loss": 3.4429,
+      "step": 30650
+    },
+    {
+      "epoch": 8.942670706129107,
+      "grad_norm": 0.35074007511138916,
+      "learning_rate": 0.0004929793063246866,
+      "loss": 3.4345,
+      "step": 30700
+    },
+    {
+      "epoch": 8.957236075506875,
+      "grad_norm": 0.3303930461406708,
+      "learning_rate": 0.0004928044301952783,
+      "loss": 3.4338,
+      "step": 30750
+    },
+    {
+      "epoch": 8.971801444884642,
+      "grad_norm": 0.36244720220565796,
+      "learning_rate": 0.00049262955406587,
+      "loss": 3.4198,
+      "step": 30800
+    },
+    {
+      "epoch": 8.986366814262409,
+      "grad_norm": 0.3402937650680542,
+      "learning_rate": 0.0004924546779364617,
+      "loss": 3.422,
+      "step": 30850
+    },
+    {
+      "epoch": 9.000873922162667,
+      "grad_norm": 0.3200979232788086,
+      "learning_rate": 0.0004922798018070533,
+      "loss": 3.4227,
+      "step": 30900
+    },
+    {
+      "epoch": 9.015439291540433,
+      "grad_norm": 0.3527183532714844,
+      "learning_rate": 0.0004921049256776449,
+      "loss": 3.3389,
+      "step": 30950
+    },
+    {
+      "epoch": 9.0300046609182,
+      "grad_norm": 0.357515811920166,
+      "learning_rate": 0.0004919300495482366,
+      "loss": 3.3276,
+      "step": 31000
+    },
+    {
+      "epoch": 9.0300046609182,
+      "eval_accuracy": 0.36772073219764306,
+      "eval_loss": 3.577789783477783,
+      "eval_runtime": 180.5131,
+      "eval_samples_per_second": 92.187,
+      "eval_steps_per_second": 5.767,
+      "step": 31000
+    },
+    {
+      "epoch": 9.044570030295969,
+      "grad_norm": 0.3615468144416809,
+      "learning_rate": 0.0004917551734188283,
+      "loss": 3.329,
+      "step": 31050
+    },
+    {
+      "epoch": 9.059135399673735,
+      "grad_norm": 0.3507601022720337,
+      "learning_rate": 0.0004915802972894199,
+      "loss": 3.3342,
+      "step": 31100
+    },
+    {
+      "epoch": 9.073700769051504,
+      "grad_norm": 0.35503971576690674,
+      "learning_rate": 0.0004914054211600116,
+      "loss": 3.3349,
+      "step": 31150
+    },
+    {
+      "epoch": 9.08826613842927,
+      "grad_norm": 0.353206068277359,
+      "learning_rate": 0.0004912305450306033,
+      "loss": 3.3419,
+      "step": 31200
+    },
+    {
+      "epoch": 9.102831507807037,
+      "grad_norm": 0.359823077917099,
+      "learning_rate": 0.000491055668901195,
+      "loss": 3.352,
+      "step": 31250
+    },
+    {
+      "epoch": 9.117396877184806,
+      "grad_norm": 0.33940252661705017,
+      "learning_rate": 0.0004908807927717865,
+      "loss": 3.358,
+      "step": 31300
+    },
+    {
+      "epoch": 9.131962246562573,
+      "grad_norm": 0.3473168909549713,
+      "learning_rate": 0.0004907059166423783,
+      "loss": 3.3456,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14652761594034,
+      "grad_norm": 0.34054091572761536,
+      "learning_rate": 0.0004905310405129699,
+      "loss": 3.3383,
+      "step": 31400
+    },
+    {
+      "epoch": 9.161092985318108,
+      "grad_norm": 0.3713337779045105,
+      "learning_rate": 0.0004903561643835616,
+      "loss": 3.3514,
+      "step": 31450
+    },
+    {
+      "epoch": 9.175658354695875,
+      "grad_norm": 0.3296445310115814,
+      "learning_rate": 0.0004901812882541533,
+      "loss": 3.3533,
+      "step": 31500
+    },
+    {
+      "epoch": 9.190223724073643,
+      "grad_norm": 0.34697720408439636,
+      "learning_rate": 0.0004900064121247449,
+      "loss": 3.3548,
+      "step": 31550
+    },
+    {
+      "epoch": 9.20478909345141,
+      "grad_norm": 0.3309634029865265,
+      "learning_rate": 0.0004898315359953366,
+      "loss": 3.35,
+      "step": 31600
+    },
+    {
+      "epoch": 9.219354462829177,
+      "grad_norm": 0.3397737145423889,
+      "learning_rate": 0.0004896566598659283,
+      "loss": 3.3565,
+      "step": 31650
+    },
+    {
+      "epoch": 9.233919832206945,
+      "grad_norm": 0.36492007970809937,
+      "learning_rate": 0.0004894817837365199,
+      "loss": 3.3646,
+      "step": 31700
+    },
+    {
+      "epoch": 9.248485201584712,
+      "grad_norm": 0.38047462701797485,
+      "learning_rate": 0.0004893069076071115,
+      "loss": 3.3694,
+      "step": 31750
+    },
+    {
+      "epoch": 9.263050570962479,
+      "grad_norm": 0.37059515714645386,
+      "learning_rate": 0.0004891320314777032,
+      "loss": 3.3674,
+      "step": 31800
+    },
+    {
+      "epoch": 9.277615940340247,
+      "grad_norm": 0.3410259187221527,
+      "learning_rate": 0.0004889571553482949,
+      "loss": 3.3716,
+      "step": 31850
+    },
+    {
+      "epoch": 9.292181309718014,
+      "grad_norm": 0.3509190082550049,
+      "learning_rate": 0.0004887822792188866,
+      "loss": 3.3652,
+      "step": 31900
+    },
+    {
+      "epoch": 9.306746679095783,
+      "grad_norm": 0.33213210105895996,
+      "learning_rate": 0.0004886074030894782,
+      "loss": 3.3606,
+      "step": 31950
+    },
+    {
+      "epoch": 9.32131204847355,
+      "grad_norm": 0.3355337381362915,
+      "learning_rate": 0.0004884325269600699,
+      "loss": 3.3689,
+      "step": 32000
+    },
+    {
+      "epoch": 9.32131204847355,
+      "eval_accuracy": 0.3674749525227755,
+      "eval_loss": 3.5782315731048584,
+      "eval_runtime": 180.5937,
+      "eval_samples_per_second": 92.146,
+      "eval_steps_per_second": 5.764,
+      "step": 32000
+    },
+    {
+      "epoch": 9.335877417851316,
+      "grad_norm": 0.34377825260162354,
+      "learning_rate": 0.0004882576508306615,
+      "loss": 3.3749,
+      "step": 32050
+    },
+    {
+      "epoch": 9.350442787229085,
+      "grad_norm": 0.3456684947013855,
+      "learning_rate": 0.00048808277470125327,
+      "loss": 3.3629,
+      "step": 32100
+    },
+    {
+      "epoch": 9.365008156606851,
+      "grad_norm": 0.33172163367271423,
+      "learning_rate": 0.0004879078985718449,
+      "loss": 3.3571,
+      "step": 32150
+    },
+    {
+      "epoch": 9.379573525984618,
+      "grad_norm": 0.3585987091064453,
+      "learning_rate": 0.0004877330224424366,
+      "loss": 3.3849,
+      "step": 32200
+    },
+    {
+      "epoch": 9.394138895362387,
+      "grad_norm": 0.3346097469329834,
+      "learning_rate": 0.00048755814631302823,
+      "loss": 3.383,
+      "step": 32250
+    },
+    {
+      "epoch": 9.408704264740154,
+      "grad_norm": 0.36908599734306335,
+      "learning_rate": 0.00048738327018361987,
+      "loss": 3.3743,
+      "step": 32300
+    },
+    {
+      "epoch": 9.423269634117922,
+      "grad_norm": 0.3361928462982178,
+      "learning_rate": 0.00048720839405421156,
+      "loss": 3.3991,
+      "step": 32350
+    },
+    {
+      "epoch": 9.437835003495689,
+      "grad_norm": 0.32366377115249634,
+      "learning_rate": 0.0004870335179248032,
+      "loss": 3.3874,
+      "step": 32400
+    },
+    {
+      "epoch": 9.452400372873456,
+      "grad_norm": 0.3365907073020935,
+      "learning_rate": 0.0004868586417953949,
+      "loss": 3.3823,
+      "step": 32450
+    },
+    {
+      "epoch": 9.466965742251224,
+      "grad_norm": 0.3624427914619446,
+      "learning_rate": 0.0004866837656659865,
+      "loss": 3.3778,
+      "step": 32500
+    },
+    {
+      "epoch": 9.48153111162899,
+      "grad_norm": 0.3516867756843567,
+      "learning_rate": 0.00048650888953657816,
+      "loss": 3.3825,
+      "step": 32550
+    },
+    {
+      "epoch": 9.496096481006758,
+      "grad_norm": 0.3406991958618164,
+      "learning_rate": 0.0004863340134071699,
+      "loss": 3.3884,
+      "step": 32600
+    },
+    {
+      "epoch": 9.510661850384526,
+      "grad_norm": 0.32491427659988403,
+      "learning_rate": 0.00048615913727776154,
+      "loss": 3.3927,
+      "step": 32650
+    },
+    {
+      "epoch": 9.525227219762293,
+      "grad_norm": 0.35882461071014404,
+      "learning_rate": 0.00048598426114835323,
+      "loss": 3.3866,
+      "step": 32700
+    },
+    {
+      "epoch": 9.53979258914006,
+      "grad_norm": 0.3702818751335144,
+      "learning_rate": 0.00048580938501894486,
+      "loss": 3.3808,
+      "step": 32750
+    },
+    {
+      "epoch": 9.554357958517828,
+      "grad_norm": 0.3388266861438751,
+      "learning_rate": 0.00048563450888953655,
+      "loss": 3.3844,
+      "step": 32800
+    },
+    {
+      "epoch": 9.568923327895595,
+      "grad_norm": 0.3394116461277008,
+      "learning_rate": 0.0004854596327601282,
+      "loss": 3.3815,
+      "step": 32850
+    },
+    {
+      "epoch": 9.583488697273363,
+      "grad_norm": 0.3533828854560852,
+      "learning_rate": 0.0004852847566307198,
+      "loss": 3.3932,
+      "step": 32900
+    },
+    {
+      "epoch": 9.59805406665113,
+      "grad_norm": 0.36314722895622253,
+      "learning_rate": 0.0004851098805013115,
+      "loss": 3.3978,
+      "step": 32950
+    },
+    {
+      "epoch": 9.612619436028897,
+      "grad_norm": 0.35144150257110596,
+      "learning_rate": 0.00048493500437190315,
+      "loss": 3.3928,
+      "step": 33000
+    },
+    {
+      "epoch": 9.612619436028897,
+      "eval_accuracy": 0.367995205767567,
+      "eval_loss": 3.5680465698242188,
+      "eval_runtime": 180.7083,
+      "eval_samples_per_second": 92.088,
+      "eval_steps_per_second": 5.761,
+      "step": 33000
+    },
+    {
+      "epoch": 9.627184805406666,
+      "grad_norm": 0.33490830659866333,
+      "learning_rate": 0.0004847601282424949,
+      "loss": 3.3954,
+      "step": 33050
+    },
+    {
+      "epoch": 9.641750174784432,
+      "grad_norm": 0.35083308815956116,
+      "learning_rate": 0.00048458525211308653,
+      "loss": 3.3905,
+      "step": 33100
+    },
+    {
+      "epoch": 9.6563155441622,
+      "grad_norm": 0.3497445285320282,
+      "learning_rate": 0.00048441037598367817,
+      "loss": 3.392,
+      "step": 33150
+    },
+    {
+      "epoch": 9.670880913539968,
+      "grad_norm": 0.37605422735214233,
+      "learning_rate": 0.00048423549985426986,
+      "loss": 3.3965,
+      "step": 33200
+    },
+    {
+      "epoch": 9.685446282917734,
+      "grad_norm": 0.3329659104347229,
+      "learning_rate": 0.0004840606237248615,
+      "loss": 3.3914,
+      "step": 33250
+    },
+    {
+      "epoch": 9.700011652295503,
+      "grad_norm": 0.3359730839729309,
+      "learning_rate": 0.0004838857475954532,
+      "loss": 3.3981,
+      "step": 33300
+    },
+    {
+      "epoch": 9.71457702167327,
+      "grad_norm": 0.3315553367137909,
+      "learning_rate": 0.0004837108714660448,
+      "loss": 3.3989,
+      "step": 33350
+    },
+    {
+      "epoch": 9.729142391051036,
+      "grad_norm": 0.3292211890220642,
+      "learning_rate": 0.0004835359953366365,
+      "loss": 3.4053,
+      "step": 33400
+    },
+    {
+      "epoch": 9.743707760428805,
+      "grad_norm": 0.3730834424495697,
+      "learning_rate": 0.00048336111920722815,
+      "loss": 3.4121,
+      "step": 33450
+    },
+    {
+      "epoch": 9.758273129806572,
+      "grad_norm": 0.3661917448043823,
+      "learning_rate": 0.0004831862430778198,
+      "loss": 3.4055,
+      "step": 33500
+    },
+    {
+      "epoch": 9.772838499184338,
+      "grad_norm": 0.35423949360847473,
+      "learning_rate": 0.00048301136694841153,
+      "loss": 3.3926,
+      "step": 33550
+    },
+    {
+      "epoch": 9.787403868562107,
+      "grad_norm": 0.3627997934818268,
+      "learning_rate": 0.00048283649081900317,
+      "loss": 3.4121,
+      "step": 33600
+    },
+    {
+      "epoch": 9.801969237939874,
+      "grad_norm": 0.34267759323120117,
+      "learning_rate": 0.00048266161468959486,
+      "loss": 3.3994,
+      "step": 33650
+    },
+    {
+      "epoch": 9.816534607317642,
+      "grad_norm": 0.33956974744796753,
+      "learning_rate": 0.0004824867385601865,
+      "loss": 3.403,
+      "step": 33700
+    },
+    {
+      "epoch": 9.831099976695409,
+      "grad_norm": 0.32388654351234436,
+      "learning_rate": 0.00048231186243077813,
+      "loss": 3.3997,
+      "step": 33750
+    },
+    {
+      "epoch": 9.845665346073176,
+      "grad_norm": 0.3442712724208832,
+      "learning_rate": 0.0004821369863013698,
+      "loss": 3.4033,
+      "step": 33800
+    },
+    {
+      "epoch": 9.860230715450944,
+      "grad_norm": 0.3338857889175415,
+      "learning_rate": 0.00048196211017196146,
+      "loss": 3.3966,
+      "step": 33850
+    },
+    {
+      "epoch": 9.874796084828711,
+      "grad_norm": 0.3293977677822113,
+      "learning_rate": 0.00048178723404255315,
+      "loss": 3.4135,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88936145420648,
+      "grad_norm": 0.36276566982269287,
+      "learning_rate": 0.0004816123579131448,
+      "loss": 3.4042,
+      "step": 33950
+    },
+    {
+      "epoch": 9.903926823584246,
+      "grad_norm": 0.35830309987068176,
+      "learning_rate": 0.0004814374817837364,
+      "loss": 3.4054,
+      "step": 34000
+    },
+    {
+      "epoch": 9.903926823584246,
+      "eval_accuracy": 0.36873207440044753,
+      "eval_loss": 3.5603408813476562,
+      "eval_runtime": 180.5399,
+      "eval_samples_per_second": 92.174,
+      "eval_steps_per_second": 5.766,
+      "step": 34000
+    },
+    {
+      "epoch": 9.918492192962013,
+      "grad_norm": 0.3282095789909363,
+      "learning_rate": 0.00048126260565432816,
+      "loss": 3.4053,
+      "step": 34050
+    },
+    {
+      "epoch": 9.933057562339782,
+      "grad_norm": 0.32416263222694397,
+      "learning_rate": 0.0004810877295249198,
+      "loss": 3.407,
+      "step": 34100
+    },
+    {
+      "epoch": 9.947622931717548,
+      "grad_norm": 0.33595913648605347,
+      "learning_rate": 0.0004809128533955115,
+      "loss": 3.3987,
+      "step": 34150
+    },
+    {
+      "epoch": 9.962188301095315,
+      "grad_norm": 0.3410275876522064,
+      "learning_rate": 0.0004807379772661031,
+      "loss": 3.4105,
+      "step": 34200
+    },
+    {
+      "epoch": 9.976753670473084,
+      "grad_norm": 0.3720123767852783,
+      "learning_rate": 0.0004805631011366948,
+      "loss": 3.4018,
+      "step": 34250
+    },
+    {
+      "epoch": 9.99131903985085,
+      "grad_norm": 0.3672340512275696,
+      "learning_rate": 0.00048038822500728645,
+      "loss": 3.4149,
+      "step": 34300
+    },
+    {
+      "epoch": 10.005826147751106,
+      "grad_norm": 0.35301780700683594,
+      "learning_rate": 0.0004802133488778781,
+      "loss": 3.365,
+      "step": 34350
+    },
+    {
+      "epoch": 10.020391517128875,
+      "grad_norm": 0.3307633697986603,
+      "learning_rate": 0.0004800384727484698,
+      "loss": 3.2972,
+      "step": 34400
+    },
+    {
+      "epoch": 10.034956886506642,
+      "grad_norm": 0.33249130845069885,
+      "learning_rate": 0.0004798635966190614,
+      "loss": 3.3138,
+      "step": 34450
+    },
+    {
+      "epoch": 10.049522255884408,
+      "grad_norm": 0.3358910083770752,
+      "learning_rate": 0.00047968872048965316,
+      "loss": 3.3001,
+      "step": 34500
+    },
+    {
+      "epoch": 10.064087625262177,
+      "grad_norm": 0.36032402515411377,
+      "learning_rate": 0.0004795138443602448,
+      "loss": 3.3061,
+      "step": 34550
+    },
+    {
+      "epoch": 10.078652994639944,
+      "grad_norm": 0.3334386348724365,
+      "learning_rate": 0.00047933896823083643,
+      "loss": 3.3094,
+      "step": 34600
+    },
+    {
+      "epoch": 10.093218364017712,
+      "grad_norm": 0.3427768647670746,
+      "learning_rate": 0.0004791640921014281,
+      "loss": 3.3143,
+      "step": 34650
+    },
+    {
+      "epoch": 10.107783733395479,
+      "grad_norm": 0.34351709485054016,
+      "learning_rate": 0.00047898921597201976,
+      "loss": 3.3146,
+      "step": 34700
+    },
+    {
+      "epoch": 10.122349102773246,
+      "grad_norm": 0.3420124351978302,
+      "learning_rate": 0.00047881433984261145,
+      "loss": 3.3151,
+      "step": 34750
+    },
+    {
+      "epoch": 10.136914472151014,
+      "grad_norm": 0.35633090138435364,
+      "learning_rate": 0.0004786394637132031,
+      "loss": 3.3189,
+      "step": 34800
+    },
+    {
+      "epoch": 10.151479841528781,
+      "grad_norm": 0.33765673637390137,
+      "learning_rate": 0.0004784645875837948,
+      "loss": 3.3225,
+      "step": 34850
+    },
+    {
+      "epoch": 10.166045210906548,
+      "grad_norm": 0.32485634088516235,
+      "learning_rate": 0.0004782897114543864,
+      "loss": 3.3276,
+      "step": 34900
+    },
+    {
+      "epoch": 10.180610580284316,
+      "grad_norm": 0.3453110456466675,
+      "learning_rate": 0.00047811483532497805,
+      "loss": 3.3325,
+      "step": 34950
+    },
+    {
+      "epoch": 10.195175949662083,
+      "grad_norm": 0.33640891313552856,
+      "learning_rate": 0.0004779399591955698,
+      "loss": 3.3389,
+      "step": 35000
+    },
+    {
+      "epoch": 10.195175949662083,
+      "eval_accuracy": 0.3684539552946763,
+      "eval_loss": 3.574207305908203,
+      "eval_runtime": 180.7408,
+      "eval_samples_per_second": 92.071,
+      "eval_steps_per_second": 5.76,
+      "step": 35000
+    },
+    {
+      "epoch": 10.209741319039852,
+      "grad_norm": 0.3730228543281555,
+      "learning_rate": 0.00047776508306616143,
+      "loss": 3.3386,
+      "step": 35050
+    },
+    {
+      "epoch": 10.224306688417618,
+      "grad_norm": 0.337240993976593,
+      "learning_rate": 0.0004775902069367531,
+      "loss": 3.3323,
+      "step": 35100
+    },
+    {
+      "epoch": 10.238872057795385,
+      "grad_norm": 0.35810065269470215,
+      "learning_rate": 0.00047741533080734476,
+      "loss": 3.3288,
+      "step": 35150
+    },
+    {
+      "epoch": 10.253437427173154,
+      "grad_norm": 0.36488911509513855,
+      "learning_rate": 0.0004772404546779364,
+      "loss": 3.3393,
+      "step": 35200
+    },
+    {
+      "epoch": 10.26800279655092,
+      "grad_norm": 0.3996935784816742,
+      "learning_rate": 0.0004770655785485281,
+      "loss": 3.3444,
+      "step": 35250
+    },
+    {
+      "epoch": 10.282568165928687,
+      "grad_norm": 0.3564625382423401,
+      "learning_rate": 0.0004768907024191197,
+      "loss": 3.3532,
+      "step": 35300
+    },
+    {
+      "epoch": 10.297133535306456,
+      "grad_norm": 0.34120872616767883,
+      "learning_rate": 0.0004767158262897114,
+      "loss": 3.3494,
+      "step": 35350
+    },
+    {
+      "epoch": 10.311698904684222,
+      "grad_norm": 0.3489512801170349,
+      "learning_rate": 0.00047654095016030305,
+      "loss": 3.3401,
+      "step": 35400
+    },
+    {
+      "epoch": 10.326264274061991,
+      "grad_norm": 0.3304221034049988,
+      "learning_rate": 0.0004763660740308948,
+      "loss": 3.3385,
+      "step": 35450
+    },
+    {
+      "epoch": 10.340829643439758,
+      "grad_norm": 0.35581955313682556,
+      "learning_rate": 0.0004761911979014864,
+      "loss": 3.3412,
+      "step": 35500
+    },
+    {
+      "epoch": 10.355395012817525,
+      "grad_norm": 0.336819052696228,
+      "learning_rate": 0.00047601632177207806,
+      "loss": 3.353,
+      "step": 35550
+    },
+    {
+      "epoch": 10.369960382195293,
+      "grad_norm": 0.38106346130371094,
+      "learning_rate": 0.00047584144564266975,
+      "loss": 3.3524,
+      "step": 35600
+    },
+    {
+      "epoch": 10.38452575157306,
+      "grad_norm": 0.3523114025592804,
+      "learning_rate": 0.0004756665695132614,
+      "loss": 3.3468,
+      "step": 35650
+    },
+    {
+      "epoch": 10.399091120950827,
+      "grad_norm": 0.3667258620262146,
+      "learning_rate": 0.0004754916933838531,
+      "loss": 3.3502,
+      "step": 35700
+    },
+    {
+      "epoch": 10.413656490328595,
+      "grad_norm": 0.3516891896724701,
+      "learning_rate": 0.0004753168172544447,
+      "loss": 3.3623,
+      "step": 35750
+    },
+    {
+      "epoch": 10.428221859706362,
+      "grad_norm": 0.34636440873146057,
+      "learning_rate": 0.00047514194112503635,
+      "loss": 3.3703,
+      "step": 35800
+    },
+    {
+      "epoch": 10.44278722908413,
+      "grad_norm": 0.3442688584327698,
+      "learning_rate": 0.00047496706499562804,
+      "loss": 3.3575,
+      "step": 35850
+    },
+    {
+      "epoch": 10.457352598461897,
+      "grad_norm": 0.35010236501693726,
+      "learning_rate": 0.0004747921888662197,
+      "loss": 3.3677,
+      "step": 35900
+    },
+    {
+      "epoch": 10.471917967839664,
+      "grad_norm": 0.3223050534725189,
+      "learning_rate": 0.0004746173127368114,
+      "loss": 3.361,
+      "step": 35950
+    },
+    {
+      "epoch": 10.486483337217432,
+      "grad_norm": 0.3651178479194641,
+      "learning_rate": 0.00047444243660740306,
+      "loss": 3.3566,
+      "step": 36000
+    },
+    {
+      "epoch": 10.486483337217432,
+      "eval_accuracy": 0.36855473672116507,
+      "eval_loss": 3.566204071044922,
+      "eval_runtime": 180.6127,
+      "eval_samples_per_second": 92.136,
+      "eval_steps_per_second": 5.764,
+      "step": 36000
+    },
+    {
+      "epoch": 10.5010487065952,
+      "grad_norm": 0.34635260701179504,
+      "learning_rate": 0.0004742675604779947,
+      "loss": 3.3656,
+      "step": 36050
+    },
+    {
+      "epoch": 10.515614075972966,
+      "grad_norm": 0.3358325660228729,
+      "learning_rate": 0.0004740926843485864,
+      "loss": 3.3699,
+      "step": 36100
+    },
+    {
+      "epoch": 10.530179445350734,
+      "grad_norm": 0.3611391484737396,
+      "learning_rate": 0.000473917808219178,
+      "loss": 3.3752,
+      "step": 36150
+    },
+    {
+      "epoch": 10.544744814728501,
+      "grad_norm": 0.34786006808280945,
+      "learning_rate": 0.0004737429320897697,
+      "loss": 3.3712,
+      "step": 36200
+    },
+    {
+      "epoch": 10.55931018410627,
+      "grad_norm": 0.34254857897758484,
+      "learning_rate": 0.00047356805596036135,
+      "loss": 3.3698,
+      "step": 36250
+    },
+    {
+      "epoch": 10.573875553484037,
+      "grad_norm": 0.3432486951351166,
+      "learning_rate": 0.00047339317983095304,
+      "loss": 3.358,
+      "step": 36300
+    },
+    {
+      "epoch": 10.588440922861803,
+      "grad_norm": 0.3423634171485901,
+      "learning_rate": 0.0004732183037015447,
+      "loss": 3.3799,
+      "step": 36350
+    },
+    {
+      "epoch": 10.603006292239572,
+      "grad_norm": 0.3549594283103943,
+      "learning_rate": 0.0004730434275721363,
+      "loss": 3.3809,
+      "step": 36400
+    },
+    {
+      "epoch": 10.617571661617339,
+      "grad_norm": 0.36914047598838806,
+      "learning_rate": 0.00047286855144272806,
+      "loss": 3.3755,
+      "step": 36450
+    },
+    {
+      "epoch": 10.632137030995105,
+      "grad_norm": 0.34810906648635864,
+      "learning_rate": 0.0004726936753133197,
+      "loss": 3.3678,
+      "step": 36500
+    },
+    {
+      "epoch": 10.646702400372874,
+      "grad_norm": 0.36608827114105225,
+      "learning_rate": 0.0004725187991839114,
+      "loss": 3.3748,
+      "step": 36550
+    },
+    {
+      "epoch": 10.66126776975064,
+      "grad_norm": 0.3450092077255249,
+      "learning_rate": 0.000472343923054503,
+      "loss": 3.3695,
+      "step": 36600
+    },
+    {
+      "epoch": 10.675833139128407,
+      "grad_norm": 0.3681490421295166,
+      "learning_rate": 0.00047216904692509465,
+      "loss": 3.3782,
+      "step": 36650
+    },
+    {
+      "epoch": 10.690398508506176,
+      "grad_norm": 0.369987428188324,
+      "learning_rate": 0.00047199417079568634,
+      "loss": 3.3902,
+      "step": 36700
+    },
+    {
+      "epoch": 10.704963877883943,
+      "grad_norm": 0.3441363275051117,
+      "learning_rate": 0.000471819294666278,
+      "loss": 3.3744,
+      "step": 36750
+    },
+    {
+      "epoch": 10.719529247261711,
+      "grad_norm": 0.35835736989974976,
+      "learning_rate": 0.00047164441853686967,
+      "loss": 3.3766,
+      "step": 36800
+    },
+    {
+      "epoch": 10.734094616639478,
+      "grad_norm": 0.34251275658607483,
+      "learning_rate": 0.0004714695424074613,
+      "loss": 3.3909,
+      "step": 36850
+    },
+    {
+      "epoch": 10.748659986017245,
+      "grad_norm": 0.3410945236682892,
+      "learning_rate": 0.00047129466627805305,
+      "loss": 3.3767,
+      "step": 36900
+    },
+    {
+      "epoch": 10.763225355395013,
+      "grad_norm": 0.3362538516521454,
+      "learning_rate": 0.0004711197901486447,
+      "loss": 3.3841,
+      "step": 36950
+    },
+    {
+      "epoch": 10.77779072477278,
+      "grad_norm": 0.3534057140350342,
+      "learning_rate": 0.0004709449140192363,
+      "loss": 3.3865,
+      "step": 37000
+    },
+    {
+      "epoch": 10.77779072477278,
+      "eval_accuracy": 0.36938544850263144,
+      "eval_loss": 3.5582756996154785,
+      "eval_runtime": 180.5512,
+      "eval_samples_per_second": 92.168,
+      "eval_steps_per_second": 5.766,
+      "step": 37000
+    },
+    {
+      "epoch": 10.792356094150549,
+      "grad_norm": 0.3291257917881012,
+      "learning_rate": 0.000470770037889828,
+      "loss": 3.3841,
+      "step": 37050
+    },
+    {
+      "epoch": 10.806921463528315,
+      "grad_norm": 0.31563735008239746,
+      "learning_rate": 0.00047059516176041965,
+      "loss": 3.3827,
+      "step": 37100
+    },
+    {
+      "epoch": 10.821486832906082,
+      "grad_norm": 0.34236061573028564,
+      "learning_rate": 0.00047042028563101134,
+      "loss": 3.3738,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83605220228385,
+      "grad_norm": 0.3518962860107422,
+      "learning_rate": 0.000470245409501603,
+      "loss": 3.3873,
+      "step": 37200
+    },
+    {
+      "epoch": 10.850617571661617,
+      "grad_norm": 0.34845709800720215,
+      "learning_rate": 0.0004700705333721946,
+      "loss": 3.3749,
+      "step": 37250
+    },
+    {
+      "epoch": 10.865182941039384,
+      "grad_norm": 0.3321661651134491,
+      "learning_rate": 0.0004698956572427863,
+      "loss": 3.3812,
+      "step": 37300
+    },
+    {
+      "epoch": 10.879748310417153,
+      "grad_norm": 0.35323044657707214,
+      "learning_rate": 0.00046972078111337794,
+      "loss": 3.3762,
+      "step": 37350
+    },
+    {
+      "epoch": 10.89431367979492,
+      "grad_norm": 0.358534574508667,
+      "learning_rate": 0.0004695459049839697,
+      "loss": 3.3799,
+      "step": 37400
+    },
+    {
+      "epoch": 10.908879049172686,
+      "grad_norm": 0.3211372494697571,
+      "learning_rate": 0.0004693710288545613,
+      "loss": 3.3873,
+      "step": 37450
+    },
+    {
+      "epoch": 10.923444418550455,
+      "grad_norm": 0.3522772490978241,
+      "learning_rate": 0.000469196152725153,
+      "loss": 3.393,
+      "step": 37500
+    },
+    {
+      "epoch": 10.938009787928221,
+      "grad_norm": 0.329208105802536,
+      "learning_rate": 0.00046902127659574465,
+      "loss": 3.3878,
+      "step": 37550
+    },
+    {
+      "epoch": 10.95257515730599,
+      "grad_norm": 0.3144879639148712,
+      "learning_rate": 0.0004688464004663363,
+      "loss": 3.3864,
+      "step": 37600
+    },
+    {
+      "epoch": 10.967140526683757,
+      "grad_norm": 0.351262629032135,
+      "learning_rate": 0.000468671524336928,
+      "loss": 3.3842,
+      "step": 37650
+    },
+    {
+      "epoch": 10.981705896061523,
+      "grad_norm": 0.335332989692688,
+      "learning_rate": 0.0004684966482075196,
+      "loss": 3.3909,
+      "step": 37700
+    },
+    {
+      "epoch": 10.996271265439292,
+      "grad_norm": 0.3745553493499756,
+      "learning_rate": 0.0004683217720781113,
+      "loss": 3.3869,
+      "step": 37750
+    },
+    {
+      "epoch": 11.010778373339548,
+      "grad_norm": 0.39662572741508484,
+      "learning_rate": 0.00046814689594870294,
+      "loss": 3.3025,
+      "step": 37800
+    },
+    {
+      "epoch": 11.025343742717315,
+      "grad_norm": 0.35523492097854614,
+      "learning_rate": 0.0004679720198192946,
+      "loss": 3.2731,
+      "step": 37850
+    },
+    {
+      "epoch": 11.039909112095083,
+      "grad_norm": 0.36821720004081726,
+      "learning_rate": 0.0004677971436898863,
+      "loss": 3.2698,
+      "step": 37900
+    },
+    {
+      "epoch": 11.05447448147285,
+      "grad_norm": 0.37665146589279175,
+      "learning_rate": 0.00046762226756047795,
+      "loss": 3.2943,
+      "step": 37950
+    },
+    {
+      "epoch": 11.069039850850617,
+      "grad_norm": 0.33821412920951843,
+      "learning_rate": 0.00046744739143106964,
+      "loss": 3.3071,
+      "step": 38000
+    },
+    {
+      "epoch": 11.069039850850617,
+      "eval_accuracy": 0.3689831459821903,
+      "eval_loss": 3.5702402591705322,
+      "eval_runtime": 180.66,
+      "eval_samples_per_second": 92.112,
+      "eval_steps_per_second": 5.762,
+      "step": 38000
+    },
+    {
+      "epoch": 11.083605220228385,
+      "grad_norm": 0.3681955635547638,
+      "learning_rate": 0.0004672725153016613,
+      "loss": 3.3036,
+      "step": 38050
+    },
+    {
+      "epoch": 11.098170589606152,
+      "grad_norm": 0.34856081008911133,
+      "learning_rate": 0.00046709763917225297,
+      "loss": 3.2974,
+      "step": 38100
+    },
+    {
+      "epoch": 11.11273595898392,
+      "grad_norm": 0.3701181411743164,
+      "learning_rate": 0.0004669227630428446,
+      "loss": 3.2957,
+      "step": 38150
+    },
+    {
+      "epoch": 11.127301328361687,
+      "grad_norm": 0.3402550220489502,
+      "learning_rate": 0.00046674788691343624,
+      "loss": 3.3014,
+      "step": 38200
+    },
+    {
+      "epoch": 11.141866697739454,
+      "grad_norm": 0.3537476658821106,
+      "learning_rate": 0.00046657301078402793,
+      "loss": 3.2916,
+      "step": 38250
+    },
+    {
+      "epoch": 11.156432067117223,
+      "grad_norm": 0.3368578553199768,
+      "learning_rate": 0.00046639813465461957,
+      "loss": 3.2984,
+      "step": 38300
+    },
+    {
+      "epoch": 11.17099743649499,
+      "grad_norm": 0.35655760765075684,
+      "learning_rate": 0.0004662232585252113,
+      "loss": 3.3049,
+      "step": 38350
+    },
+    {
+      "epoch": 11.185562805872756,
+      "grad_norm": 0.32713720202445984,
+      "learning_rate": 0.00046604838239580295,
+      "loss": 3.307,
+      "step": 38400
+    },
+    {
+      "epoch": 11.200128175250525,
+      "grad_norm": 0.3530442416667938,
+      "learning_rate": 0.0004658735062663946,
+      "loss": 3.3251,
+      "step": 38450
+    },
+    {
+      "epoch": 11.214693544628291,
+      "grad_norm": 0.3867241144180298,
+      "learning_rate": 0.0004656986301369863,
+      "loss": 3.3222,
+      "step": 38500
+    },
+    {
+      "epoch": 11.22925891400606,
+      "grad_norm": 0.3441658616065979,
+      "learning_rate": 0.0004655237540075779,
+      "loss": 3.3201,
+      "step": 38550
+    },
+    {
+      "epoch": 11.243824283383827,
+      "grad_norm": 0.3687216639518738,
+      "learning_rate": 0.0004653488778781696,
+      "loss": 3.3179,
+      "step": 38600
+    },
+    {
+      "epoch": 11.258389652761593,
+      "grad_norm": 0.3919098973274231,
+      "learning_rate": 0.00046517400174876124,
+      "loss": 3.3203,
+      "step": 38650
+    },
+    {
+      "epoch": 11.272955022139362,
+      "grad_norm": 0.3542415201663971,
+      "learning_rate": 0.0004649991256193529,
+      "loss": 3.3219,
+      "step": 38700
+    },
+    {
+      "epoch": 11.287520391517129,
+      "grad_norm": 0.33851155638694763,
+      "learning_rate": 0.00046482424948994457,
+      "loss": 3.3277,
+      "step": 38750
+    },
+    {
+      "epoch": 11.302085760894895,
+      "grad_norm": 0.3498404026031494,
+      "learning_rate": 0.0004646493733605362,
+      "loss": 3.3304,
+      "step": 38800
+    },
+    {
+      "epoch": 11.316651130272664,
+      "grad_norm": 0.3452480435371399,
+      "learning_rate": 0.00046447449723112795,
+      "loss": 3.3376,
+      "step": 38850
+    },
+    {
+      "epoch": 11.33121649965043,
+      "grad_norm": 0.3540496230125427,
+      "learning_rate": 0.0004642996211017196,
+      "loss": 3.3389,
+      "step": 38900
+    },
+    {
+      "epoch": 11.3457818690282,
+      "grad_norm": 0.3557718098163605,
+      "learning_rate": 0.0004641247449723113,
+      "loss": 3.3363,
+      "step": 38950
+    },
+    {
+      "epoch": 11.360347238405966,
+      "grad_norm": 0.3436686396598816,
+      "learning_rate": 0.0004639498688429029,
+      "loss": 3.3269,
+      "step": 39000
+    },
+    {
+      "epoch": 11.360347238405966,
+      "eval_accuracy": 0.3691524870021947,
+      "eval_loss": 3.5654149055480957,
+      "eval_runtime": 180.5254,
+      "eval_samples_per_second": 92.181,
+      "eval_steps_per_second": 5.767,
+      "step": 39000
+    },
+    {
+      "epoch": 11.374912607783733,
+      "grad_norm": 0.35435765981674194,
+      "learning_rate": 0.00046377499271349455,
+      "loss": 3.3498,
+      "step": 39050
+    },
+    {
+      "epoch": 11.389477977161501,
+      "grad_norm": 0.33486226201057434,
+      "learning_rate": 0.00046360011658408624,
+      "loss": 3.334,
+      "step": 39100
+    },
+    {
+      "epoch": 11.404043346539268,
+      "grad_norm": 0.3701058328151703,
+      "learning_rate": 0.00046342524045467787,
+      "loss": 3.3274,
+      "step": 39150
+    },
+    {
+      "epoch": 11.418608715917035,
+      "grad_norm": 0.3655923306941986,
+      "learning_rate": 0.00046325036432526956,
+      "loss": 3.3407,
+      "step": 39200
+    },
+    {
+      "epoch": 11.433174085294803,
+      "grad_norm": 0.3761111795902252,
+      "learning_rate": 0.0004630754881958612,
+      "loss": 3.3474,
+      "step": 39250
+    },
+    {
+      "epoch": 11.44773945467257,
+      "grad_norm": 0.3308749198913574,
+      "learning_rate": 0.00046290061206645284,
+      "loss": 3.329,
+      "step": 39300
+    },
+    {
+      "epoch": 11.462304824050339,
+      "grad_norm": 0.3537072241306305,
+      "learning_rate": 0.0004627257359370446,
+      "loss": 3.3509,
+      "step": 39350
+    },
+    {
+      "epoch": 11.476870193428105,
+      "grad_norm": 0.36065319180488586,
+      "learning_rate": 0.0004625508598076362,
+      "loss": 3.3393,
+      "step": 39400
+    },
+    {
+      "epoch": 11.491435562805872,
+      "grad_norm": 0.3387892246246338,
+      "learning_rate": 0.0004623759836782279,
+      "loss": 3.3344,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50600093218364,
+      "grad_norm": 0.3416099548339844,
+      "learning_rate": 0.00046220110754881954,
+      "loss": 3.3415,
+      "step": 39500
+    },
+    {
+      "epoch": 11.520566301561407,
+      "grad_norm": 0.34926944971084595,
+      "learning_rate": 0.00046202623141941123,
+      "loss": 3.3434,
+      "step": 39550
+    },
+    {
+      "epoch": 11.535131670939174,
+      "grad_norm": 0.3529592752456665,
+      "learning_rate": 0.00046185135529000287,
+      "loss": 3.352,
+      "step": 39600
+    },
+    {
+      "epoch": 11.549697040316943,
+      "grad_norm": 0.32331353425979614,
+      "learning_rate": 0.0004616764791605945,
+      "loss": 3.3386,
+      "step": 39650
+    },
+    {
+      "epoch": 11.56426240969471,
+      "grad_norm": 0.32540374994277954,
+      "learning_rate": 0.0004615016030311862,
+      "loss": 3.3506,
+      "step": 39700
+    },
+    {
+      "epoch": 11.578827779072478,
+      "grad_norm": 0.36556684970855713,
+      "learning_rate": 0.00046132672690177783,
+      "loss": 3.3511,
+      "step": 39750
+    },
+    {
+      "epoch": 11.593393148450245,
+      "grad_norm": 0.37641826272010803,
+      "learning_rate": 0.0004611518507723696,
+      "loss": 3.3488,
+      "step": 39800
+    },
+    {
+      "epoch": 11.607958517828012,
+      "grad_norm": 0.33244505524635315,
+      "learning_rate": 0.0004609769746429612,
+      "loss": 3.3593,
+      "step": 39850
+    },
+    {
+      "epoch": 11.62252388720578,
+      "grad_norm": 0.3254227042198181,
+      "learning_rate": 0.00046080209851355285,
+      "loss": 3.3406,
+      "step": 39900
+    },
+    {
+      "epoch": 11.637089256583547,
+      "grad_norm": 0.35844728350639343,
+      "learning_rate": 0.00046062722238414454,
+      "loss": 3.351,
+      "step": 39950
+    },
+    {
+      "epoch": 11.651654625961314,
+      "grad_norm": 0.34296858310699463,
+      "learning_rate": 0.0004604523462547362,
+      "loss": 3.3528,
+      "step": 40000
+    },
+    {
+      "epoch": 11.651654625961314,
+      "eval_accuracy": 0.3700508175937323,
+      "eval_loss": 3.5568654537200928,
+      "eval_runtime": 182.4459,
+      "eval_samples_per_second": 91.211,
+      "eval_steps_per_second": 5.706,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171650,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.36068395515904e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}