diff --git "a/cost_to_hit_frequency_2128/checkpoint-40000/trainer_state.json" "b/cost_to_hit_frequency_2128/checkpoint-40000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/cost_to_hit_frequency_2128/checkpoint-40000/trainer_state.json"
@@ -0,0 +1,6003 @@
+{
+  "best_global_step": 40000,
+  "best_metric": 3.5464437007904053,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_2128/checkpoint-40000",
+  "epoch": 11.651654625961314,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 1.1913238763809204,
+      "learning_rate": 0.000294,
+      "loss": 8.4438,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.7407680749893188,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7092,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.637363851070404,
+      "learning_rate": 0.0005995711785297549,
+      "loss": 6.3304,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.5566583275794983,
+      "learning_rate": 0.0005991336056009335,
+      "loss": 6.1355,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.47791680693626404,
+      "learning_rate": 0.000598696032672112,
+      "loss": 5.9734,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.4901290833950043,
+      "learning_rate": 0.0005982584597432905,
+      "loss": 5.8595,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.4541548788547516,
+      "learning_rate": 0.0005978208868144691,
+      "loss": 5.7462,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.5347028374671936,
+      "learning_rate": 0.0005973833138856476,
+      "loss": 5.621,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.5162431597709656,
+      "learning_rate": 0.000596945740956826,
+      "loss": 5.5125,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.46286195516586304,
+      "learning_rate": 0.0005965081680280046,
+      "loss": 5.4126,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.4214600622653961,
+      "learning_rate": 0.0005960705950991831,
+      "loss": 5.3353,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.4491782486438751,
+      "learning_rate": 0.0005956330221703616,
+      "loss": 5.2624,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.5439804792404175,
+      "learning_rate": 0.0005951954492415402,
+      "loss": 5.1937,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.4217555522918701,
+      "learning_rate": 0.0005947578763127188,
+      "loss": 5.1259,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.3988608121871948,
+      "learning_rate": 0.0005943203033838973,
+      "loss": 5.0741,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.3983953297138214,
+      "learning_rate": 0.0005938827304550758,
+      "loss": 5.0375,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4305328130722046,
+      "learning_rate": 0.0005934451575262544,
+      "loss": 4.9659,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.5058379173278809,
+      "learning_rate": 0.0005930075845974328,
+      "loss": 4.9392,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.4486232399940491,
+      "learning_rate": 0.0005925700116686113,
+      "loss": 4.888,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.41395482420921326,
+      "learning_rate": 0.0005921324387397899,
+      "loss": 4.836,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.25472411559956376,
+      "eval_loss": 4.754175662994385,
+      "eval_runtime": 179.9073,
+      "eval_samples_per_second": 92.514,
+      "eval_steps_per_second": 5.786,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.4587019383907318,
+      "learning_rate": 0.0005916948658109684,
+      "loss": 4.7942,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.45875293016433716,
+      "learning_rate": 0.000591257292882147,
+      "loss": 4.7273,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.4781964123249054,
+      "learning_rate": 0.0005908197199533255,
+      "loss": 4.7014,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.492291122674942,
+      "learning_rate": 0.0005903821470245041,
+      "loss": 4.6737,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.42865535616874695,
+      "learning_rate": 0.0005899445740956826,
+      "loss": 4.6405,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.43935948610305786,
+      "learning_rate": 0.0005895070011668611,
+      "loss": 4.594,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.43477925658226013,
+      "learning_rate": 0.0005890694282380397,
+      "loss": 4.5799,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.43411099910736084,
+      "learning_rate": 0.0005886318553092181,
+      "loss": 4.5577,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.4020370841026306,
+      "learning_rate": 0.0005881942823803966,
+      "loss": 4.5398,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.4862031936645508,
+      "learning_rate": 0.0005877567094515752,
+      "loss": 4.5186,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.3783005177974701,
+      "learning_rate": 0.0005873191365227537,
+      "loss": 4.4805,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.4555555582046509,
+      "learning_rate": 0.0005868815635939323,
+      "loss": 4.4699,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.3953476846218109,
+      "learning_rate": 0.0005864439906651108,
+      "loss": 4.4484,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.44499197602272034,
+      "learning_rate": 0.0005860064177362894,
+      "loss": 4.4183,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.45466503500938416,
+      "learning_rate": 0.0005855688448074679,
+      "loss": 4.4102,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.43747514486312866,
+      "learning_rate": 0.0005851312718786464,
+      "loss": 4.3962,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.3796531558036804,
+      "learning_rate": 0.0005846936989498249,
+      "loss": 4.3864,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.4407300353050232,
+      "learning_rate": 0.0005842561260210034,
+      "loss": 4.3651,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.4483519196510315,
+      "learning_rate": 0.000583818553092182,
+      "loss": 4.3465,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.4081006348133087,
+      "learning_rate": 0.0005833809801633605,
+      "loss": 4.3423,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.29926300551528945,
+      "eval_loss": 4.2851409912109375,
+      "eval_runtime": 179.879,
+      "eval_samples_per_second": 92.529,
+      "eval_steps_per_second": 5.787,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.3948284685611725,
+      "learning_rate": 0.000582943407234539,
+      "loss": 4.3215,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.38788270950317383,
+      "learning_rate": 0.0005825058343057176,
+      "loss": 4.3212,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.37169507145881653,
+      "learning_rate": 0.0005820682613768961,
+      "loss": 4.3175,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.36698848009109497,
+      "learning_rate": 0.0005816306884480747,
+      "loss": 4.2876,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.42129504680633545,
+      "learning_rate": 0.0005811931155192532,
+      "loss": 4.2811,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.40725767612457275,
+      "learning_rate": 0.0005807555425904316,
+      "loss": 4.2676,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.3666594922542572,
+      "learning_rate": 0.0005803179696616102,
+      "loss": 4.259,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.411522775888443,
+      "learning_rate": 0.0005798803967327887,
+      "loss": 4.2528,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.343448668718338,
+      "learning_rate": 0.0005794428238039673,
+      "loss": 4.2396,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.3675486445426941,
+      "learning_rate": 0.0005790052508751458,
+      "loss": 4.2248,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.36124101281166077,
+      "learning_rate": 0.0005785676779463243,
+      "loss": 4.2171,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.4069299101829529,
+      "learning_rate": 0.0005781301050175029,
+      "loss": 4.2053,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.39434754848480225,
+      "learning_rate": 0.0005776925320886814,
+      "loss": 4.21,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3574545085430145,
+      "learning_rate": 0.00057725495915986,
+      "loss": 4.1949,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.36652427911758423,
+      "learning_rate": 0.0005768173862310384,
+      "loss": 4.1699,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.3449154496192932,
+      "learning_rate": 0.0005763798133022169,
+      "loss": 4.1699,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.3682800531387329,
+      "learning_rate": 0.0005759422403733955,
+      "loss": 4.1685,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.39873358607292175,
+      "learning_rate": 0.000575504667444574,
+      "loss": 4.1501,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.3736512362957001,
+      "learning_rate": 0.0005750670945157526,
+      "loss": 4.1473,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.38885951042175293,
+      "learning_rate": 0.0005746295215869311,
+      "loss": 4.1418,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.31548835966817024,
+      "eval_loss": 4.09907865524292,
+      "eval_runtime": 179.8464,
+      "eval_samples_per_second": 92.546,
+      "eval_steps_per_second": 5.788,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.34839940071105957,
+      "learning_rate": 0.0005741919486581096,
+      "loss": 4.1499,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.3890705704689026,
+      "learning_rate": 0.0005737543757292882,
+      "loss": 4.1416,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.3607308566570282,
+      "learning_rate": 0.0005733168028004667,
+      "loss": 4.1086,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.351632297039032,
+      "learning_rate": 0.0005728792298716453,
+      "loss": 4.1185,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.3545263707637787,
+      "learning_rate": 0.0005724416569428237,
+      "loss": 4.1113,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.37550088763237,
+      "learning_rate": 0.0005720040840140023,
+      "loss": 4.107,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.40243715047836304,
+      "learning_rate": 0.0005715665110851808,
+      "loss": 4.1036,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.3520545959472656,
+      "learning_rate": 0.0005711289381563593,
+      "loss": 4.0915,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.3803820013999939,
+      "learning_rate": 0.0005706913652275379,
+      "loss": 4.0746,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.3791220784187317,
+      "learning_rate": 0.0005702537922987164,
+      "loss": 4.0221,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.3519602417945862,
+      "learning_rate": 0.0005698162193698949,
+      "loss": 4.0213,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.36723291873931885,
+      "learning_rate": 0.0005693786464410735,
+      "loss": 4.0157,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3530062139034271,
+      "learning_rate": 0.000568941073512252,
+      "loss": 4.0094,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.3847055435180664,
+      "learning_rate": 0.0005685035005834305,
+      "loss": 4.0082,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.3274436593055725,
+      "learning_rate": 0.000568065927654609,
+      "loss": 4.0044,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.34672999382019043,
+      "learning_rate": 0.0005676283547257876,
+      "loss": 4.0247,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.33540818095207214,
+      "learning_rate": 0.0005671907817969661,
+      "loss": 3.9935,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.3458578288555145,
+      "learning_rate": 0.0005667532088681446,
+      "loss": 3.9953,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.3823903799057007,
+      "learning_rate": 0.0005663156359393232,
+      "loss": 4.0052,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.37612202763557434,
+      "learning_rate": 0.0005658780630105017,
+      "loss": 3.991,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.3252509910542918,
+      "eval_loss": 3.993682384490967,
+      "eval_runtime": 179.8018,
+      "eval_samples_per_second": 92.569,
+      "eval_steps_per_second": 5.79,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.36251288652420044,
+      "learning_rate": 0.0005654404900816802,
+      "loss": 3.986,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.3479040265083313,
+      "learning_rate": 0.0005650029171528588,
+      "loss": 3.9921,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.3300114870071411,
+      "learning_rate": 0.0005645653442240373,
+      "loss": 3.9802,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.33517342805862427,
+      "learning_rate": 0.0005641277712952158,
+      "loss": 3.9771,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.3363065719604492,
+      "learning_rate": 0.0005636901983663943,
+      "loss": 3.9818,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.39410993456840515,
+      "learning_rate": 0.0005632526254375729,
+      "loss": 3.9672,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.3455098271369934,
+      "learning_rate": 0.0005628150525087514,
+      "loss": 3.9731,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.3514918088912964,
+      "learning_rate": 0.0005623774795799299,
+      "loss": 3.9632,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.35402923822402954,
+      "learning_rate": 0.0005619399066511085,
+      "loss": 3.9597,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.331036239862442,
+      "learning_rate": 0.000561502333722287,
+      "loss": 3.9444,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.35744568705558777,
+      "learning_rate": 0.0005610647607934655,
+      "loss": 3.9531,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.3515322208404541,
+      "learning_rate": 0.000560627187864644,
+      "loss": 3.9612,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.32326042652130127,
+      "learning_rate": 0.0005601896149358226,
+      "loss": 3.9558,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.32586470246315,
+      "learning_rate": 0.0005597520420070011,
+      "loss": 3.9452,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.3541634976863861,
+      "learning_rate": 0.0005593144690781796,
+      "loss": 3.9452,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.33788588643074036,
+      "learning_rate": 0.0005588768961493582,
+      "loss": 3.9384,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.3441423773765564,
+      "learning_rate": 0.0005584393232205367,
+      "loss": 3.936,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.3493860960006714,
+      "learning_rate": 0.0005580017502917152,
+      "loss": 3.939,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.3450307846069336,
+      "learning_rate": 0.0005575641773628938,
+      "loss": 3.9262,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.3555508553981781,
+      "learning_rate": 0.0005571266044340723,
+      "loss": 3.9297,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.33168749420934585,
+      "eval_loss": 3.914679527282715,
+      "eval_runtime": 179.6257,
+      "eval_samples_per_second": 92.659,
+      "eval_steps_per_second": 5.795,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.31701815128326416,
+      "learning_rate": 0.0005566890315052507,
+      "loss": 3.9287,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.34660544991493225,
+      "learning_rate": 0.0005562514585764293,
+      "loss": 3.9227,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.31572017073631287,
+      "learning_rate": 0.0005558138856476079,
+      "loss": 3.9105,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.36169928312301636,
+      "learning_rate": 0.0005553763127187864,
+      "loss": 3.9195,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.3572075068950653,
+      "learning_rate": 0.0005549387397899649,
+      "loss": 3.9057,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.35884082317352295,
+      "learning_rate": 0.0005545011668611435,
+      "loss": 3.9182,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.33528369665145874,
+      "learning_rate": 0.000554063593932322,
+      "loss": 3.9182,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.31933894753456116,
+      "learning_rate": 0.0005536260210035005,
+      "loss": 3.9024,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.34619244933128357,
+      "learning_rate": 0.0005531884480746791,
+      "loss": 3.9082,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.34431859850883484,
+      "learning_rate": 0.0005527508751458577,
+      "loss": 3.9046,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.3247990608215332,
+      "learning_rate": 0.0005523133022170361,
+      "loss": 3.8984,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.33569249510765076,
+      "learning_rate": 0.0005518757292882146,
+      "loss": 3.8857,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.326007217168808,
+      "learning_rate": 0.0005514381563593932,
+      "loss": 3.8984,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.3553823232650757,
+      "learning_rate": 0.0005510005834305717,
+      "loss": 3.8857,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.31800103187561035,
+      "learning_rate": 0.0005505630105017502,
+      "loss": 3.8956,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.3153764009475708,
+      "learning_rate": 0.0005501254375729288,
+      "loss": 3.8895,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.3242632746696472,
+      "learning_rate": 0.0005496878646441073,
+      "loss": 3.888,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.33604300022125244,
+      "learning_rate": 0.0005492502917152858,
+      "loss": 3.886,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.35920587182044983,
+      "learning_rate": 0.0005488127187864644,
+      "loss": 3.8789,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.3230355381965637,
+      "learning_rate": 0.000548375145857643,
+      "loss": 3.8779,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.33706357279951615,
+      "eval_loss": 3.8543789386749268,
+      "eval_runtime": 179.7654,
+      "eval_samples_per_second": 92.587,
+      "eval_steps_per_second": 5.791,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.3344341814517975,
+      "learning_rate": 0.0005479375729288214,
+      "loss": 3.8676,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.3225545585155487,
+      "learning_rate": 0.0005474999999999999,
+      "loss": 3.8757,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.33318787813186646,
+      "learning_rate": 0.0005470624270711785,
+      "loss": 3.8667,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.3072070777416229,
+      "learning_rate": 0.000546624854142357,
+      "loss": 3.8627,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.32514503598213196,
+      "learning_rate": 0.0005461872812135355,
+      "loss": 3.8647,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.3363496661186218,
+      "learning_rate": 0.0005457497082847141,
+      "loss": 3.8712,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.31175696849823,
+      "learning_rate": 0.0005453121353558927,
+      "loss": 3.8674,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.33162710070610046,
+      "learning_rate": 0.0005448745624270712,
+      "loss": 3.8721,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.3097343146800995,
+      "learning_rate": 0.0005444369894982496,
+      "loss": 3.8637,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.33735862374305725,
+      "learning_rate": 0.0005439994165694282,
+      "loss": 3.8451,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.3092336058616638,
+      "learning_rate": 0.0005435618436406067,
+      "loss": 3.8552,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.32548123598098755,
+      "learning_rate": 0.0005431242707117852,
+      "loss": 3.8537,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.3191821575164795,
+      "learning_rate": 0.0005426866977829638,
+      "loss": 3.846,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.33417537808418274,
+      "learning_rate": 0.0005422491248541423,
+      "loss": 3.8518,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.32248765230178833,
+      "learning_rate": 0.0005418115519253208,
+      "loss": 3.8579,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.33786964416503906,
+      "learning_rate": 0.0005413739789964994,
+      "loss": 3.8316,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.3355385661125183,
+      "learning_rate": 0.000540936406067678,
+      "loss": 3.8404,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.32335567474365234,
+      "learning_rate": 0.0005404988331388564,
+      "loss": 3.7795,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.3314116895198822,
+      "learning_rate": 0.0005400612602100349,
+      "loss": 3.7512,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.3404753804206848,
+      "learning_rate": 0.0005396236872812135,
+      "loss": 3.7427,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34164365689980253,
+      "eval_loss": 3.8123531341552734,
+      "eval_runtime": 180.0803,
+      "eval_samples_per_second": 92.425,
+      "eval_steps_per_second": 5.781,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.35776785016059875,
+      "learning_rate": 0.000539186114352392,
+      "loss": 3.7381,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.3245725929737091,
+      "learning_rate": 0.0005387485414235705,
+      "loss": 3.7485,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.3353221118450165,
+      "learning_rate": 0.0005383109684947491,
+      "loss": 3.7375,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.33368387818336487,
+      "learning_rate": 0.0005378733955659276,
+      "loss": 3.7618,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.35302773118019104,
+      "learning_rate": 0.0005374358226371061,
+      "loss": 3.7519,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.3177225589752197,
+      "learning_rate": 0.0005369982497082847,
+      "loss": 3.7598,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.34392455220222473,
+      "learning_rate": 0.0005365606767794633,
+      "loss": 3.7631,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.3233015835285187,
+      "learning_rate": 0.0005361231038506417,
+      "loss": 3.7618,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.31578004360198975,
+      "learning_rate": 0.0005356855309218202,
+      "loss": 3.7679,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.3494773209095001,
+      "learning_rate": 0.0005352479579929988,
+      "loss": 3.7493,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.3228057026863098,
+      "learning_rate": 0.0005348103850641773,
+      "loss": 3.7475,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.3268294632434845,
+      "learning_rate": 0.0005343728121353558,
+      "loss": 3.7601,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.32106488943099976,
+      "learning_rate": 0.0005339352392065344,
+      "loss": 3.7661,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.3412560224533081,
+      "learning_rate": 0.000533497666277713,
+      "loss": 3.7514,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.33492568135261536,
+      "learning_rate": 0.0005330600933488915,
+      "loss": 3.7667,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.34016239643096924,
+      "learning_rate": 0.00053262252042007,
+      "loss": 3.7499,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.327343225479126,
+      "learning_rate": 0.0005321849474912485,
+      "loss": 3.7587,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.3204312026500702,
+      "learning_rate": 0.000531747374562427,
+      "loss": 3.7561,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.3505200445652008,
+      "learning_rate": 0.0005313098016336055,
+      "loss": 3.7534,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.31797534227371216,
+      "learning_rate": 0.0005308722287047841,
+      "loss": 3.7492,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.3447044144419973,
+      "eval_loss": 3.781189441680908,
+      "eval_runtime": 180.3098,
+      "eval_samples_per_second": 92.308,
+      "eval_steps_per_second": 5.773,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.3364429771900177,
+      "learning_rate": 0.0005304346557759626,
+      "loss": 3.7339,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.31472164392471313,
+      "learning_rate": 0.0005299970828471411,
+      "loss": 3.73,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.3065818250179291,
+      "learning_rate": 0.0005295595099183197,
+      "loss": 3.7486,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.3349589705467224,
+      "learning_rate": 0.0005291219369894983,
+      "loss": 3.7328,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.33365535736083984,
+      "learning_rate": 0.0005286843640606768,
+      "loss": 3.7319,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.34016332030296326,
+      "learning_rate": 0.0005282467911318552,
+      "loss": 3.7389,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.32065296173095703,
+      "learning_rate": 0.0005278092182030338,
+      "loss": 3.7554,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.3385309875011444,
+      "learning_rate": 0.0005273716452742123,
+      "loss": 3.7452,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.3271431028842926,
+      "learning_rate": 0.0005269340723453908,
+      "loss": 3.7506,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.3121528625488281,
+      "learning_rate": 0.0005264964994165694,
+      "loss": 3.7521,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.3151465654373169,
+      "learning_rate": 0.000526058926487748,
+      "loss": 3.7439,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.3443325161933899,
+      "learning_rate": 0.0005256213535589265,
+      "loss": 3.7441,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.32063865661621094,
+      "learning_rate": 0.000525183780630105,
+      "loss": 3.7501,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.3402131497859955,
+      "learning_rate": 0.0005247462077012836,
+      "loss": 3.7476,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.322399377822876,
+      "learning_rate": 0.000524308634772462,
+      "loss": 3.7502,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.33992311358451843,
+      "learning_rate": 0.0005238710618436405,
+      "loss": 3.7434,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.31691116094589233,
+      "learning_rate": 0.0005234334889148191,
+      "loss": 3.7352,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.3292892277240753,
+      "learning_rate": 0.0005229959159859976,
+      "loss": 3.7338,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.31549301743507385,
+      "learning_rate": 0.0005225583430571761,
+      "loss": 3.7453,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3125064969062805,
+      "learning_rate": 0.0005221207701283547,
+      "loss": 3.7372,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.34737834452898997,
+      "eval_loss": 3.7514328956604004,
+      "eval_runtime": 179.7257,
+      "eval_samples_per_second": 92.608,
+      "eval_steps_per_second": 5.792,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.3117610514163971,
+      "learning_rate": 0.0005216831971995333,
+      "loss": 3.7072,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.3186918795108795,
+      "learning_rate": 0.0005212456242707118,
+      "loss": 3.7313,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.33037465810775757,
+      "learning_rate": 0.0005208080513418903,
+      "loss": 3.7283,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.32975658774375916,
+      "learning_rate": 0.0005203704784130689,
+      "loss": 3.7291,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.3381806015968323,
+      "learning_rate": 0.0005199329054842473,
+      "loss": 3.7192,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.3226553797721863,
+      "learning_rate": 0.0005194953325554258,
+      "loss": 3.7279,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.320027232170105,
+      "learning_rate": 0.0005190577596266044,
+      "loss": 3.7251,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.3155761957168579,
+      "learning_rate": 0.0005186201866977829,
+      "loss": 3.7159,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.3248502314090729,
+      "learning_rate": 0.0005181826137689614,
+      "loss": 3.7201,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.3296821117401123,
+      "learning_rate": 0.00051774504084014,
+      "loss": 3.7267,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.3494844138622284,
+      "learning_rate": 0.0005173074679113186,
+      "loss": 3.725,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.3254939913749695,
+      "learning_rate": 0.0005168698949824971,
+      "loss": 3.7221,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.3118003308773041,
+      "learning_rate": 0.0005164323220536755,
+      "loss": 3.719,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.3459826707839966,
+      "learning_rate": 0.0005159947491248541,
+      "loss": 3.7268,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.31478872895240784,
+      "learning_rate": 0.0005155571761960326,
+      "loss": 3.7241,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.3117355704307556,
+      "learning_rate": 0.0005151196032672111,
+      "loss": 3.7101,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.3150256872177124,
+      "learning_rate": 0.0005146820303383897,
+      "loss": 3.7186,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.3244450092315674,
+      "learning_rate": 0.0005142444574095682,
+      "loss": 3.7062,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.3051128387451172,
+      "learning_rate": 0.0005138068844807468,
+      "loss": 3.7218,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.35902562737464905,
+      "learning_rate": 0.0005133693115519253,
+      "loss": 3.7056,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.3498066568184394,
+      "eval_loss": 3.726057767868042,
+      "eval_runtime": 180.1415,
+      "eval_samples_per_second": 92.394,
+      "eval_steps_per_second": 5.779,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.31749260425567627,
+      "learning_rate": 0.0005129317386231039,
+      "loss": 3.7179,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.3012363612651825,
+      "learning_rate": 0.0005124941656942824,
+      "loss": 3.712,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.3233492076396942,
+      "learning_rate": 0.0005120565927654608,
+      "loss": 3.7127,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.3380107283592224,
+      "learning_rate": 0.0005116190198366394,
+      "loss": 3.7189,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.3121177852153778,
+      "learning_rate": 0.0005111814469078179,
+      "loss": 3.7162,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.3301170766353607,
+      "learning_rate": 0.0005107438739789964,
+      "loss": 3.7098,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.3270030617713928,
+      "learning_rate": 0.000510306301050175,
+      "loss": 3.6062,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.3287683129310608,
+      "learning_rate": 0.0005098687281213535,
+      "loss": 3.597,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.3282028138637543,
+      "learning_rate": 0.0005094311551925321,
+      "loss": 3.6045,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.34291520714759827,
+      "learning_rate": 0.0005089935822637106,
+      "loss": 3.6151,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.31705862283706665,
+      "learning_rate": 0.0005085560093348892,
+      "loss": 3.6231,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.3140444755554199,
+      "learning_rate": 0.0005081184364060676,
+      "loss": 3.6261,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.3536335527896881,
+      "learning_rate": 0.0005076808634772461,
+      "loss": 3.6241,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.33482107520103455,
+      "learning_rate": 0.0005072432905484247,
+      "loss": 3.6337,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.32872864603996277,
+      "learning_rate": 0.0005068057176196032,
+      "loss": 3.6181,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.326739639043808,
+      "learning_rate": 0.0005063681446907818,
+      "loss": 3.6269,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.31593042612075806,
+      "learning_rate": 0.0005059305717619603,
+      "loss": 3.6352,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.3400070369243622,
+      "learning_rate": 0.0005054929988331388,
+      "loss": 3.6435,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.33030685782432556,
+      "learning_rate": 0.0005050554259043174,
+      "loss": 3.6292,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.3333072066307068,
+      "learning_rate": 0.0005046178529754959,
+      "loss": 3.6285,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.35201263150369827,
+      "eval_loss": 3.7096149921417236,
+      "eval_runtime": 180.2303,
+      "eval_samples_per_second": 92.348,
+      "eval_steps_per_second": 5.776,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.32576659321784973,
+      "learning_rate": 0.0005041802800466744,
+      "loss": 3.6225,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.3293328881263733,
+      "learning_rate": 0.0005037427071178529,
+      "loss": 3.6389,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.30969351530075073,
+      "learning_rate": 0.0005033051341890314,
+      "loss": 3.6297,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.3289102613925934,
+      "learning_rate": 0.00050286756126021,
+      "loss": 3.6321,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.340069979429245,
+      "learning_rate": 0.0005024299883313885,
+      "loss": 3.6328,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.3201046586036682,
+      "learning_rate": 0.0005019924154025671,
+      "loss": 3.6342,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.356716513633728,
+      "learning_rate": 0.0005015548424737456,
+      "loss": 3.6261,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.3404761254787445,
+      "learning_rate": 0.0005011172695449241,
+      "loss": 3.6347,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.3204871714115143,
+      "learning_rate": 0.0005006796966161027,
+      "loss": 3.6267,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.33498939871788025,
+      "learning_rate": 0.0005002421236872811,
+      "loss": 3.6321,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.33658871054649353,
+      "learning_rate": 0.0004998045507584597,
+      "loss": 3.6384,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.3517683148384094,
+      "learning_rate": 0.0004993669778296382,
+      "loss": 3.6373,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.33312374353408813,
+      "learning_rate": 0.0004989294049008167,
+      "loss": 3.6344,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.330685555934906,
+      "learning_rate": 0.0004984918319719953,
+      "loss": 3.633,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.3397407829761505,
+      "learning_rate": 0.0004980542590431738,
+      "loss": 3.6344,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.3209087550640106,
+      "learning_rate": 0.0004976166861143524,
+      "loss": 3.612,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.3204724192619324,
+      "learning_rate": 0.0004971791131855309,
+      "loss": 3.6334,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.33590126037597656,
+      "learning_rate": 0.0004967415402567094,
+      "loss": 3.6389,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.34319397807121277,
+      "learning_rate": 0.000496303967327888,
+      "loss": 3.6334,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.34008243680000305,
+      "learning_rate": 0.0004958663943990664,
+      "loss": 3.638,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.35395158942580696,
+      "eval_loss": 3.6934237480163574,
+      "eval_runtime": 179.6583,
+      "eval_samples_per_second": 92.643,
+      "eval_steps_per_second": 5.794,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.3379422724246979,
+      "learning_rate": 0.000495428821470245,
+      "loss": 3.6293,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.31869712471961975,
+      "learning_rate": 0.0004949912485414235,
+      "loss": 3.6372,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.3277227282524109,
+      "learning_rate": 0.0004945536756126021,
+      "loss": 3.6288,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.3422459661960602,
+      "learning_rate": 0.0004941161026837806,
+      "loss": 3.6445,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.3337474465370178,
+      "learning_rate": 0.0004936785297549591,
+      "loss": 3.64,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.3305768370628357,
+      "learning_rate": 0.0004932409568261377,
+      "loss": 3.6281,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.332967609167099,
+      "learning_rate": 0.0004928033838973162,
+      "loss": 3.6366,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.32114502787590027,
+      "learning_rate": 0.0004923658109684946,
+      "loss": 3.6434,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.32796093821525574,
+      "learning_rate": 0.0004919282380396732,
+      "loss": 3.6469,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.3242839574813843,
+      "learning_rate": 0.0004914906651108517,
+      "loss": 3.6295,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.35202574729919434,
+      "learning_rate": 0.0004910530921820303,
+      "loss": 3.6405,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.32159286737442017,
+      "learning_rate": 0.0004906155192532088,
+      "loss": 3.6284,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.3278411030769348,
+      "learning_rate": 0.0004901779463243874,
+      "loss": 3.6271,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.35316187143325806,
+      "learning_rate": 0.0004897403733955659,
+      "loss": 3.6254,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.31677183508872986,
+      "learning_rate": 0.0004893028004667444,
+      "loss": 3.6352,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.32869166135787964,
+      "learning_rate": 0.000488865227537923,
+      "loss": 3.6263,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.323147714138031,
+      "learning_rate": 0.0004884276546091015,
+      "loss": 3.6161,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.33107396960258484,
+      "learning_rate": 0.00048799008168028,
+      "loss": 3.6205,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.3485564589500427,
+      "learning_rate": 0.00048755250875145853,
+      "loss": 3.6427,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.31731998920440674,
+      "learning_rate": 0.0004871149358226371,
+      "loss": 3.616,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.35548902280095057,
+      "eval_loss": 3.6741294860839844,
+      "eval_runtime": 179.6397,
+      "eval_samples_per_second": 92.652,
+      "eval_steps_per_second": 5.795,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.3320983052253723,
+      "learning_rate": 0.0004866773628938156,
+      "loss": 3.6431,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.34173107147216797,
+      "learning_rate": 0.0004862397899649941,
+      "loss": 3.6304,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.3142543435096741,
+      "learning_rate": 0.00048580221703617264,
+      "loss": 3.644,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.3302863836288452,
+      "learning_rate": 0.00048536464410735123,
+      "loss": 3.6304,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.3198452889919281,
+      "learning_rate": 0.00048492707117852966,
+      "loss": 3.6437,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.3371002674102783,
+      "learning_rate": 0.00048448949824970826,
+      "loss": 3.6221,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.330727756023407,
+      "learning_rate": 0.0004840519253208868,
+      "loss": 3.629,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.33198925852775574,
+      "learning_rate": 0.0004836143523920653,
+      "loss": 3.6337,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.3273507356643677,
+      "learning_rate": 0.0004831767794632438,
+      "loss": 3.6255,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.31935301423072815,
+      "learning_rate": 0.00048273920653442236,
+      "loss": 3.6307,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.3273596167564392,
+      "learning_rate": 0.0004823016336056009,
+      "loss": 3.6276,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.3235807716846466,
+      "learning_rate": 0.0004818640606767794,
+      "loss": 3.6299,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.3113962709903717,
+      "learning_rate": 0.00048142648774795793,
+      "loss": 3.6323,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.3231075704097748,
+      "learning_rate": 0.0004809889148191365,
+      "loss": 3.6357,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.35100987553596497,
+      "learning_rate": 0.000480551341890315,
+      "loss": 3.5912,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.37618836760520935,
+      "learning_rate": 0.00048011376896149355,
+      "loss": 3.5127,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.35220426321029663,
+      "learning_rate": 0.0004796761960326721,
+      "loss": 3.534,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.3237028419971466,
+      "learning_rate": 0.0004792386231038506,
+      "loss": 3.5277,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.32475364208221436,
+      "learning_rate": 0.0004788010501750291,
+      "loss": 3.526,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.3234672546386719,
+      "learning_rate": 0.00047836347724620766,
+      "loss": 3.5277,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.3567962409307186,
+      "eval_loss": 3.6671650409698486,
+      "eval_runtime": 179.7799,
+      "eval_samples_per_second": 92.58,
+      "eval_steps_per_second": 5.79,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.31527358293533325,
+      "learning_rate": 0.0004779259043173862,
+      "loss": 3.5321,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.32927918434143066,
+      "learning_rate": 0.0004774883313885647,
+      "loss": 3.5355,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.33241936564445496,
+      "learning_rate": 0.0004770507584597433,
+      "loss": 3.5547,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.32307010889053345,
+      "learning_rate": 0.0004766131855309218,
+      "loss": 3.5384,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.31569403409957886,
+      "learning_rate": 0.0004761756126021003,
+      "loss": 3.5468,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.3237732946872711,
+      "learning_rate": 0.00047573803967327884,
+      "loss": 3.5403,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.33519721031188965,
+      "learning_rate": 0.0004753004667444574,
+      "loss": 3.5417,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.3299810588359833,
+      "learning_rate": 0.00047486289381563587,
+      "loss": 3.5518,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.3094955384731293,
+      "learning_rate": 0.0004744253208868144,
+      "loss": 3.5611,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.3562442660331726,
+      "learning_rate": 0.00047398774795799295,
+      "loss": 3.5484,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.31193310022354126,
+      "learning_rate": 0.00047355017502917154,
+      "loss": 3.5554,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.32945749163627625,
+      "learning_rate": 0.00047311260210035,
+      "loss": 3.5588,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.30832111835479736,
+      "learning_rate": 0.00047267502917152857,
+      "loss": 3.5549,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.31878674030303955,
+      "learning_rate": 0.0004722374562427071,
+      "loss": 3.5508,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": 0.3359583914279938,
+      "learning_rate": 0.0004717998833138856,
+      "loss": 3.5533,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.3579261302947998,
+      "learning_rate": 0.00047136231038506413,
+      "loss": 3.5548,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.3416401147842407,
+      "learning_rate": 0.00047092473745624267,
+      "loss": 3.5548,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.31429240107536316,
+      "learning_rate": 0.00047048716452742116,
+      "loss": 3.5629,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.3537607192993164,
+      "learning_rate": 0.0004700495915985997,
+      "loss": 3.5561,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.32719892263412476,
+      "learning_rate": 0.0004696120186697783,
+      "loss": 3.5578,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.35763656184935977,
+      "eval_loss": 3.656822681427002,
+      "eval_runtime": 179.9827,
+      "eval_samples_per_second": 92.476,
+      "eval_steps_per_second": 5.784,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.33471086621284485,
+      "learning_rate": 0.00046917444574095683,
+      "loss": 3.557,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.33362334966659546,
+      "learning_rate": 0.0004687368728121353,
+      "loss": 3.5585,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.32362911105155945,
+      "learning_rate": 0.00046829929988331386,
+      "loss": 3.546,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.32808783650398254,
+      "learning_rate": 0.0004678617269544924,
+      "loss": 3.5373,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.33728596568107605,
+      "learning_rate": 0.0004674241540256709,
+      "loss": 3.5617,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.3254978358745575,
+      "learning_rate": 0.0004669865810968494,
+      "loss": 3.5573,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": 0.35697412490844727,
+      "learning_rate": 0.00046654900816802796,
+      "loss": 3.557,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.36106449365615845,
+      "learning_rate": 0.00046611143523920645,
+      "loss": 3.5659,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.3264298141002655,
+      "learning_rate": 0.00046567386231038504,
+      "loss": 3.5652,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.3491400480270386,
+      "learning_rate": 0.0004652362893815636,
+      "loss": 3.5647,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.36997708678245544,
+      "learning_rate": 0.0004647987164527421,
+      "loss": 3.5688,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.3246486783027649,
+      "learning_rate": 0.0004643611435239206,
+      "loss": 3.5807,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.3414935767650604,
+      "learning_rate": 0.00046392357059509915,
+      "loss": 3.5806,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.33668363094329834,
+      "learning_rate": 0.0004634859976662777,
+      "loss": 3.5698,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.3316305875778198,
+      "learning_rate": 0.0004630484247374562,
+      "loss": 3.5598,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.34208086133003235,
+      "learning_rate": 0.0004626108518086347,
+      "loss": 3.5574,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3184557259082794,
+      "learning_rate": 0.0004621732788798133,
+      "loss": 3.5573,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.33165499567985535,
+      "learning_rate": 0.00046173570595099174,
+      "loss": 3.5583,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.3253953456878662,
+      "learning_rate": 0.00046129813302217033,
+      "loss": 3.5671,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.33802923560142517,
+      "learning_rate": 0.00046086056009334887,
+      "loss": 3.5631,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.3593661156080293,
+      "eval_loss": 3.642476797103882,
+      "eval_runtime": 179.5652,
+      "eval_samples_per_second": 92.691,
+      "eval_steps_per_second": 5.797,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.3159468472003937,
+      "learning_rate": 0.0004604229871645274,
+      "loss": 3.5539,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.31020215153694153,
+      "learning_rate": 0.0004599854142357059,
+      "loss": 3.5584,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.32496050000190735,
+      "learning_rate": 0.00045954784130688444,
+      "loss": 3.5691,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.3142569363117218,
+      "learning_rate": 0.000459110268378063,
+      "loss": 3.5447,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.32899901270866394,
+      "learning_rate": 0.00045867269544924146,
+      "loss": 3.5739,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.3278380334377289,
+      "learning_rate": 0.00045823512252042,
+      "loss": 3.5614,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.3125869631767273,
+      "learning_rate": 0.0004577975495915986,
+      "loss": 3.5607,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.34363460540771484,
+      "learning_rate": 0.0004573599766627771,
+      "loss": 3.5555,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.4942171275615692,
+      "learning_rate": 0.0004569224037339556,
+      "loss": 3.5587,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.3390342891216278,
+      "learning_rate": 0.00045648483080513416,
+      "loss": 3.5731,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.3250505328178406,
+      "learning_rate": 0.0004560472578763127,
+      "loss": 3.5623,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.3381325602531433,
+      "learning_rate": 0.0004556096849474912,
+      "loss": 3.5687,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.3354376256465912,
+      "learning_rate": 0.00045517211201866973,
+      "loss": 3.5694,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.3529720902442932,
+      "learning_rate": 0.00045473453908984827,
+      "loss": 3.5484,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.32668349146842957,
+      "learning_rate": 0.00045429696616102675,
+      "loss": 3.5741,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.31390804052352905,
+      "learning_rate": 0.00045385939323220535,
+      "loss": 3.5553,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.3186156153678894,
+      "learning_rate": 0.0004534218203033839,
+      "loss": 3.5625,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.32360565662384033,
+      "learning_rate": 0.0004529842473745624,
+      "loss": 3.5605,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.3357037305831909,
+      "learning_rate": 0.0004525466744457409,
+      "loss": 3.5627,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.3252997100353241,
+      "learning_rate": 0.00045210910151691945,
+      "loss": 3.5746,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.3607067255302828,
+      "eval_loss": 3.6273179054260254,
+      "eval_runtime": 179.7214,
+      "eval_samples_per_second": 92.61,
+      "eval_steps_per_second": 5.792,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.33965301513671875,
+      "learning_rate": 0.000451671528588098,
+      "loss": 3.564,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.31816357374191284,
+      "learning_rate": 0.0004512339556592765,
+      "loss": 3.5581,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.3402574360370636,
+      "learning_rate": 0.000450796382730455,
+      "loss": 3.5655,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.33764371275901794,
+      "learning_rate": 0.0004503588098016336,
+      "loss": 3.4851,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.3261650502681732,
+      "learning_rate": 0.0004499212368728121,
+      "loss": 3.4587,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.33577781915664673,
+      "learning_rate": 0.00044948366394399064,
+      "loss": 3.4494,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.3431420922279358,
+      "learning_rate": 0.0004490460910151692,
+      "loss": 3.4595,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.3210841119289398,
+      "learning_rate": 0.00044860851808634767,
+      "loss": 3.4537,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.33073899149894714,
+      "learning_rate": 0.0004481709451575262,
+      "loss": 3.4588,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.32355406880378723,
+      "learning_rate": 0.00044773337222870475,
+      "loss": 3.4677,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.3393101990222931,
+      "learning_rate": 0.0004472957992998833,
+      "loss": 3.4768,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.3306352198123932,
+      "learning_rate": 0.00044685822637106177,
+      "loss": 3.4789,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.32835835218429565,
+      "learning_rate": 0.00044642065344224037,
+      "loss": 3.4766,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.3455768823623657,
+      "learning_rate": 0.0004459830805134189,
+      "loss": 3.4781,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.33673617243766785,
+      "learning_rate": 0.0004455455075845974,
+      "loss": 3.4899,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.3145007789134979,
+      "learning_rate": 0.00044510793465577593,
+      "loss": 3.4957,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.3510107696056366,
+      "learning_rate": 0.00044467036172695447,
+      "loss": 3.4766,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.3493402302265167,
+      "learning_rate": 0.00044423278879813296,
+      "loss": 3.4842,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.33777111768722534,
+      "learning_rate": 0.0004437952158693115,
+      "loss": 3.4779,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.3274925947189331,
+      "learning_rate": 0.00044335764294049004,
+      "loss": 3.4835,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.3611752688156872,
+      "eval_loss": 3.630121946334839,
+      "eval_runtime": 179.8645,
+      "eval_samples_per_second": 92.536,
+      "eval_steps_per_second": 5.788,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.3470570743083954,
+      "learning_rate": 0.00044292007001166863,
+      "loss": 3.4807,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.3509993255138397,
+      "learning_rate": 0.00044248249708284706,
+      "loss": 3.492,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.3337821662425995,
+      "learning_rate": 0.00044204492415402566,
+      "loss": 3.4895,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.3395850956439972,
+      "learning_rate": 0.0004416073512252042,
+      "loss": 3.488,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.32552629709243774,
+      "learning_rate": 0.0004411697782963827,
+      "loss": 3.4976,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.34504395723342896,
+      "learning_rate": 0.0004407322053675612,
+      "loss": 3.4972,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.3230566084384918,
+      "learning_rate": 0.00044029463243873976,
+      "loss": 3.4979,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.337710440158844,
+      "learning_rate": 0.00043985705950991825,
+      "loss": 3.5005,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.3194814920425415,
+      "learning_rate": 0.0004394194865810968,
+      "loss": 3.5184,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.3493628203868866,
+      "learning_rate": 0.00043898191365227533,
+      "loss": 3.5122,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.3941914737224579,
+      "learning_rate": 0.0004385443407234539,
+      "loss": 3.5004,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.35434696078300476,
+      "learning_rate": 0.0004381067677946324,
+      "loss": 3.5012,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.34531646966934204,
+      "learning_rate": 0.00043766919486581095,
+      "loss": 3.493,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.33792534470558167,
+      "learning_rate": 0.0004372316219369895,
+      "loss": 3.516,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.3471498191356659,
+      "learning_rate": 0.00043679404900816797,
+      "loss": 3.4946,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.32506677508354187,
+      "learning_rate": 0.0004363564760793465,
+      "loss": 3.5004,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.3650604486465454,
+      "learning_rate": 0.00043591890315052505,
+      "loss": 3.4976,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.3289859890937805,
+      "learning_rate": 0.00043548133022170354,
+      "loss": 3.5157,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.3407946527004242,
+      "learning_rate": 0.0004350437572928821,
+      "loss": 3.5102,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.3405243158340454,
+      "learning_rate": 0.00043460618436406067,
+      "loss": 3.5199,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.36196820631048443,
+      "eval_loss": 3.6200644969940186,
+      "eval_runtime": 179.7535,
+      "eval_samples_per_second": 92.593,
+      "eval_steps_per_second": 5.791,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.3322257995605469,
+      "learning_rate": 0.0004341686114352392,
+      "loss": 3.5128,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.3334507346153259,
+      "learning_rate": 0.0004337310385064177,
+      "loss": 3.5019,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.3119942247867584,
+      "learning_rate": 0.00043329346557759624,
+      "loss": 3.5248,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.32865530252456665,
+      "learning_rate": 0.0004328558926487748,
+      "loss": 3.5017,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.32390671968460083,
+      "learning_rate": 0.00043241831971995326,
+      "loss": 3.5087,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.3391891419887543,
+      "learning_rate": 0.0004319807467911318,
+      "loss": 3.5117,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.3408890664577484,
+      "learning_rate": 0.00043154317386231034,
+      "loss": 3.5164,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.33605512976646423,
+      "learning_rate": 0.00043110560093348883,
+      "loss": 3.5009,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.35284584760665894,
+      "learning_rate": 0.0004306680280046674,
+      "loss": 3.5017,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.35283035039901733,
+      "learning_rate": 0.00043023045507584596,
+      "loss": 3.5166,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.33758777379989624,
+      "learning_rate": 0.0004297928821470245,
+      "loss": 3.5104,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.3389108180999756,
+      "learning_rate": 0.000429355309218203,
+      "loss": 3.5154,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.43688979744911194,
+      "learning_rate": 0.00042891773628938153,
+      "loss": 3.5108,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.33347204327583313,
+      "learning_rate": 0.00042848016336056007,
+      "loss": 3.511,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": 0.30911001563072205,
+      "learning_rate": 0.00042804259043173855,
+      "loss": 3.512,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.32809844613075256,
+      "learning_rate": 0.0004276050175029171,
+      "loss": 3.498,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3727506697177887,
+      "learning_rate": 0.0004271674445740957,
+      "loss": 3.5211,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.33282268047332764,
+      "learning_rate": 0.0004267298716452741,
+      "loss": 3.5089,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.3280541002750397,
+      "learning_rate": 0.0004262922987164527,
+      "loss": 3.521,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.3236095905303955,
+      "learning_rate": 0.00042585472578763125,
+      "loss": 3.5176,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.36285532276929894,
+      "eval_loss": 3.609638214111328,
+      "eval_runtime": 179.7926,
+      "eval_samples_per_second": 92.573,
+      "eval_steps_per_second": 5.79,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.3244244456291199,
+      "learning_rate": 0.0004254171528588098,
+      "loss": 3.5114,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.33964627981185913,
+      "learning_rate": 0.0004249795799299883,
+      "loss": 3.5003,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.33158302307128906,
+      "learning_rate": 0.0004245420070011668,
+      "loss": 3.5042,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.3141254186630249,
+      "learning_rate": 0.00042410443407234536,
+      "loss": 3.5203,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.33124253153800964,
+      "learning_rate": 0.00042366686114352385,
+      "loss": 3.5171,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.31624454259872437,
+      "learning_rate": 0.0004232292882147024,
+      "loss": 3.5317,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.3152634799480438,
+      "learning_rate": 0.000422791715285881,
+      "loss": 3.5065,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.3478392958641052,
+      "learning_rate": 0.00042235414235705947,
+      "loss": 3.5132,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.33496901392936707,
+      "learning_rate": 0.000421916569428238,
+      "loss": 3.5178,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.3329765796661377,
+      "learning_rate": 0.00042147899649941654,
+      "loss": 3.5147,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.3469819724559784,
+      "learning_rate": 0.0004210414235705951,
+      "loss": 3.5163,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.3333797752857208,
+      "learning_rate": 0.00042060385064177357,
+      "loss": 3.5088,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.3532175123691559,
+      "learning_rate": 0.0004201662777129521,
+      "loss": 3.3952,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.35990819334983826,
+      "learning_rate": 0.00041972870478413065,
+      "loss": 3.3972,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.3456185758113861,
+      "learning_rate": 0.00041929113185530914,
+      "loss": 3.4024,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.34514689445495605,
+      "learning_rate": 0.00041885355892648773,
+      "loss": 3.4088,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.32557061314582825,
+      "learning_rate": 0.00041841598599766627,
+      "loss": 3.4169,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.3310924172401428,
+      "learning_rate": 0.00041797841306884476,
+      "loss": 3.4211,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.34286898374557495,
+      "learning_rate": 0.0004175408401400233,
+      "loss": 3.4188,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.33326268196105957,
+      "learning_rate": 0.00041710326721120184,
+      "loss": 3.4233,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.3633101095768131,
+      "eval_loss": 3.612283945083618,
+      "eval_runtime": 179.8679,
+      "eval_samples_per_second": 92.535,
+      "eval_steps_per_second": 5.788,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.3543689548969269,
+      "learning_rate": 0.0004166656942823804,
+      "loss": 3.4277,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.3318440616130829,
+      "learning_rate": 0.00041622812135355886,
+      "loss": 3.4383,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.3441929817199707,
+      "learning_rate": 0.0004157905484247374,
+      "loss": 3.4261,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.35300642251968384,
+      "learning_rate": 0.000415352975495916,
+      "loss": 3.4354,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.3246099054813385,
+      "learning_rate": 0.0004149154025670945,
+      "loss": 3.4363,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.34103959798812866,
+      "learning_rate": 0.000414477829638273,
+      "loss": 3.4373,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.3544823229312897,
+      "learning_rate": 0.00041404025670945156,
+      "loss": 3.4519,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.34362295269966125,
+      "learning_rate": 0.00041360268378063005,
+      "loss": 3.4422,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.32189178466796875,
+      "learning_rate": 0.0004131651108518086,
+      "loss": 3.4465,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.336997389793396,
+      "learning_rate": 0.0004127275379229871,
+      "loss": 3.4324,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.3384324610233307,
+      "learning_rate": 0.00041228996499416567,
+      "loss": 3.4471,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.3506065607070923,
+      "learning_rate": 0.00041185239206534415,
+      "loss": 3.4416,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.3377762734889984,
+      "learning_rate": 0.00041141481913652275,
+      "loss": 3.4461,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.35600459575653076,
+      "learning_rate": 0.0004109772462077013,
+      "loss": 3.4513,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.3653104305267334,
+      "learning_rate": 0.00041053967327887977,
+      "loss": 3.4612,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.3408794403076172,
+      "learning_rate": 0.0004101021003500583,
+      "loss": 3.4538,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.32172125577926636,
+      "learning_rate": 0.00040966452742123685,
+      "loss": 3.4484,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.35422587394714355,
+      "learning_rate": 0.00040922695449241534,
+      "loss": 3.4553,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.36388930678367615,
+      "learning_rate": 0.0004087893815635939,
+      "loss": 3.4626,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.3398551940917969,
+      "learning_rate": 0.0004083518086347724,
+      "loss": 3.4484,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.36410598648996295,
+      "eval_loss": 3.6026387214660645,
+      "eval_runtime": 179.6634,
+      "eval_samples_per_second": 92.64,
+      "eval_steps_per_second": 5.794,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.3316047489643097,
+      "learning_rate": 0.000407914235705951,
+      "loss": 3.4635,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.3395221531391144,
+      "learning_rate": 0.00040747666277712944,
+      "loss": 3.4541,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.33023080229759216,
+      "learning_rate": 0.00040703908984830804,
+      "loss": 3.455,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.3635895848274231,
+      "learning_rate": 0.0004066015169194866,
+      "loss": 3.454,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.33634689450263977,
+      "learning_rate": 0.00040616394399066506,
+      "loss": 3.4543,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.3248524069786072,
+      "learning_rate": 0.0004057263710618436,
+      "loss": 3.46,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.35410505533218384,
+      "learning_rate": 0.00040528879813302214,
+      "loss": 3.4507,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.3269991874694824,
+      "learning_rate": 0.00040485122520420063,
+      "loss": 3.4695,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.35111793875694275,
+      "learning_rate": 0.00040441365227537917,
+      "loss": 3.4761,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.33786168694496155,
+      "learning_rate": 0.0004039760793465577,
+      "loss": 3.4625,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.34099647402763367,
+      "learning_rate": 0.0004035385064177363,
+      "loss": 3.4667,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.33352482318878174,
+      "learning_rate": 0.0004031009334889148,
+      "loss": 3.4585,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.3284354507923126,
+      "learning_rate": 0.00040266336056009333,
+      "loss": 3.4651,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.3649536073207855,
+      "learning_rate": 0.00040222578763127187,
+      "loss": 3.4706,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.32932668924331665,
+      "learning_rate": 0.00040178821470245035,
+      "loss": 3.4653,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.3425554037094116,
+      "learning_rate": 0.0004013506417736289,
+      "loss": 3.455,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.36532074213027954,
+      "learning_rate": 0.00040091306884480743,
+      "loss": 3.468,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.33179134130477905,
+      "learning_rate": 0.0004004754959159859,
+      "loss": 3.4657,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.3254914879798889,
+      "learning_rate": 0.00040003792298716446,
+      "loss": 3.4557,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.3517943322658539,
+      "learning_rate": 0.00039960035005834305,
+      "loss": 3.4673,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.36481873665210124,
+      "eval_loss": 3.593592405319214,
+      "eval_runtime": 179.8124,
+      "eval_samples_per_second": 92.563,
+      "eval_steps_per_second": 5.789,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.35354673862457275,
+      "learning_rate": 0.0003991627771295216,
+      "loss": 3.4631,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.3463275730609894,
+      "learning_rate": 0.0003987252042007001,
+      "loss": 3.4739,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.33963003754615784,
+      "learning_rate": 0.0003982876312718786,
+      "loss": 3.4689,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.3357282280921936,
+      "learning_rate": 0.00039785005834305716,
+      "loss": 3.4717,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.34744784235954285,
+      "learning_rate": 0.00039741248541423564,
+      "loss": 3.4744,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.32183945178985596,
+      "learning_rate": 0.0003969749124854142,
+      "loss": 3.4596,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.33145228028297424,
+      "learning_rate": 0.0003965373395565927,
+      "loss": 3.4604,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.32555410265922546,
+      "learning_rate": 0.0003960997666277712,
+      "loss": 3.4691,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.3336983323097229,
+      "learning_rate": 0.0003956621936989498,
+      "loss": 3.4653,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.3609044551849365,
+      "learning_rate": 0.00039522462077012834,
+      "loss": 3.4661,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.3462687134742737,
+      "learning_rate": 0.0003947870478413069,
+      "loss": 3.4719,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.3334873914718628,
+      "learning_rate": 0.00039434947491248537,
+      "loss": 3.4628,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.32324063777923584,
+      "learning_rate": 0.0003939119019836639,
+      "loss": 3.4662,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.3384288251399994,
+      "learning_rate": 0.00039347432905484245,
+      "loss": 3.4789,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.3569772243499756,
+      "learning_rate": 0.00039303675612602094,
+      "loss": 3.4659,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.3427547514438629,
+      "learning_rate": 0.0003925991831971995,
+      "loss": 3.4815,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.33504244685173035,
+      "learning_rate": 0.00039216161026837807,
+      "loss": 3.4714,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.3363841474056244,
+      "learning_rate": 0.00039172403733955656,
+      "loss": 3.4772,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.34754055738449097,
+      "learning_rate": 0.0003912864644107351,
+      "loss": 3.4624,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.332419216632843,
+      "learning_rate": 0.00039084889148191364,
+      "loss": 3.4728,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.3658571743677076,
+      "eval_loss": 3.583200693130493,
+      "eval_runtime": 179.9492,
+      "eval_samples_per_second": 92.493,
+      "eval_steps_per_second": 5.785,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.34445714950561523,
+      "learning_rate": 0.0003904113185530922,
+      "loss": 3.4335,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.359764963388443,
+      "learning_rate": 0.00038997374562427066,
+      "loss": 3.357,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.3404935598373413,
+      "learning_rate": 0.0003895361726954492,
+      "loss": 3.3683,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.3708736300468445,
+      "learning_rate": 0.00038909859976662774,
+      "loss": 3.3762,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.35109418630599976,
+      "learning_rate": 0.0003886610268378062,
+      "loss": 3.3722,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.38338735699653625,
+      "learning_rate": 0.0003882234539089848,
+      "loss": 3.3831,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.34883055090904236,
+      "learning_rate": 0.00038778588098016336,
+      "loss": 3.3716,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.3360002040863037,
+      "learning_rate": 0.00038734830805134185,
+      "loss": 3.3854,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.36411646008491516,
+      "learning_rate": 0.0003869107351225204,
+      "loss": 3.3736,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.3585089445114136,
+      "learning_rate": 0.0003864731621936989,
+      "loss": 3.3879,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.34876003861427307,
+      "learning_rate": 0.00038603558926487747,
+      "loss": 3.3885,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.34521597623825073,
+      "learning_rate": 0.00038559801633605595,
+      "loss": 3.3867,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.3658317029476166,
+      "learning_rate": 0.0003851604434072345,
+      "loss": 3.3843,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.3635459840297699,
+      "learning_rate": 0.0003847228704784131,
+      "loss": 3.3932,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.3436873257160187,
+      "learning_rate": 0.0003842852975495915,
+      "loss": 3.411,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.3474794924259186,
+      "learning_rate": 0.0003838477246207701,
+      "loss": 3.3999,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.3771800100803375,
+      "learning_rate": 0.00038341015169194865,
+      "loss": 3.4002,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.3322898745536804,
+      "learning_rate": 0.00038297257876312714,
+      "loss": 3.4123,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.3354223072528839,
+      "learning_rate": 0.0003825350058343057,
+      "loss": 3.3915,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.359828919172287,
+      "learning_rate": 0.0003820974329054842,
+      "loss": 3.4081,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.36572290173736083,
+      "eval_loss": 3.591287136077881,
+      "eval_runtime": 179.8422,
+      "eval_samples_per_second": 92.548,
+      "eval_steps_per_second": 5.788,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.3653123676776886,
+      "learning_rate": 0.00038165985997666276,
+      "loss": 3.4106,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.38982659578323364,
+      "learning_rate": 0.00038122228704784124,
+      "loss": 3.408,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.3451847732067108,
+      "learning_rate": 0.0003807847141190198,
+      "loss": 3.4184,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.33326441049575806,
+      "learning_rate": 0.0003803471411901984,
+      "loss": 3.4236,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.3506428003311157,
+      "learning_rate": 0.00037990956826137686,
+      "loss": 3.4202,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.35344335436820984,
+      "learning_rate": 0.0003794719953325554,
+      "loss": 3.4086,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.35276249051094055,
+      "learning_rate": 0.00037903442240373394,
+      "loss": 3.4224,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.34853968024253845,
+      "learning_rate": 0.00037859684947491243,
+      "loss": 3.4125,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.3490748107433319,
+      "learning_rate": 0.00037815927654609097,
+      "loss": 3.4044,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.34673872590065,
+      "learning_rate": 0.0003777217036172695,
+      "loss": 3.4131,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.3763735592365265,
+      "learning_rate": 0.00037728413068844805,
+      "loss": 3.407,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.3731195032596588,
+      "learning_rate": 0.00037684655775962653,
+      "loss": 3.4255,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.34575656056404114,
+      "learning_rate": 0.00037640898483080513,
+      "loss": 3.4082,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.3527657985687256,
+      "learning_rate": 0.00037597141190198367,
+      "loss": 3.4289,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.3446659743785858,
+      "learning_rate": 0.00037553383897316215,
+      "loss": 3.4194,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.3565196394920349,
+      "learning_rate": 0.0003750962660443407,
+      "loss": 3.416,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.3380652666091919,
+      "learning_rate": 0.00037465869311551923,
+      "loss": 3.4235,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.34874868392944336,
+      "learning_rate": 0.0003742211201866977,
+      "loss": 3.4153,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.3525692820549011,
+      "learning_rate": 0.00037378354725787626,
+      "loss": 3.4213,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.38578587770462036,
+      "learning_rate": 0.0003733459743290548,
+      "loss": 3.4258,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.3662004984312912,
+      "eval_loss": 3.582775354385376,
+      "eval_runtime": 179.914,
+      "eval_samples_per_second": 92.511,
+      "eval_steps_per_second": 5.786,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.36569854617118835,
+      "learning_rate": 0.0003729084014002334,
+      "loss": 3.4249,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.35230007767677307,
+      "learning_rate": 0.0003724708284714119,
+      "loss": 3.4167,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.3570927381515503,
+      "learning_rate": 0.0003720332555425904,
+      "loss": 3.4219,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.3544664680957794,
+      "learning_rate": 0.00037159568261376896,
+      "loss": 3.4319,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.32789161801338196,
+      "learning_rate": 0.00037115810968494744,
+      "loss": 3.4319,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.38033467531204224,
+      "learning_rate": 0.000370720536756126,
+      "loss": 3.4309,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.35393211245536804,
+      "learning_rate": 0.0003702829638273045,
+      "loss": 3.4309,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.3252100348472595,
+      "learning_rate": 0.000369845390898483,
+      "loss": 3.4214,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.34850725531578064,
+      "learning_rate": 0.00036940781796966155,
+      "loss": 3.4223,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.36846786737442017,
+      "learning_rate": 0.00036897024504084014,
+      "loss": 3.4336,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.36332714557647705,
+      "learning_rate": 0.0003685326721120187,
+      "loss": 3.4295,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.34099632501602173,
+      "learning_rate": 0.00036809509918319717,
+      "loss": 3.4295,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.3657893240451813,
+      "learning_rate": 0.0003676575262543757,
+      "loss": 3.4278,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.3631269931793213,
+      "learning_rate": 0.00036721995332555425,
+      "loss": 3.436,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.3591324985027313,
+      "learning_rate": 0.00036678238039673274,
+      "loss": 3.4196,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.3412059247493744,
+      "learning_rate": 0.0003663448074679113,
+      "loss": 3.436,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.3525434732437134,
+      "learning_rate": 0.0003659072345390898,
+      "loss": 3.4165,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.3410341441631317,
+      "learning_rate": 0.0003654696616102683,
+      "loss": 3.4301,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.3529902696609497,
+      "learning_rate": 0.00036503208868144684,
+      "loss": 3.4276,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.36684471368789673,
+      "learning_rate": 0.00036459451575262543,
+      "loss": 3.4357,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.36754980903186846,
+      "eval_loss": 3.5753531455993652,
+      "eval_runtime": 179.9,
+      "eval_samples_per_second": 92.518,
+      "eval_steps_per_second": 5.787,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.33539390563964844,
+      "learning_rate": 0.000364156942823804,
+      "loss": 3.4317,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.3382086157798767,
+      "learning_rate": 0.00036371936989498246,
+      "loss": 3.4443,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.37185636162757874,
+      "learning_rate": 0.000363281796966161,
+      "loss": 3.4331,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.3374182879924774,
+      "learning_rate": 0.00036284422403733954,
+      "loss": 3.4368,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.3447705805301666,
+      "learning_rate": 0.000362406651108518,
+      "loss": 3.4381,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.3456777334213257,
+      "learning_rate": 0.00036196907817969657,
+      "loss": 3.4348,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.34811699390411377,
+      "learning_rate": 0.0003615315052508751,
+      "loss": 3.4324,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.350790798664093,
+      "learning_rate": 0.0003610939323220536,
+      "loss": 3.4395,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.3383653163909912,
+      "learning_rate": 0.0003606563593932322,
+      "loss": 3.4285,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.39396196603775024,
+      "learning_rate": 0.0003602187864644107,
+      "loss": 3.3725,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.34884384274482727,
+      "learning_rate": 0.00035978121353558927,
+      "loss": 3.3305,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.35126855969429016,
+      "learning_rate": 0.00035934364060676775,
+      "loss": 3.3353,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.3708069324493408,
+      "learning_rate": 0.0003589060676779463,
+      "loss": 3.3266,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.33984529972076416,
+      "learning_rate": 0.00035846849474912483,
+      "loss": 3.3367,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.3667299449443817,
+      "learning_rate": 0.0003580309218203033,
+      "loss": 3.3359,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.35364869236946106,
+      "learning_rate": 0.00035759334889148186,
+      "loss": 3.3403,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.39318904280662537,
+      "learning_rate": 0.00035715577596266045,
+      "loss": 3.3463,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.3744325041770935,
+      "learning_rate": 0.00035671820303383894,
+      "loss": 3.3489,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.34499189257621765,
+      "learning_rate": 0.0003562806301050175,
+      "loss": 3.3631,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.3449557423591614,
+      "learning_rate": 0.000355843057176196,
+      "loss": 3.3437,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.3672165965674178,
+      "eval_loss": 3.5825846195220947,
+      "eval_runtime": 179.8108,
+      "eval_samples_per_second": 92.564,
+      "eval_steps_per_second": 5.789,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.3479335606098175,
+      "learning_rate": 0.00035540548424737456,
+      "loss": 3.3574,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.3682080805301666,
+      "learning_rate": 0.00035496791131855304,
+      "loss": 3.361,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.36186373233795166,
+      "learning_rate": 0.0003545303383897316,
+      "loss": 3.3727,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.3528668284416199,
+      "learning_rate": 0.0003540927654609101,
+      "loss": 3.3544,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.3623187839984894,
+      "learning_rate": 0.0003536551925320886,
+      "loss": 3.3608,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.36072465777397156,
+      "learning_rate": 0.0003532176196032672,
+      "loss": 3.3592,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.35227975249290466,
+      "learning_rate": 0.00035278004667444574,
+      "loss": 3.3801,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.3518006205558777,
+      "learning_rate": 0.00035234247374562423,
+      "loss": 3.3707,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.36904311180114746,
+      "learning_rate": 0.00035190490081680277,
+      "loss": 3.3709,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.3661719262599945,
+      "learning_rate": 0.0003514673278879813,
+      "loss": 3.3712,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.3581629693508148,
+      "learning_rate": 0.00035102975495915985,
+      "loss": 3.3655,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.34586071968078613,
+      "learning_rate": 0.00035059218203033833,
+      "loss": 3.3784,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.3492356240749359,
+      "learning_rate": 0.0003501546091015169,
+      "loss": 3.3773,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.35812652111053467,
+      "learning_rate": 0.00034971703617269547,
+      "loss": 3.3898,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.3359387218952179,
+      "learning_rate": 0.0003492794632438739,
+      "loss": 3.3724,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.3566528856754303,
+      "learning_rate": 0.0003488418903150525,
+      "loss": 3.3749,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.3726493716239929,
+      "learning_rate": 0.00034840431738623103,
+      "loss": 3.3748,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.36885538697242737,
+      "learning_rate": 0.0003479667444574095,
+      "loss": 3.382,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.37485557794570923,
+      "learning_rate": 0.00034752917152858806,
+      "loss": 3.3809,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.3613327443599701,
+      "learning_rate": 0.0003470915985997666,
+      "loss": 3.3846,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.3677331111603366,
+      "eval_loss": 3.574125289916992,
+      "eval_runtime": 179.9699,
+      "eval_samples_per_second": 92.482,
+      "eval_steps_per_second": 5.784,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.3570326566696167,
+      "learning_rate": 0.00034665402567094514,
+      "loss": 3.3894,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.36068421602249146,
+      "learning_rate": 0.0003462164527421236,
+      "loss": 3.3895,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.36702850461006165,
+      "learning_rate": 0.00034577887981330216,
+      "loss": 3.393,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.3736766278743744,
+      "learning_rate": 0.00034534130688448076,
+      "loss": 3.3906,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.3675585091114044,
+      "learning_rate": 0.00034490373395565924,
+      "loss": 3.3849,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.3682818114757538,
+      "learning_rate": 0.0003444661610268378,
+      "loss": 3.381,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.41155508160591125,
+      "learning_rate": 0.0003440285880980163,
+      "loss": 3.3882,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.38835835456848145,
+      "learning_rate": 0.0003435910151691948,
+      "loss": 3.3873,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.36495402455329895,
+      "learning_rate": 0.00034315344224037335,
+      "loss": 3.3954,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.35648706555366516,
+      "learning_rate": 0.0003427158693115519,
+      "loss": 3.3944,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.36086899042129517,
+      "learning_rate": 0.00034227829638273043,
+      "loss": 3.3892,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.3824726343154907,
+      "learning_rate": 0.0003418407234539089,
+      "loss": 3.3972,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.3354533910751343,
+      "learning_rate": 0.0003414031505250875,
+      "loss": 3.3857,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.3533078730106354,
+      "learning_rate": 0.00034096557759626605,
+      "loss": 3.4028,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.35228005051612854,
+      "learning_rate": 0.00034052800466744453,
+      "loss": 3.3961,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.3567520081996918,
+      "learning_rate": 0.0003400904317386231,
+      "loss": 3.3975,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.36763066053390503,
+      "learning_rate": 0.0003396528588098016,
+      "loss": 3.402,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.35932788252830505,
+      "learning_rate": 0.0003392152858809801,
+      "loss": 3.3969,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.3732490837574005,
+      "learning_rate": 0.00033877771295215864,
+      "loss": 3.3923,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.35130372643470764,
+      "learning_rate": 0.0003383401400233372,
+      "loss": 3.397,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.36869112638981577,
+      "eval_loss": 3.565412998199463,
+      "eval_runtime": 179.8685,
+      "eval_samples_per_second": 92.534,
+      "eval_steps_per_second": 5.788,
+      "step": 30000
+    },
+    {
+      "epoch": 8.753320904218132,
+      "grad_norm": 0.36606141924858093,
+      "learning_rate": 0.0003379025670945158,
+      "loss": 3.3837,
+      "step": 30050
+    },
+    {
+      "epoch": 8.767886273595899,
+      "grad_norm": 0.3647548258304596,
+      "learning_rate": 0.00033746499416569426,
+      "loss": 3.3854,
+      "step": 30100
+    },
+    {
+      "epoch": 8.782451642973665,
+      "grad_norm": 0.3695903718471527,
+      "learning_rate": 0.0003370274212368728,
+      "loss": 3.4027,
+      "step": 30150
+    },
+    {
+      "epoch": 8.797017012351434,
+      "grad_norm": 0.35933852195739746,
+      "learning_rate": 0.00033658984830805134,
+      "loss": 3.3907,
+      "step": 30200
+    },
+    {
+      "epoch": 8.8115823817292,
+      "grad_norm": 0.3846684396266937,
+      "learning_rate": 0.0003361522753792298,
+      "loss": 3.403,
+      "step": 30250
+    },
+    {
+      "epoch": 8.826147751106967,
+      "grad_norm": 0.35925909876823425,
+      "learning_rate": 0.00033571470245040837,
+      "loss": 3.3971,
+      "step": 30300
+    },
+    {
+      "epoch": 8.840713120484736,
+      "grad_norm": 0.3694295287132263,
+      "learning_rate": 0.0003352771295215869,
+      "loss": 3.4021,
+      "step": 30350
+    },
+    {
+      "epoch": 8.855278489862503,
+      "grad_norm": 0.352285236120224,
+      "learning_rate": 0.0003348395565927654,
+      "loss": 3.3962,
+      "step": 30400
+    },
+    {
+      "epoch": 8.86984385924027,
+      "grad_norm": 0.35616302490234375,
+      "learning_rate": 0.00033440198366394393,
+      "loss": 3.3957,
+      "step": 30450
+    },
+    {
+      "epoch": 8.884409228618038,
+      "grad_norm": 0.35298022627830505,
+      "learning_rate": 0.0003339644107351225,
+      "loss": 3.3837,
+      "step": 30500
+    },
+    {
+      "epoch": 8.898974597995805,
+      "grad_norm": 0.38288286328315735,
+      "learning_rate": 0.00033352683780630107,
+      "loss": 3.4005,
+      "step": 30550
+    },
+    {
+      "epoch": 8.913539967373573,
+      "grad_norm": 0.3534913659095764,
+      "learning_rate": 0.00033308926487747955,
+      "loss": 3.4102,
+      "step": 30600
+    },
+    {
+      "epoch": 8.92810533675134,
+      "grad_norm": 0.34419965744018555,
+      "learning_rate": 0.0003326516919486581,
+      "loss": 3.4074,
+      "step": 30650
+    },
+    {
+      "epoch": 8.942670706129107,
+      "grad_norm": 0.361150324344635,
+      "learning_rate": 0.00033221411901983663,
+      "loss": 3.3897,
+      "step": 30700
+    },
+    {
+      "epoch": 8.957236075506875,
+      "grad_norm": 0.3879886567592621,
+      "learning_rate": 0.0003317765460910151,
+      "loss": 3.405,
+      "step": 30750
+    },
+    {
+      "epoch": 8.971801444884642,
+      "grad_norm": 0.37308424711227417,
+      "learning_rate": 0.00033133897316219366,
+      "loss": 3.3974,
+      "step": 30800
+    },
+    {
+      "epoch": 8.986366814262409,
+      "grad_norm": 0.3615827262401581,
+      "learning_rate": 0.0003309014002333722,
+      "loss": 3.3938,
+      "step": 30850
+    },
+    {
+      "epoch": 9.000873922162667,
+      "grad_norm": 0.3499894142150879,
+      "learning_rate": 0.0003304638273045507,
+      "loss": 3.398,
+      "step": 30900
+    },
+    {
+      "epoch": 9.015439291540433,
+      "grad_norm": 0.3602531850337982,
+      "learning_rate": 0.0003300262543757292,
+      "loss": 3.2991,
+      "step": 30950
+    },
+    {
+      "epoch": 9.0300046609182,
+      "grad_norm": 0.3546774089336395,
+      "learning_rate": 0.0003295886814469078,
+      "loss": 3.2933,
+      "step": 31000
+    },
+    {
+      "epoch": 9.0300046609182,
+      "eval_accuracy": 0.36841858352016277,
+      "eval_loss": 3.570578098297119,
+      "eval_runtime": 181.1213,
+      "eval_samples_per_second": 91.894,
+      "eval_steps_per_second": 5.748,
+      "step": 31000
+    },
+    {
+      "epoch": 9.044570030295969,
+      "grad_norm": 0.3855380117893219,
+      "learning_rate": 0.00032915110851808636,
+      "loss": 3.3132,
+      "step": 31050
+    },
+    {
+      "epoch": 9.059135399673735,
+      "grad_norm": 0.35755249857902527,
+      "learning_rate": 0.00032871353558926484,
+      "loss": 3.3147,
+      "step": 31100
+    },
+    {
+      "epoch": 9.073700769051504,
+      "grad_norm": 0.34872257709503174,
+      "learning_rate": 0.0003282759626604434,
+      "loss": 3.3101,
+      "step": 31150
+    },
+    {
+      "epoch": 9.08826613842927,
+      "grad_norm": 0.36924898624420166,
+      "learning_rate": 0.0003278383897316219,
+      "loss": 3.3097,
+      "step": 31200
+    },
+    {
+      "epoch": 9.102831507807037,
+      "grad_norm": 0.353230357170105,
+      "learning_rate": 0.0003274008168028004,
+      "loss": 3.3117,
+      "step": 31250
+    },
+    {
+      "epoch": 9.117396877184806,
+      "grad_norm": 0.3573058247566223,
+      "learning_rate": 0.00032696324387397895,
+      "loss": 3.3038,
+      "step": 31300
+    },
+    {
+      "epoch": 9.131962246562573,
+      "grad_norm": 0.3910951316356659,
+      "learning_rate": 0.0003265256709451575,
+      "loss": 3.3208,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14652761594034,
+      "grad_norm": 0.3972455859184265,
+      "learning_rate": 0.00032608809801633597,
+      "loss": 3.3242,
+      "step": 31400
+    },
+    {
+      "epoch": 9.161092985318108,
+      "grad_norm": 0.4021930694580078,
+      "learning_rate": 0.00032565052508751457,
+      "loss": 3.3291,
+      "step": 31450
+    },
+    {
+      "epoch": 9.175658354695875,
+      "grad_norm": 0.36197373270988464,
+      "learning_rate": 0.0003252129521586931,
+      "loss": 3.325,
+      "step": 31500
+    },
+    {
+      "epoch": 9.190223724073643,
+      "grad_norm": 0.363652765750885,
+      "learning_rate": 0.00032477537922987165,
+      "loss": 3.3357,
+      "step": 31550
+    },
+    {
+      "epoch": 9.20478909345141,
+      "grad_norm": 0.37855982780456543,
+      "learning_rate": 0.00032433780630105013,
+      "loss": 3.3269,
+      "step": 31600
+    },
+    {
+      "epoch": 9.219354462829177,
+      "grad_norm": 0.36747410893440247,
+      "learning_rate": 0.00032390023337222867,
+      "loss": 3.3331,
+      "step": 31650
+    },
+    {
+      "epoch": 9.233919832206945,
+      "grad_norm": 0.36995381116867065,
+      "learning_rate": 0.0003234626604434072,
+      "loss": 3.3424,
+      "step": 31700
+    },
+    {
+      "epoch": 9.248485201584712,
+      "grad_norm": 0.3854932188987732,
+      "learning_rate": 0.0003230250875145857,
+      "loss": 3.3534,
+      "step": 31750
+    },
+    {
+      "epoch": 9.263050570962479,
+      "grad_norm": 0.3708399534225464,
+      "learning_rate": 0.00032258751458576424,
+      "loss": 3.3303,
+      "step": 31800
+    },
+    {
+      "epoch": 9.277615940340247,
+      "grad_norm": 0.3602900803089142,
+      "learning_rate": 0.00032214994165694283,
+      "loss": 3.3368,
+      "step": 31850
+    },
+    {
+      "epoch": 9.292181309718014,
+      "grad_norm": 0.3634423017501831,
+      "learning_rate": 0.0003217123687281213,
+      "loss": 3.3472,
+      "step": 31900
+    },
+    {
+      "epoch": 9.306746679095783,
+      "grad_norm": 0.35544899106025696,
+      "learning_rate": 0.00032127479579929986,
+      "loss": 3.3291,
+      "step": 31950
+    },
+    {
+      "epoch": 9.32131204847355,
+      "grad_norm": 0.37147095799446106,
+      "learning_rate": 0.0003208372228704784,
+      "loss": 3.3448,
+      "step": 32000
+    },
+    {
+      "epoch": 9.32131204847355,
+      "eval_accuracy": 0.36875214871481576,
+      "eval_loss": 3.570387840270996,
+      "eval_runtime": 181.1619,
+      "eval_samples_per_second": 91.874,
+      "eval_steps_per_second": 5.746,
+      "step": 32000
+    },
+    {
+      "epoch": 9.335877417851316,
+      "grad_norm": 0.3772315979003906,
+      "learning_rate": 0.00032039964994165694,
+      "loss": 3.3357,
+      "step": 32050
+    },
+    {
+      "epoch": 9.350442787229085,
+      "grad_norm": 0.3781045973300934,
+      "learning_rate": 0.0003199620770128354,
+      "loss": 3.3417,
+      "step": 32100
+    },
+    {
+      "epoch": 9.365008156606851,
+      "grad_norm": 0.3656591773033142,
+      "learning_rate": 0.00031952450408401396,
+      "loss": 3.3503,
+      "step": 32150
+    },
+    {
+      "epoch": 9.379573525984618,
+      "grad_norm": 0.3560366928577423,
+      "learning_rate": 0.0003190869311551925,
+      "loss": 3.3543,
+      "step": 32200
+    },
+    {
+      "epoch": 9.394138895362387,
+      "grad_norm": 0.37504252791404724,
+      "learning_rate": 0.000318649358226371,
+      "loss": 3.3364,
+      "step": 32250
+    },
+    {
+      "epoch": 9.408704264740154,
+      "grad_norm": 0.3750084936618805,
+      "learning_rate": 0.0003182117852975496,
+      "loss": 3.3571,
+      "step": 32300
+    },
+    {
+      "epoch": 9.423269634117922,
+      "grad_norm": 0.3538859486579895,
+      "learning_rate": 0.0003177742123687281,
+      "loss": 3.3531,
+      "step": 32350
+    },
+    {
+      "epoch": 9.437835003495689,
+      "grad_norm": 0.3584098815917969,
+      "learning_rate": 0.0003173366394399066,
+      "loss": 3.3625,
+      "step": 32400
+    },
+    {
+      "epoch": 9.452400372873456,
+      "grad_norm": 0.373279333114624,
+      "learning_rate": 0.00031689906651108515,
+      "loss": 3.3453,
+      "step": 32450
+    },
+    {
+      "epoch": 9.466965742251224,
+      "grad_norm": 0.35882455110549927,
+      "learning_rate": 0.0003164614935822637,
+      "loss": 3.358,
+      "step": 32500
+    },
+    {
+      "epoch": 9.48153111162899,
+      "grad_norm": 0.3616470694541931,
+      "learning_rate": 0.00031602392065344223,
+      "loss": 3.3555,
+      "step": 32550
+    },
+    {
+      "epoch": 9.496096481006758,
+      "grad_norm": 0.3601871132850647,
+      "learning_rate": 0.0003155863477246207,
+      "loss": 3.3421,
+      "step": 32600
+    },
+    {
+      "epoch": 9.510661850384526,
+      "grad_norm": 0.3817446231842041,
+      "learning_rate": 0.00031514877479579925,
+      "loss": 3.3635,
+      "step": 32650
+    },
+    {
+      "epoch": 9.525227219762293,
+      "grad_norm": 0.3797430992126465,
+      "learning_rate": 0.00031471120186697785,
+      "loss": 3.3555,
+      "step": 32700
+    },
+    {
+      "epoch": 9.53979258914006,
+      "grad_norm": 0.3647226095199585,
+      "learning_rate": 0.00031427362893815633,
+      "loss": 3.3539,
+      "step": 32750
+    },
+    {
+      "epoch": 9.554357958517828,
+      "grad_norm": 0.38534408807754517,
+      "learning_rate": 0.0003138360560093349,
+      "loss": 3.353,
+      "step": 32800
+    },
+    {
+      "epoch": 9.568923327895595,
+      "grad_norm": 0.35576704144477844,
+      "learning_rate": 0.0003133984830805134,
+      "loss": 3.3503,
+      "step": 32850
+    },
+    {
+      "epoch": 9.583488697273363,
+      "grad_norm": 0.3623790740966797,
+      "learning_rate": 0.0003129609101516919,
+      "loss": 3.3567,
+      "step": 32900
+    },
+    {
+      "epoch": 9.59805406665113,
+      "grad_norm": 0.37745630741119385,
+      "learning_rate": 0.00031252333722287044,
+      "loss": 3.3555,
+      "step": 32950
+    },
+    {
+      "epoch": 9.612619436028897,
+      "grad_norm": 0.35436564683914185,
+      "learning_rate": 0.000312085764294049,
+      "loss": 3.362,
+      "step": 33000
+    },
+    {
+      "epoch": 9.612619436028897,
+      "eval_accuracy": 0.3694510248223298,
+      "eval_loss": 3.5591442584991455,
+      "eval_runtime": 181.163,
+      "eval_samples_per_second": 91.873,
+      "eval_steps_per_second": 5.746,
+      "step": 33000
+    },
+    {
+      "epoch": 9.627184805406666,
+      "grad_norm": 0.3626103401184082,
+      "learning_rate": 0.0003116481913652275,
+      "loss": 3.3583,
+      "step": 33050
+    },
+    {
+      "epoch": 9.641750174784432,
+      "grad_norm": 0.3621595501899719,
+      "learning_rate": 0.000311210618436406,
+      "loss": 3.3653,
+      "step": 33100
+    },
+    {
+      "epoch": 9.6563155441622,
+      "grad_norm": 0.37698203325271606,
+      "learning_rate": 0.0003107730455075846,
+      "loss": 3.369,
+      "step": 33150
+    },
+    {
+      "epoch": 9.670880913539968,
+      "grad_norm": 0.3685494661331177,
+      "learning_rate": 0.00031033547257876314,
+      "loss": 3.3695,
+      "step": 33200
+    },
+    {
+      "epoch": 9.685446282917734,
+      "grad_norm": 0.3542827367782593,
+      "learning_rate": 0.0003098978996499416,
+      "loss": 3.3554,
+      "step": 33250
+    },
+    {
+      "epoch": 9.700011652295503,
+      "grad_norm": 0.3731638491153717,
+      "learning_rate": 0.00030946032672112016,
+      "loss": 3.3616,
+      "step": 33300
+    },
+    {
+      "epoch": 9.71457702167327,
+      "grad_norm": 0.3674522638320923,
+      "learning_rate": 0.0003090227537922987,
+      "loss": 3.3553,
+      "step": 33350
+    },
+    {
+      "epoch": 9.729142391051036,
+      "grad_norm": 0.3795247972011566,
+      "learning_rate": 0.0003085851808634772,
+      "loss": 3.3591,
+      "step": 33400
+    },
+    {
+      "epoch": 9.743707760428805,
+      "grad_norm": 0.3751859664916992,
+      "learning_rate": 0.00030814760793465573,
+      "loss": 3.3737,
+      "step": 33450
+    },
+    {
+      "epoch": 9.758273129806572,
+      "grad_norm": 0.36385780572891235,
+      "learning_rate": 0.00030771003500583427,
+      "loss": 3.363,
+      "step": 33500
+    },
+    {
+      "epoch": 9.772838499184338,
+      "grad_norm": 0.3745647370815277,
+      "learning_rate": 0.00030727246207701286,
+      "loss": 3.365,
+      "step": 33550
+    },
+    {
+      "epoch": 9.787403868562107,
+      "grad_norm": 0.3658393621444702,
+      "learning_rate": 0.0003068348891481913,
+      "loss": 3.3603,
+      "step": 33600
+    },
+    {
+      "epoch": 9.801969237939874,
+      "grad_norm": 0.3640536963939667,
+      "learning_rate": 0.0003063973162193699,
+      "loss": 3.3559,
+      "step": 33650
+    },
+    {
+      "epoch": 9.816534607317642,
+      "grad_norm": 0.34730592370033264,
+      "learning_rate": 0.00030595974329054843,
+      "loss": 3.3719,
+      "step": 33700
+    },
+    {
+      "epoch": 9.831099976695409,
+      "grad_norm": 0.4093884229660034,
+      "learning_rate": 0.0003055221703617269,
+      "loss": 3.3566,
+      "step": 33750
+    },
+    {
+      "epoch": 9.845665346073176,
+      "grad_norm": 0.379686176776886,
+      "learning_rate": 0.00030508459743290546,
+      "loss": 3.3694,
+      "step": 33800
+    },
+    {
+      "epoch": 9.860230715450944,
+      "grad_norm": 0.3721369206905365,
+      "learning_rate": 0.000304647024504084,
+      "loss": 3.3565,
+      "step": 33850
+    },
+    {
+      "epoch": 9.874796084828711,
+      "grad_norm": 0.40420958399772644,
+      "learning_rate": 0.0003042094515752625,
+      "loss": 3.3739,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88936145420648,
+      "grad_norm": 0.3653509020805359,
+      "learning_rate": 0.000303771878646441,
+      "loss": 3.3874,
+      "step": 33950
+    },
+    {
+      "epoch": 9.903926823584246,
+      "grad_norm": 0.3726848363876343,
+      "learning_rate": 0.00030333430571761956,
+      "loss": 3.3664,
+      "step": 34000
+    },
+    {
+      "epoch": 9.903926823584246,
+      "eval_accuracy": 0.37010945453331207,
+      "eval_loss": 3.552675724029541,
+      "eval_runtime": 180.1762,
+      "eval_samples_per_second": 92.376,
+      "eval_steps_per_second": 5.778,
+      "step": 34000
+    },
+    {
+      "epoch": 9.918492192962013,
+      "grad_norm": 0.37708285450935364,
+      "learning_rate": 0.00030289673278879816,
+      "loss": 3.3704,
+      "step": 34050
+    },
+    {
+      "epoch": 9.933057562339782,
+      "grad_norm": 0.3552982211112976,
+      "learning_rate": 0.00030245915985997664,
+      "loss": 3.3698,
+      "step": 34100
+    },
+    {
+      "epoch": 9.947622931717548,
+      "grad_norm": 0.36946627497673035,
+      "learning_rate": 0.0003020215869311552,
+      "loss": 3.3806,
+      "step": 34150
+    },
+    {
+      "epoch": 9.962188301095315,
+      "grad_norm": 0.35199493169784546,
+      "learning_rate": 0.0003015840140023337,
+      "loss": 3.3826,
+      "step": 34200
+    },
+    {
+      "epoch": 9.976753670473084,
+      "grad_norm": 0.3719441592693329,
+      "learning_rate": 0.0003011464410735122,
+      "loss": 3.3646,
+      "step": 34250
+    },
+    {
+      "epoch": 9.99131903985085,
+      "grad_norm": 0.363594651222229,
+      "learning_rate": 0.00030070886814469075,
+      "loss": 3.3622,
+      "step": 34300
+    },
+    {
+      "epoch": 10.005826147751106,
+      "grad_norm": 0.35639119148254395,
+      "learning_rate": 0.0003002712952158693,
+      "loss": 3.3249,
+      "step": 34350
+    },
+    {
+      "epoch": 10.020391517128875,
+      "grad_norm": 0.37611323595046997,
+      "learning_rate": 0.0002998337222870478,
+      "loss": 3.273,
+      "step": 34400
+    },
+    {
+      "epoch": 10.034956886506642,
+      "grad_norm": 0.37235331535339355,
+      "learning_rate": 0.0002993961493582263,
+      "loss": 3.2791,
+      "step": 34450
+    },
+    {
+      "epoch": 10.049522255884408,
+      "grad_norm": 0.37707796692848206,
+      "learning_rate": 0.0002989585764294049,
+      "loss": 3.2771,
+      "step": 34500
+    },
+    {
+      "epoch": 10.064087625262177,
+      "grad_norm": 0.3716845214366913,
+      "learning_rate": 0.0002985210035005834,
+      "loss": 3.2701,
+      "step": 34550
+    },
+    {
+      "epoch": 10.078652994639944,
+      "grad_norm": 0.36124205589294434,
+      "learning_rate": 0.00029808343057176193,
+      "loss": 3.2854,
+      "step": 34600
+    },
+    {
+      "epoch": 10.093218364017712,
+      "grad_norm": 0.35858553647994995,
+      "learning_rate": 0.00029764585764294047,
+      "loss": 3.2791,
+      "step": 34650
+    },
+    {
+      "epoch": 10.107783733395479,
+      "grad_norm": 0.37975695729255676,
+      "learning_rate": 0.00029720828471411896,
+      "loss": 3.2846,
+      "step": 34700
+    },
+    {
+      "epoch": 10.122349102773246,
+      "grad_norm": 0.3737419545650482,
+      "learning_rate": 0.00029677071178529755,
+      "loss": 3.2932,
+      "step": 34750
+    },
+    {
+      "epoch": 10.136914472151014,
+      "grad_norm": 0.3647433817386627,
+      "learning_rate": 0.00029633313885647604,
+      "loss": 3.2903,
+      "step": 34800
+    },
+    {
+      "epoch": 10.151479841528781,
+      "grad_norm": 0.3602680265903473,
+      "learning_rate": 0.0002958955659276546,
+      "loss": 3.2915,
+      "step": 34850
+    },
+    {
+      "epoch": 10.166045210906548,
+      "grad_norm": 0.3720751404762268,
+      "learning_rate": 0.0002954579929988331,
+      "loss": 3.2912,
+      "step": 34900
+    },
+    {
+      "epoch": 10.180610580284316,
+      "grad_norm": 0.36713486909866333,
+      "learning_rate": 0.00029502042007001166,
+      "loss": 3.3051,
+      "step": 34950
+    },
+    {
+      "epoch": 10.195175949662083,
+      "grad_norm": 0.3789300322532654,
+      "learning_rate": 0.0002945828471411902,
+      "loss": 3.3097,
+      "step": 35000
+    },
+    {
+      "epoch": 10.195175949662083,
+      "eval_accuracy": 0.3695753034302777,
+      "eval_loss": 3.563159227371216,
+      "eval_runtime": 179.9801,
+      "eval_samples_per_second": 92.477,
+      "eval_steps_per_second": 5.784,
+      "step": 35000
+    },
+    {
+      "epoch": 10.209741319039852,
+      "grad_norm": 0.3704666793346405,
+      "learning_rate": 0.0002941452742123687,
+      "loss": 3.305,
+      "step": 35050
+    },
+    {
+      "epoch": 10.224306688417618,
+      "grad_norm": 0.36152052879333496,
+      "learning_rate": 0.0002937077012835472,
+      "loss": 3.3008,
+      "step": 35100
+    },
+    {
+      "epoch": 10.238872057795385,
+      "grad_norm": 0.3830110728740692,
+      "learning_rate": 0.00029327012835472576,
+      "loss": 3.3154,
+      "step": 35150
+    },
+    {
+      "epoch": 10.253437427173154,
+      "grad_norm": 0.3606763482093811,
+      "learning_rate": 0.0002928325554259043,
+      "loss": 3.2964,
+      "step": 35200
+    },
+    {
+      "epoch": 10.26800279655092,
+      "grad_norm": 0.3661087453365326,
+      "learning_rate": 0.00029239498249708284,
+      "loss": 3.3058,
+      "step": 35250
+    },
+    {
+      "epoch": 10.282568165928687,
+      "grad_norm": 0.3702368438243866,
+      "learning_rate": 0.00029195740956826133,
+      "loss": 3.315,
+      "step": 35300
+    },
+    {
+      "epoch": 10.297133535306456,
+      "grad_norm": 0.3803708553314209,
+      "learning_rate": 0.0002915198366394399,
+      "loss": 3.3081,
+      "step": 35350
+    },
+    {
+      "epoch": 10.311698904684222,
+      "grad_norm": 0.36514583230018616,
+      "learning_rate": 0.0002910822637106184,
+      "loss": 3.301,
+      "step": 35400
+    },
+    {
+      "epoch": 10.326264274061991,
+      "grad_norm": 0.36688533425331116,
+      "learning_rate": 0.00029064469078179695,
+      "loss": 3.3136,
+      "step": 35450
+    },
+    {
+      "epoch": 10.340829643439758,
+      "grad_norm": 0.36582139134407043,
+      "learning_rate": 0.0002902071178529755,
+      "loss": 3.321,
+      "step": 35500
+    },
+    {
+      "epoch": 10.355395012817525,
+      "grad_norm": 0.36554041504859924,
+      "learning_rate": 0.000289769544924154,
+      "loss": 3.3259,
+      "step": 35550
+    },
+    {
+      "epoch": 10.369960382195293,
+      "grad_norm": 0.3956117331981659,
+      "learning_rate": 0.00028933197199533257,
+      "loss": 3.3208,
+      "step": 35600
+    },
+    {
+      "epoch": 10.38452575157306,
+      "grad_norm": 0.3716447949409485,
+      "learning_rate": 0.00028889439906651105,
+      "loss": 3.3143,
+      "step": 35650
+    },
+    {
+      "epoch": 10.399091120950827,
+      "grad_norm": 0.38861215114593506,
+      "learning_rate": 0.0002884568261376896,
+      "loss": 3.3173,
+      "step": 35700
+    },
+    {
+      "epoch": 10.413656490328595,
+      "grad_norm": 0.39447951316833496,
+      "learning_rate": 0.00028801925320886813,
+      "loss": 3.3316,
+      "step": 35750
+    },
+    {
+      "epoch": 10.428221859706362,
+      "grad_norm": 0.3649604618549347,
+      "learning_rate": 0.0002875816802800466,
+      "loss": 3.3131,
+      "step": 35800
+    },
+    {
+      "epoch": 10.44278722908413,
+      "grad_norm": 0.3737121522426605,
+      "learning_rate": 0.0002871441073512252,
+      "loss": 3.3156,
+      "step": 35850
+    },
+    {
+      "epoch": 10.457352598461897,
+      "grad_norm": 0.3692328929901123,
+      "learning_rate": 0.0002867065344224037,
+      "loss": 3.3158,
+      "step": 35900
+    },
+    {
+      "epoch": 10.471917967839664,
+      "grad_norm": 0.3931700587272644,
+      "learning_rate": 0.00028626896149358224,
+      "loss": 3.3158,
+      "step": 35950
+    },
+    {
+      "epoch": 10.486483337217432,
+      "grad_norm": 0.35947710275650024,
+      "learning_rate": 0.0002858313885647608,
+      "loss": 3.3296,
+      "step": 36000
+    },
+    {
+      "epoch": 10.486483337217432,
+      "eval_accuracy": 0.37002562232189595,
+      "eval_loss": 3.5596346855163574,
+      "eval_runtime": 201.7421,
+      "eval_samples_per_second": 82.501,
+      "eval_steps_per_second": 5.16,
+      "step": 36000
+    },
+    {
+      "epoch": 10.5010487065952,
+      "grad_norm": 0.3691442310810089,
+      "learning_rate": 0.0002853938156359393,
+      "loss": 3.3234,
+      "step": 36050
+    },
+    {
+      "epoch": 10.515614075972966,
+      "grad_norm": 0.36635181307792664,
+      "learning_rate": 0.00028495624270711786,
+      "loss": 3.3215,
+      "step": 36100
+    },
+    {
+      "epoch": 10.530179445350734,
+      "grad_norm": 0.3895931839942932,
+      "learning_rate": 0.00028451866977829634,
+      "loss": 3.327,
+      "step": 36150
+    },
+    {
+      "epoch": 10.544744814728501,
+      "grad_norm": 0.369814395904541,
+      "learning_rate": 0.0002840810968494749,
+      "loss": 3.3193,
+      "step": 36200
+    },
+    {
+      "epoch": 10.55931018410627,
+      "grad_norm": 0.3740673363208771,
+      "learning_rate": 0.0002836435239206534,
+      "loss": 3.3338,
+      "step": 36250
+    },
+    {
+      "epoch": 10.573875553484037,
+      "grad_norm": 0.3762377202510834,
+      "learning_rate": 0.00028320595099183196,
+      "loss": 3.3323,
+      "step": 36300
+    },
+    {
+      "epoch": 10.588440922861803,
+      "grad_norm": 0.3816418945789337,
+      "learning_rate": 0.0002827683780630105,
+      "loss": 3.326,
+      "step": 36350
+    },
+    {
+      "epoch": 10.603006292239572,
+      "grad_norm": 0.3672488331794739,
+      "learning_rate": 0.000282330805134189,
+      "loss": 3.3272,
+      "step": 36400
+    },
+    {
+      "epoch": 10.617571661617339,
+      "grad_norm": 0.3663332462310791,
+      "learning_rate": 0.00028189323220536753,
+      "loss": 3.3412,
+      "step": 36450
+    },
+    {
+      "epoch": 10.632137030995105,
+      "grad_norm": 0.36435452103614807,
+      "learning_rate": 0.00028145565927654607,
+      "loss": 3.3405,
+      "step": 36500
+    },
+    {
+      "epoch": 10.646702400372874,
+      "grad_norm": 0.3970140218734741,
+      "learning_rate": 0.0002810180863477246,
+      "loss": 3.3301,
+      "step": 36550
+    },
+    {
+      "epoch": 10.66126776975064,
+      "grad_norm": 0.37601590156555176,
+      "learning_rate": 0.00028058051341890315,
+      "loss": 3.3305,
+      "step": 36600
+    },
+    {
+      "epoch": 10.675833139128407,
+      "grad_norm": 0.38534775376319885,
+      "learning_rate": 0.00028014294049008164,
+      "loss": 3.3418,
+      "step": 36650
+    },
+    {
+      "epoch": 10.690398508506176,
+      "grad_norm": 0.3686717450618744,
+      "learning_rate": 0.0002797053675612602,
+      "loss": 3.334,
+      "step": 36700
+    },
+    {
+      "epoch": 10.704963877883943,
+      "grad_norm": 0.37328463792800903,
+      "learning_rate": 0.0002792677946324387,
+      "loss": 3.3337,
+      "step": 36750
+    },
+    {
+      "epoch": 10.719529247261711,
+      "grad_norm": 0.37012574076652527,
+      "learning_rate": 0.00027883022170361726,
+      "loss": 3.3439,
+      "step": 36800
+    },
+    {
+      "epoch": 10.734094616639478,
+      "grad_norm": 0.3841780722141266,
+      "learning_rate": 0.0002783926487747958,
+      "loss": 3.3365,
+      "step": 36850
+    },
+    {
+      "epoch": 10.748659986017245,
+      "grad_norm": 0.3672944903373718,
+      "learning_rate": 0.0002779550758459743,
+      "loss": 3.3333,
+      "step": 36900
+    },
+    {
+      "epoch": 10.763225355395013,
+      "grad_norm": 0.3785678744316101,
+      "learning_rate": 0.0002775175029171528,
+      "loss": 3.3393,
+      "step": 36950
+    },
+    {
+      "epoch": 10.77779072477278,
+      "grad_norm": 0.3595028817653656,
+      "learning_rate": 0.00027707992998833136,
+      "loss": 3.3485,
+      "step": 37000
+    },
+    {
+      "epoch": 10.77779072477278,
+      "eval_accuracy": 0.37081573797507467,
+      "eval_loss": 3.5510876178741455,
+      "eval_runtime": 180.0166,
+      "eval_samples_per_second": 92.458,
+      "eval_steps_per_second": 5.783,
+      "step": 37000
+    },
+    {
+      "epoch": 10.792356094150549,
+      "grad_norm": 0.37431466579437256,
+      "learning_rate": 0.0002766423570595099,
+      "loss": 3.3371,
+      "step": 37050
+    },
+    {
+      "epoch": 10.806921463528315,
+      "grad_norm": 0.3843117654323578,
+      "learning_rate": 0.00027620478413068844,
+      "loss": 3.3304,
+      "step": 37100
+    },
+    {
+      "epoch": 10.821486832906082,
+      "grad_norm": 0.3688303530216217,
+      "learning_rate": 0.000275767211201867,
+      "loss": 3.3318,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83605220228385,
+      "grad_norm": 0.38997411727905273,
+      "learning_rate": 0.00027532963827304547,
+      "loss": 3.3445,
+      "step": 37200
+    },
+    {
+      "epoch": 10.850617571661617,
+      "grad_norm": 0.3861274719238281,
+      "learning_rate": 0.000274892065344224,
+      "loss": 3.3383,
+      "step": 37250
+    },
+    {
+      "epoch": 10.865182941039384,
+      "grad_norm": 0.3819262683391571,
+      "learning_rate": 0.00027445449241540255,
+      "loss": 3.349,
+      "step": 37300
+    },
+    {
+      "epoch": 10.879748310417153,
+      "grad_norm": 0.3718022406101227,
+      "learning_rate": 0.0002740169194865811,
+      "loss": 3.3363,
+      "step": 37350
+    },
+    {
+      "epoch": 10.89431367979492,
+      "grad_norm": 0.3736136257648468,
+      "learning_rate": 0.0002735793465577596,
+      "loss": 3.3459,
+      "step": 37400
+    },
+    {
+      "epoch": 10.908879049172686,
+      "grad_norm": 0.37865564227104187,
+      "learning_rate": 0.0002731417736289381,
+      "loss": 3.3505,
+      "step": 37450
+    },
+    {
+      "epoch": 10.923444418550455,
+      "grad_norm": 0.3839370310306549,
+      "learning_rate": 0.00027270420070011665,
+      "loss": 3.3508,
+      "step": 37500
+    },
+    {
+      "epoch": 10.938009787928221,
+      "grad_norm": 0.39603373408317566,
+      "learning_rate": 0.0002722666277712952,
+      "loss": 3.3416,
+      "step": 37550
+    },
+    {
+      "epoch": 10.95257515730599,
+      "grad_norm": 0.4063169062137604,
+      "learning_rate": 0.00027182905484247373,
+      "loss": 3.3462,
+      "step": 37600
+    },
+    {
+      "epoch": 10.967140526683757,
+      "grad_norm": 0.37765178084373474,
+      "learning_rate": 0.00027139148191365227,
+      "loss": 3.347,
+      "step": 37650
+    },
+    {
+      "epoch": 10.981705896061523,
+      "grad_norm": 0.3753688931465149,
+      "learning_rate": 0.00027095390898483076,
+      "loss": 3.3402,
+      "step": 37700
+    },
+    {
+      "epoch": 10.996271265439292,
+      "grad_norm": 0.39264529943466187,
+      "learning_rate": 0.0002705163360560093,
+      "loss": 3.3455,
+      "step": 37750
+    },
+    {
+      "epoch": 11.010778373339548,
+      "grad_norm": 0.3830313980579376,
+      "learning_rate": 0.00027007876312718784,
+      "loss": 3.2566,
+      "step": 37800
+    },
+    {
+      "epoch": 11.025343742717315,
+      "grad_norm": 0.39328911900520325,
+      "learning_rate": 0.0002696411901983664,
+      "loss": 3.2371,
+      "step": 37850
+    },
+    {
+      "epoch": 11.039909112095083,
+      "grad_norm": 0.39774519205093384,
+      "learning_rate": 0.0002692036172695449,
+      "loss": 3.2478,
+      "step": 37900
+    },
+    {
+      "epoch": 11.05447448147285,
+      "grad_norm": 0.36643147468566895,
+      "learning_rate": 0.0002687660443407234,
+      "loss": 3.2477,
+      "step": 37950
+    },
+    {
+      "epoch": 11.069039850850617,
+      "grad_norm": 0.39002707600593567,
+      "learning_rate": 0.00026832847141190194,
+      "loss": 3.2602,
+      "step": 38000
+    },
+    {
+      "epoch": 11.069039850850617,
+      "eval_accuracy": 0.3708169137424157,
+      "eval_loss": 3.5576553344726562,
+      "eval_runtime": 179.8126,
+      "eval_samples_per_second": 92.563,
+      "eval_steps_per_second": 5.789,
+      "step": 38000
+    },
+    {
+      "epoch": 11.083605220228385,
+      "grad_norm": 0.3755168318748474,
+      "learning_rate": 0.0002678908984830805,
+      "loss": 3.2608,
+      "step": 38050
+    },
+    {
+      "epoch": 11.098170589606152,
+      "grad_norm": 0.38174381852149963,
+      "learning_rate": 0.000267453325554259,
+      "loss": 3.2649,
+      "step": 38100
+    },
+    {
+      "epoch": 11.11273595898392,
+      "grad_norm": 0.372764527797699,
+      "learning_rate": 0.00026701575262543756,
+      "loss": 3.265,
+      "step": 38150
+    },
+    {
+      "epoch": 11.127301328361687,
+      "grad_norm": 0.3840652406215668,
+      "learning_rate": 0.00026657817969661605,
+      "loss": 3.2659,
+      "step": 38200
+    },
+    {
+      "epoch": 11.141866697739454,
+      "grad_norm": 0.3895314037799835,
+      "learning_rate": 0.00026614060676779464,
+      "loss": 3.2548,
+      "step": 38250
+    },
+    {
+      "epoch": 11.156432067117223,
+      "grad_norm": 0.3741670250892639,
+      "learning_rate": 0.00026570303383897313,
+      "loss": 3.2596,
+      "step": 38300
+    },
+    {
+      "epoch": 11.17099743649499,
+      "grad_norm": 0.4137849807739258,
+      "learning_rate": 0.00026526546091015167,
+      "loss": 3.2742,
+      "step": 38350
+    },
+    {
+      "epoch": 11.185562805872756,
+      "grad_norm": 0.40267008543014526,
+      "learning_rate": 0.0002648278879813302,
+      "loss": 3.2766,
+      "step": 38400
+    },
+    {
+      "epoch": 11.200128175250525,
+      "grad_norm": 0.38519883155822754,
+      "learning_rate": 0.0002643903150525087,
+      "loss": 3.2839,
+      "step": 38450
+    },
+    {
+      "epoch": 11.214693544628291,
+      "grad_norm": 0.37960830330848694,
+      "learning_rate": 0.0002639527421236873,
+      "loss": 3.2779,
+      "step": 38500
+    },
+    {
+      "epoch": 11.22925891400606,
+      "grad_norm": 0.40030044317245483,
+      "learning_rate": 0.0002635151691948658,
+      "loss": 3.2851,
+      "step": 38550
+    },
+    {
+      "epoch": 11.243824283383827,
+      "grad_norm": 0.37748202681541443,
+      "learning_rate": 0.0002630775962660443,
+      "loss": 3.2801,
+      "step": 38600
+    },
+    {
+      "epoch": 11.258389652761593,
+      "grad_norm": 0.393085241317749,
+      "learning_rate": 0.00026264002333722285,
+      "loss": 3.2861,
+      "step": 38650
+    },
+    {
+      "epoch": 11.272955022139362,
+      "grad_norm": 0.40957602858543396,
+      "learning_rate": 0.00026220245040840134,
+      "loss": 3.2805,
+      "step": 38700
+    },
+    {
+      "epoch": 11.287520391517129,
+      "grad_norm": 0.3886652886867523,
+      "learning_rate": 0.00026176487747957993,
+      "loss": 3.293,
+      "step": 38750
+    },
+    {
+      "epoch": 11.302085760894895,
+      "grad_norm": 0.3791932463645935,
+      "learning_rate": 0.0002613273045507584,
+      "loss": 3.2911,
+      "step": 38800
+    },
+    {
+      "epoch": 11.316651130272664,
+      "grad_norm": 0.3910757899284363,
+      "learning_rate": 0.00026088973162193696,
+      "loss": 3.2861,
+      "step": 38850
+    },
+    {
+      "epoch": 11.33121649965043,
+      "grad_norm": 0.38568395376205444,
+      "learning_rate": 0.0002604521586931155,
+      "loss": 3.2795,
+      "step": 38900
+    },
+    {
+      "epoch": 11.3457818690282,
+      "grad_norm": 0.3812981843948364,
+      "learning_rate": 0.00026001458576429404,
+      "loss": 3.2975,
+      "step": 38950
+    },
+    {
+      "epoch": 11.360347238405966,
+      "grad_norm": 0.4014233648777008,
+      "learning_rate": 0.0002595770128354726,
+      "loss": 3.2927,
+      "step": 39000
+    },
+    {
+      "epoch": 11.360347238405966,
+      "eval_accuracy": 0.3709491875682827,
+      "eval_loss": 3.5536797046661377,
+      "eval_runtime": 179.8,
+      "eval_samples_per_second": 92.57,
+      "eval_steps_per_second": 5.79,
+      "step": 39000
+    },
+    {
+      "epoch": 11.374912607783733,
+      "grad_norm": 0.4019148349761963,
+      "learning_rate": 0.00025913943990665106,
+      "loss": 3.2973,
+      "step": 39050
+    },
+    {
+      "epoch": 11.389477977161501,
+      "grad_norm": 0.386180579662323,
+      "learning_rate": 0.0002587018669778296,
+      "loss": 3.3019,
+      "step": 39100
+    },
+    {
+      "epoch": 11.404043346539268,
+      "grad_norm": 0.367136687040329,
+      "learning_rate": 0.00025826429404900814,
+      "loss": 3.2868,
+      "step": 39150
+    },
+    {
+      "epoch": 11.418608715917035,
+      "grad_norm": 0.38054171204566956,
+      "learning_rate": 0.0002578267211201867,
+      "loss": 3.2943,
+      "step": 39200
+    },
+    {
+      "epoch": 11.433174085294803,
+      "grad_norm": 0.38401472568511963,
+      "learning_rate": 0.0002573891481913652,
+      "loss": 3.2884,
+      "step": 39250
+    },
+    {
+      "epoch": 11.44773945467257,
+      "grad_norm": 0.3807559609413147,
+      "learning_rate": 0.0002569515752625437,
+      "loss": 3.3096,
+      "step": 39300
+    },
+    {
+      "epoch": 11.462304824050339,
+      "grad_norm": 0.3866496682167053,
+      "learning_rate": 0.0002565140023337223,
+      "loss": 3.2945,
+      "step": 39350
+    },
+    {
+      "epoch": 11.476870193428105,
+      "grad_norm": 0.39391186833381653,
+      "learning_rate": 0.0002560764294049008,
+      "loss": 3.3052,
+      "step": 39400
+    },
+    {
+      "epoch": 11.491435562805872,
+      "grad_norm": 0.3963194489479065,
+      "learning_rate": 0.00025563885647607933,
+      "loss": 3.307,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50600093218364,
+      "grad_norm": 0.378620445728302,
+      "learning_rate": 0.00025520128354725787,
+      "loss": 3.2964,
+      "step": 39500
+    },
+    {
+      "epoch": 11.520566301561407,
+      "grad_norm": 0.3991275727748871,
+      "learning_rate": 0.00025476371061843636,
+      "loss": 3.3136,
+      "step": 39550
+    },
+    {
+      "epoch": 11.535131670939174,
+      "grad_norm": 0.403228759765625,
+      "learning_rate": 0.00025432613768961495,
+      "loss": 3.3107,
+      "step": 39600
+    },
+    {
+      "epoch": 11.549697040316943,
+      "grad_norm": 0.3849295377731323,
+      "learning_rate": 0.00025388856476079343,
+      "loss": 3.2815,
+      "step": 39650
+    },
+    {
+      "epoch": 11.56426240969471,
+      "grad_norm": 0.39309900999069214,
+      "learning_rate": 0.000253450991831972,
+      "loss": 3.2973,
+      "step": 39700
+    },
+    {
+      "epoch": 11.578827779072478,
+      "grad_norm": 0.376312255859375,
+      "learning_rate": 0.0002530134189031505,
+      "loss": 3.3027,
+      "step": 39750
+    },
+    {
+      "epoch": 11.593393148450245,
+      "grad_norm": 0.3923032283782959,
+      "learning_rate": 0.000252575845974329,
+      "loss": 3.2931,
+      "step": 39800
+    },
+    {
+      "epoch": 11.607958517828012,
+      "grad_norm": 0.37784478068351746,
+      "learning_rate": 0.0002521382730455076,
+      "loss": 3.3102,
+      "step": 39850
+    },
+    {
+      "epoch": 11.62252388720578,
+      "grad_norm": 0.3833751380443573,
+      "learning_rate": 0.0002517007001166861,
+      "loss": 3.3157,
+      "step": 39900
+    },
+    {
+      "epoch": 11.637089256583547,
+      "grad_norm": 0.38745492696762085,
+      "learning_rate": 0.0002512631271878646,
+      "loss": 3.3039,
+      "step": 39950
+    },
+    {
+      "epoch": 11.651654625961314,
+      "grad_norm": 0.3720225393772125,
+      "learning_rate": 0.00025082555425904316,
+      "loss": 3.2965,
+      "step": 40000
+    },
+    {
+      "epoch": 11.651654625961314,
+      "eval_accuracy": 0.3716299568587447,
+      "eval_loss": 3.5464437007904053,
+      "eval_runtime": 180.224,
+      "eval_samples_per_second": 92.352,
+      "eval_steps_per_second": 5.776,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 68660,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.36085640790016e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}