diff --git "a/resemble_to_hit_frequency_5039/checkpoint-40000/trainer_state.json" "b/resemble_to_hit_frequency_5039/checkpoint-40000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/resemble_to_hit_frequency_5039/checkpoint-40000/trainer_state.json"
@@ -0,0 +1,6003 @@
+{
+  "best_global_step": 40000,
+  "best_metric": 3.5570318698883057,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_hit_frequency_5039/checkpoint-40000",
+  "epoch": 11.655011655011656,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014568764568764568,
+      "grad_norm": 1.6134361028671265,
+      "learning_rate": 0.000294,
+      "loss": 8.4822,
+      "step": 50
+    },
+    {
+      "epoch": 0.029137529137529136,
+      "grad_norm": 0.6467522382736206,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7172,
+      "step": 100
+    },
+    {
+      "epoch": 0.043706293706293704,
+      "grad_norm": 0.4256949722766876,
+      "learning_rate": 0.0005998285714285713,
+      "loss": 6.3649,
+      "step": 150
+    },
+    {
+      "epoch": 0.05827505827505827,
+      "grad_norm": 0.4922082722187042,
+      "learning_rate": 0.0005996536443148687,
+      "loss": 6.1487,
+      "step": 200
+    },
+    {
+      "epoch": 0.07284382284382285,
+      "grad_norm": 0.4709911346435547,
+      "learning_rate": 0.0005994787172011662,
+      "loss": 6.0174,
+      "step": 250
+    },
+    {
+      "epoch": 0.08741258741258741,
+      "grad_norm": 0.48052042722702026,
+      "learning_rate": 0.0005993037900874635,
+      "loss": 5.8773,
+      "step": 300
+    },
+    {
+      "epoch": 0.10198135198135198,
+      "grad_norm": 0.5878971219062805,
+      "learning_rate": 0.0005991288629737609,
+      "loss": 5.7603,
+      "step": 350
+    },
+    {
+      "epoch": 0.11655011655011654,
+      "grad_norm": 0.4191853702068329,
+      "learning_rate": 0.0005989539358600582,
+      "loss": 5.6447,
+      "step": 400
+    },
+    {
+      "epoch": 0.13111888111888112,
+      "grad_norm": 0.49607983231544495,
+      "learning_rate": 0.0005987790087463557,
+      "loss": 5.5178,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456876456876457,
+      "grad_norm": 0.4248947501182556,
+      "learning_rate": 0.000598604081632653,
+      "loss": 5.4198,
+      "step": 500
+    },
+    {
+      "epoch": 0.16025641025641027,
+      "grad_norm": 0.48206138610839844,
+      "learning_rate": 0.0005984291545189504,
+      "loss": 5.3384,
+      "step": 550
+    },
+    {
+      "epoch": 0.17482517482517482,
+      "grad_norm": 0.4465309679508209,
+      "learning_rate": 0.0005982542274052477,
+      "loss": 5.2645,
+      "step": 600
+    },
+    {
+      "epoch": 0.1893939393939394,
+      "grad_norm": 0.42823970317840576,
+      "learning_rate": 0.0005980793002915452,
+      "loss": 5.2088,
+      "step": 650
+    },
+    {
+      "epoch": 0.20396270396270397,
+      "grad_norm": 0.4172956943511963,
+      "learning_rate": 0.0005979043731778425,
+      "loss": 5.1419,
+      "step": 700
+    },
+    {
+      "epoch": 0.21853146853146854,
+      "grad_norm": 0.424402117729187,
+      "learning_rate": 0.0005977294460641399,
+      "loss": 5.0645,
+      "step": 750
+    },
+    {
+      "epoch": 0.2331002331002331,
+      "grad_norm": 0.4406491816043854,
+      "learning_rate": 0.0005975545189504372,
+      "loss": 5.0224,
+      "step": 800
+    },
+    {
+      "epoch": 0.24766899766899766,
+      "grad_norm": 0.4717820882797241,
+      "learning_rate": 0.0005973795918367347,
+      "loss": 5.003,
+      "step": 850
+    },
+    {
+      "epoch": 0.26223776223776224,
+      "grad_norm": 0.4521999657154083,
+      "learning_rate": 0.000597204664723032,
+      "loss": 4.9142,
+      "step": 900
+    },
+    {
+      "epoch": 0.2768065268065268,
+      "grad_norm": 0.4754863679409027,
+      "learning_rate": 0.0005970297376093294,
+      "loss": 4.8762,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913752913752914,
+      "grad_norm": 0.41961297392845154,
+      "learning_rate": 0.0005968548104956268,
+      "loss": 4.8402,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913752913752914,
+      "eval_accuracy": 0.2549595710849709,
+      "eval_loss": 4.753758430480957,
+      "eval_runtime": 180.4427,
+      "eval_samples_per_second": 92.229,
+      "eval_steps_per_second": 5.769,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30594405594405594,
+      "grad_norm": 0.6563605070114136,
+      "learning_rate": 0.0005966798833819242,
+      "loss": 4.7815,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32051282051282054,
+      "grad_norm": 0.4702153503894806,
+      "learning_rate": 0.0005965049562682215,
+      "loss": 4.7461,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350815850815851,
+      "grad_norm": 0.4264092743396759,
+      "learning_rate": 0.0005963300291545189,
+      "loss": 4.6878,
+      "step": 1150
+    },
+    {
+      "epoch": 0.34965034965034963,
+      "grad_norm": 0.4903077185153961,
+      "learning_rate": 0.0005961551020408162,
+      "loss": 4.6656,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36421911421911424,
+      "grad_norm": 0.4931991994380951,
+      "learning_rate": 0.0005959801749271137,
+      "loss": 4.6333,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3787878787878788,
+      "grad_norm": 0.43286022543907166,
+      "learning_rate": 0.000595805247813411,
+      "loss": 4.6066,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39335664335664333,
+      "grad_norm": 0.40360233187675476,
+      "learning_rate": 0.0005956303206997084,
+      "loss": 4.5706,
+      "step": 1350
+    },
+    {
+      "epoch": 0.40792540792540793,
+      "grad_norm": 0.420491486787796,
+      "learning_rate": 0.0005954553935860059,
+      "loss": 4.5591,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4224941724941725,
+      "grad_norm": 0.4152667820453644,
+      "learning_rate": 0.0005952804664723032,
+      "loss": 4.5331,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4370629370629371,
+      "grad_norm": 0.4153015613555908,
+      "learning_rate": 0.0005951055393586005,
+      "loss": 4.5102,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45163170163170163,
+      "grad_norm": 0.4187549352645874,
+      "learning_rate": 0.0005949306122448979,
+      "loss": 4.4927,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4662004662004662,
+      "grad_norm": 0.4402385354042053,
+      "learning_rate": 0.0005947556851311952,
+      "loss": 4.4652,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4807692307692308,
+      "grad_norm": 0.41887184977531433,
+      "learning_rate": 0.0005945807580174927,
+      "loss": 4.4431,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49533799533799533,
+      "grad_norm": 0.4349214434623718,
+      "learning_rate": 0.00059440583090379,
+      "loss": 4.4302,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5099067599067599,
+      "grad_norm": 0.416457861661911,
+      "learning_rate": 0.0005942309037900874,
+      "loss": 4.4188,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5244755244755245,
+      "grad_norm": 0.3888656198978424,
+      "learning_rate": 0.0005940559766763847,
+      "loss": 4.3824,
+      "step": 1800
+    },
+    {
+      "epoch": 0.539044289044289,
+      "grad_norm": 0.38429805636405945,
+      "learning_rate": 0.0005938810495626822,
+      "loss": 4.38,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5536130536130536,
+      "grad_norm": 0.4373445510864258,
+      "learning_rate": 0.0005937061224489796,
+      "loss": 4.3664,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5681818181818182,
+      "grad_norm": 0.43909236788749695,
+      "learning_rate": 0.0005935311953352769,
+      "loss": 4.3394,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5827505827505828,
+      "grad_norm": 0.3650919795036316,
+      "learning_rate": 0.0005933562682215743,
+      "loss": 4.3353,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5827505827505828,
+      "eval_accuracy": 0.30004649542771444,
+      "eval_loss": 4.282717704772949,
+      "eval_runtime": 180.3042,
+      "eval_samples_per_second": 92.3,
+      "eval_steps_per_second": 5.774,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5973193473193473,
+      "grad_norm": 0.4164070188999176,
+      "learning_rate": 0.0005931813411078717,
+      "loss": 4.3193,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6118881118881119,
+      "grad_norm": 0.370237797498703,
+      "learning_rate": 0.000593006413994169,
+      "loss": 4.3105,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6264568764568764,
+      "grad_norm": 0.4082745611667633,
+      "learning_rate": 0.0005928314868804664,
+      "loss": 4.2941,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6410256410256411,
+      "grad_norm": 0.3935624957084656,
+      "learning_rate": 0.0005926565597667638,
+      "loss": 4.2833,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6555944055944056,
+      "grad_norm": 0.3984358310699463,
+      "learning_rate": 0.0005924816326530612,
+      "loss": 4.2679,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6701631701631702,
+      "grad_norm": 0.3854668140411377,
+      "learning_rate": 0.0005923067055393586,
+      "loss": 4.2746,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6847319347319347,
+      "grad_norm": 0.37700507044792175,
+      "learning_rate": 0.0005921317784256559,
+      "loss": 4.2429,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6993006993006993,
+      "grad_norm": 0.3662063777446747,
+      "learning_rate": 0.0005919568513119533,
+      "loss": 4.2584,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7138694638694638,
+      "grad_norm": 0.38568246364593506,
+      "learning_rate": 0.0005917819241982507,
+      "loss": 4.2344,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7284382284382285,
+      "grad_norm": 0.43455713987350464,
+      "learning_rate": 0.000591606997084548,
+      "loss": 4.2328,
+      "step": 2500
+    },
+    {
+      "epoch": 0.743006993006993,
+      "grad_norm": 0.3856061100959778,
+      "learning_rate": 0.0005914320699708454,
+      "loss": 4.2092,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.38549911975860596,
+      "learning_rate": 0.0005912571428571428,
+      "loss": 4.218,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7721445221445221,
+      "grad_norm": 0.39746782183647156,
+      "learning_rate": 0.0005910822157434402,
+      "loss": 4.1968,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7867132867132867,
+      "grad_norm": 0.39923539757728577,
+      "learning_rate": 0.0005909072886297376,
+      "loss": 4.1886,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8012820512820513,
+      "grad_norm": 0.3931720554828644,
+      "learning_rate": 0.0005907323615160349,
+      "loss": 4.1767,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8158508158508159,
+      "grad_norm": 0.3459513783454895,
+      "learning_rate": 0.0005905574344023324,
+      "loss": 4.1785,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8304195804195804,
+      "grad_norm": 0.39997783303260803,
+      "learning_rate": 0.0005903825072886297,
+      "loss": 4.1691,
+      "step": 2850
+    },
+    {
+      "epoch": 0.844988344988345,
+      "grad_norm": 0.3813166618347168,
+      "learning_rate": 0.000590207580174927,
+      "loss": 4.1596,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8595571095571095,
+      "grad_norm": 0.37703993916511536,
+      "learning_rate": 0.0005900326530612244,
+      "loss": 4.1536,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8741258741258742,
+      "grad_norm": 0.3787032961845398,
+      "learning_rate": 0.0005898577259475218,
+      "loss": 4.1411,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8741258741258742,
+      "eval_accuracy": 0.31596194853706383,
+      "eval_loss": 4.094162940979004,
+      "eval_runtime": 180.2783,
+      "eval_samples_per_second": 92.313,
+      "eval_steps_per_second": 5.774,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8886946386946387,
+      "grad_norm": 0.35312145948410034,
+      "learning_rate": 0.0005896827988338192,
+      "loss": 4.1438,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9032634032634033,
+      "grad_norm": 0.38526853919029236,
+      "learning_rate": 0.0005895078717201166,
+      "loss": 4.1301,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9178321678321678,
+      "grad_norm": 0.36466994881629944,
+      "learning_rate": 0.000589332944606414,
+      "loss": 4.1235,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9324009324009324,
+      "grad_norm": 0.3571998178958893,
+      "learning_rate": 0.0005891580174927114,
+      "loss": 4.1292,
+      "step": 3200
+    },
+    {
+      "epoch": 0.946969696969697,
+      "grad_norm": 0.3403795063495636,
+      "learning_rate": 0.0005889830903790087,
+      "loss": 4.1082,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9615384615384616,
+      "grad_norm": 0.38671016693115234,
+      "learning_rate": 0.000588808163265306,
+      "loss": 4.096,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9761072261072261,
+      "grad_norm": 0.3343498408794403,
+      "learning_rate": 0.0005886332361516035,
+      "loss": 4.0914,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9906759906759907,
+      "grad_norm": 0.3740348815917969,
+      "learning_rate": 0.0005884583090379008,
+      "loss": 4.1069,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0052447552447552,
+      "grad_norm": 0.37788498401641846,
+      "learning_rate": 0.0005882833819241982,
+      "loss": 4.0461,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0198135198135199,
+      "grad_norm": 0.3331277072429657,
+      "learning_rate": 0.0005881084548104955,
+      "loss": 4.0172,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0343822843822843,
+      "grad_norm": 0.38395169377326965,
+      "learning_rate": 0.000587933527696793,
+      "loss": 4.014,
+      "step": 3550
+    },
+    {
+      "epoch": 1.048951048951049,
+      "grad_norm": 0.35354799032211304,
+      "learning_rate": 0.0005877586005830904,
+      "loss": 4.0174,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0635198135198136,
+      "grad_norm": 0.35139200091362,
+      "learning_rate": 0.0005875836734693877,
+      "loss": 4.0323,
+      "step": 3650
+    },
+    {
+      "epoch": 1.078088578088578,
+      "grad_norm": 0.36168310046195984,
+      "learning_rate": 0.0005874087463556851,
+      "loss": 4.0024,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0926573426573427,
+      "grad_norm": 0.3537745773792267,
+      "learning_rate": 0.0005872338192419825,
+      "loss": 4.0106,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1072261072261071,
+      "grad_norm": 0.3509289026260376,
+      "learning_rate": 0.0005870588921282798,
+      "loss": 3.9951,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1217948717948718,
+      "grad_norm": 0.3399880826473236,
+      "learning_rate": 0.0005868839650145772,
+      "loss": 3.9968,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1363636363636362,
+      "grad_norm": 0.31629034876823425,
+      "learning_rate": 0.0005867090379008745,
+      "loss": 3.9957,
+      "step": 3900
+    },
+    {
+      "epoch": 1.150932400932401,
+      "grad_norm": 0.34750989079475403,
+      "learning_rate": 0.000586534110787172,
+      "loss": 3.9916,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1655011655011656,
+      "grad_norm": 0.34375834465026855,
+      "learning_rate": 0.0005863591836734694,
+      "loss": 3.994,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1655011655011656,
+      "eval_accuracy": 0.3255772359138492,
+      "eval_loss": 3.989028215408325,
+      "eval_runtime": 180.1395,
+      "eval_samples_per_second": 92.384,
+      "eval_steps_per_second": 5.779,
+      "step": 4000
+    },
+    {
+      "epoch": 1.18006993006993,
+      "grad_norm": 0.37413716316223145,
+      "learning_rate": 0.0005861842565597667,
+      "loss": 3.9981,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1946386946386947,
+      "grad_norm": 0.35629984736442566,
+      "learning_rate": 0.0005860093294460641,
+      "loss": 3.975,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2092074592074593,
+      "grad_norm": 0.3381233215332031,
+      "learning_rate": 0.0005858344023323615,
+      "loss": 3.9851,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2237762237762237,
+      "grad_norm": 0.34585943818092346,
+      "learning_rate": 0.0005856594752186588,
+      "loss": 3.9808,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2383449883449884,
+      "grad_norm": 0.35993272066116333,
+      "learning_rate": 0.0005854845481049562,
+      "loss": 3.975,
+      "step": 4250
+    },
+    {
+      "epoch": 1.2529137529137528,
+      "grad_norm": 0.32540130615234375,
+      "learning_rate": 0.0005853096209912535,
+      "loss": 3.9714,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2674825174825175,
+      "grad_norm": 0.36224445700645447,
+      "learning_rate": 0.000585134693877551,
+      "loss": 3.9754,
+      "step": 4350
+    },
+    {
+      "epoch": 1.282051282051282,
+      "grad_norm": 0.3662620186805725,
+      "learning_rate": 0.0005849597667638484,
+      "loss": 3.9694,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2966200466200466,
+      "grad_norm": 0.35438838601112366,
+      "learning_rate": 0.0005847848396501457,
+      "loss": 3.9519,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3111888111888113,
+      "grad_norm": 0.34450942277908325,
+      "learning_rate": 0.0005846099125364432,
+      "loss": 3.963,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3257575757575757,
+      "grad_norm": 0.351962685585022,
+      "learning_rate": 0.0005844349854227405,
+      "loss": 3.9591,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3403263403263403,
+      "grad_norm": 0.3839578926563263,
+      "learning_rate": 0.0005842600583090379,
+      "loss": 3.9561,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354895104895105,
+      "grad_norm": 0.32113179564476013,
+      "learning_rate": 0.0005840851311953352,
+      "loss": 3.949,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3694638694638694,
+      "grad_norm": 0.33071938157081604,
+      "learning_rate": 0.0005839102040816325,
+      "loss": 3.9608,
+      "step": 4700
+    },
+    {
+      "epoch": 1.384032634032634,
+      "grad_norm": 0.33803558349609375,
+      "learning_rate": 0.00058373527696793,
+      "loss": 3.9482,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3986013986013985,
+      "grad_norm": 0.31636884808540344,
+      "learning_rate": 0.0005835603498542273,
+      "loss": 3.9437,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4131701631701632,
+      "grad_norm": 0.3646225035190582,
+      "learning_rate": 0.0005833854227405247,
+      "loss": 3.9303,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4277389277389276,
+      "grad_norm": 0.3559642732143402,
+      "learning_rate": 0.0005832104956268222,
+      "loss": 3.9403,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4423076923076923,
+      "grad_norm": 0.3481752276420593,
+      "learning_rate": 0.0005830355685131195,
+      "loss": 3.9357,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456876456876457,
+      "grad_norm": 0.313125878572464,
+      "learning_rate": 0.0005828606413994169,
+      "loss": 3.9303,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456876456876457,
+      "eval_accuracy": 0.3321257535516557,
+      "eval_loss": 3.9129536151885986,
+      "eval_runtime": 180.4532,
+      "eval_samples_per_second": 92.223,
+      "eval_steps_per_second": 5.769,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4714452214452214,
+      "grad_norm": 0.33051010966300964,
+      "learning_rate": 0.0005826857142857142,
+      "loss": 3.9226,
+      "step": 5050
+    },
+    {
+      "epoch": 1.486013986013986,
+      "grad_norm": 0.3060428500175476,
+      "learning_rate": 0.0005825107871720116,
+      "loss": 3.9254,
+      "step": 5100
+    },
+    {
+      "epoch": 1.5005827505827507,
+      "grad_norm": 0.34262314438819885,
+      "learning_rate": 0.000582335860058309,
+      "loss": 3.9131,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.33539673686027527,
+      "learning_rate": 0.0005821609329446063,
+      "loss": 3.9158,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5297202797202796,
+      "grad_norm": 0.3277048170566559,
+      "learning_rate": 0.0005819860058309037,
+      "loss": 3.9228,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5442890442890445,
+      "grad_norm": 0.31714221835136414,
+      "learning_rate": 0.0005818110787172012,
+      "loss": 3.9245,
+      "step": 5300
+    },
+    {
+      "epoch": 1.558857808857809,
+      "grad_norm": 0.329098105430603,
+      "learning_rate": 0.0005816361516034985,
+      "loss": 3.9212,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5734265734265733,
+      "grad_norm": 0.33248335123062134,
+      "learning_rate": 0.0005814612244897959,
+      "loss": 3.9066,
+      "step": 5400
+    },
+    {
+      "epoch": 1.587995337995338,
+      "grad_norm": 0.3300471305847168,
+      "learning_rate": 0.0005812862973760932,
+      "loss": 3.9076,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6025641025641026,
+      "grad_norm": 0.3110630214214325,
+      "learning_rate": 0.0005811113702623907,
+      "loss": 3.8996,
+      "step": 5500
+    },
+    {
+      "epoch": 1.617132867132867,
+      "grad_norm": 0.34096479415893555,
+      "learning_rate": 0.000580936443148688,
+      "loss": 3.8914,
+      "step": 5550
+    },
+    {
+      "epoch": 1.6317016317016317,
+      "grad_norm": 0.3256978690624237,
+      "learning_rate": 0.0005807615160349853,
+      "loss": 3.8901,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6462703962703964,
+      "grad_norm": 0.3170398771762848,
+      "learning_rate": 0.0005805865889212827,
+      "loss": 3.9086,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6608391608391608,
+      "grad_norm": 0.32134151458740234,
+      "learning_rate": 0.0005804116618075802,
+      "loss": 3.8843,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6754079254079253,
+      "grad_norm": 0.3455315828323364,
+      "learning_rate": 0.0005802367346938775,
+      "loss": 3.8936,
+      "step": 5750
+    },
+    {
+      "epoch": 1.68997668997669,
+      "grad_norm": 0.33487361669540405,
+      "learning_rate": 0.0005800618075801749,
+      "loss": 3.903,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7045454545454546,
+      "grad_norm": 0.3249671459197998,
+      "learning_rate": 0.0005798868804664722,
+      "loss": 3.8913,
+      "step": 5850
+    },
+    {
+      "epoch": 1.719114219114219,
+      "grad_norm": 0.35598769783973694,
+      "learning_rate": 0.0005797119533527697,
+      "loss": 3.8821,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7336829836829837,
+      "grad_norm": 0.34034013748168945,
+      "learning_rate": 0.000579537026239067,
+      "loss": 3.8849,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7482517482517483,
+      "grad_norm": 0.33674389123916626,
+      "learning_rate": 0.0005793620991253643,
+      "loss": 3.8992,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7482517482517483,
+      "eval_accuracy": 0.33753787307759514,
+      "eval_loss": 3.85577654838562,
+      "eval_runtime": 180.1447,
+      "eval_samples_per_second": 92.381,
+      "eval_steps_per_second": 5.779,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7628205128205128,
+      "grad_norm": 0.32885122299194336,
+      "learning_rate": 0.0005791871720116617,
+      "loss": 3.8805,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7773892773892774,
+      "grad_norm": 0.32068461179733276,
+      "learning_rate": 0.0005790122448979591,
+      "loss": 3.8668,
+      "step": 6100
+    },
+    {
+      "epoch": 1.791958041958042,
+      "grad_norm": 0.3308079242706299,
+      "learning_rate": 0.0005788373177842565,
+      "loss": 3.8776,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8065268065268065,
+      "grad_norm": 0.32728639245033264,
+      "learning_rate": 0.0005786623906705539,
+      "loss": 3.8633,
+      "step": 6200
+    },
+    {
+      "epoch": 1.821095571095571,
+      "grad_norm": 0.3404487073421478,
+      "learning_rate": 0.0005784874635568512,
+      "loss": 3.8712,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8356643356643356,
+      "grad_norm": 0.32237741351127625,
+      "learning_rate": 0.0005783125364431487,
+      "loss": 3.8582,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8502331002331003,
+      "grad_norm": 0.3479669392108917,
+      "learning_rate": 0.000578137609329446,
+      "loss": 3.8647,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8648018648018647,
+      "grad_norm": 0.3184560239315033,
+      "learning_rate": 0.0005779626822157434,
+      "loss": 3.847,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8793706293706294,
+      "grad_norm": 0.3197358548641205,
+      "learning_rate": 0.0005777877551020408,
+      "loss": 3.8617,
+      "step": 6450
+    },
+    {
+      "epoch": 1.893939393939394,
+      "grad_norm": 0.2957116663455963,
+      "learning_rate": 0.0005776128279883381,
+      "loss": 3.854,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9085081585081585,
+      "grad_norm": 0.3220060169696808,
+      "learning_rate": 0.0005774379008746355,
+      "loss": 3.851,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 0.3108726441860199,
+      "learning_rate": 0.0005772629737609329,
+      "loss": 3.8559,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9376456876456878,
+      "grad_norm": 0.33560416102409363,
+      "learning_rate": 0.0005770880466472303,
+      "loss": 3.8505,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9522144522144522,
+      "grad_norm": 0.33253157138824463,
+      "learning_rate": 0.0005769131195335277,
+      "loss": 3.8468,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9667832167832167,
+      "grad_norm": 0.3143483102321625,
+      "learning_rate": 0.000576738192419825,
+      "loss": 3.8416,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9813519813519813,
+      "grad_norm": 0.32564249634742737,
+      "learning_rate": 0.0005765632653061224,
+      "loss": 3.843,
+      "step": 6800
+    },
+    {
+      "epoch": 1.995920745920746,
+      "grad_norm": 0.33519217371940613,
+      "learning_rate": 0.0005763883381924198,
+      "loss": 3.8431,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0104895104895104,
+      "grad_norm": 0.32294219732284546,
+      "learning_rate": 0.0005762134110787171,
+      "loss": 3.7722,
+      "step": 6900
+    },
+    {
+      "epoch": 2.025058275058275,
+      "grad_norm": 0.3262682557106018,
+      "learning_rate": 0.0005760384839650145,
+      "loss": 3.7428,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0396270396270397,
+      "grad_norm": 0.3397265374660492,
+      "learning_rate": 0.0005758635568513119,
+      "loss": 3.7487,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0396270396270397,
+      "eval_accuracy": 0.3415744146738347,
+      "eval_loss": 3.8127987384796143,
+      "eval_runtime": 180.1348,
+      "eval_samples_per_second": 92.386,
+      "eval_steps_per_second": 5.779,
+      "step": 7000
+    },
+    {
+      "epoch": 2.054195804195804,
+      "grad_norm": 0.3330610990524292,
+      "learning_rate": 0.0005756886297376093,
+      "loss": 3.7405,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0687645687645686,
+      "grad_norm": 0.3221195638179779,
+      "learning_rate": 0.0005755137026239067,
+      "loss": 3.7561,
+      "step": 7100
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": 0.32453685998916626,
+      "learning_rate": 0.000575338775510204,
+      "loss": 3.7532,
+      "step": 7150
+    },
+    {
+      "epoch": 2.097902097902098,
+      "grad_norm": 0.3615976870059967,
+      "learning_rate": 0.0005751638483965014,
+      "loss": 3.7618,
+      "step": 7200
+    },
+    {
+      "epoch": 2.1124708624708624,
+      "grad_norm": 0.323742538690567,
+      "learning_rate": 0.0005749889212827988,
+      "loss": 3.7508,
+      "step": 7250
+    },
+    {
+      "epoch": 2.1270396270396272,
+      "grad_norm": 0.3381347954273224,
+      "learning_rate": 0.0005748139941690962,
+      "loss": 3.7588,
+      "step": 7300
+    },
+    {
+      "epoch": 2.1416083916083917,
+      "grad_norm": 0.3426363468170166,
+      "learning_rate": 0.0005746390670553935,
+      "loss": 3.7579,
+      "step": 7350
+    },
+    {
+      "epoch": 2.156177156177156,
+      "grad_norm": 0.31964731216430664,
+      "learning_rate": 0.000574464139941691,
+      "loss": 3.7528,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1707459207459205,
+      "grad_norm": 0.3354383111000061,
+      "learning_rate": 0.0005742892128279883,
+      "loss": 3.7556,
+      "step": 7450
+    },
+    {
+      "epoch": 2.1853146853146854,
+      "grad_norm": 0.3251858353614807,
+      "learning_rate": 0.0005741142857142857,
+      "loss": 3.7556,
+      "step": 7500
+    },
+    {
+      "epoch": 2.19988344988345,
+      "grad_norm": 0.3399089276790619,
+      "learning_rate": 0.000573939358600583,
+      "loss": 3.7415,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2144522144522143,
+      "grad_norm": 0.3444349467754364,
+      "learning_rate": 0.0005737644314868805,
+      "loss": 3.7515,
+      "step": 7600
+    },
+    {
+      "epoch": 2.229020979020979,
+      "grad_norm": 0.31715652346611023,
+      "learning_rate": 0.0005735895043731778,
+      "loss": 3.7618,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2435897435897436,
+      "grad_norm": 0.34369540214538574,
+      "learning_rate": 0.0005734145772594752,
+      "loss": 3.7687,
+      "step": 7700
+    },
+    {
+      "epoch": 2.258158508158508,
+      "grad_norm": 0.3494495153427124,
+      "learning_rate": 0.0005732396501457726,
+      "loss": 3.748,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.31449177861213684,
+      "learning_rate": 0.0005730647230320698,
+      "loss": 3.7541,
+      "step": 7800
+    },
+    {
+      "epoch": 2.2872960372960374,
+      "grad_norm": 0.3397660553455353,
+      "learning_rate": 0.0005728897959183673,
+      "loss": 3.7624,
+      "step": 7850
+    },
+    {
+      "epoch": 2.301864801864802,
+      "grad_norm": 0.34240466356277466,
+      "learning_rate": 0.0005727148688046647,
+      "loss": 3.7432,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3164335664335667,
+      "grad_norm": 0.3217261731624603,
+      "learning_rate": 0.000572539941690962,
+      "loss": 3.7499,
+      "step": 7950
+    },
+    {
+      "epoch": 2.331002331002331,
+      "grad_norm": 0.3246598243713379,
+      "learning_rate": 0.0005723650145772595,
+      "loss": 3.7619,
+      "step": 8000
+    },
+    {
+      "epoch": 2.331002331002331,
+      "eval_accuracy": 0.34476242059382917,
+      "eval_loss": 3.7828927040100098,
+      "eval_runtime": 179.9962,
+      "eval_samples_per_second": 92.458,
+      "eval_steps_per_second": 5.783,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3455710955710956,
+      "grad_norm": 0.3367806673049927,
+      "learning_rate": 0.0005721900874635568,
+      "loss": 3.7485,
+      "step": 8050
+    },
+    {
+      "epoch": 2.36013986013986,
+      "grad_norm": 0.3171541392803192,
+      "learning_rate": 0.0005720151603498542,
+      "loss": 3.7546,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374708624708625,
+      "grad_norm": 0.33225518465042114,
+      "learning_rate": 0.0005718402332361515,
+      "loss": 3.7429,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3892773892773893,
+      "grad_norm": 0.3193056881427765,
+      "learning_rate": 0.000571665306122449,
+      "loss": 3.7622,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4038461538461537,
+      "grad_norm": 0.3187880218029022,
+      "learning_rate": 0.0005714903790087463,
+      "loss": 3.7435,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4184149184149186,
+      "grad_norm": 0.33991068601608276,
+      "learning_rate": 0.0005713154518950437,
+      "loss": 3.7494,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432983682983683,
+      "grad_norm": 0.3092400133609772,
+      "learning_rate": 0.000571140524781341,
+      "loss": 3.7612,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4475524475524475,
+      "grad_norm": 0.31092721223831177,
+      "learning_rate": 0.0005709655976676385,
+      "loss": 3.7488,
+      "step": 8400
+    },
+    {
+      "epoch": 2.462121212121212,
+      "grad_norm": 0.32930874824523926,
+      "learning_rate": 0.0005707906705539358,
+      "loss": 3.758,
+      "step": 8450
+    },
+    {
+      "epoch": 2.476689976689977,
+      "grad_norm": 0.32361528277397156,
+      "learning_rate": 0.0005706157434402332,
+      "loss": 3.7454,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4912587412587412,
+      "grad_norm": 0.33115440607070923,
+      "learning_rate": 0.0005704408163265305,
+      "loss": 3.7402,
+      "step": 8550
+    },
+    {
+      "epoch": 2.5058275058275057,
+      "grad_norm": 0.328485369682312,
+      "learning_rate": 0.000570265889212828,
+      "loss": 3.7372,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5203962703962706,
+      "grad_norm": 0.35709500312805176,
+      "learning_rate": 0.0005700909620991253,
+      "loss": 3.7461,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534965034965035,
+      "grad_norm": 0.32163530588150024,
+      "learning_rate": 0.0005699160349854227,
+      "loss": 3.7541,
+      "step": 8700
+    },
+    {
+      "epoch": 2.5495337995337994,
+      "grad_norm": 0.31789329648017883,
+      "learning_rate": 0.00056974110787172,
+      "loss": 3.7438,
+      "step": 8750
+    },
+    {
+      "epoch": 2.564102564102564,
+      "grad_norm": 0.3170648515224457,
+      "learning_rate": 0.0005695661807580175,
+      "loss": 3.7557,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5786713286713288,
+      "grad_norm": 0.3424239158630371,
+      "learning_rate": 0.0005693912536443148,
+      "loss": 3.7398,
+      "step": 8850
+    },
+    {
+      "epoch": 2.593240093240093,
+      "grad_norm": 0.318135529756546,
+      "learning_rate": 0.0005692163265306122,
+      "loss": 3.7284,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607808857808858,
+      "grad_norm": 0.33802515268325806,
+      "learning_rate": 0.0005690413994169095,
+      "loss": 3.738,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6223776223776225,
+      "grad_norm": 0.32018738985061646,
+      "learning_rate": 0.000568866472303207,
+      "loss": 3.74,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6223776223776225,
+      "eval_accuracy": 0.3477287677347602,
+      "eval_loss": 3.751537799835205,
+      "eval_runtime": 180.2979,
+      "eval_samples_per_second": 92.303,
+      "eval_steps_per_second": 5.774,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636946386946387,
+      "grad_norm": 0.3212384283542633,
+      "learning_rate": 0.0005686915451895044,
+      "loss": 3.7381,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6515151515151514,
+      "grad_norm": 0.3253323435783386,
+      "learning_rate": 0.0005685166180758016,
+      "loss": 3.739,
+      "step": 9100
+    },
+    {
+      "epoch": 2.666083916083916,
+      "grad_norm": 0.3387431502342224,
+      "learning_rate": 0.000568341690962099,
+      "loss": 3.7248,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6806526806526807,
+      "grad_norm": 0.32496801018714905,
+      "learning_rate": 0.0005681667638483965,
+      "loss": 3.7298,
+      "step": 9200
+    },
+    {
+      "epoch": 2.695221445221445,
+      "grad_norm": 0.32816433906555176,
+      "learning_rate": 0.0005679918367346938,
+      "loss": 3.7296,
+      "step": 9250
+    },
+    {
+      "epoch": 2.70979020979021,
+      "grad_norm": 0.3408059775829315,
+      "learning_rate": 0.0005678169096209912,
+      "loss": 3.7364,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7243589743589745,
+      "grad_norm": 0.33964434266090393,
+      "learning_rate": 0.0005676419825072885,
+      "loss": 3.7332,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738927738927739,
+      "grad_norm": 0.31630218029022217,
+      "learning_rate": 0.000567467055393586,
+      "loss": 3.7283,
+      "step": 9400
+    },
+    {
+      "epoch": 2.7534965034965033,
+      "grad_norm": 0.34303176403045654,
+      "learning_rate": 0.0005672921282798833,
+      "loss": 3.7337,
+      "step": 9450
+    },
+    {
+      "epoch": 2.768065268065268,
+      "grad_norm": 0.30772241950035095,
+      "learning_rate": 0.0005671172011661807,
+      "loss": 3.7223,
+      "step": 9500
+    },
+    {
+      "epoch": 2.7826340326340326,
+      "grad_norm": 0.3346325755119324,
+      "learning_rate": 0.000566942274052478,
+      "loss": 3.7366,
+      "step": 9550
+    },
+    {
+      "epoch": 2.797202797202797,
+      "grad_norm": 0.321429580450058,
+      "learning_rate": 0.0005667673469387755,
+      "loss": 3.7289,
+      "step": 9600
+    },
+    {
+      "epoch": 2.811771561771562,
+      "grad_norm": 0.3273778259754181,
+      "learning_rate": 0.0005665924198250728,
+      "loss": 3.7253,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8263403263403264,
+      "grad_norm": 0.33299872279167175,
+      "learning_rate": 0.0005664174927113702,
+      "loss": 3.7264,
+      "step": 9700
+    },
+    {
+      "epoch": 2.840909090909091,
+      "grad_norm": 0.31705546379089355,
+      "learning_rate": 0.0005662425655976676,
+      "loss": 3.7263,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8554778554778553,
+      "grad_norm": 0.34314480423927307,
+      "learning_rate": 0.000566067638483965,
+      "loss": 3.7151,
+      "step": 9800
+    },
+    {
+      "epoch": 2.87004662004662,
+      "grad_norm": 0.32017573714256287,
+      "learning_rate": 0.0005658927113702623,
+      "loss": 3.7329,
+      "step": 9850
+    },
+    {
+      "epoch": 2.8846153846153846,
+      "grad_norm": 0.31930816173553467,
+      "learning_rate": 0.0005657177842565597,
+      "loss": 3.7235,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8991841491841495,
+      "grad_norm": 0.31949570775032043,
+      "learning_rate": 0.0005655428571428572,
+      "loss": 3.7227,
+      "step": 9950
+    },
+    {
+      "epoch": 2.913752913752914,
+      "grad_norm": 0.30999991297721863,
+      "learning_rate": 0.0005653679300291545,
+      "loss": 3.7152,
+      "step": 10000
+    },
+    {
+      "epoch": 2.913752913752914,
+      "eval_accuracy": 0.34985880864932545,
+      "eval_loss": 3.7270307540893555,
+      "eval_runtime": 180.2671,
+      "eval_samples_per_second": 92.319,
+      "eval_steps_per_second": 5.775,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9283216783216783,
+      "grad_norm": 0.3184822201728821,
+      "learning_rate": 0.0005651930029154518,
+      "loss": 3.7289,
+      "step": 10050
+    },
+    {
+      "epoch": 2.9428904428904428,
+      "grad_norm": 0.31392183899879456,
+      "learning_rate": 0.0005650180758017492,
+      "loss": 3.7275,
+      "step": 10100
+    },
+    {
+      "epoch": 2.957459207459207,
+      "grad_norm": 0.3100379407405853,
+      "learning_rate": 0.0005648431486880466,
+      "loss": 3.7078,
+      "step": 10150
+    },
+    {
+      "epoch": 2.972027972027972,
+      "grad_norm": 0.3107777237892151,
+      "learning_rate": 0.000564668221574344,
+      "loss": 3.7191,
+      "step": 10200
+    },
+    {
+      "epoch": 2.9865967365967365,
+      "grad_norm": 0.31457746028900146,
+      "learning_rate": 0.0005644932944606413,
+      "loss": 3.7216,
+      "step": 10250
+    },
+    {
+      "epoch": 3.001165501165501,
+      "grad_norm": 0.3300207555294037,
+      "learning_rate": 0.0005643183673469387,
+      "loss": 3.7241,
+      "step": 10300
+    },
+    {
+      "epoch": 3.015734265734266,
+      "grad_norm": 0.33615049719810486,
+      "learning_rate": 0.0005641434402332362,
+      "loss": 3.6097,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0303030303030303,
+      "grad_norm": 0.32839542627334595,
+      "learning_rate": 0.0005639685131195335,
+      "loss": 3.6192,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0448717948717947,
+      "grad_norm": 0.32775548100471497,
+      "learning_rate": 0.0005637935860058308,
+      "loss": 3.6201,
+      "step": 10450
+    },
+    {
+      "epoch": 3.0594405594405596,
+      "grad_norm": 0.3305208086967468,
+      "learning_rate": 0.0005636186588921282,
+      "loss": 3.6244,
+      "step": 10500
+    },
+    {
+      "epoch": 3.074009324009324,
+      "grad_norm": 0.3248291015625,
+      "learning_rate": 0.0005634437317784256,
+      "loss": 3.6289,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0885780885780885,
+      "grad_norm": 0.334089070558548,
+      "learning_rate": 0.000563268804664723,
+      "loss": 3.6128,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1031468531468533,
+      "grad_norm": 0.33667150139808655,
+      "learning_rate": 0.0005630938775510203,
+      "loss": 3.6316,
+      "step": 10650
+    },
+    {
+      "epoch": 3.117715617715618,
+      "grad_norm": 0.3139183223247528,
+      "learning_rate": 0.0005629189504373177,
+      "loss": 3.6267,
+      "step": 10700
+    },
+    {
+      "epoch": 3.132284382284382,
+      "grad_norm": 0.3240184187889099,
+      "learning_rate": 0.0005627440233236151,
+      "loss": 3.6155,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1468531468531467,
+      "grad_norm": 0.3177716135978699,
+      "learning_rate": 0.0005625690962099125,
+      "loss": 3.6157,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1614219114219115,
+      "grad_norm": 0.32491302490234375,
+      "learning_rate": 0.0005623941690962099,
+      "loss": 3.6529,
+      "step": 10850
+    },
+    {
+      "epoch": 3.175990675990676,
+      "grad_norm": 0.3269357681274414,
+      "learning_rate": 0.0005622192419825073,
+      "loss": 3.6252,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1905594405594404,
+      "grad_norm": 0.33358559012413025,
+      "learning_rate": 0.0005620443148688046,
+      "loss": 3.6477,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2051282051282053,
+      "grad_norm": 0.32112857699394226,
+      "learning_rate": 0.000561869387755102,
+      "loss": 3.6367,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2051282051282053,
+      "eval_accuracy": 0.3516898159961675,
+      "eval_loss": 3.7140629291534424,
+      "eval_runtime": 180.3296,
+      "eval_samples_per_second": 92.287,
+      "eval_steps_per_second": 5.773,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2196969696969697,
+      "grad_norm": 0.328512042760849,
+      "learning_rate": 0.0005616944606413993,
+      "loss": 3.6396,
+      "step": 11050
+    },
+    {
+      "epoch": 3.234265734265734,
+      "grad_norm": 0.3449825644493103,
+      "learning_rate": 0.0005615195335276968,
+      "loss": 3.6327,
+      "step": 11100
+    },
+    {
+      "epoch": 3.248834498834499,
+      "grad_norm": 0.32266926765441895,
+      "learning_rate": 0.0005613446064139941,
+      "loss": 3.6382,
+      "step": 11150
+    },
+    {
+      "epoch": 3.2634032634032635,
+      "grad_norm": 0.3263072073459625,
+      "learning_rate": 0.0005611696793002915,
+      "loss": 3.6265,
+      "step": 11200
+    },
+    {
+      "epoch": 3.277972027972028,
+      "grad_norm": 0.32438746094703674,
+      "learning_rate": 0.0005609947521865889,
+      "loss": 3.6519,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2925407925407923,
+      "grad_norm": 0.3556417226791382,
+      "learning_rate": 0.0005608198250728863,
+      "loss": 3.6388,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3071095571095572,
+      "grad_norm": 0.31459367275238037,
+      "learning_rate": 0.0005606448979591836,
+      "loss": 3.6413,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3216783216783217,
+      "grad_norm": 0.3164815902709961,
+      "learning_rate": 0.000560469970845481,
+      "loss": 3.6394,
+      "step": 11400
+    },
+    {
+      "epoch": 3.336247086247086,
+      "grad_norm": 0.3238040804862976,
+      "learning_rate": 0.0005602950437317783,
+      "loss": 3.639,
+      "step": 11450
+    },
+    {
+      "epoch": 3.350815850815851,
+      "grad_norm": 0.31536027789115906,
+      "learning_rate": 0.0005601201166180758,
+      "loss": 3.651,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3653846153846154,
+      "grad_norm": 0.3251273036003113,
+      "learning_rate": 0.0005599451895043731,
+      "loss": 3.6398,
+      "step": 11550
+    },
+    {
+      "epoch": 3.37995337995338,
+      "grad_norm": 0.3183720111846924,
+      "learning_rate": 0.0005597702623906705,
+      "loss": 3.6425,
+      "step": 11600
+    },
+    {
+      "epoch": 3.3945221445221447,
+      "grad_norm": 0.3452969193458557,
+      "learning_rate": 0.0005595953352769679,
+      "loss": 3.6396,
+      "step": 11650
+    },
+    {
+      "epoch": 3.409090909090909,
+      "grad_norm": 0.31187903881073,
+      "learning_rate": 0.0005594204081632653,
+      "loss": 3.6399,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4236596736596736,
+      "grad_norm": 0.3159955143928528,
+      "learning_rate": 0.0005592454810495627,
+      "loss": 3.6371,
+      "step": 11750
+    },
+    {
+      "epoch": 3.438228438228438,
+      "grad_norm": 0.3242449462413788,
+      "learning_rate": 0.00055907055393586,
+      "loss": 3.6376,
+      "step": 11800
+    },
+    {
+      "epoch": 3.452797202797203,
+      "grad_norm": 0.33960285782814026,
+      "learning_rate": 0.0005588956268221573,
+      "loss": 3.6397,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4673659673659674,
+      "grad_norm": 0.34514838457107544,
+      "learning_rate": 0.0005587206997084548,
+      "loss": 3.6349,
+      "step": 11900
+    },
+    {
+      "epoch": 3.481934731934732,
+      "grad_norm": 0.33326658606529236,
+      "learning_rate": 0.0005585457725947521,
+      "loss": 3.6432,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4965034965034967,
+      "grad_norm": 0.3219590187072754,
+      "learning_rate": 0.0005583708454810495,
+      "loss": 3.642,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4965034965034967,
+      "eval_accuracy": 0.35353681570054407,
+      "eval_loss": 3.697685480117798,
+      "eval_runtime": 180.3495,
+      "eval_samples_per_second": 92.276,
+      "eval_steps_per_second": 5.772,
+      "step": 12000
+    },
+    {
+      "epoch": 3.511072261072261,
+      "grad_norm": 0.315857470035553,
+      "learning_rate": 0.0005581959183673468,
+      "loss": 3.6484,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5256410256410255,
+      "grad_norm": 0.33714818954467773,
+      "learning_rate": 0.0005580209912536443,
+      "loss": 3.6465,
+      "step": 12100
+    },
+    {
+      "epoch": 3.54020979020979,
+      "grad_norm": 0.3196263909339905,
+      "learning_rate": 0.0005578460641399417,
+      "loss": 3.6444,
+      "step": 12150
+    },
+    {
+      "epoch": 3.554778554778555,
+      "grad_norm": 0.34034839272499084,
+      "learning_rate": 0.000557671137026239,
+      "loss": 3.6403,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5693473193473193,
+      "grad_norm": 0.32852211594581604,
+      "learning_rate": 0.0005574962099125363,
+      "loss": 3.6461,
+      "step": 12250
+    },
+    {
+      "epoch": 3.583916083916084,
+      "grad_norm": 0.3598001003265381,
+      "learning_rate": 0.0005573212827988338,
+      "loss": 3.6392,
+      "step": 12300
+    },
+    {
+      "epoch": 3.5984848484848486,
+      "grad_norm": 0.3342962861061096,
+      "learning_rate": 0.0005571463556851311,
+      "loss": 3.6414,
+      "step": 12350
+    },
+    {
+      "epoch": 3.613053613053613,
+      "grad_norm": 0.316803514957428,
+      "learning_rate": 0.0005569714285714285,
+      "loss": 3.6486,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6276223776223775,
+      "grad_norm": 0.31796908378601074,
+      "learning_rate": 0.0005567965014577258,
+      "loss": 3.6369,
+      "step": 12450
+    },
+    {
+      "epoch": 3.642191142191142,
+      "grad_norm": 0.309007465839386,
+      "learning_rate": 0.0005566215743440233,
+      "loss": 3.6429,
+      "step": 12500
+    },
+    {
+      "epoch": 3.656759906759907,
+      "grad_norm": 0.3321513831615448,
+      "learning_rate": 0.0005564466472303207,
+      "loss": 3.6487,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6713286713286712,
+      "grad_norm": 0.35138118267059326,
+      "learning_rate": 0.000556271720116618,
+      "loss": 3.6527,
+      "step": 12600
+    },
+    {
+      "epoch": 3.685897435897436,
+      "grad_norm": 0.3067615032196045,
+      "learning_rate": 0.0005560967930029155,
+      "loss": 3.6444,
+      "step": 12650
+    },
+    {
+      "epoch": 3.7004662004662006,
+      "grad_norm": 0.33694183826446533,
+      "learning_rate": 0.0005559218658892128,
+      "loss": 3.6325,
+      "step": 12700
+    },
+    {
+      "epoch": 3.715034965034965,
+      "grad_norm": 0.31776705384254456,
+      "learning_rate": 0.0005557469387755101,
+      "loss": 3.6527,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7296037296037294,
+      "grad_norm": 0.3377169668674469,
+      "learning_rate": 0.0005555720116618075,
+      "loss": 3.6424,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7441724941724943,
+      "grad_norm": 0.3101692199707031,
+      "learning_rate": 0.0005553970845481049,
+      "loss": 3.6359,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7587412587412588,
+      "grad_norm": 0.3166581392288208,
+      "learning_rate": 0.0005552221574344023,
+      "loss": 3.6416,
+      "step": 12900
+    },
+    {
+      "epoch": 3.773310023310023,
+      "grad_norm": 0.31438636779785156,
+      "learning_rate": 0.0005550472303206997,
+      "loss": 3.6336,
+      "step": 12950
+    },
+    {
+      "epoch": 3.787878787878788,
+      "grad_norm": 0.3247930705547333,
+      "learning_rate": 0.000554872303206997,
+      "loss": 3.6416,
+      "step": 13000
+    },
+    {
+      "epoch": 3.787878787878788,
+      "eval_accuracy": 0.3546786229921654,
+      "eval_loss": 3.6786322593688965,
+      "eval_runtime": 180.419,
+      "eval_samples_per_second": 92.241,
+      "eval_steps_per_second": 5.77,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8024475524475525,
+      "grad_norm": 0.3598824441432953,
+      "learning_rate": 0.0005546973760932945,
+      "loss": 3.6428,
+      "step": 13050
+    },
+    {
+      "epoch": 3.817016317016317,
+      "grad_norm": 0.32811933755874634,
+      "learning_rate": 0.0005545224489795918,
+      "loss": 3.6448,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8315850815850814,
+      "grad_norm": 0.3222385346889496,
+      "learning_rate": 0.0005543475218658891,
+      "loss": 3.6489,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8461538461538463,
+      "grad_norm": 0.326913058757782,
+      "learning_rate": 0.0005541725947521865,
+      "loss": 3.6217,
+      "step": 13200
+    },
+    {
+      "epoch": 3.8607226107226107,
+      "grad_norm": 0.31770044565200806,
+      "learning_rate": 0.0005539976676384839,
+      "loss": 3.6383,
+      "step": 13250
+    },
+    {
+      "epoch": 3.875291375291375,
+      "grad_norm": 0.3197103440761566,
+      "learning_rate": 0.0005538227405247813,
+      "loss": 3.6432,
+      "step": 13300
+    },
+    {
+      "epoch": 3.88986013986014,
+      "grad_norm": 0.33483409881591797,
+      "learning_rate": 0.0005536478134110787,
+      "loss": 3.6325,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9044289044289044,
+      "grad_norm": 0.3026617765426636,
+      "learning_rate": 0.000553472886297376,
+      "loss": 3.6343,
+      "step": 13400
+    },
+    {
+      "epoch": 3.918997668997669,
+      "grad_norm": 0.2976735532283783,
+      "learning_rate": 0.0005532979591836735,
+      "loss": 3.6483,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9335664335664333,
+      "grad_norm": 0.3455604612827301,
+      "learning_rate": 0.0005531230320699708,
+      "loss": 3.6413,
+      "step": 13500
+    },
+    {
+      "epoch": 3.948135198135198,
+      "grad_norm": 0.3204672932624817,
+      "learning_rate": 0.0005529481049562682,
+      "loss": 3.6384,
+      "step": 13550
+    },
+    {
+      "epoch": 3.9627039627039626,
+      "grad_norm": 0.340648889541626,
+      "learning_rate": 0.0005527731778425655,
+      "loss": 3.6425,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9772727272727275,
+      "grad_norm": 0.3379724323749542,
+      "learning_rate": 0.0005525982507288629,
+      "loss": 3.6327,
+      "step": 13650
+    },
+    {
+      "epoch": 3.991841491841492,
+      "grad_norm": 0.3036077320575714,
+      "learning_rate": 0.0005524233236151603,
+      "loss": 3.6375,
+      "step": 13700
+    },
+    {
+      "epoch": 4.006410256410256,
+      "grad_norm": 0.34318360686302185,
+      "learning_rate": 0.0005522483965014576,
+      "loss": 3.5803,
+      "step": 13750
+    },
+    {
+      "epoch": 4.020979020979021,
+      "grad_norm": 0.3264276087284088,
+      "learning_rate": 0.000552073469387755,
+      "loss": 3.5362,
+      "step": 13800
+    },
+    {
+      "epoch": 4.035547785547785,
+      "grad_norm": 0.3238934278488159,
+      "learning_rate": 0.0005518985422740525,
+      "loss": 3.5332,
+      "step": 13850
+    },
+    {
+      "epoch": 4.05011655011655,
+      "grad_norm": 0.32926997542381287,
+      "learning_rate": 0.0005517236151603498,
+      "loss": 3.5372,
+      "step": 13900
+    },
+    {
+      "epoch": 4.064685314685315,
+      "grad_norm": 0.32314813137054443,
+      "learning_rate": 0.0005515486880466472,
+      "loss": 3.5272,
+      "step": 13950
+    },
+    {
+      "epoch": 4.0792540792540795,
+      "grad_norm": 0.3332814872264862,
+      "learning_rate": 0.0005513737609329446,
+      "loss": 3.5382,
+      "step": 14000
+    },
+    {
+      "epoch": 4.0792540792540795,
+      "eval_accuracy": 0.3562127134068402,
+      "eval_loss": 3.6715903282165527,
+      "eval_runtime": 180.15,
+      "eval_samples_per_second": 92.379,
+      "eval_steps_per_second": 5.779,
+      "step": 14000
+    },
+    {
+      "epoch": 4.093822843822844,
+      "grad_norm": 0.33132901787757874,
+      "learning_rate": 0.0005511988338192419,
+      "loss": 3.549,
+      "step": 14050
+    },
+    {
+      "epoch": 4.108391608391608,
+      "grad_norm": 0.32595717906951904,
+      "learning_rate": 0.0005510239067055393,
+      "loss": 3.5445,
+      "step": 14100
+    },
+    {
+      "epoch": 4.122960372960373,
+      "grad_norm": 0.3297913670539856,
+      "learning_rate": 0.0005508489795918366,
+      "loss": 3.5392,
+      "step": 14150
+    },
+    {
+      "epoch": 4.137529137529137,
+      "grad_norm": 0.35622304677963257,
+      "learning_rate": 0.0005506740524781341,
+      "loss": 3.5387,
+      "step": 14200
+    },
+    {
+      "epoch": 4.1520979020979025,
+      "grad_norm": 0.32156145572662354,
+      "learning_rate": 0.0005504991253644315,
+      "loss": 3.5461,
+      "step": 14250
+    },
+    {
+      "epoch": 4.166666666666667,
+      "grad_norm": null,
+      "learning_rate": 0.0005503241982507288,
+      "loss": 3.5555,
+      "step": 14300
+    },
+    {
+      "epoch": 4.181235431235431,
+      "grad_norm": 0.32054704427719116,
+      "learning_rate": 0.0005501492711370262,
+      "loss": 3.5634,
+      "step": 14350
+    },
+    {
+      "epoch": 4.195804195804196,
+      "grad_norm": 0.3304331302642822,
+      "learning_rate": 0.0005499743440233236,
+      "loss": 3.557,
+      "step": 14400
+    },
+    {
+      "epoch": 4.21037296037296,
+      "grad_norm": 0.33280083537101746,
+      "learning_rate": 0.000549799416909621,
+      "loss": 3.5636,
+      "step": 14450
+    },
+    {
+      "epoch": 4.224941724941725,
+      "grad_norm": 0.3097744584083557,
+      "learning_rate": 0.0005496244897959183,
+      "loss": 3.5591,
+      "step": 14500
+    },
+    {
+      "epoch": 4.239510489510489,
+      "grad_norm": 0.3197658658027649,
+      "learning_rate": 0.0005494495626822156,
+      "loss": 3.5661,
+      "step": 14550
+    },
+    {
+      "epoch": 4.2540792540792545,
+      "grad_norm": 0.3759899437427521,
+      "learning_rate": 0.0005492746355685131,
+      "loss": 3.5621,
+      "step": 14600
+    },
+    {
+      "epoch": 4.268648018648019,
+      "grad_norm": 0.34865570068359375,
+      "learning_rate": 0.0005490997084548105,
+      "loss": 3.5642,
+      "step": 14650
+    },
+    {
+      "epoch": 4.283216783216783,
+      "grad_norm": 0.3441263735294342,
+      "learning_rate": 0.0005489247813411078,
+      "loss": 3.5676,
+      "step": 14700
+    },
+    {
+      "epoch": 4.297785547785548,
+      "grad_norm": 0.33596622943878174,
+      "learning_rate": 0.0005487498542274052,
+      "loss": 3.5693,
+      "step": 14750
+    },
+    {
+      "epoch": 4.312354312354312,
+      "grad_norm": 0.3372125029563904,
+      "learning_rate": 0.0005485749271137026,
+      "loss": 3.5674,
+      "step": 14800
+    },
+    {
+      "epoch": 4.326923076923077,
+      "grad_norm": 0.3590675890445709,
+      "learning_rate": 0.0005484,
+      "loss": 3.5677,
+      "step": 14850
+    },
+    {
+      "epoch": 4.341491841491841,
+      "grad_norm": 0.3344537615776062,
+      "learning_rate": 0.0005482250728862973,
+      "loss": 3.5582,
+      "step": 14900
+    },
+    {
+      "epoch": 4.356060606060606,
+      "grad_norm": 0.3320492208003998,
+      "learning_rate": 0.0005480501457725946,
+      "loss": 3.5648,
+      "step": 14950
+    },
+    {
+      "epoch": 4.370629370629371,
+      "grad_norm": 0.336557537317276,
+      "learning_rate": 0.0005478752186588921,
+      "loss": 3.5647,
+      "step": 15000
+    },
+    {
+      "epoch": 4.370629370629371,
+      "eval_accuracy": 0.3576591986276676,
+      "eval_loss": 3.6587440967559814,
+      "eval_runtime": 180.1159,
+      "eval_samples_per_second": 92.396,
+      "eval_steps_per_second": 5.78,
+      "step": 15000
+    },
+    {
+      "epoch": 4.385198135198135,
+      "grad_norm": 0.3224494159221649,
+      "learning_rate": 0.0005477002915451894,
+      "loss": 3.5672,
+      "step": 15050
+    },
+    {
+      "epoch": 4.3997668997669,
+      "grad_norm": 0.3005123734474182,
+      "learning_rate": 0.0005475253644314868,
+      "loss": 3.567,
+      "step": 15100
+    },
+    {
+      "epoch": 4.414335664335664,
+      "grad_norm": 0.31343790888786316,
+      "learning_rate": 0.0005473504373177842,
+      "loss": 3.5575,
+      "step": 15150
+    },
+    {
+      "epoch": 4.428904428904429,
+      "grad_norm": 0.3172782361507416,
+      "learning_rate": 0.0005471755102040816,
+      "loss": 3.5665,
+      "step": 15200
+    },
+    {
+      "epoch": 4.443473193473194,
+      "grad_norm": 0.32356658577919006,
+      "learning_rate": 0.000547000583090379,
+      "loss": 3.5787,
+      "step": 15250
+    },
+    {
+      "epoch": 4.458041958041958,
+      "grad_norm": 0.35370585322380066,
+      "learning_rate": 0.0005468256559766763,
+      "loss": 3.5677,
+      "step": 15300
+    },
+    {
+      "epoch": 4.472610722610723,
+      "grad_norm": 0.32113948464393616,
+      "learning_rate": 0.0005466507288629738,
+      "loss": 3.5588,
+      "step": 15350
+    },
+    {
+      "epoch": 4.487179487179487,
+      "grad_norm": 0.3236648738384247,
+      "learning_rate": 0.0005464758017492711,
+      "loss": 3.5723,
+      "step": 15400
+    },
+    {
+      "epoch": 4.501748251748252,
+      "grad_norm": 0.32024386525154114,
+      "learning_rate": 0.0005463008746355684,
+      "loss": 3.582,
+      "step": 15450
+    },
+    {
+      "epoch": 4.516317016317016,
+      "grad_norm": 0.34335383772850037,
+      "learning_rate": 0.0005461259475218658,
+      "loss": 3.5728,
+      "step": 15500
+    },
+    {
+      "epoch": 4.5308857808857805,
+      "grad_norm": 0.3075568377971649,
+      "learning_rate": 0.0005459510204081633,
+      "loss": 3.5652,
+      "step": 15550
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 0.3292197585105896,
+      "learning_rate": 0.0005457760932944606,
+      "loss": 3.565,
+      "step": 15600
+    },
+    {
+      "epoch": 4.56002331002331,
+      "grad_norm": 0.35107719898223877,
+      "learning_rate": 0.000545601166180758,
+      "loss": 3.5702,
+      "step": 15650
+    },
+    {
+      "epoch": 4.574592074592075,
+      "grad_norm": 0.3471798598766327,
+      "learning_rate": 0.0005454262390670553,
+      "loss": 3.5681,
+      "step": 15700
+    },
+    {
+      "epoch": 4.589160839160839,
+      "grad_norm": 0.31821051239967346,
+      "learning_rate": 0.0005452513119533528,
+      "loss": 3.582,
+      "step": 15750
+    },
+    {
+      "epoch": 4.603729603729604,
+      "grad_norm": 0.3309209644794464,
+      "learning_rate": 0.0005450763848396501,
+      "loss": 3.5883,
+      "step": 15800
+    },
+    {
+      "epoch": 4.618298368298368,
+      "grad_norm": 0.33727866411209106,
+      "learning_rate": 0.0005449014577259474,
+      "loss": 3.5817,
+      "step": 15850
+    },
+    {
+      "epoch": 4.632867132867133,
+      "grad_norm": 0.3144679069519043,
+      "learning_rate": 0.0005447265306122448,
+      "loss": 3.5724,
+      "step": 15900
+    },
+    {
+      "epoch": 4.647435897435898,
+      "grad_norm": 0.32342618703842163,
+      "learning_rate": 0.0005445516034985423,
+      "loss": 3.5855,
+      "step": 15950
+    },
+    {
+      "epoch": 4.662004662004662,
+      "grad_norm": 0.3141750395298004,
+      "learning_rate": 0.0005443766763848396,
+      "loss": 3.5807,
+      "step": 16000
+    },
+    {
+      "epoch": 4.662004662004662,
+      "eval_accuracy": 0.3584899780834147,
+      "eval_loss": 3.6451687812805176,
+      "eval_runtime": 180.0666,
+      "eval_samples_per_second": 92.421,
+      "eval_steps_per_second": 5.781,
+      "step": 16000
+    },
+    {
+      "epoch": 4.676573426573427,
+      "grad_norm": 0.318861186504364,
+      "learning_rate": 0.000544201749271137,
+      "loss": 3.5705,
+      "step": 16050
+    },
+    {
+      "epoch": 4.691142191142191,
+      "grad_norm": 0.31984490156173706,
+      "learning_rate": 0.0005440268221574343,
+      "loss": 3.5858,
+      "step": 16100
+    },
+    {
+      "epoch": 4.7057109557109555,
+      "grad_norm": 0.3313526511192322,
+      "learning_rate": 0.0005438518950437318,
+      "loss": 3.5778,
+      "step": 16150
+    },
+    {
+      "epoch": 4.72027972027972,
+      "grad_norm": 0.332089900970459,
+      "learning_rate": 0.0005436769679300291,
+      "loss": 3.5776,
+      "step": 16200
+    },
+    {
+      "epoch": 4.734848484848484,
+      "grad_norm": 0.33302974700927734,
+      "learning_rate": 0.0005435020408163265,
+      "loss": 3.5832,
+      "step": 16250
+    },
+    {
+      "epoch": 4.74941724941725,
+      "grad_norm": 0.3242354691028595,
+      "learning_rate": 0.0005433271137026238,
+      "loss": 3.5848,
+      "step": 16300
+    },
+    {
+      "epoch": 4.763986013986014,
+      "grad_norm": 0.3078085482120514,
+      "learning_rate": 0.0005431521865889212,
+      "loss": 3.5824,
+      "step": 16350
+    },
+    {
+      "epoch": 4.778554778554779,
+      "grad_norm": 0.3317912220954895,
+      "learning_rate": 0.0005429772594752186,
+      "loss": 3.5782,
+      "step": 16400
+    },
+    {
+      "epoch": 4.793123543123543,
+      "grad_norm": 0.30730515718460083,
+      "learning_rate": 0.000542802332361516,
+      "loss": 3.5779,
+      "step": 16450
+    },
+    {
+      "epoch": 4.8076923076923075,
+      "grad_norm": 0.35136038064956665,
+      "learning_rate": 0.0005426274052478133,
+      "loss": 3.583,
+      "step": 16500
+    },
+    {
+      "epoch": 4.822261072261072,
+      "grad_norm": 0.3428604304790497,
+      "learning_rate": 0.0005424524781341108,
+      "loss": 3.578,
+      "step": 16550
+    },
+    {
+      "epoch": 4.836829836829837,
+      "grad_norm": 0.3045051693916321,
+      "learning_rate": 0.0005422775510204081,
+      "loss": 3.5811,
+      "step": 16600
+    },
+    {
+      "epoch": 4.851398601398602,
+      "grad_norm": 0.3164063096046448,
+      "learning_rate": 0.0005421026239067055,
+      "loss": 3.5821,
+      "step": 16650
+    },
+    {
+      "epoch": 4.865967365967366,
+      "grad_norm": 0.33561450242996216,
+      "learning_rate": 0.0005419276967930028,
+      "loss": 3.5749,
+      "step": 16700
+    },
+    {
+      "epoch": 4.880536130536131,
+      "grad_norm": 0.3375592529773712,
+      "learning_rate": 0.0005417527696793002,
+      "loss": 3.5713,
+      "step": 16750
+    },
+    {
+      "epoch": 4.895104895104895,
+      "grad_norm": 0.3262588083744049,
+      "learning_rate": 0.0005415778425655976,
+      "loss": 3.5773,
+      "step": 16800
+    },
+    {
+      "epoch": 4.909673659673659,
+      "grad_norm": 0.33031025528907776,
+      "learning_rate": 0.000541402915451895,
+      "loss": 3.5719,
+      "step": 16850
+    },
+    {
+      "epoch": 4.924242424242424,
+      "grad_norm": 0.32215115427970886,
+      "learning_rate": 0.0005412279883381923,
+      "loss": 3.5679,
+      "step": 16900
+    },
+    {
+      "epoch": 4.938811188811189,
+      "grad_norm": 0.3194146156311035,
+      "learning_rate": 0.0005410530612244898,
+      "loss": 3.5837,
+      "step": 16950
+    },
+    {
+      "epoch": 4.953379953379954,
+      "grad_norm": 0.3187941312789917,
+      "learning_rate": 0.0005408781341107871,
+      "loss": 3.5693,
+      "step": 17000
+    },
+    {
+      "epoch": 4.953379953379954,
+      "eval_accuracy": 0.35988401777879797,
+      "eval_loss": 3.632936477661133,
+      "eval_runtime": 180.0341,
+      "eval_samples_per_second": 92.438,
+      "eval_steps_per_second": 5.782,
+      "step": 17000
+    },
+    {
+      "epoch": 4.967948717948718,
+      "grad_norm": 0.32214635610580444,
+      "learning_rate": 0.0005407032069970845,
+      "loss": 3.5817,
+      "step": 17050
+    },
+    {
+      "epoch": 4.9825174825174825,
+      "grad_norm": 0.3381812870502472,
+      "learning_rate": 0.0005405282798833819,
+      "loss": 3.5721,
+      "step": 17100
+    },
+    {
+      "epoch": 4.997086247086247,
+      "grad_norm": 0.328273206949234,
+      "learning_rate": 0.0005403533527696793,
+      "loss": 3.5866,
+      "step": 17150
+    },
+    {
+      "epoch": 5.011655011655011,
+      "grad_norm": 0.32486042380332947,
+      "learning_rate": 0.0005401784256559766,
+      "loss": 3.4864,
+      "step": 17200
+    },
+    {
+      "epoch": 5.026223776223776,
+      "grad_norm": 0.3191656172275543,
+      "learning_rate": 0.000540003498542274,
+      "loss": 3.4736,
+      "step": 17250
+    },
+    {
+      "epoch": 5.040792540792541,
+      "grad_norm": 0.3504127264022827,
+      "learning_rate": 0.0005398285714285714,
+      "loss": 3.469,
+      "step": 17300
+    },
+    {
+      "epoch": 5.055361305361306,
+      "grad_norm": 0.3454863727092743,
+      "learning_rate": 0.0005396536443148688,
+      "loss": 3.4665,
+      "step": 17350
+    },
+    {
+      "epoch": 5.06993006993007,
+      "grad_norm": 0.30901169776916504,
+      "learning_rate": 0.0005394787172011661,
+      "loss": 3.4741,
+      "step": 17400
+    },
+    {
+      "epoch": 5.084498834498834,
+      "grad_norm": 0.33311742544174194,
+      "learning_rate": 0.0005393037900874635,
+      "loss": 3.4876,
+      "step": 17450
+    },
+    {
+      "epoch": 5.099067599067599,
+      "grad_norm": 0.33518463373184204,
+      "learning_rate": 0.0005391288629737609,
+      "loss": 3.4755,
+      "step": 17500
+    },
+    {
+      "epoch": 5.113636363636363,
+      "grad_norm": 0.33938467502593994,
+      "learning_rate": 0.0005389539358600583,
+      "loss": 3.489,
+      "step": 17550
+    },
+    {
+      "epoch": 5.128205128205128,
+      "grad_norm": 0.3346013128757477,
+      "learning_rate": 0.0005387790087463557,
+      "loss": 3.4899,
+      "step": 17600
+    },
+    {
+      "epoch": 5.142773892773893,
+      "grad_norm": 0.3396677076816559,
+      "learning_rate": 0.0005386040816326529,
+      "loss": 3.4791,
+      "step": 17650
+    },
+    {
+      "epoch": 5.1573426573426575,
+      "grad_norm": 0.32493624091148376,
+      "learning_rate": 0.0005384291545189504,
+      "loss": 3.4999,
+      "step": 17700
+    },
+    {
+      "epoch": 5.171911421911422,
+      "grad_norm": 0.34523579478263855,
+      "learning_rate": 0.0005382542274052478,
+      "loss": 3.4942,
+      "step": 17750
+    },
+    {
+      "epoch": 5.186480186480186,
+      "grad_norm": 0.34241601824760437,
+      "learning_rate": 0.0005380793002915451,
+      "loss": 3.4986,
+      "step": 17800
+    },
+    {
+      "epoch": 5.201048951048951,
+      "grad_norm": 0.3449043035507202,
+      "learning_rate": 0.0005379043731778425,
+      "loss": 3.5096,
+      "step": 17850
+    },
+    {
+      "epoch": 5.215617715617715,
+      "grad_norm": 0.33027029037475586,
+      "learning_rate": 0.0005377294460641399,
+      "loss": 3.5021,
+      "step": 17900
+    },
+    {
+      "epoch": 5.230186480186481,
+      "grad_norm": 0.33586353063583374,
+      "learning_rate": 0.0005375545189504373,
+      "loss": 3.4964,
+      "step": 17950
+    },
+    {
+      "epoch": 5.244755244755245,
+      "grad_norm": 0.3348841071128845,
+      "learning_rate": 0.0005373795918367346,
+      "loss": 3.5152,
+      "step": 18000
+    },
+    {
+      "epoch": 5.244755244755245,
+      "eval_accuracy": 0.3607333765910926,
+      "eval_loss": 3.6339569091796875,
+      "eval_runtime": 180.5183,
+      "eval_samples_per_second": 92.19,
+      "eval_steps_per_second": 5.767,
+      "step": 18000
+    },
+    {
+      "epoch": 5.2593240093240095,
+      "grad_norm": 0.33033329248428345,
+      "learning_rate": 0.000537204664723032,
+      "loss": 3.4922,
+      "step": 18050
+    },
+    {
+      "epoch": 5.273892773892774,
+      "grad_norm": 0.32480764389038086,
+      "learning_rate": 0.0005370297376093294,
+      "loss": 3.5049,
+      "step": 18100
+    },
+    {
+      "epoch": 5.288461538461538,
+      "grad_norm": 0.3114669919013977,
+      "learning_rate": 0.0005368548104956268,
+      "loss": 3.5045,
+      "step": 18150
+    },
+    {
+      "epoch": 5.303030303030303,
+      "grad_norm": 0.32912948727607727,
+      "learning_rate": 0.0005366798833819241,
+      "loss": 3.5039,
+      "step": 18200
+    },
+    {
+      "epoch": 5.317599067599067,
+      "grad_norm": 0.325888067483902,
+      "learning_rate": 0.0005365049562682215,
+      "loss": 3.5107,
+      "step": 18250
+    },
+    {
+      "epoch": 5.3321678321678325,
+      "grad_norm": 0.3258603811264038,
+      "learning_rate": 0.0005363300291545189,
+      "loss": 3.5079,
+      "step": 18300
+    },
+    {
+      "epoch": 5.346736596736597,
+      "grad_norm": 0.34344643354415894,
+      "learning_rate": 0.0005361551020408163,
+      "loss": 3.5056,
+      "step": 18350
+    },
+    {
+      "epoch": 5.361305361305361,
+      "grad_norm": 0.34246399998664856,
+      "learning_rate": 0.0005359801749271136,
+      "loss": 3.5118,
+      "step": 18400
+    },
+    {
+      "epoch": 5.375874125874126,
+      "grad_norm": 0.35261663794517517,
+      "learning_rate": 0.000535805247813411,
+      "loss": 3.5154,
+      "step": 18450
+    },
+    {
+      "epoch": 5.39044289044289,
+      "grad_norm": 0.33429020643234253,
+      "learning_rate": 0.0005356303206997085,
+      "loss": 3.515,
+      "step": 18500
+    },
+    {
+      "epoch": 5.405011655011655,
+      "grad_norm": 0.3388688266277313,
+      "learning_rate": 0.0005354553935860058,
+      "loss": 3.5011,
+      "step": 18550
+    },
+    {
+      "epoch": 5.41958041958042,
+      "grad_norm": 0.31441932916641235,
+      "learning_rate": 0.0005352804664723031,
+      "loss": 3.524,
+      "step": 18600
+    },
+    {
+      "epoch": 5.4341491841491845,
+      "grad_norm": 0.33346623182296753,
+      "learning_rate": 0.0005351055393586006,
+      "loss": 3.5096,
+      "step": 18650
+    },
+    {
+      "epoch": 5.448717948717949,
+      "grad_norm": 0.3645952045917511,
+      "learning_rate": 0.0005349306122448979,
+      "loss": 3.5162,
+      "step": 18700
+    },
+    {
+      "epoch": 5.463286713286713,
+      "grad_norm": 0.3252617120742798,
+      "learning_rate": 0.0005347556851311953,
+      "loss": 3.5166,
+      "step": 18750
+    },
+    {
+      "epoch": 5.477855477855478,
+      "grad_norm": 0.32356569170951843,
+      "learning_rate": 0.0005345807580174926,
+      "loss": 3.5259,
+      "step": 18800
+    },
+    {
+      "epoch": 5.492424242424242,
+      "grad_norm": 0.32452526688575745,
+      "learning_rate": 0.0005344058309037901,
+      "loss": 3.5419,
+      "step": 18850
+    },
+    {
+      "epoch": 5.506993006993007,
+      "grad_norm": 0.3109516501426697,
+      "learning_rate": 0.0005342309037900875,
+      "loss": 3.523,
+      "step": 18900
+    },
+    {
+      "epoch": 5.521561771561771,
+      "grad_norm": 0.32956892251968384,
+      "learning_rate": 0.0005340559766763848,
+      "loss": 3.5346,
+      "step": 18950
+    },
+    {
+      "epoch": 5.536130536130536,
+      "grad_norm": 0.347649484872818,
+      "learning_rate": 0.0005338810495626821,
+      "loss": 3.5148,
+      "step": 19000
+    },
+    {
+      "epoch": 5.536130536130536,
+      "eval_accuracy": 0.3614395097307616,
+      "eval_loss": 3.624124526977539,
+      "eval_runtime": 180.5076,
+      "eval_samples_per_second": 92.196,
+      "eval_steps_per_second": 5.767,
+      "step": 19000
+    },
+    {
+      "epoch": 5.550699300699301,
+      "grad_norm": 0.34442394971847534,
+      "learning_rate": 0.0005337061224489796,
+      "loss": 3.508,
+      "step": 19050
+    },
+    {
+      "epoch": 5.565268065268065,
+      "grad_norm": 0.3646959960460663,
+      "learning_rate": 0.0005335311953352769,
+      "loss": 3.5272,
+      "step": 19100
+    },
+    {
+      "epoch": 5.57983682983683,
+      "grad_norm": 0.34306755661964417,
+      "learning_rate": 0.0005333562682215743,
+      "loss": 3.5253,
+      "step": 19150
+    },
+    {
+      "epoch": 5.594405594405594,
+      "grad_norm": 0.34549543261528015,
+      "learning_rate": 0.0005331813411078716,
+      "loss": 3.5349,
+      "step": 19200
+    },
+    {
+      "epoch": 5.608974358974359,
+      "grad_norm": 0.3486803472042084,
+      "learning_rate": 0.0005330064139941691,
+      "loss": 3.518,
+      "step": 19250
+    },
+    {
+      "epoch": 5.623543123543124,
+      "grad_norm": 0.3553147315979004,
+      "learning_rate": 0.0005328314868804665,
+      "loss": 3.5229,
+      "step": 19300
+    },
+    {
+      "epoch": 5.638111888111888,
+      "grad_norm": 0.3389810025691986,
+      "learning_rate": 0.0005326565597667638,
+      "loss": 3.5184,
+      "step": 19350
+    },
+    {
+      "epoch": 5.652680652680653,
+      "grad_norm": 0.3389154076576233,
+      "learning_rate": 0.0005324816326530612,
+      "loss": 3.5242,
+      "step": 19400
+    },
+    {
+      "epoch": 5.667249417249417,
+      "grad_norm": 0.31988218426704407,
+      "learning_rate": 0.0005323067055393586,
+      "loss": 3.5365,
+      "step": 19450
+    },
+    {
+      "epoch": 5.681818181818182,
+      "grad_norm": 0.32239192724227905,
+      "learning_rate": 0.0005321317784256559,
+      "loss": 3.5347,
+      "step": 19500
+    },
+    {
+      "epoch": 5.696386946386946,
+      "grad_norm": 0.3520359694957733,
+      "learning_rate": 0.0005319568513119533,
+      "loss": 3.5332,
+      "step": 19550
+    },
+    {
+      "epoch": 5.7109557109557105,
+      "grad_norm": 0.3352511525154114,
+      "learning_rate": 0.0005317819241982506,
+      "loss": 3.534,
+      "step": 19600
+    },
+    {
+      "epoch": 5.725524475524476,
+      "grad_norm": 0.3281591236591339,
+      "learning_rate": 0.0005316069970845481,
+      "loss": 3.5274,
+      "step": 19650
+    },
+    {
+      "epoch": 5.74009324009324,
+      "grad_norm": 0.33789217472076416,
+      "learning_rate": 0.0005314320699708454,
+      "loss": 3.5266,
+      "step": 19700
+    },
+    {
+      "epoch": 5.754662004662005,
+      "grad_norm": 0.34207120537757874,
+      "learning_rate": 0.0005312571428571428,
+      "loss": 3.5315,
+      "step": 19750
+    },
+    {
+      "epoch": 5.769230769230769,
+      "grad_norm": 0.351068913936615,
+      "learning_rate": 0.0005310822157434403,
+      "loss": 3.5341,
+      "step": 19800
+    },
+    {
+      "epoch": 5.783799533799534,
+      "grad_norm": 0.3352493643760681,
+      "learning_rate": 0.0005309072886297376,
+      "loss": 3.53,
+      "step": 19850
+    },
+    {
+      "epoch": 5.798368298368298,
+      "grad_norm": 0.327741801738739,
+      "learning_rate": 0.0005307323615160349,
+      "loss": 3.5304,
+      "step": 19900
+    },
+    {
+      "epoch": 5.812937062937063,
+      "grad_norm": 0.32836633920669556,
+      "learning_rate": 0.0005305574344023323,
+      "loss": 3.5286,
+      "step": 19950
+    },
+    {
+      "epoch": 5.827505827505828,
+      "grad_norm": 0.3504875600337982,
+      "learning_rate": 0.0005303825072886296,
+      "loss": 3.5384,
+      "step": 20000
+    },
+    {
+      "epoch": 5.827505827505828,
+      "eval_accuracy": 0.3625647367105273,
+      "eval_loss": 3.612804412841797,
+      "eval_runtime": 180.4713,
+      "eval_samples_per_second": 92.214,
+      "eval_steps_per_second": 5.768,
+      "step": 20000
+    },
+    {
+      "epoch": 5.842074592074592,
+      "grad_norm": 0.3540632426738739,
+      "learning_rate": 0.0005302075801749271,
+      "loss": 3.5366,
+      "step": 20050
+    },
+    {
+      "epoch": 5.856643356643357,
+      "grad_norm": 0.34035322070121765,
+      "learning_rate": 0.0005300326530612244,
+      "loss": 3.5303,
+      "step": 20100
+    },
+    {
+      "epoch": 5.871212121212121,
+      "grad_norm": 0.31729087233543396,
+      "learning_rate": 0.0005298577259475218,
+      "loss": 3.5335,
+      "step": 20150
+    },
+    {
+      "epoch": 5.8857808857808855,
+      "grad_norm": 0.3735673427581787,
+      "learning_rate": 0.0005296827988338193,
+      "loss": 3.5277,
+      "step": 20200
+    },
+    {
+      "epoch": 5.90034965034965,
+      "grad_norm": 0.314452201128006,
+      "learning_rate": 0.0005295078717201166,
+      "loss": 3.5406,
+      "step": 20250
+    },
+    {
+      "epoch": 5.914918414918415,
+      "grad_norm": 0.3204086422920227,
+      "learning_rate": 0.000529332944606414,
+      "loss": 3.5359,
+      "step": 20300
+    },
+    {
+      "epoch": 5.92948717948718,
+      "grad_norm": 0.3485746681690216,
+      "learning_rate": 0.0005291580174927113,
+      "loss": 3.5245,
+      "step": 20350
+    },
+    {
+      "epoch": 5.944055944055944,
+      "grad_norm": 0.34968072175979614,
+      "learning_rate": 0.0005289830903790087,
+      "loss": 3.54,
+      "step": 20400
+    },
+    {
+      "epoch": 5.958624708624709,
+      "grad_norm": 0.3806632161140442,
+      "learning_rate": 0.0005288081632653061,
+      "loss": 3.525,
+      "step": 20450
+    },
+    {
+      "epoch": 5.973193473193473,
+      "grad_norm": 0.3304056227207184,
+      "learning_rate": 0.0005286332361516034,
+      "loss": 3.5232,
+      "step": 20500
+    },
+    {
+      "epoch": 5.9877622377622375,
+      "grad_norm": 0.33363205194473267,
+      "learning_rate": 0.0005284583090379008,
+      "loss": 3.5174,
+      "step": 20550
+    },
+    {
+      "epoch": 6.002331002331002,
+      "grad_norm": 0.3507980704307556,
+      "learning_rate": 0.0005282833819241983,
+      "loss": 3.5095,
+      "step": 20600
+    },
+    {
+      "epoch": 6.016899766899767,
+      "grad_norm": 0.3389154374599457,
+      "learning_rate": 0.0005281084548104956,
+      "loss": 3.4025,
+      "step": 20650
+    },
+    {
+      "epoch": 6.031468531468532,
+      "grad_norm": 0.33325284719467163,
+      "learning_rate": 0.000527933527696793,
+      "loss": 3.4253,
+      "step": 20700
+    },
+    {
+      "epoch": 6.046037296037296,
+      "grad_norm": 0.34633567929267883,
+      "learning_rate": 0.0005277586005830903,
+      "loss": 3.4288,
+      "step": 20750
+    },
+    {
+      "epoch": 6.0606060606060606,
+      "grad_norm": 0.33911773562431335,
+      "learning_rate": 0.0005275836734693877,
+      "loss": 3.4302,
+      "step": 20800
+    },
+    {
+      "epoch": 6.075174825174825,
+      "grad_norm": 0.3277522027492523,
+      "learning_rate": 0.0005274087463556851,
+      "loss": 3.4381,
+      "step": 20850
+    },
+    {
+      "epoch": 6.089743589743589,
+      "grad_norm": 0.3419731855392456,
+      "learning_rate": 0.0005272338192419824,
+      "loss": 3.4431,
+      "step": 20900
+    },
+    {
+      "epoch": 6.104312354312355,
+      "grad_norm": 0.35028308629989624,
+      "learning_rate": 0.0005270588921282798,
+      "loss": 3.4435,
+      "step": 20950
+    },
+    {
+      "epoch": 6.118881118881119,
+      "grad_norm": 0.3204551339149475,
+      "learning_rate": 0.0005268839650145772,
+      "loss": 3.4338,
+      "step": 21000
+    },
+    {
+      "epoch": 6.118881118881119,
+      "eval_accuracy": 0.3627538228202005,
+      "eval_loss": 3.615595579147339,
+      "eval_runtime": 180.6199,
+      "eval_samples_per_second": 92.138,
+      "eval_steps_per_second": 5.763,
+      "step": 21000
+    },
+    {
+      "epoch": 6.133449883449884,
+      "grad_norm": 0.3347219228744507,
+      "learning_rate": 0.0005267090379008746,
+      "loss": 3.4486,
+      "step": 21050
+    },
+    {
+      "epoch": 6.148018648018648,
+      "grad_norm": 0.3284785747528076,
+      "learning_rate": 0.000526534110787172,
+      "loss": 3.4548,
+      "step": 21100
+    },
+    {
+      "epoch": 6.1625874125874125,
+      "grad_norm": 0.33264586329460144,
+      "learning_rate": 0.0005263591836734693,
+      "loss": 3.4476,
+      "step": 21150
+    },
+    {
+      "epoch": 6.177156177156177,
+      "grad_norm": 0.3285725712776184,
+      "learning_rate": 0.0005261842565597668,
+      "loss": 3.4675,
+      "step": 21200
+    },
+    {
+      "epoch": 6.191724941724941,
+      "grad_norm": 0.3390142321586609,
+      "learning_rate": 0.0005260093294460641,
+      "loss": 3.455,
+      "step": 21250
+    },
+    {
+      "epoch": 6.206293706293707,
+      "grad_norm": 0.33934858441352844,
+      "learning_rate": 0.0005258344023323614,
+      "loss": 3.4463,
+      "step": 21300
+    },
+    {
+      "epoch": 6.220862470862471,
+      "grad_norm": 0.3672083914279938,
+      "learning_rate": 0.0005256594752186588,
+      "loss": 3.4512,
+      "step": 21350
+    },
+    {
+      "epoch": 6.235431235431236,
+      "grad_norm": 0.3115769624710083,
+      "learning_rate": 0.0005254845481049562,
+      "loss": 3.4634,
+      "step": 21400
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.32785558700561523,
+      "learning_rate": 0.0005253096209912536,
+      "loss": 3.4688,
+      "step": 21450
+    },
+    {
+      "epoch": 6.264568764568764,
+      "grad_norm": 0.3327209949493408,
+      "learning_rate": 0.000525134693877551,
+      "loss": 3.4517,
+      "step": 21500
+    },
+    {
+      "epoch": 6.279137529137529,
+      "grad_norm": 0.34631094336509705,
+      "learning_rate": 0.0005249597667638484,
+      "loss": 3.4574,
+      "step": 21550
+    },
+    {
+      "epoch": 6.293706293706293,
+      "grad_norm": 0.3532359004020691,
+      "learning_rate": 0.0005247848396501458,
+      "loss": 3.4656,
+      "step": 21600
+    },
+    {
+      "epoch": 6.308275058275059,
+      "grad_norm": 0.36950933933258057,
+      "learning_rate": 0.0005246099125364431,
+      "loss": 3.4769,
+      "step": 21650
+    },
+    {
+      "epoch": 6.322843822843823,
+      "grad_norm": 0.336834579706192,
+      "learning_rate": 0.0005244349854227404,
+      "loss": 3.4637,
+      "step": 21700
+    },
+    {
+      "epoch": 6.3374125874125875,
+      "grad_norm": 0.30184629559516907,
+      "learning_rate": 0.0005242600583090379,
+      "loss": 3.4716,
+      "step": 21750
+    },
+    {
+      "epoch": 6.351981351981352,
+      "grad_norm": 0.34009432792663574,
+      "learning_rate": 0.0005240851311953352,
+      "loss": 3.4698,
+      "step": 21800
+    },
+    {
+      "epoch": 6.366550116550116,
+      "grad_norm": 0.32678115367889404,
+      "learning_rate": 0.0005239102040816326,
+      "loss": 3.4706,
+      "step": 21850
+    },
+    {
+      "epoch": 6.381118881118881,
+      "grad_norm": 0.34370940923690796,
+      "learning_rate": 0.00052373527696793,
+      "loss": 3.4649,
+      "step": 21900
+    },
+    {
+      "epoch": 6.395687645687646,
+      "grad_norm": 0.31767651438713074,
+      "learning_rate": 0.0005235603498542274,
+      "loss": 3.4903,
+      "step": 21950
+    },
+    {
+      "epoch": 6.410256410256411,
+      "grad_norm": 0.35483428835868835,
+      "learning_rate": 0.0005233854227405248,
+      "loss": 3.4762,
+      "step": 22000
+    },
+    {
+      "epoch": 6.410256410256411,
+      "eval_accuracy": 0.3631498688509091,
+      "eval_loss": 3.6074860095977783,
+      "eval_runtime": 180.0487,
+      "eval_samples_per_second": 92.431,
+      "eval_steps_per_second": 5.782,
+      "step": 22000
+    },
+    {
+      "epoch": 6.424825174825175,
+      "grad_norm": 0.31931906938552856,
+      "learning_rate": 0.0005232104956268221,
+      "loss": 3.4758,
+      "step": 22050
+    },
+    {
+      "epoch": 6.4393939393939394,
+      "grad_norm": 0.3227771818637848,
+      "learning_rate": 0.0005230355685131195,
+      "loss": 3.4678,
+      "step": 22100
+    },
+    {
+      "epoch": 6.453962703962704,
+      "grad_norm": 0.35156136751174927,
+      "learning_rate": 0.0005228606413994169,
+      "loss": 3.4803,
+      "step": 22150
+    },
+    {
+      "epoch": 6.468531468531468,
+      "grad_norm": 0.33394086360931396,
+      "learning_rate": 0.0005226857142857142,
+      "loss": 3.471,
+      "step": 22200
+    },
+    {
+      "epoch": 6.483100233100233,
+      "grad_norm": 0.3395681381225586,
+      "learning_rate": 0.0005225107871720116,
+      "loss": 3.4759,
+      "step": 22250
+    },
+    {
+      "epoch": 6.497668997668998,
+      "grad_norm": 0.32322457432746887,
+      "learning_rate": 0.0005223358600583089,
+      "loss": 3.48,
+      "step": 22300
+    },
+    {
+      "epoch": 6.5122377622377625,
+      "grad_norm": 0.32809075713157654,
+      "learning_rate": 0.0005221609329446064,
+      "loss": 3.4774,
+      "step": 22350
+    },
+    {
+      "epoch": 6.526806526806527,
+      "grad_norm": 0.32868528366088867,
+      "learning_rate": 0.0005219860058309038,
+      "loss": 3.4811,
+      "step": 22400
+    },
+    {
+      "epoch": 6.541375291375291,
+      "grad_norm": 0.33489176630973816,
+      "learning_rate": 0.0005218110787172011,
+      "loss": 3.4916,
+      "step": 22450
+    },
+    {
+      "epoch": 6.555944055944056,
+      "grad_norm": 0.3436543941497803,
+      "learning_rate": 0.0005216361516034985,
+      "loss": 3.4859,
+      "step": 22500
+    },
+    {
+      "epoch": 6.57051282051282,
+      "grad_norm": 0.3015133738517761,
+      "learning_rate": 0.0005214612244897959,
+      "loss": 3.4779,
+      "step": 22550
+    },
+    {
+      "epoch": 6.585081585081585,
+      "grad_norm": 0.3797510862350464,
+      "learning_rate": 0.0005212862973760932,
+      "loss": 3.4846,
+      "step": 22600
+    },
+    {
+      "epoch": 6.59965034965035,
+      "grad_norm": 0.327371209859848,
+      "learning_rate": 0.0005211113702623906,
+      "loss": 3.4941,
+      "step": 22650
+    },
+    {
+      "epoch": 6.6142191142191145,
+      "grad_norm": 0.3728986084461212,
+      "learning_rate": 0.0005209364431486879,
+      "loss": 3.4986,
+      "step": 22700
+    },
+    {
+      "epoch": 6.628787878787879,
+      "grad_norm": 0.3234831988811493,
+      "learning_rate": 0.0005207615160349854,
+      "loss": 3.4824,
+      "step": 22750
+    },
+    {
+      "epoch": 6.643356643356643,
+      "grad_norm": 0.3303401470184326,
+      "learning_rate": 0.0005205865889212828,
+      "loss": 3.4857,
+      "step": 22800
+    },
+    {
+      "epoch": 6.657925407925408,
+      "grad_norm": 0.3562447726726532,
+      "learning_rate": 0.0005204116618075801,
+      "loss": 3.4825,
+      "step": 22850
+    },
+    {
+      "epoch": 6.672494172494172,
+      "grad_norm": 0.3363456428050995,
+      "learning_rate": 0.0005202367346938776,
+      "loss": 3.4786,
+      "step": 22900
+    },
+    {
+      "epoch": 6.687062937062937,
+      "grad_norm": 0.337936669588089,
+      "learning_rate": 0.0005200618075801749,
+      "loss": 3.4894,
+      "step": 22950
+    },
+    {
+      "epoch": 6.701631701631702,
+      "grad_norm": 0.34164348244667053,
+      "learning_rate": 0.0005198868804664723,
+      "loss": 3.4815,
+      "step": 23000
+    },
+    {
+      "epoch": 6.701631701631702,
+      "eval_accuracy": 0.3637440554878363,
+      "eval_loss": 3.6008543968200684,
+      "eval_runtime": 180.4988,
+      "eval_samples_per_second": 92.2,
+      "eval_steps_per_second": 5.767,
+      "step": 23000
+    },
+    {
+      "epoch": 6.716200466200466,
+      "grad_norm": 0.3702085018157959,
+      "learning_rate": 0.0005197119533527696,
+      "loss": 3.4993,
+      "step": 23050
+    },
+    {
+      "epoch": 6.730769230769231,
+      "grad_norm": 0.33993563055992126,
+      "learning_rate": 0.000519537026239067,
+      "loss": 3.4772,
+      "step": 23100
+    },
+    {
+      "epoch": 6.745337995337995,
+      "grad_norm": 0.33401525020599365,
+      "learning_rate": 0.0005193620991253644,
+      "loss": 3.4976,
+      "step": 23150
+    },
+    {
+      "epoch": 6.75990675990676,
+      "grad_norm": 0.37840354442596436,
+      "learning_rate": 0.0005191871720116618,
+      "loss": 3.4828,
+      "step": 23200
+    },
+    {
+      "epoch": 6.774475524475524,
+      "grad_norm": 0.3243924379348755,
+      "learning_rate": 0.0005190122448979591,
+      "loss": 3.4938,
+      "step": 23250
+    },
+    {
+      "epoch": 6.7890442890442895,
+      "grad_norm": 0.3309505581855774,
+      "learning_rate": 0.0005188373177842566,
+      "loss": 3.4723,
+      "step": 23300
+    },
+    {
+      "epoch": 6.803613053613054,
+      "grad_norm": 0.35153377056121826,
+      "learning_rate": 0.0005186623906705539,
+      "loss": 3.4872,
+      "step": 23350
+    },
+    {
+      "epoch": 6.818181818181818,
+      "grad_norm": 0.3381296396255493,
+      "learning_rate": 0.0005184874635568513,
+      "loss": 3.4899,
+      "step": 23400
+    },
+    {
+      "epoch": 6.832750582750583,
+      "grad_norm": 0.3551500737667084,
+      "learning_rate": 0.0005183125364431486,
+      "loss": 3.4895,
+      "step": 23450
+    },
+    {
+      "epoch": 6.847319347319347,
+      "grad_norm": 0.33850058913230896,
+      "learning_rate": 0.000518137609329446,
+      "loss": 3.4793,
+      "step": 23500
+    },
+    {
+      "epoch": 6.861888111888112,
+      "grad_norm": 0.3279431164264679,
+      "learning_rate": 0.0005179626822157434,
+      "loss": 3.4967,
+      "step": 23550
+    },
+    {
+      "epoch": 6.876456876456876,
+      "grad_norm": 0.3145736753940582,
+      "learning_rate": 0.0005177877551020407,
+      "loss": 3.5046,
+      "step": 23600
+    },
+    {
+      "epoch": 6.891025641025641,
+      "grad_norm": 0.3533722162246704,
+      "learning_rate": 0.0005176128279883381,
+      "loss": 3.4892,
+      "step": 23650
+    },
+    {
+      "epoch": 6.905594405594406,
+      "grad_norm": 0.3434518575668335,
+      "learning_rate": 0.0005174379008746356,
+      "loss": 3.4818,
+      "step": 23700
+    },
+    {
+      "epoch": 6.92016317016317,
+      "grad_norm": 0.30422964692115784,
+      "learning_rate": 0.0005172629737609329,
+      "loss": 3.4961,
+      "step": 23750
+    },
+    {
+      "epoch": 6.934731934731935,
+      "grad_norm": 0.34872138500213623,
+      "learning_rate": 0.0005170880466472303,
+      "loss": 3.4941,
+      "step": 23800
+    },
+    {
+      "epoch": 6.949300699300699,
+      "grad_norm": 0.3359842598438263,
+      "learning_rate": 0.0005169131195335276,
+      "loss": 3.4905,
+      "step": 23850
+    },
+    {
+      "epoch": 6.963869463869464,
+      "grad_norm": 0.3362923264503479,
+      "learning_rate": 0.0005167381924198251,
+      "loss": 3.4967,
+      "step": 23900
+    },
+    {
+      "epoch": 6.978438228438229,
+      "grad_norm": 0.33967387676239014,
+      "learning_rate": 0.0005165632653061224,
+      "loss": 3.4997,
+      "step": 23950
+    },
+    {
+      "epoch": 6.993006993006993,
+      "grad_norm": 0.326475590467453,
+      "learning_rate": 0.0005163883381924197,
+      "loss": 3.4942,
+      "step": 24000
+    },
+    {
+      "epoch": 6.993006993006993,
+      "eval_accuracy": 0.36491220313304396,
+      "eval_loss": 3.5894508361816406,
+      "eval_runtime": 180.3515,
+      "eval_samples_per_second": 92.275,
+      "eval_steps_per_second": 5.772,
+      "step": 24000
+    },
+    {
+      "epoch": 7.007575757575758,
+      "grad_norm": 0.35610419511795044,
+      "learning_rate": 0.0005162134110787171,
+      "loss": 3.4349,
+      "step": 24050
+    },
+    {
+      "epoch": 7.022144522144522,
+      "grad_norm": 0.3531475067138672,
+      "learning_rate": 0.0005160384839650146,
+      "loss": 3.3823,
+      "step": 24100
+    },
+    {
+      "epoch": 7.036713286713287,
+      "grad_norm": 0.3476791977882385,
+      "learning_rate": 0.0005158635568513119,
+      "loss": 3.4016,
+      "step": 24150
+    },
+    {
+      "epoch": 7.051282051282051,
+      "grad_norm": 0.35229551792144775,
+      "learning_rate": 0.0005156886297376093,
+      "loss": 3.3868,
+      "step": 24200
+    },
+    {
+      "epoch": 7.0658508158508155,
+      "grad_norm": 0.3391362428665161,
+      "learning_rate": 0.0005155137026239066,
+      "loss": 3.3928,
+      "step": 24250
+    },
+    {
+      "epoch": 7.08041958041958,
+      "grad_norm": 0.34460726380348206,
+      "learning_rate": 0.0005153387755102041,
+      "loss": 3.3913,
+      "step": 24300
+    },
+    {
+      "epoch": 7.094988344988345,
+      "grad_norm": 0.35683906078338623,
+      "learning_rate": 0.0005151638483965014,
+      "loss": 3.3972,
+      "step": 24350
+    },
+    {
+      "epoch": 7.10955710955711,
+      "grad_norm": 0.3500906825065613,
+      "learning_rate": 0.0005149889212827987,
+      "loss": 3.4121,
+      "step": 24400
+    },
+    {
+      "epoch": 7.124125874125874,
+      "grad_norm": 0.32340437173843384,
+      "learning_rate": 0.0005148139941690961,
+      "loss": 3.4042,
+      "step": 24450
+    },
+    {
+      "epoch": 7.138694638694639,
+      "grad_norm": 0.36307796835899353,
+      "learning_rate": 0.0005146390670553936,
+      "loss": 3.4152,
+      "step": 24500
+    },
+    {
+      "epoch": 7.153263403263403,
+      "grad_norm": 0.35622280836105347,
+      "learning_rate": 0.0005144641399416909,
+      "loss": 3.4038,
+      "step": 24550
+    },
+    {
+      "epoch": 7.1678321678321675,
+      "grad_norm": 0.34201404452323914,
+      "learning_rate": 0.0005142892128279883,
+      "loss": 3.413,
+      "step": 24600
+    },
+    {
+      "epoch": 7.182400932400933,
+      "grad_norm": 0.3477611243724823,
+      "learning_rate": 0.0005141142857142856,
+      "loss": 3.4147,
+      "step": 24650
+    },
+    {
+      "epoch": 7.196969696969697,
+      "grad_norm": 0.3193877339363098,
+      "learning_rate": 0.0005139393586005831,
+      "loss": 3.4349,
+      "step": 24700
+    },
+    {
+      "epoch": 7.211538461538462,
+      "grad_norm": 0.3370342254638672,
+      "learning_rate": 0.0005137644314868804,
+      "loss": 3.4269,
+      "step": 24750
+    },
+    {
+      "epoch": 7.226107226107226,
+      "grad_norm": 0.35344481468200684,
+      "learning_rate": 0.0005135895043731778,
+      "loss": 3.4046,
+      "step": 24800
+    },
+    {
+      "epoch": 7.2406759906759905,
+      "grad_norm": 0.3530924320220947,
+      "learning_rate": 0.0005134145772594752,
+      "loss": 3.4115,
+      "step": 24850
+    },
+    {
+      "epoch": 7.255244755244755,
+      "grad_norm": 0.3493140935897827,
+      "learning_rate": 0.0005132396501457726,
+      "loss": 3.4237,
+      "step": 24900
+    },
+    {
+      "epoch": 7.269813519813519,
+      "grad_norm": 0.33685219287872314,
+      "learning_rate": 0.0005130647230320699,
+      "loss": 3.4313,
+      "step": 24950
+    },
+    {
+      "epoch": 7.284382284382285,
+      "grad_norm": 0.3504573702812195,
+      "learning_rate": 0.0005128897959183673,
+      "loss": 3.4237,
+      "step": 25000
+    },
+    {
+      "epoch": 7.284382284382285,
+      "eval_accuracy": 0.3649218455839104,
+      "eval_loss": 3.599851369857788,
+      "eval_runtime": 180.4257,
+      "eval_samples_per_second": 92.237,
+      "eval_steps_per_second": 5.77,
+      "step": 25000
+    },
+    {
+      "epoch": 7.298951048951049,
+      "grad_norm": 0.34710603952407837,
+      "learning_rate": 0.0005127148688046647,
+      "loss": 3.4347,
+      "step": 25050
+    },
+    {
+      "epoch": 7.313519813519814,
+      "grad_norm": 0.3456078767776489,
+      "learning_rate": 0.0005125399416909621,
+      "loss": 3.4325,
+      "step": 25100
+    },
+    {
+      "epoch": 7.328088578088578,
+      "grad_norm": 0.36139947175979614,
+      "learning_rate": 0.0005123650145772594,
+      "loss": 3.4531,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3426573426573425,
+      "grad_norm": 0.3331305980682373,
+      "learning_rate": 0.0005121900874635568,
+      "loss": 3.4372,
+      "step": 25200
+    },
+    {
+      "epoch": 7.357226107226107,
+      "grad_norm": 0.3419002294540405,
+      "learning_rate": 0.0005120151603498543,
+      "loss": 3.4222,
+      "step": 25250
+    },
+    {
+      "epoch": 7.371794871794872,
+      "grad_norm": 0.37077078223228455,
+      "learning_rate": 0.0005118402332361515,
+      "loss": 3.438,
+      "step": 25300
+    },
+    {
+      "epoch": 7.386363636363637,
+      "grad_norm": 0.37061864137649536,
+      "learning_rate": 0.0005116653061224489,
+      "loss": 3.4384,
+      "step": 25350
+    },
+    {
+      "epoch": 7.400932400932401,
+      "grad_norm": 0.33451831340789795,
+      "learning_rate": 0.0005114903790087463,
+      "loss": 3.4323,
+      "step": 25400
+    },
+    {
+      "epoch": 7.415501165501166,
+      "grad_norm": 0.36487630009651184,
+      "learning_rate": 0.0005113154518950437,
+      "loss": 3.4338,
+      "step": 25450
+    },
+    {
+      "epoch": 7.43006993006993,
+      "grad_norm": 0.34303170442581177,
+      "learning_rate": 0.0005111405247813411,
+      "loss": 3.446,
+      "step": 25500
+    },
+    {
+      "epoch": 7.444638694638694,
+      "grad_norm": 0.3491624593734741,
+      "learning_rate": 0.0005109655976676384,
+      "loss": 3.4407,
+      "step": 25550
+    },
+    {
+      "epoch": 7.459207459207459,
+      "grad_norm": 0.3570358455181122,
+      "learning_rate": 0.0005107906705539358,
+      "loss": 3.4499,
+      "step": 25600
+    },
+    {
+      "epoch": 7.473776223776224,
+      "grad_norm": 0.3398280739784241,
+      "learning_rate": 0.0005106157434402332,
+      "loss": 3.438,
+      "step": 25650
+    },
+    {
+      "epoch": 7.488344988344989,
+      "grad_norm": 0.3448866307735443,
+      "learning_rate": 0.0005104408163265306,
+      "loss": 3.4396,
+      "step": 25700
+    },
+    {
+      "epoch": 7.502913752913753,
+      "grad_norm": 0.35469329357147217,
+      "learning_rate": 0.0005102658892128279,
+      "loss": 3.4361,
+      "step": 25750
+    },
+    {
+      "epoch": 7.5174825174825175,
+      "grad_norm": 0.35180532932281494,
+      "learning_rate": 0.0005100909620991253,
+      "loss": 3.4589,
+      "step": 25800
+    },
+    {
+      "epoch": 7.532051282051282,
+      "grad_norm": 0.3383461833000183,
+      "learning_rate": 0.0005099160349854227,
+      "loss": 3.446,
+      "step": 25850
+    },
+    {
+      "epoch": 7.546620046620046,
+      "grad_norm": 0.35350677371025085,
+      "learning_rate": 0.0005097411078717201,
+      "loss": 3.4507,
+      "step": 25900
+    },
+    {
+      "epoch": 7.561188811188811,
+      "grad_norm": 0.3186721205711365,
+      "learning_rate": 0.0005095661807580174,
+      "loss": 3.4341,
+      "step": 25950
+    },
+    {
+      "epoch": 7.575757575757576,
+      "grad_norm": 0.3171408474445343,
+      "learning_rate": 0.0005093912536443149,
+      "loss": 3.4501,
+      "step": 26000
+    },
+    {
+      "epoch": 7.575757575757576,
+      "eval_accuracy": 0.3657292244576768,
+      "eval_loss": 3.5900423526763916,
+      "eval_runtime": 180.2145,
+      "eval_samples_per_second": 92.346,
+      "eval_steps_per_second": 5.776,
+      "step": 26000
+    },
+    {
+      "epoch": 7.590326340326341,
+      "grad_norm": 0.35610276460647583,
+      "learning_rate": 0.0005092163265306122,
+      "loss": 3.4641,
+      "step": 26050
+    },
+    {
+      "epoch": 7.604895104895105,
+      "grad_norm": 0.37525665760040283,
+      "learning_rate": 0.0005090413994169096,
+      "loss": 3.4647,
+      "step": 26100
+    },
+    {
+      "epoch": 7.619463869463869,
+      "grad_norm": 0.33461683988571167,
+      "learning_rate": 0.000508866472303207,
+      "loss": 3.4597,
+      "step": 26150
+    },
+    {
+      "epoch": 7.634032634032634,
+      "grad_norm": 0.3235708773136139,
+      "learning_rate": 0.0005086915451895044,
+      "loss": 3.4497,
+      "step": 26200
+    },
+    {
+      "epoch": 7.648601398601398,
+      "grad_norm": 0.34801435470581055,
+      "learning_rate": 0.0005085166180758017,
+      "loss": 3.4504,
+      "step": 26250
+    },
+    {
+      "epoch": 7.663170163170163,
+      "grad_norm": 0.32955285906791687,
+      "learning_rate": 0.0005083416909620991,
+      "loss": 3.449,
+      "step": 26300
+    },
+    {
+      "epoch": 7.677738927738928,
+      "grad_norm": 0.3284403383731842,
+      "learning_rate": 0.0005081667638483964,
+      "loss": 3.4546,
+      "step": 26350
+    },
+    {
+      "epoch": 7.6923076923076925,
+      "grad_norm": 0.32493704557418823,
+      "learning_rate": 0.0005079918367346939,
+      "loss": 3.4384,
+      "step": 26400
+    },
+    {
+      "epoch": 7.706876456876457,
+      "grad_norm": 0.34628820419311523,
+      "learning_rate": 0.0005078169096209912,
+      "loss": 3.4534,
+      "step": 26450
+    },
+    {
+      "epoch": 7.721445221445221,
+      "grad_norm": 0.33644041419029236,
+      "learning_rate": 0.0005076419825072886,
+      "loss": 3.4552,
+      "step": 26500
+    },
+    {
+      "epoch": 7.736013986013986,
+      "grad_norm": 0.34094300866127014,
+      "learning_rate": 0.000507467055393586,
+      "loss": 3.4536,
+      "step": 26550
+    },
+    {
+      "epoch": 7.75058275058275,
+      "grad_norm": 0.34397369623184204,
+      "learning_rate": 0.0005072921282798834,
+      "loss": 3.4635,
+      "step": 26600
+    },
+    {
+      "epoch": 7.765151515151516,
+      "grad_norm": 0.3402233123779297,
+      "learning_rate": 0.0005071172011661807,
+      "loss": 3.4708,
+      "step": 26650
+    },
+    {
+      "epoch": 7.77972027972028,
+      "grad_norm": 0.3712950050830841,
+      "learning_rate": 0.0005069422740524781,
+      "loss": 3.4713,
+      "step": 26700
+    },
+    {
+      "epoch": 7.7942890442890445,
+      "grad_norm": 0.3284025490283966,
+      "learning_rate": 0.0005067673469387754,
+      "loss": 3.4556,
+      "step": 26750
+    },
+    {
+      "epoch": 7.808857808857809,
+      "grad_norm": 0.34438565373420715,
+      "learning_rate": 0.0005065924198250729,
+      "loss": 3.4744,
+      "step": 26800
+    },
+    {
+      "epoch": 7.823426573426573,
+      "grad_norm": 0.33172059059143066,
+      "learning_rate": 0.0005064174927113702,
+      "loss": 3.4534,
+      "step": 26850
+    },
+    {
+      "epoch": 7.837995337995338,
+      "grad_norm": 0.3375876843929291,
+      "learning_rate": 0.0005062425655976676,
+      "loss": 3.4484,
+      "step": 26900
+    },
+    {
+      "epoch": 7.852564102564102,
+      "grad_norm": 0.3456272780895233,
+      "learning_rate": 0.0005060676384839649,
+      "loss": 3.4474,
+      "step": 26950
+    },
+    {
+      "epoch": 7.867132867132867,
+      "grad_norm": 0.3476708233356476,
+      "learning_rate": 0.0005058927113702624,
+      "loss": 3.4529,
+      "step": 27000
+    },
+    {
+      "epoch": 7.867132867132867,
+      "eval_accuracy": 0.36633469981756955,
+      "eval_loss": 3.5783565044403076,
+      "eval_runtime": 182.5033,
+      "eval_samples_per_second": 91.187,
+      "eval_steps_per_second": 5.704,
+      "step": 27000
+    },
+    {
+      "epoch": 7.881701631701632,
+      "grad_norm": 0.357236385345459,
+      "learning_rate": 0.0005057177842565598,
+      "loss": 3.4582,
+      "step": 27050
+    },
+    {
+      "epoch": 7.896270396270396,
+      "grad_norm": 0.3404090404510498,
+      "learning_rate": 0.0005055428571428571,
+      "loss": 3.4633,
+      "step": 27100
+    },
+    {
+      "epoch": 7.910839160839161,
+      "grad_norm": 0.341049462556839,
+      "learning_rate": 0.0005053679300291544,
+      "loss": 3.4626,
+      "step": 27150
+    },
+    {
+      "epoch": 7.925407925407925,
+      "grad_norm": 0.321346640586853,
+      "learning_rate": 0.0005051930029154519,
+      "loss": 3.4529,
+      "step": 27200
+    },
+    {
+      "epoch": 7.93997668997669,
+      "grad_norm": 0.31583067774772644,
+      "learning_rate": 0.0005050180758017492,
+      "loss": 3.4681,
+      "step": 27250
+    },
+    {
+      "epoch": 7.954545454545455,
+      "grad_norm": 0.36198437213897705,
+      "learning_rate": 0.0005048431486880466,
+      "loss": 3.452,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9691142191142195,
+      "grad_norm": 0.34580230712890625,
+      "learning_rate": 0.0005046682215743439,
+      "loss": 3.4541,
+      "step": 27350
+    },
+    {
+      "epoch": 7.983682983682984,
+      "grad_norm": 0.3525956869125366,
+      "learning_rate": 0.0005044932944606414,
+      "loss": 3.4677,
+      "step": 27400
+    },
+    {
+      "epoch": 7.998251748251748,
+      "grad_norm": 0.312714546918869,
+      "learning_rate": 0.0005043183673469388,
+      "loss": 3.453,
+      "step": 27450
+    },
+    {
+      "epoch": 8.012820512820513,
+      "grad_norm": 0.3349739909172058,
+      "learning_rate": 0.0005041434402332361,
+      "loss": 3.3605,
+      "step": 27500
+    },
+    {
+      "epoch": 8.027389277389277,
+      "grad_norm": 0.35149267315864563,
+      "learning_rate": 0.0005039685131195334,
+      "loss": 3.3568,
+      "step": 27550
+    },
+    {
+      "epoch": 8.041958041958042,
+      "grad_norm": 0.35762932896614075,
+      "learning_rate": 0.0005037935860058309,
+      "loss": 3.3594,
+      "step": 27600
+    },
+    {
+      "epoch": 8.056526806526806,
+      "grad_norm": 0.3227376341819763,
+      "learning_rate": 0.0005036186588921282,
+      "loss": 3.3618,
+      "step": 27650
+    },
+    {
+      "epoch": 8.07109557109557,
+      "grad_norm": 0.3321167826652527,
+      "learning_rate": 0.0005034437317784256,
+      "loss": 3.3757,
+      "step": 27700
+    },
+    {
+      "epoch": 8.085664335664335,
+      "grad_norm": 0.34823182225227356,
+      "learning_rate": 0.000503268804664723,
+      "loss": 3.368,
+      "step": 27750
+    },
+    {
+      "epoch": 8.1002331002331,
+      "grad_norm": 0.3144749701023102,
+      "learning_rate": 0.0005030938775510204,
+      "loss": 3.3762,
+      "step": 27800
+    },
+    {
+      "epoch": 8.114801864801866,
+      "grad_norm": 0.33065780997276306,
+      "learning_rate": 0.0005029189504373178,
+      "loss": 3.3705,
+      "step": 27850
+    },
+    {
+      "epoch": 8.12937062937063,
+      "grad_norm": 0.3569163382053375,
+      "learning_rate": 0.0005027440233236151,
+      "loss": 3.3917,
+      "step": 27900
+    },
+    {
+      "epoch": 8.143939393939394,
+      "grad_norm": 0.336088091135025,
+      "learning_rate": 0.0005025690962099126,
+      "loss": 3.387,
+      "step": 27950
+    },
+    {
+      "epoch": 8.158508158508159,
+      "grad_norm": 0.31934666633605957,
+      "learning_rate": 0.0005023941690962099,
+      "loss": 3.3725,
+      "step": 28000
+    },
+    {
+      "epoch": 8.158508158508159,
+      "eval_accuracy": 0.3663358757262118,
+      "eval_loss": 3.590759754180908,
+      "eval_runtime": 182.8908,
+      "eval_samples_per_second": 90.994,
+      "eval_steps_per_second": 5.692,
+      "step": 28000
+    },
+    {
+      "epoch": 8.173076923076923,
+      "grad_norm": 0.36414483189582825,
+      "learning_rate": 0.0005022192419825072,
+      "loss": 3.3922,
+      "step": 28050
+    },
+    {
+      "epoch": 8.187645687645688,
+      "grad_norm": 0.3432634472846985,
+      "learning_rate": 0.0005020443148688046,
+      "loss": 3.3869,
+      "step": 28100
+    },
+    {
+      "epoch": 8.202214452214452,
+      "grad_norm": 0.34101763367652893,
+      "learning_rate": 0.000501869387755102,
+      "loss": 3.3947,
+      "step": 28150
+    },
+    {
+      "epoch": 8.216783216783217,
+      "grad_norm": 0.36927416920661926,
+      "learning_rate": 0.0005016944606413994,
+      "loss": 3.3831,
+      "step": 28200
+    },
+    {
+      "epoch": 8.231351981351981,
+      "grad_norm": 0.3365326523780823,
+      "learning_rate": 0.0005015195335276967,
+      "loss": 3.3901,
+      "step": 28250
+    },
+    {
+      "epoch": 8.245920745920746,
+      "grad_norm": 0.36989808082580566,
+      "learning_rate": 0.0005013446064139941,
+      "loss": 3.3948,
+      "step": 28300
+    },
+    {
+      "epoch": 8.26048951048951,
+      "grad_norm": 0.3689005374908447,
+      "learning_rate": 0.0005011696793002916,
+      "loss": 3.3917,
+      "step": 28350
+    },
+    {
+      "epoch": 8.275058275058274,
+      "grad_norm": 0.3656901717185974,
+      "learning_rate": 0.0005009947521865889,
+      "loss": 3.3954,
+      "step": 28400
+    },
+    {
+      "epoch": 8.289627039627039,
+      "grad_norm": 0.3664736747741699,
+      "learning_rate": 0.0005008198250728862,
+      "loss": 3.4009,
+      "step": 28450
+    },
+    {
+      "epoch": 8.304195804195805,
+      "grad_norm": 0.3412057161331177,
+      "learning_rate": 0.0005006448979591836,
+      "loss": 3.3916,
+      "step": 28500
+    },
+    {
+      "epoch": 8.31876456876457,
+      "grad_norm": 0.37343958020210266,
+      "learning_rate": 0.000500469970845481,
+      "loss": 3.3981,
+      "step": 28550
+    },
+    {
+      "epoch": 8.333333333333334,
+      "grad_norm": 0.40472412109375,
+      "learning_rate": 0.0005002950437317784,
+      "loss": 3.4063,
+      "step": 28600
+    },
+    {
+      "epoch": 8.347902097902098,
+      "grad_norm": 0.33591440320014954,
+      "learning_rate": 0.0005001201166180757,
+      "loss": 3.3988,
+      "step": 28650
+    },
+    {
+      "epoch": 8.362470862470863,
+      "grad_norm": 0.3387737572193146,
+      "learning_rate": 0.0004999451895043731,
+      "loss": 3.4103,
+      "step": 28700
+    },
+    {
+      "epoch": 8.377039627039627,
+      "grad_norm": 0.3714272975921631,
+      "learning_rate": 0.0004997702623906706,
+      "loss": 3.4127,
+      "step": 28750
+    },
+    {
+      "epoch": 8.391608391608392,
+      "grad_norm": 0.34964922070503235,
+      "learning_rate": 0.0004995953352769679,
+      "loss": 3.409,
+      "step": 28800
+    },
+    {
+      "epoch": 8.406177156177156,
+      "grad_norm": 0.34254536032676697,
+      "learning_rate": 0.0004994204081632653,
+      "loss": 3.4076,
+      "step": 28850
+    },
+    {
+      "epoch": 8.42074592074592,
+      "grad_norm": 0.34269341826438904,
+      "learning_rate": 0.0004992454810495626,
+      "loss": 3.3974,
+      "step": 28900
+    },
+    {
+      "epoch": 8.435314685314685,
+      "grad_norm": 0.32962408661842346,
+      "learning_rate": 0.00049907055393586,
+      "loss": 3.3982,
+      "step": 28950
+    },
+    {
+      "epoch": 8.44988344988345,
+      "grad_norm": 0.37458404898643494,
+      "learning_rate": 0.0004988956268221574,
+      "loss": 3.4032,
+      "step": 29000
+    },
+    {
+      "epoch": 8.44988344988345,
+      "eval_accuracy": 0.3665615325946589,
+      "eval_loss": 3.5825419425964355,
+      "eval_runtime": 180.8696,
+      "eval_samples_per_second": 92.011,
+      "eval_steps_per_second": 5.756,
+      "step": 29000
+    },
+    {
+      "epoch": 8.464452214452214,
+      "grad_norm": 0.34700310230255127,
+      "learning_rate": 0.0004987206997084547,
+      "loss": 3.4302,
+      "step": 29050
+    },
+    {
+      "epoch": 8.479020979020978,
+      "grad_norm": 0.3363369405269623,
+      "learning_rate": 0.0004985457725947521,
+      "loss": 3.4196,
+      "step": 29100
+    },
+    {
+      "epoch": 8.493589743589745,
+      "grad_norm": 0.34493017196655273,
+      "learning_rate": 0.0004983708454810496,
+      "loss": 3.4233,
+      "step": 29150
+    },
+    {
+      "epoch": 8.508158508158509,
+      "grad_norm": 0.3357371389865875,
+      "learning_rate": 0.0004981959183673469,
+      "loss": 3.4124,
+      "step": 29200
+    },
+    {
+      "epoch": 8.522727272727273,
+      "grad_norm": 0.3642560541629791,
+      "learning_rate": 0.0004980209912536443,
+      "loss": 3.4161,
+      "step": 29250
+    },
+    {
+      "epoch": 8.537296037296038,
+      "grad_norm": 0.3482314944267273,
+      "learning_rate": 0.0004978460641399417,
+      "loss": 3.4204,
+      "step": 29300
+    },
+    {
+      "epoch": 8.551864801864802,
+      "grad_norm": 0.3307981491088867,
+      "learning_rate": 0.000497671137026239,
+      "loss": 3.4275,
+      "step": 29350
+    },
+    {
+      "epoch": 8.566433566433567,
+      "grad_norm": 0.3394106924533844,
+      "learning_rate": 0.0004974962099125364,
+      "loss": 3.4057,
+      "step": 29400
+    },
+    {
+      "epoch": 8.581002331002331,
+      "grad_norm": 0.3372842073440552,
+      "learning_rate": 0.0004973212827988337,
+      "loss": 3.4092,
+      "step": 29450
+    },
+    {
+      "epoch": 8.595571095571096,
+      "grad_norm": 0.32758432626724243,
+      "learning_rate": 0.0004971463556851312,
+      "loss": 3.4188,
+      "step": 29500
+    },
+    {
+      "epoch": 8.61013986013986,
+      "grad_norm": 0.3386209309101105,
+      "learning_rate": 0.0004969714285714286,
+      "loss": 3.4205,
+      "step": 29550
+    },
+    {
+      "epoch": 8.624708624708624,
+      "grad_norm": 0.3470524549484253,
+      "learning_rate": 0.0004967965014577259,
+      "loss": 3.4182,
+      "step": 29600
+    },
+    {
+      "epoch": 8.639277389277389,
+      "grad_norm": 0.3339594900608063,
+      "learning_rate": 0.0004966215743440233,
+      "loss": 3.4176,
+      "step": 29650
+    },
+    {
+      "epoch": 8.653846153846153,
+      "grad_norm": 0.3515772223472595,
+      "learning_rate": 0.0004964466472303207,
+      "loss": 3.4288,
+      "step": 29700
+    },
+    {
+      "epoch": 8.668414918414918,
+      "grad_norm": 0.35019543766975403,
+      "learning_rate": 0.000496271720116618,
+      "loss": 3.4287,
+      "step": 29750
+    },
+    {
+      "epoch": 8.682983682983682,
+      "grad_norm": 0.3279973566532135,
+      "learning_rate": 0.0004960967930029154,
+      "loss": 3.4211,
+      "step": 29800
+    },
+    {
+      "epoch": 8.697552447552448,
+      "grad_norm": 0.33548232913017273,
+      "learning_rate": 0.0004959218658892127,
+      "loss": 3.4308,
+      "step": 29850
+    },
+    {
+      "epoch": 8.712121212121213,
+      "grad_norm": 0.3599195182323456,
+      "learning_rate": 0.0004957469387755102,
+      "loss": 3.4228,
+      "step": 29900
+    },
+    {
+      "epoch": 8.726689976689977,
+      "grad_norm": 0.34652629494667053,
+      "learning_rate": 0.0004955720116618075,
+      "loss": 3.4285,
+      "step": 29950
+    },
+    {
+      "epoch": 8.741258741258742,
+      "grad_norm": 0.332381933927536,
+      "learning_rate": 0.0004953970845481049,
+      "loss": 3.4275,
+      "step": 30000
+    },
+    {
+      "epoch": 8.741258741258742,
+      "eval_accuracy": 0.36736185601657184,
+      "eval_loss": 3.5740106105804443,
+      "eval_runtime": 182.2015,
+      "eval_samples_per_second": 91.338,
+      "eval_steps_per_second": 5.713,
+      "step": 30000
+    },
+    {
+      "epoch": 8.755827505827506,
+      "grad_norm": 0.3524293601512909,
+      "learning_rate": 0.0004952221574344023,
+      "loss": 3.4245,
+      "step": 30050
+    },
+    {
+      "epoch": 8.77039627039627,
+      "grad_norm": 0.33680975437164307,
+      "learning_rate": 0.0004950472303206997,
+      "loss": 3.4345,
+      "step": 30100
+    },
+    {
+      "epoch": 8.784965034965035,
+      "grad_norm": 0.34272924065589905,
+      "learning_rate": 0.0004948723032069971,
+      "loss": 3.4335,
+      "step": 30150
+    },
+    {
+      "epoch": 8.7995337995338,
+      "grad_norm": 0.3409082591533661,
+      "learning_rate": 0.0004946973760932944,
+      "loss": 3.4283,
+      "step": 30200
+    },
+    {
+      "epoch": 8.814102564102564,
+      "grad_norm": 0.36862286925315857,
+      "learning_rate": 0.0004945224489795917,
+      "loss": 3.4235,
+      "step": 30250
+    },
+    {
+      "epoch": 8.828671328671328,
+      "grad_norm": 0.3254280388355255,
+      "learning_rate": 0.0004943475218658892,
+      "loss": 3.4312,
+      "step": 30300
+    },
+    {
+      "epoch": 8.843240093240093,
+      "grad_norm": 0.3392513394355774,
+      "learning_rate": 0.0004941725947521865,
+      "loss": 3.4257,
+      "step": 30350
+    },
+    {
+      "epoch": 8.857808857808857,
+      "grad_norm": 0.3554167151451111,
+      "learning_rate": 0.0004939976676384839,
+      "loss": 3.4304,
+      "step": 30400
+    },
+    {
+      "epoch": 8.872377622377622,
+      "grad_norm": 0.35996973514556885,
+      "learning_rate": 0.0004938227405247813,
+      "loss": 3.4399,
+      "step": 30450
+    },
+    {
+      "epoch": 8.886946386946388,
+      "grad_norm": 0.36442074179649353,
+      "learning_rate": 0.0004936478134110787,
+      "loss": 3.4316,
+      "step": 30500
+    },
+    {
+      "epoch": 8.901515151515152,
+      "grad_norm": 0.36240333318710327,
+      "learning_rate": 0.0004934728862973761,
+      "loss": 3.4262,
+      "step": 30550
+    },
+    {
+      "epoch": 8.916083916083917,
+      "grad_norm": 0.33148348331451416,
+      "learning_rate": 0.0004932979591836734,
+      "loss": 3.4361,
+      "step": 30600
+    },
+    {
+      "epoch": 8.930652680652681,
+      "grad_norm": 0.3203504681587219,
+      "learning_rate": 0.0004931230320699707,
+      "loss": 3.4466,
+      "step": 30650
+    },
+    {
+      "epoch": 8.945221445221446,
+      "grad_norm": 0.357393741607666,
+      "learning_rate": 0.0004929481049562682,
+      "loss": 3.4402,
+      "step": 30700
+    },
+    {
+      "epoch": 8.95979020979021,
+      "grad_norm": 0.36473795771598816,
+      "learning_rate": 0.0004927731778425655,
+      "loss": 3.4386,
+      "step": 30750
+    },
+    {
+      "epoch": 8.974358974358974,
+      "grad_norm": 0.31827130913734436,
+      "learning_rate": 0.0004925982507288629,
+      "loss": 3.4416,
+      "step": 30800
+    },
+    {
+      "epoch": 8.988927738927739,
+      "grad_norm": 0.35165274143218994,
+      "learning_rate": 0.0004924233236151604,
+      "loss": 3.4313,
+      "step": 30850
+    },
+    {
+      "epoch": 9.003496503496503,
+      "grad_norm": 0.3829018771648407,
+      "learning_rate": 0.0004922483965014577,
+      "loss": 3.4134,
+      "step": 30900
+    },
+    {
+      "epoch": 9.018065268065268,
+      "grad_norm": 0.3232770264148712,
+      "learning_rate": 0.0004920734693877551,
+      "loss": 3.3232,
+      "step": 30950
+    },
+    {
+      "epoch": 9.032634032634032,
+      "grad_norm": 0.33512336015701294,
+      "learning_rate": 0.0004918985422740524,
+      "loss": 3.3293,
+      "step": 31000
+    },
+    {
+      "epoch": 9.032634032634032,
+      "eval_accuracy": 0.3672867154543323,
+      "eval_loss": 3.579836130142212,
+      "eval_runtime": 181.2154,
+      "eval_samples_per_second": 91.835,
+      "eval_steps_per_second": 5.745,
+      "step": 31000
+    },
+    {
+      "epoch": 9.047202797202797,
+      "grad_norm": 0.35247862339019775,
+      "learning_rate": 0.0004917236151603499,
+      "loss": 3.323,
+      "step": 31050
+    },
+    {
+      "epoch": 9.061771561771561,
+      "grad_norm": 0.33538663387298584,
+      "learning_rate": 0.0004915486880466472,
+      "loss": 3.3462,
+      "step": 31100
+    },
+    {
+      "epoch": 9.076340326340326,
+      "grad_norm": 0.3494170010089874,
+      "learning_rate": 0.0004913737609329445,
+      "loss": 3.3328,
+      "step": 31150
+    },
+    {
+      "epoch": 9.090909090909092,
+      "grad_norm": 0.35296133160591125,
+      "learning_rate": 0.0004911988338192419,
+      "loss": 3.3354,
+      "step": 31200
+    },
+    {
+      "epoch": 9.105477855477856,
+      "grad_norm": 0.3609370291233063,
+      "learning_rate": 0.0004910239067055393,
+      "loss": 3.3555,
+      "step": 31250
+    },
+    {
+      "epoch": 9.12004662004662,
+      "grad_norm": 0.3352583050727844,
+      "learning_rate": 0.0004908489795918367,
+      "loss": 3.3444,
+      "step": 31300
+    },
+    {
+      "epoch": 9.134615384615385,
+      "grad_norm": 0.3525612950325012,
+      "learning_rate": 0.0004906740524781341,
+      "loss": 3.3484,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14918414918415,
+      "grad_norm": 0.37619081139564514,
+      "learning_rate": 0.0004904991253644314,
+      "loss": 3.3576,
+      "step": 31400
+    },
+    {
+      "epoch": 9.163752913752914,
+      "grad_norm": 0.3352401852607727,
+      "learning_rate": 0.0004903241982507289,
+      "loss": 3.3542,
+      "step": 31450
+    },
+    {
+      "epoch": 9.178321678321678,
+      "grad_norm": 0.3672662675380707,
+      "learning_rate": 0.0004901492711370262,
+      "loss": 3.3445,
+      "step": 31500
+    },
+    {
+      "epoch": 9.192890442890443,
+      "grad_norm": 0.36354750394821167,
+      "learning_rate": 0.0004899743440233235,
+      "loss": 3.3598,
+      "step": 31550
+    },
+    {
+      "epoch": 9.207459207459207,
+      "grad_norm": 0.332333505153656,
+      "learning_rate": 0.0004897994169096209,
+      "loss": 3.3689,
+      "step": 31600
+    },
+    {
+      "epoch": 9.222027972027972,
+      "grad_norm": 0.35484346747398376,
+      "learning_rate": 0.0004896244897959183,
+      "loss": 3.3683,
+      "step": 31650
+    },
+    {
+      "epoch": 9.236596736596736,
+      "grad_norm": 0.3454098403453827,
+      "learning_rate": 0.0004894495626822157,
+      "loss": 3.3646,
+      "step": 31700
+    },
+    {
+      "epoch": 9.2511655011655,
+      "grad_norm": 0.34342116117477417,
+      "learning_rate": 0.0004892746355685131,
+      "loss": 3.3622,
+      "step": 31750
+    },
+    {
+      "epoch": 9.265734265734265,
+      "grad_norm": 0.3866739273071289,
+      "learning_rate": 0.0004890997084548104,
+      "loss": 3.3679,
+      "step": 31800
+    },
+    {
+      "epoch": 9.280303030303031,
+      "grad_norm": 0.3526863753795624,
+      "learning_rate": 0.0004889247813411079,
+      "loss": 3.3813,
+      "step": 31850
+    },
+    {
+      "epoch": 9.294871794871796,
+      "grad_norm": 0.36674419045448303,
+      "learning_rate": 0.0004887498542274052,
+      "loss": 3.3658,
+      "step": 31900
+    },
+    {
+      "epoch": 9.30944055944056,
+      "grad_norm": 0.36833953857421875,
+      "learning_rate": 0.0004885749271137026,
+      "loss": 3.3734,
+      "step": 31950
+    },
+    {
+      "epoch": 9.324009324009324,
+      "grad_norm": 0.3428957462310791,
+      "learning_rate": 0.0004883999999999999,
+      "loss": 3.3803,
+      "step": 32000
+    },
+    {
+      "epoch": 9.324009324009324,
+      "eval_accuracy": 0.36745534075363046,
+      "eval_loss": 3.579030990600586,
+      "eval_runtime": 180.7616,
+      "eval_samples_per_second": 92.066,
+      "eval_steps_per_second": 5.759,
+      "step": 32000
+    },
+    {
+      "epoch": 9.338578088578089,
+      "grad_norm": 0.3581482172012329,
+      "learning_rate": 0.0004882250728862973,
+      "loss": 3.3766,
+      "step": 32050
+    },
+    {
+      "epoch": 9.353146853146853,
+      "grad_norm": 0.3714257478713989,
+      "learning_rate": 0.0004880501457725947,
+      "loss": 3.3814,
+      "step": 32100
+    },
+    {
+      "epoch": 9.367715617715618,
+      "grad_norm": 0.361931174993515,
+      "learning_rate": 0.00048787521865889207,
+      "loss": 3.3851,
+      "step": 32150
+    },
+    {
+      "epoch": 9.382284382284382,
+      "grad_norm": 0.3409428596496582,
+      "learning_rate": 0.00048770029154518945,
+      "loss": 3.384,
+      "step": 32200
+    },
+    {
+      "epoch": 9.396853146853147,
+      "grad_norm": 0.40810930728912354,
+      "learning_rate": 0.0004875253644314868,
+      "loss": 3.3816,
+      "step": 32250
+    },
+    {
+      "epoch": 9.411421911421911,
+      "grad_norm": 0.3254898190498352,
+      "learning_rate": 0.0004873504373177842,
+      "loss": 3.381,
+      "step": 32300
+    },
+    {
+      "epoch": 9.425990675990676,
+      "grad_norm": 0.354233056306839,
+      "learning_rate": 0.00048717551020408163,
+      "loss": 3.3847,
+      "step": 32350
+    },
+    {
+      "epoch": 9.44055944055944,
+      "grad_norm": 0.3318980038166046,
+      "learning_rate": 0.000487000583090379,
+      "loss": 3.3792,
+      "step": 32400
+    },
+    {
+      "epoch": 9.455128205128204,
+      "grad_norm": 0.32618919014930725,
+      "learning_rate": 0.00048682565597667633,
+      "loss": 3.3899,
+      "step": 32450
+    },
+    {
+      "epoch": 9.469696969696969,
+      "grad_norm": 0.39949190616607666,
+      "learning_rate": 0.0004866507288629737,
+      "loss": 3.3837,
+      "step": 32500
+    },
+    {
+      "epoch": 9.484265734265735,
+      "grad_norm": 0.3685564398765564,
+      "learning_rate": 0.0004864758017492711,
+      "loss": 3.3851,
+      "step": 32550
+    },
+    {
+      "epoch": 9.4988344988345,
+      "grad_norm": 0.359235018491745,
+      "learning_rate": 0.00048630087463556845,
+      "loss": 3.3893,
+      "step": 32600
+    },
+    {
+      "epoch": 9.513403263403264,
+      "grad_norm": 0.33161383867263794,
+      "learning_rate": 0.00048612594752186583,
+      "loss": 3.4009,
+      "step": 32650
+    },
+    {
+      "epoch": 9.527972027972028,
+      "grad_norm": 0.3646078109741211,
+      "learning_rate": 0.0004859510204081632,
+      "loss": 3.4062,
+      "step": 32700
+    },
+    {
+      "epoch": 9.542540792540793,
+      "grad_norm": 0.32304298877716064,
+      "learning_rate": 0.00048577609329446064,
+      "loss": 3.4058,
+      "step": 32750
+    },
+    {
+      "epoch": 9.557109557109557,
+      "grad_norm": 0.340385764837265,
+      "learning_rate": 0.000485601166180758,
+      "loss": 3.4003,
+      "step": 32800
+    },
+    {
+      "epoch": 9.571678321678322,
+      "grad_norm": 0.353704571723938,
+      "learning_rate": 0.0004854262390670554,
+      "loss": 3.3916,
+      "step": 32850
+    },
+    {
+      "epoch": 9.586247086247086,
+      "grad_norm": 0.3353423476219177,
+      "learning_rate": 0.0004852513119533527,
+      "loss": 3.4019,
+      "step": 32900
+    },
+    {
+      "epoch": 9.60081585081585,
+      "grad_norm": 0.3232695758342743,
+      "learning_rate": 0.0004850763848396501,
+      "loss": 3.3974,
+      "step": 32950
+    },
+    {
+      "epoch": 9.615384615384615,
+      "grad_norm": 0.36285659670829773,
+      "learning_rate": 0.00048490145772594746,
+      "loss": 3.3931,
+      "step": 33000
+    },
+    {
+      "epoch": 9.615384615384615,
+      "eval_accuracy": 0.368203101059235,
+      "eval_loss": 3.5726845264434814,
+      "eval_runtime": 180.0842,
+      "eval_samples_per_second": 92.412,
+      "eval_steps_per_second": 5.781,
+      "step": 33000
+    },
+    {
+      "epoch": 9.62995337995338,
+      "grad_norm": 0.3308947682380676,
+      "learning_rate": 0.00048472653061224484,
+      "loss": 3.4049,
+      "step": 33050
+    },
+    {
+      "epoch": 9.644522144522144,
+      "grad_norm": 0.3408724367618561,
+      "learning_rate": 0.0004845516034985422,
+      "loss": 3.404,
+      "step": 33100
+    },
+    {
+      "epoch": 9.659090909090908,
+      "grad_norm": 0.34324896335601807,
+      "learning_rate": 0.0004843766763848396,
+      "loss": 3.399,
+      "step": 33150
+    },
+    {
+      "epoch": 9.673659673659674,
+      "grad_norm": 0.34077367186546326,
+      "learning_rate": 0.000484201749271137,
+      "loss": 3.3953,
+      "step": 33200
+    },
+    {
+      "epoch": 9.688228438228439,
+      "grad_norm": 0.35905328392982483,
+      "learning_rate": 0.0004840268221574344,
+      "loss": 3.3853,
+      "step": 33250
+    },
+    {
+      "epoch": 9.702797202797203,
+      "grad_norm": 0.3622050881385803,
+      "learning_rate": 0.00048385189504373177,
+      "loss": 3.4025,
+      "step": 33300
+    },
+    {
+      "epoch": 9.717365967365968,
+      "grad_norm": 0.34367215633392334,
+      "learning_rate": 0.0004836769679300291,
+      "loss": 3.4029,
+      "step": 33350
+    },
+    {
+      "epoch": 9.731934731934732,
+      "grad_norm": 0.32383468747138977,
+      "learning_rate": 0.00048350204081632647,
+      "loss": 3.4049,
+      "step": 33400
+    },
+    {
+      "epoch": 9.746503496503497,
+      "grad_norm": 0.36959537863731384,
+      "learning_rate": 0.00048332711370262384,
+      "loss": 3.405,
+      "step": 33450
+    },
+    {
+      "epoch": 9.761072261072261,
+      "grad_norm": 0.3404758870601654,
+      "learning_rate": 0.0004831521865889212,
+      "loss": 3.4005,
+      "step": 33500
+    },
+    {
+      "epoch": 9.775641025641026,
+      "grad_norm": 0.36188212037086487,
+      "learning_rate": 0.0004829772594752186,
+      "loss": 3.4074,
+      "step": 33550
+    },
+    {
+      "epoch": 9.79020979020979,
+      "grad_norm": 0.38642576336860657,
+      "learning_rate": 0.00048280233236151597,
+      "loss": 3.4068,
+      "step": 33600
+    },
+    {
+      "epoch": 9.804778554778554,
+      "grad_norm": 0.32433605194091797,
+      "learning_rate": 0.0004826274052478134,
+      "loss": 3.4092,
+      "step": 33650
+    },
+    {
+      "epoch": 9.819347319347319,
+      "grad_norm": 0.3639720678329468,
+      "learning_rate": 0.0004824524781341108,
+      "loss": 3.3985,
+      "step": 33700
+    },
+    {
+      "epoch": 9.833916083916083,
+      "grad_norm": 0.3690209686756134,
+      "learning_rate": 0.00048227755102040815,
+      "loss": 3.407,
+      "step": 33750
+    },
+    {
+      "epoch": 9.848484848484848,
+      "grad_norm": 0.32806217670440674,
+      "learning_rate": 0.0004821026239067055,
+      "loss": 3.4117,
+      "step": 33800
+    },
+    {
+      "epoch": 9.863053613053612,
+      "grad_norm": 0.32632794976234436,
+      "learning_rate": 0.00048192769679300285,
+      "loss": 3.4169,
+      "step": 33850
+    },
+    {
+      "epoch": 9.877622377622378,
+      "grad_norm": 0.34658604860305786,
+      "learning_rate": 0.0004817527696793002,
+      "loss": 3.4117,
+      "step": 33900
+    },
+    {
+      "epoch": 9.892191142191143,
+      "grad_norm": 0.34974268078804016,
+      "learning_rate": 0.0004815778425655976,
+      "loss": 3.4073,
+      "step": 33950
+    },
+    {
+      "epoch": 9.906759906759907,
+      "grad_norm": 0.3343101739883423,
+      "learning_rate": 0.000481402915451895,
+      "loss": 3.4063,
+      "step": 34000
+    },
+    {
+      "epoch": 9.906759906759907,
+      "eval_accuracy": 0.3688429129514813,
+      "eval_loss": 3.5587732791900635,
+      "eval_runtime": 180.2379,
+      "eval_samples_per_second": 92.334,
+      "eval_steps_per_second": 5.776,
+      "step": 34000
+    },
+    {
+      "epoch": 9.921328671328672,
+      "grad_norm": 0.33629804849624634,
+      "learning_rate": 0.0004812279883381924,
+      "loss": 3.4184,
+      "step": 34050
+    },
+    {
+      "epoch": 9.935897435897436,
+      "grad_norm": 0.35826265811920166,
+      "learning_rate": 0.0004810530612244898,
+      "loss": 3.4062,
+      "step": 34100
+    },
+    {
+      "epoch": 9.9504662004662,
+      "grad_norm": 0.3323402404785156,
+      "learning_rate": 0.00048087813411078716,
+      "loss": 3.4029,
+      "step": 34150
+    },
+    {
+      "epoch": 9.965034965034965,
+      "grad_norm": 0.3231922388076782,
+      "learning_rate": 0.00048070320699708453,
+      "loss": 3.4137,
+      "step": 34200
+    },
+    {
+      "epoch": 9.97960372960373,
+      "grad_norm": 0.35591524839401245,
+      "learning_rate": 0.00048052827988338186,
+      "loss": 3.4172,
+      "step": 34250
+    },
+    {
+      "epoch": 9.994172494172494,
+      "grad_norm": 0.3526099920272827,
+      "learning_rate": 0.00048035335276967923,
+      "loss": 3.4215,
+      "step": 34300
+    },
+    {
+      "epoch": 10.008741258741258,
+      "grad_norm": 0.367563933134079,
+      "learning_rate": 0.0004801784256559766,
+      "loss": 3.3311,
+      "step": 34350
+    },
+    {
+      "epoch": 10.023310023310023,
+      "grad_norm": 0.34572193026542664,
+      "learning_rate": 0.000480003498542274,
+      "loss": 3.3062,
+      "step": 34400
+    },
+    {
+      "epoch": 10.037878787878787,
+      "grad_norm": 0.362204909324646,
+      "learning_rate": 0.00047982857142857136,
+      "loss": 3.3028,
+      "step": 34450
+    },
+    {
+      "epoch": 10.052447552447552,
+      "grad_norm": 0.3749389946460724,
+      "learning_rate": 0.0004796536443148688,
+      "loss": 3.3031,
+      "step": 34500
+    },
+    {
+      "epoch": 10.067016317016318,
+      "grad_norm": 0.3729357421398163,
+      "learning_rate": 0.00047947871720116616,
+      "loss": 3.3036,
+      "step": 34550
+    },
+    {
+      "epoch": 10.081585081585082,
+      "grad_norm": 0.3892238140106201,
+      "learning_rate": 0.00047930379008746354,
+      "loss": 3.3145,
+      "step": 34600
+    },
+    {
+      "epoch": 10.096153846153847,
+      "grad_norm": 0.3650963008403778,
+      "learning_rate": 0.0004791288629737609,
+      "loss": 3.3232,
+      "step": 34650
+    },
+    {
+      "epoch": 10.110722610722611,
+      "grad_norm": 0.3529200851917267,
+      "learning_rate": 0.00047895393586005824,
+      "loss": 3.3166,
+      "step": 34700
+    },
+    {
+      "epoch": 10.125291375291376,
+      "grad_norm": 0.3430958390235901,
+      "learning_rate": 0.0004787790087463556,
+      "loss": 3.3311,
+      "step": 34750
+    },
+    {
+      "epoch": 10.13986013986014,
+      "grad_norm": 0.35546183586120605,
+      "learning_rate": 0.000478604081632653,
+      "loss": 3.3229,
+      "step": 34800
+    },
+    {
+      "epoch": 10.154428904428904,
+      "grad_norm": 0.3477681279182434,
+      "learning_rate": 0.00047842915451895037,
+      "loss": 3.3211,
+      "step": 34850
+    },
+    {
+      "epoch": 10.168997668997669,
+      "grad_norm": 0.35804784297943115,
+      "learning_rate": 0.0004782542274052478,
+      "loss": 3.318,
+      "step": 34900
+    },
+    {
+      "epoch": 10.183566433566433,
+      "grad_norm": 0.3714865744113922,
+      "learning_rate": 0.00047807930029154517,
+      "loss": 3.3529,
+      "step": 34950
+    },
+    {
+      "epoch": 10.198135198135198,
+      "grad_norm": 0.37744787335395813,
+      "learning_rate": 0.00047790437317784255,
+      "loss": 3.3379,
+      "step": 35000
+    },
+    {
+      "epoch": 10.198135198135198,
+      "eval_accuracy": 0.36837090322248356,
+      "eval_loss": 3.5747363567352295,
+      "eval_runtime": 180.0894,
+      "eval_samples_per_second": 92.41,
+      "eval_steps_per_second": 5.78,
+      "step": 35000
+    },
+    {
+      "epoch": 10.212703962703962,
+      "grad_norm": 0.3652697801589966,
+      "learning_rate": 0.0004777294460641399,
+      "loss": 3.3403,
+      "step": 35050
+    },
+    {
+      "epoch": 10.227272727272727,
+      "grad_norm": 0.3565238118171692,
+      "learning_rate": 0.0004775545189504373,
+      "loss": 3.3517,
+      "step": 35100
+    },
+    {
+      "epoch": 10.241841491841491,
+      "grad_norm": 0.3647816777229309,
+      "learning_rate": 0.0004773795918367346,
+      "loss": 3.3465,
+      "step": 35150
+    },
+    {
+      "epoch": 10.256410256410255,
+      "grad_norm": 0.3312961161136627,
+      "learning_rate": 0.000477204664723032,
+      "loss": 3.3448,
+      "step": 35200
+    },
+    {
+      "epoch": 10.270979020979022,
+      "grad_norm": 0.3463350534439087,
+      "learning_rate": 0.00047702973760932937,
+      "loss": 3.329,
+      "step": 35250
+    },
+    {
+      "epoch": 10.285547785547786,
+      "grad_norm": 0.36243367195129395,
+      "learning_rate": 0.00047685481049562675,
+      "loss": 3.3469,
+      "step": 35300
+    },
+    {
+      "epoch": 10.30011655011655,
+      "grad_norm": 0.3585239350795746,
+      "learning_rate": 0.0004766798833819242,
+      "loss": 3.3488,
+      "step": 35350
+    },
+    {
+      "epoch": 10.314685314685315,
+      "grad_norm": 0.33923816680908203,
+      "learning_rate": 0.00047650495626822155,
+      "loss": 3.357,
+      "step": 35400
+    },
+    {
+      "epoch": 10.32925407925408,
+      "grad_norm": 0.3626267910003662,
+      "learning_rate": 0.00047633002915451893,
+      "loss": 3.356,
+      "step": 35450
+    },
+    {
+      "epoch": 10.343822843822844,
+      "grad_norm": 0.36127206683158875,
+      "learning_rate": 0.0004761551020408163,
+      "loss": 3.3728,
+      "step": 35500
+    },
+    {
+      "epoch": 10.358391608391608,
+      "grad_norm": 0.3516559600830078,
+      "learning_rate": 0.0004759801749271137,
+      "loss": 3.3548,
+      "step": 35550
+    },
+    {
+      "epoch": 10.372960372960373,
+      "grad_norm": 0.38914352655410767,
+      "learning_rate": 0.000475805247813411,
+      "loss": 3.3593,
+      "step": 35600
+    },
+    {
+      "epoch": 10.387529137529137,
+      "grad_norm": 0.3629930317401886,
+      "learning_rate": 0.0004756303206997084,
+      "loss": 3.3497,
+      "step": 35650
+    },
+    {
+      "epoch": 10.402097902097902,
+      "grad_norm": 0.34036391973495483,
+      "learning_rate": 0.00047545539358600575,
+      "loss": 3.3635,
+      "step": 35700
+    },
+    {
+      "epoch": 10.416666666666666,
+      "grad_norm": 0.35723787546157837,
+      "learning_rate": 0.00047528046647230313,
+      "loss": 3.364,
+      "step": 35750
+    },
+    {
+      "epoch": 10.43123543123543,
+      "grad_norm": 0.3406592309474945,
+      "learning_rate": 0.00047510553935860056,
+      "loss": 3.3589,
+      "step": 35800
+    },
+    {
+      "epoch": 10.445804195804195,
+      "grad_norm": 0.3650604784488678,
+      "learning_rate": 0.00047493061224489794,
+      "loss": 3.3673,
+      "step": 35850
+    },
+    {
+      "epoch": 10.460372960372961,
+      "grad_norm": 0.33995601534843445,
+      "learning_rate": 0.0004747556851311953,
+      "loss": 3.3702,
+      "step": 35900
+    },
+    {
+      "epoch": 10.474941724941726,
+      "grad_norm": 0.3596780002117157,
+      "learning_rate": 0.0004745807580174927,
+      "loss": 3.3651,
+      "step": 35950
+    },
+    {
+      "epoch": 10.48951048951049,
+      "grad_norm": 0.358271062374115,
+      "learning_rate": 0.00047440583090379006,
+      "loss": 3.3768,
+      "step": 36000
+    },
+    {
+      "epoch": 10.48951048951049,
+      "eval_accuracy": 0.3689066471998911,
+      "eval_loss": 3.565972089767456,
+      "eval_runtime": 180.2039,
+      "eval_samples_per_second": 92.351,
+      "eval_steps_per_second": 5.777,
+      "step": 36000
+    },
+    {
+      "epoch": 10.504079254079254,
+      "grad_norm": 0.3587784767150879,
+      "learning_rate": 0.0004742309037900874,
+      "loss": 3.3685,
+      "step": 36050
+    },
+    {
+      "epoch": 10.518648018648019,
+      "grad_norm": 0.36644667387008667,
+      "learning_rate": 0.00047405597667638476,
+      "loss": 3.3731,
+      "step": 36100
+    },
+    {
+      "epoch": 10.533216783216783,
+      "grad_norm": 0.3659219741821289,
+      "learning_rate": 0.00047388104956268214,
+      "loss": 3.3799,
+      "step": 36150
+    },
+    {
+      "epoch": 10.547785547785548,
+      "grad_norm": 0.36219388246536255,
+      "learning_rate": 0.00047370612244897957,
+      "loss": 3.366,
+      "step": 36200
+    },
+    {
+      "epoch": 10.562354312354312,
+      "grad_norm": 0.3452727496623993,
+      "learning_rate": 0.00047353119533527694,
+      "loss": 3.3727,
+      "step": 36250
+    },
+    {
+      "epoch": 10.576923076923077,
+      "grad_norm": 0.34664297103881836,
+      "learning_rate": 0.0004733562682215743,
+      "loss": 3.359,
+      "step": 36300
+    },
+    {
+      "epoch": 10.591491841491841,
+      "grad_norm": 0.34712809324264526,
+      "learning_rate": 0.0004731813411078717,
+      "loss": 3.3701,
+      "step": 36350
+    },
+    {
+      "epoch": 10.606060606060606,
+      "grad_norm": 0.34347906708717346,
+      "learning_rate": 0.00047300641399416907,
+      "loss": 3.3803,
+      "step": 36400
+    },
+    {
+      "epoch": 10.62062937062937,
+      "grad_norm": 0.37337714433670044,
+      "learning_rate": 0.00047283148688046645,
+      "loss": 3.3882,
+      "step": 36450
+    },
+    {
+      "epoch": 10.635198135198134,
+      "grad_norm": 0.36376672983169556,
+      "learning_rate": 0.00047265655976676377,
+      "loss": 3.383,
+      "step": 36500
+    },
+    {
+      "epoch": 10.649766899766899,
+      "grad_norm": 0.34523946046829224,
+      "learning_rate": 0.00047248163265306114,
+      "loss": 3.3846,
+      "step": 36550
+    },
+    {
+      "epoch": 10.664335664335665,
+      "grad_norm": 0.3508089482784271,
+      "learning_rate": 0.0004723067055393585,
+      "loss": 3.3739,
+      "step": 36600
+    },
+    {
+      "epoch": 10.67890442890443,
+      "grad_norm": 0.3470657467842102,
+      "learning_rate": 0.00047213177842565595,
+      "loss": 3.3717,
+      "step": 36650
+    },
+    {
+      "epoch": 10.693473193473194,
+      "grad_norm": 0.3334925174713135,
+      "learning_rate": 0.0004719568513119533,
+      "loss": 3.3814,
+      "step": 36700
+    },
+    {
+      "epoch": 10.708041958041958,
+      "grad_norm": 0.3517080545425415,
+      "learning_rate": 0.0004717819241982507,
+      "loss": 3.3845,
+      "step": 36750
+    },
+    {
+      "epoch": 10.722610722610723,
+      "grad_norm": 0.3703469932079315,
+      "learning_rate": 0.0004716069970845481,
+      "loss": 3.3785,
+      "step": 36800
+    },
+    {
+      "epoch": 10.737179487179487,
+      "grad_norm": 0.3503482937812805,
+      "learning_rate": 0.00047143206997084545,
+      "loss": 3.3877,
+      "step": 36850
+    },
+    {
+      "epoch": 10.751748251748252,
+      "grad_norm": 0.36413902044296265,
+      "learning_rate": 0.00047125714285714283,
+      "loss": 3.3901,
+      "step": 36900
+    },
+    {
+      "epoch": 10.766317016317016,
+      "grad_norm": 0.35273477435112,
+      "learning_rate": 0.00047108221574344015,
+      "loss": 3.3989,
+      "step": 36950
+    },
+    {
+      "epoch": 10.78088578088578,
+      "grad_norm": 0.3469065725803375,
+      "learning_rate": 0.0004709072886297375,
+      "loss": 3.3929,
+      "step": 37000
+    },
+    {
+      "epoch": 10.78088578088578,
+      "eval_accuracy": 0.36930116454936474,
+      "eval_loss": 3.5597054958343506,
+      "eval_runtime": 180.1588,
+      "eval_samples_per_second": 92.374,
+      "eval_steps_per_second": 5.778,
+      "step": 37000
+    },
+    {
+      "epoch": 10.795454545454545,
+      "grad_norm": 0.347210556268692,
+      "learning_rate": 0.00047073236151603495,
+      "loss": 3.3819,
+      "step": 37050
+    },
+    {
+      "epoch": 10.81002331002331,
+      "grad_norm": 0.35915273427963257,
+      "learning_rate": 0.00047055743440233233,
+      "loss": 3.3801,
+      "step": 37100
+    },
+    {
+      "epoch": 10.824592074592074,
+      "grad_norm": 0.3388284146785736,
+      "learning_rate": 0.0004703825072886297,
+      "loss": 3.3866,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83916083916084,
+      "grad_norm": 0.3657146990299225,
+      "learning_rate": 0.0004702075801749271,
+      "loss": 3.4009,
+      "step": 37200
+    },
+    {
+      "epoch": 10.853729603729604,
+      "grad_norm": 0.35583174228668213,
+      "learning_rate": 0.00047003265306122446,
+      "loss": 3.387,
+      "step": 37250
+    },
+    {
+      "epoch": 10.868298368298369,
+      "grad_norm": 0.3616805672645569,
+      "learning_rate": 0.00046985772594752183,
+      "loss": 3.3672,
+      "step": 37300
+    },
+    {
+      "epoch": 10.882867132867133,
+      "grad_norm": 0.34906110167503357,
+      "learning_rate": 0.0004696827988338192,
+      "loss": 3.3822,
+      "step": 37350
+    },
+    {
+      "epoch": 10.897435897435898,
+      "grad_norm": 0.37446925044059753,
+      "learning_rate": 0.00046950787172011653,
+      "loss": 3.3935,
+      "step": 37400
+    },
+    {
+      "epoch": 10.912004662004662,
+      "grad_norm": 0.3785672187805176,
+      "learning_rate": 0.0004693329446064139,
+      "loss": 3.3824,
+      "step": 37450
+    },
+    {
+      "epoch": 10.926573426573427,
+      "grad_norm": 0.37299731373786926,
+      "learning_rate": 0.00046915801749271134,
+      "loss": 3.3865,
+      "step": 37500
+    },
+    {
+      "epoch": 10.941142191142191,
+      "grad_norm": 0.3548412621021271,
+      "learning_rate": 0.0004689830903790087,
+      "loss": 3.3952,
+      "step": 37550
+    },
+    {
+      "epoch": 10.955710955710956,
+      "grad_norm": 0.36777183413505554,
+      "learning_rate": 0.0004688081632653061,
+      "loss": 3.3878,
+      "step": 37600
+    },
+    {
+      "epoch": 10.97027972027972,
+      "grad_norm": 0.36412835121154785,
+      "learning_rate": 0.00046863323615160346,
+      "loss": 3.4091,
+      "step": 37650
+    },
+    {
+      "epoch": 10.984848484848484,
+      "grad_norm": 0.3270232379436493,
+      "learning_rate": 0.00046845830903790084,
+      "loss": 3.3996,
+      "step": 37700
+    },
+    {
+      "epoch": 10.999417249417249,
+      "grad_norm": 0.3319988250732422,
+      "learning_rate": 0.0004682833819241982,
+      "loss": 3.3991,
+      "step": 37750
+    },
+    {
+      "epoch": 11.013986013986013,
+      "grad_norm": 0.35844141244888306,
+      "learning_rate": 0.0004681084548104956,
+      "loss": 3.2767,
+      "step": 37800
+    },
+    {
+      "epoch": 11.028554778554778,
+      "grad_norm": 0.3383696377277374,
+      "learning_rate": 0.0004679335276967929,
+      "loss": 3.2718,
+      "step": 37850
+    },
+    {
+      "epoch": 11.043123543123544,
+      "grad_norm": 0.3634346127510071,
+      "learning_rate": 0.0004677586005830903,
+      "loss": 3.2706,
+      "step": 37900
+    },
+    {
+      "epoch": 11.057692307692308,
+      "grad_norm": 0.3992638885974884,
+      "learning_rate": 0.0004675836734693877,
+      "loss": 3.2881,
+      "step": 37950
+    },
+    {
+      "epoch": 11.072261072261073,
+      "grad_norm": 0.35264912247657776,
+      "learning_rate": 0.0004674087463556851,
+      "loss": 3.2905,
+      "step": 38000
+    },
+    {
+      "epoch": 11.072261072261073,
+      "eval_accuracy": 0.36926294751849176,
+      "eval_loss": 3.5672919750213623,
+      "eval_runtime": 180.0525,
+      "eval_samples_per_second": 92.429,
+      "eval_steps_per_second": 5.782,
+      "step": 38000
+    },
+    {
+      "epoch": 11.086829836829837,
+      "grad_norm": 0.38650333881378174,
+      "learning_rate": 0.00046723381924198247,
+      "loss": 3.3106,
+      "step": 38050
+    },
+    {
+      "epoch": 11.101398601398602,
+      "grad_norm": 0.3478892743587494,
+      "learning_rate": 0.00046705889212827985,
+      "loss": 3.3016,
+      "step": 38100
+    },
+    {
+      "epoch": 11.115967365967366,
+      "grad_norm": 0.3671860992908478,
+      "learning_rate": 0.0004668839650145772,
+      "loss": 3.2985,
+      "step": 38150
+    },
+    {
+      "epoch": 11.13053613053613,
+      "grad_norm": 0.3565201461315155,
+      "learning_rate": 0.0004667090379008746,
+      "loss": 3.3071,
+      "step": 38200
+    },
+    {
+      "epoch": 11.145104895104895,
+      "grad_norm": 0.3274824321269989,
+      "learning_rate": 0.000466534110787172,
+      "loss": 3.3222,
+      "step": 38250
+    },
+    {
+      "epoch": 11.15967365967366,
+      "grad_norm": 0.3710516691207886,
+      "learning_rate": 0.0004663591836734693,
+      "loss": 3.3109,
+      "step": 38300
+    },
+    {
+      "epoch": 11.174242424242424,
+      "grad_norm": 0.37232545018196106,
+      "learning_rate": 0.0004661842565597667,
+      "loss": 3.3054,
+      "step": 38350
+    },
+    {
+      "epoch": 11.188811188811188,
+      "grad_norm": 0.3739616274833679,
+      "learning_rate": 0.0004660093294460641,
+      "loss": 3.3147,
+      "step": 38400
+    },
+    {
+      "epoch": 11.203379953379953,
+      "grad_norm": 0.35690245032310486,
+      "learning_rate": 0.0004658344023323615,
+      "loss": 3.3187,
+      "step": 38450
+    },
+    {
+      "epoch": 11.217948717948717,
+      "grad_norm": 0.3522016704082489,
+      "learning_rate": 0.00046565947521865885,
+      "loss": 3.321,
+      "step": 38500
+    },
+    {
+      "epoch": 11.232517482517483,
+      "grad_norm": 0.379158079624176,
+      "learning_rate": 0.00046548454810495623,
+      "loss": 3.3273,
+      "step": 38550
+    },
+    {
+      "epoch": 11.247086247086248,
+      "grad_norm": 0.37325507402420044,
+      "learning_rate": 0.0004653096209912536,
+      "loss": 3.3222,
+      "step": 38600
+    },
+    {
+      "epoch": 11.261655011655012,
+      "grad_norm": 0.3767625093460083,
+      "learning_rate": 0.000465134693877551,
+      "loss": 3.3269,
+      "step": 38650
+    },
+    {
+      "epoch": 11.276223776223777,
+      "grad_norm": 0.3531850278377533,
+      "learning_rate": 0.0004649597667638484,
+      "loss": 3.3361,
+      "step": 38700
+    },
+    {
+      "epoch": 11.290792540792541,
+      "grad_norm": 0.35781583189964294,
+      "learning_rate": 0.0004647848396501457,
+      "loss": 3.3308,
+      "step": 38750
+    },
+    {
+      "epoch": 11.305361305361306,
+      "grad_norm": 0.35981640219688416,
+      "learning_rate": 0.0004646099125364431,
+      "loss": 3.3252,
+      "step": 38800
+    },
+    {
+      "epoch": 11.31993006993007,
+      "grad_norm": 0.36371827125549316,
+      "learning_rate": 0.0004644349854227405,
+      "loss": 3.3374,
+      "step": 38850
+    },
+    {
+      "epoch": 11.334498834498834,
+      "grad_norm": 0.37464508414268494,
+      "learning_rate": 0.00046426005830903786,
+      "loss": 3.3461,
+      "step": 38900
+    },
+    {
+      "epoch": 11.349067599067599,
+      "grad_norm": 0.38214632868766785,
+      "learning_rate": 0.00046408513119533523,
+      "loss": 3.3348,
+      "step": 38950
+    },
+    {
+      "epoch": 11.363636363636363,
+      "grad_norm": 0.40841469168663025,
+      "learning_rate": 0.0004639102040816326,
+      "loss": 3.3375,
+      "step": 39000
+    },
+    {
+      "epoch": 11.363636363636363,
+      "eval_accuracy": 0.3695302315528744,
+      "eval_loss": 3.563751220703125,
+      "eval_runtime": 180.0277,
+      "eval_samples_per_second": 92.441,
+      "eval_steps_per_second": 5.782,
+      "step": 39000
+    },
+    {
+      "epoch": 11.378205128205128,
+      "grad_norm": 0.35644689202308655,
+      "learning_rate": 0.00046373527696793,
+      "loss": 3.3485,
+      "step": 39050
+    },
+    {
+      "epoch": 11.392773892773892,
+      "grad_norm": 0.3444243371486664,
+      "learning_rate": 0.00046356034985422736,
+      "loss": 3.3417,
+      "step": 39100
+    },
+    {
+      "epoch": 11.407342657342657,
+      "grad_norm": 0.3749789893627167,
+      "learning_rate": 0.0004633854227405248,
+      "loss": 3.3419,
+      "step": 39150
+    },
+    {
+      "epoch": 11.421911421911421,
+      "grad_norm": 0.3557623326778412,
+      "learning_rate": 0.0004632104956268221,
+      "loss": 3.3325,
+      "step": 39200
+    },
+    {
+      "epoch": 11.436480186480187,
+      "grad_norm": 0.36125391721725464,
+      "learning_rate": 0.0004630355685131195,
+      "loss": 3.3398,
+      "step": 39250
+    },
+    {
+      "epoch": 11.451048951048952,
+      "grad_norm": 0.3687732517719269,
+      "learning_rate": 0.00046286064139941687,
+      "loss": 3.3518,
+      "step": 39300
+    },
+    {
+      "epoch": 11.465617715617716,
+      "grad_norm": 0.3502034842967987,
+      "learning_rate": 0.00046268571428571424,
+      "loss": 3.3484,
+      "step": 39350
+    },
+    {
+      "epoch": 11.48018648018648,
+      "grad_norm": 0.3895909786224365,
+      "learning_rate": 0.0004625107871720116,
+      "loss": 3.3564,
+      "step": 39400
+    },
+    {
+      "epoch": 11.494755244755245,
+      "grad_norm": 0.3652609884738922,
+      "learning_rate": 0.000462335860058309,
+      "loss": 3.346,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50932400932401,
+      "grad_norm": 0.372211754322052,
+      "learning_rate": 0.00046216093294460637,
+      "loss": 3.3468,
+      "step": 39500
+    },
+    {
+      "epoch": 11.523892773892774,
+      "grad_norm": 0.3634597063064575,
+      "learning_rate": 0.0004619860058309038,
+      "loss": 3.343,
+      "step": 39550
+    },
+    {
+      "epoch": 11.538461538461538,
+      "grad_norm": 0.3725431561470032,
+      "learning_rate": 0.0004618110787172012,
+      "loss": 3.3475,
+      "step": 39600
+    },
+    {
+      "epoch": 11.553030303030303,
+      "grad_norm": 0.3666999042034149,
+      "learning_rate": 0.0004616361516034985,
+      "loss": 3.3463,
+      "step": 39650
+    },
+    {
+      "epoch": 11.567599067599067,
+      "grad_norm": 0.33625391125679016,
+      "learning_rate": 0.00046146122448979587,
+      "loss": 3.3364,
+      "step": 39700
+    },
+    {
+      "epoch": 11.582167832167832,
+      "grad_norm": 0.35108792781829834,
+      "learning_rate": 0.00046128629737609325,
+      "loss": 3.3491,
+      "step": 39750
+    },
+    {
+      "epoch": 11.596736596736596,
+      "grad_norm": 0.36968687176704407,
+      "learning_rate": 0.0004611113702623906,
+      "loss": 3.3587,
+      "step": 39800
+    },
+    {
+      "epoch": 11.61130536130536,
+      "grad_norm": 0.37255340814590454,
+      "learning_rate": 0.000460936443148688,
+      "loss": 3.3613,
+      "step": 39850
+    },
+    {
+      "epoch": 11.625874125874127,
+      "grad_norm": 0.37071385979652405,
+      "learning_rate": 0.0004607615160349854,
+      "loss": 3.3637,
+      "step": 39900
+    },
+    {
+      "epoch": 11.640442890442891,
+      "grad_norm": 0.3244622051715851,
+      "learning_rate": 0.00046058658892128275,
+      "loss": 3.347,
+      "step": 39950
+    },
+    {
+      "epoch": 11.655011655011656,
+      "grad_norm": 0.33037108182907104,
+      "learning_rate": 0.0004604116618075802,
+      "loss": 3.352,
+      "step": 40000
+    },
+    {
+      "epoch": 11.655011655011656,
+      "eval_accuracy": 0.3698865318714751,
+      "eval_loss": 3.5570318698883057,
+      "eval_runtime": 179.9937,
+      "eval_samples_per_second": 92.459,
+      "eval_steps_per_second": 5.784,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171600,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.36123005550592e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}