diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,22181 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997401507125867,
+  "eval_steps": 100,
+  "global_step": 3126,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00031981450758560037,
+      "grad_norm": 0.5762398838996887,
+      "learning_rate": 0.0,
+      "loss": 11.9102,
+      "step": 1
+    },
+    {
+      "epoch": 0.0006396290151712007,
+      "grad_norm": 0.5624870657920837,
+      "learning_rate": 6.382978723404255e-06,
+      "loss": 11.9077,
+      "step": 2
+    },
+    {
+      "epoch": 0.0009594435227568011,
+      "grad_norm": 0.6053410768508911,
+      "learning_rate": 1.276595744680851e-05,
+      "loss": 11.9013,
+      "step": 3
+    },
+    {
+      "epoch": 0.0012792580303424015,
+      "grad_norm": 0.598536491394043,
+      "learning_rate": 1.9148936170212762e-05,
+      "loss": 11.8807,
+      "step": 4
+    },
+    {
+      "epoch": 0.0015990725379280018,
+      "grad_norm": 0.5812181234359741,
+      "learning_rate": 2.553191489361702e-05,
+      "loss": 11.848,
+      "step": 5
+    },
+    {
+      "epoch": 0.0019188870455136022,
+      "grad_norm": 0.5945044755935669,
+      "learning_rate": 3.1914893617021275e-05,
+      "loss": 11.8085,
+      "step": 6
+    },
+    {
+      "epoch": 0.0022387015530992023,
+      "grad_norm": 0.6150892376899719,
+      "learning_rate": 3.8297872340425525e-05,
+      "loss": 11.765,
+      "step": 7
+    },
+    {
+      "epoch": 0.002558516060684803,
+      "grad_norm": 0.6108999252319336,
+      "learning_rate": 4.468085106382978e-05,
+      "loss": 11.7116,
+      "step": 8
+    },
+    {
+      "epoch": 0.002878330568270403,
+      "grad_norm": 0.6705823540687561,
+      "learning_rate": 5.106382978723404e-05,
+      "loss": 11.644,
+      "step": 9
+    },
+    {
+      "epoch": 0.0031981450758560037,
+      "grad_norm": 0.7061710357666016,
+      "learning_rate": 5.7446808510638294e-05,
+      "loss": 11.5856,
+      "step": 10
+    },
+    {
+      "epoch": 0.003517959583441604,
+      "grad_norm": 0.7612898349761963,
+      "learning_rate": 6.382978723404255e-05,
+      "loss": 11.5082,
+      "step": 11
+    },
+    {
+      "epoch": 0.0038377740910272044,
+      "grad_norm": 0.9055056571960449,
+      "learning_rate": 7.02127659574468e-05,
+      "loss": 11.3855,
+      "step": 12
+    },
+    {
+      "epoch": 0.0041575885986128045,
+      "grad_norm": 0.9641930460929871,
+      "learning_rate": 7.659574468085105e-05,
+      "loss": 11.3033,
+      "step": 13
+    },
+    {
+      "epoch": 0.004477403106198405,
+      "grad_norm": 1.0467524528503418,
+      "learning_rate": 8.297872340425531e-05,
+      "loss": 11.2125,
+      "step": 14
+    },
+    {
+      "epoch": 0.004797217613784005,
+      "grad_norm": 1.1716750860214233,
+      "learning_rate": 8.936170212765956e-05,
+      "loss": 11.0821,
+      "step": 15
+    },
+    {
+      "epoch": 0.005117032121369606,
+      "grad_norm": 1.3222126960754395,
+      "learning_rate": 9.574468085106382e-05,
+      "loss": 10.9287,
+      "step": 16
+    },
+    {
+      "epoch": 0.005436846628955206,
+      "grad_norm": 1.4363603591918945,
+      "learning_rate": 0.00010212765957446807,
+      "loss": 10.8168,
+      "step": 17
+    },
+    {
+      "epoch": 0.005756661136540806,
+      "grad_norm": 1.5512793064117432,
+      "learning_rate": 0.00010851063829787234,
+      "loss": 10.6823,
+      "step": 18
+    },
+    {
+      "epoch": 0.006076475644126406,
+      "grad_norm": 1.624642014503479,
+      "learning_rate": 0.00011489361702127659,
+      "loss": 10.552,
+      "step": 19
+    },
+    {
+      "epoch": 0.006396290151712007,
+      "grad_norm": 1.6283295154571533,
+      "learning_rate": 0.00012127659574468084,
+      "loss": 10.4734,
+      "step": 20
+    },
+    {
+      "epoch": 0.0067161046592976075,
+      "grad_norm": 1.698269248008728,
+      "learning_rate": 0.0001276595744680851,
+      "loss": 10.3521,
+      "step": 21
+    },
+    {
+      "epoch": 0.007035919166883208,
+      "grad_norm": 1.7003437280654907,
+      "learning_rate": 0.00013404255319148935,
+      "loss": 10.2487,
+      "step": 22
+    },
+    {
+      "epoch": 0.007355733674468808,
+      "grad_norm": 1.6472309827804565,
+      "learning_rate": 0.0001404255319148936,
+      "loss": 10.1751,
+      "step": 23
+    },
+    {
+      "epoch": 0.007675548182054409,
+      "grad_norm": 1.6952213048934937,
+      "learning_rate": 0.00014680851063829785,
+      "loss": 10.0406,
+      "step": 24
+    },
+    {
+      "epoch": 0.007995362689640009,
+      "grad_norm": 1.6617240905761719,
+      "learning_rate": 0.0001531914893617021,
+      "loss": 9.9372,
+      "step": 25
+    },
+    {
+      "epoch": 0.008315177197225609,
+      "grad_norm": 1.6541690826416016,
+      "learning_rate": 0.00015957446808510637,
+      "loss": 9.8292,
+      "step": 26
+    },
+    {
+      "epoch": 0.00863499170481121,
+      "grad_norm": 1.5970962047576904,
+      "learning_rate": 0.00016595744680851062,
+      "loss": 9.7187,
+      "step": 27
+    },
+    {
+      "epoch": 0.00895480621239681,
+      "grad_norm": 1.5352132320404053,
+      "learning_rate": 0.0001723404255319149,
+      "loss": 9.6444,
+      "step": 28
+    },
+    {
+      "epoch": 0.00927462071998241,
+      "grad_norm": 1.5574132204055786,
+      "learning_rate": 0.00017872340425531912,
+      "loss": 9.5396,
+      "step": 29
+    },
+    {
+      "epoch": 0.00959443522756801,
+      "grad_norm": 1.7909637689590454,
+      "learning_rate": 0.0001851063829787234,
+      "loss": 9.4365,
+      "step": 30
+    },
+    {
+      "epoch": 0.009914249735153612,
+      "grad_norm": 2.3043689727783203,
+      "learning_rate": 0.00019148936170212765,
+      "loss": 9.3488,
+      "step": 31
+    },
+    {
+      "epoch": 0.010234064242739212,
+      "grad_norm": 1.7960251569747925,
+      "learning_rate": 0.00019787234042553187,
+      "loss": 9.215,
+      "step": 32
+    },
+    {
+      "epoch": 0.010553878750324812,
+      "grad_norm": 1.5585463047027588,
+      "learning_rate": 0.00020425531914893615,
+      "loss": 9.0985,
+      "step": 33
+    },
+    {
+      "epoch": 0.010873693257910412,
+      "grad_norm": 1.8078670501708984,
+      "learning_rate": 0.0002106382978723404,
+      "loss": 9.007,
+      "step": 34
+    },
+    {
+      "epoch": 0.011193507765496012,
+      "grad_norm": 1.4333995580673218,
+      "learning_rate": 0.00021702127659574468,
+      "loss": 8.8843,
+      "step": 35
+    },
+    {
+      "epoch": 0.011513322273081612,
+      "grad_norm": 1.474975347518921,
+      "learning_rate": 0.0002234042553191489,
+      "loss": 8.8019,
+      "step": 36
+    },
+    {
+      "epoch": 0.011833136780667212,
+      "grad_norm": 1.3332194089889526,
+      "learning_rate": 0.00022978723404255317,
+      "loss": 8.727,
+      "step": 37
+    },
+    {
+      "epoch": 0.012152951288252813,
+      "grad_norm": 1.425425410270691,
+      "learning_rate": 0.00023617021276595742,
+      "loss": 8.5894,
+      "step": 38
+    },
+    {
+      "epoch": 0.012472765795838415,
+      "grad_norm": 1.230981707572937,
+      "learning_rate": 0.00024255319148936167,
+      "loss": 8.5641,
+      "step": 39
+    },
+    {
+      "epoch": 0.012792580303424015,
+      "grad_norm": 1.2399401664733887,
+      "learning_rate": 0.0002489361702127659,
+      "loss": 8.3774,
+      "step": 40
+    },
+    {
+      "epoch": 0.013112394811009615,
+      "grad_norm": 1.1850652694702148,
+      "learning_rate": 0.0002553191489361702,
+      "loss": 8.3388,
+      "step": 41
+    },
+    {
+      "epoch": 0.013432209318595215,
+      "grad_norm": 1.085748314857483,
+      "learning_rate": 0.0002617021276595745,
+      "loss": 8.2185,
+      "step": 42
+    },
+    {
+      "epoch": 0.013752023826180815,
+      "grad_norm": 1.026831865310669,
+      "learning_rate": 0.0002680851063829787,
+      "loss": 8.1171,
+      "step": 43
+    },
+    {
+      "epoch": 0.014071838333766415,
+      "grad_norm": 1.0322154760360718,
+      "learning_rate": 0.000274468085106383,
+      "loss": 8.0373,
+      "step": 44
+    },
+    {
+      "epoch": 0.014391652841352015,
+      "grad_norm": 0.8840400576591492,
+      "learning_rate": 0.0002808510638297872,
+      "loss": 7.9007,
+      "step": 45
+    },
+    {
+      "epoch": 0.014711467348937616,
+      "grad_norm": 0.7530202269554138,
+      "learning_rate": 0.0002872340425531915,
+      "loss": 7.8504,
+      "step": 46
+    },
+    {
+      "epoch": 0.015031281856523216,
+      "grad_norm": 0.8164878487586975,
+      "learning_rate": 0.0002936170212765957,
+      "loss": 7.7963,
+      "step": 47
+    },
+    {
+      "epoch": 0.015351096364108818,
+      "grad_norm": 0.6531561613082886,
+      "learning_rate": 0.0003,
+      "loss": 7.8051,
+      "step": 48
+    },
+    {
+      "epoch": 0.015670910871694418,
+      "grad_norm": 0.5979849100112915,
+      "learning_rate": 0.0003063829787234042,
+      "loss": 7.6738,
+      "step": 49
+    },
+    {
+      "epoch": 0.015990725379280018,
+      "grad_norm": 0.7686684727668762,
+      "learning_rate": 0.0003127659574468085,
+      "loss": 7.6786,
+      "step": 50
+    },
+    {
+      "epoch": 0.016310539886865618,
+      "grad_norm": 0.7229753136634827,
+      "learning_rate": 0.00031914893617021275,
+      "loss": 7.6438,
+      "step": 51
+    },
+    {
+      "epoch": 0.016630354394451218,
+      "grad_norm": 0.40795406699180603,
+      "learning_rate": 0.00032553191489361697,
+      "loss": 7.6013,
+      "step": 52
+    },
+    {
+      "epoch": 0.01695016890203682,
+      "grad_norm": 0.7603433728218079,
+      "learning_rate": 0.00033191489361702125,
+      "loss": 7.5757,
+      "step": 53
+    },
+    {
+      "epoch": 0.01726998340962242,
+      "grad_norm": 0.5100083351135254,
+      "learning_rate": 0.00033829787234042547,
+      "loss": 7.5115,
+      "step": 54
+    },
+    {
+      "epoch": 0.01758979791720802,
+      "grad_norm": 0.42650240659713745,
+      "learning_rate": 0.0003446808510638298,
+      "loss": 7.4833,
+      "step": 55
+    },
+    {
+      "epoch": 0.01790961242479362,
+      "grad_norm": 0.4726926386356354,
+      "learning_rate": 0.000351063829787234,
+      "loss": 7.499,
+      "step": 56
+    },
+    {
+      "epoch": 0.01822942693237922,
+      "grad_norm": 0.6424689292907715,
+      "learning_rate": 0.00035744680851063825,
+      "loss": 7.3896,
+      "step": 57
+    },
+    {
+      "epoch": 0.01854924143996482,
+      "grad_norm": 0.969096839427948,
+      "learning_rate": 0.0003638297872340425,
+      "loss": 7.4446,
+      "step": 58
+    },
+    {
+      "epoch": 0.01886905594755042,
+      "grad_norm": 1.238357424736023,
+      "learning_rate": 0.0003702127659574468,
+      "loss": 7.4339,
+      "step": 59
+    },
+    {
+      "epoch": 0.01918887045513602,
+      "grad_norm": 0.5127294659614563,
+      "learning_rate": 0.000376595744680851,
+      "loss": 7.3979,
+      "step": 60
+    },
+    {
+      "epoch": 0.019508684962721623,
+      "grad_norm": 0.5544953942298889,
+      "learning_rate": 0.0003829787234042553,
+      "loss": 7.3483,
+      "step": 61
+    },
+    {
+      "epoch": 0.019828499470307223,
+      "grad_norm": 0.7398856282234192,
+      "learning_rate": 0.0003893617021276595,
+      "loss": 7.2954,
+      "step": 62
+    },
+    {
+      "epoch": 0.020148313977892823,
+      "grad_norm": 0.39174893498420715,
+      "learning_rate": 0.00039574468085106374,
+      "loss": 7.3479,
+      "step": 63
+    },
+    {
+      "epoch": 0.020468128485478423,
+      "grad_norm": 0.41279804706573486,
+      "learning_rate": 0.0004021276595744681,
+      "loss": 7.4516,
+      "step": 64
+    },
+    {
+      "epoch": 0.020787942993064024,
+      "grad_norm": 1.256068229675293,
+      "learning_rate": 0.0004085106382978723,
+      "loss": 7.3337,
+      "step": 65
+    },
+    {
+      "epoch": 0.021107757500649624,
+      "grad_norm": 0.7139375805854797,
+      "learning_rate": 0.0004148936170212766,
+      "loss": 7.2613,
+      "step": 66
+    },
+    {
+      "epoch": 0.021427572008235224,
+      "grad_norm": 0.6181330680847168,
+      "learning_rate": 0.0004212765957446808,
+      "loss": 7.3833,
+      "step": 67
+    },
+    {
+      "epoch": 0.021747386515820824,
+      "grad_norm": 0.6431315541267395,
+      "learning_rate": 0.0004276595744680851,
+      "loss": 7.3363,
+      "step": 68
+    },
+    {
+      "epoch": 0.022067201023406424,
+      "grad_norm": 0.5013834834098816,
+      "learning_rate": 0.00043404255319148935,
+      "loss": 7.2818,
+      "step": 69
+    },
+    {
+      "epoch": 0.022387015530992024,
+      "grad_norm": 0.331126868724823,
+      "learning_rate": 0.00044042553191489357,
+      "loss": 7.3045,
+      "step": 70
+    },
+    {
+      "epoch": 0.022706830038577624,
+      "grad_norm": 0.46566659212112427,
+      "learning_rate": 0.0004468085106382978,
+      "loss": 7.2218,
+      "step": 71
+    },
+    {
+      "epoch": 0.023026644546163225,
+      "grad_norm": 0.3669161796569824,
+      "learning_rate": 0.0004531914893617021,
+      "loss": 7.1607,
+      "step": 72
+    },
+    {
+      "epoch": 0.023346459053748825,
+      "grad_norm": 0.35284051299095154,
+      "learning_rate": 0.00045957446808510635,
+      "loss": 7.1419,
+      "step": 73
+    },
+    {
+      "epoch": 0.023666273561334425,
+      "grad_norm": 0.3425213396549225,
+      "learning_rate": 0.00046595744680851057,
+      "loss": 7.1929,
+      "step": 74
+    },
+    {
+      "epoch": 0.023986088068920025,
+      "grad_norm": 0.40044257044792175,
+      "learning_rate": 0.00047234042553191485,
+      "loss": 7.163,
+      "step": 75
+    },
+    {
+      "epoch": 0.024305902576505625,
+      "grad_norm": 0.35946935415267944,
+      "learning_rate": 0.0004787234042553191,
+      "loss": 7.0943,
+      "step": 76
+    },
+    {
+      "epoch": 0.024625717084091225,
+      "grad_norm": 0.3463836908340454,
+      "learning_rate": 0.00048510638297872335,
+      "loss": 7.0932,
+      "step": 77
+    },
+    {
+      "epoch": 0.02494553159167683,
+      "grad_norm": 0.37853115797042847,
+      "learning_rate": 0.0004914893617021277,
+      "loss": 7.1608,
+      "step": 78
+    },
+    {
+      "epoch": 0.02526534609926243,
+      "grad_norm": 0.9995279908180237,
+      "learning_rate": 0.0004978723404255318,
+      "loss": 7.0951,
+      "step": 79
+    },
+    {
+      "epoch": 0.02558516060684803,
+      "grad_norm": 0.4370875358581543,
+      "learning_rate": 0.0005042553191489361,
+      "loss": 7.0864,
+      "step": 80
+    },
+    {
+      "epoch": 0.02590497511443363,
+      "grad_norm": 0.39335861802101135,
+      "learning_rate": 0.0005106382978723404,
+      "loss": 7.1225,
+      "step": 81
+    },
+    {
+      "epoch": 0.02622478962201923,
+      "grad_norm": 0.5575166940689087,
+      "learning_rate": 0.0005170212765957446,
+      "loss": 7.0128,
+      "step": 82
+    },
+    {
+      "epoch": 0.02654460412960483,
+      "grad_norm": 0.382291704416275,
+      "learning_rate": 0.000523404255319149,
+      "loss": 7.0568,
+      "step": 83
+    },
+    {
+      "epoch": 0.02686441863719043,
+      "grad_norm": 0.37663501501083374,
+      "learning_rate": 0.0005297872340425531,
+      "loss": 7.0283,
+      "step": 84
+    },
+    {
+      "epoch": 0.02718423314477603,
+      "grad_norm": 0.324942022562027,
+      "learning_rate": 0.0005361702127659574,
+      "loss": 6.9855,
+      "step": 85
+    },
+    {
+      "epoch": 0.02750404765236163,
+      "grad_norm": 0.3324145972728729,
+      "learning_rate": 0.0005425531914893617,
+      "loss": 6.9152,
+      "step": 86
+    },
+    {
+      "epoch": 0.02782386215994723,
+      "grad_norm": 0.29377809166908264,
+      "learning_rate": 0.000548936170212766,
+      "loss": 6.9509,
+      "step": 87
+    },
+    {
+      "epoch": 0.02814367666753283,
+      "grad_norm": 0.30234116315841675,
+      "learning_rate": 0.0005553191489361701,
+      "loss": 6.9518,
+      "step": 88
+    },
+    {
+      "epoch": 0.02846349117511843,
+      "grad_norm": 0.42053472995758057,
+      "learning_rate": 0.0005617021276595744,
+      "loss": 6.8915,
+      "step": 89
+    },
+    {
+      "epoch": 0.02878330568270403,
+      "grad_norm": 0.3598235249519348,
+      "learning_rate": 0.0005680851063829787,
+      "loss": 6.9226,
+      "step": 90
+    },
+    {
+      "epoch": 0.02910312019028963,
+      "grad_norm": 0.35407304763793945,
+      "learning_rate": 0.000574468085106383,
+      "loss": 6.901,
+      "step": 91
+    },
+    {
+      "epoch": 0.02942293469787523,
+      "grad_norm": 0.29050391912460327,
+      "learning_rate": 0.0005808510638297872,
+      "loss": 6.816,
+      "step": 92
+    },
+    {
+      "epoch": 0.02974274920546083,
+      "grad_norm": 0.32192665338516235,
+      "learning_rate": 0.0005872340425531914,
+      "loss": 6.796,
+      "step": 93
+    },
+    {
+      "epoch": 0.03006256371304643,
+      "grad_norm": 0.3513169288635254,
+      "learning_rate": 0.0005936170212765957,
+      "loss": 6.8849,
+      "step": 94
+    },
+    {
+      "epoch": 0.030382378220632035,
+      "grad_norm": 0.4323410987854004,
+      "learning_rate": 0.0006,
+      "loss": 6.8101,
+      "step": 95
+    },
+    {
+      "epoch": 0.030702192728217635,
+      "grad_norm": 0.2884387671947479,
+      "learning_rate": 0.0005999998389604413,
+      "loss": 6.8199,
+      "step": 96
+    },
+    {
+      "epoch": 0.031022007235803235,
+      "grad_norm": 0.3581418991088867,
+      "learning_rate": 0.0005999993558419382,
+      "loss": 6.8157,
+      "step": 97
+    },
+    {
+      "epoch": 0.031341821743388835,
+      "grad_norm": 0.3392150402069092,
+      "learning_rate": 0.0005999985506450094,
+      "loss": 6.8469,
+      "step": 98
+    },
+    {
+      "epoch": 0.031661636250974436,
+      "grad_norm": 0.3693081736564636,
+      "learning_rate": 0.0005999974233705192,
+      "loss": 6.8005,
+      "step": 99
+    },
+    {
+      "epoch": 0.031981450758560036,
+      "grad_norm": 0.3570478856563568,
+      "learning_rate": 0.000599995974019678,
+      "loss": 6.8532,
+      "step": 100
+    },
+    {
+      "epoch": 0.031981450758560036,
+      "eval_loss": 6.748331546783447,
+      "eval_runtime": 76.957,
+      "eval_samples_per_second": 24.65,
+      "eval_steps_per_second": 6.172,
+      "step": 100
+    },
+    {
+      "epoch": 0.032301265266145636,
+      "grad_norm": 0.41135793924331665,
+      "learning_rate": 0.0005999942025940418,
+      "loss": 6.7633,
+      "step": 101
+    },
+    {
+      "epoch": 0.032621079773731236,
+      "grad_norm": 0.3616463840007782,
+      "learning_rate": 0.0005999921090955123,
+      "loss": 6.8156,
+      "step": 102
+    },
+    {
+      "epoch": 0.032940894281316836,
+      "grad_norm": 0.3172069787979126,
+      "learning_rate": 0.0005999896935263372,
+      "loss": 6.7868,
+      "step": 103
+    },
+    {
+      "epoch": 0.033260708788902436,
+      "grad_norm": 0.5328623652458191,
+      "learning_rate": 0.0005999869558891097,
+      "loss": 6.8434,
+      "step": 104
+    },
+    {
+      "epoch": 0.033580523296488037,
+      "grad_norm": 0.37089961767196655,
+      "learning_rate": 0.000599983896186769,
+      "loss": 6.7699,
+      "step": 105
+    },
+    {
+      "epoch": 0.03390033780407364,
+      "grad_norm": 0.4071977138519287,
+      "learning_rate": 0.0005999805144226,
+      "loss": 6.7143,
+      "step": 106
+    },
+    {
+      "epoch": 0.03422015231165924,
+      "grad_norm": 0.33601731061935425,
+      "learning_rate": 0.0005999768106002334,
+      "loss": 6.7098,
+      "step": 107
+    },
+    {
+      "epoch": 0.03453996681924484,
+      "grad_norm": 0.34338271617889404,
+      "learning_rate": 0.0005999727847236454,
+      "loss": 6.7295,
+      "step": 108
+    },
+    {
+      "epoch": 0.03485978132683044,
+      "grad_norm": 0.3717418611049652,
+      "learning_rate": 0.0005999684367971584,
+      "loss": 6.6593,
+      "step": 109
+    },
+    {
+      "epoch": 0.03517959583441604,
+      "grad_norm": 0.41923823952674866,
+      "learning_rate": 0.0005999637668254403,
+      "loss": 6.6612,
+      "step": 110
+    },
+    {
+      "epoch": 0.03549941034200164,
+      "grad_norm": 0.3013748824596405,
+      "learning_rate": 0.0005999587748135047,
+      "loss": 6.5929,
+      "step": 111
+    },
+    {
+      "epoch": 0.03581922484958724,
+      "grad_norm": 0.4821736216545105,
+      "learning_rate": 0.000599953460766711,
+      "loss": 6.6543,
+      "step": 112
+    },
+    {
+      "epoch": 0.03613903935717284,
+      "grad_norm": 0.32597431540489197,
+      "learning_rate": 0.0005999478246907643,
+      "loss": 6.7071,
+      "step": 113
+    },
+    {
+      "epoch": 0.03645885386475844,
+      "grad_norm": 0.33268311619758606,
+      "learning_rate": 0.0005999418665917157,
+      "loss": 6.6091,
+      "step": 114
+    },
+    {
+      "epoch": 0.03677866837234404,
+      "grad_norm": 0.30232611298561096,
+      "learning_rate": 0.0005999355864759614,
+      "loss": 6.6904,
+      "step": 115
+    },
+    {
+      "epoch": 0.03709848287992964,
+      "grad_norm": 0.3010026812553406,
+      "learning_rate": 0.0005999289843502441,
+      "loss": 6.6532,
+      "step": 116
+    },
+    {
+      "epoch": 0.03741829738751524,
+      "grad_norm": 0.34711113572120667,
+      "learning_rate": 0.0005999220602216517,
+      "loss": 6.5868,
+      "step": 117
+    },
+    {
+      "epoch": 0.03773811189510084,
+      "grad_norm": 0.4245126247406006,
+      "learning_rate": 0.0005999148140976179,
+      "loss": 6.6374,
+      "step": 118
+    },
+    {
+      "epoch": 0.03805792640268644,
+      "grad_norm": 0.43401578068733215,
+      "learning_rate": 0.0005999072459859221,
+      "loss": 6.5992,
+      "step": 119
+    },
+    {
+      "epoch": 0.03837774091027204,
+      "grad_norm": 0.4109058976173401,
+      "learning_rate": 0.0005998993558946892,
+      "loss": 6.5269,
+      "step": 120
+    },
+    {
+      "epoch": 0.038697555417857646,
+      "grad_norm": 0.3726668059825897,
+      "learning_rate": 0.0005998911438323904,
+      "loss": 6.6078,
+      "step": 121
+    },
+    {
+      "epoch": 0.039017369925443246,
+      "grad_norm": 0.3629209101200104,
+      "learning_rate": 0.000599882609807842,
+      "loss": 6.5175,
+      "step": 122
+    },
+    {
+      "epoch": 0.039337184433028846,
+      "grad_norm": 0.4317244291305542,
+      "learning_rate": 0.000599873753830206,
+      "loss": 6.5939,
+      "step": 123
+    },
+    {
+      "epoch": 0.039656998940614446,
+      "grad_norm": 0.2928583025932312,
+      "learning_rate": 0.0005998645759089901,
+      "loss": 6.5584,
+      "step": 124
+    },
+    {
+      "epoch": 0.039976813448200046,
+      "grad_norm": 0.3732014000415802,
+      "learning_rate": 0.0005998550760540478,
+      "loss": 6.5541,
+      "step": 125
+    },
+    {
+      "epoch": 0.04029662795578565,
+      "grad_norm": 0.27851733565330505,
+      "learning_rate": 0.000599845254275578,
+      "loss": 6.4988,
+      "step": 126
+    },
+    {
+      "epoch": 0.04061644246337125,
+      "grad_norm": 0.3921887278556824,
+      "learning_rate": 0.0005998351105841257,
+      "loss": 6.6184,
+      "step": 127
+    },
+    {
+      "epoch": 0.04093625697095685,
+      "grad_norm": 0.3566243052482605,
+      "learning_rate": 0.0005998246449905807,
+      "loss": 6.503,
+      "step": 128
+    },
+    {
+      "epoch": 0.04125607147854245,
+      "grad_norm": 0.39187195897102356,
+      "learning_rate": 0.0005998138575061791,
+      "loss": 6.4724,
+      "step": 129
+    },
+    {
+      "epoch": 0.04157588598612805,
+      "grad_norm": 0.3276127874851227,
+      "learning_rate": 0.000599802748142502,
+      "loss": 6.5288,
+      "step": 130
+    },
+    {
+      "epoch": 0.04189570049371365,
+      "grad_norm": 0.39741045236587524,
+      "learning_rate": 0.0005997913169114768,
+      "loss": 6.4702,
+      "step": 131
+    },
+    {
+      "epoch": 0.04221551500129925,
+      "grad_norm": 0.4539431035518646,
+      "learning_rate": 0.0005997795638253759,
+      "loss": 6.4902,
+      "step": 132
+    },
+    {
+      "epoch": 0.04253532950888485,
+      "grad_norm": 0.41237616539001465,
+      "learning_rate": 0.0005997674888968171,
+      "loss": 6.5647,
+      "step": 133
+    },
+    {
+      "epoch": 0.04285514401647045,
+      "grad_norm": 0.3613832890987396,
+      "learning_rate": 0.0005997550921387643,
+      "loss": 6.4609,
+      "step": 134
+    },
+    {
+      "epoch": 0.04317495852405605,
+      "grad_norm": 0.33870750665664673,
+      "learning_rate": 0.0005997423735645265,
+      "loss": 6.4513,
+      "step": 135
+    },
+    {
+      "epoch": 0.04349477303164165,
+      "grad_norm": 0.3336218297481537,
+      "learning_rate": 0.0005997293331877584,
+      "loss": 6.4568,
+      "step": 136
+    },
+    {
+      "epoch": 0.04381458753922725,
+      "grad_norm": 0.3454459309577942,
+      "learning_rate": 0.0005997159710224602,
+      "loss": 6.4306,
+      "step": 137
+    },
+    {
+      "epoch": 0.04413440204681285,
+      "grad_norm": 0.35895201563835144,
+      "learning_rate": 0.0005997022870829771,
+      "loss": 6.411,
+      "step": 138
+    },
+    {
+      "epoch": 0.04445421655439845,
+      "grad_norm": 0.335130900144577,
+      "learning_rate": 0.0005996882813840005,
+      "loss": 6.3966,
+      "step": 139
+    },
+    {
+      "epoch": 0.04477403106198405,
+      "grad_norm": 0.29891467094421387,
+      "learning_rate": 0.0005996739539405668,
+      "loss": 6.4543,
+      "step": 140
+    },
+    {
+      "epoch": 0.04509384556956965,
+      "grad_norm": 0.3919838070869446,
+      "learning_rate": 0.0005996593047680579,
+      "loss": 6.3899,
+      "step": 141
+    },
+    {
+      "epoch": 0.04541366007715525,
+      "grad_norm": 0.43485793471336365,
+      "learning_rate": 0.0005996443338822011,
+      "loss": 6.473,
+      "step": 142
+    },
+    {
+      "epoch": 0.04573347458474085,
+      "grad_norm": 0.5233163833618164,
+      "learning_rate": 0.000599629041299069,
+      "loss": 6.4152,
+      "step": 143
+    },
+    {
+      "epoch": 0.04605328909232645,
+      "grad_norm": 0.38263949751853943,
+      "learning_rate": 0.0005996134270350797,
+      "loss": 6.4465,
+      "step": 144
+    },
+    {
+      "epoch": 0.04637310359991205,
+      "grad_norm": 0.38162732124328613,
+      "learning_rate": 0.0005995974911069968,
+      "loss": 6.3812,
+      "step": 145
+    },
+    {
+      "epoch": 0.04669291810749765,
+      "grad_norm": 0.4471980333328247,
+      "learning_rate": 0.0005995812335319289,
+      "loss": 6.4205,
+      "step": 146
+    },
+    {
+      "epoch": 0.04701273261508325,
+      "grad_norm": 0.3626723885536194,
+      "learning_rate": 0.0005995646543273301,
+      "loss": 6.3803,
+      "step": 147
+    },
+    {
+      "epoch": 0.04733254712266885,
+      "grad_norm": 0.37030258774757385,
+      "learning_rate": 0.0005995477535109998,
+      "loss": 6.4335,
+      "step": 148
+    },
+    {
+      "epoch": 0.04765236163025445,
+      "grad_norm": 0.30875375866889954,
+      "learning_rate": 0.0005995305311010826,
+      "loss": 6.386,
+      "step": 149
+    },
+    {
+      "epoch": 0.04797217613784005,
+      "grad_norm": 0.41564837098121643,
+      "learning_rate": 0.0005995129871160688,
+      "loss": 6.3846,
+      "step": 150
+    },
+    {
+      "epoch": 0.04829199064542565,
+      "grad_norm": 0.3652048110961914,
+      "learning_rate": 0.000599495121574793,
+      "loss": 6.4395,
+      "step": 151
+    },
+    {
+      "epoch": 0.04861180515301125,
+      "grad_norm": 0.33011215925216675,
+      "learning_rate": 0.0005994769344964359,
+      "loss": 6.4096,
+      "step": 152
+    },
+    {
+      "epoch": 0.04893161966059685,
+      "grad_norm": 0.28985321521759033,
+      "learning_rate": 0.0005994584259005232,
+      "loss": 6.3923,
+      "step": 153
+    },
+    {
+      "epoch": 0.04925143416818245,
+      "grad_norm": 0.32829374074935913,
+      "learning_rate": 0.0005994395958069254,
+      "loss": 6.351,
+      "step": 154
+    },
+    {
+      "epoch": 0.04957124867576806,
+      "grad_norm": 0.32175174355506897,
+      "learning_rate": 0.0005994204442358586,
+      "loss": 6.3592,
+      "step": 155
+    },
+    {
+      "epoch": 0.04989106318335366,
+      "grad_norm": 0.3477462828159332,
+      "learning_rate": 0.0005994009712078839,
+      "loss": 6.3272,
+      "step": 156
+    },
+    {
+      "epoch": 0.05021087769093926,
+      "grad_norm": 0.31097620725631714,
+      "learning_rate": 0.0005993811767439074,
+      "loss": 6.358,
+      "step": 157
+    },
+    {
+      "epoch": 0.05053069219852486,
+      "grad_norm": 0.3622291386127472,
+      "learning_rate": 0.0005993610608651804,
+      "loss": 6.3131,
+      "step": 158
+    },
+    {
+      "epoch": 0.05085050670611046,
+      "grad_norm": 0.3798142373561859,
+      "learning_rate": 0.0005993406235932992,
+      "loss": 6.3733,
+      "step": 159
+    },
+    {
+      "epoch": 0.05117032121369606,
+      "grad_norm": 0.3285475969314575,
+      "learning_rate": 0.0005993198649502054,
+      "loss": 6.3525,
+      "step": 160
+    },
+    {
+      "epoch": 0.05149013572128166,
+      "grad_norm": 0.3842519223690033,
+      "learning_rate": 0.0005992987849581852,
+      "loss": 6.3819,
+      "step": 161
+    },
+    {
+      "epoch": 0.05180995022886726,
+      "grad_norm": 0.38792696595191956,
+      "learning_rate": 0.00059927738363987,
+      "loss": 6.383,
+      "step": 162
+    },
+    {
+      "epoch": 0.05212976473645286,
+      "grad_norm": 0.3751309812068939,
+      "learning_rate": 0.0005992556610182364,
+      "loss": 6.3039,
+      "step": 163
+    },
+    {
+      "epoch": 0.05244957924403846,
+      "grad_norm": 0.4142090976238251,
+      "learning_rate": 0.0005992336171166056,
+      "loss": 6.3056,
+      "step": 164
+    },
+    {
+      "epoch": 0.05276939375162406,
+      "grad_norm": 0.6167832016944885,
+      "learning_rate": 0.0005992112519586438,
+      "loss": 6.2986,
+      "step": 165
+    },
+    {
+      "epoch": 0.05308920825920966,
+      "grad_norm": 0.5728801488876343,
+      "learning_rate": 0.0005991885655683624,
+      "loss": 6.2972,
+      "step": 166
+    },
+    {
+      "epoch": 0.05340902276679526,
+      "grad_norm": 0.3504704236984253,
+      "learning_rate": 0.0005991655579701171,
+      "loss": 6.1997,
+      "step": 167
+    },
+    {
+      "epoch": 0.05372883727438086,
+      "grad_norm": 0.4289745092391968,
+      "learning_rate": 0.0005991422291886092,
+      "loss": 6.2569,
+      "step": 168
+    },
+    {
+      "epoch": 0.05404865178196646,
+      "grad_norm": 0.3686276972293854,
+      "learning_rate": 0.000599118579248884,
+      "loss": 6.2246,
+      "step": 169
+    },
+    {
+      "epoch": 0.05436846628955206,
+      "grad_norm": 0.3959043323993683,
+      "learning_rate": 0.0005990946081763324,
+      "loss": 6.3558,
+      "step": 170
+    },
+    {
+      "epoch": 0.05468828079713766,
+      "grad_norm": 0.31661829352378845,
+      "learning_rate": 0.0005990703159966894,
+      "loss": 6.2752,
+      "step": 171
+    },
+    {
+      "epoch": 0.05500809530472326,
+      "grad_norm": 0.3865572512149811,
+      "learning_rate": 0.000599045702736035,
+      "loss": 6.2855,
+      "step": 172
+    },
+    {
+      "epoch": 0.05532790981230886,
+      "grad_norm": 0.31952956318855286,
+      "learning_rate": 0.000599020768420794,
+      "loss": 6.2626,
+      "step": 173
+    },
+    {
+      "epoch": 0.05564772431989446,
+      "grad_norm": 0.40415337681770325,
+      "learning_rate": 0.000598995513077736,
+      "loss": 6.3349,
+      "step": 174
+    },
+    {
+      "epoch": 0.05596753882748006,
+      "grad_norm": 0.39738693833351135,
+      "learning_rate": 0.0005989699367339748,
+      "loss": 6.358,
+      "step": 175
+    },
+    {
+      "epoch": 0.05628735333506566,
+      "grad_norm": 0.34964892268180847,
+      "learning_rate": 0.0005989440394169692,
+      "loss": 6.2258,
+      "step": 176
+    },
+    {
+      "epoch": 0.05660716784265126,
+      "grad_norm": 0.3460441529750824,
+      "learning_rate": 0.0005989178211545223,
+      "loss": 6.2828,
+      "step": 177
+    },
+    {
+      "epoch": 0.05692698235023686,
+      "grad_norm": 0.38462570309638977,
+      "learning_rate": 0.0005988912819747822,
+      "loss": 6.2042,
+      "step": 178
+    },
+    {
+      "epoch": 0.05724679685782246,
+      "grad_norm": 0.35884854197502136,
+      "learning_rate": 0.0005988644219062412,
+      "loss": 6.2393,
+      "step": 179
+    },
+    {
+      "epoch": 0.05756661136540806,
+      "grad_norm": 0.3842756748199463,
+      "learning_rate": 0.0005988372409777362,
+      "loss": 6.1717,
+      "step": 180
+    },
+    {
+      "epoch": 0.05788642587299366,
+      "grad_norm": 0.3338313698768616,
+      "learning_rate": 0.0005988097392184486,
+      "loss": 6.2315,
+      "step": 181
+    },
+    {
+      "epoch": 0.05820624038057926,
+      "grad_norm": 0.36342915892601013,
+      "learning_rate": 0.000598781916657904,
+      "loss": 6.2506,
+      "step": 182
+    },
+    {
+      "epoch": 0.05852605488816486,
+      "grad_norm": 0.3592490255832672,
+      "learning_rate": 0.0005987537733259729,
+      "loss": 6.2037,
+      "step": 183
+    },
+    {
+      "epoch": 0.05884586939575046,
+      "grad_norm": 0.40599575638771057,
+      "learning_rate": 0.0005987253092528697,
+      "loss": 6.1787,
+      "step": 184
+    },
+    {
+      "epoch": 0.05916568390333606,
+      "grad_norm": 0.38951805233955383,
+      "learning_rate": 0.0005986965244691533,
+      "loss": 6.1482,
+      "step": 185
+    },
+    {
+      "epoch": 0.05948549841092166,
+      "grad_norm": 0.3523838520050049,
+      "learning_rate": 0.0005986674190057274,
+      "loss": 6.1634,
+      "step": 186
+    },
+    {
+      "epoch": 0.05980531291850726,
+      "grad_norm": 0.3295622766017914,
+      "learning_rate": 0.0005986379928938389,
+      "loss": 6.1444,
+      "step": 187
+    },
+    {
+      "epoch": 0.06012512742609286,
+      "grad_norm": 0.3437183201313019,
+      "learning_rate": 0.0005986082461650801,
+      "loss": 6.1733,
+      "step": 188
+    },
+    {
+      "epoch": 0.06044494193367847,
+      "grad_norm": 0.4113365709781647,
+      "learning_rate": 0.0005985781788513867,
+      "loss": 6.1967,
+      "step": 189
+    },
+    {
+      "epoch": 0.06076475644126407,
+      "grad_norm": 0.3398386240005493,
+      "learning_rate": 0.000598547790985039,
+      "loss": 6.1989,
+      "step": 190
+    },
+    {
+      "epoch": 0.06108457094884967,
+      "grad_norm": 0.3166162669658661,
+      "learning_rate": 0.0005985170825986613,
+      "loss": 6.1402,
+      "step": 191
+    },
+    {
+      "epoch": 0.06140438545643527,
+      "grad_norm": 0.4324597120285034,
+      "learning_rate": 0.000598486053725222,
+      "loss": 6.2131,
+      "step": 192
+    },
+    {
+      "epoch": 0.06172419996402087,
+      "grad_norm": 0.4721795320510864,
+      "learning_rate": 0.0005984547043980338,
+      "loss": 6.1555,
+      "step": 193
+    },
+    {
+      "epoch": 0.06204401447160647,
+      "grad_norm": 0.4092848002910614,
+      "learning_rate": 0.0005984230346507529,
+      "loss": 6.1317,
+      "step": 194
+    },
+    {
+      "epoch": 0.06236382897919207,
+      "grad_norm": 0.3960781991481781,
+      "learning_rate": 0.0005983910445173802,
+      "loss": 6.1638,
+      "step": 195
+    },
+    {
+      "epoch": 0.06268364348677767,
+      "grad_norm": 0.4284694492816925,
+      "learning_rate": 0.00059835873403226,
+      "loss": 6.1422,
+      "step": 196
+    },
+    {
+      "epoch": 0.06300345799436327,
+      "grad_norm": 0.3202615976333618,
+      "learning_rate": 0.0005983261032300807,
+      "loss": 6.0783,
+      "step": 197
+    },
+    {
+      "epoch": 0.06332327250194887,
+      "grad_norm": 0.40215131640434265,
+      "learning_rate": 0.0005982931521458747,
+      "loss": 6.0568,
+      "step": 198
+    },
+    {
+      "epoch": 0.06364308700953447,
+      "grad_norm": 0.36528280377388,
+      "learning_rate": 0.0005982598808150184,
+      "loss": 6.2003,
+      "step": 199
+    },
+    {
+      "epoch": 0.06396290151712007,
+      "grad_norm": 0.3844723701477051,
+      "learning_rate": 0.0005982262892732315,
+      "loss": 6.1754,
+      "step": 200
+    },
+    {
+      "epoch": 0.06396290151712007,
+      "eval_loss": 6.124255180358887,
+      "eval_runtime": 79.4151,
+      "eval_samples_per_second": 23.887,
+      "eval_steps_per_second": 5.981,
+      "step": 200
+    },
+    {
+      "epoch": 0.06428271602470567,
+      "grad_norm": 0.3817574679851532,
+      "learning_rate": 0.000598192377556578,
+      "loss": 6.1477,
+      "step": 201
+    },
+    {
+      "epoch": 0.06460253053229127,
+      "grad_norm": 0.36154475808143616,
+      "learning_rate": 0.0005981581457014652,
+      "loss": 6.1077,
+      "step": 202
+    },
+    {
+      "epoch": 0.06492234503987687,
+      "grad_norm": 0.33983469009399414,
+      "learning_rate": 0.0005981235937446446,
+      "loss": 6.1379,
+      "step": 203
+    },
+    {
+      "epoch": 0.06524215954746247,
+      "grad_norm": 0.3408897817134857,
+      "learning_rate": 0.0005980887217232107,
+      "loss": 6.1238,
+      "step": 204
+    },
+    {
+      "epoch": 0.06556197405504807,
+      "grad_norm": 0.3715430200099945,
+      "learning_rate": 0.0005980535296746023,
+      "loss": 6.1099,
+      "step": 205
+    },
+    {
+      "epoch": 0.06588178856263367,
+      "grad_norm": 0.39493903517723083,
+      "learning_rate": 0.0005980180176366013,
+      "loss": 6.1089,
+      "step": 206
+    },
+    {
+      "epoch": 0.06620160307021927,
+      "grad_norm": 0.35184791684150696,
+      "learning_rate": 0.0005979821856473336,
+      "loss": 6.0774,
+      "step": 207
+    },
+    {
+      "epoch": 0.06652141757780487,
+      "grad_norm": 0.3386211097240448,
+      "learning_rate": 0.0005979460337452681,
+      "loss": 6.0681,
+      "step": 208
+    },
+    {
+      "epoch": 0.06684123208539047,
+      "grad_norm": 0.34366345405578613,
+      "learning_rate": 0.0005979095619692172,
+      "loss": 6.0457,
+      "step": 209
+    },
+    {
+      "epoch": 0.06716104659297607,
+      "grad_norm": 0.4233472943305969,
+      "learning_rate": 0.0005978727703583374,
+      "loss": 6.0769,
+      "step": 210
+    },
+    {
+      "epoch": 0.06748086110056167,
+      "grad_norm": 0.38509127497673035,
+      "learning_rate": 0.0005978356589521276,
+      "loss": 6.0947,
+      "step": 211
+    },
+    {
+      "epoch": 0.06780067560814727,
+      "grad_norm": 0.3696242570877075,
+      "learning_rate": 0.0005977982277904306,
+      "loss": 6.0721,
+      "step": 212
+    },
+    {
+      "epoch": 0.06812049011573287,
+      "grad_norm": 0.4310404658317566,
+      "learning_rate": 0.0005977604769134325,
+      "loss": 6.0199,
+      "step": 213
+    },
+    {
+      "epoch": 0.06844030462331847,
+      "grad_norm": 0.47304555773735046,
+      "learning_rate": 0.0005977224063616625,
+      "loss": 6.115,
+      "step": 214
+    },
+    {
+      "epoch": 0.06876011913090407,
+      "grad_norm": 0.3975367844104767,
+      "learning_rate": 0.0005976840161759931,
+      "loss": 6.0713,
+      "step": 215
+    },
+    {
+      "epoch": 0.06907993363848967,
+      "grad_norm": 0.35958242416381836,
+      "learning_rate": 0.0005976453063976396,
+      "loss": 6.1155,
+      "step": 216
+    },
+    {
+      "epoch": 0.06939974814607527,
+      "grad_norm": 0.3872879147529602,
+      "learning_rate": 0.000597606277068161,
+      "loss": 6.0932,
+      "step": 217
+    },
+    {
+      "epoch": 0.06971956265366087,
+      "grad_norm": 0.4409867525100708,
+      "learning_rate": 0.000597566928229459,
+      "loss": 6.062,
+      "step": 218
+    },
+    {
+      "epoch": 0.07003937716124647,
+      "grad_norm": 0.39331111311912537,
+      "learning_rate": 0.0005975272599237784,
+      "loss": 5.9937,
+      "step": 219
+    },
+    {
+      "epoch": 0.07035919166883207,
+      "grad_norm": 0.42473429441452026,
+      "learning_rate": 0.0005974872721937069,
+      "loss": 6.038,
+      "step": 220
+    },
+    {
+      "epoch": 0.07067900617641767,
+      "grad_norm": 0.5901650190353394,
+      "learning_rate": 0.0005974469650821753,
+      "loss": 6.0971,
+      "step": 221
+    },
+    {
+      "epoch": 0.07099882068400327,
+      "grad_norm": 0.4795404374599457,
+      "learning_rate": 0.0005974063386324571,
+      "loss": 6.0316,
+      "step": 222
+    },
+    {
+      "epoch": 0.07131863519158887,
+      "grad_norm": 0.4145892858505249,
+      "learning_rate": 0.0005973653928881688,
+      "loss": 5.9712,
+      "step": 223
+    },
+    {
+      "epoch": 0.07163844969917448,
+      "grad_norm": 0.36704781651496887,
+      "learning_rate": 0.0005973241278932695,
+      "loss": 6.0002,
+      "step": 224
+    },
+    {
+      "epoch": 0.07195826420676008,
+      "grad_norm": 0.3889116048812866,
+      "learning_rate": 0.0005972825436920615,
+      "loss": 5.9938,
+      "step": 225
+    },
+    {
+      "epoch": 0.07227807871434568,
+      "grad_norm": 0.39109617471694946,
+      "learning_rate": 0.0005972406403291893,
+      "loss": 6.0182,
+      "step": 226
+    },
+    {
+      "epoch": 0.07259789322193128,
+      "grad_norm": 0.3956647217273712,
+      "learning_rate": 0.00059719841784964,
+      "loss": 6.0772,
+      "step": 227
+    },
+    {
+      "epoch": 0.07291770772951688,
+      "grad_norm": 0.3841460943222046,
+      "learning_rate": 0.0005971558762987439,
+      "loss": 5.9538,
+      "step": 228
+    },
+    {
+      "epoch": 0.07323752223710248,
+      "grad_norm": 0.3795830011367798,
+      "learning_rate": 0.0005971130157221733,
+      "loss": 6.022,
+      "step": 229
+    },
+    {
+      "epoch": 0.07355733674468808,
+      "grad_norm": 0.3490051031112671,
+      "learning_rate": 0.0005970698361659431,
+      "loss": 6.0195,
+      "step": 230
+    },
+    {
+      "epoch": 0.07387715125227368,
+      "grad_norm": 0.3971054255962372,
+      "learning_rate": 0.000597026337676411,
+      "loss": 6.0745,
+      "step": 231
+    },
+    {
+      "epoch": 0.07419696575985928,
+      "grad_norm": 0.3888954520225525,
+      "learning_rate": 0.0005969825203002765,
+      "loss": 5.998,
+      "step": 232
+    },
+    {
+      "epoch": 0.07451678026744488,
+      "grad_norm": 0.3782314360141754,
+      "learning_rate": 0.0005969383840845822,
+      "loss": 6.0144,
+      "step": 233
+    },
+    {
+      "epoch": 0.07483659477503048,
+      "grad_norm": 0.37992793321609497,
+      "learning_rate": 0.0005968939290767123,
+      "loss": 5.9202,
+      "step": 234
+    },
+    {
+      "epoch": 0.07515640928261608,
+      "grad_norm": 0.3844238817691803,
+      "learning_rate": 0.0005968491553243937,
+      "loss": 6.0261,
+      "step": 235
+    },
+    {
+      "epoch": 0.07547622379020168,
+      "grad_norm": 0.31843745708465576,
+      "learning_rate": 0.0005968040628756955,
+      "loss": 5.9868,
+      "step": 236
+    },
+    {
+      "epoch": 0.07579603829778728,
+      "grad_norm": 0.38442689180374146,
+      "learning_rate": 0.0005967586517790285,
+      "loss": 5.873,
+      "step": 237
+    },
+    {
+      "epoch": 0.07611585280537288,
+      "grad_norm": 0.4192902445793152,
+      "learning_rate": 0.0005967129220831461,
+      "loss": 6.0088,
+      "step": 238
+    },
+    {
+      "epoch": 0.07643566731295848,
+      "grad_norm": 0.4624556005001068,
+      "learning_rate": 0.0005966668738371436,
+      "loss": 5.9821,
+      "step": 239
+    },
+    {
+      "epoch": 0.07675548182054408,
+      "grad_norm": 0.46568813920021057,
+      "learning_rate": 0.0005966205070904582,
+      "loss": 5.9741,
+      "step": 240
+    },
+    {
+      "epoch": 0.07707529632812969,
+      "grad_norm": 0.41743966937065125,
+      "learning_rate": 0.0005965738218928693,
+      "loss": 5.8992,
+      "step": 241
+    },
+    {
+      "epoch": 0.07739511083571529,
+      "grad_norm": 0.4026016891002655,
+      "learning_rate": 0.0005965268182944976,
+      "loss": 5.9726,
+      "step": 242
+    },
+    {
+      "epoch": 0.07771492534330089,
+      "grad_norm": 0.3927863538265228,
+      "learning_rate": 0.0005964794963458063,
+      "loss": 6.0312,
+      "step": 243
+    },
+    {
+      "epoch": 0.07803473985088649,
+      "grad_norm": 0.3907330632209778,
+      "learning_rate": 0.0005964318560976001,
+      "loss": 5.9757,
+      "step": 244
+    },
+    {
+      "epoch": 0.07835455435847209,
+      "grad_norm": 0.42175161838531494,
+      "learning_rate": 0.0005963838976010252,
+      "loss": 5.8983,
+      "step": 245
+    },
+    {
+      "epoch": 0.07867436886605769,
+      "grad_norm": 0.44585391879081726,
+      "learning_rate": 0.0005963356209075701,
+      "loss": 5.9396,
+      "step": 246
+    },
+    {
+      "epoch": 0.07899418337364329,
+      "grad_norm": 0.40885382890701294,
+      "learning_rate": 0.0005962870260690641,
+      "loss": 6.0422,
+      "step": 247
+    },
+    {
+      "epoch": 0.07931399788122889,
+      "grad_norm": 0.3636566996574402,
+      "learning_rate": 0.0005962381131376788,
+      "loss": 5.9002,
+      "step": 248
+    },
+    {
+      "epoch": 0.07963381238881449,
+      "grad_norm": 0.3666331171989441,
+      "learning_rate": 0.0005961888821659268,
+      "loss": 5.9966,
+      "step": 249
+    },
+    {
+      "epoch": 0.07995362689640009,
+      "grad_norm": 0.3743114173412323,
+      "learning_rate": 0.0005961393332066623,
+      "loss": 5.9821,
+      "step": 250
+    },
+    {
+      "epoch": 0.08027344140398569,
+      "grad_norm": 0.41628390550613403,
+      "learning_rate": 0.0005960894663130811,
+      "loss": 5.9146,
+      "step": 251
+    },
+    {
+      "epoch": 0.0805932559115713,
+      "grad_norm": 0.5151297450065613,
+      "learning_rate": 0.0005960392815387201,
+      "loss": 5.9578,
+      "step": 252
+    },
+    {
+      "epoch": 0.0809130704191569,
+      "grad_norm": 0.45721518993377686,
+      "learning_rate": 0.0005959887789374573,
+      "loss": 5.9206,
+      "step": 253
+    },
+    {
+      "epoch": 0.0812328849267425,
+      "grad_norm": 0.402630090713501,
+      "learning_rate": 0.0005959379585635124,
+      "loss": 5.9299,
+      "step": 254
+    },
+    {
+      "epoch": 0.0815526994343281,
+      "grad_norm": 0.40605592727661133,
+      "learning_rate": 0.0005958868204714459,
+      "loss": 5.9394,
+      "step": 255
+    },
+    {
+      "epoch": 0.0818725139419137,
+      "grad_norm": 0.35721316933631897,
+      "learning_rate": 0.0005958353647161595,
+      "loss": 5.9359,
+      "step": 256
+    },
+    {
+      "epoch": 0.0821923284494993,
+      "grad_norm": 0.38720184564590454,
+      "learning_rate": 0.0005957835913528959,
+      "loss": 5.8879,
+      "step": 257
+    },
+    {
+      "epoch": 0.0825121429570849,
+      "grad_norm": 0.3456253111362457,
+      "learning_rate": 0.0005957315004372391,
+      "loss": 5.8717,
+      "step": 258
+    },
+    {
+      "epoch": 0.0828319574646705,
+      "grad_norm": 0.3766682744026184,
+      "learning_rate": 0.0005956790920251133,
+      "loss": 5.8761,
+      "step": 259
+    },
+    {
+      "epoch": 0.0831517719722561,
+      "grad_norm": 0.3425726294517517,
+      "learning_rate": 0.0005956263661727844,
+      "loss": 5.9843,
+      "step": 260
+    },
+    {
+      "epoch": 0.0834715864798417,
+      "grad_norm": 0.33851897716522217,
+      "learning_rate": 0.0005955733229368586,
+      "loss": 5.964,
+      "step": 261
+    },
+    {
+      "epoch": 0.0837914009874273,
+      "grad_norm": 0.3460259437561035,
+      "learning_rate": 0.000595519962374283,
+      "loss": 5.92,
+      "step": 262
+    },
+    {
+      "epoch": 0.0841112154950129,
+      "grad_norm": 0.40014368295669556,
+      "learning_rate": 0.0005954662845423452,
+      "loss": 5.8951,
+      "step": 263
+    },
+    {
+      "epoch": 0.0844310300025985,
+      "grad_norm": 0.4758973717689514,
+      "learning_rate": 0.0005954122894986736,
+      "loss": 5.9198,
+      "step": 264
+    },
+    {
+      "epoch": 0.0847508445101841,
+      "grad_norm": 0.34136953949928284,
+      "learning_rate": 0.0005953579773012374,
+      "loss": 5.9156,
+      "step": 265
+    },
+    {
+      "epoch": 0.0850706590177697,
+      "grad_norm": 0.41231444478034973,
+      "learning_rate": 0.0005953033480083456,
+      "loss": 5.8732,
+      "step": 266
+    },
+    {
+      "epoch": 0.0853904735253553,
+      "grad_norm": 0.4707014560699463,
+      "learning_rate": 0.0005952484016786483,
+      "loss": 5.8981,
+      "step": 267
+    },
+    {
+      "epoch": 0.0857102880329409,
+      "grad_norm": 0.375381201505661,
+      "learning_rate": 0.0005951931383711357,
+      "loss": 5.8859,
+      "step": 268
+    },
+    {
+      "epoch": 0.0860301025405265,
+      "grad_norm": 0.40857964754104614,
+      "learning_rate": 0.0005951375581451382,
+      "loss": 5.8945,
+      "step": 269
+    },
+    {
+      "epoch": 0.0863499170481121,
+      "grad_norm": 0.4175202548503876,
+      "learning_rate": 0.0005950816610603266,
+      "loss": 5.9622,
+      "step": 270
+    },
+    {
+      "epoch": 0.0866697315556977,
+      "grad_norm": 0.3857949376106262,
+      "learning_rate": 0.0005950254471767119,
+      "loss": 5.9279,
+      "step": 271
+    },
+    {
+      "epoch": 0.0869895460632833,
+      "grad_norm": 0.3293483853340149,
+      "learning_rate": 0.0005949689165546453,
+      "loss": 5.9038,
+      "step": 272
+    },
+    {
+      "epoch": 0.0873093605708689,
+      "grad_norm": 0.3591293394565582,
+      "learning_rate": 0.0005949120692548177,
+      "loss": 5.8429,
+      "step": 273
+    },
+    {
+      "epoch": 0.0876291750784545,
+      "grad_norm": 0.3662240505218506,
+      "learning_rate": 0.0005948549053382602,
+      "loss": 5.8856,
+      "step": 274
+    },
+    {
+      "epoch": 0.0879489895860401,
+      "grad_norm": 0.3530612289905548,
+      "learning_rate": 0.0005947974248663439,
+      "loss": 5.9053,
+      "step": 275
+    },
+    {
+      "epoch": 0.0882688040936257,
+      "grad_norm": 0.4124448001384735,
+      "learning_rate": 0.0005947396279007796,
+      "loss": 5.8455,
+      "step": 276
+    },
+    {
+      "epoch": 0.0885886186012113,
+      "grad_norm": 0.46663254499435425,
+      "learning_rate": 0.0005946815145036181,
+      "loss": 5.9011,
+      "step": 277
+    },
+    {
+      "epoch": 0.0889084331087969,
+      "grad_norm": 0.48376330733299255,
+      "learning_rate": 0.0005946230847372496,
+      "loss": 5.8198,
+      "step": 278
+    },
+    {
+      "epoch": 0.0892282476163825,
+      "grad_norm": 0.43925702571868896,
+      "learning_rate": 0.0005945643386644041,
+      "loss": 5.8751,
+      "step": 279
+    },
+    {
+      "epoch": 0.0895480621239681,
+      "grad_norm": 0.38675469160079956,
+      "learning_rate": 0.0005945052763481514,
+      "loss": 5.8037,
+      "step": 280
+    },
+    {
+      "epoch": 0.0898678766315537,
+      "grad_norm": 0.3925098776817322,
+      "learning_rate": 0.0005944458978519006,
+      "loss": 5.8462,
+      "step": 281
+    },
+    {
+      "epoch": 0.0901876911391393,
+      "grad_norm": 0.4551983177661896,
+      "learning_rate": 0.0005943862032394,
+      "loss": 5.9222,
+      "step": 282
+    },
+    {
+      "epoch": 0.0905075056467249,
+      "grad_norm": 0.3979947566986084,
+      "learning_rate": 0.000594326192574738,
+      "loss": 5.7826,
+      "step": 283
+    },
+    {
+      "epoch": 0.0908273201543105,
+      "grad_norm": 0.36182329058647156,
+      "learning_rate": 0.0005942658659223415,
+      "loss": 5.781,
+      "step": 284
+    },
+    {
+      "epoch": 0.0911471346618961,
+      "grad_norm": 0.34769323468208313,
+      "learning_rate": 0.0005942052233469771,
+      "loss": 5.8048,
+      "step": 285
+    },
+    {
+      "epoch": 0.0914669491694817,
+      "grad_norm": 0.3327469825744629,
+      "learning_rate": 0.0005941442649137507,
+      "loss": 5.8698,
+      "step": 286
+    },
+    {
+      "epoch": 0.0917867636770673,
+      "grad_norm": 0.3392864465713501,
+      "learning_rate": 0.0005940829906881066,
+      "loss": 5.7951,
+      "step": 287
+    },
+    {
+      "epoch": 0.0921065781846529,
+      "grad_norm": 0.3782911002635956,
+      "learning_rate": 0.0005940214007358293,
+      "loss": 5.8339,
+      "step": 288
+    },
+    {
+      "epoch": 0.0924263926922385,
+      "grad_norm": 0.36674779653549194,
+      "learning_rate": 0.0005939594951230412,
+      "loss": 5.7968,
+      "step": 289
+    },
+    {
+      "epoch": 0.0927462071998241,
+      "grad_norm": 0.3638828694820404,
+      "learning_rate": 0.0005938972739162041,
+      "loss": 5.8608,
+      "step": 290
+    },
+    {
+      "epoch": 0.0930660217074097,
+      "grad_norm": 0.367432177066803,
+      "learning_rate": 0.0005938347371821183,
+      "loss": 5.8873,
+      "step": 291
+    },
+    {
+      "epoch": 0.0933858362149953,
+      "grad_norm": 0.3655487895011902,
+      "learning_rate": 0.0005937718849879232,
+      "loss": 5.8549,
+      "step": 292
+    },
+    {
+      "epoch": 0.0937056507225809,
+      "grad_norm": 0.36033663153648376,
+      "learning_rate": 0.0005937087174010968,
+      "loss": 5.86,
+      "step": 293
+    },
+    {
+      "epoch": 0.0940254652301665,
+      "grad_norm": 0.36835548281669617,
+      "learning_rate": 0.0005936452344894556,
+      "loss": 5.7836,
+      "step": 294
+    },
+    {
+      "epoch": 0.0943452797377521,
+      "grad_norm": 0.4200371205806732,
+      "learning_rate": 0.0005935814363211546,
+      "loss": 5.7613,
+      "step": 295
+    },
+    {
+      "epoch": 0.0946650942453377,
+      "grad_norm": 0.6512305736541748,
+      "learning_rate": 0.0005935173229646873,
+      "loss": 5.7951,
+      "step": 296
+    },
+    {
+      "epoch": 0.0949849087529233,
+      "grad_norm": 0.37604695558547974,
+      "learning_rate": 0.0005934528944888857,
+      "loss": 5.7957,
+      "step": 297
+    },
+    {
+      "epoch": 0.0953047232605089,
+      "grad_norm": 0.38934704661369324,
+      "learning_rate": 0.0005933881509629201,
+      "loss": 5.8117,
+      "step": 298
+    },
+    {
+      "epoch": 0.0956245377680945,
+      "grad_norm": 0.3900265097618103,
+      "learning_rate": 0.0005933230924562987,
+      "loss": 5.8096,
+      "step": 299
+    },
+    {
+      "epoch": 0.0959443522756801,
+      "grad_norm": 0.41541120409965515,
+      "learning_rate": 0.0005932577190388684,
+      "loss": 5.8756,
+      "step": 300
+    },
+    {
+      "epoch": 0.0959443522756801,
+      "eval_loss": 5.780359268188477,
+      "eval_runtime": 79.6584,
+      "eval_samples_per_second": 23.814,
+      "eval_steps_per_second": 5.963,
+      "step": 300
+    },
+    {
+      "epoch": 0.0962641667832657,
+      "grad_norm": 0.4308043122291565,
+      "learning_rate": 0.0005931920307808138,
+      "loss": 5.8541,
+      "step": 301
+    },
+    {
+      "epoch": 0.0965839812908513,
+      "grad_norm": 0.4143083691596985,
+      "learning_rate": 0.0005931260277526574,
+      "loss": 5.7856,
+      "step": 302
+    },
+    {
+      "epoch": 0.0969037957984369,
+      "grad_norm": 0.5045465230941772,
+      "learning_rate": 0.0005930597100252602,
+      "loss": 5.7552,
+      "step": 303
+    },
+    {
+      "epoch": 0.0972236103060225,
+      "grad_norm": 0.6526516079902649,
+      "learning_rate": 0.0005929930776698205,
+      "loss": 5.8308,
+      "step": 304
+    },
+    {
+      "epoch": 0.0975434248136081,
+      "grad_norm": 0.5426544547080994,
+      "learning_rate": 0.0005929261307578747,
+      "loss": 5.813,
+      "step": 305
+    },
+    {
+      "epoch": 0.0978632393211937,
+      "grad_norm": 0.5369417071342468,
+      "learning_rate": 0.0005928588693612969,
+      "loss": 5.7372,
+      "step": 306
+    },
+    {
+      "epoch": 0.0981830538287793,
+      "grad_norm": 0.44663500785827637,
+      "learning_rate": 0.0005927912935522985,
+      "loss": 5.7072,
+      "step": 307
+    },
+    {
+      "epoch": 0.0985028683363649,
+      "grad_norm": 0.4832947850227356,
+      "learning_rate": 0.0005927234034034289,
+      "loss": 5.8266,
+      "step": 308
+    },
+    {
+      "epoch": 0.09882268284395052,
+      "grad_norm": 0.40185096859931946,
+      "learning_rate": 0.0005926551989875746,
+      "loss": 5.7626,
+      "step": 309
+    },
+    {
+      "epoch": 0.09914249735153612,
+      "grad_norm": 0.4085821807384491,
+      "learning_rate": 0.0005925866803779598,
+      "loss": 5.7838,
+      "step": 310
+    },
+    {
+      "epoch": 0.09946231185912172,
+      "grad_norm": 0.3931988477706909,
+      "learning_rate": 0.0005925178476481458,
+      "loss": 5.7797,
+      "step": 311
+    },
+    {
+      "epoch": 0.09978212636670732,
+      "grad_norm": 0.3664226830005646,
+      "learning_rate": 0.0005924487008720313,
+      "loss": 5.8671,
+      "step": 312
+    },
+    {
+      "epoch": 0.10010194087429292,
+      "grad_norm": 0.3390718698501587,
+      "learning_rate": 0.0005923792401238519,
+      "loss": 5.7122,
+      "step": 313
+    },
+    {
+      "epoch": 0.10042175538187852,
+      "grad_norm": 0.3831624984741211,
+      "learning_rate": 0.0005923094654781805,
+      "loss": 5.7979,
+      "step": 314
+    },
+    {
+      "epoch": 0.10074156988946412,
+      "grad_norm": 0.31763604283332825,
+      "learning_rate": 0.0005922393770099271,
+      "loss": 5.7992,
+      "step": 315
+    },
+    {
+      "epoch": 0.10106138439704972,
+      "grad_norm": 0.4179720878601074,
+      "learning_rate": 0.0005921689747943384,
+      "loss": 5.7227,
+      "step": 316
+    },
+    {
+      "epoch": 0.10138119890463532,
+      "grad_norm": 0.3841486871242523,
+      "learning_rate": 0.0005920982589069979,
+      "loss": 5.6847,
+      "step": 317
+    },
+    {
+      "epoch": 0.10170101341222092,
+      "grad_norm": 0.3815637230873108,
+      "learning_rate": 0.0005920272294238261,
+      "loss": 5.6553,
+      "step": 318
+    },
+    {
+      "epoch": 0.10202082791980652,
+      "grad_norm": 0.39948034286499023,
+      "learning_rate": 0.0005919558864210801,
+      "loss": 5.7416,
+      "step": 319
+    },
+    {
+      "epoch": 0.10234064242739212,
+      "grad_norm": 0.38976314663887024,
+      "learning_rate": 0.0005918842299753534,
+      "loss": 5.6859,
+      "step": 320
+    },
+    {
+      "epoch": 0.10266045693497772,
+      "grad_norm": 0.3874850273132324,
+      "learning_rate": 0.0005918122601635763,
+      "loss": 5.6806,
+      "step": 321
+    },
+    {
+      "epoch": 0.10298027144256332,
+      "grad_norm": 0.3757673501968384,
+      "learning_rate": 0.0005917399770630151,
+      "loss": 5.7353,
+      "step": 322
+    },
+    {
+      "epoch": 0.10330008595014892,
+      "grad_norm": 0.36281487345695496,
+      "learning_rate": 0.000591667380751273,
+      "loss": 5.7287,
+      "step": 323
+    },
+    {
+      "epoch": 0.10361990045773452,
+      "grad_norm": 0.353299617767334,
+      "learning_rate": 0.0005915944713062891,
+      "loss": 5.7731,
+      "step": 324
+    },
+    {
+      "epoch": 0.10393971496532012,
+      "grad_norm": 0.40751269459724426,
+      "learning_rate": 0.0005915212488063387,
+      "loss": 5.6843,
+      "step": 325
+    },
+    {
+      "epoch": 0.10425952947290572,
+      "grad_norm": 0.35061565041542053,
+      "learning_rate": 0.0005914477133300333,
+      "loss": 5.7085,
+      "step": 326
+    },
+    {
+      "epoch": 0.10457934398049132,
+      "grad_norm": 0.42064836621284485,
+      "learning_rate": 0.0005913738649563205,
+      "loss": 5.6999,
+      "step": 327
+    },
+    {
+      "epoch": 0.10489915848807692,
+      "grad_norm": 0.39509114623069763,
+      "learning_rate": 0.0005912997037644834,
+      "loss": 5.7637,
+      "step": 328
+    },
+    {
+      "epoch": 0.10521897299566252,
+      "grad_norm": 0.39371103048324585,
+      "learning_rate": 0.0005912252298341416,
+      "loss": 5.7397,
+      "step": 329
+    },
+    {
+      "epoch": 0.10553878750324812,
+      "grad_norm": 0.39554670453071594,
+      "learning_rate": 0.0005911504432452498,
+      "loss": 5.7249,
+      "step": 330
+    },
+    {
+      "epoch": 0.10585860201083372,
+      "grad_norm": 0.4175823926925659,
+      "learning_rate": 0.0005910753440780988,
+      "loss": 5.6705,
+      "step": 331
+    },
+    {
+      "epoch": 0.10617841651841932,
+      "grad_norm": 0.4640887677669525,
+      "learning_rate": 0.0005909999324133148,
+      "loss": 5.6927,
+      "step": 332
+    },
+    {
+      "epoch": 0.10649823102600492,
+      "grad_norm": 0.39599356055259705,
+      "learning_rate": 0.0005909242083318596,
+      "loss": 5.7207,
+      "step": 333
+    },
+    {
+      "epoch": 0.10681804553359052,
+      "grad_norm": 0.38180121779441833,
+      "learning_rate": 0.0005908481719150303,
+      "loss": 5.7066,
+      "step": 334
+    },
+    {
+      "epoch": 0.10713786004117612,
+      "grad_norm": 0.4411526918411255,
+      "learning_rate": 0.0005907718232444594,
+      "loss": 5.651,
+      "step": 335
+    },
+    {
+      "epoch": 0.10745767454876172,
+      "grad_norm": 0.3934768736362457,
+      "learning_rate": 0.0005906951624021147,
+      "loss": 5.6699,
+      "step": 336
+    },
+    {
+      "epoch": 0.10777748905634732,
+      "grad_norm": 0.366379976272583,
+      "learning_rate": 0.0005906181894702987,
+      "loss": 5.6752,
+      "step": 337
+    },
+    {
+      "epoch": 0.10809730356393292,
+      "grad_norm": 0.45642510056495667,
+      "learning_rate": 0.0005905409045316497,
+      "loss": 5.6655,
+      "step": 338
+    },
+    {
+      "epoch": 0.10841711807151852,
+      "grad_norm": 0.490781307220459,
+      "learning_rate": 0.0005904633076691404,
+      "loss": 5.7221,
+      "step": 339
+    },
+    {
+      "epoch": 0.10873693257910412,
+      "grad_norm": 0.47324830293655396,
+      "learning_rate": 0.0005903853989660787,
+      "loss": 5.7254,
+      "step": 340
+    },
+    {
+      "epoch": 0.10905674708668972,
+      "grad_norm": 0.38551396131515503,
+      "learning_rate": 0.0005903071785061069,
+      "loss": 5.6606,
+      "step": 341
+    },
+    {
+      "epoch": 0.10937656159427532,
+      "grad_norm": 0.4228317439556122,
+      "learning_rate": 0.0005902286463732026,
+      "loss": 5.7021,
+      "step": 342
+    },
+    {
+      "epoch": 0.10969637610186092,
+      "grad_norm": 0.4366403818130493,
+      "learning_rate": 0.0005901498026516774,
+      "loss": 5.6812,
+      "step": 343
+    },
+    {
+      "epoch": 0.11001619060944652,
+      "grad_norm": 0.43695297837257385,
+      "learning_rate": 0.0005900706474261778,
+      "loss": 5.7154,
+      "step": 344
+    },
+    {
+      "epoch": 0.11033600511703212,
+      "grad_norm": 0.43707820773124695,
+      "learning_rate": 0.0005899911807816844,
+      "loss": 5.6471,
+      "step": 345
+    },
+    {
+      "epoch": 0.11065581962461772,
+      "grad_norm": 0.505287766456604,
+      "learning_rate": 0.0005899114028035128,
+      "loss": 5.7538,
+      "step": 346
+    },
+    {
+      "epoch": 0.11097563413220332,
+      "grad_norm": 0.4542320668697357,
+      "learning_rate": 0.0005898313135773121,
+      "loss": 5.6872,
+      "step": 347
+    },
+    {
+      "epoch": 0.11129544863978892,
+      "grad_norm": 0.3504129648208618,
+      "learning_rate": 0.0005897509131890658,
+      "loss": 5.6519,
+      "step": 348
+    },
+    {
+      "epoch": 0.11161526314737452,
+      "grad_norm": 0.4545753598213196,
+      "learning_rate": 0.0005896702017250916,
+      "loss": 5.6922,
+      "step": 349
+    },
+    {
+      "epoch": 0.11193507765496012,
+      "grad_norm": 0.36486905813217163,
+      "learning_rate": 0.0005895891792720413,
+      "loss": 5.6931,
+      "step": 350
+    },
+    {
+      "epoch": 0.11225489216254572,
+      "grad_norm": 0.41182687878608704,
+      "learning_rate": 0.0005895078459169,
+      "loss": 5.7055,
+      "step": 351
+    },
+    {
+      "epoch": 0.11257470667013132,
+      "grad_norm": 0.40154850482940674,
+      "learning_rate": 0.0005894262017469872,
+      "loss": 5.6622,
+      "step": 352
+    },
+    {
+      "epoch": 0.11289452117771692,
+      "grad_norm": 0.40887096524238586,
+      "learning_rate": 0.0005893442468499557,
+      "loss": 5.5879,
+      "step": 353
+    },
+    {
+      "epoch": 0.11321433568530252,
+      "grad_norm": 0.4395909607410431,
+      "learning_rate": 0.0005892619813137923,
+      "loss": 5.6652,
+      "step": 354
+    },
+    {
+      "epoch": 0.11353415019288812,
+      "grad_norm": 0.4587445557117462,
+      "learning_rate": 0.0005891794052268167,
+      "loss": 5.5781,
+      "step": 355
+    },
+    {
+      "epoch": 0.11385396470047372,
+      "grad_norm": 0.5725453495979309,
+      "learning_rate": 0.0005890965186776825,
+      "loss": 5.6664,
+      "step": 356
+    },
+    {
+      "epoch": 0.11417377920805932,
+      "grad_norm": 0.45469459891319275,
+      "learning_rate": 0.0005890133217553765,
+      "loss": 5.6017,
+      "step": 357
+    },
+    {
+      "epoch": 0.11449359371564492,
+      "grad_norm": 0.39412954449653625,
+      "learning_rate": 0.0005889298145492185,
+      "loss": 5.6518,
+      "step": 358
+    },
+    {
+      "epoch": 0.11481340822323052,
+      "grad_norm": 0.3986184000968933,
+      "learning_rate": 0.0005888459971488618,
+      "loss": 5.6875,
+      "step": 359
+    },
+    {
+      "epoch": 0.11513322273081612,
+      "grad_norm": 0.4425166845321655,
+      "learning_rate": 0.0005887618696442925,
+      "loss": 5.7008,
+      "step": 360
+    },
+    {
+      "epoch": 0.11545303723840172,
+      "grad_norm": 0.4173751473426819,
+      "learning_rate": 0.0005886774321258294,
+      "loss": 5.5739,
+      "step": 361
+    },
+    {
+      "epoch": 0.11577285174598732,
+      "grad_norm": 0.44372227787971497,
+      "learning_rate": 0.0005885926846841246,
+      "loss": 5.6711,
+      "step": 362
+    },
+    {
+      "epoch": 0.11609266625357292,
+      "grad_norm": 0.4772733449935913,
+      "learning_rate": 0.0005885076274101627,
+      "loss": 5.6079,
+      "step": 363
+    },
+    {
+      "epoch": 0.11641248076115852,
+      "grad_norm": 0.4153997004032135,
+      "learning_rate": 0.0005884222603952608,
+      "loss": 5.6619,
+      "step": 364
+    },
+    {
+      "epoch": 0.11673229526874412,
+      "grad_norm": 0.37677884101867676,
+      "learning_rate": 0.0005883365837310689,
+      "loss": 5.5207,
+      "step": 365
+    },
+    {
+      "epoch": 0.11705210977632972,
+      "grad_norm": 0.42494794726371765,
+      "learning_rate": 0.0005882505975095689,
+      "loss": 5.5696,
+      "step": 366
+    },
+    {
+      "epoch": 0.11737192428391532,
+      "grad_norm": 0.6638725399971008,
+      "learning_rate": 0.0005881643018230755,
+      "loss": 5.659,
+      "step": 367
+    },
+    {
+      "epoch": 0.11769173879150092,
+      "grad_norm": 0.47131872177124023,
+      "learning_rate": 0.0005880776967642355,
+      "loss": 5.5253,
+      "step": 368
+    },
+    {
+      "epoch": 0.11801155329908652,
+      "grad_norm": 0.5139328241348267,
+      "learning_rate": 0.0005879907824260281,
+      "loss": 5.5355,
+      "step": 369
+    },
+    {
+      "epoch": 0.11833136780667212,
+      "grad_norm": 0.3897528350353241,
+      "learning_rate": 0.0005879035589017638,
+      "loss": 5.6067,
+      "step": 370
+    },
+    {
+      "epoch": 0.11865118231425772,
+      "grad_norm": 0.4552856385707855,
+      "learning_rate": 0.0005878160262850859,
+      "loss": 5.6653,
+      "step": 371
+    },
+    {
+      "epoch": 0.11897099682184333,
+      "grad_norm": 0.457346111536026,
+      "learning_rate": 0.0005877281846699689,
+      "loss": 5.5748,
+      "step": 372
+    },
+    {
+      "epoch": 0.11929081132942893,
+      "grad_norm": 0.40474575757980347,
+      "learning_rate": 0.0005876400341507194,
+      "loss": 5.5896,
+      "step": 373
+    },
+    {
+      "epoch": 0.11961062583701453,
+      "grad_norm": 0.4695594012737274,
+      "learning_rate": 0.0005875515748219757,
+      "loss": 5.6558,
+      "step": 374
+    },
+    {
+      "epoch": 0.11993044034460013,
+      "grad_norm": 0.41759058833122253,
+      "learning_rate": 0.0005874628067787072,
+      "loss": 5.6042,
+      "step": 375
+    },
+    {
+      "epoch": 0.12025025485218573,
+      "grad_norm": 0.47090569138526917,
+      "learning_rate": 0.0005873737301162151,
+      "loss": 5.5964,
+      "step": 376
+    },
+    {
+      "epoch": 0.12057006935977133,
+      "grad_norm": 0.4760238826274872,
+      "learning_rate": 0.000587284344930132,
+      "loss": 5.5101,
+      "step": 377
+    },
+    {
+      "epoch": 0.12088988386735694,
+      "grad_norm": 0.3871617615222931,
+      "learning_rate": 0.0005871946513164213,
+      "loss": 5.5711,
+      "step": 378
+    },
+    {
+      "epoch": 0.12120969837494254,
+      "grad_norm": 0.43261823058128357,
+      "learning_rate": 0.000587104649371378,
+      "loss": 5.5596,
+      "step": 379
+    },
+    {
+      "epoch": 0.12152951288252814,
+      "grad_norm": 0.3882465362548828,
+      "learning_rate": 0.000587014339191628,
+      "loss": 5.5447,
+      "step": 380
+    },
+    {
+      "epoch": 0.12184932739011374,
+      "grad_norm": 0.35880735516548157,
+      "learning_rate": 0.0005869237208741278,
+      "loss": 5.5992,
+      "step": 381
+    },
+    {
+      "epoch": 0.12216914189769934,
+      "grad_norm": 0.3885442614555359,
+      "learning_rate": 0.0005868327945161651,
+      "loss": 5.6215,
+      "step": 382
+    },
+    {
+      "epoch": 0.12248895640528494,
+      "grad_norm": 0.4411607086658478,
+      "learning_rate": 0.0005867415602153582,
+      "loss": 5.6445,
+      "step": 383
+    },
+    {
+      "epoch": 0.12280877091287054,
+      "grad_norm": 0.37853386998176575,
+      "learning_rate": 0.0005866500180696558,
+      "loss": 5.5588,
+      "step": 384
+    },
+    {
+      "epoch": 0.12312858542045614,
+      "grad_norm": 0.36723509430885315,
+      "learning_rate": 0.0005865581681773374,
+      "loss": 5.516,
+      "step": 385
+    },
+    {
+      "epoch": 0.12344839992804174,
+      "grad_norm": 0.37725216150283813,
+      "learning_rate": 0.000586466010637013,
+      "loss": 5.5922,
+      "step": 386
+    },
+    {
+      "epoch": 0.12376821443562734,
+      "grad_norm": 0.36253562569618225,
+      "learning_rate": 0.0005863735455476222,
+      "loss": 5.5197,
+      "step": 387
+    },
+    {
+      "epoch": 0.12408802894321294,
+      "grad_norm": 0.3463018834590912,
+      "learning_rate": 0.0005862807730084356,
+      "loss": 5.6061,
+      "step": 388
+    },
+    {
+      "epoch": 0.12440784345079854,
+      "grad_norm": 0.34062060713768005,
+      "learning_rate": 0.0005861876931190534,
+      "loss": 5.6057,
+      "step": 389
+    },
+    {
+      "epoch": 0.12472765795838414,
+      "grad_norm": 0.39427444338798523,
+      "learning_rate": 0.0005860943059794059,
+      "loss": 5.5445,
+      "step": 390
+    },
+    {
+      "epoch": 0.12504747246596973,
+      "grad_norm": 0.37826618552207947,
+      "learning_rate": 0.0005860006116897533,
+      "loss": 5.4999,
+      "step": 391
+    },
+    {
+      "epoch": 0.12536728697355534,
+      "grad_norm": 0.37145209312438965,
+      "learning_rate": 0.0005859066103506853,
+      "loss": 5.4957,
+      "step": 392
+    },
+    {
+      "epoch": 0.12568710148114093,
+      "grad_norm": 0.3861520290374756,
+      "learning_rate": 0.0005858123020631218,
+      "loss": 5.5209,
+      "step": 393
+    },
+    {
+      "epoch": 0.12600691598872654,
+      "grad_norm": 0.447878360748291,
+      "learning_rate": 0.0005857176869283118,
+      "loss": 5.5265,
+      "step": 394
+    },
+    {
+      "epoch": 0.12632673049631213,
+      "grad_norm": 0.37489327788352966,
+      "learning_rate": 0.0005856227650478335,
+      "loss": 5.5166,
+      "step": 395
+    },
+    {
+      "epoch": 0.12664654500389774,
+      "grad_norm": 0.6625049710273743,
+      "learning_rate": 0.0005855275365235953,
+      "loss": 5.5915,
+      "step": 396
+    },
+    {
+      "epoch": 0.12696635951148333,
+      "grad_norm": 0.44229626655578613,
+      "learning_rate": 0.0005854320014578338,
+      "loss": 5.5095,
+      "step": 397
+    },
+    {
+      "epoch": 0.12728617401906894,
+      "grad_norm": 0.4284876585006714,
+      "learning_rate": 0.0005853361599531155,
+      "loss": 5.4948,
+      "step": 398
+    },
+    {
+      "epoch": 0.12760598852665453,
+      "grad_norm": 0.3634096086025238,
+      "learning_rate": 0.0005852400121123353,
+      "loss": 5.5658,
+      "step": 399
+    },
+    {
+      "epoch": 0.12792580303424014,
+      "grad_norm": 0.43918880820274353,
+      "learning_rate": 0.0005851435580387175,
+      "loss": 5.5348,
+      "step": 400
+    },
+    {
+      "epoch": 0.12792580303424014,
+      "eval_loss": 5.526098251342773,
+      "eval_runtime": 82.1635,
+      "eval_samples_per_second": 23.088,
+      "eval_steps_per_second": 5.781,
+      "step": 400
+    },
+    {
+      "epoch": 0.12824561754182573,
+      "grad_norm": 0.3572410047054291,
+      "learning_rate": 0.0005850467978358146,
+      "loss": 5.5987,
+      "step": 401
+    },
+    {
+      "epoch": 0.12856543204941134,
+      "grad_norm": 0.3999462127685547,
+      "learning_rate": 0.0005849497316075084,
+      "loss": 5.4837,
+      "step": 402
+    },
+    {
+      "epoch": 0.12888524655699693,
+      "grad_norm": 0.4488193988800049,
+      "learning_rate": 0.0005848523594580086,
+      "loss": 5.5807,
+      "step": 403
+    },
+    {
+      "epoch": 0.12920506106458254,
+      "grad_norm": 0.3959190845489502,
+      "learning_rate": 0.0005847546814918538,
+      "loss": 5.5433,
+      "step": 404
+    },
+    {
+      "epoch": 0.12952487557216813,
+      "grad_norm": 0.4110461175441742,
+      "learning_rate": 0.0005846566978139108,
+      "loss": 5.4933,
+      "step": 405
+    },
+    {
+      "epoch": 0.12984469007975374,
+      "grad_norm": 0.4625667333602905,
+      "learning_rate": 0.0005845584085293745,
+      "loss": 5.4707,
+      "step": 406
+    },
+    {
+      "epoch": 0.13016450458733933,
+      "grad_norm": 0.3771616816520691,
+      "learning_rate": 0.0005844598137437682,
+      "loss": 5.4576,
+      "step": 407
+    },
+    {
+      "epoch": 0.13048431909492494,
+      "grad_norm": 0.42803341150283813,
+      "learning_rate": 0.0005843609135629427,
+      "loss": 5.5858,
+      "step": 408
+    },
+    {
+      "epoch": 0.13080413360251053,
+      "grad_norm": 0.4550051987171173,
+      "learning_rate": 0.0005842617080930771,
+      "loss": 5.549,
+      "step": 409
+    },
+    {
+      "epoch": 0.13112394811009614,
+      "grad_norm": 0.41399329900741577,
+      "learning_rate": 0.000584162197440678,
+      "loss": 5.6118,
+      "step": 410
+    },
+    {
+      "epoch": 0.13144376261768173,
+      "grad_norm": 0.41662803292274475,
+      "learning_rate": 0.0005840623817125799,
+      "loss": 5.4915,
+      "step": 411
+    },
+    {
+      "epoch": 0.13176357712526734,
+      "grad_norm": 0.4127683639526367,
+      "learning_rate": 0.0005839622610159446,
+      "loss": 5.5255,
+      "step": 412
+    },
+    {
+      "epoch": 0.13208339163285296,
+      "grad_norm": 0.4265100955963135,
+      "learning_rate": 0.0005838618354582612,
+      "loss": 5.4756,
+      "step": 413
+    },
+    {
+      "epoch": 0.13240320614043855,
+      "grad_norm": 0.38000795245170593,
+      "learning_rate": 0.0005837611051473466,
+      "loss": 5.4627,
+      "step": 414
+    },
+    {
+      "epoch": 0.13272302064802416,
+      "grad_norm": 0.43064582347869873,
+      "learning_rate": 0.0005836600701913443,
+      "loss": 5.4952,
+      "step": 415
+    },
+    {
+      "epoch": 0.13304283515560975,
+      "grad_norm": 0.38073885440826416,
+      "learning_rate": 0.0005835587306987255,
+      "loss": 5.5138,
+      "step": 416
+    },
+    {
+      "epoch": 0.13336264966319536,
+      "grad_norm": 0.37120160460472107,
+      "learning_rate": 0.0005834570867782875,
+      "loss": 5.5417,
+      "step": 417
+    },
+    {
+      "epoch": 0.13368246417078095,
+      "grad_norm": 0.3759710490703583,
+      "learning_rate": 0.0005833551385391551,
+      "loss": 5.5581,
+      "step": 418
+    },
+    {
+      "epoch": 0.13400227867836656,
+      "grad_norm": 0.4189684987068176,
+      "learning_rate": 0.0005832528860907798,
+      "loss": 5.4671,
+      "step": 419
+    },
+    {
+      "epoch": 0.13432209318595215,
+      "grad_norm": 0.4628429412841797,
+      "learning_rate": 0.0005831503295429393,
+      "loss": 5.4805,
+      "step": 420
+    },
+    {
+      "epoch": 0.13464190769353776,
+      "grad_norm": 0.4366797208786011,
+      "learning_rate": 0.0005830474690057383,
+      "loss": 5.5822,
+      "step": 421
+    },
+    {
+      "epoch": 0.13496172220112335,
+      "grad_norm": 0.3592755198478699,
+      "learning_rate": 0.0005829443045896072,
+      "loss": 5.5895,
+      "step": 422
+    },
+    {
+      "epoch": 0.13528153670870896,
+      "grad_norm": 0.4137701392173767,
+      "learning_rate": 0.0005828408364053031,
+      "loss": 5.4889,
+      "step": 423
+    },
+    {
+      "epoch": 0.13560135121629455,
+      "grad_norm": 0.3506132960319519,
+      "learning_rate": 0.0005827370645639095,
+      "loss": 5.5088,
+      "step": 424
+    },
+    {
+      "epoch": 0.13592116572388016,
+      "grad_norm": 0.34037062525749207,
+      "learning_rate": 0.0005826329891768351,
+      "loss": 5.4807,
+      "step": 425
+    },
+    {
+      "epoch": 0.13624098023146575,
+      "grad_norm": 0.5105721950531006,
+      "learning_rate": 0.0005825286103558151,
+      "loss": 5.4829,
+      "step": 426
+    },
+    {
+      "epoch": 0.13656079473905136,
+      "grad_norm": 0.39539438486099243,
+      "learning_rate": 0.0005824239282129103,
+      "loss": 5.5657,
+      "step": 427
+    },
+    {
+      "epoch": 0.13688060924663695,
+      "grad_norm": 0.45470479130744934,
+      "learning_rate": 0.0005823189428605072,
+      "loss": 5.5231,
+      "step": 428
+    },
+    {
+      "epoch": 0.13720042375422256,
+      "grad_norm": 0.42423999309539795,
+      "learning_rate": 0.0005822136544113177,
+      "loss": 5.3981,
+      "step": 429
+    },
+    {
+      "epoch": 0.13752023826180815,
+      "grad_norm": 0.34813690185546875,
+      "learning_rate": 0.000582108062978379,
+      "loss": 5.454,
+      "step": 430
+    },
+    {
+      "epoch": 0.13784005276939376,
+      "grad_norm": 0.36712193489074707,
+      "learning_rate": 0.0005820021686750542,
+      "loss": 5.4303,
+      "step": 431
+    },
+    {
+      "epoch": 0.13815986727697935,
+      "grad_norm": 0.3907421827316284,
+      "learning_rate": 0.0005818959716150306,
+      "loss": 5.4179,
+      "step": 432
+    },
+    {
+      "epoch": 0.13847968178456496,
+      "grad_norm": 0.38730525970458984,
+      "learning_rate": 0.0005817894719123214,
+      "loss": 5.4916,
+      "step": 433
+    },
+    {
+      "epoch": 0.13879949629215055,
+      "grad_norm": 0.4636697471141815,
+      "learning_rate": 0.0005816826696812643,
+      "loss": 5.4353,
+      "step": 434
+    },
+    {
+      "epoch": 0.13911931079973616,
+      "grad_norm": 0.4625189006328583,
+      "learning_rate": 0.0005815755650365217,
+      "loss": 5.4693,
+      "step": 435
+    },
+    {
+      "epoch": 0.13943912530732175,
+      "grad_norm": 0.39926567673683167,
+      "learning_rate": 0.000581468158093081,
+      "loss": 5.4779,
+      "step": 436
+    },
+    {
+      "epoch": 0.13975893981490736,
+      "grad_norm": 0.4575376510620117,
+      "learning_rate": 0.0005813604489662539,
+      "loss": 5.5438,
+      "step": 437
+    },
+    {
+      "epoch": 0.14007875432249295,
+      "grad_norm": 0.36058712005615234,
+      "learning_rate": 0.0005812524377716766,
+      "loss": 5.5096,
+      "step": 438
+    },
+    {
+      "epoch": 0.14039856883007856,
+      "grad_norm": 0.4126695990562439,
+      "learning_rate": 0.0005811441246253098,
+      "loss": 5.4493,
+      "step": 439
+    },
+    {
+      "epoch": 0.14071838333766415,
+      "grad_norm": 0.41481906175613403,
+      "learning_rate": 0.0005810355096434378,
+      "loss": 5.4734,
+      "step": 440
+    },
+    {
+      "epoch": 0.14103819784524976,
+      "grad_norm": 0.4174092710018158,
+      "learning_rate": 0.0005809265929426696,
+      "loss": 5.4339,
+      "step": 441
+    },
+    {
+      "epoch": 0.14135801235283535,
+      "grad_norm": 0.4385882318019867,
+      "learning_rate": 0.0005808173746399377,
+      "loss": 5.4296,
+      "step": 442
+    },
+    {
+      "epoch": 0.14167782686042096,
+      "grad_norm": 0.40480849146842957,
+      "learning_rate": 0.0005807078548524988,
+      "loss": 5.3593,
+      "step": 443
+    },
+    {
+      "epoch": 0.14199764136800655,
+      "grad_norm": 0.40930604934692383,
+      "learning_rate": 0.0005805980336979327,
+      "loss": 5.5535,
+      "step": 444
+    },
+    {
+      "epoch": 0.14231745587559216,
+      "grad_norm": 0.4092625677585602,
+      "learning_rate": 0.0005804879112941433,
+      "loss": 5.3996,
+      "step": 445
+    },
+    {
+      "epoch": 0.14263727038317775,
+      "grad_norm": 0.3877846300601959,
+      "learning_rate": 0.0005803774877593575,
+      "loss": 5.4833,
+      "step": 446
+    },
+    {
+      "epoch": 0.14295708489076336,
+      "grad_norm": 0.43150779604911804,
+      "learning_rate": 0.000580266763212126,
+      "loss": 5.4727,
+      "step": 447
+    },
+    {
+      "epoch": 0.14327689939834895,
+      "grad_norm": 0.3717440366744995,
+      "learning_rate": 0.0005801557377713218,
+      "loss": 5.4995,
+      "step": 448
+    },
+    {
+      "epoch": 0.14359671390593456,
+      "grad_norm": 0.39955979585647583,
+      "learning_rate": 0.0005800444115561422,
+      "loss": 5.511,
+      "step": 449
+    },
+    {
+      "epoch": 0.14391652841352015,
+      "grad_norm": 0.36227232217788696,
+      "learning_rate": 0.000579932784686106,
+      "loss": 5.4445,
+      "step": 450
+    },
+    {
+      "epoch": 0.14423634292110576,
+      "grad_norm": 0.36275947093963623,
+      "learning_rate": 0.000579820857281056,
+      "loss": 5.4402,
+      "step": 451
+    },
+    {
+      "epoch": 0.14455615742869135,
+      "grad_norm": 0.38074591755867004,
+      "learning_rate": 0.0005797086294611569,
+      "loss": 5.4352,
+      "step": 452
+    },
+    {
+      "epoch": 0.14487597193627696,
+      "grad_norm": 0.37614187598228455,
+      "learning_rate": 0.0005795961013468961,
+      "loss": 5.4581,
+      "step": 453
+    },
+    {
+      "epoch": 0.14519578644386255,
+      "grad_norm": 0.35029304027557373,
+      "learning_rate": 0.0005794832730590836,
+      "loss": 5.4321,
+      "step": 454
+    },
+    {
+      "epoch": 0.14551560095144817,
+      "grad_norm": 0.3891676366329193,
+      "learning_rate": 0.0005793701447188514,
+      "loss": 5.3738,
+      "step": 455
+    },
+    {
+      "epoch": 0.14583541545903375,
+      "grad_norm": 0.41309744119644165,
+      "learning_rate": 0.0005792567164476539,
+      "loss": 5.4222,
+      "step": 456
+    },
+    {
+      "epoch": 0.14615522996661937,
+      "grad_norm": 0.40795260667800903,
+      "learning_rate": 0.0005791429883672672,
+      "loss": 5.3891,
+      "step": 457
+    },
+    {
+      "epoch": 0.14647504447420495,
+      "grad_norm": 0.4105323255062103,
+      "learning_rate": 0.0005790289605997895,
+      "loss": 5.3823,
+      "step": 458
+    },
+    {
+      "epoch": 0.14679485898179057,
+      "grad_norm": 0.3415970504283905,
+      "learning_rate": 0.0005789146332676407,
+      "loss": 5.3935,
+      "step": 459
+    },
+    {
+      "epoch": 0.14711467348937615,
+      "grad_norm": 0.5799669027328491,
+      "learning_rate": 0.0005788000064935623,
+      "loss": 5.4125,
+      "step": 460
+    },
+    {
+      "epoch": 0.14743448799696177,
+      "grad_norm": 0.3847658038139343,
+      "learning_rate": 0.0005786850804006172,
+      "loss": 5.4022,
+      "step": 461
+    },
+    {
+      "epoch": 0.14775430250454735,
+      "grad_norm": 0.4211963713169098,
+      "learning_rate": 0.0005785698551121897,
+      "loss": 5.4022,
+      "step": 462
+    },
+    {
+      "epoch": 0.14807411701213297,
+      "grad_norm": 0.4019568860530853,
+      "learning_rate": 0.0005784543307519854,
+      "loss": 5.3992,
+      "step": 463
+    },
+    {
+      "epoch": 0.14839393151971855,
+      "grad_norm": 0.3978749215602875,
+      "learning_rate": 0.000578338507444031,
+      "loss": 5.4448,
+      "step": 464
+    },
+    {
+      "epoch": 0.14871374602730417,
+      "grad_norm": 0.44494131207466125,
+      "learning_rate": 0.0005782223853126739,
+      "loss": 5.4292,
+      "step": 465
+    },
+    {
+      "epoch": 0.14903356053488975,
+      "grad_norm": 0.4366230070590973,
+      "learning_rate": 0.0005781059644825824,
+      "loss": 5.4311,
+      "step": 466
+    },
+    {
+      "epoch": 0.14935337504247537,
+      "grad_norm": 0.3958189785480499,
+      "learning_rate": 0.0005779892450787458,
+      "loss": 5.3312,
+      "step": 467
+    },
+    {
+      "epoch": 0.14967318955006095,
+      "grad_norm": 0.43146812915802,
+      "learning_rate": 0.0005778722272264736,
+      "loss": 5.4564,
+      "step": 468
+    },
+    {
+      "epoch": 0.14999300405764657,
+      "grad_norm": 0.5479041337966919,
+      "learning_rate": 0.0005777549110513959,
+      "loss": 5.4525,
+      "step": 469
+    },
+    {
+      "epoch": 0.15031281856523215,
+      "grad_norm": 0.4975782632827759,
+      "learning_rate": 0.0005776372966794628,
+      "loss": 5.4587,
+      "step": 470
+    },
+    {
+      "epoch": 0.15063263307281777,
+      "grad_norm": 0.44812631607055664,
+      "learning_rate": 0.000577519384236945,
+      "loss": 5.4789,
+      "step": 471
+    },
+    {
+      "epoch": 0.15095244758040335,
+      "grad_norm": 0.4622386395931244,
+      "learning_rate": 0.0005774011738504326,
+      "loss": 5.4506,
+      "step": 472
+    },
+    {
+      "epoch": 0.15127226208798897,
+      "grad_norm": 0.4067244827747345,
+      "learning_rate": 0.0005772826656468363,
+      "loss": 5.465,
+      "step": 473
+    },
+    {
+      "epoch": 0.15159207659557455,
+      "grad_norm": 0.41158872842788696,
+      "learning_rate": 0.000577163859753386,
+      "loss": 5.4287,
+      "step": 474
+    },
+    {
+      "epoch": 0.15191189110316017,
+      "grad_norm": 0.38296622037887573,
+      "learning_rate": 0.0005770447562976313,
+      "loss": 5.3948,
+      "step": 475
+    },
+    {
+      "epoch": 0.15223170561074575,
+      "grad_norm": 0.479064017534256,
+      "learning_rate": 0.0005769253554074414,
+      "loss": 5.4274,
+      "step": 476
+    },
+    {
+      "epoch": 0.15255152011833137,
+      "grad_norm": 0.598816990852356,
+      "learning_rate": 0.0005768056572110047,
+      "loss": 5.4695,
+      "step": 477
+    },
+    {
+      "epoch": 0.15287133462591695,
+      "grad_norm": 0.4154305160045624,
+      "learning_rate": 0.000576685661836829,
+      "loss": 5.3828,
+      "step": 478
+    },
+    {
+      "epoch": 0.15319114913350257,
+      "grad_norm": 0.5114104747772217,
+      "learning_rate": 0.0005765653694137406,
+      "loss": 5.3977,
+      "step": 479
+    },
+    {
+      "epoch": 0.15351096364108816,
+      "grad_norm": 0.40716323256492615,
+      "learning_rate": 0.0005764447800708856,
+      "loss": 5.3884,
+      "step": 480
+    },
+    {
+      "epoch": 0.15383077814867377,
+      "grad_norm": 0.41213738918304443,
+      "learning_rate": 0.0005763238939377278,
+      "loss": 5.3991,
+      "step": 481
+    },
+    {
+      "epoch": 0.15415059265625938,
+      "grad_norm": 0.4534163773059845,
+      "learning_rate": 0.0005762027111440506,
+      "loss": 5.4046,
+      "step": 482
+    },
+    {
+      "epoch": 0.15447040716384497,
+      "grad_norm": 0.42364469170570374,
+      "learning_rate": 0.0005760812318199555,
+      "loss": 5.4144,
+      "step": 483
+    },
+    {
+      "epoch": 0.15479022167143058,
+      "grad_norm": 0.486278235912323,
+      "learning_rate": 0.000575959456095862,
+      "loss": 5.3027,
+      "step": 484
+    },
+    {
+      "epoch": 0.15511003617901617,
+      "grad_norm": 0.4105694890022278,
+      "learning_rate": 0.0005758373841025085,
+      "loss": 5.3753,
+      "step": 485
+    },
+    {
+      "epoch": 0.15542985068660178,
+      "grad_norm": 0.41940709948539734,
+      "learning_rate": 0.000575715015970951,
+      "loss": 5.358,
+      "step": 486
+    },
+    {
+      "epoch": 0.15574966519418737,
+      "grad_norm": 0.4332304894924164,
+      "learning_rate": 0.0005755923518325637,
+      "loss": 5.3866,
+      "step": 487
+    },
+    {
+      "epoch": 0.15606947970177298,
+      "grad_norm": 0.4478780925273895,
+      "learning_rate": 0.0005754693918190382,
+      "loss": 5.4167,
+      "step": 488
+    },
+    {
+      "epoch": 0.15638929420935857,
+      "grad_norm": 0.39930951595306396,
+      "learning_rate": 0.0005753461360623842,
+      "loss": 5.3498,
+      "step": 489
+    },
+    {
+      "epoch": 0.15670910871694418,
+      "grad_norm": 0.511391282081604,
+      "learning_rate": 0.0005752225846949287,
+      "loss": 5.4319,
+      "step": 490
+    },
+    {
+      "epoch": 0.15702892322452977,
+      "grad_norm": 0.44350582361221313,
+      "learning_rate": 0.000575098737849316,
+      "loss": 5.3892,
+      "step": 491
+    },
+    {
+      "epoch": 0.15734873773211538,
+      "grad_norm": 0.5511606335639954,
+      "learning_rate": 0.0005749745956585077,
+      "loss": 5.3716,
+      "step": 492
+    },
+    {
+      "epoch": 0.15766855223970097,
+      "grad_norm": 0.44419968128204346,
+      "learning_rate": 0.0005748501582557825,
+      "loss": 5.4444,
+      "step": 493
+    },
+    {
+      "epoch": 0.15798836674728658,
+      "grad_norm": 0.37331676483154297,
+      "learning_rate": 0.0005747254257747362,
+      "loss": 5.39,
+      "step": 494
+    },
+    {
+      "epoch": 0.15830818125487217,
+      "grad_norm": 0.4771307408809662,
+      "learning_rate": 0.0005746003983492811,
+      "loss": 5.344,
+      "step": 495
+    },
+    {
+      "epoch": 0.15862799576245779,
+      "grad_norm": 0.44543081521987915,
+      "learning_rate": 0.0005744750761136463,
+      "loss": 5.3273,
+      "step": 496
+    },
+    {
+      "epoch": 0.15894781027004337,
+      "grad_norm": 0.5002357363700867,
+      "learning_rate": 0.0005743494592023773,
+      "loss": 5.3547,
+      "step": 497
+    },
+    {
+      "epoch": 0.15926762477762899,
+      "grad_norm": 0.7490878105163574,
+      "learning_rate": 0.0005742235477503362,
+      "loss": 5.3641,
+      "step": 498
+    },
+    {
+      "epoch": 0.15958743928521457,
+      "grad_norm": 0.4287881553173065,
+      "learning_rate": 0.000574097341892701,
+      "loss": 5.333,
+      "step": 499
+    },
+    {
+      "epoch": 0.15990725379280019,
+      "grad_norm": 0.4294663071632385,
+      "learning_rate": 0.0005739708417649659,
+      "loss": 5.2918,
+      "step": 500
+    },
+    {
+      "epoch": 0.15990725379280019,
+      "eval_loss": 5.372103691101074,
+      "eval_runtime": 78.0671,
+      "eval_samples_per_second": 24.3,
+      "eval_steps_per_second": 6.085,
+      "step": 500
+    },
+    {
+      "epoch": 0.16022706830038577,
+      "grad_norm": 0.42622506618499756,
+      "learning_rate": 0.0005738440475029414,
+      "loss": 5.3715,
+      "step": 501
+    },
+    {
+      "epoch": 0.16054688280797139,
+      "grad_norm": 0.5203431248664856,
+      "learning_rate": 0.0005737169592427531,
+      "loss": 5.4029,
+      "step": 502
+    },
+    {
+      "epoch": 0.16086669731555697,
+      "grad_norm": 0.4429241120815277,
+      "learning_rate": 0.0005735895771208427,
+      "loss": 5.4216,
+      "step": 503
+    },
+    {
+      "epoch": 0.1611865118231426,
+      "grad_norm": 0.478929728269577,
+      "learning_rate": 0.0005734619012739673,
+      "loss": 5.4132,
+      "step": 504
+    },
+    {
+      "epoch": 0.16150632633072817,
+      "grad_norm": 0.3934062719345093,
+      "learning_rate": 0.0005733339318391992,
+      "loss": 5.4462,
+      "step": 505
+    },
+    {
+      "epoch": 0.1618261408383138,
+      "grad_norm": 0.4275869131088257,
+      "learning_rate": 0.0005732056689539262,
+      "loss": 5.3483,
+      "step": 506
+    },
+    {
+      "epoch": 0.16214595534589937,
+      "grad_norm": 1.1098144054412842,
+      "learning_rate": 0.0005730771127558508,
+      "loss": 5.3647,
+      "step": 507
+    },
+    {
+      "epoch": 0.162465769853485,
+      "grad_norm": 0.4583177864551544,
+      "learning_rate": 0.0005729482633829906,
+      "loss": 5.3677,
+      "step": 508
+    },
+    {
+      "epoch": 0.16278558436107057,
+      "grad_norm": 0.4252079725265503,
+      "learning_rate": 0.000572819120973678,
+      "loss": 5.3601,
+      "step": 509
+    },
+    {
+      "epoch": 0.1631053988686562,
+      "grad_norm": 0.5495724678039551,
+      "learning_rate": 0.0005726896856665599,
+      "loss": 5.3454,
+      "step": 510
+    },
+    {
+      "epoch": 0.16342521337624177,
+      "grad_norm": 0.40474510192871094,
+      "learning_rate": 0.0005725599576005975,
+      "loss": 5.3744,
+      "step": 511
+    },
+    {
+      "epoch": 0.1637450278838274,
+      "grad_norm": 0.4442523717880249,
+      "learning_rate": 0.0005724299369150665,
+      "loss": 5.396,
+      "step": 512
+    },
+    {
+      "epoch": 0.16406484239141297,
+      "grad_norm": 0.4059533178806305,
+      "learning_rate": 0.0005722996237495569,
+      "loss": 5.3996,
+      "step": 513
+    },
+    {
+      "epoch": 0.1643846568989986,
+      "grad_norm": 0.613540768623352,
+      "learning_rate": 0.0005721690182439724,
+      "loss": 5.4113,
+      "step": 514
+    },
+    {
+      "epoch": 0.16470447140658417,
+      "grad_norm": 0.42122897505760193,
+      "learning_rate": 0.0005720381205385306,
+      "loss": 5.3639,
+      "step": 515
+    },
+    {
+      "epoch": 0.1650242859141698,
+      "grad_norm": 0.3783654570579529,
+      "learning_rate": 0.000571906930773763,
+      "loss": 5.3616,
+      "step": 516
+    },
+    {
+      "epoch": 0.16534410042175537,
+      "grad_norm": 0.52512127161026,
+      "learning_rate": 0.0005717754490905146,
+      "loss": 5.3951,
+      "step": 517
+    },
+    {
+      "epoch": 0.165663914929341,
+      "grad_norm": 0.3968490958213806,
+      "learning_rate": 0.0005716436756299437,
+      "loss": 5.337,
+      "step": 518
+    },
+    {
+      "epoch": 0.16598372943692657,
+      "grad_norm": 0.40351352095603943,
+      "learning_rate": 0.000571511610533522,
+      "loss": 5.3883,
+      "step": 519
+    },
+    {
+      "epoch": 0.1663035439445122,
+      "grad_norm": 0.43766388297080994,
+      "learning_rate": 0.0005713792539430339,
+      "loss": 5.3675,
+      "step": 520
+    },
+    {
+      "epoch": 0.16662335845209778,
+      "grad_norm": 0.413519024848938,
+      "learning_rate": 0.0005712466060005774,
+      "loss": 5.368,
+      "step": 521
+    },
+    {
+      "epoch": 0.1669431729596834,
+      "grad_norm": 0.42428046464920044,
+      "learning_rate": 0.0005711136668485626,
+      "loss": 5.2983,
+      "step": 522
+    },
+    {
+      "epoch": 0.16726298746726898,
+      "grad_norm": 0.4255489110946655,
+      "learning_rate": 0.0005709804366297129,
+      "loss": 5.3115,
+      "step": 523
+    },
+    {
+      "epoch": 0.1675828019748546,
+      "grad_norm": 0.44080090522766113,
+      "learning_rate": 0.0005708469154870636,
+      "loss": 5.3866,
+      "step": 524
+    },
+    {
+      "epoch": 0.16790261648244018,
+      "grad_norm": 0.38031458854675293,
+      "learning_rate": 0.0005707131035639629,
+      "loss": 5.3553,
+      "step": 525
+    },
+    {
+      "epoch": 0.1682224309900258,
+      "grad_norm": 0.4256940484046936,
+      "learning_rate": 0.0005705790010040707,
+      "loss": 5.3747,
+      "step": 526
+    },
+    {
+      "epoch": 0.16854224549761138,
+      "grad_norm": 0.41666266322135925,
+      "learning_rate": 0.000570444607951359,
+      "loss": 5.3955,
+      "step": 527
+    },
+    {
+      "epoch": 0.168862060005197,
+      "grad_norm": 0.4019726514816284,
+      "learning_rate": 0.000570309924550112,
+      "loss": 5.3084,
+      "step": 528
+    },
+    {
+      "epoch": 0.16918187451278258,
+      "grad_norm": 0.75457763671875,
+      "learning_rate": 0.0005701749509449253,
+      "loss": 5.2837,
+      "step": 529
+    },
+    {
+      "epoch": 0.1695016890203682,
+      "grad_norm": 0.41248270869255066,
+      "learning_rate": 0.0005700396872807062,
+      "loss": 5.3335,
+      "step": 530
+    },
+    {
+      "epoch": 0.16982150352795378,
+      "grad_norm": 0.4266038239002228,
+      "learning_rate": 0.0005699041337026734,
+      "loss": 5.3278,
+      "step": 531
+    },
+    {
+      "epoch": 0.1701413180355394,
+      "grad_norm": 0.41244661808013916,
+      "learning_rate": 0.0005697682903563568,
+      "loss": 5.3348,
+      "step": 532
+    },
+    {
+      "epoch": 0.17046113254312498,
+      "grad_norm": 0.39695993065834045,
+      "learning_rate": 0.0005696321573875974,
+      "loss": 5.3294,
+      "step": 533
+    },
+    {
+      "epoch": 0.1707809470507106,
+      "grad_norm": 0.4267037808895111,
+      "learning_rate": 0.0005694957349425472,
+      "loss": 5.3353,
+      "step": 534
+    },
+    {
+      "epoch": 0.17110076155829618,
+      "grad_norm": 0.4073215425014496,
+      "learning_rate": 0.0005693590231676688,
+      "loss": 5.3505,
+      "step": 535
+    },
+    {
+      "epoch": 0.1714205760658818,
+      "grad_norm": 0.4800320565700531,
+      "learning_rate": 0.0005692220222097357,
+      "loss": 5.3556,
+      "step": 536
+    },
+    {
+      "epoch": 0.17174039057346738,
+      "grad_norm": 0.42410972714424133,
+      "learning_rate": 0.0005690847322158317,
+      "loss": 5.3686,
+      "step": 537
+    },
+    {
+      "epoch": 0.172060205081053,
+      "grad_norm": 0.4676796495914459,
+      "learning_rate": 0.0005689471533333508,
+      "loss": 5.2979,
+      "step": 538
+    },
+    {
+      "epoch": 0.17238001958863858,
+      "grad_norm": 0.4038192927837372,
+      "learning_rate": 0.0005688092857099974,
+      "loss": 5.2512,
+      "step": 539
+    },
+    {
+      "epoch": 0.1726998340962242,
+      "grad_norm": 0.42532771825790405,
+      "learning_rate": 0.0005686711294937858,
+      "loss": 5.247,
+      "step": 540
+    },
+    {
+      "epoch": 0.17301964860380978,
+      "grad_norm": 0.46493101119995117,
+      "learning_rate": 0.0005685326848330402,
+      "loss": 5.3337,
+      "step": 541
+    },
+    {
+      "epoch": 0.1733394631113954,
+      "grad_norm": 0.45587480068206787,
+      "learning_rate": 0.0005683939518763942,
+      "loss": 5.2597,
+      "step": 542
+    },
+    {
+      "epoch": 0.17365927761898098,
+      "grad_norm": 0.4194573760032654,
+      "learning_rate": 0.000568254930772791,
+      "loss": 5.3739,
+      "step": 543
+    },
+    {
+      "epoch": 0.1739790921265666,
+      "grad_norm": 0.4531188905239105,
+      "learning_rate": 0.0005681156216714836,
+      "loss": 5.2942,
+      "step": 544
+    },
+    {
+      "epoch": 0.17429890663415218,
+      "grad_norm": 0.4515056014060974,
+      "learning_rate": 0.0005679760247220336,
+      "loss": 5.3101,
+      "step": 545
+    },
+    {
+      "epoch": 0.1746187211417378,
+      "grad_norm": 0.43099555373191833,
+      "learning_rate": 0.0005678361400743119,
+      "loss": 5.3211,
+      "step": 546
+    },
+    {
+      "epoch": 0.17493853564932338,
+      "grad_norm": 0.48675426840782166,
+      "learning_rate": 0.0005676959678784982,
+      "loss": 5.215,
+      "step": 547
+    },
+    {
+      "epoch": 0.175258350156909,
+      "grad_norm": 0.40001335740089417,
+      "learning_rate": 0.000567555508285081,
+      "loss": 5.3301,
+      "step": 548
+    },
+    {
+      "epoch": 0.17557816466449458,
+      "grad_norm": 0.45683255791664124,
+      "learning_rate": 0.0005674147614448574,
+      "loss": 5.3602,
+      "step": 549
+    },
+    {
+      "epoch": 0.1758979791720802,
+      "grad_norm": 0.4169895052909851,
+      "learning_rate": 0.0005672737275089327,
+      "loss": 5.3581,
+      "step": 550
+    },
+    {
+      "epoch": 0.1762177936796658,
+      "grad_norm": 0.4237796664237976,
+      "learning_rate": 0.0005671324066287205,
+      "loss": 5.2729,
+      "step": 551
+    },
+    {
+      "epoch": 0.1765376081872514,
+      "grad_norm": 0.47753238677978516,
+      "learning_rate": 0.0005669907989559426,
+      "loss": 5.3352,
+      "step": 552
+    },
+    {
+      "epoch": 0.176857422694837,
+      "grad_norm": 0.4339083731174469,
+      "learning_rate": 0.0005668489046426285,
+      "loss": 5.3531,
+      "step": 553
+    },
+    {
+      "epoch": 0.1771772372024226,
+      "grad_norm": 0.4622672200202942,
+      "learning_rate": 0.0005667067238411153,
+      "loss": 5.3213,
+      "step": 554
+    },
+    {
+      "epoch": 0.1774970517100082,
+      "grad_norm": 0.4333019554615021,
+      "learning_rate": 0.0005665642567040483,
+      "loss": 5.3419,
+      "step": 555
+    },
+    {
+      "epoch": 0.1778168662175938,
+      "grad_norm": 0.3928539752960205,
+      "learning_rate": 0.0005664215033843796,
+      "loss": 5.2607,
+      "step": 556
+    },
+    {
+      "epoch": 0.1781366807251794,
+      "grad_norm": 0.387408584356308,
+      "learning_rate": 0.0005662784640353688,
+      "loss": 5.2742,
+      "step": 557
+    },
+    {
+      "epoch": 0.178456495232765,
+      "grad_norm": 0.4588526487350464,
+      "learning_rate": 0.0005661351388105823,
+      "loss": 5.3356,
+      "step": 558
+    },
+    {
+      "epoch": 0.1787763097403506,
+      "grad_norm": 0.440641850233078,
+      "learning_rate": 0.0005659915278638939,
+      "loss": 5.3318,
+      "step": 559
+    },
+    {
+      "epoch": 0.1790961242479362,
+      "grad_norm": 0.3728678822517395,
+      "learning_rate": 0.0005658476313494839,
+      "loss": 5.2345,
+      "step": 560
+    },
+    {
+      "epoch": 0.1794159387555218,
+      "grad_norm": 0.37536266446113586,
+      "learning_rate": 0.0005657034494218389,
+      "loss": 5.282,
+      "step": 561
+    },
+    {
+      "epoch": 0.1797357532631074,
+      "grad_norm": 0.3769688904285431,
+      "learning_rate": 0.0005655589822357526,
+      "loss": 5.2509,
+      "step": 562
+    },
+    {
+      "epoch": 0.180055567770693,
+      "grad_norm": 0.4108069837093353,
+      "learning_rate": 0.0005654142299463241,
+      "loss": 5.3465,
+      "step": 563
+    },
+    {
+      "epoch": 0.1803753822782786,
+      "grad_norm": 0.3620480000972748,
+      "learning_rate": 0.0005652691927089593,
+      "loss": 5.2491,
+      "step": 564
+    },
+    {
+      "epoch": 0.1806951967858642,
+      "grad_norm": 0.3661453127861023,
+      "learning_rate": 0.0005651238706793697,
+      "loss": 5.2158,
+      "step": 565
+    },
+    {
+      "epoch": 0.1810150112934498,
+      "grad_norm": 0.4069892466068268,
+      "learning_rate": 0.0005649782640135727,
+      "loss": 5.2745,
+      "step": 566
+    },
+    {
+      "epoch": 0.1813348258010354,
+      "grad_norm": 0.42880722880363464,
+      "learning_rate": 0.000564832372867891,
+      "loss": 5.2006,
+      "step": 567
+    },
+    {
+      "epoch": 0.181654640308621,
+      "grad_norm": 0.39120304584503174,
+      "learning_rate": 0.0005646861973989531,
+      "loss": 5.3242,
+      "step": 568
+    },
+    {
+      "epoch": 0.1819744548162066,
+      "grad_norm": 0.35623669624328613,
+      "learning_rate": 0.0005645397377636922,
+      "loss": 5.2269,
+      "step": 569
+    },
+    {
+      "epoch": 0.1822942693237922,
+      "grad_norm": 0.39440712332725525,
+      "learning_rate": 0.0005643929941193474,
+      "loss": 5.2249,
+      "step": 570
+    },
+    {
+      "epoch": 0.1826140838313778,
+      "grad_norm": 0.37591278553009033,
+      "learning_rate": 0.000564245966623462,
+      "loss": 5.2233,
+      "step": 571
+    },
+    {
+      "epoch": 0.1829338983389634,
+      "grad_norm": 0.4042278230190277,
+      "learning_rate": 0.0005640986554338842,
+      "loss": 5.2269,
+      "step": 572
+    },
+    {
+      "epoch": 0.183253712846549,
+      "grad_norm": 0.4501861035823822,
+      "learning_rate": 0.0005639510607087673,
+      "loss": 5.3752,
+      "step": 573
+    },
+    {
+      "epoch": 0.1835735273541346,
+      "grad_norm": 0.3743823170661926,
+      "learning_rate": 0.0005638031826065679,
+      "loss": 5.2696,
+      "step": 574
+    },
+    {
+      "epoch": 0.1838933418617202,
+      "grad_norm": 0.4235236942768097,
+      "learning_rate": 0.0005636550212860479,
+      "loss": 5.2862,
+      "step": 575
+    },
+    {
+      "epoch": 0.1842131563693058,
+      "grad_norm": 0.3701503872871399,
+      "learning_rate": 0.0005635065769062728,
+      "loss": 5.2726,
+      "step": 576
+    },
+    {
+      "epoch": 0.1845329708768914,
+      "grad_norm": 0.45752373337745667,
+      "learning_rate": 0.0005633578496266121,
+      "loss": 5.2393,
+      "step": 577
+    },
+    {
+      "epoch": 0.184852785384477,
+      "grad_norm": 0.35952097177505493,
+      "learning_rate": 0.0005632088396067389,
+      "loss": 5.2572,
+      "step": 578
+    },
+    {
+      "epoch": 0.1851725998920626,
+      "grad_norm": 0.5012151598930359,
+      "learning_rate": 0.0005630595470066299,
+      "loss": 5.3061,
+      "step": 579
+    },
+    {
+      "epoch": 0.1854924143996482,
+      "grad_norm": 0.42023152112960815,
+      "learning_rate": 0.0005629099719865652,
+      "loss": 5.337,
+      "step": 580
+    },
+    {
+      "epoch": 0.1858122289072338,
+      "grad_norm": 0.4002183973789215,
+      "learning_rate": 0.0005627601147071282,
+      "loss": 5.2125,
+      "step": 581
+    },
+    {
+      "epoch": 0.1861320434148194,
+      "grad_norm": 0.4116981327533722,
+      "learning_rate": 0.000562609975329205,
+      "loss": 5.2079,
+      "step": 582
+    },
+    {
+      "epoch": 0.186451857922405,
+      "grad_norm": 0.41482430696487427,
+      "learning_rate": 0.0005624595540139851,
+      "loss": 5.2986,
+      "step": 583
+    },
+    {
+      "epoch": 0.1867716724299906,
+      "grad_norm": 0.380515456199646,
+      "learning_rate": 0.0005623088509229602,
+      "loss": 5.2007,
+      "step": 584
+    },
+    {
+      "epoch": 0.1870914869375762,
+      "grad_norm": 0.42362508177757263,
+      "learning_rate": 0.0005621578662179247,
+      "loss": 5.2153,
+      "step": 585
+    },
+    {
+      "epoch": 0.1874113014451618,
+      "grad_norm": 0.43335291743278503,
+      "learning_rate": 0.0005620066000609755,
+      "loss": 5.2784,
+      "step": 586
+    },
+    {
+      "epoch": 0.1877311159527474,
+      "grad_norm": 0.39207449555397034,
+      "learning_rate": 0.0005618550526145113,
+      "loss": 5.256,
+      "step": 587
+    },
+    {
+      "epoch": 0.188050930460333,
+      "grad_norm": 0.4123283326625824,
+      "learning_rate": 0.0005617032240412329,
+      "loss": 5.2194,
+      "step": 588
+    },
+    {
+      "epoch": 0.1883707449679186,
+      "grad_norm": 0.43690216541290283,
+      "learning_rate": 0.0005615511145041433,
+      "loss": 5.2638,
+      "step": 589
+    },
+    {
+      "epoch": 0.1886905594755042,
+      "grad_norm": 0.41950878500938416,
+      "learning_rate": 0.0005613987241665468,
+      "loss": 5.3,
+      "step": 590
+    },
+    {
+      "epoch": 0.1890103739830898,
+      "grad_norm": 0.4976102113723755,
+      "learning_rate": 0.000561246053192049,
+      "loss": 5.2027,
+      "step": 591
+    },
+    {
+      "epoch": 0.1893301884906754,
+      "grad_norm": 0.4183708131313324,
+      "learning_rate": 0.0005610931017445573,
+      "loss": 5.264,
+      "step": 592
+    },
+    {
+      "epoch": 0.189650002998261,
+      "grad_norm": 0.5086091756820679,
+      "learning_rate": 0.0005609398699882796,
+      "loss": 5.2579,
+      "step": 593
+    },
+    {
+      "epoch": 0.1899698175058466,
+      "grad_norm": 0.3733188509941101,
+      "learning_rate": 0.0005607863580877253,
+      "loss": 5.282,
+      "step": 594
+    },
+    {
+      "epoch": 0.19028963201343221,
+      "grad_norm": 0.5005497336387634,
+      "learning_rate": 0.0005606325662077042,
+      "loss": 5.2219,
+      "step": 595
+    },
+    {
+      "epoch": 0.1906094465210178,
+      "grad_norm": 0.381599485874176,
+      "learning_rate": 0.0005604784945133271,
+      "loss": 5.1787,
+      "step": 596
+    },
+    {
+      "epoch": 0.19092926102860341,
+      "grad_norm": 0.47054019570350647,
+      "learning_rate": 0.0005603241431700045,
+      "loss": 5.2808,
+      "step": 597
+    },
+    {
+      "epoch": 0.191249075536189,
+      "grad_norm": 0.38837242126464844,
+      "learning_rate": 0.0005601695123434477,
+      "loss": 5.1762,
+      "step": 598
+    },
+    {
+      "epoch": 0.19156889004377461,
+      "grad_norm": 0.48838260769844055,
+      "learning_rate": 0.000560014602199668,
+      "loss": 5.2408,
+      "step": 599
+    },
+    {
+      "epoch": 0.1918887045513602,
+      "grad_norm": 0.3695312440395355,
+      "learning_rate": 0.0005598594129049765,
+      "loss": 5.329,
+      "step": 600
+    },
+    {
+      "epoch": 0.1918887045513602,
+      "eval_loss": 5.246670246124268,
+      "eval_runtime": 80.6874,
+      "eval_samples_per_second": 23.51,
+      "eval_steps_per_second": 5.887,
+      "step": 600
+    },
+    {
+      "epoch": 0.19220851905894581,
+      "grad_norm": 0.41941291093826294,
+      "learning_rate": 0.0005597039446259837,
+      "loss": 5.3453,
+      "step": 601
+    },
+    {
+      "epoch": 0.1925283335665314,
+      "grad_norm": 0.4879932999610901,
+      "learning_rate": 0.0005595481975296002,
+      "loss": 5.3331,
+      "step": 602
+    },
+    {
+      "epoch": 0.19284814807411702,
+      "grad_norm": 0.4067867398262024,
+      "learning_rate": 0.0005593921717830354,
+      "loss": 5.2184,
+      "step": 603
+    },
+    {
+      "epoch": 0.1931679625817026,
+      "grad_norm": 0.43347740173339844,
+      "learning_rate": 0.0005592358675537983,
+      "loss": 5.2875,
+      "step": 604
+    },
+    {
+      "epoch": 0.19348777708928822,
+      "grad_norm": 0.41474223136901855,
+      "learning_rate": 0.0005590792850096965,
+      "loss": 5.2204,
+      "step": 605
+    },
+    {
+      "epoch": 0.1938075915968738,
+      "grad_norm": 0.4278968870639801,
+      "learning_rate": 0.0005589224243188365,
+      "loss": 5.2632,
+      "step": 606
+    },
+    {
+      "epoch": 0.19412740610445942,
+      "grad_norm": 0.4322011470794678,
+      "learning_rate": 0.0005587652856496236,
+      "loss": 5.1834,
+      "step": 607
+    },
+    {
+      "epoch": 0.194447220612045,
+      "grad_norm": 0.3791482150554657,
+      "learning_rate": 0.0005586078691707614,
+      "loss": 5.1847,
+      "step": 608
+    },
+    {
+      "epoch": 0.19476703511963062,
+      "grad_norm": 0.37472835183143616,
+      "learning_rate": 0.0005584501750512516,
+      "loss": 5.2734,
+      "step": 609
+    },
+    {
+      "epoch": 0.1950868496272162,
+      "grad_norm": 0.3757517337799072,
+      "learning_rate": 0.0005582922034603945,
+      "loss": 5.2306,
+      "step": 610
+    },
+    {
+      "epoch": 0.19540666413480182,
+      "grad_norm": 0.37529265880584717,
+      "learning_rate": 0.0005581339545677877,
+      "loss": 5.2716,
+      "step": 611
+    },
+    {
+      "epoch": 0.1957264786423874,
+      "grad_norm": 0.38108786940574646,
+      "learning_rate": 0.0005579754285433269,
+      "loss": 5.1982,
+      "step": 612
+    },
+    {
+      "epoch": 0.19604629314997302,
+      "grad_norm": 0.4154306650161743,
+      "learning_rate": 0.0005578166255572048,
+      "loss": 5.2879,
+      "step": 613
+    },
+    {
+      "epoch": 0.1963661076575586,
+      "grad_norm": 0.40405797958374023,
+      "learning_rate": 0.0005576575457799122,
+      "loss": 5.2084,
+      "step": 614
+    },
+    {
+      "epoch": 0.19668592216514422,
+      "grad_norm": 0.4085008502006531,
+      "learning_rate": 0.0005574981893822365,
+      "loss": 5.2127,
+      "step": 615
+    },
+    {
+      "epoch": 0.1970057366727298,
+      "grad_norm": 0.39384084939956665,
+      "learning_rate": 0.0005573385565352622,
+      "loss": 5.2061,
+      "step": 616
+    },
+    {
+      "epoch": 0.19732555118031542,
+      "grad_norm": 0.3667563498020172,
+      "learning_rate": 0.0005571786474103709,
+      "loss": 5.305,
+      "step": 617
+    },
+    {
+      "epoch": 0.19764536568790103,
+      "grad_norm": 0.4089621603488922,
+      "learning_rate": 0.0005570184621792405,
+      "loss": 5.1365,
+      "step": 618
+    },
+    {
+      "epoch": 0.19796518019548662,
+      "grad_norm": 0.3874637484550476,
+      "learning_rate": 0.0005568580010138452,
+      "loss": 5.2534,
+      "step": 619
+    },
+    {
+      "epoch": 0.19828499470307223,
+      "grad_norm": 0.371349036693573,
+      "learning_rate": 0.0005566972640864558,
+      "loss": 5.2055,
+      "step": 620
+    },
+    {
+      "epoch": 0.19860480921065782,
+      "grad_norm": 0.36935508251190186,
+      "learning_rate": 0.0005565362515696389,
+      "loss": 5.2411,
+      "step": 621
+    },
+    {
+      "epoch": 0.19892462371824343,
+      "grad_norm": 0.4087619483470917,
+      "learning_rate": 0.0005563749636362572,
+      "loss": 5.1594,
+      "step": 622
+    },
+    {
+      "epoch": 0.19924443822582902,
+      "grad_norm": 0.3479459285736084,
+      "learning_rate": 0.0005562134004594687,
+      "loss": 5.2632,
+      "step": 623
+    },
+    {
+      "epoch": 0.19956425273341463,
+      "grad_norm": 0.4259833097457886,
+      "learning_rate": 0.0005560515622127276,
+      "loss": 5.1791,
+      "step": 624
+    },
+    {
+      "epoch": 0.19988406724100022,
+      "grad_norm": 0.4216997027397156,
+      "learning_rate": 0.0005558894490697824,
+      "loss": 5.1627,
+      "step": 625
+    },
+    {
+      "epoch": 0.20020388174858583,
+      "grad_norm": 0.37855878472328186,
+      "learning_rate": 0.0005557270612046777,
+      "loss": 5.1793,
+      "step": 626
+    },
+    {
+      "epoch": 0.20052369625617142,
+      "grad_norm": 0.35549864172935486,
+      "learning_rate": 0.0005555643987917525,
+      "loss": 5.1747,
+      "step": 627
+    },
+    {
+      "epoch": 0.20084351076375703,
+      "grad_norm": 0.3608039915561676,
+      "learning_rate": 0.0005554014620056406,
+      "loss": 5.1809,
+      "step": 628
+    },
+    {
+      "epoch": 0.20116332527134262,
+      "grad_norm": 0.35643693804740906,
+      "learning_rate": 0.0005552382510212706,
+      "loss": 5.1599,
+      "step": 629
+    },
+    {
+      "epoch": 0.20148313977892823,
+      "grad_norm": 0.38763564825057983,
+      "learning_rate": 0.0005550747660138653,
+      "loss": 5.2292,
+      "step": 630
+    },
+    {
+      "epoch": 0.20180295428651382,
+      "grad_norm": 0.4711453914642334,
+      "learning_rate": 0.0005549110071589418,
+      "loss": 5.1897,
+      "step": 631
+    },
+    {
+      "epoch": 0.20212276879409943,
+      "grad_norm": 0.3638678789138794,
+      "learning_rate": 0.0005547469746323109,
+      "loss": 5.1936,
+      "step": 632
+    },
+    {
+      "epoch": 0.20244258330168502,
+      "grad_norm": 0.3598167896270752,
+      "learning_rate": 0.0005545826686100776,
+      "loss": 5.2621,
+      "step": 633
+    },
+    {
+      "epoch": 0.20276239780927063,
+      "grad_norm": 0.38647595047950745,
+      "learning_rate": 0.0005544180892686403,
+      "loss": 5.1097,
+      "step": 634
+    },
+    {
+      "epoch": 0.20308221231685622,
+      "grad_norm": 0.38954514265060425,
+      "learning_rate": 0.000554253236784691,
+      "loss": 5.145,
+      "step": 635
+    },
+    {
+      "epoch": 0.20340202682444183,
+      "grad_norm": 0.580151379108429,
+      "learning_rate": 0.0005540881113352148,
+      "loss": 5.1903,
+      "step": 636
+    },
+    {
+      "epoch": 0.20372184133202742,
+      "grad_norm": 0.3926187753677368,
+      "learning_rate": 0.0005539227130974898,
+      "loss": 5.2114,
+      "step": 637
+    },
+    {
+      "epoch": 0.20404165583961303,
+      "grad_norm": 0.38736334443092346,
+      "learning_rate": 0.0005537570422490871,
+      "loss": 5.1262,
+      "step": 638
+    },
+    {
+      "epoch": 0.20436147034719862,
+      "grad_norm": 0.3514604866504669,
+      "learning_rate": 0.0005535910989678706,
+      "loss": 5.1722,
+      "step": 639
+    },
+    {
+      "epoch": 0.20468128485478423,
+      "grad_norm": 0.466145396232605,
+      "learning_rate": 0.0005534248834319962,
+      "loss": 5.1735,
+      "step": 640
+    },
+    {
+      "epoch": 0.20500109936236982,
+      "grad_norm": 0.3967139720916748,
+      "learning_rate": 0.0005532583958199126,
+      "loss": 5.1577,
+      "step": 641
+    },
+    {
+      "epoch": 0.20532091386995543,
+      "grad_norm": 0.8810538053512573,
+      "learning_rate": 0.0005530916363103605,
+      "loss": 5.2778,
+      "step": 642
+    },
+    {
+      "epoch": 0.20564072837754102,
+      "grad_norm": 0.367849737405777,
+      "learning_rate": 0.0005529246050823723,
+      "loss": 5.1614,
+      "step": 643
+    },
+    {
+      "epoch": 0.20596054288512664,
+      "grad_norm": 0.3990894556045532,
+      "learning_rate": 0.0005527573023152722,
+      "loss": 5.1537,
+      "step": 644
+    },
+    {
+      "epoch": 0.20628035739271222,
+      "grad_norm": 0.40600651502609253,
+      "learning_rate": 0.0005525897281886761,
+      "loss": 5.1424,
+      "step": 645
+    },
+    {
+      "epoch": 0.20660017190029784,
+      "grad_norm": 0.3771745264530182,
+      "learning_rate": 0.000552421882882491,
+      "loss": 5.1969,
+      "step": 646
+    },
+    {
+      "epoch": 0.20691998640788342,
+      "grad_norm": 0.4336475133895874,
+      "learning_rate": 0.000552253766576915,
+      "loss": 5.1765,
+      "step": 647
+    },
+    {
+      "epoch": 0.20723980091546904,
+      "grad_norm": 0.36182352900505066,
+      "learning_rate": 0.0005520853794524375,
+      "loss": 5.2697,
+      "step": 648
+    },
+    {
+      "epoch": 0.20755961542305462,
+      "grad_norm": 0.47667577862739563,
+      "learning_rate": 0.0005519167216898383,
+      "loss": 5.2595,
+      "step": 649
+    },
+    {
+      "epoch": 0.20787942993064024,
+      "grad_norm": 0.41349339485168457,
+      "learning_rate": 0.0005517477934701879,
+      "loss": 5.134,
+      "step": 650
+    },
+    {
+      "epoch": 0.20819924443822582,
+      "grad_norm": 0.3495538830757141,
+      "learning_rate": 0.0005515785949748471,
+      "loss": 5.1849,
+      "step": 651
+    },
+    {
+      "epoch": 0.20851905894581144,
+      "grad_norm": 0.3815857768058777,
+      "learning_rate": 0.0005514091263854671,
+      "loss": 5.1395,
+      "step": 652
+    },
+    {
+      "epoch": 0.20883887345339702,
+      "grad_norm": 0.3903793394565582,
+      "learning_rate": 0.0005512393878839885,
+      "loss": 5.0982,
+      "step": 653
+    },
+    {
+      "epoch": 0.20915868796098264,
+      "grad_norm": 0.4030245542526245,
+      "learning_rate": 0.0005510693796526425,
+      "loss": 5.0803,
+      "step": 654
+    },
+    {
+      "epoch": 0.20947850246856822,
+      "grad_norm": 0.4484241008758545,
+      "learning_rate": 0.000550899101873949,
+      "loss": 5.2476,
+      "step": 655
+    },
+    {
+      "epoch": 0.20979831697615384,
+      "grad_norm": 0.43497729301452637,
+      "learning_rate": 0.0005507285547307181,
+      "loss": 5.1655,
+      "step": 656
+    },
+    {
+      "epoch": 0.21011813148373942,
+      "grad_norm": 0.4137641489505768,
+      "learning_rate": 0.0005505577384060485,
+      "loss": 5.17,
+      "step": 657
+    },
+    {
+      "epoch": 0.21043794599132504,
+      "grad_norm": 0.4317357838153839,
+      "learning_rate": 0.0005503866530833281,
+      "loss": 5.219,
+      "step": 658
+    },
+    {
+      "epoch": 0.21075776049891062,
+      "grad_norm": 0.3765553832054138,
+      "learning_rate": 0.0005502152989462337,
+      "loss": 5.243,
+      "step": 659
+    },
+    {
+      "epoch": 0.21107757500649624,
+      "grad_norm": 0.4000135362148285,
+      "learning_rate": 0.0005500436761787306,
+      "loss": 5.1108,
+      "step": 660
+    },
+    {
+      "epoch": 0.21139738951408182,
+      "grad_norm": 0.39066606760025024,
+      "learning_rate": 0.0005498717849650724,
+      "loss": 5.2114,
+      "step": 661
+    },
+    {
+      "epoch": 0.21171720402166744,
+      "grad_norm": 0.3841003179550171,
+      "learning_rate": 0.0005496996254898011,
+      "loss": 5.1906,
+      "step": 662
+    },
+    {
+      "epoch": 0.21203701852925302,
+      "grad_norm": 0.4065331518650055,
+      "learning_rate": 0.0005495271979377464,
+      "loss": 5.1637,
+      "step": 663
+    },
+    {
+      "epoch": 0.21235683303683864,
+      "grad_norm": 0.37493783235549927,
+      "learning_rate": 0.0005493545024940264,
+      "loss": 5.1488,
+      "step": 664
+    },
+    {
+      "epoch": 0.21267664754442422,
+      "grad_norm": 0.3737407922744751,
+      "learning_rate": 0.000549181539344046,
+      "loss": 5.1694,
+      "step": 665
+    },
+    {
+      "epoch": 0.21299646205200984,
+      "grad_norm": 0.3587566018104553,
+      "learning_rate": 0.0005490083086734982,
+      "loss": 5.1737,
+      "step": 666
+    },
+    {
+      "epoch": 0.21331627655959542,
+      "grad_norm": 0.42007777094841003,
+      "learning_rate": 0.000548834810668363,
+      "loss": 5.2466,
+      "step": 667
+    },
+    {
+      "epoch": 0.21363609106718104,
+      "grad_norm": 0.3630130887031555,
+      "learning_rate": 0.0005486610455149069,
+      "loss": 5.2188,
+      "step": 668
+    },
+    {
+      "epoch": 0.21395590557476662,
+      "grad_norm": 0.4182991683483124,
+      "learning_rate": 0.0005484870133996842,
+      "loss": 5.2046,
+      "step": 669
+    },
+    {
+      "epoch": 0.21427572008235224,
+      "grad_norm": 0.40020623803138733,
+      "learning_rate": 0.0005483127145095349,
+      "loss": 5.1565,
+      "step": 670
+    },
+    {
+      "epoch": 0.21459553458993783,
+      "grad_norm": 0.41598305106163025,
+      "learning_rate": 0.0005481381490315859,
+      "loss": 5.2775,
+      "step": 671
+    },
+    {
+      "epoch": 0.21491534909752344,
+      "grad_norm": 0.3689316511154175,
+      "learning_rate": 0.0005479633171532503,
+      "loss": 5.1098,
+      "step": 672
+    },
+    {
+      "epoch": 0.21523516360510903,
+      "grad_norm": 0.40048545598983765,
+      "learning_rate": 0.0005477882190622269,
+      "loss": 5.1345,
+      "step": 673
+    },
+    {
+      "epoch": 0.21555497811269464,
+      "grad_norm": 0.36160582304000854,
+      "learning_rate": 0.0005476128549465006,
+      "loss": 5.1598,
+      "step": 674
+    },
+    {
+      "epoch": 0.21587479262028023,
+      "grad_norm": 0.3673115074634552,
+      "learning_rate": 0.0005474372249943417,
+      "loss": 5.2201,
+      "step": 675
+    },
+    {
+      "epoch": 0.21619460712786584,
+      "grad_norm": 0.4571950435638428,
+      "learning_rate": 0.0005472613293943062,
+      "loss": 5.202,
+      "step": 676
+    },
+    {
+      "epoch": 0.21651442163545143,
+      "grad_norm": 0.35925915837287903,
+      "learning_rate": 0.0005470851683352349,
+      "loss": 5.1643,
+      "step": 677
+    },
+    {
+      "epoch": 0.21683423614303704,
+      "grad_norm": 0.3808862864971161,
+      "learning_rate": 0.0005469087420062538,
+      "loss": 5.1623,
+      "step": 678
+    },
+    {
+      "epoch": 0.21715405065062263,
+      "grad_norm": 0.36828768253326416,
+      "learning_rate": 0.0005467320505967739,
+      "loss": 5.1198,
+      "step": 679
+    },
+    {
+      "epoch": 0.21747386515820824,
+      "grad_norm": 0.3488163352012634,
+      "learning_rate": 0.0005465550942964903,
+      "loss": 5.1343,
+      "step": 680
+    },
+    {
+      "epoch": 0.21779367966579383,
+      "grad_norm": 0.40178030729293823,
+      "learning_rate": 0.000546377873295383,
+      "loss": 5.0838,
+      "step": 681
+    },
+    {
+      "epoch": 0.21811349417337944,
+      "grad_norm": 0.37855249643325806,
+      "learning_rate": 0.0005462003877837157,
+      "loss": 5.1824,
+      "step": 682
+    },
+    {
+      "epoch": 0.21843330868096503,
+      "grad_norm": 0.39485079050064087,
+      "learning_rate": 0.0005460226379520365,
+      "loss": 5.1908,
+      "step": 683
+    },
+    {
+      "epoch": 0.21875312318855064,
+      "grad_norm": 0.4116956889629364,
+      "learning_rate": 0.0005458446239911772,
+      "loss": 5.1255,
+      "step": 684
+    },
+    {
+      "epoch": 0.21907293769613623,
+      "grad_norm": 0.38690185546875,
+      "learning_rate": 0.0005456663460922528,
+      "loss": 5.1903,
+      "step": 685
+    },
+    {
+      "epoch": 0.21939275220372184,
+      "grad_norm": 0.6163234114646912,
+      "learning_rate": 0.000545487804446662,
+      "loss": 5.1338,
+      "step": 686
+    },
+    {
+      "epoch": 0.21971256671130746,
+      "grad_norm": 0.3915090262889862,
+      "learning_rate": 0.0005453089992460868,
+      "loss": 5.1987,
+      "step": 687
+    },
+    {
+      "epoch": 0.22003238121889304,
+      "grad_norm": 0.4104084372520447,
+      "learning_rate": 0.0005451299306824917,
+      "loss": 5.1334,
+      "step": 688
+    },
+    {
+      "epoch": 0.22035219572647866,
+      "grad_norm": 0.44581642746925354,
+      "learning_rate": 0.0005449505989481243,
+      "loss": 5.1779,
+      "step": 689
+    },
+    {
+      "epoch": 0.22067201023406424,
+      "grad_norm": 0.41805586218833923,
+      "learning_rate": 0.0005447710042355145,
+      "loss": 5.1203,
+      "step": 690
+    },
+    {
+      "epoch": 0.22099182474164986,
+      "grad_norm": 0.42157623171806335,
+      "learning_rate": 0.0005445911467374747,
+      "loss": 5.1803,
+      "step": 691
+    },
+    {
+      "epoch": 0.22131163924923544,
+      "grad_norm": 0.38313740491867065,
+      "learning_rate": 0.0005444110266470995,
+      "loss": 5.2184,
+      "step": 692
+    },
+    {
+      "epoch": 0.22163145375682106,
+      "grad_norm": 0.35705432295799255,
+      "learning_rate": 0.0005442306441577651,
+      "loss": 5.3459,
+      "step": 693
+    },
+    {
+      "epoch": 0.22195126826440664,
+      "grad_norm": 0.3512997329235077,
+      "learning_rate": 0.0005440499994631299,
+      "loss": 5.1653,
+      "step": 694
+    },
+    {
+      "epoch": 0.22227108277199226,
+      "grad_norm": 0.38805651664733887,
+      "learning_rate": 0.0005438690927571332,
+      "loss": 5.1115,
+      "step": 695
+    },
+    {
+      "epoch": 0.22259089727957784,
+      "grad_norm": 0.37657782435417175,
+      "learning_rate": 0.000543687924233996,
+      "loss": 5.0689,
+      "step": 696
+    },
+    {
+      "epoch": 0.22291071178716346,
+      "grad_norm": 0.36555126309394836,
+      "learning_rate": 0.0005435064940882204,
+      "loss": 5.1798,
+      "step": 697
+    },
+    {
+      "epoch": 0.22323052629474904,
+      "grad_norm": 0.3793173134326935,
+      "learning_rate": 0.0005433248025145894,
+      "loss": 5.1051,
+      "step": 698
+    },
+    {
+      "epoch": 0.22355034080233466,
+      "grad_norm": 0.434103786945343,
+      "learning_rate": 0.0005431428497081661,
+      "loss": 5.1889,
+      "step": 699
+    },
+    {
+      "epoch": 0.22387015530992024,
+      "grad_norm": 0.3871440291404724,
+      "learning_rate": 0.0005429606358642948,
+      "loss": 5.0479,
+      "step": 700
+    },
+    {
+      "epoch": 0.22387015530992024,
+      "eval_loss": 5.1346116065979,
+      "eval_runtime": 83.8832,
+      "eval_samples_per_second": 22.615,
+      "eval_steps_per_second": 5.663,
+      "step": 700
+    },
+    {
+      "epoch": 0.22418996981750586,
+      "grad_norm": 0.37413862347602844,
+      "learning_rate": 0.0005427781611785998,
+      "loss": 5.0907,
+      "step": 701
+    },
+    {
+      "epoch": 0.22450978432509144,
+      "grad_norm": 0.41222137212753296,
+      "learning_rate": 0.0005425954258469852,
+      "loss": 5.1388,
+      "step": 702
+    },
+    {
+      "epoch": 0.22482959883267706,
+      "grad_norm": 0.3832525908946991,
+      "learning_rate": 0.000542412430065635,
+      "loss": 5.1396,
+      "step": 703
+    },
+    {
+      "epoch": 0.22514941334026264,
+      "grad_norm": 0.3836553692817688,
+      "learning_rate": 0.0005422291740310134,
+      "loss": 5.0898,
+      "step": 704
+    },
+    {
+      "epoch": 0.22546922784784826,
+      "grad_norm": 0.37601372599601746,
+      "learning_rate": 0.0005420456579398632,
+      "loss": 5.1271,
+      "step": 705
+    },
+    {
+      "epoch": 0.22578904235543384,
+      "grad_norm": 0.44405072927474976,
+      "learning_rate": 0.0005418618819892067,
+      "loss": 5.1508,
+      "step": 706
+    },
+    {
+      "epoch": 0.22610885686301946,
+      "grad_norm": 0.3710222542285919,
+      "learning_rate": 0.0005416778463763454,
+      "loss": 5.0919,
+      "step": 707
+    },
+    {
+      "epoch": 0.22642867137060504,
+      "grad_norm": 0.38442203402519226,
+      "learning_rate": 0.0005414935512988593,
+      "loss": 5.1242,
+      "step": 708
+    },
+    {
+      "epoch": 0.22674848587819066,
+      "grad_norm": 0.3756135106086731,
+      "learning_rate": 0.0005413089969546071,
+      "loss": 5.1882,
+      "step": 709
+    },
+    {
+      "epoch": 0.22706830038577624,
+      "grad_norm": 0.38981232047080994,
+      "learning_rate": 0.0005411241835417256,
+      "loss": 5.1904,
+      "step": 710
+    },
+    {
+      "epoch": 0.22738811489336186,
+      "grad_norm": 0.41756415367126465,
+      "learning_rate": 0.0005409391112586303,
+      "loss": 5.1239,
+      "step": 711
+    },
+    {
+      "epoch": 0.22770792940094745,
+      "grad_norm": 0.38702163100242615,
+      "learning_rate": 0.0005407537803040139,
+      "loss": 5.1117,
+      "step": 712
+    },
+    {
+      "epoch": 0.22802774390853306,
+      "grad_norm": 0.40529951453208923,
+      "learning_rate": 0.0005405681908768475,
+      "loss": 5.0795,
+      "step": 713
+    },
+    {
+      "epoch": 0.22834755841611865,
+      "grad_norm": 0.36242255568504333,
+      "learning_rate": 0.0005403823431763791,
+      "loss": 5.1368,
+      "step": 714
+    },
+    {
+      "epoch": 0.22866737292370426,
+      "grad_norm": 0.3760404884815216,
+      "learning_rate": 0.0005401962374021342,
+      "loss": 5.0858,
+      "step": 715
+    },
+    {
+      "epoch": 0.22898718743128985,
+      "grad_norm": 0.3962754011154175,
+      "learning_rate": 0.0005400098737539157,
+      "loss": 5.2717,
+      "step": 716
+    },
+    {
+      "epoch": 0.22930700193887546,
+      "grad_norm": 0.658300518989563,
+      "learning_rate": 0.0005398232524318029,
+      "loss": 5.1172,
+      "step": 717
+    },
+    {
+      "epoch": 0.22962681644646105,
+      "grad_norm": 0.3779653012752533,
+      "learning_rate": 0.0005396363736361519,
+      "loss": 5.1571,
+      "step": 718
+    },
+    {
+      "epoch": 0.22994663095404666,
+      "grad_norm": 0.4019310772418976,
+      "learning_rate": 0.0005394492375675953,
+      "loss": 5.0618,
+      "step": 719
+    },
+    {
+      "epoch": 0.23026644546163225,
+      "grad_norm": 0.38351598381996155,
+      "learning_rate": 0.0005392618444270417,
+      "loss": 5.0987,
+      "step": 720
+    },
+    {
+      "epoch": 0.23058625996921786,
+      "grad_norm": 0.4161638617515564,
+      "learning_rate": 0.0005390741944156759,
+      "loss": 5.1888,
+      "step": 721
+    },
+    {
+      "epoch": 0.23090607447680345,
+      "grad_norm": 0.3698093891143799,
+      "learning_rate": 0.0005388862877349584,
+      "loss": 5.0928,
+      "step": 722
+    },
+    {
+      "epoch": 0.23122588898438906,
+      "grad_norm": 0.4438035786151886,
+      "learning_rate": 0.0005386981245866252,
+      "loss": 5.0899,
+      "step": 723
+    },
+    {
+      "epoch": 0.23154570349197465,
+      "grad_norm": 0.3572853207588196,
+      "learning_rate": 0.0005385097051726879,
+      "loss": 5.1191,
+      "step": 724
+    },
+    {
+      "epoch": 0.23186551799956026,
+      "grad_norm": 0.4035443365573883,
+      "learning_rate": 0.0005383210296954328,
+      "loss": 5.1538,
+      "step": 725
+    },
+    {
+      "epoch": 0.23218533250714585,
+      "grad_norm": 0.3826752305030823,
+      "learning_rate": 0.0005381320983574214,
+      "loss": 5.188,
+      "step": 726
+    },
+    {
+      "epoch": 0.23250514701473146,
+      "grad_norm": 0.38229018449783325,
+      "learning_rate": 0.0005379429113614898,
+      "loss": 5.2084,
+      "step": 727
+    },
+    {
+      "epoch": 0.23282496152231705,
+      "grad_norm": 0.41644829511642456,
+      "learning_rate": 0.0005377534689107487,
+      "loss": 5.0967,
+      "step": 728
+    },
+    {
+      "epoch": 0.23314477602990266,
+      "grad_norm": 0.3715463876724243,
+      "learning_rate": 0.0005375637712085829,
+      "loss": 5.0421,
+      "step": 729
+    },
+    {
+      "epoch": 0.23346459053748825,
+      "grad_norm": 0.3967476785182953,
+      "learning_rate": 0.0005373738184586514,
+      "loss": 5.104,
+      "step": 730
+    },
+    {
+      "epoch": 0.23378440504507386,
+      "grad_norm": 0.3768517076969147,
+      "learning_rate": 0.0005371836108648868,
+      "loss": 5.1045,
+      "step": 731
+    },
+    {
+      "epoch": 0.23410421955265945,
+      "grad_norm": 0.4107721447944641,
+      "learning_rate": 0.0005369931486314953,
+      "loss": 5.0859,
+      "step": 732
+    },
+    {
+      "epoch": 0.23442403406024506,
+      "grad_norm": 0.3913835883140564,
+      "learning_rate": 0.0005368024319629569,
+      "loss": 5.2196,
+      "step": 733
+    },
+    {
+      "epoch": 0.23474384856783065,
+      "grad_norm": 0.4043770730495453,
+      "learning_rate": 0.0005366114610640241,
+      "loss": 5.1076,
+      "step": 734
+    },
+    {
+      "epoch": 0.23506366307541626,
+      "grad_norm": 0.3797147274017334,
+      "learning_rate": 0.000536420236139723,
+      "loss": 5.0718,
+      "step": 735
+    },
+    {
+      "epoch": 0.23538347758300185,
+      "grad_norm": 0.40881407260894775,
+      "learning_rate": 0.000536228757395352,
+      "loss": 5.0545,
+      "step": 736
+    },
+    {
+      "epoch": 0.23570329209058746,
+      "grad_norm": 0.3897201120853424,
+      "learning_rate": 0.000536037025036482,
+      "loss": 5.124,
+      "step": 737
+    },
+    {
+      "epoch": 0.23602310659817305,
+      "grad_norm": 0.3821205198764801,
+      "learning_rate": 0.0005358450392689564,
+      "loss": 5.0568,
+      "step": 738
+    },
+    {
+      "epoch": 0.23634292110575866,
+      "grad_norm": 0.36248835921287537,
+      "learning_rate": 0.0005356528002988907,
+      "loss": 5.1143,
+      "step": 739
+    },
+    {
+      "epoch": 0.23666273561334425,
+      "grad_norm": 0.3769552409648895,
+      "learning_rate": 0.000535460308332672,
+      "loss": 5.107,
+      "step": 740
+    },
+    {
+      "epoch": 0.23698255012092986,
+      "grad_norm": 0.4510996341705322,
+      "learning_rate": 0.0005352675635769589,
+      "loss": 5.2007,
+      "step": 741
+    },
+    {
+      "epoch": 0.23730236462851545,
+      "grad_norm": 0.39315706491470337,
+      "learning_rate": 0.0005350745662386818,
+      "loss": 5.115,
+      "step": 742
+    },
+    {
+      "epoch": 0.23762217913610106,
+      "grad_norm": 0.4506649076938629,
+      "learning_rate": 0.000534881316525042,
+      "loss": 5.0953,
+      "step": 743
+    },
+    {
+      "epoch": 0.23794199364368665,
+      "grad_norm": 0.4265020489692688,
+      "learning_rate": 0.0005346878146435119,
+      "loss": 5.0854,
+      "step": 744
+    },
+    {
+      "epoch": 0.23826180815127226,
+      "grad_norm": 0.7627310752868652,
+      "learning_rate": 0.0005344940608018345,
+      "loss": 5.0993,
+      "step": 745
+    },
+    {
+      "epoch": 0.23858162265885785,
+      "grad_norm": 0.40325039625167847,
+      "learning_rate": 0.0005343000552080235,
+      "loss": 5.1046,
+      "step": 746
+    },
+    {
+      "epoch": 0.23890143716644346,
+      "grad_norm": 0.4026595652103424,
+      "learning_rate": 0.0005341057980703624,
+      "loss": 5.0665,
+      "step": 747
+    },
+    {
+      "epoch": 0.23922125167402905,
+      "grad_norm": 0.38960710167884827,
+      "learning_rate": 0.0005339112895974054,
+      "loss": 5.1161,
+      "step": 748
+    },
+    {
+      "epoch": 0.23954106618161466,
+      "grad_norm": 0.37810978293418884,
+      "learning_rate": 0.0005337165299979761,
+      "loss": 5.1018,
+      "step": 749
+    },
+    {
+      "epoch": 0.23986088068920025,
+      "grad_norm": 0.406655877828598,
+      "learning_rate": 0.0005335215194811678,
+      "loss": 5.1116,
+      "step": 750
+    },
+    {
+      "epoch": 0.24018069519678586,
+      "grad_norm": 0.47911736369132996,
+      "learning_rate": 0.0005333262582563434,
+      "loss": 5.1131,
+      "step": 751
+    },
+    {
+      "epoch": 0.24050050970437145,
+      "grad_norm": 0.4555525779724121,
+      "learning_rate": 0.0005331307465331346,
+      "loss": 5.1441,
+      "step": 752
+    },
+    {
+      "epoch": 0.24082032421195707,
+      "grad_norm": 0.43813085556030273,
+      "learning_rate": 0.0005329349845214421,
+      "loss": 5.099,
+      "step": 753
+    },
+    {
+      "epoch": 0.24114013871954265,
+      "grad_norm": 0.411918967962265,
+      "learning_rate": 0.0005327389724314357,
+      "loss": 5.1283,
+      "step": 754
+    },
+    {
+      "epoch": 0.24145995322712827,
+      "grad_norm": 0.4153035879135132,
+      "learning_rate": 0.0005325427104735533,
+      "loss": 5.0294,
+      "step": 755
+    },
+    {
+      "epoch": 0.24177976773471388,
+      "grad_norm": 0.4288122057914734,
+      "learning_rate": 0.0005323461988585011,
+      "loss": 5.0778,
+      "step": 756
+    },
+    {
+      "epoch": 0.24209958224229947,
+      "grad_norm": 0.4059446454048157,
+      "learning_rate": 0.0005321494377972534,
+      "loss": 5.092,
+      "step": 757
+    },
+    {
+      "epoch": 0.24241939674988508,
+      "grad_norm": 0.4147324860095978,
+      "learning_rate": 0.0005319524275010524,
+      "loss": 5.0944,
+      "step": 758
+    },
+    {
+      "epoch": 0.24273921125747067,
+      "grad_norm": 0.43272823095321655,
+      "learning_rate": 0.0005317551681814076,
+      "loss": 5.1102,
+      "step": 759
+    },
+    {
+      "epoch": 0.24305902576505628,
+      "grad_norm": 0.39714720845222473,
+      "learning_rate": 0.0005315576600500962,
+      "loss": 5.1273,
+      "step": 760
+    },
+    {
+      "epoch": 0.24337884027264187,
+      "grad_norm": 0.38973182439804077,
+      "learning_rate": 0.0005313599033191622,
+      "loss": 5.0397,
+      "step": 761
+    },
+    {
+      "epoch": 0.24369865478022748,
+      "grad_norm": 0.5939362645149231,
+      "learning_rate": 0.0005311618982009168,
+      "loss": 5.1157,
+      "step": 762
+    },
+    {
+      "epoch": 0.24401846928781307,
+      "grad_norm": 0.38486504554748535,
+      "learning_rate": 0.0005309636449079377,
+      "loss": 5.1182,
+      "step": 763
+    },
+    {
+      "epoch": 0.24433828379539868,
+      "grad_norm": 0.4241164028644562,
+      "learning_rate": 0.0005307651436530688,
+      "loss": 5.1045,
+      "step": 764
+    },
+    {
+      "epoch": 0.24465809830298427,
+      "grad_norm": 0.39092934131622314,
+      "learning_rate": 0.0005305663946494208,
+      "loss": 5.0244,
+      "step": 765
+    },
+    {
+      "epoch": 0.24497791281056988,
+      "grad_norm": 0.3757341802120209,
+      "learning_rate": 0.0005303673981103698,
+      "loss": 5.0916,
+      "step": 766
+    },
+    {
+      "epoch": 0.24529772731815547,
+      "grad_norm": 0.41403621435165405,
+      "learning_rate": 0.000530168154249558,
+      "loss": 5.034,
+      "step": 767
+    },
+    {
+      "epoch": 0.24561754182574108,
+      "grad_norm": 0.4083942770957947,
+      "learning_rate": 0.000529968663280893,
+      "loss": 5.0816,
+      "step": 768
+    },
+    {
+      "epoch": 0.24593735633332667,
+      "grad_norm": 0.4417659044265747,
+      "learning_rate": 0.0005297689254185478,
+      "loss": 5.1167,
+      "step": 769
+    },
+    {
+      "epoch": 0.24625717084091228,
+      "grad_norm": 0.3944707214832306,
+      "learning_rate": 0.0005295689408769602,
+      "loss": 5.0785,
+      "step": 770
+    },
+    {
+      "epoch": 0.24657698534849787,
+      "grad_norm": 0.3857533037662506,
+      "learning_rate": 0.0005293687098708332,
+      "loss": 5.1196,
+      "step": 771
+    },
+    {
+      "epoch": 0.24689679985608348,
+      "grad_norm": 0.4001021981239319,
+      "learning_rate": 0.0005291682326151342,
+      "loss": 5.0776,
+      "step": 772
+    },
+    {
+      "epoch": 0.24721661436366907,
+      "grad_norm": 0.4205099046230316,
+      "learning_rate": 0.0005289675093250949,
+      "loss": 5.1358,
+      "step": 773
+    },
+    {
+      "epoch": 0.24753642887125468,
+      "grad_norm": 0.4752465784549713,
+      "learning_rate": 0.0005287665402162112,
+      "loss": 5.0899,
+      "step": 774
+    },
+    {
+      "epoch": 0.24785624337884027,
+      "grad_norm": 0.40020492672920227,
+      "learning_rate": 0.0005285653255042432,
+      "loss": 5.0728,
+      "step": 775
+    },
+    {
+      "epoch": 0.24817605788642588,
+      "grad_norm": 0.38226863741874695,
+      "learning_rate": 0.0005283638654052141,
+      "loss": 5.0339,
+      "step": 776
+    },
+    {
+      "epoch": 0.24849587239401147,
+      "grad_norm": 0.39739298820495605,
+      "learning_rate": 0.000528162160135411,
+      "loss": 5.1028,
+      "step": 777
+    },
+    {
+      "epoch": 0.24881568690159708,
+      "grad_norm": 0.41745543479919434,
+      "learning_rate": 0.000527960209911384,
+      "loss": 5.0628,
+      "step": 778
+    },
+    {
+      "epoch": 0.24913550140918267,
+      "grad_norm": 0.4319940507411957,
+      "learning_rate": 0.0005277580149499465,
+      "loss": 5.0945,
+      "step": 779
+    },
+    {
+      "epoch": 0.24945531591676828,
+      "grad_norm": 0.36128103733062744,
+      "learning_rate": 0.0005275555754681742,
+      "loss": 5.0755,
+      "step": 780
+    },
+    {
+      "epoch": 0.24977513042435387,
+      "grad_norm": 0.3810884356498718,
+      "learning_rate": 0.0005273528916834056,
+      "loss": 5.0289,
+      "step": 781
+    },
+    {
+      "epoch": 0.25009494493193946,
+      "grad_norm": 0.37757015228271484,
+      "learning_rate": 0.0005271499638132415,
+      "loss": 5.0682,
+      "step": 782
+    },
+    {
+      "epoch": 0.25041475943952507,
+      "grad_norm": 0.34914740920066833,
+      "learning_rate": 0.0005269467920755446,
+      "loss": 5.0937,
+      "step": 783
+    },
+    {
+      "epoch": 0.2507345739471107,
+      "grad_norm": 0.43093082308769226,
+      "learning_rate": 0.0005267433766884394,
+      "loss": 5.0231,
+      "step": 784
+    },
+    {
+      "epoch": 0.2510543884546963,
+      "grad_norm": 0.36913996934890747,
+      "learning_rate": 0.0005265397178703122,
+      "loss": 5.0134,
+      "step": 785
+    },
+    {
+      "epoch": 0.25137420296228186,
+      "grad_norm": 0.34622567892074585,
+      "learning_rate": 0.0005263358158398104,
+      "loss": 4.979,
+      "step": 786
+    },
+    {
+      "epoch": 0.25169401746986747,
+      "grad_norm": 0.3433881998062134,
+      "learning_rate": 0.0005261316708158426,
+      "loss": 5.0447,
+      "step": 787
+    },
+    {
+      "epoch": 0.2520138319774531,
+      "grad_norm": 0.34745779633522034,
+      "learning_rate": 0.0005259272830175784,
+      "loss": 5.015,
+      "step": 788
+    },
+    {
+      "epoch": 0.2523336464850387,
+      "grad_norm": 0.38063815236091614,
+      "learning_rate": 0.0005257226526644478,
+      "loss": 5.0884,
+      "step": 789
+    },
+    {
+      "epoch": 0.25265346099262426,
+      "grad_norm": 0.355397492647171,
+      "learning_rate": 0.0005255177799761416,
+      "loss": 5.0463,
+      "step": 790
+    },
+    {
+      "epoch": 0.25297327550020987,
+      "grad_norm": 0.4084603190422058,
+      "learning_rate": 0.0005253126651726102,
+      "loss": 5.0605,
+      "step": 791
+    },
+    {
+      "epoch": 0.2532930900077955,
+      "grad_norm": 0.37883585691452026,
+      "learning_rate": 0.0005251073084740646,
+      "loss": 5.058,
+      "step": 792
+    },
+    {
+      "epoch": 0.2536129045153811,
+      "grad_norm": 0.36867570877075195,
+      "learning_rate": 0.0005249017101009747,
+      "loss": 5.0808,
+      "step": 793
+    },
+    {
+      "epoch": 0.25393271902296666,
+      "grad_norm": 0.38485613465309143,
+      "learning_rate": 0.0005246958702740707,
+      "loss": 5.0665,
+      "step": 794
+    },
+    {
+      "epoch": 0.25425253353055227,
+      "grad_norm": 0.3903096318244934,
+      "learning_rate": 0.0005244897892143414,
+      "loss": 4.9922,
+      "step": 795
+    },
+    {
+      "epoch": 0.2545723480381379,
+      "grad_norm": 0.38079750537872314,
+      "learning_rate": 0.0005242834671430349,
+      "loss": 5.0481,
+      "step": 796
+    },
+    {
+      "epoch": 0.2548921625457235,
+      "grad_norm": 0.36984172463417053,
+      "learning_rate": 0.0005240769042816581,
+      "loss": 4.9691,
+      "step": 797
+    },
+    {
+      "epoch": 0.25521197705330906,
+      "grad_norm": 0.3557301461696625,
+      "learning_rate": 0.0005238701008519761,
+      "loss": 4.9433,
+      "step": 798
+    },
+    {
+      "epoch": 0.25553179156089467,
+      "grad_norm": 0.36051568388938904,
+      "learning_rate": 0.0005236630570760126,
+      "loss": 5.0263,
+      "step": 799
+    },
+    {
+      "epoch": 0.2558516060684803,
+      "grad_norm": 0.3609025180339813,
+      "learning_rate": 0.0005234557731760489,
+      "loss": 5.0769,
+      "step": 800
+    },
+    {
+      "epoch": 0.2558516060684803,
+      "eval_loss": 5.047708988189697,
+      "eval_runtime": 82.315,
+      "eval_samples_per_second": 23.046,
+      "eval_steps_per_second": 5.771,
+      "step": 800
+    },
+    {
+      "epoch": 0.2561714205760659,
+      "grad_norm": 0.35658442974090576,
+      "learning_rate": 0.0005232482493746247,
+      "loss": 5.0813,
+      "step": 801
+    },
+    {
+      "epoch": 0.25649123508365146,
+      "grad_norm": 0.37020763754844666,
+      "learning_rate": 0.0005230404858945369,
+      "loss": 5.1019,
+      "step": 802
+    },
+    {
+      "epoch": 0.2568110495912371,
+      "grad_norm": 0.34233710169792175,
+      "learning_rate": 0.0005228324829588396,
+      "loss": 5.1039,
+      "step": 803
+    },
+    {
+      "epoch": 0.2571308640988227,
+      "grad_norm": 0.37765568494796753,
+      "learning_rate": 0.0005226242407908441,
+      "loss": 5.0349,
+      "step": 804
+    },
+    {
+      "epoch": 0.2574506786064083,
+      "grad_norm": 0.35139763355255127,
+      "learning_rate": 0.0005224157596141189,
+      "loss": 5.0445,
+      "step": 805
+    },
+    {
+      "epoch": 0.25777049311399386,
+      "grad_norm": 0.3577726483345032,
+      "learning_rate": 0.0005222070396524886,
+      "loss": 4.9836,
+      "step": 806
+    },
+    {
+      "epoch": 0.2580903076215795,
+      "grad_norm": 0.34711799025535583,
+      "learning_rate": 0.0005219980811300342,
+      "loss": 5.1969,
+      "step": 807
+    },
+    {
+      "epoch": 0.2584101221291651,
+      "grad_norm": 0.36387181282043457,
+      "learning_rate": 0.0005217888842710931,
+      "loss": 5.0998,
+      "step": 808
+    },
+    {
+      "epoch": 0.2587299366367507,
+      "grad_norm": 0.3965149521827698,
+      "learning_rate": 0.0005215794493002583,
+      "loss": 4.9794,
+      "step": 809
+    },
+    {
+      "epoch": 0.25904975114433626,
+      "grad_norm": 0.39265790581703186,
+      "learning_rate": 0.000521369776442379,
+      "loss": 5.0195,
+      "step": 810
+    },
+    {
+      "epoch": 0.2593695656519219,
+      "grad_norm": 0.381270170211792,
+      "learning_rate": 0.0005211598659225588,
+      "loss": 5.0358,
+      "step": 811
+    },
+    {
+      "epoch": 0.2596893801595075,
+      "grad_norm": 0.38809168338775635,
+      "learning_rate": 0.0005209497179661573,
+      "loss": 5.1098,
+      "step": 812
+    },
+    {
+      "epoch": 0.2600091946670931,
+      "grad_norm": 0.4025643765926361,
+      "learning_rate": 0.0005207393327987886,
+      "loss": 5.0135,
+      "step": 813
+    },
+    {
+      "epoch": 0.26032900917467866,
+      "grad_norm": 0.3857475817203522,
+      "learning_rate": 0.0005205287106463219,
+      "loss": 5.0203,
+      "step": 814
+    },
+    {
+      "epoch": 0.2606488236822643,
+      "grad_norm": 0.386263906955719,
+      "learning_rate": 0.0005203178517348801,
+      "loss": 5.0744,
+      "step": 815
+    },
+    {
+      "epoch": 0.2609686381898499,
+      "grad_norm": 0.4239501357078552,
+      "learning_rate": 0.0005201067562908409,
+      "loss": 4.9913,
+      "step": 816
+    },
+    {
+      "epoch": 0.2612884526974355,
+      "grad_norm": 0.3845202922821045,
+      "learning_rate": 0.0005198954245408359,
+      "loss": 4.9916,
+      "step": 817
+    },
+    {
+      "epoch": 0.26160826720502106,
+      "grad_norm": 0.3505474328994751,
+      "learning_rate": 0.00051968385671175,
+      "loss": 5.0195,
+      "step": 818
+    },
+    {
+      "epoch": 0.2619280817126067,
+      "grad_norm": 0.36485254764556885,
+      "learning_rate": 0.000519472053030722,
+      "loss": 5.023,
+      "step": 819
+    },
+    {
+      "epoch": 0.2622478962201923,
+      "grad_norm": 0.3782816231250763,
+      "learning_rate": 0.0005192600137251435,
+      "loss": 5.0162,
+      "step": 820
+    },
+    {
+      "epoch": 0.2625677107277779,
+      "grad_norm": 0.5833194255828857,
+      "learning_rate": 0.0005190477390226595,
+      "loss": 5.1193,
+      "step": 821
+    },
+    {
+      "epoch": 0.26288752523536346,
+      "grad_norm": 0.3927282989025116,
+      "learning_rate": 0.0005188352291511673,
+      "loss": 4.9848,
+      "step": 822
+    },
+    {
+      "epoch": 0.2632073397429491,
+      "grad_norm": 0.4369112253189087,
+      "learning_rate": 0.000518622484338817,
+      "loss": 5.0558,
+      "step": 823
+    },
+    {
+      "epoch": 0.2635271542505347,
+      "grad_norm": 0.3752604126930237,
+      "learning_rate": 0.0005184095048140106,
+      "loss": 5.0508,
+      "step": 824
+    },
+    {
+      "epoch": 0.2638469687581203,
+      "grad_norm": 0.372988224029541,
+      "learning_rate": 0.0005181962908054027,
+      "loss": 5.0706,
+      "step": 825
+    },
+    {
+      "epoch": 0.2641667832657059,
+      "grad_norm": 0.371185839176178,
+      "learning_rate": 0.0005179828425418988,
+      "loss": 5.029,
+      "step": 826
+    },
+    {
+      "epoch": 0.2644865977732915,
+      "grad_norm": 0.4701564908027649,
+      "learning_rate": 0.0005177691602526566,
+      "loss": 5.0012,
+      "step": 827
+    },
+    {
+      "epoch": 0.2648064122808771,
+      "grad_norm": 0.37562939524650574,
+      "learning_rate": 0.0005175552441670847,
+      "loss": 5.0122,
+      "step": 828
+    },
+    {
+      "epoch": 0.2651262267884627,
+      "grad_norm": 0.3713657855987549,
+      "learning_rate": 0.0005173410945148427,
+      "loss": 5.0855,
+      "step": 829
+    },
+    {
+      "epoch": 0.2654460412960483,
+      "grad_norm": 0.3984169661998749,
+      "learning_rate": 0.0005171267115258412,
+      "loss": 5.0673,
+      "step": 830
+    },
+    {
+      "epoch": 0.2657658558036339,
+      "grad_norm": 0.3704157769680023,
+      "learning_rate": 0.0005169120954302409,
+      "loss": 5.0133,
+      "step": 831
+    },
+    {
+      "epoch": 0.2660856703112195,
+      "grad_norm": 0.38717636466026306,
+      "learning_rate": 0.0005166972464584532,
+      "loss": 4.9717,
+      "step": 832
+    },
+    {
+      "epoch": 0.2664054848188051,
+      "grad_norm": 0.37879058718681335,
+      "learning_rate": 0.0005164821648411394,
+      "loss": 5.0024,
+      "step": 833
+    },
+    {
+      "epoch": 0.2667252993263907,
+      "grad_norm": 0.37203672528266907,
+      "learning_rate": 0.0005162668508092103,
+      "loss": 4.9128,
+      "step": 834
+    },
+    {
+      "epoch": 0.2670451138339763,
+      "grad_norm": 0.3519008755683899,
+      "learning_rate": 0.0005160513045938265,
+      "loss": 4.9973,
+      "step": 835
+    },
+    {
+      "epoch": 0.2673649283415619,
+      "grad_norm": 0.36659252643585205,
+      "learning_rate": 0.0005158355264263978,
+      "loss": 5.0137,
+      "step": 836
+    },
+    {
+      "epoch": 0.2676847428491475,
+      "grad_norm": 0.36538055539131165,
+      "learning_rate": 0.0005156195165385829,
+      "loss": 5.009,
+      "step": 837
+    },
+    {
+      "epoch": 0.2680045573567331,
+      "grad_norm": 0.3755475580692291,
+      "learning_rate": 0.0005154032751622894,
+      "loss": 5.0259,
+      "step": 838
+    },
+    {
+      "epoch": 0.2683243718643187,
+      "grad_norm": 0.3508605360984802,
+      "learning_rate": 0.0005151868025296736,
+      "loss": 5.095,
+      "step": 839
+    },
+    {
+      "epoch": 0.2686441863719043,
+      "grad_norm": 0.3421269655227661,
+      "learning_rate": 0.0005149700988731397,
+      "loss": 4.9484,
+      "step": 840
+    },
+    {
+      "epoch": 0.2689640008794899,
+      "grad_norm": 0.42560285329818726,
+      "learning_rate": 0.0005147531644253402,
+      "loss": 5.11,
+      "step": 841
+    },
+    {
+      "epoch": 0.2692838153870755,
+      "grad_norm": 0.3709728717803955,
+      "learning_rate": 0.0005145359994191751,
+      "loss": 5.0851,
+      "step": 842
+    },
+    {
+      "epoch": 0.2696036298946611,
+      "grad_norm": 0.34871870279312134,
+      "learning_rate": 0.0005143186040877923,
+      "loss": 4.9554,
+      "step": 843
+    },
+    {
+      "epoch": 0.2699234444022467,
+      "grad_norm": 0.3683784604072571,
+      "learning_rate": 0.0005141009786645868,
+      "loss": 4.9996,
+      "step": 844
+    },
+    {
+      "epoch": 0.2702432589098323,
+      "grad_norm": 0.3769448697566986,
+      "learning_rate": 0.0005138831233832005,
+      "loss": 5.0486,
+      "step": 845
+    },
+    {
+      "epoch": 0.2705630734174179,
+      "grad_norm": 0.36065971851348877,
+      "learning_rate": 0.0005136650384775221,
+      "loss": 4.9998,
+      "step": 846
+    },
+    {
+      "epoch": 0.2708828879250035,
+      "grad_norm": 0.37307313084602356,
+      "learning_rate": 0.0005134467241816872,
+      "loss": 4.9706,
+      "step": 847
+    },
+    {
+      "epoch": 0.2712027024325891,
+      "grad_norm": 0.35749685764312744,
+      "learning_rate": 0.0005132281807300773,
+      "loss": 5.0893,
+      "step": 848
+    },
+    {
+      "epoch": 0.2715225169401747,
+      "grad_norm": 0.3517495095729828,
+      "learning_rate": 0.0005130094083573198,
+      "loss": 4.993,
+      "step": 849
+    },
+    {
+      "epoch": 0.2718423314477603,
+      "grad_norm": 0.3668519854545593,
+      "learning_rate": 0.0005127904072982884,
+      "loss": 5.0027,
+      "step": 850
+    },
+    {
+      "epoch": 0.2721621459553459,
+      "grad_norm": 0.3489197790622711,
+      "learning_rate": 0.0005125711777881016,
+      "loss": 4.9578,
+      "step": 851
+    },
+    {
+      "epoch": 0.2724819604629315,
+      "grad_norm": 0.39180049300193787,
+      "learning_rate": 0.0005123517200621238,
+      "loss": 5.0029,
+      "step": 852
+    },
+    {
+      "epoch": 0.2728017749705171,
+      "grad_norm": 0.37387964129447937,
+      "learning_rate": 0.0005121320343559641,
+      "loss": 4.9993,
+      "step": 853
+    },
+    {
+      "epoch": 0.2731215894781027,
+      "grad_norm": 0.369540274143219,
+      "learning_rate": 0.0005119121209054767,
+      "loss": 5.0809,
+      "step": 854
+    },
+    {
+      "epoch": 0.2734414039856883,
+      "grad_norm": 0.3686256408691406,
+      "learning_rate": 0.0005116919799467597,
+      "loss": 5.0736,
+      "step": 855
+    },
+    {
+      "epoch": 0.2737612184932739,
+      "grad_norm": 0.36162644624710083,
+      "learning_rate": 0.0005114716117161558,
+      "loss": 5.0013,
+      "step": 856
+    },
+    {
+      "epoch": 0.2740810330008595,
+      "grad_norm": 0.3603357970714569,
+      "learning_rate": 0.0005112510164502518,
+      "loss": 4.982,
+      "step": 857
+    },
+    {
+      "epoch": 0.2744008475084451,
+      "grad_norm": 0.41782650351524353,
+      "learning_rate": 0.000511030194385878,
+      "loss": 4.9961,
+      "step": 858
+    },
+    {
+      "epoch": 0.2747206620160307,
+      "grad_norm": 0.37291574478149414,
+      "learning_rate": 0.0005108091457601085,
+      "loss": 4.9476,
+      "step": 859
+    },
+    {
+      "epoch": 0.2750404765236163,
+      "grad_norm": 0.3500765562057495,
+      "learning_rate": 0.0005105878708102604,
+      "loss": 4.9956,
+      "step": 860
+    },
+    {
+      "epoch": 0.2753602910312019,
+      "grad_norm": 0.6664988398551941,
+      "learning_rate": 0.0005103663697738937,
+      "loss": 4.9269,
+      "step": 861
+    },
+    {
+      "epoch": 0.2756801055387875,
+      "grad_norm": 0.45024263858795166,
+      "learning_rate": 0.0005101446428888115,
+      "loss": 5.0055,
+      "step": 862
+    },
+    {
+      "epoch": 0.2759999200463731,
+      "grad_norm": 0.3707965314388275,
+      "learning_rate": 0.0005099226903930589,
+      "loss": 5.037,
+      "step": 863
+    },
+    {
+      "epoch": 0.2763197345539587,
+      "grad_norm": 0.4126085937023163,
+      "learning_rate": 0.0005097005125249236,
+      "loss": 4.9696,
+      "step": 864
+    },
+    {
+      "epoch": 0.2766395490615443,
+      "grad_norm": 0.34763750433921814,
+      "learning_rate": 0.0005094781095229352,
+      "loss": 4.9535,
+      "step": 865
+    },
+    {
+      "epoch": 0.2769593635691299,
+      "grad_norm": 0.35203567147254944,
+      "learning_rate": 0.0005092554816258644,
+      "loss": 4.9696,
+      "step": 866
+    },
+    {
+      "epoch": 0.2772791780767155,
+      "grad_norm": 0.36334866285324097,
+      "learning_rate": 0.0005090326290727245,
+      "loss": 4.9889,
+      "step": 867
+    },
+    {
+      "epoch": 0.2775989925843011,
+      "grad_norm": 0.40206074714660645,
+      "learning_rate": 0.0005088095521027689,
+      "loss": 4.9872,
+      "step": 868
+    },
+    {
+      "epoch": 0.2779188070918867,
+      "grad_norm": 0.35013046860694885,
+      "learning_rate": 0.0005085862509554926,
+      "loss": 4.9679,
+      "step": 869
+    },
+    {
+      "epoch": 0.2782386215994723,
+      "grad_norm": 0.40070170164108276,
+      "learning_rate": 0.000508362725870631,
+      "loss": 4.9543,
+      "step": 870
+    },
+    {
+      "epoch": 0.2785584361070579,
+      "grad_norm": 0.3722264766693115,
+      "learning_rate": 0.0005081389770881599,
+      "loss": 5.0307,
+      "step": 871
+    },
+    {
+      "epoch": 0.2788782506146435,
+      "grad_norm": 0.3882618844509125,
+      "learning_rate": 0.0005079150048482954,
+      "loss": 4.9114,
+      "step": 872
+    },
+    {
+      "epoch": 0.2791980651222291,
+      "grad_norm": 0.3612721562385559,
+      "learning_rate": 0.0005076908093914936,
+      "loss": 4.9894,
+      "step": 873
+    },
+    {
+      "epoch": 0.2795178796298147,
+      "grad_norm": 0.39893192052841187,
+      "learning_rate": 0.0005074663909584498,
+      "loss": 4.9934,
+      "step": 874
+    },
+    {
+      "epoch": 0.2798376941374003,
+      "grad_norm": 0.37248483300209045,
+      "learning_rate": 0.000507241749790099,
+      "loss": 4.9761,
+      "step": 875
+    },
+    {
+      "epoch": 0.2801575086449859,
+      "grad_norm": 0.41064324975013733,
+      "learning_rate": 0.0005070168861276155,
+      "loss": 4.9979,
+      "step": 876
+    },
+    {
+      "epoch": 0.2804773231525715,
+      "grad_norm": 0.5655444264411926,
+      "learning_rate": 0.0005067918002124121,
+      "loss": 4.9685,
+      "step": 877
+    },
+    {
+      "epoch": 0.2807971376601571,
+      "grad_norm": 0.37055379152297974,
+      "learning_rate": 0.0005065664922861405,
+      "loss": 5.0583,
+      "step": 878
+    },
+    {
+      "epoch": 0.2811169521677427,
+      "grad_norm": 0.342821329832077,
+      "learning_rate": 0.0005063409625906905,
+      "loss": 4.962,
+      "step": 879
+    },
+    {
+      "epoch": 0.2814367666753283,
+      "grad_norm": 0.3737742006778717,
+      "learning_rate": 0.0005061152113681901,
+      "loss": 4.9767,
+      "step": 880
+    },
+    {
+      "epoch": 0.2817565811829139,
+      "grad_norm": 0.34838446974754333,
+      "learning_rate": 0.0005058892388610053,
+      "loss": 4.9858,
+      "step": 881
+    },
+    {
+      "epoch": 0.2820763956904995,
+      "grad_norm": 0.362216979265213,
+      "learning_rate": 0.0005056630453117394,
+      "loss": 4.986,
+      "step": 882
+    },
+    {
+      "epoch": 0.2823962101980851,
+      "grad_norm": 0.6060847640037537,
+      "learning_rate": 0.0005054366309632333,
+      "loss": 5.0062,
+      "step": 883
+    },
+    {
+      "epoch": 0.2827160247056707,
+      "grad_norm": 0.3720206022262573,
+      "learning_rate": 0.0005052099960585645,
+      "loss": 4.9763,
+      "step": 884
+    },
+    {
+      "epoch": 0.2830358392132563,
+      "grad_norm": 0.3505547046661377,
+      "learning_rate": 0.0005049831408410478,
+      "loss": 4.9872,
+      "step": 885
+    },
+    {
+      "epoch": 0.2833556537208419,
+      "grad_norm": 0.3697142004966736,
+      "learning_rate": 0.0005047560655542342,
+      "loss": 4.9589,
+      "step": 886
+    },
+    {
+      "epoch": 0.2836754682284275,
+      "grad_norm": 0.3885795474052429,
+      "learning_rate": 0.000504528770441911,
+      "loss": 4.974,
+      "step": 887
+    },
+    {
+      "epoch": 0.2839952827360131,
+      "grad_norm": 0.36806532740592957,
+      "learning_rate": 0.0005043012557481016,
+      "loss": 4.9719,
+      "step": 888
+    },
+    {
+      "epoch": 0.2843150972435987,
+      "grad_norm": 0.3809673488140106,
+      "learning_rate": 0.0005040735217170653,
+      "loss": 4.9729,
+      "step": 889
+    },
+    {
+      "epoch": 0.2846349117511843,
+      "grad_norm": 0.38288503885269165,
+      "learning_rate": 0.0005038455685932964,
+      "loss": 4.9595,
+      "step": 890
+    },
+    {
+      "epoch": 0.2849547262587699,
+      "grad_norm": 0.3831663429737091,
+      "learning_rate": 0.0005036173966215248,
+      "loss": 4.9489,
+      "step": 891
+    },
+    {
+      "epoch": 0.2852745407663555,
+      "grad_norm": 0.3987366557121277,
+      "learning_rate": 0.0005033890060467153,
+      "loss": 4.9182,
+      "step": 892
+    },
+    {
+      "epoch": 0.2855943552739411,
+      "grad_norm": 0.37844398617744446,
+      "learning_rate": 0.0005031603971140674,
+      "loss": 4.9515,
+      "step": 893
+    },
+    {
+      "epoch": 0.28591416978152673,
+      "grad_norm": 0.4271279275417328,
+      "learning_rate": 0.000502931570069015,
+      "loss": 4.8621,
+      "step": 894
+    },
+    {
+      "epoch": 0.28623398428911234,
+      "grad_norm": 0.36089277267456055,
+      "learning_rate": 0.0005027025251572259,
+      "loss": 4.973,
+      "step": 895
+    },
+    {
+      "epoch": 0.2865537987966979,
+      "grad_norm": 0.47260555624961853,
+      "learning_rate": 0.0005024732626246022,
+      "loss": 4.9528,
+      "step": 896
+    },
+    {
+      "epoch": 0.2868736133042835,
+      "grad_norm": 0.3725051283836365,
+      "learning_rate": 0.0005022437827172795,
+      "loss": 5.0413,
+      "step": 897
+    },
+    {
+      "epoch": 0.28719342781186913,
+      "grad_norm": 0.3459242284297943,
+      "learning_rate": 0.0005020140856816268,
+      "loss": 5.0664,
+      "step": 898
+    },
+    {
+      "epoch": 0.28751324231945474,
+      "grad_norm": 0.3797368109226227,
+      "learning_rate": 0.0005017841717642461,
+      "loss": 5.0057,
+      "step": 899
+    },
+    {
+      "epoch": 0.2878330568270403,
+      "grad_norm": 0.3647208511829376,
+      "learning_rate": 0.0005015540412119721,
+      "loss": 4.9082,
+      "step": 900
+    },
+    {
+      "epoch": 0.2878330568270403,
+      "eval_loss": 4.972633361816406,
+      "eval_runtime": 83.5318,
+      "eval_samples_per_second": 22.71,
+      "eval_steps_per_second": 5.686,
+      "step": 900
+    },
+    {
+      "epoch": 0.2881528713346259,
+      "grad_norm": 0.3690331280231476,
+      "learning_rate": 0.0005013236942718725,
+      "loss": 4.9479,
+      "step": 901
+    },
+    {
+      "epoch": 0.28847268584221153,
+      "grad_norm": 0.3647288680076599,
+      "learning_rate": 0.0005010931311912473,
+      "loss": 5.0071,
+      "step": 902
+    },
+    {
+      "epoch": 0.28879250034979714,
+      "grad_norm": 0.4730885326862335,
+      "learning_rate": 0.0005008623522176279,
+      "loss": 4.9958,
+      "step": 903
+    },
+    {
+      "epoch": 0.2891123148573827,
+      "grad_norm": 0.3861388564109802,
+      "learning_rate": 0.0005006313575987784,
+      "loss": 4.951,
+      "step": 904
+    },
+    {
+      "epoch": 0.2894321293649683,
+      "grad_norm": 0.36299601197242737,
+      "learning_rate": 0.0005004001475826935,
+      "loss": 5.0293,
+      "step": 905
+    },
+    {
+      "epoch": 0.28975194387255393,
+      "grad_norm": 0.41219601035118103,
+      "learning_rate": 0.0005001687224175999,
+      "loss": 5.0395,
+      "step": 906
+    },
+    {
+      "epoch": 0.29007175838013954,
+      "grad_norm": 0.36094823479652405,
+      "learning_rate": 0.0004999370823519548,
+      "loss": 5.0214,
+      "step": 907
+    },
+    {
+      "epoch": 0.2903915728877251,
+      "grad_norm": 0.3846047818660736,
+      "learning_rate": 0.0004997052276344463,
+      "loss": 5.0025,
+      "step": 908
+    },
+    {
+      "epoch": 0.2907113873953107,
+      "grad_norm": 0.3636736571788788,
+      "learning_rate": 0.000499473158513993,
+      "loss": 5.0134,
+      "step": 909
+    },
+    {
+      "epoch": 0.29103120190289633,
+      "grad_norm": 0.38272932171821594,
+      "learning_rate": 0.0004992408752397437,
+      "loss": 4.9437,
+      "step": 910
+    },
+    {
+      "epoch": 0.29135101641048194,
+      "grad_norm": 0.34067991375923157,
+      "learning_rate": 0.0004990083780610769,
+      "loss": 4.8482,
+      "step": 911
+    },
+    {
+      "epoch": 0.2916708309180675,
+      "grad_norm": 0.35028305649757385,
+      "learning_rate": 0.000498775667227601,
+      "loss": 4.9277,
+      "step": 912
+    },
+    {
+      "epoch": 0.2919906454256531,
+      "grad_norm": 0.37718984484672546,
+      "learning_rate": 0.0004985427429891536,
+      "loss": 5.0258,
+      "step": 913
+    },
+    {
+      "epoch": 0.29231045993323873,
+      "grad_norm": 0.33929359912872314,
+      "learning_rate": 0.0004983096055958014,
+      "loss": 4.9383,
+      "step": 914
+    },
+    {
+      "epoch": 0.29263027444082435,
+      "grad_norm": 0.4247644245624542,
+      "learning_rate": 0.0004980762552978403,
+      "loss": 4.9067,
+      "step": 915
+    },
+    {
+      "epoch": 0.2929500889484099,
+      "grad_norm": 0.3345932364463806,
+      "learning_rate": 0.0004978426923457942,
+      "loss": 4.9354,
+      "step": 916
+    },
+    {
+      "epoch": 0.2932699034559955,
+      "grad_norm": 0.36324384808540344,
+      "learning_rate": 0.0004976089169904156,
+      "loss": 4.8719,
+      "step": 917
+    },
+    {
+      "epoch": 0.29358971796358113,
+      "grad_norm": 0.37019455432891846,
+      "learning_rate": 0.0004973749294826853,
+      "loss": 4.9591,
+      "step": 918
+    },
+    {
+      "epoch": 0.29390953247116675,
+      "grad_norm": 0.35735613107681274,
+      "learning_rate": 0.0004971407300738114,
+      "loss": 4.8741,
+      "step": 919
+    },
+    {
+      "epoch": 0.2942293469787523,
+      "grad_norm": 0.34016358852386475,
+      "learning_rate": 0.0004969063190152297,
+      "loss": 4.8985,
+      "step": 920
+    },
+    {
+      "epoch": 0.2945491614863379,
+      "grad_norm": 0.3893618881702423,
+      "learning_rate": 0.0004966716965586033,
+      "loss": 4.9197,
+      "step": 921
+    },
+    {
+      "epoch": 0.29486897599392353,
+      "grad_norm": 0.34983229637145996,
+      "learning_rate": 0.0004964368629558221,
+      "loss": 4.9452,
+      "step": 922
+    },
+    {
+      "epoch": 0.29518879050150915,
+      "grad_norm": 0.3541092574596405,
+      "learning_rate": 0.0004962018184590028,
+      "loss": 4.9591,
+      "step": 923
+    },
+    {
+      "epoch": 0.2955086050090947,
+      "grad_norm": 0.39640942215919495,
+      "learning_rate": 0.0004959665633204885,
+      "loss": 4.8656,
+      "step": 924
+    },
+    {
+      "epoch": 0.2958284195166803,
+      "grad_norm": 0.3901992440223694,
+      "learning_rate": 0.0004957310977928484,
+      "loss": 4.9126,
+      "step": 925
+    },
+    {
+      "epoch": 0.29614823402426593,
+      "grad_norm": 0.36173298954963684,
+      "learning_rate": 0.0004954954221288775,
+      "loss": 4.9545,
+      "step": 926
+    },
+    {
+      "epoch": 0.29646804853185155,
+      "grad_norm": 0.3484973907470703,
+      "learning_rate": 0.0004952595365815967,
+      "loss": 4.9946,
+      "step": 927
+    },
+    {
+      "epoch": 0.2967878630394371,
+      "grad_norm": 0.366262823343277,
+      "learning_rate": 0.0004950234414042519,
+      "loss": 4.8764,
+      "step": 928
+    },
+    {
+      "epoch": 0.2971076775470227,
+      "grad_norm": 0.348197877407074,
+      "learning_rate": 0.0004947871368503143,
+      "loss": 4.9088,
+      "step": 929
+    },
+    {
+      "epoch": 0.29742749205460833,
+      "grad_norm": 0.4134189486503601,
+      "learning_rate": 0.0004945506231734796,
+      "loss": 4.985,
+      "step": 930
+    },
+    {
+      "epoch": 0.29774730656219395,
+      "grad_norm": 0.3315630555152893,
+      "learning_rate": 0.0004943139006276683,
+      "loss": 4.9724,
+      "step": 931
+    },
+    {
+      "epoch": 0.2980671210697795,
+      "grad_norm": 0.3424299955368042,
+      "learning_rate": 0.0004940769694670251,
+      "loss": 5.0209,
+      "step": 932
+    },
+    {
+      "epoch": 0.2983869355773651,
+      "grad_norm": 0.3581695556640625,
+      "learning_rate": 0.0004938398299459183,
+      "loss": 4.9245,
+      "step": 933
+    },
+    {
+      "epoch": 0.29870675008495073,
+      "grad_norm": 0.34082433581352234,
+      "learning_rate": 0.0004936024823189406,
+      "loss": 4.882,
+      "step": 934
+    },
+    {
+      "epoch": 0.29902656459253635,
+      "grad_norm": 0.40066200494766235,
+      "learning_rate": 0.0004933649268409073,
+      "loss": 4.8821,
+      "step": 935
+    },
+    {
+      "epoch": 0.2993463791001219,
+      "grad_norm": 0.3584911823272705,
+      "learning_rate": 0.0004931271637668577,
+      "loss": 5.0226,
+      "step": 936
+    },
+    {
+      "epoch": 0.2996661936077075,
+      "grad_norm": 0.37688034772872925,
+      "learning_rate": 0.0004928891933520533,
+      "loss": 4.9522,
+      "step": 937
+    },
+    {
+      "epoch": 0.29998600811529313,
+      "grad_norm": 0.37159818410873413,
+      "learning_rate": 0.0004926510158519784,
+      "loss": 4.9429,
+      "step": 938
+    },
+    {
+      "epoch": 0.30030582262287875,
+      "grad_norm": 0.39578378200531006,
+      "learning_rate": 0.0004924126315223396,
+      "loss": 5.0257,
+      "step": 939
+    },
+    {
+      "epoch": 0.3006256371304643,
+      "grad_norm": 0.36488571763038635,
+      "learning_rate": 0.0004921740406190659,
+      "loss": 4.898,
+      "step": 940
+    },
+    {
+      "epoch": 0.3009454516380499,
+      "grad_norm": 0.3469265103340149,
+      "learning_rate": 0.0004919352433983075,
+      "loss": 4.877,
+      "step": 941
+    },
+    {
+      "epoch": 0.30126526614563554,
+      "grad_norm": 0.3934706151485443,
+      "learning_rate": 0.0004916962401164365,
+      "loss": 4.9137,
+      "step": 942
+    },
+    {
+      "epoch": 0.30158508065322115,
+      "grad_norm": 0.3785736858844757,
+      "learning_rate": 0.0004914570310300462,
+      "loss": 4.9126,
+      "step": 943
+    },
+    {
+      "epoch": 0.3019048951608067,
+      "grad_norm": 0.39063695073127747,
+      "learning_rate": 0.0004912176163959506,
+      "loss": 4.975,
+      "step": 944
+    },
+    {
+      "epoch": 0.3022247096683923,
+      "grad_norm": 0.3680741488933563,
+      "learning_rate": 0.0004909779964711848,
+      "loss": 4.8925,
+      "step": 945
+    },
+    {
+      "epoch": 0.30254452417597794,
+      "grad_norm": 0.3956775963306427,
+      "learning_rate": 0.0004907381715130038,
+      "loss": 4.8729,
+      "step": 946
+    },
+    {
+      "epoch": 0.30286433868356355,
+      "grad_norm": 0.39570778608322144,
+      "learning_rate": 0.000490498141778883,
+      "loss": 4.9321,
+      "step": 947
+    },
+    {
+      "epoch": 0.3031841531911491,
+      "grad_norm": 0.38764604926109314,
+      "learning_rate": 0.0004902579075265178,
+      "loss": 4.9018,
+      "step": 948
+    },
+    {
+      "epoch": 0.3035039676987347,
+      "grad_norm": 0.3573993742465973,
+      "learning_rate": 0.0004900174690138229,
+      "loss": 4.9348,
+      "step": 949
+    },
+    {
+      "epoch": 0.30382378220632034,
+      "grad_norm": 0.3773062825202942,
+      "learning_rate": 0.0004897768264989323,
+      "loss": 4.9789,
+      "step": 950
+    },
+    {
+      "epoch": 0.30414359671390595,
+      "grad_norm": 0.36863696575164795,
+      "learning_rate": 0.0004895359802401992,
+      "loss": 4.973,
+      "step": 951
+    },
+    {
+      "epoch": 0.3044634112214915,
+      "grad_norm": 0.3877570629119873,
+      "learning_rate": 0.0004892949304961952,
+      "loss": 4.9151,
+      "step": 952
+    },
+    {
+      "epoch": 0.3047832257290771,
+      "grad_norm": 0.3498269021511078,
+      "learning_rate": 0.0004890536775257109,
+      "loss": 4.8523,
+      "step": 953
+    },
+    {
+      "epoch": 0.30510304023666274,
+      "grad_norm": 0.35669517517089844,
+      "learning_rate": 0.0004888122215877547,
+      "loss": 4.9247,
+      "step": 954
+    },
+    {
+      "epoch": 0.30542285474424835,
+      "grad_norm": 0.3852956295013428,
+      "learning_rate": 0.0004885705629415528,
+      "loss": 4.9261,
+      "step": 955
+    },
+    {
+      "epoch": 0.3057426692518339,
+      "grad_norm": 0.3802240788936615,
+      "learning_rate": 0.0004883287018465494,
+      "loss": 4.9425,
+      "step": 956
+    },
+    {
+      "epoch": 0.3060624837594195,
+      "grad_norm": 0.3908936381340027,
+      "learning_rate": 0.00048808663856240596,
+      "loss": 5.0087,
+      "step": 957
+    },
+    {
+      "epoch": 0.30638229826700514,
+      "grad_norm": 0.39753517508506775,
+      "learning_rate": 0.0004878443733490006,
+      "loss": 4.919,
+      "step": 958
+    },
+    {
+      "epoch": 0.30670211277459075,
+      "grad_norm": 0.3648401200771332,
+      "learning_rate": 0.00048760190646642866,
+      "loss": 4.8682,
+      "step": 959
+    },
+    {
+      "epoch": 0.3070219272821763,
+      "grad_norm": 0.4140999913215637,
+      "learning_rate": 0.000487359238175002,
+      "loss": 4.9643,
+      "step": 960
+    },
+    {
+      "epoch": 0.3073417417897619,
+      "grad_norm": 0.3538423180580139,
+      "learning_rate": 0.00048711636873524856,
+      "loss": 4.9116,
+      "step": 961
+    },
+    {
+      "epoch": 0.30766155629734754,
+      "grad_norm": 0.43048593401908875,
+      "learning_rate": 0.00048687329840791207,
+      "loss": 4.9888,
+      "step": 962
+    },
+    {
+      "epoch": 0.30798137080493315,
+      "grad_norm": 0.3531091511249542,
+      "learning_rate": 0.0004866300274539523,
+      "loss": 4.9137,
+      "step": 963
+    },
+    {
+      "epoch": 0.30830118531251877,
+      "grad_norm": 0.38962531089782715,
+      "learning_rate": 0.0004863865561345442,
+      "loss": 4.9497,
+      "step": 964
+    },
+    {
+      "epoch": 0.3086209998201043,
+      "grad_norm": 0.3531169295310974,
+      "learning_rate": 0.00048614288471107774,
+      "loss": 4.856,
+      "step": 965
+    },
+    {
+      "epoch": 0.30894081432768994,
+      "grad_norm": 0.36637869477272034,
+      "learning_rate": 0.00048589901344515805,
+      "loss": 4.9839,
+      "step": 966
+    },
+    {
+      "epoch": 0.30926062883527555,
+      "grad_norm": 0.3657850921154022,
+      "learning_rate": 0.00048565494259860434,
+      "loss": 4.9139,
+      "step": 967
+    },
+    {
+      "epoch": 0.30958044334286117,
+      "grad_norm": 0.35608503222465515,
+      "learning_rate": 0.00048541067243345064,
+      "loss": 4.9159,
+      "step": 968
+    },
+    {
+      "epoch": 0.3099002578504467,
+      "grad_norm": 0.38662147521972656,
+      "learning_rate": 0.00048516620321194443,
+      "loss": 4.9352,
+      "step": 969
+    },
+    {
+      "epoch": 0.31022007235803234,
+      "grad_norm": 0.3447380065917969,
+      "learning_rate": 0.0004849215351965474,
+      "loss": 4.8427,
+      "step": 970
+    },
+    {
+      "epoch": 0.31053988686561795,
+      "grad_norm": 0.3676728308200836,
+      "learning_rate": 0.0004846766686499342,
+      "loss": 4.9424,
+      "step": 971
+    },
+    {
+      "epoch": 0.31085970137320357,
+      "grad_norm": 0.34721434116363525,
+      "learning_rate": 0.0004844316038349929,
+      "loss": 4.8913,
+      "step": 972
+    },
+    {
+      "epoch": 0.3111795158807891,
+      "grad_norm": 0.3613666892051697,
+      "learning_rate": 0.00048418634101482435,
+      "loss": 4.8762,
+      "step": 973
+    },
+    {
+      "epoch": 0.31149933038837474,
+      "grad_norm": 0.3623393774032593,
+      "learning_rate": 0.000483940880452742,
+      "loss": 4.9168,
+      "step": 974
+    },
+    {
+      "epoch": 0.31181914489596035,
+      "grad_norm": 0.3604717254638672,
+      "learning_rate": 0.0004836952224122716,
+      "loss": 4.9698,
+      "step": 975
+    },
+    {
+      "epoch": 0.31213895940354597,
+      "grad_norm": 0.3825310468673706,
+      "learning_rate": 0.00048344936715715104,
+      "loss": 4.9762,
+      "step": 976
+    },
+    {
+      "epoch": 0.3124587739111315,
+      "grad_norm": 0.34435907006263733,
+      "learning_rate": 0.0004832033149513295,
+      "loss": 4.8628,
+      "step": 977
+    },
+    {
+      "epoch": 0.31277858841871714,
+      "grad_norm": 0.6025840640068054,
+      "learning_rate": 0.0004829570660589681,
+      "loss": 4.9957,
+      "step": 978
+    },
+    {
+      "epoch": 0.31309840292630275,
+      "grad_norm": 0.3436499238014221,
+      "learning_rate": 0.0004827106207444389,
+      "loss": 4.8879,
+      "step": 979
+    },
+    {
+      "epoch": 0.31341821743388837,
+      "grad_norm": 0.4185909032821655,
+      "learning_rate": 0.00048246397927232483,
+      "loss": 4.9352,
+      "step": 980
+    },
+    {
+      "epoch": 0.3137380319414739,
+      "grad_norm": 0.358967125415802,
+      "learning_rate": 0.00048221714190741947,
+      "loss": 4.9247,
+      "step": 981
+    },
+    {
+      "epoch": 0.31405784644905954,
+      "grad_norm": 0.34398168325424194,
+      "learning_rate": 0.00048197010891472665,
+      "loss": 4.8922,
+      "step": 982
+    },
+    {
+      "epoch": 0.31437766095664516,
+      "grad_norm": 0.3618887662887573,
+      "learning_rate": 0.00048172288055946033,
+      "loss": 4.9811,
+      "step": 983
+    },
+    {
+      "epoch": 0.31469747546423077,
+      "grad_norm": 0.3518047332763672,
+      "learning_rate": 0.0004814754571070442,
+      "loss": 4.9583,
+      "step": 984
+    },
+    {
+      "epoch": 0.3150172899718163,
+      "grad_norm": 0.383455753326416,
+      "learning_rate": 0.00048122783882311126,
+      "loss": 4.9536,
+      "step": 985
+    },
+    {
+      "epoch": 0.31533710447940194,
+      "grad_norm": 0.38077494502067566,
+      "learning_rate": 0.0004809800259735038,
+      "loss": 4.9451,
+      "step": 986
+    },
+    {
+      "epoch": 0.31565691898698756,
+      "grad_norm": 0.3760354220867157,
+      "learning_rate": 0.0004807320188242728,
+      "loss": 4.8219,
+      "step": 987
+    },
+    {
+      "epoch": 0.31597673349457317,
+      "grad_norm": 0.3830978274345398,
+      "learning_rate": 0.0004804838176416782,
+      "loss": 4.8887,
+      "step": 988
+    },
+    {
+      "epoch": 0.31629654800215873,
+      "grad_norm": 0.40143024921417236,
+      "learning_rate": 0.000480235422692188,
+      "loss": 4.9925,
+      "step": 989
+    },
+    {
+      "epoch": 0.31661636250974434,
+      "grad_norm": 0.3880362808704376,
+      "learning_rate": 0.0004799868342424784,
+      "loss": 4.8642,
+      "step": 990
+    },
+    {
+      "epoch": 0.31693617701732996,
+      "grad_norm": 0.36687639355659485,
+      "learning_rate": 0.00047973805255943305,
+      "loss": 4.9327,
+      "step": 991
+    },
+    {
+      "epoch": 0.31725599152491557,
+      "grad_norm": 0.3998223841190338,
+      "learning_rate": 0.0004794890779101434,
+      "loss": 4.9667,
+      "step": 992
+    },
+    {
+      "epoch": 0.31757580603250113,
+      "grad_norm": 0.3674963712692261,
+      "learning_rate": 0.0004792399105619077,
+      "loss": 4.9583,
+      "step": 993
+    },
+    {
+      "epoch": 0.31789562054008674,
+      "grad_norm": 0.45018628239631653,
+      "learning_rate": 0.0004789905507822314,
+      "loss": 4.8859,
+      "step": 994
+    },
+    {
+      "epoch": 0.31821543504767236,
+      "grad_norm": 0.3655175268650055,
+      "learning_rate": 0.00047874099883882644,
+      "loss": 4.9264,
+      "step": 995
+    },
+    {
+      "epoch": 0.31853524955525797,
+      "grad_norm": 0.36511993408203125,
+      "learning_rate": 0.000478491254999611,
+      "loss": 4.8902,
+      "step": 996
+    },
+    {
+      "epoch": 0.31885506406284353,
+      "grad_norm": 0.3576342761516571,
+      "learning_rate": 0.0004782413195327094,
+      "loss": 4.937,
+      "step": 997
+    },
+    {
+      "epoch": 0.31917487857042914,
+      "grad_norm": 0.3686801791191101,
+      "learning_rate": 0.0004779911927064516,
+      "loss": 4.8042,
+      "step": 998
+    },
+    {
+      "epoch": 0.31949469307801476,
+      "grad_norm": 0.3380935490131378,
+      "learning_rate": 0.000477740874789373,
+      "loss": 4.9013,
+      "step": 999
+    },
+    {
+      "epoch": 0.31981450758560037,
+      "grad_norm": 0.36893823742866516,
+      "learning_rate": 0.0004774903660502142,
+      "loss": 4.8851,
+      "step": 1000
+    },
+    {
+      "epoch": 0.31981450758560037,
+      "eval_loss": 4.902451992034912,
+      "eval_runtime": 79.3732,
+      "eval_samples_per_second": 23.9,
+      "eval_steps_per_second": 5.984,
+      "step": 1000
+    },
+    {
+      "epoch": 0.32013432209318593,
+      "grad_norm": 0.37070271372795105,
+      "learning_rate": 0.0004772396667579205,
+      "loss": 4.962,
+      "step": 1001
+    },
+    {
+      "epoch": 0.32045413660077154,
+      "grad_norm": 0.3702748417854309,
+      "learning_rate": 0.0004769887771816422,
+      "loss": 4.8628,
+      "step": 1002
+    },
+    {
+      "epoch": 0.32077395110835716,
+      "grad_norm": 0.39788708090782166,
+      "learning_rate": 0.0004767376975907334,
+      "loss": 4.9365,
+      "step": 1003
+    },
+    {
+      "epoch": 0.32109376561594277,
+      "grad_norm": 0.36875808238983154,
+      "learning_rate": 0.00047648642825475255,
+      "loss": 4.8822,
+      "step": 1004
+    },
+    {
+      "epoch": 0.32141358012352833,
+      "grad_norm": 0.356067419052124,
+      "learning_rate": 0.0004762349694434615,
+      "loss": 4.8964,
+      "step": 1005
+    },
+    {
+      "epoch": 0.32173339463111394,
+      "grad_norm": 0.37602582573890686,
+      "learning_rate": 0.0004759833214268259,
+      "loss": 4.8708,
+      "step": 1006
+    },
+    {
+      "epoch": 0.32205320913869956,
+      "grad_norm": 0.36407309770584106,
+      "learning_rate": 0.0004757314844750141,
+      "loss": 4.854,
+      "step": 1007
+    },
+    {
+      "epoch": 0.3223730236462852,
+      "grad_norm": 0.35537827014923096,
+      "learning_rate": 0.00047547945885839763,
+      "loss": 4.9622,
+      "step": 1008
+    },
+    {
+      "epoch": 0.32269283815387073,
+      "grad_norm": 4.673837661743164,
+      "learning_rate": 0.00047522724484755054,
+      "loss": 4.937,
+      "step": 1009
+    },
+    {
+      "epoch": 0.32301265266145635,
+      "grad_norm": 0.4190693795681,
+      "learning_rate": 0.0004749748427132488,
+      "loss": 4.8952,
+      "step": 1010
+    },
+    {
+      "epoch": 0.32333246716904196,
+      "grad_norm": 0.3714112639427185,
+      "learning_rate": 0.00047472225272647084,
+      "loss": 4.9616,
+      "step": 1011
+    },
+    {
+      "epoch": 0.3236522816766276,
+      "grad_norm": 0.3778044283390045,
+      "learning_rate": 0.00047446947515839634,
+      "loss": 4.9127,
+      "step": 1012
+    },
+    {
+      "epoch": 0.32397209618421313,
+      "grad_norm": 0.36721158027648926,
+      "learning_rate": 0.0004742165102804067,
+      "loss": 4.9322,
+      "step": 1013
+    },
+    {
+      "epoch": 0.32429191069179875,
+      "grad_norm": 0.3614025413990021,
+      "learning_rate": 0.00047396335836408427,
+      "loss": 4.9971,
+      "step": 1014
+    },
+    {
+      "epoch": 0.32461172519938436,
+      "grad_norm": 0.3511156737804413,
+      "learning_rate": 0.0004737100196812121,
+      "loss": 4.8961,
+      "step": 1015
+    },
+    {
+      "epoch": 0.32493153970697,
+      "grad_norm": 0.3765702247619629,
+      "learning_rate": 0.00047345649450377395,
+      "loss": 4.8508,
+      "step": 1016
+    },
+    {
+      "epoch": 0.32525135421455553,
+      "grad_norm": 0.3439734876155853,
+      "learning_rate": 0.0004732027831039536,
+      "loss": 4.7858,
+      "step": 1017
+    },
+    {
+      "epoch": 0.32557116872214115,
+      "grad_norm": 0.3662970960140228,
+      "learning_rate": 0.00047294888575413486,
+      "loss": 4.9161,
+      "step": 1018
+    },
+    {
+      "epoch": 0.32589098322972676,
+      "grad_norm": 0.3613886535167694,
+      "learning_rate": 0.0004726948027269013,
+      "loss": 4.9031,
+      "step": 1019
+    },
+    {
+      "epoch": 0.3262107977373124,
+      "grad_norm": 0.38528895378112793,
+      "learning_rate": 0.00047244053429503565,
+      "loss": 4.9852,
+      "step": 1020
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 0.3607567846775055,
+      "learning_rate": 0.00047218608073151976,
+      "loss": 4.8439,
+      "step": 1021
+    },
+    {
+      "epoch": 0.32685042675248355,
+      "grad_norm": 0.36917412281036377,
+      "learning_rate": 0.0004719314423095342,
+      "loss": 4.9332,
+      "step": 1022
+    },
+    {
+      "epoch": 0.32717024126006916,
+      "grad_norm": 0.3716620206832886,
+      "learning_rate": 0.00047167661930245803,
+      "loss": 4.805,
+      "step": 1023
+    },
+    {
+      "epoch": 0.3274900557676548,
+      "grad_norm": 0.3702612519264221,
+      "learning_rate": 0.0004714216119838685,
+      "loss": 4.8916,
+      "step": 1024
+    },
+    {
+      "epoch": 0.32780987027524033,
+      "grad_norm": 0.3598913252353668,
+      "learning_rate": 0.00047116642062754074,
+      "loss": 4.8999,
+      "step": 1025
+    },
+    {
+      "epoch": 0.32812968478282595,
+      "grad_norm": 0.34733474254608154,
+      "learning_rate": 0.00047091104550744733,
+      "loss": 4.8999,
+      "step": 1026
+    },
+    {
+      "epoch": 0.32844949929041156,
+      "grad_norm": 0.34953516721725464,
+      "learning_rate": 0.00047065548689775844,
+      "loss": 4.844,
+      "step": 1027
+    },
+    {
+      "epoch": 0.3287693137979972,
+      "grad_norm": 0.37234926223754883,
+      "learning_rate": 0.00047039974507284086,
+      "loss": 4.9322,
+      "step": 1028
+    },
+    {
+      "epoch": 0.32908912830558273,
+      "grad_norm": 0.35275664925575256,
+      "learning_rate": 0.0004701438203072584,
+      "loss": 4.8971,
+      "step": 1029
+    },
+    {
+      "epoch": 0.32940894281316835,
+      "grad_norm": 0.365399569272995,
+      "learning_rate": 0.00046988771287577105,
+      "loss": 4.9074,
+      "step": 1030
+    },
+    {
+      "epoch": 0.32972875732075396,
+      "grad_norm": 0.345048725605011,
+      "learning_rate": 0.0004696314230533349,
+      "loss": 4.8695,
+      "step": 1031
+    },
+    {
+      "epoch": 0.3300485718283396,
+      "grad_norm": 0.33528172969818115,
+      "learning_rate": 0.00046937495111510204,
+      "loss": 4.8512,
+      "step": 1032
+    },
+    {
+      "epoch": 0.3303683863359252,
+      "grad_norm": 0.3688417375087738,
+      "learning_rate": 0.00046911829733642016,
+      "loss": 4.8542,
+      "step": 1033
+    },
+    {
+      "epoch": 0.33068820084351075,
+      "grad_norm": 0.3624173104763031,
+      "learning_rate": 0.0004688614619928318,
+      "loss": 4.9194,
+      "step": 1034
+    },
+    {
+      "epoch": 0.33100801535109636,
+      "grad_norm": 0.35962924361228943,
+      "learning_rate": 0.00046860444536007473,
+      "loss": 4.9694,
+      "step": 1035
+    },
+    {
+      "epoch": 0.331327829858682,
+      "grad_norm": 0.36196452379226685,
+      "learning_rate": 0.0004683472477140811,
+      "loss": 4.911,
+      "step": 1036
+    },
+    {
+      "epoch": 0.3316476443662676,
+      "grad_norm": 0.347702294588089,
+      "learning_rate": 0.0004680898693309777,
+      "loss": 4.859,
+      "step": 1037
+    },
+    {
+      "epoch": 0.33196745887385315,
+      "grad_norm": 0.36738497018814087,
+      "learning_rate": 0.0004678323104870852,
+      "loss": 4.9272,
+      "step": 1038
+    },
+    {
+      "epoch": 0.33228727338143876,
+      "grad_norm": 0.3768245577812195,
+      "learning_rate": 0.00046757457145891784,
+      "loss": 4.812,
+      "step": 1039
+    },
+    {
+      "epoch": 0.3326070878890244,
+      "grad_norm": 0.36470353603363037,
+      "learning_rate": 0.00046731665252318366,
+      "loss": 4.871,
+      "step": 1040
+    },
+    {
+      "epoch": 0.33292690239661,
+      "grad_norm": 0.388289213180542,
+      "learning_rate": 0.0004670585539567837,
+      "loss": 4.868,
+      "step": 1041
+    },
+    {
+      "epoch": 0.33324671690419555,
+      "grad_norm": 0.35525208711624146,
+      "learning_rate": 0.00046680027603681164,
+      "loss": 4.8379,
+      "step": 1042
+    },
+    {
+      "epoch": 0.33356653141178116,
+      "grad_norm": 0.4119868874549866,
+      "learning_rate": 0.0004665418190405541,
+      "loss": 4.8108,
+      "step": 1043
+    },
+    {
+      "epoch": 0.3338863459193668,
+      "grad_norm": 0.35186243057250977,
+      "learning_rate": 0.0004662831832454895,
+      "loss": 4.9152,
+      "step": 1044
+    },
+    {
+      "epoch": 0.3342061604269524,
+      "grad_norm": 0.3671918511390686,
+      "learning_rate": 0.00046602436892928875,
+      "loss": 4.8589,
+      "step": 1045
+    },
+    {
+      "epoch": 0.33452597493453795,
+      "grad_norm": 0.34154975414276123,
+      "learning_rate": 0.000465765376369814,
+      "loss": 4.8987,
+      "step": 1046
+    },
+    {
+      "epoch": 0.33484578944212356,
+      "grad_norm": 0.3686893582344055,
+      "learning_rate": 0.00046550620584511886,
+      "loss": 4.8835,
+      "step": 1047
+    },
+    {
+      "epoch": 0.3351656039497092,
+      "grad_norm": 0.3374258279800415,
+      "learning_rate": 0.00046524685763344803,
+      "loss": 4.8285,
+      "step": 1048
+    },
+    {
+      "epoch": 0.3354854184572948,
+      "grad_norm": 0.3705580532550812,
+      "learning_rate": 0.00046498733201323715,
+      "loss": 4.84,
+      "step": 1049
+    },
+    {
+      "epoch": 0.33580523296488035,
+      "grad_norm": 0.36040836572647095,
+      "learning_rate": 0.000464727629263112,
+      "loss": 4.8931,
+      "step": 1050
+    },
+    {
+      "epoch": 0.33612504747246597,
+      "grad_norm": 0.38583824038505554,
+      "learning_rate": 0.0004644677496618887,
+      "loss": 4.8973,
+      "step": 1051
+    },
+    {
+      "epoch": 0.3364448619800516,
+      "grad_norm": 0.36153537034988403,
+      "learning_rate": 0.00046420769348857343,
+      "loss": 4.8919,
+      "step": 1052
+    },
+    {
+      "epoch": 0.3367646764876372,
+      "grad_norm": 0.3621228039264679,
+      "learning_rate": 0.00046394746102236144,
+      "loss": 4.8117,
+      "step": 1053
+    },
+    {
+      "epoch": 0.33708449099522275,
+      "grad_norm": 0.41207414865493774,
+      "learning_rate": 0.00046368705254263773,
+      "loss": 4.853,
+      "step": 1054
+    },
+    {
+      "epoch": 0.33740430550280837,
+      "grad_norm": 0.37452879548072815,
+      "learning_rate": 0.000463426468328976,
+      "loss": 4.839,
+      "step": 1055
+    },
+    {
+      "epoch": 0.337724120010394,
+      "grad_norm": 0.3425996005535126,
+      "learning_rate": 0.0004631657086611387,
+      "loss": 4.8584,
+      "step": 1056
+    },
+    {
+      "epoch": 0.3380439345179796,
+      "grad_norm": 0.36205095052719116,
+      "learning_rate": 0.0004629047738190767,
+      "loss": 4.8762,
+      "step": 1057
+    },
+    {
+      "epoch": 0.33836374902556515,
+      "grad_norm": 0.3805515170097351,
+      "learning_rate": 0.00046264366408292883,
+      "loss": 4.8817,
+      "step": 1058
+    },
+    {
+      "epoch": 0.33868356353315077,
+      "grad_norm": 0.3517549932003021,
+      "learning_rate": 0.0004623823797330216,
+      "loss": 4.9096,
+      "step": 1059
+    },
+    {
+      "epoch": 0.3390033780407364,
+      "grad_norm": 0.35046592354774475,
+      "learning_rate": 0.00046212092104986946,
+      "loss": 4.9018,
+      "step": 1060
+    },
+    {
+      "epoch": 0.339323192548322,
+      "grad_norm": 0.3718402087688446,
+      "learning_rate": 0.0004618592883141734,
+      "loss": 4.8226,
+      "step": 1061
+    },
+    {
+      "epoch": 0.33964300705590755,
+      "grad_norm": 0.4067389965057373,
+      "learning_rate": 0.00046159748180682166,
+      "loss": 4.8947,
+      "step": 1062
+    },
+    {
+      "epoch": 0.33996282156349317,
+      "grad_norm": 0.3579217791557312,
+      "learning_rate": 0.0004613355018088889,
+      "loss": 4.7653,
+      "step": 1063
+    },
+    {
+      "epoch": 0.3402826360710788,
+      "grad_norm": 0.3520847260951996,
+      "learning_rate": 0.00046107334860163605,
+      "loss": 4.8847,
+      "step": 1064
+    },
+    {
+      "epoch": 0.3406024505786644,
+      "grad_norm": 0.37844428420066833,
+      "learning_rate": 0.00046081102246651014,
+      "loss": 4.8943,
+      "step": 1065
+    },
+    {
+      "epoch": 0.34092226508624995,
+      "grad_norm": 0.343375027179718,
+      "learning_rate": 0.0004605485236851436,
+      "loss": 4.8446,
+      "step": 1066
+    },
+    {
+      "epoch": 0.34124207959383557,
+      "grad_norm": 0.3797420263290405,
+      "learning_rate": 0.0004602858525393544,
+      "loss": 4.7879,
+      "step": 1067
+    },
+    {
+      "epoch": 0.3415618941014212,
+      "grad_norm": 0.3718990385532379,
+      "learning_rate": 0.00046002300931114555,
+      "loss": 4.8804,
+      "step": 1068
+    },
+    {
+      "epoch": 0.3418817086090068,
+      "grad_norm": 0.35296258330345154,
+      "learning_rate": 0.0004597599942827048,
+      "loss": 4.8964,
+      "step": 1069
+    },
+    {
+      "epoch": 0.34220152311659235,
+      "grad_norm": 0.3609677851200104,
+      "learning_rate": 0.0004594968077364041,
+      "loss": 4.9593,
+      "step": 1070
+    },
+    {
+      "epoch": 0.34252133762417797,
+      "grad_norm": 0.354596346616745,
+      "learning_rate": 0.00045923344995480006,
+      "loss": 4.8428,
+      "step": 1071
+    },
+    {
+      "epoch": 0.3428411521317636,
+      "grad_norm": 0.357670396566391,
+      "learning_rate": 0.0004589699212206325,
+      "loss": 4.9326,
+      "step": 1072
+    },
+    {
+      "epoch": 0.3431609666393492,
+      "grad_norm": 0.3854842185974121,
+      "learning_rate": 0.0004587062218168253,
+      "loss": 4.9109,
+      "step": 1073
+    },
+    {
+      "epoch": 0.34348078114693475,
+      "grad_norm": 0.3449469804763794,
+      "learning_rate": 0.0004584423520264853,
+      "loss": 4.9041,
+      "step": 1074
+    },
+    {
+      "epoch": 0.34380059565452037,
+      "grad_norm": 0.3718896508216858,
+      "learning_rate": 0.0004581783121329024,
+      "loss": 4.8633,
+      "step": 1075
+    },
+    {
+      "epoch": 0.344120410162106,
+      "grad_norm": 0.3840758502483368,
+      "learning_rate": 0.00045791410241954894,
+      "loss": 4.7397,
+      "step": 1076
+    },
+    {
+      "epoch": 0.3444402246696916,
+      "grad_norm": 0.3534940183162689,
+      "learning_rate": 0.0004576497231700798,
+      "loss": 4.8555,
+      "step": 1077
+    },
+    {
+      "epoch": 0.34476003917727716,
+      "grad_norm": 0.3727824091911316,
+      "learning_rate": 0.0004573851746683317,
+      "loss": 4.8823,
+      "step": 1078
+    },
+    {
+      "epoch": 0.34507985368486277,
+      "grad_norm": 0.345478355884552,
+      "learning_rate": 0.00045712045719832313,
+      "loss": 4.7997,
+      "step": 1079
+    },
+    {
+      "epoch": 0.3453996681924484,
+      "grad_norm": 0.3740197420120239,
+      "learning_rate": 0.00045685557104425397,
+      "loss": 4.9766,
+      "step": 1080
+    },
+    {
+      "epoch": 0.345719482700034,
+      "grad_norm": 0.36381879448890686,
+      "learning_rate": 0.00045659051649050525,
+      "loss": 4.8279,
+      "step": 1081
+    },
+    {
+      "epoch": 0.34603929720761956,
+      "grad_norm": 0.35513120889663696,
+      "learning_rate": 0.00045632529382163883,
+      "loss": 4.822,
+      "step": 1082
+    },
+    {
+      "epoch": 0.34635911171520517,
+      "grad_norm": 0.3540807068347931,
+      "learning_rate": 0.00045605990332239684,
+      "loss": 4.8205,
+      "step": 1083
+    },
+    {
+      "epoch": 0.3466789262227908,
+      "grad_norm": 0.3478928804397583,
+      "learning_rate": 0.00045579434527770186,
+      "loss": 4.8185,
+      "step": 1084
+    },
+    {
+      "epoch": 0.3469987407303764,
+      "grad_norm": 0.3550429344177246,
+      "learning_rate": 0.0004555286199726561,
+      "loss": 4.795,
+      "step": 1085
+    },
+    {
+      "epoch": 0.34731855523796196,
+      "grad_norm": 0.35866010189056396,
+      "learning_rate": 0.0004552627276925416,
+      "loss": 4.8276,
+      "step": 1086
+    },
+    {
+      "epoch": 0.34763836974554757,
+      "grad_norm": 0.35746774077415466,
+      "learning_rate": 0.0004549966687228195,
+      "loss": 4.8081,
+      "step": 1087
+    },
+    {
+      "epoch": 0.3479581842531332,
+      "grad_norm": 0.3648238778114319,
+      "learning_rate": 0.0004547304433491299,
+      "loss": 4.8687,
+      "step": 1088
+    },
+    {
+      "epoch": 0.3482779987607188,
+      "grad_norm": 0.3464508354663849,
+      "learning_rate": 0.00045446405185729154,
+      "loss": 4.8114,
+      "step": 1089
+    },
+    {
+      "epoch": 0.34859781326830436,
+      "grad_norm": 0.38740813732147217,
+      "learning_rate": 0.00045419749453330167,
+      "loss": 4.827,
+      "step": 1090
+    },
+    {
+      "epoch": 0.34891762777588997,
+      "grad_norm": 0.36972126364707947,
+      "learning_rate": 0.00045393077166333524,
+      "loss": 4.8716,
+      "step": 1091
+    },
+    {
+      "epoch": 0.3492374422834756,
+      "grad_norm": 0.3767491579055786,
+      "learning_rate": 0.0004536638835337452,
+      "loss": 4.8603,
+      "step": 1092
+    },
+    {
+      "epoch": 0.3495572567910612,
+      "grad_norm": 0.3509376347064972,
+      "learning_rate": 0.00045339683043106214,
+      "loss": 4.8471,
+      "step": 1093
+    },
+    {
+      "epoch": 0.34987707129864676,
+      "grad_norm": 0.34784337878227234,
+      "learning_rate": 0.00045312961264199316,
+      "loss": 4.7883,
+      "step": 1094
+    },
+    {
+      "epoch": 0.35019688580623237,
+      "grad_norm": 0.35170140862464905,
+      "learning_rate": 0.0004528622304534225,
+      "loss": 4.8756,
+      "step": 1095
+    },
+    {
+      "epoch": 0.350516700313818,
+      "grad_norm": 0.33361902832984924,
+      "learning_rate": 0.00045259468415241117,
+      "loss": 4.7779,
+      "step": 1096
+    },
+    {
+      "epoch": 0.3508365148214036,
+      "grad_norm": 0.38126787543296814,
+      "learning_rate": 0.0004523269740261957,
+      "loss": 4.8836,
+      "step": 1097
+    },
+    {
+      "epoch": 0.35115632932898916,
+      "grad_norm": 0.34420838952064514,
+      "learning_rate": 0.0004520591003621892,
+      "loss": 4.793,
+      "step": 1098
+    },
+    {
+      "epoch": 0.3514761438365748,
+      "grad_norm": 0.3590407371520996,
+      "learning_rate": 0.00045179106344798005,
+      "loss": 4.863,
+      "step": 1099
+    },
+    {
+      "epoch": 0.3517959583441604,
+      "grad_norm": 0.3648085594177246,
+      "learning_rate": 0.00045152286357133157,
+      "loss": 4.8578,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3517959583441604,
+      "eval_loss": 4.842448711395264,
+      "eval_runtime": 78.0099,
+      "eval_samples_per_second": 24.317,
+      "eval_steps_per_second": 6.089,
+      "step": 1100
+    },
+    {
+      "epoch": 0.352115772851746,
+      "grad_norm": 0.47429999709129333,
+      "learning_rate": 0.0004512545010201828,
+      "loss": 4.77,
+      "step": 1101
+    },
+    {
+      "epoch": 0.3524355873593316,
+      "grad_norm": 0.36832159757614136,
+      "learning_rate": 0.0004509859760826466,
+      "loss": 4.756,
+      "step": 1102
+    },
+    {
+      "epoch": 0.3527554018669172,
+      "grad_norm": 0.34895962476730347,
+      "learning_rate": 0.0004507172890470108,
+      "loss": 4.8125,
+      "step": 1103
+    },
+    {
+      "epoch": 0.3530752163745028,
+      "grad_norm": 0.34782490134239197,
+      "learning_rate": 0.000450448440201737,
+      "loss": 4.7803,
+      "step": 1104
+    },
+    {
+      "epoch": 0.3533950308820884,
+      "grad_norm": 0.36187058687210083,
+      "learning_rate": 0.0004501794298354603,
+      "loss": 4.8202,
+      "step": 1105
+    },
+    {
+      "epoch": 0.353714845389674,
+      "grad_norm": 0.3739657998085022,
+      "learning_rate": 0.0004499102582369897,
+      "loss": 4.7097,
+      "step": 1106
+    },
+    {
+      "epoch": 0.3540346598972596,
+      "grad_norm": 0.36065033078193665,
+      "learning_rate": 0.0004496409256953069,
+      "loss": 4.9222,
+      "step": 1107
+    },
+    {
+      "epoch": 0.3543544744048452,
+      "grad_norm": 0.3667559027671814,
+      "learning_rate": 0.0004493714324995666,
+      "loss": 4.894,
+      "step": 1108
+    },
+    {
+      "epoch": 0.3546742889124308,
+      "grad_norm": 0.390155166387558,
+      "learning_rate": 0.00044910177893909577,
+      "loss": 4.9217,
+      "step": 1109
+    },
+    {
+      "epoch": 0.3549941034200164,
+      "grad_norm": 0.3407348394393921,
+      "learning_rate": 0.00044883196530339376,
+      "loss": 4.8408,
+      "step": 1110
+    },
+    {
+      "epoch": 0.355313917927602,
+      "grad_norm": 0.3738819360733032,
+      "learning_rate": 0.0004485619918821318,
+      "loss": 4.8303,
+      "step": 1111
+    },
+    {
+      "epoch": 0.3556337324351876,
+      "grad_norm": 0.3555871248245239,
+      "learning_rate": 0.00044829185896515245,
+      "loss": 4.933,
+      "step": 1112
+    },
+    {
+      "epoch": 0.3559535469427732,
+      "grad_norm": 0.3620026409626007,
+      "learning_rate": 0.0004480215668424696,
+      "loss": 4.8471,
+      "step": 1113
+    },
+    {
+      "epoch": 0.3562733614503588,
+      "grad_norm": 0.34683966636657715,
+      "learning_rate": 0.00044775111580426817,
+      "loss": 4.832,
+      "step": 1114
+    },
+    {
+      "epoch": 0.3565931759579444,
+      "grad_norm": 0.3447222411632538,
+      "learning_rate": 0.00044748050614090343,
+      "loss": 4.8587,
+      "step": 1115
+    },
+    {
+      "epoch": 0.35691299046553,
+      "grad_norm": 0.35056382417678833,
+      "learning_rate": 0.00044720973814290125,
+      "loss": 4.8076,
+      "step": 1116
+    },
+    {
+      "epoch": 0.3572328049731156,
+      "grad_norm": 0.3493582308292389,
+      "learning_rate": 0.0004469388121009574,
+      "loss": 4.8278,
+      "step": 1117
+    },
+    {
+      "epoch": 0.3575526194807012,
+      "grad_norm": 0.36693739891052246,
+      "learning_rate": 0.00044666772830593714,
+      "loss": 4.8642,
+      "step": 1118
+    },
+    {
+      "epoch": 0.3578724339882868,
+      "grad_norm": 0.34915661811828613,
+      "learning_rate": 0.00044639648704887535,
+      "loss": 4.8292,
+      "step": 1119
+    },
+    {
+      "epoch": 0.3581922484958724,
+      "grad_norm": 0.3442689776420593,
+      "learning_rate": 0.00044612508862097575,
+      "loss": 4.8267,
+      "step": 1120
+    },
+    {
+      "epoch": 0.358512063003458,
+      "grad_norm": 0.37427109479904175,
+      "learning_rate": 0.00044585353331361095,
+      "loss": 4.8233,
+      "step": 1121
+    },
+    {
+      "epoch": 0.3588318775110436,
+      "grad_norm": 0.3511858582496643,
+      "learning_rate": 0.000445581821418322,
+      "loss": 4.8552,
+      "step": 1122
+    },
+    {
+      "epoch": 0.3591516920186292,
+      "grad_norm": 0.36977121233940125,
+      "learning_rate": 0.0004453099532268178,
+      "loss": 4.8884,
+      "step": 1123
+    },
+    {
+      "epoch": 0.3594715065262148,
+      "grad_norm": 0.35378748178482056,
+      "learning_rate": 0.0004450379290309755,
+      "loss": 4.699,
+      "step": 1124
+    },
+    {
+      "epoch": 0.3597913210338004,
+      "grad_norm": 0.36219707131385803,
+      "learning_rate": 0.0004447657491228392,
+      "loss": 4.7922,
+      "step": 1125
+    },
+    {
+      "epoch": 0.360111135541386,
+      "grad_norm": 0.36497360467910767,
+      "learning_rate": 0.0004444934137946207,
+      "loss": 4.8067,
+      "step": 1126
+    },
+    {
+      "epoch": 0.3604309500489716,
+      "grad_norm": 0.35565951466560364,
+      "learning_rate": 0.00044422092333869814,
+      "loss": 4.8199,
+      "step": 1127
+    },
+    {
+      "epoch": 0.3607507645565572,
+      "grad_norm": 0.3564634621143341,
+      "learning_rate": 0.00044394827804761667,
+      "loss": 4.8902,
+      "step": 1128
+    },
+    {
+      "epoch": 0.3610705790641428,
+      "grad_norm": 0.3579818308353424,
+      "learning_rate": 0.0004436754782140875,
+      "loss": 4.7855,
+      "step": 1129
+    },
+    {
+      "epoch": 0.3613903935717284,
+      "grad_norm": 0.3743307888507843,
+      "learning_rate": 0.0004434025241309876,
+      "loss": 4.8971,
+      "step": 1130
+    },
+    {
+      "epoch": 0.361710208079314,
+      "grad_norm": 0.3698958158493042,
+      "learning_rate": 0.0004431294160913597,
+      "loss": 4.7787,
+      "step": 1131
+    },
+    {
+      "epoch": 0.3620300225868996,
+      "grad_norm": 0.37127307057380676,
+      "learning_rate": 0.0004428561543884118,
+      "loss": 4.6915,
+      "step": 1132
+    },
+    {
+      "epoch": 0.3623498370944852,
+      "grad_norm": 0.36235642433166504,
+      "learning_rate": 0.0004425827393155169,
+      "loss": 4.8175,
+      "step": 1133
+    },
+    {
+      "epoch": 0.3626696516020708,
+      "grad_norm": 0.3819771707057953,
+      "learning_rate": 0.00044230917116621266,
+      "loss": 4.7539,
+      "step": 1134
+    },
+    {
+      "epoch": 0.3629894661096564,
+      "grad_norm": 0.35926157236099243,
+      "learning_rate": 0.00044203545023420085,
+      "loss": 4.9012,
+      "step": 1135
+    },
+    {
+      "epoch": 0.363309280617242,
+      "grad_norm": 0.37980303168296814,
+      "learning_rate": 0.00044176157681334767,
+      "loss": 4.7817,
+      "step": 1136
+    },
+    {
+      "epoch": 0.3636290951248276,
+      "grad_norm": 0.35066235065460205,
+      "learning_rate": 0.0004414875511976827,
+      "loss": 4.7868,
+      "step": 1137
+    },
+    {
+      "epoch": 0.3639489096324132,
+      "grad_norm": 0.36465728282928467,
+      "learning_rate": 0.00044121337368139906,
+      "loss": 4.8841,
+      "step": 1138
+    },
+    {
+      "epoch": 0.3642687241399988,
+      "grad_norm": 0.3646154999732971,
+      "learning_rate": 0.0004409390445588528,
+      "loss": 4.8033,
+      "step": 1139
+    },
+    {
+      "epoch": 0.3645885386475844,
+      "grad_norm": 0.3409954309463501,
+      "learning_rate": 0.0004406645641245631,
+      "loss": 4.8258,
+      "step": 1140
+    },
+    {
+      "epoch": 0.36490835315517,
+      "grad_norm": 0.35481762886047363,
+      "learning_rate": 0.0004403899326732112,
+      "loss": 4.7841,
+      "step": 1141
+    },
+    {
+      "epoch": 0.3652281676627556,
+      "grad_norm": 0.35458990931510925,
+      "learning_rate": 0.00044011515049964073,
+      "loss": 4.8184,
+      "step": 1142
+    },
+    {
+      "epoch": 0.3655479821703412,
+      "grad_norm": 0.4338507354259491,
+      "learning_rate": 0.0004398402178988568,
+      "loss": 4.8885,
+      "step": 1143
+    },
+    {
+      "epoch": 0.3658677966779268,
+      "grad_norm": 0.36764827370643616,
+      "learning_rate": 0.00043956513516602653,
+      "loss": 4.8525,
+      "step": 1144
+    },
+    {
+      "epoch": 0.3661876111855124,
+      "grad_norm": 0.3740900158882141,
+      "learning_rate": 0.00043928990259647764,
+      "loss": 4.7446,
+      "step": 1145
+    },
+    {
+      "epoch": 0.366507425693098,
+      "grad_norm": 0.3610120117664337,
+      "learning_rate": 0.00043901452048569913,
+      "loss": 4.7707,
+      "step": 1146
+    },
+    {
+      "epoch": 0.3668272402006836,
+      "grad_norm": 0.37914660573005676,
+      "learning_rate": 0.00043873898912934054,
+      "loss": 4.9002,
+      "step": 1147
+    },
+    {
+      "epoch": 0.3671470547082692,
+      "grad_norm": 0.37821099162101746,
+      "learning_rate": 0.00043846330882321146,
+      "loss": 4.8313,
+      "step": 1148
+    },
+    {
+      "epoch": 0.3674668692158548,
+      "grad_norm": 0.3644234240055084,
+      "learning_rate": 0.00043818747986328136,
+      "loss": 4.7384,
+      "step": 1149
+    },
+    {
+      "epoch": 0.3677866837234404,
+      "grad_norm": 0.3613603413105011,
+      "learning_rate": 0.0004379115025456795,
+      "loss": 4.8134,
+      "step": 1150
+    },
+    {
+      "epoch": 0.368106498231026,
+      "grad_norm": 0.3736517131328583,
+      "learning_rate": 0.0004376353771666942,
+      "loss": 4.8469,
+      "step": 1151
+    },
+    {
+      "epoch": 0.3684263127386116,
+      "grad_norm": 0.37014541029930115,
+      "learning_rate": 0.000437359104022773,
+      "loss": 4.8588,
+      "step": 1152
+    },
+    {
+      "epoch": 0.3687461272461972,
+      "grad_norm": 0.3674084544181824,
+      "learning_rate": 0.00043708268341052185,
+      "loss": 4.771,
+      "step": 1153
+    },
+    {
+      "epoch": 0.3690659417537828,
+      "grad_norm": 0.35765981674194336,
+      "learning_rate": 0.00043680611562670513,
+      "loss": 4.7784,
+      "step": 1154
+    },
+    {
+      "epoch": 0.3693857562613684,
+      "grad_norm": 0.3745180368423462,
+      "learning_rate": 0.00043652940096824516,
+      "loss": 4.8395,
+      "step": 1155
+    },
+    {
+      "epoch": 0.369705570768954,
+      "grad_norm": 0.3918968439102173,
+      "learning_rate": 0.00043625253973222206,
+      "loss": 4.783,
+      "step": 1156
+    },
+    {
+      "epoch": 0.3700253852765396,
+      "grad_norm": 0.34982675313949585,
+      "learning_rate": 0.00043597553221587316,
+      "loss": 4.8801,
+      "step": 1157
+    },
+    {
+      "epoch": 0.3703451997841252,
+      "grad_norm": 0.3849962055683136,
+      "learning_rate": 0.00043569837871659296,
+      "loss": 4.7992,
+      "step": 1158
+    },
+    {
+      "epoch": 0.3706650142917108,
+      "grad_norm": 0.35203996300697327,
+      "learning_rate": 0.0004354210795319327,
+      "loss": 4.87,
+      "step": 1159
+    },
+    {
+      "epoch": 0.3709848287992964,
+      "grad_norm": 0.3672322928905487,
+      "learning_rate": 0.00043514363495959985,
+      "loss": 4.7667,
+      "step": 1160
+    },
+    {
+      "epoch": 0.371304643306882,
+      "grad_norm": 0.35468590259552,
+      "learning_rate": 0.0004348660452974581,
+      "loss": 4.7612,
+      "step": 1161
+    },
+    {
+      "epoch": 0.3716244578144676,
+      "grad_norm": 0.3686932325363159,
+      "learning_rate": 0.00043458831084352705,
+      "loss": 4.8025,
+      "step": 1162
+    },
+    {
+      "epoch": 0.3719442723220532,
+      "grad_norm": 0.3381997346878052,
+      "learning_rate": 0.00043431043189598125,
+      "loss": 4.8255,
+      "step": 1163
+    },
+    {
+      "epoch": 0.3722640868296388,
+      "grad_norm": 0.3542218506336212,
+      "learning_rate": 0.0004340324087531511,
+      "loss": 4.8247,
+      "step": 1164
+    },
+    {
+      "epoch": 0.3725839013372244,
+      "grad_norm": 0.3275507390499115,
+      "learning_rate": 0.00043375424171352133,
+      "loss": 4.7272,
+      "step": 1165
+    },
+    {
+      "epoch": 0.37290371584481,
+      "grad_norm": 0.3618198037147522,
+      "learning_rate": 0.00043347593107573106,
+      "loss": 4.8289,
+      "step": 1166
+    },
+    {
+      "epoch": 0.37322353035239564,
+      "grad_norm": 0.3532826006412506,
+      "learning_rate": 0.000433197477138574,
+      "loss": 4.8198,
+      "step": 1167
+    },
+    {
+      "epoch": 0.3735433448599812,
+      "grad_norm": 0.35923513770103455,
+      "learning_rate": 0.00043291888020099723,
+      "loss": 4.7377,
+      "step": 1168
+    },
+    {
+      "epoch": 0.3738631593675668,
+      "grad_norm": 0.34886351227760315,
+      "learning_rate": 0.0004326401405621019,
+      "loss": 4.8268,
+      "step": 1169
+    },
+    {
+      "epoch": 0.3741829738751524,
+      "grad_norm": 0.3776870369911194,
+      "learning_rate": 0.0004323612585211419,
+      "loss": 4.8158,
+      "step": 1170
+    },
+    {
+      "epoch": 0.37450278838273804,
+      "grad_norm": 0.35528770089149475,
+      "learning_rate": 0.0004320822343775242,
+      "loss": 4.9009,
+      "step": 1171
+    },
+    {
+      "epoch": 0.3748226028903236,
+      "grad_norm": 0.38312670588493347,
+      "learning_rate": 0.00043180306843080836,
+      "loss": 4.8545,
+      "step": 1172
+    },
+    {
+      "epoch": 0.3751424173979092,
+      "grad_norm": 0.40025246143341064,
+      "learning_rate": 0.0004315237609807059,
+      "loss": 4.881,
+      "step": 1173
+    },
+    {
+      "epoch": 0.3754622319054948,
+      "grad_norm": 0.35279229283332825,
+      "learning_rate": 0.00043124431232708076,
+      "loss": 4.7822,
+      "step": 1174
+    },
+    {
+      "epoch": 0.37578204641308044,
+      "grad_norm": 0.3803034722805023,
+      "learning_rate": 0.000430964722769948,
+      "loss": 4.7747,
+      "step": 1175
+    },
+    {
+      "epoch": 0.376101860920666,
+      "grad_norm": 0.3923249840736389,
+      "learning_rate": 0.0004306849926094742,
+      "loss": 4.7027,
+      "step": 1176
+    },
+    {
+      "epoch": 0.3764216754282516,
+      "grad_norm": 0.3511454164981842,
+      "learning_rate": 0.00043040512214597684,
+      "loss": 4.7863,
+      "step": 1177
+    },
+    {
+      "epoch": 0.3767414899358372,
+      "grad_norm": 0.34750327467918396,
+      "learning_rate": 0.00043012511167992405,
+      "loss": 4.7971,
+      "step": 1178
+    },
+    {
+      "epoch": 0.37706130444342284,
+      "grad_norm": 0.3603704273700714,
+      "learning_rate": 0.0004298449615119343,
+      "loss": 4.7601,
+      "step": 1179
+    },
+    {
+      "epoch": 0.3773811189510084,
+      "grad_norm": 0.3695945143699646,
+      "learning_rate": 0.0004295646719427758,
+      "loss": 4.8054,
+      "step": 1180
+    },
+    {
+      "epoch": 0.377700933458594,
+      "grad_norm": 0.3885416090488434,
+      "learning_rate": 0.00042928424327336667,
+      "loss": 4.7725,
+      "step": 1181
+    },
+    {
+      "epoch": 0.3780207479661796,
+      "grad_norm": 0.339851051568985,
+      "learning_rate": 0.00042900367580477446,
+      "loss": 4.7132,
+      "step": 1182
+    },
+    {
+      "epoch": 0.37834056247376524,
+      "grad_norm": 0.35797902941703796,
+      "learning_rate": 0.0004287229698382154,
+      "loss": 4.7972,
+      "step": 1183
+    },
+    {
+      "epoch": 0.3786603769813508,
+      "grad_norm": 0.3440041244029999,
+      "learning_rate": 0.0004284421256750547,
+      "loss": 4.8355,
+      "step": 1184
+    },
+    {
+      "epoch": 0.3789801914889364,
+      "grad_norm": 0.3346846401691437,
+      "learning_rate": 0.0004281611436168059,
+      "loss": 4.7913,
+      "step": 1185
+    },
+    {
+      "epoch": 0.379300005996522,
+      "grad_norm": 0.47180992364883423,
+      "learning_rate": 0.00042788002396513023,
+      "loss": 4.7696,
+      "step": 1186
+    },
+    {
+      "epoch": 0.37961982050410764,
+      "grad_norm": 0.34173375368118286,
+      "learning_rate": 0.00042759876702183706,
+      "loss": 4.7845,
+      "step": 1187
+    },
+    {
+      "epoch": 0.3799396350116932,
+      "grad_norm": 0.3632044494152069,
+      "learning_rate": 0.0004273173730888831,
+      "loss": 4.7338,
+      "step": 1188
+    },
+    {
+      "epoch": 0.3802594495192788,
+      "grad_norm": 0.3603549003601074,
+      "learning_rate": 0.00042703584246837206,
+      "loss": 4.8058,
+      "step": 1189
+    },
+    {
+      "epoch": 0.38057926402686443,
+      "grad_norm": 0.397983193397522,
+      "learning_rate": 0.0004267541754625543,
+      "loss": 4.8072,
+      "step": 1190
+    },
+    {
+      "epoch": 0.38089907853445004,
+      "grad_norm": 0.34411120414733887,
+      "learning_rate": 0.00042647237237382666,
+      "loss": 4.7369,
+      "step": 1191
+    },
+    {
+      "epoch": 0.3812188930420356,
+      "grad_norm": 0.35892680287361145,
+      "learning_rate": 0.00042619043350473223,
+      "loss": 4.8927,
+      "step": 1192
+    },
+    {
+      "epoch": 0.3815387075496212,
+      "grad_norm": 0.35901933908462524,
+      "learning_rate": 0.0004259083591579596,
+      "loss": 4.7778,
+      "step": 1193
+    },
+    {
+      "epoch": 0.38185852205720683,
+      "grad_norm": 0.3576551079750061,
+      "learning_rate": 0.000425626149636343,
+      "loss": 4.7477,
+      "step": 1194
+    },
+    {
+      "epoch": 0.38217833656479244,
+      "grad_norm": 0.3562488257884979,
+      "learning_rate": 0.0004253438052428619,
+      "loss": 4.7469,
+      "step": 1195
+    },
+    {
+      "epoch": 0.382498151072378,
+      "grad_norm": 0.3653848171234131,
+      "learning_rate": 0.00042506132628064016,
+      "loss": 4.8627,
+      "step": 1196
+    },
+    {
+      "epoch": 0.3828179655799636,
+      "grad_norm": 0.36694326996803284,
+      "learning_rate": 0.00042477871305294655,
+      "loss": 4.8451,
+      "step": 1197
+    },
+    {
+      "epoch": 0.38313778008754923,
+      "grad_norm": 0.3641037940979004,
+      "learning_rate": 0.0004244959658631938,
+      "loss": 4.7344,
+      "step": 1198
+    },
+    {
+      "epoch": 0.38345759459513484,
+      "grad_norm": 0.368745893239975,
+      "learning_rate": 0.00042421308501493823,
+      "loss": 4.6974,
+      "step": 1199
+    },
+    {
+      "epoch": 0.3837774091027204,
+      "grad_norm": 0.35546058416366577,
+      "learning_rate": 0.0004239300708118802,
+      "loss": 4.7683,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3837774091027204,
+      "eval_loss": 4.789093494415283,
+      "eval_runtime": 79.2193,
+      "eval_samples_per_second": 23.946,
+      "eval_steps_per_second": 5.996,
+      "step": 1200
+    },
+    {
+      "epoch": 0.384097223610306,
+      "grad_norm": 0.3464343547821045,
+      "learning_rate": 0.0004236469235578627,
+      "loss": 4.7838,
+      "step": 1201
+    },
+    {
+      "epoch": 0.38441703811789163,
+      "grad_norm": 0.36398279666900635,
+      "learning_rate": 0.0004233636435568719,
+      "loss": 4.8208,
+      "step": 1202
+    },
+    {
+      "epoch": 0.38473685262547724,
+      "grad_norm": 0.359862357378006,
+      "learning_rate": 0.00042308023111303636,
+      "loss": 4.7881,
+      "step": 1203
+    },
+    {
+      "epoch": 0.3850566671330628,
+      "grad_norm": 0.4442932903766632,
+      "learning_rate": 0.00042279668653062686,
+      "loss": 4.8383,
+      "step": 1204
+    },
+    {
+      "epoch": 0.3853764816406484,
+      "grad_norm": 0.3936426043510437,
+      "learning_rate": 0.0004225130101140559,
+      "loss": 4.7622,
+      "step": 1205
+    },
+    {
+      "epoch": 0.38569629614823403,
+      "grad_norm": 0.3605417013168335,
+      "learning_rate": 0.00042222920216787786,
+      "loss": 4.7688,
+      "step": 1206
+    },
+    {
+      "epoch": 0.38601611065581964,
+      "grad_norm": 0.3449859917163849,
+      "learning_rate": 0.000421945262996788,
+      "loss": 4.8179,
+      "step": 1207
+    },
+    {
+      "epoch": 0.3863359251634052,
+      "grad_norm": 0.36619365215301514,
+      "learning_rate": 0.0004216611929056225,
+      "loss": 4.7845,
+      "step": 1208
+    },
+    {
+      "epoch": 0.3866557396709908,
+      "grad_norm": 0.3378742039203644,
+      "learning_rate": 0.0004213769921993583,
+      "loss": 4.8207,
+      "step": 1209
+    },
+    {
+      "epoch": 0.38697555417857643,
+      "grad_norm": 0.375707745552063,
+      "learning_rate": 0.0004210926611831124,
+      "loss": 4.6434,
+      "step": 1210
+    },
+    {
+      "epoch": 0.38729536868616204,
+      "grad_norm": 0.3499867916107178,
+      "learning_rate": 0.0004208082001621417,
+      "loss": 4.7915,
+      "step": 1211
+    },
+    {
+      "epoch": 0.3876151831937476,
+      "grad_norm": 0.36418724060058594,
+      "learning_rate": 0.0004205236094418428,
+      "loss": 4.8163,
+      "step": 1212
+    },
+    {
+      "epoch": 0.3879349977013332,
+      "grad_norm": 0.3566012978553772,
+      "learning_rate": 0.0004202388893277515,
+      "loss": 4.7468,
+      "step": 1213
+    },
+    {
+      "epoch": 0.38825481220891883,
+      "grad_norm": 0.3427860140800476,
+      "learning_rate": 0.00041995404012554226,
+      "loss": 4.7259,
+      "step": 1214
+    },
+    {
+      "epoch": 0.38857462671650445,
+      "grad_norm": 0.36360323429107666,
+      "learning_rate": 0.0004196690621410285,
+      "loss": 4.7334,
+      "step": 1215
+    },
+    {
+      "epoch": 0.38889444122409,
+      "grad_norm": 0.37315964698791504,
+      "learning_rate": 0.0004193839556801617,
+      "loss": 4.7683,
+      "step": 1216
+    },
+    {
+      "epoch": 0.3892142557316756,
+      "grad_norm": 0.348958820104599,
+      "learning_rate": 0.0004190987210490314,
+      "loss": 4.8029,
+      "step": 1217
+    },
+    {
+      "epoch": 0.38953407023926123,
+      "grad_norm": 0.3609643578529358,
+      "learning_rate": 0.00041881335855386463,
+      "loss": 4.7958,
+      "step": 1218
+    },
+    {
+      "epoch": 0.38985388474684685,
+      "grad_norm": 0.36503320932388306,
+      "learning_rate": 0.00041852786850102557,
+      "loss": 4.7606,
+      "step": 1219
+    },
+    {
+      "epoch": 0.3901736992544324,
+      "grad_norm": 0.3636496961116791,
+      "learning_rate": 0.00041824225119701576,
+      "loss": 4.8224,
+      "step": 1220
+    },
+    {
+      "epoch": 0.390493513762018,
+      "grad_norm": 0.7249408960342407,
+      "learning_rate": 0.0004179565069484729,
+      "loss": 4.7619,
+      "step": 1221
+    },
+    {
+      "epoch": 0.39081332826960363,
+      "grad_norm": 0.35281887650489807,
+      "learning_rate": 0.0004176706360621713,
+      "loss": 4.7195,
+      "step": 1222
+    },
+    {
+      "epoch": 0.39113314277718925,
+      "grad_norm": 0.37250128388404846,
+      "learning_rate": 0.0004173846388450209,
+      "loss": 4.7523,
+      "step": 1223
+    },
+    {
+      "epoch": 0.3914529572847748,
+      "grad_norm": 0.3570784628391266,
+      "learning_rate": 0.0004170985156040677,
+      "loss": 4.7547,
+      "step": 1224
+    },
+    {
+      "epoch": 0.3917727717923604,
+      "grad_norm": 0.4446999132633209,
+      "learning_rate": 0.0004168122666464927,
+      "loss": 4.7975,
+      "step": 1225
+    },
+    {
+      "epoch": 0.39209258629994603,
+      "grad_norm": 0.37899062037467957,
+      "learning_rate": 0.0004165258922796119,
+      "loss": 4.818,
+      "step": 1226
+    },
+    {
+      "epoch": 0.39241240080753165,
+      "grad_norm": 0.3786870837211609,
+      "learning_rate": 0.00041623939281087605,
+      "loss": 4.757,
+      "step": 1227
+    },
+    {
+      "epoch": 0.3927322153151172,
+      "grad_norm": 0.4067842662334442,
+      "learning_rate": 0.00041595276854787007,
+      "loss": 4.6869,
+      "step": 1228
+    },
+    {
+      "epoch": 0.3930520298227028,
+      "grad_norm": 0.36138680577278137,
+      "learning_rate": 0.00041566601979831287,
+      "loss": 4.7871,
+      "step": 1229
+    },
+    {
+      "epoch": 0.39337184433028843,
+      "grad_norm": 0.3776302933692932,
+      "learning_rate": 0.00041537914687005714,
+      "loss": 4.818,
+      "step": 1230
+    },
+    {
+      "epoch": 0.39369165883787405,
+      "grad_norm": 0.3752312660217285,
+      "learning_rate": 0.00041509215007108885,
+      "loss": 4.8022,
+      "step": 1231
+    },
+    {
+      "epoch": 0.3940114733454596,
+      "grad_norm": 0.3481239974498749,
+      "learning_rate": 0.0004148050297095269,
+      "loss": 4.7868,
+      "step": 1232
+    },
+    {
+      "epoch": 0.3943312878530452,
+      "grad_norm": 0.3753473460674286,
+      "learning_rate": 0.00041451778609362286,
+      "loss": 4.7985,
+      "step": 1233
+    },
+    {
+      "epoch": 0.39465110236063083,
+      "grad_norm": 0.38862380385398865,
+      "learning_rate": 0.0004142304195317605,
+      "loss": 4.7357,
+      "step": 1234
+    },
+    {
+      "epoch": 0.39497091686821645,
+      "grad_norm": 0.36923497915267944,
+      "learning_rate": 0.00041394293033245597,
+      "loss": 4.7071,
+      "step": 1235
+    },
+    {
+      "epoch": 0.39529073137580206,
+      "grad_norm": 0.3899265229701996,
+      "learning_rate": 0.00041365531880435647,
+      "loss": 4.7769,
+      "step": 1236
+    },
+    {
+      "epoch": 0.3956105458833876,
+      "grad_norm": 0.3533836603164673,
+      "learning_rate": 0.0004133675852562413,
+      "loss": 4.8711,
+      "step": 1237
+    },
+    {
+      "epoch": 0.39593036039097323,
+      "grad_norm": 0.3839608430862427,
+      "learning_rate": 0.00041307972999702014,
+      "loss": 4.7194,
+      "step": 1238
+    },
+    {
+      "epoch": 0.39625017489855885,
+      "grad_norm": 0.4235544800758362,
+      "learning_rate": 0.00041279175333573345,
+      "loss": 4.8387,
+      "step": 1239
+    },
+    {
+      "epoch": 0.39656998940614446,
+      "grad_norm": 0.3789507746696472,
+      "learning_rate": 0.00041250365558155236,
+      "loss": 4.8791,
+      "step": 1240
+    },
+    {
+      "epoch": 0.39688980391373,
+      "grad_norm": 0.4530871510505676,
+      "learning_rate": 0.0004122154370437776,
+      "loss": 4.799,
+      "step": 1241
+    },
+    {
+      "epoch": 0.39720961842131564,
+      "grad_norm": 0.38132578134536743,
+      "learning_rate": 0.0004119270980318398,
+      "loss": 4.79,
+      "step": 1242
+    },
+    {
+      "epoch": 0.39752943292890125,
+      "grad_norm": 0.38641002774238586,
+      "learning_rate": 0.0004116386388552988,
+      "loss": 4.8165,
+      "step": 1243
+    },
+    {
+      "epoch": 0.39784924743648686,
+      "grad_norm": 0.37316030263900757,
+      "learning_rate": 0.0004113500598238437,
+      "loss": 4.8027,
+      "step": 1244
+    },
+    {
+      "epoch": 0.3981690619440724,
+      "grad_norm": 0.34834080934524536,
+      "learning_rate": 0.000411061361247292,
+      "loss": 4.793,
+      "step": 1245
+    },
+    {
+      "epoch": 0.39848887645165804,
+      "grad_norm": 0.36298882961273193,
+      "learning_rate": 0.00041077254343558955,
+      "loss": 4.7775,
+      "step": 1246
+    },
+    {
+      "epoch": 0.39880869095924365,
+      "grad_norm": 0.378531277179718,
+      "learning_rate": 0.00041048360669881055,
+      "loss": 4.8365,
+      "step": 1247
+    },
+    {
+      "epoch": 0.39912850546682926,
+      "grad_norm": 0.3716171383857727,
+      "learning_rate": 0.0004101945513471563,
+      "loss": 4.7765,
+      "step": 1248
+    },
+    {
+      "epoch": 0.3994483199744148,
+      "grad_norm": 0.37697818875312805,
+      "learning_rate": 0.000409905377690956,
+      "loss": 4.7976,
+      "step": 1249
+    },
+    {
+      "epoch": 0.39976813448200044,
+      "grad_norm": 0.37140795588493347,
+      "learning_rate": 0.00040961608604066566,
+      "loss": 4.8138,
+      "step": 1250
+    },
+    {
+      "epoch": 0.40008794898958605,
+      "grad_norm": 0.3761359751224518,
+      "learning_rate": 0.0004093266767068677,
+      "loss": 4.7103,
+      "step": 1251
+    },
+    {
+      "epoch": 0.40040776349717166,
+      "grad_norm": 0.3735464811325073,
+      "learning_rate": 0.0004090371500002715,
+      "loss": 4.7285,
+      "step": 1252
+    },
+    {
+      "epoch": 0.4007275780047572,
+      "grad_norm": 0.36980125308036804,
+      "learning_rate": 0.00040874750623171176,
+      "loss": 4.6936,
+      "step": 1253
+    },
+    {
+      "epoch": 0.40104739251234284,
+      "grad_norm": 0.3474436402320862,
+      "learning_rate": 0.00040845774571214924,
+      "loss": 4.7765,
+      "step": 1254
+    },
+    {
+      "epoch": 0.40136720701992845,
+      "grad_norm": 0.3448978364467621,
+      "learning_rate": 0.0004081678687526701,
+      "loss": 4.7407,
+      "step": 1255
+    },
+    {
+      "epoch": 0.40168702152751407,
+      "grad_norm": 0.3554507791996002,
+      "learning_rate": 0.0004078778756644854,
+      "loss": 4.8064,
+      "step": 1256
+    },
+    {
+      "epoch": 0.4020068360350996,
+      "grad_norm": 0.36760538816452026,
+      "learning_rate": 0.00040758776675893065,
+      "loss": 4.7797,
+      "step": 1257
+    },
+    {
+      "epoch": 0.40232665054268524,
+      "grad_norm": 0.3476412892341614,
+      "learning_rate": 0.00040729754234746613,
+      "loss": 4.7432,
+      "step": 1258
+    },
+    {
+      "epoch": 0.40264646505027085,
+      "grad_norm": 0.3805728256702423,
+      "learning_rate": 0.0004070072027416758,
+      "loss": 4.7659,
+      "step": 1259
+    },
+    {
+      "epoch": 0.40296627955785647,
+      "grad_norm": 0.3726625442504883,
+      "learning_rate": 0.00040671674825326745,
+      "loss": 4.7388,
+      "step": 1260
+    },
+    {
+      "epoch": 0.403286094065442,
+      "grad_norm": 0.3568096160888672,
+      "learning_rate": 0.0004064261791940723,
+      "loss": 4.7267,
+      "step": 1261
+    },
+    {
+      "epoch": 0.40360590857302764,
+      "grad_norm": 0.4026634991168976,
+      "learning_rate": 0.0004061354958760441,
+      "loss": 4.8277,
+      "step": 1262
+    },
+    {
+      "epoch": 0.40392572308061325,
+      "grad_norm": 0.35849010944366455,
+      "learning_rate": 0.00040584469861126,
+      "loss": 4.7226,
+      "step": 1263
+    },
+    {
+      "epoch": 0.40424553758819887,
+      "grad_norm": 0.37187543511390686,
+      "learning_rate": 0.00040555378771191876,
+      "loss": 4.7215,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4045653520957844,
+      "grad_norm": 0.35207071900367737,
+      "learning_rate": 0.0004052627634903417,
+      "loss": 4.8124,
+      "step": 1265
+    },
+    {
+      "epoch": 0.40488516660337004,
+      "grad_norm": 0.3443794846534729,
+      "learning_rate": 0.00040497162625897134,
+      "loss": 4.7859,
+      "step": 1266
+    },
+    {
+      "epoch": 0.40520498111095565,
+      "grad_norm": 0.3849242329597473,
+      "learning_rate": 0.00040468037633037196,
+      "loss": 4.6921,
+      "step": 1267
+    },
+    {
+      "epoch": 0.40552479561854127,
+      "grad_norm": 0.3563462197780609,
+      "learning_rate": 0.0004043890140172286,
+      "loss": 4.708,
+      "step": 1268
+    },
+    {
+      "epoch": 0.4058446101261268,
+      "grad_norm": 0.3593246638774872,
+      "learning_rate": 0.00040409753963234675,
+      "loss": 4.7976,
+      "step": 1269
+    },
+    {
+      "epoch": 0.40616442463371244,
+      "grad_norm": 0.34887611865997314,
+      "learning_rate": 0.00040380595348865286,
+      "loss": 4.6867,
+      "step": 1270
+    },
+    {
+      "epoch": 0.40648423914129805,
+      "grad_norm": 0.36668267846107483,
+      "learning_rate": 0.00040351425589919257,
+      "loss": 4.8058,
+      "step": 1271
+    },
+    {
+      "epoch": 0.40680405364888367,
+      "grad_norm": 0.35184717178344727,
+      "learning_rate": 0.0004032224471771317,
+      "loss": 4.7682,
+      "step": 1272
+    },
+    {
+      "epoch": 0.4071238681564692,
+      "grad_norm": 0.34717682003974915,
+      "learning_rate": 0.00040293052763575537,
+      "loss": 4.7093,
+      "step": 1273
+    },
+    {
+      "epoch": 0.40744368266405484,
+      "grad_norm": 0.3581322729587555,
+      "learning_rate": 0.0004026384975884673,
+      "loss": 4.7513,
+      "step": 1274
+    },
+    {
+      "epoch": 0.40776349717164045,
+      "grad_norm": 0.3602757453918457,
+      "learning_rate": 0.00040234635734879036,
+      "loss": 4.8043,
+      "step": 1275
+    },
+    {
+      "epoch": 0.40808331167922607,
+      "grad_norm": 0.3304402530193329,
+      "learning_rate": 0.00040205410723036526,
+      "loss": 4.7174,
+      "step": 1276
+    },
+    {
+      "epoch": 0.4084031261868116,
+      "grad_norm": 0.3786298632621765,
+      "learning_rate": 0.0004017617475469508,
+      "loss": 4.7561,
+      "step": 1277
+    },
+    {
+      "epoch": 0.40872294069439724,
+      "grad_norm": 0.352568119764328,
+      "learning_rate": 0.00040146927861242366,
+      "loss": 4.6942,
+      "step": 1278
+    },
+    {
+      "epoch": 0.40904275520198285,
+      "grad_norm": 0.35060355067253113,
+      "learning_rate": 0.00040117670074077747,
+      "loss": 4.7754,
+      "step": 1279
+    },
+    {
+      "epoch": 0.40936256970956847,
+      "grad_norm": 0.3489457964897156,
+      "learning_rate": 0.00040088401424612317,
+      "loss": 4.7322,
+      "step": 1280
+    },
+    {
+      "epoch": 0.409682384217154,
+      "grad_norm": 0.3488384783267975,
+      "learning_rate": 0.000400591219442688,
+      "loss": 4.6923,
+      "step": 1281
+    },
+    {
+      "epoch": 0.41000219872473964,
+      "grad_norm": 0.3631167411804199,
+      "learning_rate": 0.0004002983166448155,
+      "loss": 4.7409,
+      "step": 1282
+    },
+    {
+      "epoch": 0.41032201323232526,
+      "grad_norm": 0.3414135277271271,
+      "learning_rate": 0.0004000053061669654,
+      "loss": 4.7438,
+      "step": 1283
+    },
+    {
+      "epoch": 0.41064182773991087,
+      "grad_norm": 0.3368418216705322,
+      "learning_rate": 0.00039971218832371284,
+      "loss": 4.7802,
+      "step": 1284
+    },
+    {
+      "epoch": 0.41096164224749643,
+      "grad_norm": 0.3384615182876587,
+      "learning_rate": 0.0003994189634297483,
+      "loss": 4.6527,
+      "step": 1285
+    },
+    {
+      "epoch": 0.41128145675508204,
+      "grad_norm": 0.40064677596092224,
+      "learning_rate": 0.00039912563179987713,
+      "loss": 4.8604,
+      "step": 1286
+    },
+    {
+      "epoch": 0.41160127126266766,
+      "grad_norm": 0.3540748357772827,
+      "learning_rate": 0.00039883219374901933,
+      "loss": 4.713,
+      "step": 1287
+    },
+    {
+      "epoch": 0.41192108577025327,
+      "grad_norm": 0.34424421191215515,
+      "learning_rate": 0.0003985386495922091,
+      "loss": 4.7114,
+      "step": 1288
+    },
+    {
+      "epoch": 0.41224090027783883,
+      "grad_norm": 0.3402925133705139,
+      "learning_rate": 0.00039824499964459455,
+      "loss": 4.7385,
+      "step": 1289
+    },
+    {
+      "epoch": 0.41256071478542444,
+      "grad_norm": 0.38066187500953674,
+      "learning_rate": 0.00039795124422143746,
+      "loss": 4.7384,
+      "step": 1290
+    },
+    {
+      "epoch": 0.41288052929301006,
+      "grad_norm": 0.34309983253479004,
+      "learning_rate": 0.0003976573836381128,
+      "loss": 4.7604,
+      "step": 1291
+    },
+    {
+      "epoch": 0.41320034380059567,
+      "grad_norm": 0.3329739570617676,
+      "learning_rate": 0.00039736341821010833,
+      "loss": 4.77,
+      "step": 1292
+    },
+    {
+      "epoch": 0.41352015830818123,
+      "grad_norm": 0.35351860523223877,
+      "learning_rate": 0.0003970693482530247,
+      "loss": 4.7382,
+      "step": 1293
+    },
+    {
+      "epoch": 0.41383997281576684,
+      "grad_norm": 0.33818474411964417,
+      "learning_rate": 0.00039677517408257424,
+      "loss": 4.7161,
+      "step": 1294
+    },
+    {
+      "epoch": 0.41415978732335246,
+      "grad_norm": 0.4118068218231201,
+      "learning_rate": 0.00039648089601458165,
+      "loss": 4.733,
+      "step": 1295
+    },
+    {
+      "epoch": 0.41447960183093807,
+      "grad_norm": 0.3382696509361267,
+      "learning_rate": 0.000396186514364983,
+      "loss": 4.8091,
+      "step": 1296
+    },
+    {
+      "epoch": 0.41479941633852363,
+      "grad_norm": 0.348286509513855,
+      "learning_rate": 0.0003958920294498255,
+      "loss": 4.7826,
+      "step": 1297
+    },
+    {
+      "epoch": 0.41511923084610924,
+      "grad_norm": 0.3513914942741394,
+      "learning_rate": 0.00039559744158526735,
+      "loss": 4.7369,
+      "step": 1298
+    },
+    {
+      "epoch": 0.41543904535369486,
+      "grad_norm": 0.3593021333217621,
+      "learning_rate": 0.0003953027510875772,
+      "loss": 4.7276,
+      "step": 1299
+    },
+    {
+      "epoch": 0.41575885986128047,
+      "grad_norm": 0.3372235596179962,
+      "learning_rate": 0.0003950079582731339,
+      "loss": 4.7845,
+      "step": 1300
+    },
+    {
+      "epoch": 0.41575885986128047,
+      "eval_loss": 4.742072105407715,
+      "eval_runtime": 83.4036,
+      "eval_samples_per_second": 22.745,
+      "eval_steps_per_second": 5.695,
+      "step": 1300
+    },
+    {
+      "epoch": 0.41607867436886603,
+      "grad_norm": 0.4168403744697571,
+      "learning_rate": 0.0003947130634584261,
+      "loss": 4.7408,
+      "step": 1301
+    },
+    {
+      "epoch": 0.41639848887645164,
+      "grad_norm": 0.337454617023468,
+      "learning_rate": 0.000394418066960052,
+      "loss": 4.7456,
+      "step": 1302
+    },
+    {
+      "epoch": 0.41671830338403726,
+      "grad_norm": 0.3640049695968628,
+      "learning_rate": 0.00039412296909471914,
+      "loss": 4.7155,
+      "step": 1303
+    },
+    {
+      "epoch": 0.4170381178916229,
+      "grad_norm": 0.35823458433151245,
+      "learning_rate": 0.00039382777017924354,
+      "loss": 4.7275,
+      "step": 1304
+    },
+    {
+      "epoch": 0.4173579323992085,
+      "grad_norm": 0.3543967008590698,
+      "learning_rate": 0.00039353247053054984,
+      "loss": 4.7342,
+      "step": 1305
+    },
+    {
+      "epoch": 0.41767774690679405,
+      "grad_norm": 0.37220895290374756,
+      "learning_rate": 0.0003932370704656711,
+      "loss": 4.7876,
+      "step": 1306
+    },
+    {
+      "epoch": 0.41799756141437966,
+      "grad_norm": 0.34149494767189026,
+      "learning_rate": 0.00039294157030174783,
+      "loss": 4.7389,
+      "step": 1307
+    },
+    {
+      "epoch": 0.4183173759219653,
+      "grad_norm": 0.33853477239608765,
+      "learning_rate": 0.00039264597035602807,
+      "loss": 4.6651,
+      "step": 1308
+    },
+    {
+      "epoch": 0.4186371904295509,
+      "grad_norm": 0.33498257398605347,
+      "learning_rate": 0.0003923502709458672,
+      "loss": 4.7405,
+      "step": 1309
+    },
+    {
+      "epoch": 0.41895700493713645,
+      "grad_norm": 0.3531733453273773,
+      "learning_rate": 0.00039205447238872706,
+      "loss": 4.7414,
+      "step": 1310
+    },
+    {
+      "epoch": 0.41927681944472206,
+      "grad_norm": 0.34058284759521484,
+      "learning_rate": 0.0003917585750021763,
+      "loss": 4.718,
+      "step": 1311
+    },
+    {
+      "epoch": 0.4195966339523077,
+      "grad_norm": 0.428093284368515,
+      "learning_rate": 0.0003914625791038893,
+      "loss": 4.7236,
+      "step": 1312
+    },
+    {
+      "epoch": 0.4199164484598933,
+      "grad_norm": 0.35845211148262024,
+      "learning_rate": 0.00039116648501164665,
+      "loss": 4.6984,
+      "step": 1313
+    },
+    {
+      "epoch": 0.42023626296747885,
+      "grad_norm": 0.35991171002388,
+      "learning_rate": 0.0003908702930433338,
+      "loss": 4.7357,
+      "step": 1314
+    },
+    {
+      "epoch": 0.42055607747506446,
+      "grad_norm": 0.33685556054115295,
+      "learning_rate": 0.0003905740035169417,
+      "loss": 4.7028,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4208758919826501,
+      "grad_norm": 0.3488212823867798,
+      "learning_rate": 0.00039027761675056595,
+      "loss": 4.6826,
+      "step": 1316
+    },
+    {
+      "epoch": 0.4211957064902357,
+      "grad_norm": 0.35829851031303406,
+      "learning_rate": 0.0003899811330624065,
+      "loss": 4.684,
+      "step": 1317
+    },
+    {
+      "epoch": 0.42151552099782125,
+      "grad_norm": 0.34577038884162903,
+      "learning_rate": 0.0003896845527707673,
+      "loss": 4.705,
+      "step": 1318
+    },
+    {
+      "epoch": 0.42183533550540686,
+      "grad_norm": 0.3443538248538971,
+      "learning_rate": 0.00038938787619405616,
+      "loss": 4.7583,
+      "step": 1319
+    },
+    {
+      "epoch": 0.4221551500129925,
+      "grad_norm": 0.35939159989356995,
+      "learning_rate": 0.00038909110365078413,
+      "loss": 4.6987,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4224749645205781,
+      "grad_norm": 0.3435399830341339,
+      "learning_rate": 0.00038879423545956534,
+      "loss": 4.6453,
+      "step": 1321
+    },
+    {
+      "epoch": 0.42279477902816365,
+      "grad_norm": 0.3486972749233246,
+      "learning_rate": 0.00038849727193911664,
+      "loss": 4.7149,
+      "step": 1322
+    },
+    {
+      "epoch": 0.42311459353574926,
+      "grad_norm": 0.38485008478164673,
+      "learning_rate": 0.0003882002134082571,
+      "loss": 4.7308,
+      "step": 1323
+    },
+    {
+      "epoch": 0.4234344080433349,
+      "grad_norm": 0.3632776737213135,
+      "learning_rate": 0.000387903060185908,
+      "loss": 4.6965,
+      "step": 1324
+    },
+    {
+      "epoch": 0.4237542225509205,
+      "grad_norm": 0.36035025119781494,
+      "learning_rate": 0.00038760581259109214,
+      "loss": 4.7418,
+      "step": 1325
+    },
+    {
+      "epoch": 0.42407403705850605,
+      "grad_norm": 0.35747748613357544,
+      "learning_rate": 0.0003873084709429336,
+      "loss": 4.7514,
+      "step": 1326
+    },
+    {
+      "epoch": 0.42439385156609166,
+      "grad_norm": 0.35926157236099243,
+      "learning_rate": 0.00038701103556065754,
+      "loss": 4.8184,
+      "step": 1327
+    },
+    {
+      "epoch": 0.4247136660736773,
+      "grad_norm": 0.3767782747745514,
+      "learning_rate": 0.0003867135067635898,
+      "loss": 4.7554,
+      "step": 1328
+    },
+    {
+      "epoch": 0.4250334805812629,
+      "grad_norm": 0.34841591119766235,
+      "learning_rate": 0.0003864158848711562,
+      "loss": 4.705,
+      "step": 1329
+    },
+    {
+      "epoch": 0.42535329508884845,
+      "grad_norm": 0.36564236879348755,
+      "learning_rate": 0.000386118170202883,
+      "loss": 4.71,
+      "step": 1330
+    },
+    {
+      "epoch": 0.42567310959643406,
+      "grad_norm": 0.3872424066066742,
+      "learning_rate": 0.00038582036307839557,
+      "loss": 4.7814,
+      "step": 1331
+    },
+    {
+      "epoch": 0.4259929241040197,
+      "grad_norm": 0.3488774299621582,
+      "learning_rate": 0.00038552246381741884,
+      "loss": 4.6798,
+      "step": 1332
+    },
+    {
+      "epoch": 0.4263127386116053,
+      "grad_norm": 0.347474068403244,
+      "learning_rate": 0.0003852244727397766,
+      "loss": 4.8074,
+      "step": 1333
+    },
+    {
+      "epoch": 0.42663255311919085,
+      "grad_norm": 0.3365819752216339,
+      "learning_rate": 0.00038492639016539116,
+      "loss": 4.7403,
+      "step": 1334
+    },
+    {
+      "epoch": 0.42695236762677646,
+      "grad_norm": 0.3785269856452942,
+      "learning_rate": 0.0003846282164142831,
+      "loss": 4.836,
+      "step": 1335
+    },
+    {
+      "epoch": 0.4272721821343621,
+      "grad_norm": 0.3417024612426758,
+      "learning_rate": 0.00038432995180657094,
+      "loss": 4.6824,
+      "step": 1336
+    },
+    {
+      "epoch": 0.4275919966419477,
+      "grad_norm": 0.3502133786678314,
+      "learning_rate": 0.00038403159666247063,
+      "loss": 4.6996,
+      "step": 1337
+    },
+    {
+      "epoch": 0.42791181114953325,
+      "grad_norm": 0.3363844156265259,
+      "learning_rate": 0.0003837331513022954,
+      "loss": 4.701,
+      "step": 1338
+    },
+    {
+      "epoch": 0.42823162565711886,
+      "grad_norm": 0.3410629630088806,
+      "learning_rate": 0.0003834346160464553,
+      "loss": 4.7362,
+      "step": 1339
+    },
+    {
+      "epoch": 0.4285514401647045,
+      "grad_norm": 0.3651580214500427,
+      "learning_rate": 0.0003831359912154569,
+      "loss": 4.7333,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4288712546722901,
+      "grad_norm": 0.38412901759147644,
+      "learning_rate": 0.0003828372771299029,
+      "loss": 4.6356,
+      "step": 1341
+    },
+    {
+      "epoch": 0.42919106917987565,
+      "grad_norm": 0.37616869807243347,
+      "learning_rate": 0.00038253847411049194,
+      "loss": 4.63,
+      "step": 1342
+    },
+    {
+      "epoch": 0.42951088368746126,
+      "grad_norm": 0.3305797576904297,
+      "learning_rate": 0.000382239582478018,
+      "loss": 4.783,
+      "step": 1343
+    },
+    {
+      "epoch": 0.4298306981950469,
+      "grad_norm": 0.3697926104068756,
+      "learning_rate": 0.00038194060255337026,
+      "loss": 4.8082,
+      "step": 1344
+    },
+    {
+      "epoch": 0.4301505127026325,
+      "grad_norm": 0.3631756007671356,
+      "learning_rate": 0.0003816415346575327,
+      "loss": 4.7397,
+      "step": 1345
+    },
+    {
+      "epoch": 0.43047032721021805,
+      "grad_norm": 0.35470134019851685,
+      "learning_rate": 0.0003813423791115838,
+      "loss": 4.6279,
+      "step": 1346
+    },
+    {
+      "epoch": 0.43079014171780367,
+      "grad_norm": 0.3372836410999298,
+      "learning_rate": 0.00038104313623669604,
+      "loss": 4.7687,
+      "step": 1347
+    },
+    {
+      "epoch": 0.4311099562253893,
+      "grad_norm": 0.3620103895664215,
+      "learning_rate": 0.0003807438063541356,
+      "loss": 4.7333,
+      "step": 1348
+    },
+    {
+      "epoch": 0.4314297707329749,
+      "grad_norm": 0.35666435956954956,
+      "learning_rate": 0.00038044438978526235,
+      "loss": 4.7729,
+      "step": 1349
+    },
+    {
+      "epoch": 0.43174958524056045,
+      "grad_norm": 0.3575831949710846,
+      "learning_rate": 0.0003801448868515287,
+      "loss": 4.7119,
+      "step": 1350
+    },
+    {
+      "epoch": 0.43206939974814607,
+      "grad_norm": 0.3621557354927063,
+      "learning_rate": 0.00037984529787448047,
+      "loss": 4.7006,
+      "step": 1351
+    },
+    {
+      "epoch": 0.4323892142557317,
+      "grad_norm": 0.360970675945282,
+      "learning_rate": 0.0003795456231757554,
+      "loss": 4.7223,
+      "step": 1352
+    },
+    {
+      "epoch": 0.4327090287633173,
+      "grad_norm": 0.33878061175346375,
+      "learning_rate": 0.0003792458630770833,
+      "loss": 4.7156,
+      "step": 1353
+    },
+    {
+      "epoch": 0.43302884327090285,
+      "grad_norm": 0.3453442454338074,
+      "learning_rate": 0.00037894601790028576,
+      "loss": 4.7931,
+      "step": 1354
+    },
+    {
+      "epoch": 0.43334865777848847,
+      "grad_norm": 0.34347814321517944,
+      "learning_rate": 0.0003786460879672756,
+      "loss": 4.7,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4336684722860741,
+      "grad_norm": 0.3537500500679016,
+      "learning_rate": 0.0003783460736000569,
+      "loss": 4.6134,
+      "step": 1356
+    },
+    {
+      "epoch": 0.4339882867936597,
+      "grad_norm": 0.422690212726593,
+      "learning_rate": 0.0003780459751207241,
+      "loss": 4.7167,
+      "step": 1357
+    },
+    {
+      "epoch": 0.43430810130124525,
+      "grad_norm": 0.3382203280925751,
+      "learning_rate": 0.0003777457928514619,
+      "loss": 4.7304,
+      "step": 1358
+    },
+    {
+      "epoch": 0.43462791580883087,
+      "grad_norm": 0.3374168574810028,
+      "learning_rate": 0.0003774455271145454,
+      "loss": 4.6354,
+      "step": 1359
+    },
+    {
+      "epoch": 0.4349477303164165,
+      "grad_norm": 0.34140485525131226,
+      "learning_rate": 0.0003771451782323388,
+      "loss": 4.6944,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4352675448240021,
+      "grad_norm": 0.35734114050865173,
+      "learning_rate": 0.0003768447465272959,
+      "loss": 4.667,
+      "step": 1361
+    },
+    {
+      "epoch": 0.43558735933158765,
+      "grad_norm": 0.3632422089576721,
+      "learning_rate": 0.0003765442323219591,
+      "loss": 4.6941,
+      "step": 1362
+    },
+    {
+      "epoch": 0.43590717383917327,
+      "grad_norm": 0.34204337000846863,
+      "learning_rate": 0.00037624363593895976,
+      "loss": 4.6961,
+      "step": 1363
+    },
+    {
+      "epoch": 0.4362269883467589,
+      "grad_norm": 0.37258926033973694,
+      "learning_rate": 0.00037594295770101716,
+      "loss": 4.728,
+      "step": 1364
+    },
+    {
+      "epoch": 0.4365468028543445,
+      "grad_norm": 0.34576255083084106,
+      "learning_rate": 0.0003756421979309387,
+      "loss": 4.7215,
+      "step": 1365
+    },
+    {
+      "epoch": 0.43686661736193005,
+      "grad_norm": 0.34704098105430603,
+      "learning_rate": 0.00037534135695161904,
+      "loss": 4.6805,
+      "step": 1366
+    },
+    {
+      "epoch": 0.43718643186951567,
+      "grad_norm": 0.3493821322917938,
+      "learning_rate": 0.0003750404350860402,
+      "loss": 4.7144,
+      "step": 1367
+    },
+    {
+      "epoch": 0.4375062463771013,
+      "grad_norm": 0.33115461468696594,
+      "learning_rate": 0.00037473943265727114,
+      "loss": 4.7186,
+      "step": 1368
+    },
+    {
+      "epoch": 0.4378260608846869,
+      "grad_norm": 0.34023842215538025,
+      "learning_rate": 0.000374438349988467,
+      "loss": 4.7335,
+      "step": 1369
+    },
+    {
+      "epoch": 0.43814587539227245,
+      "grad_norm": 0.3522568941116333,
+      "learning_rate": 0.00037413718740286935,
+      "loss": 4.6449,
+      "step": 1370
+    },
+    {
+      "epoch": 0.43846568989985807,
+      "grad_norm": 0.3428913652896881,
+      "learning_rate": 0.00037383594522380546,
+      "loss": 4.7108,
+      "step": 1371
+    },
+    {
+      "epoch": 0.4387855044074437,
+      "grad_norm": 0.3575150966644287,
+      "learning_rate": 0.00037353462377468806,
+      "loss": 4.761,
+      "step": 1372
+    },
+    {
+      "epoch": 0.4391053189150293,
+      "grad_norm": 0.34879520535469055,
+      "learning_rate": 0.0003732332233790149,
+      "loss": 4.6459,
+      "step": 1373
+    },
+    {
+      "epoch": 0.4394251334226149,
+      "grad_norm": 0.3344023525714874,
+      "learning_rate": 0.00037293174436036855,
+      "loss": 4.8296,
+      "step": 1374
+    },
+    {
+      "epoch": 0.43974494793020047,
+      "grad_norm": 0.49138399958610535,
+      "learning_rate": 0.000372630187042416,
+      "loss": 4.7823,
+      "step": 1375
+    },
+    {
+      "epoch": 0.4400647624377861,
+      "grad_norm": 0.3501664698123932,
+      "learning_rate": 0.0003723285517489084,
+      "loss": 4.7195,
+      "step": 1376
+    },
+    {
+      "epoch": 0.4403845769453717,
+      "grad_norm": 0.3450270891189575,
+      "learning_rate": 0.0003720268388036805,
+      "loss": 4.6751,
+      "step": 1377
+    },
+    {
+      "epoch": 0.4407043914529573,
+      "grad_norm": 0.3368213474750519,
+      "learning_rate": 0.0003717250485306503,
+      "loss": 4.7517,
+      "step": 1378
+    },
+    {
+      "epoch": 0.44102420596054287,
+      "grad_norm": 0.3620702624320984,
+      "learning_rate": 0.00037142318125381915,
+      "loss": 4.7359,
+      "step": 1379
+    },
+    {
+      "epoch": 0.4413440204681285,
+      "grad_norm": 0.3435147702693939,
+      "learning_rate": 0.0003711212372972706,
+      "loss": 4.6931,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4416638349757141,
+      "grad_norm": 0.39190882444381714,
+      "learning_rate": 0.000370819216985171,
+      "loss": 4.803,
+      "step": 1381
+    },
+    {
+      "epoch": 0.4419836494832997,
+      "grad_norm": 0.40365469455718994,
+      "learning_rate": 0.0003705171206417685,
+      "loss": 4.728,
+      "step": 1382
+    },
+    {
+      "epoch": 0.44230346399088527,
+      "grad_norm": 0.3486280143260956,
+      "learning_rate": 0.0003702149485913926,
+      "loss": 4.6316,
+      "step": 1383
+    },
+    {
+      "epoch": 0.4426232784984709,
+      "grad_norm": 0.3493972718715668,
+      "learning_rate": 0.0003699127011584546,
+      "loss": 4.746,
+      "step": 1384
+    },
+    {
+      "epoch": 0.4429430930060565,
+      "grad_norm": 0.33820289373397827,
+      "learning_rate": 0.0003696103786674463,
+      "loss": 4.7163,
+      "step": 1385
+    },
+    {
+      "epoch": 0.4432629075136421,
+      "grad_norm": 0.35788559913635254,
+      "learning_rate": 0.0003693079814429403,
+      "loss": 4.7594,
+      "step": 1386
+    },
+    {
+      "epoch": 0.44358272202122767,
+      "grad_norm": 0.4123647212982178,
+      "learning_rate": 0.00036900550980958934,
+      "loss": 4.6385,
+      "step": 1387
+    },
+    {
+      "epoch": 0.4439025365288133,
+      "grad_norm": 0.339077889919281,
+      "learning_rate": 0.000368702964092126,
+      "loss": 4.6516,
+      "step": 1388
+    },
+    {
+      "epoch": 0.4442223510363989,
+      "grad_norm": 0.39766621589660645,
+      "learning_rate": 0.0003684003446153627,
+      "loss": 4.6859,
+      "step": 1389
+    },
+    {
+      "epoch": 0.4445421655439845,
+      "grad_norm": 0.33863818645477295,
+      "learning_rate": 0.0003680976517041905,
+      "loss": 4.6316,
+      "step": 1390
+    },
+    {
+      "epoch": 0.44486198005157007,
+      "grad_norm": 0.3599453568458557,
+      "learning_rate": 0.00036779488568358,
+      "loss": 4.6533,
+      "step": 1391
+    },
+    {
+      "epoch": 0.4451817945591557,
+      "grad_norm": 0.35659059882164,
+      "learning_rate": 0.00036749204687857955,
+      "loss": 4.6248,
+      "step": 1392
+    },
+    {
+      "epoch": 0.4455016090667413,
+      "grad_norm": 0.3393639624118805,
+      "learning_rate": 0.00036718913561431613,
+      "loss": 4.6681,
+      "step": 1393
+    },
+    {
+      "epoch": 0.4458214235743269,
+      "grad_norm": 0.39188846945762634,
+      "learning_rate": 0.0003668861522159945,
+      "loss": 4.6124,
+      "step": 1394
+    },
+    {
+      "epoch": 0.4461412380819125,
+      "grad_norm": 0.3615441918373108,
+      "learning_rate": 0.00036658309700889655,
+      "loss": 4.6809,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4464610525894981,
+      "grad_norm": 0.34237948060035706,
+      "learning_rate": 0.0003662799703183817,
+      "loss": 4.6823,
+      "step": 1396
+    },
+    {
+      "epoch": 0.4467808670970837,
+      "grad_norm": 0.334337055683136,
+      "learning_rate": 0.00036597677246988564,
+      "loss": 4.6697,
+      "step": 1397
+    },
+    {
+      "epoch": 0.4471006816046693,
+      "grad_norm": 0.3497103154659271,
+      "learning_rate": 0.00036567350378892074,
+      "loss": 4.7279,
+      "step": 1398
+    },
+    {
+      "epoch": 0.4474204961122549,
+      "grad_norm": 0.3531494140625,
+      "learning_rate": 0.00036537016460107545,
+      "loss": 4.7062,
+      "step": 1399
+    },
+    {
+      "epoch": 0.4477403106198405,
+      "grad_norm": 0.34297361969947815,
+      "learning_rate": 0.00036506675523201385,
+      "loss": 4.7651,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4477403106198405,
+      "eval_loss": 4.698581218719482,
+      "eval_runtime": 82.2605,
+      "eval_samples_per_second": 23.061,
+      "eval_steps_per_second": 5.774,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4480601251274261,
+      "grad_norm": 0.3714090585708618,
+      "learning_rate": 0.0003647632760074751,
+      "loss": 4.6971,
+      "step": 1401
+    },
+    {
+      "epoch": 0.4483799396350117,
+      "grad_norm": 0.34016844630241394,
+      "learning_rate": 0.0003644597272532739,
+      "loss": 4.7252,
+      "step": 1402
+    },
+    {
+      "epoch": 0.4486997541425973,
+      "grad_norm": 0.3468213379383087,
+      "learning_rate": 0.00036415610929529913,
+      "loss": 4.7265,
+      "step": 1403
+    },
+    {
+      "epoch": 0.4490195686501829,
+      "grad_norm": 0.3513246178627014,
+      "learning_rate": 0.00036385242245951427,
+      "loss": 4.755,
+      "step": 1404
+    },
+    {
+      "epoch": 0.4493393831577685,
+      "grad_norm": 0.34193092584609985,
+      "learning_rate": 0.0003635486670719564,
+      "loss": 4.7513,
+      "step": 1405
+    },
+    {
+      "epoch": 0.4496591976653541,
+      "grad_norm": 0.3676043152809143,
+      "learning_rate": 0.0003632448434587366,
+      "loss": 4.6737,
+      "step": 1406
+    },
+    {
+      "epoch": 0.4499790121729397,
+      "grad_norm": 0.379415363073349,
+      "learning_rate": 0.00036294095194603905,
+      "loss": 4.6807,
+      "step": 1407
+    },
+    {
+      "epoch": 0.4502988266805253,
+      "grad_norm": 0.3363446593284607,
+      "learning_rate": 0.00036263699286012056,
+      "loss": 4.7128,
+      "step": 1408
+    },
+    {
+      "epoch": 0.4506186411881109,
+      "grad_norm": 0.3659687042236328,
+      "learning_rate": 0.0003623329665273108,
+      "loss": 4.6283,
+      "step": 1409
+    },
+    {
+      "epoch": 0.4509384556956965,
+      "grad_norm": 0.3622778654098511,
+      "learning_rate": 0.00036202887327401167,
+      "loss": 4.6883,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4512582702032821,
+      "grad_norm": 0.4010184705257416,
+      "learning_rate": 0.00036172471342669663,
+      "loss": 4.772,
+      "step": 1411
+    },
+    {
+      "epoch": 0.4515780847108677,
+      "grad_norm": 0.3728751838207245,
+      "learning_rate": 0.000361420487311911,
+      "loss": 4.7281,
+      "step": 1412
+    },
+    {
+      "epoch": 0.4518978992184533,
+      "grad_norm": 0.3629361093044281,
+      "learning_rate": 0.0003611161952562707,
+      "loss": 4.7045,
+      "step": 1413
+    },
+    {
+      "epoch": 0.4522177137260389,
+      "grad_norm": 0.3519171178340912,
+      "learning_rate": 0.00036081183758646313,
+      "loss": 4.6861,
+      "step": 1414
+    },
+    {
+      "epoch": 0.4525375282336245,
+      "grad_norm": 0.3553248941898346,
+      "learning_rate": 0.00036050741462924563,
+      "loss": 4.724,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4528573427412101,
+      "grad_norm": 0.37955769896507263,
+      "learning_rate": 0.0003602029267114457,
+      "loss": 4.6819,
+      "step": 1416
+    },
+    {
+      "epoch": 0.4531771572487957,
+      "grad_norm": 0.3557082712650299,
+      "learning_rate": 0.000359898374159961,
+      "loss": 4.6762,
+      "step": 1417
+    },
+    {
+      "epoch": 0.4534969717563813,
+      "grad_norm": 0.36919450759887695,
+      "learning_rate": 0.0003595937573017579,
+      "loss": 4.7242,
+      "step": 1418
+    },
+    {
+      "epoch": 0.4538167862639669,
+      "grad_norm": 0.35419464111328125,
+      "learning_rate": 0.00035928907646387234,
+      "loss": 4.7052,
+      "step": 1419
+    },
+    {
+      "epoch": 0.4541366007715525,
+      "grad_norm": 0.3797052502632141,
+      "learning_rate": 0.00035898433197340874,
+      "loss": 4.748,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4544564152791381,
+      "grad_norm": 0.33481892943382263,
+      "learning_rate": 0.0003586795241575398,
+      "loss": 4.7317,
+      "step": 1421
+    },
+    {
+      "epoch": 0.4547762297867237,
+      "grad_norm": 0.336603045463562,
+      "learning_rate": 0.00035837465334350637,
+      "loss": 4.7278,
+      "step": 1422
+    },
+    {
+      "epoch": 0.4550960442943093,
+      "grad_norm": 0.3555077314376831,
+      "learning_rate": 0.0003580697198586169,
+      "loss": 4.6954,
+      "step": 1423
+    },
+    {
+      "epoch": 0.4554158588018949,
+      "grad_norm": 0.34736326336860657,
+      "learning_rate": 0.0003577647240302471,
+      "loss": 4.8181,
+      "step": 1424
+    },
+    {
+      "epoch": 0.4557356733094805,
+      "grad_norm": 0.34664371609687805,
+      "learning_rate": 0.0003574596661858396,
+      "loss": 4.6458,
+      "step": 1425
+    },
+    {
+      "epoch": 0.4560554878170661,
+      "grad_norm": 0.3364536464214325,
+      "learning_rate": 0.00035715454665290343,
+      "loss": 4.6564,
+      "step": 1426
+    },
+    {
+      "epoch": 0.4563753023246517,
+      "grad_norm": 0.3512004017829895,
+      "learning_rate": 0.0003568493657590142,
+      "loss": 4.6482,
+      "step": 1427
+    },
+    {
+      "epoch": 0.4566951168322373,
+      "grad_norm": 0.3391799032688141,
+      "learning_rate": 0.0003565441238318131,
+      "loss": 4.7488,
+      "step": 1428
+    },
+    {
+      "epoch": 0.4570149313398229,
+      "grad_norm": 0.33632373809814453,
+      "learning_rate": 0.000356238821199007,
+      "loss": 4.6127,
+      "step": 1429
+    },
+    {
+      "epoch": 0.4573347458474085,
+      "grad_norm": 0.3592793941497803,
+      "learning_rate": 0.000355933458188368,
+      "loss": 4.7661,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4576545603549941,
+      "grad_norm": 0.34287509322166443,
+      "learning_rate": 0.00035562803512773284,
+      "loss": 4.6601,
+      "step": 1431
+    },
+    {
+      "epoch": 0.4579743748625797,
+      "grad_norm": 0.331759512424469,
+      "learning_rate": 0.00035532255234500284,
+      "loss": 4.6593,
+      "step": 1432
+    },
+    {
+      "epoch": 0.4582941893701653,
+      "grad_norm": 0.38584470748901367,
+      "learning_rate": 0.0003550170101681434,
+      "loss": 4.6229,
+      "step": 1433
+    },
+    {
+      "epoch": 0.4586140038777509,
+      "grad_norm": 0.3618987798690796,
+      "learning_rate": 0.00035471140892518366,
+      "loss": 4.6855,
+      "step": 1434
+    },
+    {
+      "epoch": 0.4589338183853365,
+      "grad_norm": 0.3290684223175049,
+      "learning_rate": 0.0003544057489442164,
+      "loss": 4.6551,
+      "step": 1435
+    },
+    {
+      "epoch": 0.4592536328929221,
+      "grad_norm": 0.35568806529045105,
+      "learning_rate": 0.0003541000305533971,
+      "loss": 4.7414,
+      "step": 1436
+    },
+    {
+      "epoch": 0.4595734474005077,
+      "grad_norm": 0.35015153884887695,
+      "learning_rate": 0.00035379425408094416,
+      "loss": 4.7583,
+      "step": 1437
+    },
+    {
+      "epoch": 0.4598932619080933,
+      "grad_norm": 0.3360646069049835,
+      "learning_rate": 0.00035348841985513834,
+      "loss": 4.7037,
+      "step": 1438
+    },
+    {
+      "epoch": 0.4602130764156789,
+      "grad_norm": 0.34291842579841614,
+      "learning_rate": 0.00035318252820432236,
+      "loss": 4.6583,
+      "step": 1439
+    },
+    {
+      "epoch": 0.4605328909232645,
+      "grad_norm": 0.328739732503891,
+      "learning_rate": 0.00035287657945690045,
+      "loss": 4.6659,
+      "step": 1440
+    },
+    {
+      "epoch": 0.4608527054308501,
+      "grad_norm": 0.34876030683517456,
+      "learning_rate": 0.0003525705739413385,
+      "loss": 4.7138,
+      "step": 1441
+    },
+    {
+      "epoch": 0.4611725199384357,
+      "grad_norm": 0.34805476665496826,
+      "learning_rate": 0.000352264511986163,
+      "loss": 4.6653,
+      "step": 1442
+    },
+    {
+      "epoch": 0.46149233444602134,
+      "grad_norm": 0.33524250984191895,
+      "learning_rate": 0.0003519583939199613,
+      "loss": 4.6185,
+      "step": 1443
+    },
+    {
+      "epoch": 0.4618121489536069,
+      "grad_norm": 0.3647015690803528,
+      "learning_rate": 0.00035165222007138076,
+      "loss": 4.7487,
+      "step": 1444
+    },
+    {
+      "epoch": 0.4621319634611925,
+      "grad_norm": 0.3870471119880676,
+      "learning_rate": 0.00035134599076912866,
+      "loss": 4.66,
+      "step": 1445
+    },
+    {
+      "epoch": 0.4624517779687781,
+      "grad_norm": 0.3421085476875305,
+      "learning_rate": 0.00035103970634197193,
+      "loss": 4.7053,
+      "step": 1446
+    },
+    {
+      "epoch": 0.46277159247636374,
+      "grad_norm": 0.35573166608810425,
+      "learning_rate": 0.00035073336711873666,
+      "loss": 4.775,
+      "step": 1447
+    },
+    {
+      "epoch": 0.4630914069839493,
+      "grad_norm": 0.3303551971912384,
+      "learning_rate": 0.00035042697342830783,
+      "loss": 4.718,
+      "step": 1448
+    },
+    {
+      "epoch": 0.4634112214915349,
+      "grad_norm": 0.37305688858032227,
+      "learning_rate": 0.00035012052559962853,
+      "loss": 4.714,
+      "step": 1449
+    },
+    {
+      "epoch": 0.4637310359991205,
+      "grad_norm": 0.33113083243370056,
+      "learning_rate": 0.0003498140239617005,
+      "loss": 4.7113,
+      "step": 1450
+    },
+    {
+      "epoch": 0.46405085050670614,
+      "grad_norm": 0.3569343090057373,
+      "learning_rate": 0.0003495074688435829,
+      "loss": 4.6924,
+      "step": 1451
+    },
+    {
+      "epoch": 0.4643706650142917,
+      "grad_norm": 0.3453904986381531,
+      "learning_rate": 0.00034920086057439243,
+      "loss": 4.7093,
+      "step": 1452
+    },
+    {
+      "epoch": 0.4646904795218773,
+      "grad_norm": 0.3387502431869507,
+      "learning_rate": 0.0003488941994833028,
+      "loss": 4.6262,
+      "step": 1453
+    },
+    {
+      "epoch": 0.4650102940294629,
+      "grad_norm": 0.35884377360343933,
+      "learning_rate": 0.00034858748589954437,
+      "loss": 4.6482,
+      "step": 1454
+    },
+    {
+      "epoch": 0.46533010853704854,
+      "grad_norm": 0.3423829972743988,
+      "learning_rate": 0.0003482807201524042,
+      "loss": 4.6878,
+      "step": 1455
+    },
+    {
+      "epoch": 0.4656499230446341,
+      "grad_norm": 0.360689640045166,
+      "learning_rate": 0.00034797390257122486,
+      "loss": 4.6499,
+      "step": 1456
+    },
+    {
+      "epoch": 0.4659697375522197,
+      "grad_norm": 0.3595859408378601,
+      "learning_rate": 0.0003476670334854049,
+      "loss": 4.7275,
+      "step": 1457
+    },
+    {
+      "epoch": 0.4662895520598053,
+      "grad_norm": 0.35201844573020935,
+      "learning_rate": 0.00034736011322439796,
+      "loss": 4.6911,
+      "step": 1458
+    },
+    {
+      "epoch": 0.46660936656739094,
+      "grad_norm": 0.3574092388153076,
+      "learning_rate": 0.0003470531421177128,
+      "loss": 4.6644,
+      "step": 1459
+    },
+    {
+      "epoch": 0.4669291810749765,
+      "grad_norm": 0.36814460158348083,
+      "learning_rate": 0.00034674612049491276,
+      "loss": 4.6686,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4672489955825621,
+      "grad_norm": 0.3635476529598236,
+      "learning_rate": 0.0003464390486856153,
+      "loss": 4.7105,
+      "step": 1461
+    },
+    {
+      "epoch": 0.4675688100901477,
+      "grad_norm": 0.38265570998191833,
+      "learning_rate": 0.0003461319270194919,
+      "loss": 4.6519,
+      "step": 1462
+    },
+    {
+      "epoch": 0.46788862459773334,
+      "grad_norm": 0.36408907175064087,
+      "learning_rate": 0.0003458247558262672,
+      "loss": 4.6651,
+      "step": 1463
+    },
+    {
+      "epoch": 0.4682084391053189,
+      "grad_norm": 0.38444340229034424,
+      "learning_rate": 0.0003455175354357195,
+      "loss": 4.7486,
+      "step": 1464
+    },
+    {
+      "epoch": 0.4685282536129045,
+      "grad_norm": 0.36975181102752686,
+      "learning_rate": 0.0003452102661776798,
+      "loss": 4.564,
+      "step": 1465
+    },
+    {
+      "epoch": 0.4688480681204901,
+      "grad_norm": 0.35873138904571533,
+      "learning_rate": 0.0003449029483820313,
+      "loss": 4.5793,
+      "step": 1466
+    },
+    {
+      "epoch": 0.46916788262807574,
+      "grad_norm": 0.3803260028362274,
+      "learning_rate": 0.00034459558237870955,
+      "loss": 4.6854,
+      "step": 1467
+    },
+    {
+      "epoch": 0.4694876971356613,
+      "grad_norm": 0.3775056004524231,
+      "learning_rate": 0.00034428816849770173,
+      "loss": 4.6818,
+      "step": 1468
+    },
+    {
+      "epoch": 0.4698075116432469,
+      "grad_norm": 0.3574797809123993,
+      "learning_rate": 0.00034398070706904657,
+      "loss": 4.6641,
+      "step": 1469
+    },
+    {
+      "epoch": 0.4701273261508325,
+      "grad_norm": 0.3633163273334503,
+      "learning_rate": 0.0003436731984228336,
+      "loss": 4.6348,
+      "step": 1470
+    },
+    {
+      "epoch": 0.47044714065841814,
+      "grad_norm": 0.36290299892425537,
+      "learning_rate": 0.00034336564288920334,
+      "loss": 4.545,
+      "step": 1471
+    },
+    {
+      "epoch": 0.4707669551660037,
+      "grad_norm": 0.36241206526756287,
+      "learning_rate": 0.0003430580407983465,
+      "loss": 4.6508,
+      "step": 1472
+    },
+    {
+      "epoch": 0.4710867696735893,
+      "grad_norm": 0.3603331446647644,
+      "learning_rate": 0.00034275039248050384,
+      "loss": 4.6032,
+      "step": 1473
+    },
+    {
+      "epoch": 0.4714065841811749,
+      "grad_norm": 0.3851202726364136,
+      "learning_rate": 0.00034244269826596543,
+      "loss": 4.6484,
+      "step": 1474
+    },
+    {
+      "epoch": 0.47172639868876054,
+      "grad_norm": 0.3676314353942871,
+      "learning_rate": 0.0003421349584850711,
+      "loss": 4.7145,
+      "step": 1475
+    },
+    {
+      "epoch": 0.4720462131963461,
+      "grad_norm": 0.35233455896377563,
+      "learning_rate": 0.0003418271734682093,
+      "loss": 4.6362,
+      "step": 1476
+    },
+    {
+      "epoch": 0.4723660277039317,
+      "grad_norm": 0.36112335324287415,
+      "learning_rate": 0.00034151934354581715,
+      "loss": 4.6983,
+      "step": 1477
+    },
+    {
+      "epoch": 0.4726858422115173,
+      "grad_norm": 0.3636215329170227,
+      "learning_rate": 0.00034121146904837995,
+      "loss": 4.678,
+      "step": 1478
+    },
+    {
+      "epoch": 0.47300565671910294,
+      "grad_norm": 0.3911776542663574,
+      "learning_rate": 0.00034090355030643083,
+      "loss": 4.6516,
+      "step": 1479
+    },
+    {
+      "epoch": 0.4733254712266885,
+      "grad_norm": 0.45845794677734375,
+      "learning_rate": 0.00034059558765055047,
+      "loss": 4.7059,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4736452857342741,
+      "grad_norm": 0.3456306755542755,
+      "learning_rate": 0.0003402875814113666,
+      "loss": 4.591,
+      "step": 1481
+    },
+    {
+      "epoch": 0.4739651002418597,
+      "grad_norm": 0.3881906270980835,
+      "learning_rate": 0.00033997953191955383,
+      "loss": 4.6141,
+      "step": 1482
+    },
+    {
+      "epoch": 0.47428491474944534,
+      "grad_norm": 0.3685750365257263,
+      "learning_rate": 0.0003396714395058333,
+      "loss": 4.6435,
+      "step": 1483
+    },
+    {
+      "epoch": 0.4746047292570309,
+      "grad_norm": 0.3566403090953827,
+      "learning_rate": 0.00033936330450097193,
+      "loss": 4.643,
+      "step": 1484
+    },
+    {
+      "epoch": 0.4749245437646165,
+      "grad_norm": 0.3359985947608948,
+      "learning_rate": 0.0003390551272357829,
+      "loss": 4.6454,
+      "step": 1485
+    },
+    {
+      "epoch": 0.4752443582722021,
+      "grad_norm": 0.36924082040786743,
+      "learning_rate": 0.00033874690804112397,
+      "loss": 4.676,
+      "step": 1486
+    },
+    {
+      "epoch": 0.47556417277978774,
+      "grad_norm": 0.35759541392326355,
+      "learning_rate": 0.00033843864724789866,
+      "loss": 4.6631,
+      "step": 1487
+    },
+    {
+      "epoch": 0.4758839872873733,
+      "grad_norm": 0.34228649735450745,
+      "learning_rate": 0.00033813034518705463,
+      "loss": 4.6948,
+      "step": 1488
+    },
+    {
+      "epoch": 0.4762038017949589,
+      "grad_norm": 0.3689830005168915,
+      "learning_rate": 0.00033782200218958433,
+      "loss": 4.6772,
+      "step": 1489
+    },
+    {
+      "epoch": 0.47652361630254453,
+      "grad_norm": 0.36067578196525574,
+      "learning_rate": 0.00033751361858652375,
+      "loss": 4.6559,
+      "step": 1490
+    },
+    {
+      "epoch": 0.47684343081013014,
+      "grad_norm": 0.3399069309234619,
+      "learning_rate": 0.0003372051947089526,
+      "loss": 4.7165,
+      "step": 1491
+    },
+    {
+      "epoch": 0.4771632453177157,
+      "grad_norm": 0.33639630675315857,
+      "learning_rate": 0.0003368967308879939,
+      "loss": 4.7684,
+      "step": 1492
+    },
+    {
+      "epoch": 0.4774830598253013,
+      "grad_norm": 0.37365174293518066,
+      "learning_rate": 0.0003365882274548135,
+      "loss": 4.6071,
+      "step": 1493
+    },
+    {
+      "epoch": 0.47780287433288693,
+      "grad_norm": 0.343077152967453,
+      "learning_rate": 0.00033627968474061966,
+      "loss": 4.6191,
+      "step": 1494
+    },
+    {
+      "epoch": 0.47812268884047254,
+      "grad_norm": 0.36870089173316956,
+      "learning_rate": 0.0003359711030766631,
+      "loss": 4.7342,
+      "step": 1495
+    },
+    {
+      "epoch": 0.4784425033480581,
+      "grad_norm": 0.3427372872829437,
+      "learning_rate": 0.0003356624827942361,
+      "loss": 4.6408,
+      "step": 1496
+    },
+    {
+      "epoch": 0.4787623178556437,
+      "grad_norm": 0.3489178419113159,
+      "learning_rate": 0.00033535382422467255,
+      "loss": 4.5872,
+      "step": 1497
+    },
+    {
+      "epoch": 0.47908213236322933,
+      "grad_norm": 0.34867149591445923,
+      "learning_rate": 0.0003350451276993473,
+      "loss": 4.6825,
+      "step": 1498
+    },
+    {
+      "epoch": 0.47940194687081494,
+      "grad_norm": 0.3406824469566345,
+      "learning_rate": 0.000334736393549676,
+      "loss": 4.5928,
+      "step": 1499
+    },
+    {
+      "epoch": 0.4797217613784005,
+      "grad_norm": 0.34946388006210327,
+      "learning_rate": 0.00033442762210711483,
+      "loss": 4.6101,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4797217613784005,
+      "eval_loss": 4.658926010131836,
+      "eval_runtime": 80.6757,
+      "eval_samples_per_second": 23.514,
+      "eval_steps_per_second": 5.888,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4800415758859861,
+      "grad_norm": 0.487714558839798,
+      "learning_rate": 0.0003341188137031599,
+      "loss": 4.6049,
+      "step": 1501
+    },
+    {
+      "epoch": 0.48036139039357173,
+      "grad_norm": 0.3670230805873871,
+      "learning_rate": 0.0003338099686693469,
+      "loss": 4.7125,
+      "step": 1502
+    },
+    {
+      "epoch": 0.48068120490115734,
+      "grad_norm": 0.3611529469490051,
+      "learning_rate": 0.00033350108733725103,
+      "loss": 4.6454,
+      "step": 1503
+    },
+    {
+      "epoch": 0.4810010194087429,
+      "grad_norm": 0.3804737627506256,
+      "learning_rate": 0.00033319217003848644,
+      "loss": 4.7163,
+      "step": 1504
+    },
+    {
+      "epoch": 0.4813208339163285,
+      "grad_norm": 0.3475566506385803,
+      "learning_rate": 0.0003328832171047057,
+      "loss": 4.6357,
+      "step": 1505
+    },
+    {
+      "epoch": 0.48164064842391413,
+      "grad_norm": 0.40150049328804016,
+      "learning_rate": 0.0003325742288675998,
+      "loss": 4.6686,
+      "step": 1506
+    },
+    {
+      "epoch": 0.48196046293149974,
+      "grad_norm": 0.35256749391555786,
+      "learning_rate": 0.0003322652056588976,
+      "loss": 4.6878,
+      "step": 1507
+    },
+    {
+      "epoch": 0.4822802774390853,
+      "grad_norm": 0.324462890625,
+      "learning_rate": 0.0003319561478103656,
+      "loss": 4.6671,
+      "step": 1508
+    },
+    {
+      "epoch": 0.4826000919466709,
+      "grad_norm": 0.3362123966217041,
+      "learning_rate": 0.00033164705565380737,
+      "loss": 4.6155,
+      "step": 1509
+    },
+    {
+      "epoch": 0.48291990645425653,
+      "grad_norm": 0.3507479131221771,
+      "learning_rate": 0.00033133792952106327,
+      "loss": 4.6358,
+      "step": 1510
+    },
+    {
+      "epoch": 0.48323972096184215,
+      "grad_norm": 0.35958319902420044,
+      "learning_rate": 0.0003310287697440102,
+      "loss": 4.5733,
+      "step": 1511
+    },
+    {
+      "epoch": 0.48355953546942776,
+      "grad_norm": 0.3396565020084381,
+      "learning_rate": 0.0003307195766545612,
+      "loss": 4.5693,
+      "step": 1512
+    },
+    {
+      "epoch": 0.4838793499770133,
+      "grad_norm": 0.32788902521133423,
+      "learning_rate": 0.00033041035058466525,
+      "loss": 4.6957,
+      "step": 1513
+    },
+    {
+      "epoch": 0.48419916448459893,
+      "grad_norm": 0.35396474599838257,
+      "learning_rate": 0.00033010109186630625,
+      "loss": 4.6261,
+      "step": 1514
+    },
+    {
+      "epoch": 0.48451897899218455,
+      "grad_norm": 0.3637486696243286,
+      "learning_rate": 0.00032979180083150366,
+      "loss": 4.6156,
+      "step": 1515
+    },
+    {
+      "epoch": 0.48483879349977016,
+      "grad_norm": 0.3381948173046112,
+      "learning_rate": 0.00032948247781231134,
+      "loss": 4.6401,
+      "step": 1516
+    },
+    {
+      "epoch": 0.4851586080073557,
+      "grad_norm": 0.3423517346382141,
+      "learning_rate": 0.0003291731231408175,
+      "loss": 4.6505,
+      "step": 1517
+    },
+    {
+      "epoch": 0.48547842251494133,
+      "grad_norm": 0.34794390201568604,
+      "learning_rate": 0.00032886373714914455,
+      "loss": 4.6943,
+      "step": 1518
+    },
+    {
+      "epoch": 0.48579823702252695,
+      "grad_norm": 0.3557645082473755,
+      "learning_rate": 0.00032855432016944835,
+      "loss": 4.6012,
+      "step": 1519
+    },
+    {
+      "epoch": 0.48611805153011256,
+      "grad_norm": 0.3364381790161133,
+      "learning_rate": 0.000328244872533918,
+      "loss": 4.6509,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4864378660376981,
+      "grad_norm": 0.3535098731517792,
+      "learning_rate": 0.00032793539457477564,
+      "loss": 4.6239,
+      "step": 1521
+    },
+    {
+      "epoch": 0.48675768054528373,
+      "grad_norm": 0.34493860602378845,
+      "learning_rate": 0.00032762588662427585,
+      "loss": 4.571,
+      "step": 1522
+    },
+    {
+      "epoch": 0.48707749505286935,
+      "grad_norm": 0.37122923135757446,
+      "learning_rate": 0.0003273163490147054,
+      "loss": 4.5478,
+      "step": 1523
+    },
+    {
+      "epoch": 0.48739730956045496,
+      "grad_norm": 0.34408897161483765,
+      "learning_rate": 0.0003270067820783831,
+      "loss": 4.6441,
+      "step": 1524
+    },
+    {
+      "epoch": 0.4877171240680405,
+      "grad_norm": 0.36329254508018494,
+      "learning_rate": 0.0003266971861476589,
+      "loss": 4.6485,
+      "step": 1525
+    },
+    {
+      "epoch": 0.48803693857562613,
+      "grad_norm": 0.352076917886734,
+      "learning_rate": 0.00032638756155491436,
+      "loss": 4.6351,
+      "step": 1526
+    },
+    {
+      "epoch": 0.48835675308321175,
+      "grad_norm": 0.37958183884620667,
+      "learning_rate": 0.0003260779086325612,
+      "loss": 4.6328,
+      "step": 1527
+    },
+    {
+      "epoch": 0.48867656759079736,
+      "grad_norm": 0.3417004644870758,
+      "learning_rate": 0.0003257682277130422,
+      "loss": 4.5971,
+      "step": 1528
+    },
+    {
+      "epoch": 0.4889963820983829,
+      "grad_norm": 0.3330473005771637,
+      "learning_rate": 0.0003254585191288297,
+      "loss": 4.6064,
+      "step": 1529
+    },
+    {
+      "epoch": 0.48931619660596853,
+      "grad_norm": 0.3555782735347748,
+      "learning_rate": 0.0003251487832124259,
+      "loss": 4.5578,
+      "step": 1530
+    },
+    {
+      "epoch": 0.48963601111355415,
+      "grad_norm": 0.33937859535217285,
+      "learning_rate": 0.00032483902029636257,
+      "loss": 4.6298,
+      "step": 1531
+    },
+    {
+      "epoch": 0.48995582562113976,
+      "grad_norm": 0.3552972972393036,
+      "learning_rate": 0.00032452923071320006,
+      "loss": 4.6884,
+      "step": 1532
+    },
+    {
+      "epoch": 0.4902756401287253,
+      "grad_norm": 0.34347274899482727,
+      "learning_rate": 0.00032421941479552767,
+      "loss": 4.6478,
+      "step": 1533
+    },
+    {
+      "epoch": 0.49059545463631093,
+      "grad_norm": 0.3543516993522644,
+      "learning_rate": 0.00032390957287596275,
+      "loss": 4.7032,
+      "step": 1534
+    },
+    {
+      "epoch": 0.49091526914389655,
+      "grad_norm": 0.37930676341056824,
+      "learning_rate": 0.0003235997052871508,
+      "loss": 4.6913,
+      "step": 1535
+    },
+    {
+      "epoch": 0.49123508365148216,
+      "grad_norm": 0.3467349410057068,
+      "learning_rate": 0.00032328981236176465,
+      "loss": 4.5694,
+      "step": 1536
+    },
+    {
+      "epoch": 0.4915548981590677,
+      "grad_norm": 0.3399907946586609,
+      "learning_rate": 0.00032297989443250445,
+      "loss": 4.617,
+      "step": 1537
+    },
+    {
+      "epoch": 0.49187471266665334,
+      "grad_norm": 0.33236512541770935,
+      "learning_rate": 0.0003226699518320973,
+      "loss": 4.654,
+      "step": 1538
+    },
+    {
+      "epoch": 0.49219452717423895,
+      "grad_norm": 0.3547270894050598,
+      "learning_rate": 0.0003223599848932964,
+      "loss": 4.6355,
+      "step": 1539
+    },
+    {
+      "epoch": 0.49251434168182456,
+      "grad_norm": 0.3350517749786377,
+      "learning_rate": 0.0003220499939488817,
+      "loss": 4.5831,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4928341561894101,
+      "grad_norm": 0.3335596024990082,
+      "learning_rate": 0.0003217399793316583,
+      "loss": 4.6857,
+      "step": 1541
+    },
+    {
+      "epoch": 0.49315397069699574,
+      "grad_norm": 0.3638198673725128,
+      "learning_rate": 0.00032142994137445693,
+      "loss": 4.5726,
+      "step": 1542
+    },
+    {
+      "epoch": 0.49347378520458135,
+      "grad_norm": 0.3430887460708618,
+      "learning_rate": 0.0003211198804101337,
+      "loss": 4.6377,
+      "step": 1543
+    },
+    {
+      "epoch": 0.49379359971216696,
+      "grad_norm": 0.3397575914859772,
+      "learning_rate": 0.000320809796771569,
+      "loss": 4.6102,
+      "step": 1544
+    },
+    {
+      "epoch": 0.4941134142197525,
+      "grad_norm": 0.33723217248916626,
+      "learning_rate": 0.00032049969079166765,
+      "loss": 4.5556,
+      "step": 1545
+    },
+    {
+      "epoch": 0.49443322872733814,
+      "grad_norm": 0.34441104531288147,
+      "learning_rate": 0.0003201895628033587,
+      "loss": 4.6884,
+      "step": 1546
+    },
+    {
+      "epoch": 0.49475304323492375,
+      "grad_norm": 0.3217085897922516,
+      "learning_rate": 0.00031987941313959433,
+      "loss": 4.678,
+      "step": 1547
+    },
+    {
+      "epoch": 0.49507285774250936,
+      "grad_norm": 0.34782636165618896,
+      "learning_rate": 0.0003195692421333506,
+      "loss": 4.6236,
+      "step": 1548
+    },
+    {
+      "epoch": 0.4953926722500949,
+      "grad_norm": 0.3566958010196686,
+      "learning_rate": 0.0003192590501176261,
+      "loss": 4.6062,
+      "step": 1549
+    },
+    {
+      "epoch": 0.49571248675768054,
+      "grad_norm": 0.36577853560447693,
+      "learning_rate": 0.0003189488374254421,
+      "loss": 4.6045,
+      "step": 1550
+    },
+    {
+      "epoch": 0.49603230126526615,
+      "grad_norm": 0.3512285649776459,
+      "learning_rate": 0.00031863860438984193,
+      "loss": 4.6837,
+      "step": 1551
+    },
+    {
+      "epoch": 0.49635211577285177,
+      "grad_norm": 0.3427387773990631,
+      "learning_rate": 0.00031832835134389093,
+      "loss": 4.6715,
+      "step": 1552
+    },
+    {
+      "epoch": 0.4966719302804373,
+      "grad_norm": 0.3531494140625,
+      "learning_rate": 0.0003180180786206759,
+      "loss": 4.5646,
+      "step": 1553
+    },
+    {
+      "epoch": 0.49699174478802294,
+      "grad_norm": 0.35843127965927124,
+      "learning_rate": 0.0003177077865533046,
+      "loss": 4.6221,
+      "step": 1554
+    },
+    {
+      "epoch": 0.49731155929560855,
+      "grad_norm": 0.35052454471588135,
+      "learning_rate": 0.00031739747547490584,
+      "loss": 4.611,
+      "step": 1555
+    },
+    {
+      "epoch": 0.49763137380319417,
+      "grad_norm": 0.364162802696228,
+      "learning_rate": 0.0003170871457186286,
+      "loss": 4.6817,
+      "step": 1556
+    },
+    {
+      "epoch": 0.4979511883107797,
+      "grad_norm": 0.35862216353416443,
+      "learning_rate": 0.0003167767976176419,
+      "loss": 4.485,
+      "step": 1557
+    },
+    {
+      "epoch": 0.49827100281836534,
+      "grad_norm": 0.35927194356918335,
+      "learning_rate": 0.0003164664315051347,
+      "loss": 4.6354,
+      "step": 1558
+    },
+    {
+      "epoch": 0.49859081732595095,
+      "grad_norm": 0.3603445887565613,
+      "learning_rate": 0.00031615604771431514,
+      "loss": 4.5751,
+      "step": 1559
+    },
+    {
+      "epoch": 0.49891063183353657,
+      "grad_norm": 0.37982043623924255,
+      "learning_rate": 0.00031584564657841015,
+      "loss": 4.6733,
+      "step": 1560
+    },
+    {
+      "epoch": 0.4992304463411221,
+      "grad_norm": 0.3557567000389099,
+      "learning_rate": 0.0003155352284306657,
+      "loss": 4.5831,
+      "step": 1561
+    },
+    {
+      "epoch": 0.49955026084870774,
+      "grad_norm": 0.35779282450675964,
+      "learning_rate": 0.00031522479360434567,
+      "loss": 4.6149,
+      "step": 1562
+    },
+    {
+      "epoch": 0.49987007535629335,
+      "grad_norm": 0.33635514974594116,
+      "learning_rate": 0.00031491434243273214,
+      "loss": 4.7235,
+      "step": 1563
+    },
+    {
+      "epoch": 0.5001898898638789,
+      "grad_norm": 0.38213077187538147,
+      "learning_rate": 0.00031460387524912437,
+      "loss": 4.6656,
+      "step": 1564
+    },
+    {
+      "epoch": 0.5005097043714646,
+      "grad_norm": 0.3529198467731476,
+      "learning_rate": 0.0003142933923868391,
+      "loss": 4.489,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5008295188790501,
+      "grad_norm": 0.34217679500579834,
+      "learning_rate": 0.00031398289417920976,
+      "loss": 4.6375,
+      "step": 1566
+    },
+    {
+      "epoch": 0.5011493333866357,
+      "grad_norm": 0.37172558903694153,
+      "learning_rate": 0.00031367238095958644,
+      "loss": 4.6723,
+      "step": 1567
+    },
+    {
+      "epoch": 0.5014691478942214,
+      "grad_norm": 0.3407527208328247,
+      "learning_rate": 0.00031336185306133523,
+      "loss": 4.5211,
+      "step": 1568
+    },
+    {
+      "epoch": 0.5017889624018069,
+      "grad_norm": 0.36462146043777466,
+      "learning_rate": 0.0003130513108178378,
+      "loss": 4.5811,
+      "step": 1569
+    },
+    {
+      "epoch": 0.5021087769093926,
+      "grad_norm": 0.3509719669818878,
+      "learning_rate": 0.0003127407545624915,
+      "loss": 4.756,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5024285914169782,
+      "grad_norm": 0.375997394323349,
+      "learning_rate": 0.0003124301846287085,
+      "loss": 4.6367,
+      "step": 1571
+    },
+    {
+      "epoch": 0.5027484059245637,
+      "grad_norm": 0.3393837809562683,
+      "learning_rate": 0.00031211960134991596,
+      "loss": 4.6095,
+      "step": 1572
+    },
+    {
+      "epoch": 0.5030682204321494,
+      "grad_norm": 0.3639393150806427,
+      "learning_rate": 0.00031180900505955496,
+      "loss": 4.7104,
+      "step": 1573
+    },
+    {
+      "epoch": 0.5033880349397349,
+      "grad_norm": 0.34335988759994507,
+      "learning_rate": 0.000311498396091081,
+      "loss": 4.6459,
+      "step": 1574
+    },
+    {
+      "epoch": 0.5037078494473205,
+      "grad_norm": 0.350800484418869,
+      "learning_rate": 0.00031118777477796275,
+      "loss": 4.6279,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5040276639549062,
+      "grad_norm": 0.35865524411201477,
+      "learning_rate": 0.0003108771414536825,
+      "loss": 4.6563,
+      "step": 1576
+    },
+    {
+      "epoch": 0.5043474784624917,
+      "grad_norm": 0.346827894449234,
+      "learning_rate": 0.0003105664964517351,
+      "loss": 4.5878,
+      "step": 1577
+    },
+    {
+      "epoch": 0.5046672929700774,
+      "grad_norm": 0.34434160590171814,
+      "learning_rate": 0.0003102558401056282,
+      "loss": 4.5747,
+      "step": 1578
+    },
+    {
+      "epoch": 0.504987107477663,
+      "grad_norm": 0.358316570520401,
+      "learning_rate": 0.00030994517274888155,
+      "loss": 4.6122,
+      "step": 1579
+    },
+    {
+      "epoch": 0.5053069219852485,
+      "grad_norm": 0.3511278033256531,
+      "learning_rate": 0.00030963449471502674,
+      "loss": 4.6333,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5056267364928342,
+      "grad_norm": 0.3433941900730133,
+      "learning_rate": 0.0003093238063376068,
+      "loss": 4.6083,
+      "step": 1581
+    },
+    {
+      "epoch": 0.5059465510004197,
+      "grad_norm": 0.34738221764564514,
+      "learning_rate": 0.00030901310795017567,
+      "loss": 4.6832,
+      "step": 1582
+    },
+    {
+      "epoch": 0.5062663655080054,
+      "grad_norm": 0.35411185026168823,
+      "learning_rate": 0.00030870239988629844,
+      "loss": 4.6223,
+      "step": 1583
+    },
+    {
+      "epoch": 0.506586180015591,
+      "grad_norm": 0.3453824818134308,
+      "learning_rate": 0.0003083916824795503,
+      "loss": 4.6439,
+      "step": 1584
+    },
+    {
+      "epoch": 0.5069059945231765,
+      "grad_norm": 0.35654762387275696,
+      "learning_rate": 0.0003080809560635165,
+      "loss": 4.5631,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5072258090307622,
+      "grad_norm": 0.3844980299472809,
+      "learning_rate": 0.0003077702209717921,
+      "loss": 4.7018,
+      "step": 1586
+    },
+    {
+      "epoch": 0.5075456235383478,
+      "grad_norm": 0.3394923508167267,
+      "learning_rate": 0.0003074594775379812,
+      "loss": 4.5698,
+      "step": 1587
+    },
+    {
+      "epoch": 0.5078654380459333,
+      "grad_norm": 0.34746694564819336,
+      "learning_rate": 0.00030714872609569733,
+      "loss": 4.5785,
+      "step": 1588
+    },
+    {
+      "epoch": 0.508185252553519,
+      "grad_norm": 0.3709687292575836,
+      "learning_rate": 0.0003068379669785622,
+      "loss": 4.6214,
+      "step": 1589
+    },
+    {
+      "epoch": 0.5085050670611045,
+      "grad_norm": 0.3458937108516693,
+      "learning_rate": 0.0003065272005202056,
+      "loss": 4.5854,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5088248815686902,
+      "grad_norm": 0.37032395601272583,
+      "learning_rate": 0.00030621642705426586,
+      "loss": 4.6531,
+      "step": 1591
+    },
+    {
+      "epoch": 0.5091446960762758,
+      "grad_norm": 0.35681501030921936,
+      "learning_rate": 0.0003059056469143884,
+      "loss": 4.5877,
+      "step": 1592
+    },
+    {
+      "epoch": 0.5094645105838613,
+      "grad_norm": 0.34055960178375244,
+      "learning_rate": 0.0003055948604342257,
+      "loss": 4.5802,
+      "step": 1593
+    },
+    {
+      "epoch": 0.509784325091447,
+      "grad_norm": 0.3622954785823822,
+      "learning_rate": 0.0003052840679474373,
+      "loss": 4.5284,
+      "step": 1594
+    },
+    {
+      "epoch": 0.5101041395990326,
+      "grad_norm": 0.33317726850509644,
+      "learning_rate": 0.0003049732697876891,
+      "loss": 4.6278,
+      "step": 1595
+    },
+    {
+      "epoch": 0.5104239541066181,
+      "grad_norm": 0.3395687937736511,
+      "learning_rate": 0.000304662466288653,
+      "loss": 4.6818,
+      "step": 1596
+    },
+    {
+      "epoch": 0.5107437686142038,
+      "grad_norm": 0.3536480963230133,
+      "learning_rate": 0.000304351657784007,
+      "loss": 4.5842,
+      "step": 1597
+    },
+    {
+      "epoch": 0.5110635831217893,
+      "grad_norm": 0.34140917658805847,
+      "learning_rate": 0.0003040408446074339,
+      "loss": 4.7312,
+      "step": 1598
+    },
+    {
+      "epoch": 0.511383397629375,
+      "grad_norm": 0.33931636810302734,
+      "learning_rate": 0.000303730027092622,
+      "loss": 4.5617,
+      "step": 1599
+    },
+    {
+      "epoch": 0.5117032121369606,
+      "grad_norm": 0.336770236492157,
+      "learning_rate": 0.00030341920557326385,
+      "loss": 4.5814,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5117032121369606,
+      "eval_loss": 4.6179680824279785,
+      "eval_runtime": 80.4923,
+      "eval_samples_per_second": 23.567,
+      "eval_steps_per_second": 5.901,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5120230266445461,
+      "grad_norm": 0.3277044892311096,
+      "learning_rate": 0.0003031083803830567,
+      "loss": 4.5805,
+      "step": 1601
+    },
+    {
+      "epoch": 0.5123428411521318,
+      "grad_norm": 0.34600555896759033,
+      "learning_rate": 0.0003027975518557016,
+      "loss": 4.6399,
+      "step": 1602
+    },
+    {
+      "epoch": 0.5126626556597174,
+      "grad_norm": 0.34565469622612,
+      "learning_rate": 0.00030248672032490295,
+      "loss": 4.6341,
+      "step": 1603
+    },
+    {
+      "epoch": 0.5129824701673029,
+      "grad_norm": 0.3367885649204254,
+      "learning_rate": 0.0003021758861243688,
+      "loss": 4.6379,
+      "step": 1604
+    },
+    {
+      "epoch": 0.5133022846748886,
+      "grad_norm": 0.3366295099258423,
+      "learning_rate": 0.0003018650495878096,
+      "loss": 4.6036,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5136220991824741,
+      "grad_norm": 0.34721916913986206,
+      "learning_rate": 0.0003015542110489387,
+      "loss": 4.5726,
+      "step": 1606
+    },
+    {
+      "epoch": 0.5139419136900598,
+      "grad_norm": 0.33897995948791504,
+      "learning_rate": 0.00030124337084147144,
+      "loss": 4.5874,
+      "step": 1607
+    },
+    {
+      "epoch": 0.5142617281976454,
+      "grad_norm": 0.3487263023853302,
+      "learning_rate": 0.0003009325292991247,
+      "loss": 4.5789,
+      "step": 1608
+    },
+    {
+      "epoch": 0.5145815427052309,
+      "grad_norm": 0.3403370678424835,
+      "learning_rate": 0.0003006216867556175,
+      "loss": 4.625,
+      "step": 1609
+    },
+    {
+      "epoch": 0.5149013572128166,
+      "grad_norm": 0.33869045972824097,
+      "learning_rate": 0.00030031084354466904,
+      "loss": 4.5981,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5152211717204022,
+      "grad_norm": 0.34933406114578247,
+      "learning_rate": 0.0003,
+      "loss": 4.6767,
+      "step": 1611
+    },
+    {
+      "epoch": 0.5155409862279877,
+      "grad_norm": 0.3558529317378998,
+      "learning_rate": 0.00029968915645533085,
+      "loss": 4.6249,
+      "step": 1612
+    },
+    {
+      "epoch": 0.5158608007355734,
+      "grad_norm": 0.3501514196395874,
+      "learning_rate": 0.0002993783132443825,
+      "loss": 4.5925,
+      "step": 1613
+    },
+    {
+      "epoch": 0.516180615243159,
+      "grad_norm": 0.34425514936447144,
+      "learning_rate": 0.0002990674707008752,
+      "loss": 4.6,
+      "step": 1614
+    },
+    {
+      "epoch": 0.5165004297507446,
+      "grad_norm": 0.3402040898799896,
+      "learning_rate": 0.0002987566291585286,
+      "loss": 4.6288,
+      "step": 1615
+    },
+    {
+      "epoch": 0.5168202442583302,
+      "grad_norm": 0.33086714148521423,
+      "learning_rate": 0.00029844578895106127,
+      "loss": 4.5629,
+      "step": 1616
+    },
+    {
+      "epoch": 0.5171400587659157,
+      "grad_norm": 0.35298269987106323,
+      "learning_rate": 0.0002981349504121904,
+      "loss": 4.5526,
+      "step": 1617
+    },
+    {
+      "epoch": 0.5174598732735014,
+      "grad_norm": 0.3435682952404022,
+      "learning_rate": 0.0002978241138756312,
+      "loss": 4.5722,
+      "step": 1618
+    },
+    {
+      "epoch": 0.517779687781087,
+      "grad_norm": 0.34460437297821045,
+      "learning_rate": 0.00029751327967509695,
+      "loss": 4.5969,
+      "step": 1619
+    },
+    {
+      "epoch": 0.5180995022886725,
+      "grad_norm": 0.3217136561870575,
+      "learning_rate": 0.0002972024481442984,
+      "loss": 4.5586,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5184193167962582,
+      "grad_norm": 0.36733126640319824,
+      "learning_rate": 0.00029689161961694323,
+      "loss": 4.5857,
+      "step": 1621
+    },
+    {
+      "epoch": 0.5187391313038437,
+      "grad_norm": 0.3302042484283447,
+      "learning_rate": 0.00029658079442673616,
+      "loss": 4.6349,
+      "step": 1622
+    },
+    {
+      "epoch": 0.5190589458114294,
+      "grad_norm": 0.3308677673339844,
+      "learning_rate": 0.000296269972907378,
+      "loss": 4.5493,
+      "step": 1623
+    },
+    {
+      "epoch": 0.519378760319015,
+      "grad_norm": 0.34367772936820984,
+      "learning_rate": 0.00029595915539256605,
+      "loss": 4.6464,
+      "step": 1624
+    },
+    {
+      "epoch": 0.5196985748266005,
+      "grad_norm": 0.37406760454177856,
+      "learning_rate": 0.000295648342215993,
+      "loss": 4.6505,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5200183893341862,
+      "grad_norm": 0.3676385283470154,
+      "learning_rate": 0.0002953375337113468,
+      "loss": 4.5778,
+      "step": 1626
+    },
+    {
+      "epoch": 0.5203382038417718,
+      "grad_norm": 0.3593791127204895,
+      "learning_rate": 0.00029502673021231096,
+      "loss": 4.6247,
+      "step": 1627
+    },
+    {
+      "epoch": 0.5206580183493573,
+      "grad_norm": 0.34721025824546814,
+      "learning_rate": 0.0002947159320525627,
+      "loss": 4.6347,
+      "step": 1628
+    },
+    {
+      "epoch": 0.520977832856943,
+      "grad_norm": 0.36313459277153015,
+      "learning_rate": 0.0002944051395657744,
+      "loss": 4.5717,
+      "step": 1629
+    },
+    {
+      "epoch": 0.5212976473645285,
+      "grad_norm": 0.34549617767333984,
+      "learning_rate": 0.0002940943530856116,
+      "loss": 4.6189,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5216174618721142,
+      "grad_norm": 0.34426239132881165,
+      "learning_rate": 0.00029378357294573403,
+      "loss": 4.5404,
+      "step": 1631
+    },
+    {
+      "epoch": 0.5219372763796998,
+      "grad_norm": 0.3499269187450409,
+      "learning_rate": 0.0002934727994797944,
+      "loss": 4.6828,
+      "step": 1632
+    },
+    {
+      "epoch": 0.5222570908872853,
+      "grad_norm": 0.33449286222457886,
+      "learning_rate": 0.0002931620330214378,
+      "loss": 4.6162,
+      "step": 1633
+    },
+    {
+      "epoch": 0.522576905394871,
+      "grad_norm": 0.3505636751651764,
+      "learning_rate": 0.00029285127390430273,
+      "loss": 4.5799,
+      "step": 1634
+    },
+    {
+      "epoch": 0.5228967199024566,
+      "grad_norm": 0.3406105637550354,
+      "learning_rate": 0.00029254052246201873,
+      "loss": 4.6334,
+      "step": 1635
+    },
+    {
+      "epoch": 0.5232165344100421,
+      "grad_norm": 0.3492126762866974,
+      "learning_rate": 0.00029222977902820785,
+      "loss": 4.6696,
+      "step": 1636
+    },
+    {
+      "epoch": 0.5235363489176278,
+      "grad_norm": 0.3455886244773865,
+      "learning_rate": 0.0002919190439364835,
+      "loss": 4.5658,
+      "step": 1637
+    },
+    {
+      "epoch": 0.5238561634252134,
+      "grad_norm": 0.3499142825603485,
+      "learning_rate": 0.00029160831752044966,
+      "loss": 4.6524,
+      "step": 1638
+    },
+    {
+      "epoch": 0.524175977932799,
+      "grad_norm": 0.34150341153144836,
+      "learning_rate": 0.00029129760011370156,
+      "loss": 4.5589,
+      "step": 1639
+    },
+    {
+      "epoch": 0.5244957924403846,
+      "grad_norm": 0.36174193024635315,
+      "learning_rate": 0.00029098689204982433,
+      "loss": 4.6428,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5248156069479701,
+      "grad_norm": 0.3357899785041809,
+      "learning_rate": 0.00029067619366239327,
+      "loss": 4.5669,
+      "step": 1641
+    },
+    {
+      "epoch": 0.5251354214555558,
+      "grad_norm": 0.3400886058807373,
+      "learning_rate": 0.00029036550528497326,
+      "loss": 4.5824,
+      "step": 1642
+    },
+    {
+      "epoch": 0.5254552359631414,
+      "grad_norm": 0.32499825954437256,
+      "learning_rate": 0.0002900548272511183,
+      "loss": 4.6356,
+      "step": 1643
+    },
+    {
+      "epoch": 0.5257750504707269,
+      "grad_norm": 0.34948495030403137,
+      "learning_rate": 0.00028974415989437176,
+      "loss": 4.5922,
+      "step": 1644
+    },
+    {
+      "epoch": 0.5260948649783126,
+      "grad_norm": 0.3675726056098938,
+      "learning_rate": 0.0002894335035482649,
+      "loss": 4.6599,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5264146794858982,
+      "grad_norm": 0.3842601180076599,
+      "learning_rate": 0.00028912285854631754,
+      "loss": 4.552,
+      "step": 1646
+    },
+    {
+      "epoch": 0.5267344939934838,
+      "grad_norm": 0.33656397461891174,
+      "learning_rate": 0.0002888122252220372,
+      "loss": 4.5866,
+      "step": 1647
+    },
+    {
+      "epoch": 0.5270543085010694,
+      "grad_norm": 0.3506307005882263,
+      "learning_rate": 0.00028850160390891895,
+      "loss": 4.6307,
+      "step": 1648
+    },
+    {
+      "epoch": 0.5273741230086549,
+      "grad_norm": 0.40038666129112244,
+      "learning_rate": 0.000288190994940445,
+      "loss": 4.5926,
+      "step": 1649
+    },
+    {
+      "epoch": 0.5276939375162406,
+      "grad_norm": 0.350915789604187,
+      "learning_rate": 0.00028788039865008404,
+      "loss": 4.664,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5280137520238262,
+      "grad_norm": 0.34243419766426086,
+      "learning_rate": 0.0002875698153712915,
+      "loss": 4.5545,
+      "step": 1651
+    },
+    {
+      "epoch": 0.5283335665314118,
+      "grad_norm": 0.33768656849861145,
+      "learning_rate": 0.0002872592454375086,
+      "loss": 4.5719,
+      "step": 1652
+    },
+    {
+      "epoch": 0.5286533810389974,
+      "grad_norm": 0.34391385316848755,
+      "learning_rate": 0.00028694868918216227,
+      "loss": 4.5932,
+      "step": 1653
+    },
+    {
+      "epoch": 0.528973195546583,
+      "grad_norm": 0.3686625361442566,
+      "learning_rate": 0.0002866381469386648,
+      "loss": 4.6416,
+      "step": 1654
+    },
+    {
+      "epoch": 0.5292930100541686,
+      "grad_norm": 0.3312521278858185,
+      "learning_rate": 0.0002863276190404135,
+      "loss": 4.5455,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5296128245617542,
+      "grad_norm": 0.3539404273033142,
+      "learning_rate": 0.0002860171058207902,
+      "loss": 4.6432,
+      "step": 1656
+    },
+    {
+      "epoch": 0.5299326390693397,
+      "grad_norm": 0.34247922897338867,
+      "learning_rate": 0.0002857066076131609,
+      "loss": 4.5633,
+      "step": 1657
+    },
+    {
+      "epoch": 0.5302524535769254,
+      "grad_norm": 0.3436299264431,
+      "learning_rate": 0.00028539612475087563,
+      "loss": 4.5739,
+      "step": 1658
+    },
+    {
+      "epoch": 0.530572268084511,
+      "grad_norm": 0.36008891463279724,
+      "learning_rate": 0.0002850856575672679,
+      "loss": 4.5924,
+      "step": 1659
+    },
+    {
+      "epoch": 0.5308920825920966,
+      "grad_norm": 0.33665239810943604,
+      "learning_rate": 0.0002847752063956543,
+      "loss": 4.5593,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5312118970996822,
+      "grad_norm": 0.35827916860580444,
+      "learning_rate": 0.00028446477156933425,
+      "loss": 4.5354,
+      "step": 1661
+    },
+    {
+      "epoch": 0.5315317116072678,
+      "grad_norm": 0.33464479446411133,
+      "learning_rate": 0.0002841543534215898,
+      "loss": 4.6346,
+      "step": 1662
+    },
+    {
+      "epoch": 0.5318515261148534,
+      "grad_norm": 0.3329927623271942,
+      "learning_rate": 0.0002838439522856849,
+      "loss": 4.5882,
+      "step": 1663
+    },
+    {
+      "epoch": 0.532171340622439,
+      "grad_norm": 0.34070804715156555,
+      "learning_rate": 0.00028353356849486526,
+      "loss": 4.571,
+      "step": 1664
+    },
+    {
+      "epoch": 0.5324911551300245,
+      "grad_norm": 0.339232861995697,
+      "learning_rate": 0.00028322320238235814,
+      "loss": 4.5275,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5328109696376102,
+      "grad_norm": 0.3289678990840912,
+      "learning_rate": 0.00028291285428137146,
+      "loss": 4.6546,
+      "step": 1666
+    },
+    {
+      "epoch": 0.5331307841451958,
+      "grad_norm": 0.36252179741859436,
+      "learning_rate": 0.0002826025245250941,
+      "loss": 4.5838,
+      "step": 1667
+    },
+    {
+      "epoch": 0.5334505986527814,
+      "grad_norm": 0.3532697856426239,
+      "learning_rate": 0.00028229221344669534,
+      "loss": 4.6553,
+      "step": 1668
+    },
+    {
+      "epoch": 0.533770413160367,
+      "grad_norm": 0.33651626110076904,
+      "learning_rate": 0.0002819819213793241,
+      "loss": 4.6096,
+      "step": 1669
+    },
+    {
+      "epoch": 0.5340902276679526,
+      "grad_norm": 0.36147356033325195,
+      "learning_rate": 0.00028167164865610907,
+      "loss": 4.6065,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5344100421755382,
+      "grad_norm": 0.3495180904865265,
+      "learning_rate": 0.00028136139561015807,
+      "loss": 4.5254,
+      "step": 1671
+    },
+    {
+      "epoch": 0.5347298566831238,
+      "grad_norm": 0.3535555899143219,
+      "learning_rate": 0.00028105116257455786,
+      "loss": 4.5102,
+      "step": 1672
+    },
+    {
+      "epoch": 0.5350496711907093,
+      "grad_norm": 0.35404103994369507,
+      "learning_rate": 0.00028074094988237385,
+      "loss": 4.5907,
+      "step": 1673
+    },
+    {
+      "epoch": 0.535369485698295,
+      "grad_norm": 0.3414469361305237,
+      "learning_rate": 0.00028043075786664934,
+      "loss": 4.5869,
+      "step": 1674
+    },
+    {
+      "epoch": 0.5356893002058806,
+      "grad_norm": 0.35111403465270996,
+      "learning_rate": 0.0002801205868604057,
+      "loss": 4.5031,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5360091147134662,
+      "grad_norm": 0.3515930473804474,
+      "learning_rate": 0.0002798104371966414,
+      "loss": 4.5849,
+      "step": 1676
+    },
+    {
+      "epoch": 0.5363289292210518,
+      "grad_norm": 0.34862038493156433,
+      "learning_rate": 0.0002795003092083324,
+      "loss": 4.6823,
+      "step": 1677
+    },
+    {
+      "epoch": 0.5366487437286374,
+      "grad_norm": 0.3787916302680969,
+      "learning_rate": 0.000279190203228431,
+      "loss": 4.5382,
+      "step": 1678
+    },
+    {
+      "epoch": 0.536968558236223,
+      "grad_norm": 0.3493860960006714,
+      "learning_rate": 0.00027888011958986623,
+      "loss": 4.6592,
+      "step": 1679
+    },
+    {
+      "epoch": 0.5372883727438086,
+      "grad_norm": 0.35074809193611145,
+      "learning_rate": 0.00027857005862554307,
+      "loss": 4.5425,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5376081872513941,
+      "grad_norm": 0.35839346051216125,
+      "learning_rate": 0.00027826002066834167,
+      "loss": 4.6724,
+      "step": 1681
+    },
+    {
+      "epoch": 0.5379280017589798,
+      "grad_norm": 0.3281191885471344,
+      "learning_rate": 0.0002779500060511184,
+      "loss": 4.5329,
+      "step": 1682
+    },
+    {
+      "epoch": 0.5382478162665654,
+      "grad_norm": 0.36021775007247925,
+      "learning_rate": 0.00027764001510670354,
+      "loss": 4.4596,
+      "step": 1683
+    },
+    {
+      "epoch": 0.538567630774151,
+      "grad_norm": 0.36002200841903687,
+      "learning_rate": 0.00027733004816790267,
+      "loss": 4.5739,
+      "step": 1684
+    },
+    {
+      "epoch": 0.5388874452817366,
+      "grad_norm": 0.33183857798576355,
+      "learning_rate": 0.00027702010556749556,
+      "loss": 4.5799,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5392072597893222,
+      "grad_norm": 0.3585663437843323,
+      "learning_rate": 0.0002767101876382353,
+      "loss": 4.5949,
+      "step": 1686
+    },
+    {
+      "epoch": 0.5395270742969078,
+      "grad_norm": 0.36853551864624023,
+      "learning_rate": 0.00027640029471284923,
+      "loss": 4.553,
+      "step": 1687
+    },
+    {
+      "epoch": 0.5398468888044934,
+      "grad_norm": 0.33454397320747375,
+      "learning_rate": 0.00027609042712403725,
+      "loss": 4.6197,
+      "step": 1688
+    },
+    {
+      "epoch": 0.5401667033120789,
+      "grad_norm": 0.36533334851264954,
+      "learning_rate": 0.0002757805852044724,
+      "loss": 4.5755,
+      "step": 1689
+    },
+    {
+      "epoch": 0.5404865178196646,
+      "grad_norm": 0.3548142611980438,
+      "learning_rate": 0.00027547076928679994,
+      "loss": 4.5953,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5408063323272502,
+      "grad_norm": 0.33640915155410767,
+      "learning_rate": 0.0002751609797036373,
+      "loss": 4.5421,
+      "step": 1691
+    },
+    {
+      "epoch": 0.5411261468348358,
+      "grad_norm": 0.3413470685482025,
+      "learning_rate": 0.00027485121678757406,
+      "loss": 4.6046,
+      "step": 1692
+    },
+    {
+      "epoch": 0.5414459613424214,
+      "grad_norm": 0.34951040148735046,
+      "learning_rate": 0.0002745414808711703,
+      "loss": 4.5637,
+      "step": 1693
+    },
+    {
+      "epoch": 0.541765775850007,
+      "grad_norm": 0.3462730348110199,
+      "learning_rate": 0.00027423177228695785,
+      "loss": 4.5405,
+      "step": 1694
+    },
+    {
+      "epoch": 0.5420855903575926,
+      "grad_norm": 0.3536907732486725,
+      "learning_rate": 0.00027392209136743875,
+      "loss": 4.5413,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5424054048651782,
+      "grad_norm": 0.3455154299736023,
+      "learning_rate": 0.0002736124384450857,
+      "loss": 4.5782,
+      "step": 1696
+    },
+    {
+      "epoch": 0.5427252193727637,
+      "grad_norm": 0.35326552391052246,
+      "learning_rate": 0.0002733028138523411,
+      "loss": 4.5603,
+      "step": 1697
+    },
+    {
+      "epoch": 0.5430450338803494,
+      "grad_norm": 0.35846778750419617,
+      "learning_rate": 0.0002729932179216169,
+      "loss": 4.5149,
+      "step": 1698
+    },
+    {
+      "epoch": 0.543364848387935,
+      "grad_norm": 0.34490907192230225,
+      "learning_rate": 0.0002726836509852946,
+      "loss": 4.5543,
+      "step": 1699
+    },
+    {
+      "epoch": 0.5436846628955206,
+      "grad_norm": 0.35876429080963135,
+      "learning_rate": 0.0002723741133757242,
+      "loss": 4.5607,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5436846628955206,
+      "eval_loss": 4.5858354568481445,
+      "eval_runtime": 80.5895,
+      "eval_samples_per_second": 23.539,
+      "eval_steps_per_second": 5.894,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5440044774031062,
+      "grad_norm": 0.34980279207229614,
+      "learning_rate": 0.0002720646054252244,
+      "loss": 4.6041,
+      "step": 1701
+    },
+    {
+      "epoch": 0.5443242919106918,
+      "grad_norm": 0.3580838441848755,
+      "learning_rate": 0.000271755127466082,
+      "loss": 4.6248,
+      "step": 1702
+    },
+    {
+      "epoch": 0.5446441064182774,
+      "grad_norm": 0.34394776821136475,
+      "learning_rate": 0.0002714456798305516,
+      "loss": 4.6201,
+      "step": 1703
+    },
+    {
+      "epoch": 0.544963920925863,
+      "grad_norm": 0.35680657625198364,
+      "learning_rate": 0.0002711362628508554,
+      "loss": 4.5247,
+      "step": 1704
+    },
+    {
+      "epoch": 0.5452837354334485,
+      "grad_norm": 0.33691632747650146,
+      "learning_rate": 0.0002708268768591825,
+      "loss": 4.5839,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5456035499410342,
+      "grad_norm": 0.35764801502227783,
+      "learning_rate": 0.0002705175221876887,
+      "loss": 4.6062,
+      "step": 1706
+    },
+    {
+      "epoch": 0.5459233644486198,
+      "grad_norm": 0.37481939792633057,
+      "learning_rate": 0.00027020819916849634,
+      "loss": 4.5672,
+      "step": 1707
+    },
+    {
+      "epoch": 0.5462431789562054,
+      "grad_norm": 0.347343772649765,
+      "learning_rate": 0.0002698989081336937,
+      "loss": 4.5451,
+      "step": 1708
+    },
+    {
+      "epoch": 0.546562993463791,
+      "grad_norm": 0.3330977261066437,
+      "learning_rate": 0.00026958964941533475,
+      "loss": 4.5537,
+      "step": 1709
+    },
+    {
+      "epoch": 0.5468828079713766,
+      "grad_norm": 0.35509002208709717,
+      "learning_rate": 0.00026928042334543867,
+      "loss": 4.5283,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5472026224789622,
+      "grad_norm": 0.3351483941078186,
+      "learning_rate": 0.0002689712302559898,
+      "loss": 4.6409,
+      "step": 1711
+    },
+    {
+      "epoch": 0.5475224369865478,
+      "grad_norm": 0.3378784954547882,
+      "learning_rate": 0.0002686620704789367,
+      "loss": 4.5829,
+      "step": 1712
+    },
+    {
+      "epoch": 0.5478422514941333,
+      "grad_norm": 0.3531706929206848,
+      "learning_rate": 0.00026835294434619263,
+      "loss": 4.5944,
+      "step": 1713
+    },
+    {
+      "epoch": 0.548162066001719,
+      "grad_norm": 0.3312965929508209,
+      "learning_rate": 0.0002680438521896343,
+      "loss": 4.6036,
+      "step": 1714
+    },
+    {
+      "epoch": 0.5484818805093046,
+      "grad_norm": 0.3529874086380005,
+      "learning_rate": 0.0002677347943411023,
+      "loss": 4.6288,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5488016950168902,
+      "grad_norm": 0.34308186173439026,
+      "learning_rate": 0.0002674257711324002,
+      "loss": 4.544,
+      "step": 1716
+    },
+    {
+      "epoch": 0.5491215095244758,
+      "grad_norm": 0.33933305740356445,
+      "learning_rate": 0.0002671167828952943,
+      "loss": 4.5742,
+      "step": 1717
+    },
+    {
+      "epoch": 0.5494413240320614,
+      "grad_norm": 0.3590443730354309,
+      "learning_rate": 0.0002668078299615136,
+      "loss": 4.5491,
+      "step": 1718
+    },
+    {
+      "epoch": 0.549761138539647,
+      "grad_norm": 0.3418152332305908,
+      "learning_rate": 0.0002664989126627489,
+      "loss": 4.6603,
+      "step": 1719
+    },
+    {
+      "epoch": 0.5500809530472326,
+      "grad_norm": 0.33432820439338684,
+      "learning_rate": 0.00026619003133065305,
+      "loss": 4.6488,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5504007675548183,
+      "grad_norm": 0.36660119891166687,
+      "learning_rate": 0.0002658811862968401,
+      "loss": 4.5412,
+      "step": 1721
+    },
+    {
+      "epoch": 0.5507205820624038,
+      "grad_norm": 0.34112343192100525,
+      "learning_rate": 0.0002655723778928851,
+      "loss": 4.5507,
+      "step": 1722
+    },
+    {
+      "epoch": 0.5510403965699894,
+      "grad_norm": 0.3553185760974884,
+      "learning_rate": 0.00026526360645032405,
+      "loss": 4.5742,
+      "step": 1723
+    },
+    {
+      "epoch": 0.551360211077575,
+      "grad_norm": 0.34461116790771484,
+      "learning_rate": 0.0002649548723006527,
+      "loss": 4.6606,
+      "step": 1724
+    },
+    {
+      "epoch": 0.5516800255851606,
+      "grad_norm": 0.3275977671146393,
+      "learning_rate": 0.00026464617577532756,
+      "loss": 4.6868,
+      "step": 1725
+    },
+    {
+      "epoch": 0.5519998400927462,
+      "grad_norm": 0.340777188539505,
+      "learning_rate": 0.00026433751720576385,
+      "loss": 4.5074,
+      "step": 1726
+    },
+    {
+      "epoch": 0.5523196546003318,
+      "grad_norm": 0.3257087469100952,
+      "learning_rate": 0.00026402889692333684,
+      "loss": 4.5068,
+      "step": 1727
+    },
+    {
+      "epoch": 0.5526394691079174,
+      "grad_norm": 0.3451725244522095,
+      "learning_rate": 0.00026372031525938034,
+      "loss": 4.4852,
+      "step": 1728
+    },
+    {
+      "epoch": 0.5529592836155031,
+      "grad_norm": 0.3478621542453766,
+      "learning_rate": 0.0002634117725451865,
+      "loss": 4.5435,
+      "step": 1729
+    },
+    {
+      "epoch": 0.5532790981230886,
+      "grad_norm": 0.33903783559799194,
+      "learning_rate": 0.00026310326911200616,
+      "loss": 4.5651,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5535989126306742,
+      "grad_norm": 0.3416825830936432,
+      "learning_rate": 0.0002627948052910474,
+      "loss": 4.6645,
+      "step": 1731
+    },
+    {
+      "epoch": 0.5539187271382598,
+      "grad_norm": 0.3245784044265747,
+      "learning_rate": 0.00026248638141347614,
+      "loss": 4.538,
+      "step": 1732
+    },
+    {
+      "epoch": 0.5542385416458454,
+      "grad_norm": 0.36666932702064514,
+      "learning_rate": 0.00026217799781041567,
+      "loss": 4.6333,
+      "step": 1733
+    },
+    {
+      "epoch": 0.554558356153431,
+      "grad_norm": 0.33497753739356995,
+      "learning_rate": 0.00026186965481294526,
+      "loss": 4.4563,
+      "step": 1734
+    },
+    {
+      "epoch": 0.5548781706610166,
+      "grad_norm": 0.34304019808769226,
+      "learning_rate": 0.0002615613527521014,
+      "loss": 4.5803,
+      "step": 1735
+    },
+    {
+      "epoch": 0.5551979851686022,
+      "grad_norm": 0.3408200144767761,
+      "learning_rate": 0.00026125309195887603,
+      "loss": 4.5838,
+      "step": 1736
+    },
+    {
+      "epoch": 0.5555177996761879,
+      "grad_norm": 0.36223822832107544,
+      "learning_rate": 0.0002609448727642172,
+      "loss": 4.5038,
+      "step": 1737
+    },
+    {
+      "epoch": 0.5558376141837734,
+      "grad_norm": 0.3321765959262848,
+      "learning_rate": 0.000260636695499028,
+      "loss": 4.5712,
+      "step": 1738
+    },
+    {
+      "epoch": 0.556157428691359,
+      "grad_norm": 0.33548539876937866,
+      "learning_rate": 0.00026032856049416664,
+      "loss": 4.5174,
+      "step": 1739
+    },
+    {
+      "epoch": 0.5564772431989446,
+      "grad_norm": 0.3426455855369568,
+      "learning_rate": 0.00026002046808044617,
+      "loss": 4.6059,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5567970577065302,
+      "grad_norm": 0.34694841504096985,
+      "learning_rate": 0.0002597124185886334,
+      "loss": 4.6188,
+      "step": 1741
+    },
+    {
+      "epoch": 0.5571168722141158,
+      "grad_norm": 0.330301970243454,
+      "learning_rate": 0.0002594044123494496,
+      "loss": 4.558,
+      "step": 1742
+    },
+    {
+      "epoch": 0.5574366867217014,
+      "grad_norm": 0.3402824401855469,
+      "learning_rate": 0.00025909644969356917,
+      "loss": 4.5588,
+      "step": 1743
+    },
+    {
+      "epoch": 0.557756501229287,
+      "grad_norm": 0.3349056541919708,
+      "learning_rate": 0.00025878853095162,
+      "loss": 4.5558,
+      "step": 1744
+    },
+    {
+      "epoch": 0.5580763157368727,
+      "grad_norm": 0.3273567259311676,
+      "learning_rate": 0.00025848065645418285,
+      "loss": 4.4886,
+      "step": 1745
+    },
+    {
+      "epoch": 0.5583961302444582,
+      "grad_norm": 0.37428098917007446,
+      "learning_rate": 0.00025817282653179064,
+      "loss": 4.6228,
+      "step": 1746
+    },
+    {
+      "epoch": 0.5587159447520438,
+      "grad_norm": 0.33602315187454224,
+      "learning_rate": 0.0002578650415149289,
+      "loss": 4.5614,
+      "step": 1747
+    },
+    {
+      "epoch": 0.5590357592596295,
+      "grad_norm": 0.37221935391426086,
+      "learning_rate": 0.00025755730173403457,
+      "loss": 4.5068,
+      "step": 1748
+    },
+    {
+      "epoch": 0.559355573767215,
+      "grad_norm": 0.34931477904319763,
+      "learning_rate": 0.0002572496075194963,
+      "loss": 4.5419,
+      "step": 1749
+    },
+    {
+      "epoch": 0.5596753882748006,
+      "grad_norm": 0.33774423599243164,
+      "learning_rate": 0.00025694195920165344,
+      "loss": 4.6304,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5599952027823862,
+      "grad_norm": 0.355263888835907,
+      "learning_rate": 0.0002566343571107966,
+      "loss": 4.5588,
+      "step": 1751
+    },
+    {
+      "epoch": 0.5603150172899718,
+      "grad_norm": 0.34730982780456543,
+      "learning_rate": 0.0002563268015771664,
+      "loss": 4.5754,
+      "step": 1752
+    },
+    {
+      "epoch": 0.5606348317975575,
+      "grad_norm": 0.32993969321250916,
+      "learning_rate": 0.00025601929293095344,
+      "loss": 4.5407,
+      "step": 1753
+    },
+    {
+      "epoch": 0.560954646305143,
+      "grad_norm": 0.3502591848373413,
+      "learning_rate": 0.00025571183150229827,
+      "loss": 4.5203,
+      "step": 1754
+    },
+    {
+      "epoch": 0.5612744608127286,
+      "grad_norm": 0.38740217685699463,
+      "learning_rate": 0.00025540441762129045,
+      "loss": 4.5163,
+      "step": 1755
+    },
+    {
+      "epoch": 0.5615942753203143,
+      "grad_norm": 0.34605130553245544,
+      "learning_rate": 0.00025509705161796866,
+      "loss": 4.5708,
+      "step": 1756
+    },
+    {
+      "epoch": 0.5619140898278998,
+      "grad_norm": 0.3458990156650543,
+      "learning_rate": 0.0002547897338223202,
+      "loss": 4.5134,
+      "step": 1757
+    },
+    {
+      "epoch": 0.5622339043354854,
+      "grad_norm": 0.3720369338989258,
+      "learning_rate": 0.0002544824645642804,
+      "loss": 4.5965,
+      "step": 1758
+    },
+    {
+      "epoch": 0.562553718843071,
+      "grad_norm": 0.3522976040840149,
+      "learning_rate": 0.00025417524417373276,
+      "loss": 4.5462,
+      "step": 1759
+    },
+    {
+      "epoch": 0.5628735333506566,
+      "grad_norm": 0.35671481490135193,
+      "learning_rate": 0.00025386807298050817,
+      "loss": 4.6235,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5631933478582423,
+      "grad_norm": 0.35742253065109253,
+      "learning_rate": 0.00025356095131438464,
+      "loss": 4.6114,
+      "step": 1761
+    },
+    {
+      "epoch": 0.5635131623658278,
+      "grad_norm": 0.36183950304985046,
+      "learning_rate": 0.0002532538795050872,
+      "loss": 4.6162,
+      "step": 1762
+    },
+    {
+      "epoch": 0.5638329768734134,
+      "grad_norm": 0.33316004276275635,
+      "learning_rate": 0.0002529468578822871,
+      "loss": 4.5738,
+      "step": 1763
+    },
+    {
+      "epoch": 0.564152791380999,
+      "grad_norm": 0.3625221252441406,
+      "learning_rate": 0.00025263988677560204,
+      "loss": 4.4923,
+      "step": 1764
+    },
+    {
+      "epoch": 0.5644726058885846,
+      "grad_norm": 0.3555978536605835,
+      "learning_rate": 0.0002523329665145951,
+      "loss": 4.4812,
+      "step": 1765
+    },
+    {
+      "epoch": 0.5647924203961702,
+      "grad_norm": 0.38111501932144165,
+      "learning_rate": 0.00025202609742877515,
+      "loss": 4.5462,
+      "step": 1766
+    },
+    {
+      "epoch": 0.5651122349037558,
+      "grad_norm": 0.37184572219848633,
+      "learning_rate": 0.0002517192798475958,
+      "loss": 4.4111,
+      "step": 1767
+    },
+    {
+      "epoch": 0.5654320494113414,
+      "grad_norm": 0.3523752689361572,
+      "learning_rate": 0.00025141251410045547,
+      "loss": 4.4912,
+      "step": 1768
+    },
+    {
+      "epoch": 0.5657518639189271,
+      "grad_norm": 0.3493099808692932,
+      "learning_rate": 0.0002511058005166972,
+      "loss": 4.4431,
+      "step": 1769
+    },
+    {
+      "epoch": 0.5660716784265126,
+      "grad_norm": 0.3703976571559906,
+      "learning_rate": 0.0002507991394256075,
+      "loss": 4.5189,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5663914929340982,
+      "grad_norm": 0.3478959798812866,
+      "learning_rate": 0.00025049253115641713,
+      "loss": 4.5718,
+      "step": 1771
+    },
+    {
+      "epoch": 0.5667113074416839,
+      "grad_norm": 0.3373987674713135,
+      "learning_rate": 0.00025018597603829944,
+      "loss": 4.5871,
+      "step": 1772
+    },
+    {
+      "epoch": 0.5670311219492694,
+      "grad_norm": 0.371184378862381,
+      "learning_rate": 0.0002498794744003715,
+      "loss": 4.5144,
+      "step": 1773
+    },
+    {
+      "epoch": 0.567350936456855,
+      "grad_norm": 0.35939913988113403,
+      "learning_rate": 0.0002495730265716922,
+      "loss": 4.5341,
+      "step": 1774
+    },
+    {
+      "epoch": 0.5676707509644406,
+      "grad_norm": 0.3704868257045746,
+      "learning_rate": 0.00024926663288126323,
+      "loss": 4.5542,
+      "step": 1775
+    },
+    {
+      "epoch": 0.5679905654720262,
+      "grad_norm": 0.3416334390640259,
+      "learning_rate": 0.00024896029365802807,
+      "loss": 4.4877,
+      "step": 1776
+    },
+    {
+      "epoch": 0.5683103799796119,
+      "grad_norm": 0.36683937907218933,
+      "learning_rate": 0.0002486540092308713,
+      "loss": 4.6201,
+      "step": 1777
+    },
+    {
+      "epoch": 0.5686301944871974,
+      "grad_norm": 0.34621694684028625,
+      "learning_rate": 0.00024834777992861935,
+      "loss": 4.4863,
+      "step": 1778
+    },
+    {
+      "epoch": 0.568950008994783,
+      "grad_norm": 0.34777936339378357,
+      "learning_rate": 0.0002480416060800387,
+      "loss": 4.5469,
+      "step": 1779
+    },
+    {
+      "epoch": 0.5692698235023687,
+      "grad_norm": 0.34788841009140015,
+      "learning_rate": 0.0002477354880138369,
+      "loss": 4.5659,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5695896380099542,
+      "grad_norm": 0.3381100296974182,
+      "learning_rate": 0.0002474294260586615,
+      "loss": 4.5329,
+      "step": 1781
+    },
+    {
+      "epoch": 0.5699094525175398,
+      "grad_norm": 0.348910927772522,
+      "learning_rate": 0.00024712342054309945,
+      "loss": 4.5827,
+      "step": 1782
+    },
+    {
+      "epoch": 0.5702292670251254,
+      "grad_norm": 0.3376369774341583,
+      "learning_rate": 0.00024681747179567775,
+      "loss": 4.5388,
+      "step": 1783
+    },
+    {
+      "epoch": 0.570549081532711,
+      "grad_norm": 0.3344530761241913,
+      "learning_rate": 0.0002465115801448617,
+      "loss": 4.5565,
+      "step": 1784
+    },
+    {
+      "epoch": 0.5708688960402967,
+      "grad_norm": 0.3593077063560486,
+      "learning_rate": 0.0002462057459190559,
+      "loss": 4.5538,
+      "step": 1785
+    },
+    {
+      "epoch": 0.5711887105478822,
+      "grad_norm": 0.33933642506599426,
+      "learning_rate": 0.0002458999694466029,
+      "loss": 4.5417,
+      "step": 1786
+    },
+    {
+      "epoch": 0.5715085250554678,
+      "grad_norm": 0.3367229998111725,
+      "learning_rate": 0.0002455942510557836,
+      "loss": 4.5613,
+      "step": 1787
+    },
+    {
+      "epoch": 0.5718283395630535,
+      "grad_norm": 0.3529382646083832,
+      "learning_rate": 0.0002452885910748163,
+      "loss": 4.5653,
+      "step": 1788
+    },
+    {
+      "epoch": 0.572148154070639,
+      "grad_norm": 0.34044578671455383,
+      "learning_rate": 0.0002449829898318566,
+      "loss": 4.5278,
+      "step": 1789
+    },
+    {
+      "epoch": 0.5724679685782247,
+      "grad_norm": 0.3467525839805603,
+      "learning_rate": 0.0002446774476549972,
+      "loss": 4.57,
+      "step": 1790
+    },
+    {
+      "epoch": 0.5727877830858102,
+      "grad_norm": 0.344215989112854,
+      "learning_rate": 0.00024437196487226716,
+      "loss": 4.4932,
+      "step": 1791
+    },
+    {
+      "epoch": 0.5731075975933958,
+      "grad_norm": 0.3458828032016754,
+      "learning_rate": 0.00024406654181163197,
+      "loss": 4.5249,
+      "step": 1792
+    },
+    {
+      "epoch": 0.5734274121009815,
+      "grad_norm": 0.3492017984390259,
+      "learning_rate": 0.000243761178800993,
+      "loss": 4.5409,
+      "step": 1793
+    },
+    {
+      "epoch": 0.573747226608567,
+      "grad_norm": 0.3367227613925934,
+      "learning_rate": 0.00024345587616818692,
+      "loss": 4.4586,
+      "step": 1794
+    },
+    {
+      "epoch": 0.5740670411161526,
+      "grad_norm": 0.3362981379032135,
+      "learning_rate": 0.00024315063424098585,
+      "loss": 4.534,
+      "step": 1795
+    },
+    {
+      "epoch": 0.5743868556237383,
+      "grad_norm": 0.3493768870830536,
+      "learning_rate": 0.00024284545334709657,
+      "loss": 4.6799,
+      "step": 1796
+    },
+    {
+      "epoch": 0.5747066701313238,
+      "grad_norm": 0.35034531354904175,
+      "learning_rate": 0.00024254033381416047,
+      "loss": 4.4829,
+      "step": 1797
+    },
+    {
+      "epoch": 0.5750264846389095,
+      "grad_norm": 0.326031357049942,
+      "learning_rate": 0.00024223527596975284,
+      "loss": 4.5284,
+      "step": 1798
+    },
+    {
+      "epoch": 0.575346299146495,
+      "grad_norm": 0.3683282434940338,
+      "learning_rate": 0.000241930280141383,
+      "loss": 4.5942,
+      "step": 1799
+    },
+    {
+      "epoch": 0.5756661136540806,
+      "grad_norm": 0.3724258542060852,
+      "learning_rate": 0.00024162534665649358,
+      "loss": 4.62,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5756661136540806,
+      "eval_loss": 4.554470062255859,
+      "eval_runtime": 97.1859,
+      "eval_samples_per_second": 19.519,
+      "eval_steps_per_second": 4.888,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5759859281616663,
+      "grad_norm": 0.3389361500740051,
+      "learning_rate": 0.0002413204758424602,
+      "loss": 4.4803,
+      "step": 1801
+    },
+    {
+      "epoch": 0.5763057426692518,
+      "grad_norm": 0.3431956171989441,
+      "learning_rate": 0.00024101566802659137,
+      "loss": 4.5875,
+      "step": 1802
+    },
+    {
+      "epoch": 0.5766255571768374,
+      "grad_norm": 0.3464840352535248,
+      "learning_rate": 0.0002407109235361277,
+      "loss": 4.5123,
+      "step": 1803
+    },
+    {
+      "epoch": 0.5769453716844231,
+      "grad_norm": 0.3432200849056244,
+      "learning_rate": 0.0002404062426982421,
+      "loss": 4.5653,
+      "step": 1804
+    },
+    {
+      "epoch": 0.5772651861920086,
+      "grad_norm": 0.3582673668861389,
+      "learning_rate": 0.00024010162584003905,
+      "loss": 4.5433,
+      "step": 1805
+    },
+    {
+      "epoch": 0.5775850006995943,
+      "grad_norm": 0.3397902846336365,
+      "learning_rate": 0.0002397970732885542,
+      "loss": 4.6512,
+      "step": 1806
+    },
+    {
+      "epoch": 0.5779048152071798,
+      "grad_norm": 0.3517637252807617,
+      "learning_rate": 0.0002394925853707544,
+      "loss": 4.5119,
+      "step": 1807
+    },
+    {
+      "epoch": 0.5782246297147654,
+      "grad_norm": 0.3295648992061615,
+      "learning_rate": 0.00023918816241353684,
+      "loss": 4.5526,
+      "step": 1808
+    },
+    {
+      "epoch": 0.5785444442223511,
+      "grad_norm": 0.36371392011642456,
+      "learning_rate": 0.0002388838047437293,
+      "loss": 4.5532,
+      "step": 1809
+    },
+    {
+      "epoch": 0.5788642587299366,
+      "grad_norm": 0.38383764028549194,
+      "learning_rate": 0.00023857951268808905,
+      "loss": 4.5058,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5791840732375222,
+      "grad_norm": 0.34932655096054077,
+      "learning_rate": 0.00023827528657330331,
+      "loss": 4.6644,
+      "step": 1811
+    },
+    {
+      "epoch": 0.5795038877451079,
+      "grad_norm": 0.3224482536315918,
+      "learning_rate": 0.00023797112672598833,
+      "loss": 4.5296,
+      "step": 1812
+    },
+    {
+      "epoch": 0.5798237022526934,
+      "grad_norm": 0.3408745527267456,
+      "learning_rate": 0.0002376670334726891,
+      "loss": 4.5024,
+      "step": 1813
+    },
+    {
+      "epoch": 0.5801435167602791,
+      "grad_norm": 0.3669995963573456,
+      "learning_rate": 0.00023736300713987946,
+      "loss": 4.6073,
+      "step": 1814
+    },
+    {
+      "epoch": 0.5804633312678646,
+      "grad_norm": 0.336173951625824,
+      "learning_rate": 0.00023705904805396095,
+      "loss": 4.5538,
+      "step": 1815
+    },
+    {
+      "epoch": 0.5807831457754502,
+      "grad_norm": 0.33492520451545715,
+      "learning_rate": 0.00023675515654126327,
+      "loss": 4.5486,
+      "step": 1816
+    },
+    {
+      "epoch": 0.5811029602830359,
+      "grad_norm": 0.3570772707462311,
+      "learning_rate": 0.00023645133292804352,
+      "loss": 4.5727,
+      "step": 1817
+    },
+    {
+      "epoch": 0.5814227747906214,
+      "grad_norm": 0.35234716534614563,
+      "learning_rate": 0.0002361475775404857,
+      "loss": 4.4869,
+      "step": 1818
+    },
+    {
+      "epoch": 0.581742589298207,
+      "grad_norm": 0.34041503071784973,
+      "learning_rate": 0.00023584389070470087,
+      "loss": 4.5867,
+      "step": 1819
+    },
+    {
+      "epoch": 0.5820624038057927,
+      "grad_norm": 0.3548620045185089,
+      "learning_rate": 0.000235540272746726,
+      "loss": 4.55,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5823822183133782,
+      "grad_norm": 0.34405526518821716,
+      "learning_rate": 0.00023523672399252492,
+      "loss": 4.5571,
+      "step": 1821
+    },
+    {
+      "epoch": 0.5827020328209639,
+      "grad_norm": 0.33513495326042175,
+      "learning_rate": 0.00023493324476798618,
+      "loss": 4.5564,
+      "step": 1822
+    },
+    {
+      "epoch": 0.5830218473285494,
+      "grad_norm": 0.34113237261772156,
+      "learning_rate": 0.0002346298353989245,
+      "loss": 4.4995,
+      "step": 1823
+    },
+    {
+      "epoch": 0.583341661836135,
+      "grad_norm": 0.340901643037796,
+      "learning_rate": 0.00023432649621107928,
+      "loss": 4.6311,
+      "step": 1824
+    },
+    {
+      "epoch": 0.5836614763437207,
+      "grad_norm": 0.3488660156726837,
+      "learning_rate": 0.00023402322753011433,
+      "loss": 4.4622,
+      "step": 1825
+    },
+    {
+      "epoch": 0.5839812908513062,
+      "grad_norm": 0.3383903503417969,
+      "learning_rate": 0.0002337200296816184,
+      "loss": 4.5033,
+      "step": 1826
+    },
+    {
+      "epoch": 0.5843011053588918,
+      "grad_norm": 0.35267290472984314,
+      "learning_rate": 0.00023341690299110338,
+      "loss": 4.5771,
+      "step": 1827
+    },
+    {
+      "epoch": 0.5846209198664775,
+      "grad_norm": 0.34412881731987,
+      "learning_rate": 0.0002331138477840054,
+      "loss": 4.4763,
+      "step": 1828
+    },
+    {
+      "epoch": 0.584940734374063,
+      "grad_norm": 0.3485495448112488,
+      "learning_rate": 0.00023281086438568384,
+      "loss": 4.5253,
+      "step": 1829
+    },
+    {
+      "epoch": 0.5852605488816487,
+      "grad_norm": 0.337989866733551,
+      "learning_rate": 0.0002325079531214204,
+      "loss": 4.5676,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5855803633892342,
+      "grad_norm": 0.37945863604545593,
+      "learning_rate": 0.00023220511431642008,
+      "loss": 4.5198,
+      "step": 1831
+    },
+    {
+      "epoch": 0.5859001778968198,
+      "grad_norm": 0.33420321345329285,
+      "learning_rate": 0.00023190234829580943,
+      "loss": 4.5406,
+      "step": 1832
+    },
+    {
+      "epoch": 0.5862199924044055,
+      "grad_norm": 0.33344873785972595,
+      "learning_rate": 0.00023159965538463738,
+      "loss": 4.4691,
+      "step": 1833
+    },
+    {
+      "epoch": 0.586539806911991,
+      "grad_norm": 0.34540778398513794,
+      "learning_rate": 0.00023129703590787394,
+      "loss": 4.4858,
+      "step": 1834
+    },
+    {
+      "epoch": 0.5868596214195766,
+      "grad_norm": 0.33878621459007263,
+      "learning_rate": 0.0002309944901904107,
+      "loss": 4.5232,
+      "step": 1835
+    },
+    {
+      "epoch": 0.5871794359271623,
+      "grad_norm": 0.35938650369644165,
+      "learning_rate": 0.00023069201855705973,
+      "loss": 4.5278,
+      "step": 1836
+    },
+    {
+      "epoch": 0.5874992504347478,
+      "grad_norm": 0.35607171058654785,
+      "learning_rate": 0.00023038962133255366,
+      "loss": 4.5165,
+      "step": 1837
+    },
+    {
+      "epoch": 0.5878190649423335,
+      "grad_norm": 0.35275357961654663,
+      "learning_rate": 0.00023008729884154542,
+      "loss": 4.5,
+      "step": 1838
+    },
+    {
+      "epoch": 0.588138879449919,
+      "grad_norm": 0.3373669385910034,
+      "learning_rate": 0.00022978505140860736,
+      "loss": 4.5829,
+      "step": 1839
+    },
+    {
+      "epoch": 0.5884586939575046,
+      "grad_norm": 0.3494403660297394,
+      "learning_rate": 0.00022948287935823153,
+      "loss": 4.542,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5887785084650903,
+      "grad_norm": 0.33572816848754883,
+      "learning_rate": 0.00022918078301482897,
+      "loss": 4.4934,
+      "step": 1841
+    },
+    {
+      "epoch": 0.5890983229726758,
+      "grad_norm": 0.3375544846057892,
+      "learning_rate": 0.00022887876270272938,
+      "loss": 4.5353,
+      "step": 1842
+    },
+    {
+      "epoch": 0.5894181374802614,
+      "grad_norm": 0.3462027907371521,
+      "learning_rate": 0.0002285768187461809,
+      "loss": 4.5498,
+      "step": 1843
+    },
+    {
+      "epoch": 0.5897379519878471,
+      "grad_norm": 0.33625560998916626,
+      "learning_rate": 0.00022827495146934964,
+      "loss": 4.4935,
+      "step": 1844
+    },
+    {
+      "epoch": 0.5900577664954326,
+      "grad_norm": 0.3321734070777893,
+      "learning_rate": 0.00022797316119631952,
+      "loss": 4.5493,
+      "step": 1845
+    },
+    {
+      "epoch": 0.5903775810030183,
+      "grad_norm": 0.3436463475227356,
+      "learning_rate": 0.00022767144825109153,
+      "loss": 4.4957,
+      "step": 1846
+    },
+    {
+      "epoch": 0.5906973955106039,
+      "grad_norm": 0.34126192331314087,
+      "learning_rate": 0.00022736981295758393,
+      "loss": 4.4996,
+      "step": 1847
+    },
+    {
+      "epoch": 0.5910172100181894,
+      "grad_norm": 0.33446893095970154,
+      "learning_rate": 0.00022706825563963148,
+      "loss": 4.5688,
+      "step": 1848
+    },
+    {
+      "epoch": 0.5913370245257751,
+      "grad_norm": 0.3394505977630615,
+      "learning_rate": 0.00022676677662098512,
+      "loss": 4.5422,
+      "step": 1849
+    },
+    {
+      "epoch": 0.5916568390333606,
+      "grad_norm": 0.3588700592517853,
+      "learning_rate": 0.00022646537622531197,
+      "loss": 4.5228,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5919766535409462,
+      "grad_norm": 0.33262884616851807,
+      "learning_rate": 0.00022616405477619448,
+      "loss": 4.5603,
+      "step": 1851
+    },
+    {
+      "epoch": 0.5922964680485319,
+      "grad_norm": 0.3373951315879822,
+      "learning_rate": 0.00022586281259713055,
+      "loss": 4.501,
+      "step": 1852
+    },
+    {
+      "epoch": 0.5926162825561174,
+      "grad_norm": 0.3469450771808624,
+      "learning_rate": 0.00022556165001153295,
+      "loss": 4.5772,
+      "step": 1853
+    },
+    {
+      "epoch": 0.5929360970637031,
+      "grad_norm": 0.3388214707374573,
+      "learning_rate": 0.0002252605673427288,
+      "loss": 4.5937,
+      "step": 1854
+    },
+    {
+      "epoch": 0.5932559115712887,
+      "grad_norm": 0.33294713497161865,
+      "learning_rate": 0.0002249595649139597,
+      "loss": 4.5361,
+      "step": 1855
+    },
+    {
+      "epoch": 0.5935757260788742,
+      "grad_norm": 0.33870381116867065,
+      "learning_rate": 0.0002246586430483809,
+      "loss": 4.4634,
+      "step": 1856
+    },
+    {
+      "epoch": 0.5938955405864599,
+      "grad_norm": 0.33916759490966797,
+      "learning_rate": 0.00022435780206906132,
+      "loss": 4.6296,
+      "step": 1857
+    },
+    {
+      "epoch": 0.5942153550940454,
+      "grad_norm": 0.33612513542175293,
+      "learning_rate": 0.00022405704229898278,
+      "loss": 4.5925,
+      "step": 1858
+    },
+    {
+      "epoch": 0.5945351696016311,
+      "grad_norm": 0.35525017976760864,
+      "learning_rate": 0.00022375636406104022,
+      "loss": 4.5298,
+      "step": 1859
+    },
+    {
+      "epoch": 0.5948549841092167,
+      "grad_norm": 0.35198476910591125,
+      "learning_rate": 0.00022345576767804085,
+      "loss": 4.5011,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5951747986168022,
+      "grad_norm": 0.3347802460193634,
+      "learning_rate": 0.00022315525347270412,
+      "loss": 4.5497,
+      "step": 1861
+    },
+    {
+      "epoch": 0.5954946131243879,
+      "grad_norm": 0.332305371761322,
+      "learning_rate": 0.00022285482176766122,
+      "loss": 4.4805,
+      "step": 1862
+    },
+    {
+      "epoch": 0.5958144276319735,
+      "grad_norm": 0.35164013504981995,
+      "learning_rate": 0.00022255447288545453,
+      "loss": 4.5509,
+      "step": 1863
+    },
+    {
+      "epoch": 0.596134242139559,
+      "grad_norm": 0.3494516909122467,
+      "learning_rate": 0.00022225420714853798,
+      "loss": 4.505,
+      "step": 1864
+    },
+    {
+      "epoch": 0.5964540566471447,
+      "grad_norm": 0.349423885345459,
+      "learning_rate": 0.00022195402487927592,
+      "loss": 4.5237,
+      "step": 1865
+    },
+    {
+      "epoch": 0.5967738711547302,
+      "grad_norm": 0.3518355190753937,
+      "learning_rate": 0.00022165392639994307,
+      "loss": 4.4976,
+      "step": 1866
+    },
+    {
+      "epoch": 0.5970936856623159,
+      "grad_norm": 0.34486064314842224,
+      "learning_rate": 0.00022135391203272441,
+      "loss": 4.4665,
+      "step": 1867
+    },
+    {
+      "epoch": 0.5974135001699015,
+      "grad_norm": 0.3417609632015228,
+      "learning_rate": 0.00022105398209971424,
+      "loss": 4.5459,
+      "step": 1868
+    },
+    {
+      "epoch": 0.597733314677487,
+      "grad_norm": 0.35042083263397217,
+      "learning_rate": 0.00022075413692291678,
+      "loss": 4.4606,
+      "step": 1869
+    },
+    {
+      "epoch": 0.5980531291850727,
+      "grad_norm": 0.3334195613861084,
+      "learning_rate": 0.00022045437682424458,
+      "loss": 4.5156,
+      "step": 1870
+    },
+    {
+      "epoch": 0.5983729436926583,
+      "grad_norm": 0.3369036018848419,
+      "learning_rate": 0.00022015470212551942,
+      "loss": 4.5203,
+      "step": 1871
+    },
+    {
+      "epoch": 0.5986927582002438,
+      "grad_norm": 0.35034123063087463,
+      "learning_rate": 0.00021985511314847128,
+      "loss": 4.5367,
+      "step": 1872
+    },
+    {
+      "epoch": 0.5990125727078295,
+      "grad_norm": 0.34471485018730164,
+      "learning_rate": 0.00021955561021473765,
+      "loss": 4.5716,
+      "step": 1873
+    },
+    {
+      "epoch": 0.599332387215415,
+      "grad_norm": 0.3398720324039459,
+      "learning_rate": 0.0002192561936458644,
+      "loss": 4.4874,
+      "step": 1874
+    },
+    {
+      "epoch": 0.5996522017230007,
+      "grad_norm": 0.3795250654220581,
+      "learning_rate": 0.00021895686376330396,
+      "loss": 4.4272,
+      "step": 1875
+    },
+    {
+      "epoch": 0.5999720162305863,
+      "grad_norm": 0.34881579875946045,
+      "learning_rate": 0.00021865762088841607,
+      "loss": 4.4228,
+      "step": 1876
+    },
+    {
+      "epoch": 0.6002918307381718,
+      "grad_norm": 0.3544309437274933,
+      "learning_rate": 0.00021835846534246726,
+      "loss": 4.5057,
+      "step": 1877
+    },
+    {
+      "epoch": 0.6006116452457575,
+      "grad_norm": 0.39712315797805786,
+      "learning_rate": 0.00021805939744662964,
+      "loss": 4.5367,
+      "step": 1878
+    },
+    {
+      "epoch": 0.6009314597533431,
+      "grad_norm": 0.38285964727401733,
+      "learning_rate": 0.00021776041752198202,
+      "loss": 4.5301,
+      "step": 1879
+    },
+    {
+      "epoch": 0.6012512742609286,
+      "grad_norm": 0.3336244523525238,
+      "learning_rate": 0.00021746152588950809,
+      "loss": 4.5421,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6015710887685143,
+      "grad_norm": 0.3482455313205719,
+      "learning_rate": 0.00021716272287009713,
+      "loss": 4.583,
+      "step": 1881
+    },
+    {
+      "epoch": 0.6018909032760998,
+      "grad_norm": 0.33469104766845703,
+      "learning_rate": 0.00021686400878454312,
+      "loss": 4.3977,
+      "step": 1882
+    },
+    {
+      "epoch": 0.6022107177836855,
+      "grad_norm": 0.34940576553344727,
+      "learning_rate": 0.0002165653839535447,
+      "loss": 4.4293,
+      "step": 1883
+    },
+    {
+      "epoch": 0.6025305322912711,
+      "grad_norm": 0.34258222579956055,
+      "learning_rate": 0.00021626684869770462,
+      "loss": 4.4359,
+      "step": 1884
+    },
+    {
+      "epoch": 0.6028503467988566,
+      "grad_norm": 0.3487623631954193,
+      "learning_rate": 0.00021596840333752934,
+      "loss": 4.5172,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6031701613064423,
+      "grad_norm": 0.3448670208454132,
+      "learning_rate": 0.00021567004819342907,
+      "loss": 4.6143,
+      "step": 1886
+    },
+    {
+      "epoch": 0.6034899758140279,
+      "grad_norm": 0.36221206188201904,
+      "learning_rate": 0.00021537178358571686,
+      "loss": 4.4608,
+      "step": 1887
+    },
+    {
+      "epoch": 0.6038097903216134,
+      "grad_norm": 0.3512362241744995,
+      "learning_rate": 0.00021507360983460882,
+      "loss": 4.5247,
+      "step": 1888
+    },
+    {
+      "epoch": 0.6041296048291991,
+      "grad_norm": 0.38084450364112854,
+      "learning_rate": 0.0002147755272602234,
+      "loss": 4.4477,
+      "step": 1889
+    },
+    {
+      "epoch": 0.6044494193367846,
+      "grad_norm": 0.35751184821128845,
+      "learning_rate": 0.00021447753618258116,
+      "loss": 4.5502,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6047692338443703,
+      "grad_norm": 0.3478350043296814,
+      "learning_rate": 0.00021417963692160448,
+      "loss": 4.4548,
+      "step": 1891
+    },
+    {
+      "epoch": 0.6050890483519559,
+      "grad_norm": 0.36147207021713257,
+      "learning_rate": 0.00021388182979711703,
+      "loss": 4.4134,
+      "step": 1892
+    },
+    {
+      "epoch": 0.6054088628595414,
+      "grad_norm": 0.34922581911087036,
+      "learning_rate": 0.0002135841151288438,
+      "loss": 4.5662,
+      "step": 1893
+    },
+    {
+      "epoch": 0.6057286773671271,
+      "grad_norm": 0.33871176838874817,
+      "learning_rate": 0.00021328649323641022,
+      "loss": 4.4339,
+      "step": 1894
+    },
+    {
+      "epoch": 0.6060484918747127,
+      "grad_norm": 0.3543529808521271,
+      "learning_rate": 0.00021298896443934238,
+      "loss": 4.4614,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6063683063822982,
+      "grad_norm": 0.3440582752227783,
+      "learning_rate": 0.00021269152905706637,
+      "loss": 4.5051,
+      "step": 1896
+    },
+    {
+      "epoch": 0.6066881208898839,
+      "grad_norm": 0.3369908332824707,
+      "learning_rate": 0.00021239418740890786,
+      "loss": 4.5262,
+      "step": 1897
+    },
+    {
+      "epoch": 0.6070079353974694,
+      "grad_norm": 0.3528311848640442,
+      "learning_rate": 0.000212096939814092,
+      "loss": 4.531,
+      "step": 1898
+    },
+    {
+      "epoch": 0.6073277499050551,
+      "grad_norm": 0.3660440146923065,
+      "learning_rate": 0.00021179978659174284,
+      "loss": 4.4864,
+      "step": 1899
+    },
+    {
+      "epoch": 0.6076475644126407,
+      "grad_norm": 0.33674898743629456,
+      "learning_rate": 0.00021150272806088333,
+      "loss": 4.4465,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6076475644126407,
+      "eval_loss": 4.525420665740967,
+      "eval_runtime": 90.4335,
+      "eval_samples_per_second": 20.977,
+      "eval_steps_per_second": 5.252,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6079673789202262,
+      "grad_norm": 0.36991459131240845,
+      "learning_rate": 0.00021120576454043463,
+      "loss": 4.4703,
+      "step": 1901
+    },
+    {
+      "epoch": 0.6082871934278119,
+      "grad_norm": 0.3455876410007477,
+      "learning_rate": 0.00021090889634921585,
+      "loss": 4.6068,
+      "step": 1902
+    },
+    {
+      "epoch": 0.6086070079353975,
+      "grad_norm": 0.34351128339767456,
+      "learning_rate": 0.00021061212380594382,
+      "loss": 4.5503,
+      "step": 1903
+    },
+    {
+      "epoch": 0.608926822442983,
+      "grad_norm": 0.3383599519729614,
+      "learning_rate": 0.00021031544722923266,
+      "loss": 4.6339,
+      "step": 1904
+    },
+    {
+      "epoch": 0.6092466369505687,
+      "grad_norm": 0.35959696769714355,
+      "learning_rate": 0.0002100188669375935,
+      "loss": 4.5512,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6095664514581542,
+      "grad_norm": 0.3345339298248291,
+      "learning_rate": 0.000209722383249434,
+      "loss": 4.4128,
+      "step": 1906
+    },
+    {
+      "epoch": 0.6098862659657399,
+      "grad_norm": 0.34817659854888916,
+      "learning_rate": 0.0002094259964830582,
+      "loss": 4.4591,
+      "step": 1907
+    },
+    {
+      "epoch": 0.6102060804733255,
+      "grad_norm": 0.3305789530277252,
+      "learning_rate": 0.0002091297069566662,
+      "loss": 4.3654,
+      "step": 1908
+    },
+    {
+      "epoch": 0.610525894980911,
+      "grad_norm": 0.3378927707672119,
+      "learning_rate": 0.00020883351498835335,
+      "loss": 4.5551,
+      "step": 1909
+    },
+    {
+      "epoch": 0.6108457094884967,
+      "grad_norm": 0.34520092606544495,
+      "learning_rate": 0.00020853742089611067,
+      "loss": 4.601,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6111655239960823,
+      "grad_norm": 0.34389182925224304,
+      "learning_rate": 0.00020824142499782368,
+      "loss": 4.5051,
+      "step": 1911
+    },
+    {
+      "epoch": 0.6114853385036678,
+      "grad_norm": 0.33250677585601807,
+      "learning_rate": 0.00020794552761127283,
+      "loss": 4.3949,
+      "step": 1912
+    },
+    {
+      "epoch": 0.6118051530112535,
+      "grad_norm": 0.3374512493610382,
+      "learning_rate": 0.0002076497290541328,
+      "loss": 4.5135,
+      "step": 1913
+    },
+    {
+      "epoch": 0.612124967518839,
+      "grad_norm": 0.332403302192688,
+      "learning_rate": 0.0002073540296439719,
+      "loss": 4.4235,
+      "step": 1914
+    },
+    {
+      "epoch": 0.6124447820264247,
+      "grad_norm": 0.3468950390815735,
+      "learning_rate": 0.00020705842969825225,
+      "loss": 4.6296,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6127645965340103,
+      "grad_norm": 0.34265509247779846,
+      "learning_rate": 0.00020676292953432886,
+      "loss": 4.4676,
+      "step": 1916
+    },
+    {
+      "epoch": 0.6130844110415958,
+      "grad_norm": 0.342316597700119,
+      "learning_rate": 0.00020646752946945016,
+      "loss": 4.4868,
+      "step": 1917
+    },
+    {
+      "epoch": 0.6134042255491815,
+      "grad_norm": 0.3344189524650574,
+      "learning_rate": 0.00020617222982075646,
+      "loss": 4.4635,
+      "step": 1918
+    },
+    {
+      "epoch": 0.6137240400567671,
+      "grad_norm": 0.33452966809272766,
+      "learning_rate": 0.0002058770309052808,
+      "loss": 4.4877,
+      "step": 1919
+    },
+    {
+      "epoch": 0.6140438545643526,
+      "grad_norm": 0.33984753489494324,
+      "learning_rate": 0.00020558193303994797,
+      "loss": 4.4969,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6143636690719383,
+      "grad_norm": 0.32579725980758667,
+      "learning_rate": 0.0002052869365415738,
+      "loss": 4.5268,
+      "step": 1921
+    },
+    {
+      "epoch": 0.6146834835795238,
+      "grad_norm": 0.3360597789287567,
+      "learning_rate": 0.00020499204172686616,
+      "loss": 4.5063,
+      "step": 1922
+    },
+    {
+      "epoch": 0.6150032980871095,
+      "grad_norm": 0.33437472581863403,
+      "learning_rate": 0.00020469724891242281,
+      "loss": 4.4823,
+      "step": 1923
+    },
+    {
+      "epoch": 0.6153231125946951,
+      "grad_norm": 0.34467798471450806,
+      "learning_rate": 0.00020440255841473252,
+      "loss": 4.4709,
+      "step": 1924
+    },
+    {
+      "epoch": 0.6156429271022806,
+      "grad_norm": 0.39499175548553467,
+      "learning_rate": 0.0002041079705501745,
+      "loss": 4.4726,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6159627416098663,
+      "grad_norm": 0.3400439918041229,
+      "learning_rate": 0.00020381348563501694,
+      "loss": 4.461,
+      "step": 1926
+    },
+    {
+      "epoch": 0.6162825561174519,
+      "grad_norm": 0.3481975495815277,
+      "learning_rate": 0.00020351910398541835,
+      "loss": 4.5781,
+      "step": 1927
+    },
+    {
+      "epoch": 0.6166023706250375,
+      "grad_norm": 0.3361368179321289,
+      "learning_rate": 0.00020322482591742576,
+      "loss": 4.4866,
+      "step": 1928
+    },
+    {
+      "epoch": 0.6169221851326231,
+      "grad_norm": 0.3281785845756531,
+      "learning_rate": 0.0002029306517469754,
+      "loss": 4.4865,
+      "step": 1929
+    },
+    {
+      "epoch": 0.6172419996402086,
+      "grad_norm": 0.35954657196998596,
+      "learning_rate": 0.00020263658178989162,
+      "loss": 4.5366,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6175618141477943,
+      "grad_norm": 0.35379758477211,
+      "learning_rate": 0.0002023426163618872,
+      "loss": 4.4782,
+      "step": 1931
+    },
+    {
+      "epoch": 0.6178816286553799,
+      "grad_norm": 0.33757951855659485,
+      "learning_rate": 0.00020204875577856256,
+      "loss": 4.5129,
+      "step": 1932
+    },
+    {
+      "epoch": 0.6182014431629654,
+      "grad_norm": 0.35059356689453125,
+      "learning_rate": 0.00020175500035540545,
+      "loss": 4.4797,
+      "step": 1933
+    },
+    {
+      "epoch": 0.6185212576705511,
+      "grad_norm": 0.3704817295074463,
+      "learning_rate": 0.00020146135040779097,
+      "loss": 4.4134,
+      "step": 1934
+    },
+    {
+      "epoch": 0.6188410721781367,
+      "grad_norm": 0.33202195167541504,
+      "learning_rate": 0.0002011678062509807,
+      "loss": 4.5071,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6191608866857223,
+      "grad_norm": 0.3939604163169861,
+      "learning_rate": 0.00020087436820012287,
+      "loss": 4.5482,
+      "step": 1936
+    },
+    {
+      "epoch": 0.6194807011933079,
+      "grad_norm": 0.34221702814102173,
+      "learning_rate": 0.0002005810365702517,
+      "loss": 4.4425,
+      "step": 1937
+    },
+    {
+      "epoch": 0.6198005157008935,
+      "grad_norm": 0.324089914560318,
+      "learning_rate": 0.00020028781167628714,
+      "loss": 4.5057,
+      "step": 1938
+    },
+    {
+      "epoch": 0.6201203302084791,
+      "grad_norm": 0.3476245403289795,
+      "learning_rate": 0.0001999946938330346,
+      "loss": 4.4755,
+      "step": 1939
+    },
+    {
+      "epoch": 0.6204401447160647,
+      "grad_norm": 0.3370070457458496,
+      "learning_rate": 0.0001997016833551845,
+      "loss": 4.5277,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6207599592236502,
+      "grad_norm": 0.34495389461517334,
+      "learning_rate": 0.00019940878055731208,
+      "loss": 4.4686,
+      "step": 1941
+    },
+    {
+      "epoch": 0.6210797737312359,
+      "grad_norm": 0.34336042404174805,
+      "learning_rate": 0.00019911598575387683,
+      "loss": 4.4037,
+      "step": 1942
+    },
+    {
+      "epoch": 0.6213995882388215,
+      "grad_norm": 0.3384661078453064,
+      "learning_rate": 0.00019882329925922245,
+      "loss": 4.5484,
+      "step": 1943
+    },
+    {
+      "epoch": 0.6217194027464071,
+      "grad_norm": 0.32501736283302307,
+      "learning_rate": 0.00019853072138757637,
+      "loss": 4.4984,
+      "step": 1944
+    },
+    {
+      "epoch": 0.6220392172539927,
+      "grad_norm": 0.35760048031806946,
+      "learning_rate": 0.00019823825245304918,
+      "loss": 4.5562,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6223590317615783,
+      "grad_norm": 0.3294287919998169,
+      "learning_rate": 0.00019794589276963482,
+      "loss": 4.5322,
+      "step": 1946
+    },
+    {
+      "epoch": 0.6226788462691639,
+      "grad_norm": 0.33806145191192627,
+      "learning_rate": 0.00019765364265120962,
+      "loss": 4.5018,
+      "step": 1947
+    },
+    {
+      "epoch": 0.6229986607767495,
+      "grad_norm": 0.3356688320636749,
+      "learning_rate": 0.00019736150241153258,
+      "loss": 4.4559,
+      "step": 1948
+    },
+    {
+      "epoch": 0.623318475284335,
+      "grad_norm": 0.3312791883945465,
+      "learning_rate": 0.0001970694723642446,
+      "loss": 4.4325,
+      "step": 1949
+    },
+    {
+      "epoch": 0.6236382897919207,
+      "grad_norm": 0.3365389406681061,
+      "learning_rate": 0.00019677755282286822,
+      "loss": 4.4863,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6239581042995063,
+      "grad_norm": 0.3383220136165619,
+      "learning_rate": 0.00019648574410080743,
+      "loss": 4.4953,
+      "step": 1951
+    },
+    {
+      "epoch": 0.6242779188070919,
+      "grad_norm": 0.33379092812538147,
+      "learning_rate": 0.00019619404651134717,
+      "loss": 4.5105,
+      "step": 1952
+    },
+    {
+      "epoch": 0.6245977333146775,
+      "grad_norm": 0.33623144030570984,
+      "learning_rate": 0.0001959024603676532,
+      "loss": 4.6028,
+      "step": 1953
+    },
+    {
+      "epoch": 0.624917547822263,
+      "grad_norm": 0.34045708179473877,
+      "learning_rate": 0.00019561098598277145,
+      "loss": 4.6114,
+      "step": 1954
+    },
+    {
+      "epoch": 0.6252373623298487,
+      "grad_norm": 0.33441922068595886,
+      "learning_rate": 0.000195319623669628,
+      "loss": 4.5134,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6255571768374343,
+      "grad_norm": 0.33533331751823425,
+      "learning_rate": 0.00019502837374102866,
+      "loss": 4.5119,
+      "step": 1956
+    },
+    {
+      "epoch": 0.6258769913450198,
+      "grad_norm": 0.3240436315536499,
+      "learning_rate": 0.00019473723650965832,
+      "loss": 4.5549,
+      "step": 1957
+    },
+    {
+      "epoch": 0.6261968058526055,
+      "grad_norm": 0.35325974225997925,
+      "learning_rate": 0.0001944462122880813,
+      "loss": 4.5761,
+      "step": 1958
+    },
+    {
+      "epoch": 0.6265166203601911,
+      "grad_norm": 0.34944358468055725,
+      "learning_rate": 0.00019415530138874,
+      "loss": 4.4868,
+      "step": 1959
+    },
+    {
+      "epoch": 0.6268364348677767,
+      "grad_norm": 0.33490127325057983,
+      "learning_rate": 0.0001938645041239558,
+      "loss": 4.4705,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6271562493753623,
+      "grad_norm": 0.32527029514312744,
+      "learning_rate": 0.00019357382080592773,
+      "loss": 4.5074,
+      "step": 1961
+    },
+    {
+      "epoch": 0.6274760638829479,
+      "grad_norm": 0.3463835120201111,
+      "learning_rate": 0.00019328325174673247,
+      "loss": 4.556,
+      "step": 1962
+    },
+    {
+      "epoch": 0.6277958783905335,
+      "grad_norm": 0.3363448977470398,
+      "learning_rate": 0.0001929927972583242,
+      "loss": 4.5661,
+      "step": 1963
+    },
+    {
+      "epoch": 0.6281156928981191,
+      "grad_norm": 0.33256959915161133,
+      "learning_rate": 0.00019270245765253382,
+      "loss": 4.5234,
+      "step": 1964
+    },
+    {
+      "epoch": 0.6284355074057046,
+      "grad_norm": 0.3460303843021393,
+      "learning_rate": 0.0001924122332410694,
+      "loss": 4.5785,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6287553219132903,
+      "grad_norm": 0.342033326625824,
+      "learning_rate": 0.00019212212433551465,
+      "loss": 4.5518,
+      "step": 1966
+    },
+    {
+      "epoch": 0.6290751364208759,
+      "grad_norm": 0.37389683723449707,
+      "learning_rate": 0.00019183213124732986,
+      "loss": 4.6119,
+      "step": 1967
+    },
+    {
+      "epoch": 0.6293949509284615,
+      "grad_norm": 0.34783628582954407,
+      "learning_rate": 0.0001915422542878508,
+      "loss": 4.4027,
+      "step": 1968
+    },
+    {
+      "epoch": 0.6297147654360471,
+      "grad_norm": 0.334176242351532,
+      "learning_rate": 0.00019125249376828824,
+      "loss": 4.5087,
+      "step": 1969
+    },
+    {
+      "epoch": 0.6300345799436327,
+      "grad_norm": 0.3315781056880951,
+      "learning_rate": 0.00019096284999972862,
+      "loss": 4.3967,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6303543944512183,
+      "grad_norm": 0.3476397395133972,
+      "learning_rate": 0.00019067332329313226,
+      "loss": 4.4586,
+      "step": 1971
+    },
+    {
+      "epoch": 0.6306742089588039,
+      "grad_norm": 0.343718945980072,
+      "learning_rate": 0.0001903839139593343,
+      "loss": 4.4096,
+      "step": 1972
+    },
+    {
+      "epoch": 0.6309940234663894,
+      "grad_norm": 0.3535640835762024,
+      "learning_rate": 0.00019009462230904398,
+      "loss": 4.5057,
+      "step": 1973
+    },
+    {
+      "epoch": 0.6313138379739751,
+      "grad_norm": 0.3376810848712921,
+      "learning_rate": 0.0001898054486528436,
+      "loss": 4.5772,
+      "step": 1974
+    },
+    {
+      "epoch": 0.6316336524815607,
+      "grad_norm": 0.3592750132083893,
+      "learning_rate": 0.00018951639330118953,
+      "loss": 4.4987,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6319534669891463,
+      "grad_norm": 0.3285256326198578,
+      "learning_rate": 0.0001892274565644104,
+      "loss": 4.4203,
+      "step": 1976
+    },
+    {
+      "epoch": 0.6322732814967319,
+      "grad_norm": 0.34887608885765076,
+      "learning_rate": 0.000188938638752708,
+      "loss": 4.3972,
+      "step": 1977
+    },
+    {
+      "epoch": 0.6325930960043175,
+      "grad_norm": 0.34385180473327637,
+      "learning_rate": 0.00018864994017615624,
+      "loss": 4.4383,
+      "step": 1978
+    },
+    {
+      "epoch": 0.6329129105119031,
+      "grad_norm": 0.3433148264884949,
+      "learning_rate": 0.0001883613611447011,
+      "loss": 4.58,
+      "step": 1979
+    },
+    {
+      "epoch": 0.6332327250194887,
+      "grad_norm": 0.34894120693206787,
+      "learning_rate": 0.00018807290196816022,
+      "loss": 4.4366,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6335525395270742,
+      "grad_norm": 0.3402814269065857,
+      "learning_rate": 0.00018778456295622239,
+      "loss": 4.441,
+      "step": 1981
+    },
+    {
+      "epoch": 0.6338723540346599,
+      "grad_norm": 0.44206148386001587,
+      "learning_rate": 0.00018749634441844764,
+      "loss": 4.4753,
+      "step": 1982
+    },
+    {
+      "epoch": 0.6341921685422455,
+      "grad_norm": 0.3443446159362793,
+      "learning_rate": 0.00018720824666426647,
+      "loss": 4.5124,
+      "step": 1983
+    },
+    {
+      "epoch": 0.6345119830498311,
+      "grad_norm": 0.3371545672416687,
+      "learning_rate": 0.00018692027000297986,
+      "loss": 4.5428,
+      "step": 1984
+    },
+    {
+      "epoch": 0.6348317975574167,
+      "grad_norm": 0.3495273292064667,
+      "learning_rate": 0.0001866324147437587,
+      "loss": 4.5698,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6351516120650023,
+      "grad_norm": 0.3294757306575775,
+      "learning_rate": 0.00018634468119564342,
+      "loss": 4.5179,
+      "step": 1986
+    },
+    {
+      "epoch": 0.6354714265725879,
+      "grad_norm": 0.338234007358551,
+      "learning_rate": 0.00018605706966754408,
+      "loss": 4.4306,
+      "step": 1987
+    },
+    {
+      "epoch": 0.6357912410801735,
+      "grad_norm": 0.39578574895858765,
+      "learning_rate": 0.00018576958046823944,
+      "loss": 4.4886,
+      "step": 1988
+    },
+    {
+      "epoch": 0.636111055587759,
+      "grad_norm": 0.3399364948272705,
+      "learning_rate": 0.0001854822139063772,
+      "loss": 4.5338,
+      "step": 1989
+    },
+    {
+      "epoch": 0.6364308700953447,
+      "grad_norm": 0.35335099697113037,
+      "learning_rate": 0.00018519497029047307,
+      "loss": 4.5156,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6367506846029303,
+      "grad_norm": 0.4992465674877167,
+      "learning_rate": 0.00018490784992891107,
+      "loss": 4.5368,
+      "step": 1991
+    },
+    {
+      "epoch": 0.6370704991105159,
+      "grad_norm": 0.34836506843566895,
+      "learning_rate": 0.00018462085312994278,
+      "loss": 4.4756,
+      "step": 1992
+    },
+    {
+      "epoch": 0.6373903136181015,
+      "grad_norm": 0.36463114619255066,
+      "learning_rate": 0.0001843339802016871,
+      "loss": 4.6071,
+      "step": 1993
+    },
+    {
+      "epoch": 0.6377101281256871,
+      "grad_norm": 0.3420564532279968,
+      "learning_rate": 0.00018404723145212993,
+      "loss": 4.4792,
+      "step": 1994
+    },
+    {
+      "epoch": 0.6380299426332727,
+      "grad_norm": 0.3417738974094391,
+      "learning_rate": 0.00018376060718912392,
+      "loss": 4.5,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6383497571408583,
+      "grad_norm": 0.3472539484500885,
+      "learning_rate": 0.00018347410772038807,
+      "loss": 4.4523,
+      "step": 1996
+    },
+    {
+      "epoch": 0.638669571648444,
+      "grad_norm": 0.3467002809047699,
+      "learning_rate": 0.00018318773335350723,
+      "loss": 4.4398,
+      "step": 1997
+    },
+    {
+      "epoch": 0.6389893861560295,
+      "grad_norm": 0.3370647430419922,
+      "learning_rate": 0.0001829014843959322,
+      "loss": 4.4692,
+      "step": 1998
+    },
+    {
+      "epoch": 0.6393092006636151,
+      "grad_norm": 0.3472033143043518,
+      "learning_rate": 0.00018261536115497904,
+      "loss": 4.5119,
+      "step": 1999
+    },
+    {
+      "epoch": 0.6396290151712007,
+      "grad_norm": 0.3552742600440979,
+      "learning_rate": 0.0001823293639378287,
+      "loss": 4.5038,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6396290151712007,
+      "eval_loss": 4.501763820648193,
+      "eval_runtime": 85.2654,
+      "eval_samples_per_second": 22.248,
+      "eval_steps_per_second": 5.571,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6399488296787863,
+      "grad_norm": 0.33789336681365967,
+      "learning_rate": 0.00018204349305152707,
+      "loss": 4.4083,
+      "step": 2001
+    },
+    {
+      "epoch": 0.6402686441863719,
+      "grad_norm": 0.35798177123069763,
+      "learning_rate": 0.00018175774880298422,
+      "loss": 4.6027,
+      "step": 2002
+    },
+    {
+      "epoch": 0.6405884586939575,
+      "grad_norm": 0.35995379090309143,
+      "learning_rate": 0.0001814721314989743,
+      "loss": 4.5795,
+      "step": 2003
+    },
+    {
+      "epoch": 0.6409082732015431,
+      "grad_norm": 0.3444795608520508,
+      "learning_rate": 0.0001811866414461354,
+      "loss": 4.4867,
+      "step": 2004
+    },
+    {
+      "epoch": 0.6412280877091288,
+      "grad_norm": 0.3556591272354126,
+      "learning_rate": 0.00018090127895096855,
+      "loss": 4.4126,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6415479022167143,
+      "grad_norm": 0.34814828634262085,
+      "learning_rate": 0.0001806160443198383,
+      "loss": 4.5066,
+      "step": 2006
+    },
+    {
+      "epoch": 0.6418677167242999,
+      "grad_norm": 0.3337419629096985,
+      "learning_rate": 0.00018033093785897145,
+      "loss": 4.4302,
+      "step": 2007
+    },
+    {
+      "epoch": 0.6421875312318855,
+      "grad_norm": 0.3529178202152252,
+      "learning_rate": 0.00018004595987445782,
+      "loss": 4.5251,
+      "step": 2008
+    },
+    {
+      "epoch": 0.6425073457394711,
+      "grad_norm": 0.36757129430770874,
+      "learning_rate": 0.00017976111067224854,
+      "loss": 4.5572,
+      "step": 2009
+    },
+    {
+      "epoch": 0.6428271602470567,
+      "grad_norm": 0.33173325657844543,
+      "learning_rate": 0.00017947639055815713,
+      "loss": 4.5201,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6431469747546423,
+      "grad_norm": 0.3335769772529602,
+      "learning_rate": 0.00017919179983785828,
+      "loss": 4.5098,
+      "step": 2011
+    },
+    {
+      "epoch": 0.6434667892622279,
+      "grad_norm": 0.3455420136451721,
+      "learning_rate": 0.00017890733881688754,
+      "loss": 4.5294,
+      "step": 2012
+    },
+    {
+      "epoch": 0.6437866037698136,
+      "grad_norm": 0.33455345034599304,
+      "learning_rate": 0.0001786230078006417,
+      "loss": 4.4092,
+      "step": 2013
+    },
+    {
+      "epoch": 0.6441064182773991,
+      "grad_norm": 0.34600844979286194,
+      "learning_rate": 0.00017833880709437752,
+      "loss": 4.4824,
+      "step": 2014
+    },
+    {
+      "epoch": 0.6444262327849847,
+      "grad_norm": 0.3213809132575989,
+      "learning_rate": 0.00017805473700321193,
+      "loss": 4.3922,
+      "step": 2015
+    },
+    {
+      "epoch": 0.6447460472925703,
+      "grad_norm": 0.32600635290145874,
+      "learning_rate": 0.00017777079783212215,
+      "loss": 4.5513,
+      "step": 2016
+    },
+    {
+      "epoch": 0.6450658618001559,
+      "grad_norm": 0.35062530636787415,
+      "learning_rate": 0.00017748698988594394,
+      "loss": 4.4991,
+      "step": 2017
+    },
+    {
+      "epoch": 0.6453856763077415,
+      "grad_norm": 0.3276106119155884,
+      "learning_rate": 0.00017720331346937317,
+      "loss": 4.4505,
+      "step": 2018
+    },
+    {
+      "epoch": 0.6457054908153271,
+      "grad_norm": 0.3346673250198364,
+      "learning_rate": 0.0001769197688869636,
+      "loss": 4.4227,
+      "step": 2019
+    },
+    {
+      "epoch": 0.6460253053229127,
+      "grad_norm": 0.3427330255508423,
+      "learning_rate": 0.0001766363564431281,
+      "loss": 4.4448,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6463451198304984,
+      "grad_norm": 0.3343569338321686,
+      "learning_rate": 0.00017635307644213726,
+      "loss": 4.4658,
+      "step": 2021
+    },
+    {
+      "epoch": 0.6466649343380839,
+      "grad_norm": 0.3355761766433716,
+      "learning_rate": 0.00017606992918811976,
+      "loss": 4.4333,
+      "step": 2022
+    },
+    {
+      "epoch": 0.6469847488456695,
+      "grad_norm": 0.3303985297679901,
+      "learning_rate": 0.00017578691498506177,
+      "loss": 4.4637,
+      "step": 2023
+    },
+    {
+      "epoch": 0.6473045633532551,
+      "grad_norm": 0.3363431990146637,
+      "learning_rate": 0.00017550403413680625,
+      "loss": 4.5341,
+      "step": 2024
+    },
+    {
+      "epoch": 0.6476243778608407,
+      "grad_norm": 0.33544039726257324,
+      "learning_rate": 0.00017522128694705347,
+      "loss": 4.4159,
+      "step": 2025
+    },
+    {
+      "epoch": 0.6479441923684263,
+      "grad_norm": 0.33282896876335144,
+      "learning_rate": 0.0001749386737193598,
+      "loss": 4.462,
+      "step": 2026
+    },
+    {
+      "epoch": 0.6482640068760119,
+      "grad_norm": 0.34504929184913635,
+      "learning_rate": 0.00017465619475713813,
+      "loss": 4.5122,
+      "step": 2027
+    },
+    {
+      "epoch": 0.6485838213835975,
+      "grad_norm": 0.3385733664035797,
+      "learning_rate": 0.00017437385036365695,
+      "loss": 4.4972,
+      "step": 2028
+    },
+    {
+      "epoch": 0.6489036358911832,
+      "grad_norm": 0.3179207146167755,
+      "learning_rate": 0.00017409164084204037,
+      "loss": 4.4297,
+      "step": 2029
+    },
+    {
+      "epoch": 0.6492234503987687,
+      "grad_norm": 0.3247489631175995,
+      "learning_rate": 0.00017380956649526785,
+      "loss": 4.4343,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6495432649063543,
+      "grad_norm": 0.3349881172180176,
+      "learning_rate": 0.00017352762762617334,
+      "loss": 4.4495,
+      "step": 2031
+    },
+    {
+      "epoch": 0.64986307941394,
+      "grad_norm": 0.35694414377212524,
+      "learning_rate": 0.00017324582453744577,
+      "loss": 4.436,
+      "step": 2032
+    },
+    {
+      "epoch": 0.6501828939215255,
+      "grad_norm": 0.36272308230400085,
+      "learning_rate": 0.00017296415753162786,
+      "loss": 4.4708,
+      "step": 2033
+    },
+    {
+      "epoch": 0.6505027084291111,
+      "grad_norm": 0.3409692049026489,
+      "learning_rate": 0.00017268262691111675,
+      "loss": 4.5294,
+      "step": 2034
+    },
+    {
+      "epoch": 0.6508225229366967,
+      "grad_norm": 0.3303185999393463,
+      "learning_rate": 0.0001724012329781629,
+      "loss": 4.4625,
+      "step": 2035
+    },
+    {
+      "epoch": 0.6511423374442823,
+      "grad_norm": 0.3401007056236267,
+      "learning_rate": 0.0001721199760348698,
+      "loss": 4.4891,
+      "step": 2036
+    },
+    {
+      "epoch": 0.651462151951868,
+      "grad_norm": 0.3425493836402893,
+      "learning_rate": 0.00017183885638319426,
+      "loss": 4.468,
+      "step": 2037
+    },
+    {
+      "epoch": 0.6517819664594535,
+      "grad_norm": 0.34367531538009644,
+      "learning_rate": 0.00017155787432494529,
+      "loss": 4.485,
+      "step": 2038
+    },
+    {
+      "epoch": 0.6521017809670391,
+      "grad_norm": 0.33023732900619507,
+      "learning_rate": 0.00017127703016178445,
+      "loss": 4.5316,
+      "step": 2039
+    },
+    {
+      "epoch": 0.6524215954746247,
+      "grad_norm": 0.33133843541145325,
+      "learning_rate": 0.00017099632419522552,
+      "loss": 4.5366,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6527414099822103,
+      "grad_norm": 0.34126347303390503,
+      "learning_rate": 0.00017071575672663325,
+      "loss": 4.4601,
+      "step": 2041
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 0.34515446424484253,
+      "learning_rate": 0.0001704353280572243,
+      "loss": 4.5834,
+      "step": 2042
+    },
+    {
+      "epoch": 0.6533810389973815,
+      "grad_norm": 0.3388916850090027,
+      "learning_rate": 0.0001701550384880658,
+      "loss": 4.4507,
+      "step": 2043
+    },
+    {
+      "epoch": 0.6537008535049671,
+      "grad_norm": 0.3309282958507538,
+      "learning_rate": 0.00016987488832007593,
+      "loss": 4.4855,
+      "step": 2044
+    },
+    {
+      "epoch": 0.6540206680125528,
+      "grad_norm": 0.33507040143013,
+      "learning_rate": 0.00016959487785402313,
+      "loss": 4.4535,
+      "step": 2045
+    },
+    {
+      "epoch": 0.6543404825201383,
+      "grad_norm": 0.32252126932144165,
+      "learning_rate": 0.00016931500739052576,
+      "loss": 4.4948,
+      "step": 2046
+    },
+    {
+      "epoch": 0.6546602970277239,
+      "grad_norm": 0.33887749910354614,
+      "learning_rate": 0.00016903527723005206,
+      "loss": 4.534,
+      "step": 2047
+    },
+    {
+      "epoch": 0.6549801115353096,
+      "grad_norm": 0.3391673266887665,
+      "learning_rate": 0.0001687556876729193,
+      "loss": 4.5171,
+      "step": 2048
+    },
+    {
+      "epoch": 0.6552999260428951,
+      "grad_norm": 0.3284638226032257,
+      "learning_rate": 0.00016847623901929408,
+      "loss": 4.3993,
+      "step": 2049
+    },
+    {
+      "epoch": 0.6556197405504807,
+      "grad_norm": 0.33401697874069214,
+      "learning_rate": 0.00016819693156919167,
+      "loss": 4.4099,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6559395550580663,
+      "grad_norm": 0.32520854473114014,
+      "learning_rate": 0.00016791776562247572,
+      "loss": 4.3699,
+      "step": 2051
+    },
+    {
+      "epoch": 0.6562593695656519,
+      "grad_norm": 0.33887824416160583,
+      "learning_rate": 0.0001676387414788581,
+      "loss": 4.337,
+      "step": 2052
+    },
+    {
+      "epoch": 0.6565791840732376,
+      "grad_norm": 0.3305310904979706,
+      "learning_rate": 0.00016735985943789808,
+      "loss": 4.458,
+      "step": 2053
+    },
+    {
+      "epoch": 0.6568989985808231,
+      "grad_norm": 0.33141234517097473,
+      "learning_rate": 0.0001670811197990027,
+      "loss": 4.3534,
+      "step": 2054
+    },
+    {
+      "epoch": 0.6572188130884087,
+      "grad_norm": 0.3346043825149536,
+      "learning_rate": 0.000166802522861426,
+      "loss": 4.3786,
+      "step": 2055
+    },
+    {
+      "epoch": 0.6575386275959944,
+      "grad_norm": 0.34695038199424744,
+      "learning_rate": 0.00016652406892426902,
+      "loss": 4.4992,
+      "step": 2056
+    },
+    {
+      "epoch": 0.6578584421035799,
+      "grad_norm": 0.32746338844299316,
+      "learning_rate": 0.00016624575828647878,
+      "loss": 4.4396,
+      "step": 2057
+    },
+    {
+      "epoch": 0.6581782566111655,
+      "grad_norm": 0.3358507454395294,
+      "learning_rate": 0.0001659675912468489,
+      "loss": 4.4843,
+      "step": 2058
+    },
+    {
+      "epoch": 0.6584980711187511,
+      "grad_norm": 0.35679417848587036,
+      "learning_rate": 0.00016568956810401867,
+      "loss": 4.4237,
+      "step": 2059
+    },
+    {
+      "epoch": 0.6588178856263367,
+      "grad_norm": 0.3554205298423767,
+      "learning_rate": 0.00016541168915647298,
+      "loss": 4.4,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6591377001339224,
+      "grad_norm": 0.3345361053943634,
+      "learning_rate": 0.00016513395470254194,
+      "loss": 4.4388,
+      "step": 2061
+    },
+    {
+      "epoch": 0.6594575146415079,
+      "grad_norm": 0.33950522541999817,
+      "learning_rate": 0.00016485636504040015,
+      "loss": 4.4784,
+      "step": 2062
+    },
+    {
+      "epoch": 0.6597773291490935,
+      "grad_norm": 0.370823472738266,
+      "learning_rate": 0.00016457892046806727,
+      "loss": 4.4151,
+      "step": 2063
+    },
+    {
+      "epoch": 0.6600971436566792,
+      "grad_norm": 0.34681758284568787,
+      "learning_rate": 0.00016430162128340693,
+      "loss": 4.4152,
+      "step": 2064
+    },
+    {
+      "epoch": 0.6604169581642647,
+      "grad_norm": 0.3444735109806061,
+      "learning_rate": 0.0001640244677841267,
+      "loss": 4.472,
+      "step": 2065
+    },
+    {
+      "epoch": 0.6607367726718504,
+      "grad_norm": 0.3534565269947052,
+      "learning_rate": 0.00016374746026777794,
+      "loss": 4.4343,
+      "step": 2066
+    },
+    {
+      "epoch": 0.6610565871794359,
+      "grad_norm": 0.3453407883644104,
+      "learning_rate": 0.0001634705990317548,
+      "loss": 4.4922,
+      "step": 2067
+    },
+    {
+      "epoch": 0.6613764016870215,
+      "grad_norm": 0.3331710696220398,
+      "learning_rate": 0.00016319388437329482,
+      "loss": 4.4478,
+      "step": 2068
+    },
+    {
+      "epoch": 0.6616962161946072,
+      "grad_norm": 0.3514850437641144,
+      "learning_rate": 0.00016291731658947808,
+      "loss": 4.4396,
+      "step": 2069
+    },
+    {
+      "epoch": 0.6620160307021927,
+      "grad_norm": 0.344855934381485,
+      "learning_rate": 0.0001626408959772269,
+      "loss": 4.4916,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6623358452097783,
+      "grad_norm": 0.40698984265327454,
+      "learning_rate": 0.00016236462283330578,
+      "loss": 4.4119,
+      "step": 2071
+    },
+    {
+      "epoch": 0.662655659717364,
+      "grad_norm": 0.3326229155063629,
+      "learning_rate": 0.0001620884974543205,
+      "loss": 4.4911,
+      "step": 2072
+    },
+    {
+      "epoch": 0.6629754742249495,
+      "grad_norm": 0.34035009145736694,
+      "learning_rate": 0.00016181252013671858,
+      "loss": 4.4649,
+      "step": 2073
+    },
+    {
+      "epoch": 0.6632952887325352,
+      "grad_norm": 0.34327614307403564,
+      "learning_rate": 0.00016153669117678848,
+      "loss": 4.4943,
+      "step": 2074
+    },
+    {
+      "epoch": 0.6636151032401207,
+      "grad_norm": 0.3320022225379944,
+      "learning_rate": 0.00016126101087065933,
+      "loss": 4.5206,
+      "step": 2075
+    },
+    {
+      "epoch": 0.6639349177477063,
+      "grad_norm": 0.350603312253952,
+      "learning_rate": 0.00016098547951430082,
+      "loss": 4.5001,
+      "step": 2076
+    },
+    {
+      "epoch": 0.664254732255292,
+      "grad_norm": 0.3385373055934906,
+      "learning_rate": 0.00016071009740352237,
+      "loss": 4.4999,
+      "step": 2077
+    },
+    {
+      "epoch": 0.6645745467628775,
+      "grad_norm": 0.35841625928878784,
+      "learning_rate": 0.0001604348648339736,
+      "loss": 4.426,
+      "step": 2078
+    },
+    {
+      "epoch": 0.6648943612704631,
+      "grad_norm": 0.331993967294693,
+      "learning_rate": 0.0001601597821011431,
+      "loss": 4.5417,
+      "step": 2079
+    },
+    {
+      "epoch": 0.6652141757780488,
+      "grad_norm": 0.34224316477775574,
+      "learning_rate": 0.0001598848495003593,
+      "loss": 4.4207,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6655339902856343,
+      "grad_norm": 0.3511262834072113,
+      "learning_rate": 0.00015961006732678873,
+      "loss": 4.4842,
+      "step": 2081
+    },
+    {
+      "epoch": 0.66585380479322,
+      "grad_norm": 0.3389959931373596,
+      "learning_rate": 0.00015933543587543682,
+      "loss": 4.4316,
+      "step": 2082
+    },
+    {
+      "epoch": 0.6661736193008055,
+      "grad_norm": 0.34232959151268005,
+      "learning_rate": 0.0001590609554411472,
+      "loss": 4.5411,
+      "step": 2083
+    },
+    {
+      "epoch": 0.6664934338083911,
+      "grad_norm": 0.3344050347805023,
+      "learning_rate": 0.0001587866263186009,
+      "loss": 4.4495,
+      "step": 2084
+    },
+    {
+      "epoch": 0.6668132483159768,
+      "grad_norm": 0.35482099652290344,
+      "learning_rate": 0.0001585124488023173,
+      "loss": 4.5453,
+      "step": 2085
+    },
+    {
+      "epoch": 0.6671330628235623,
+      "grad_norm": 0.3336874842643738,
+      "learning_rate": 0.00015823842318665233,
+      "loss": 4.3472,
+      "step": 2086
+    },
+    {
+      "epoch": 0.6674528773311479,
+      "grad_norm": 0.366793155670166,
+      "learning_rate": 0.00015796454976579901,
+      "loss": 4.497,
+      "step": 2087
+    },
+    {
+      "epoch": 0.6677726918387336,
+      "grad_norm": 0.3410011827945709,
+      "learning_rate": 0.00015769082883378737,
+      "loss": 4.3986,
+      "step": 2088
+    },
+    {
+      "epoch": 0.6680925063463191,
+      "grad_norm": 0.3739314079284668,
+      "learning_rate": 0.00015741726068448293,
+      "loss": 4.4505,
+      "step": 2089
+    },
+    {
+      "epoch": 0.6684123208539048,
+      "grad_norm": 0.33777886629104614,
+      "learning_rate": 0.0001571438456115881,
+      "loss": 4.4421,
+      "step": 2090
+    },
+    {
+      "epoch": 0.6687321353614903,
+      "grad_norm": 0.33963218331336975,
+      "learning_rate": 0.0001568705839086402,
+      "loss": 4.4998,
+      "step": 2091
+    },
+    {
+      "epoch": 0.6690519498690759,
+      "grad_norm": 0.3247828185558319,
+      "learning_rate": 0.00015659747586901243,
+      "loss": 4.4794,
+      "step": 2092
+    },
+    {
+      "epoch": 0.6693717643766616,
+      "grad_norm": 0.32846730947494507,
+      "learning_rate": 0.00015632452178591252,
+      "loss": 4.4371,
+      "step": 2093
+    },
+    {
+      "epoch": 0.6696915788842471,
+      "grad_norm": 0.3390513062477112,
+      "learning_rate": 0.00015605172195238314,
+      "loss": 4.4305,
+      "step": 2094
+    },
+    {
+      "epoch": 0.6700113933918327,
+      "grad_norm": 0.33395376801490784,
+      "learning_rate": 0.00015577907666130178,
+      "loss": 4.5377,
+      "step": 2095
+    },
+    {
+      "epoch": 0.6703312078994184,
+      "grad_norm": 0.33547675609588623,
+      "learning_rate": 0.00015550658620537932,
+      "loss": 4.5431,
+      "step": 2096
+    },
+    {
+      "epoch": 0.6706510224070039,
+      "grad_norm": 0.34296077489852905,
+      "learning_rate": 0.0001552342508771608,
+      "loss": 4.386,
+      "step": 2097
+    },
+    {
+      "epoch": 0.6709708369145896,
+      "grad_norm": 0.36239510774612427,
+      "learning_rate": 0.00015496207096902457,
+      "loss": 4.4442,
+      "step": 2098
+    },
+    {
+      "epoch": 0.6712906514221751,
+      "grad_norm": 0.37812966108322144,
+      "learning_rate": 0.00015469004677318214,
+      "loss": 4.4708,
+      "step": 2099
+    },
+    {
+      "epoch": 0.6716104659297607,
+      "grad_norm": 0.34064537286758423,
+      "learning_rate": 0.000154418178581678,
+      "loss": 4.4746,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6716104659297607,
+      "eval_loss": 4.476485729217529,
+      "eval_runtime": 99.2816,
+      "eval_samples_per_second": 19.107,
+      "eval_steps_per_second": 4.784,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6719302804373464,
+      "grad_norm": 0.33424270153045654,
+      "learning_rate": 0.00015414646668638897,
+      "loss": 4.3999,
+      "step": 2101
+    },
+    {
+      "epoch": 0.6722500949449319,
+      "grad_norm": 0.33114364743232727,
+      "learning_rate": 0.00015387491137902428,
+      "loss": 4.4414,
+      "step": 2102
+    },
+    {
+      "epoch": 0.6725699094525175,
+      "grad_norm": 0.34432920813560486,
+      "learning_rate": 0.00015360351295112468,
+      "loss": 4.3984,
+      "step": 2103
+    },
+    {
+      "epoch": 0.6728897239601032,
+      "grad_norm": 0.33681678771972656,
+      "learning_rate": 0.00015333227169406284,
+      "loss": 4.4823,
+      "step": 2104
+    },
+    {
+      "epoch": 0.6732095384676887,
+      "grad_norm": 0.34930625557899475,
+      "learning_rate": 0.0001530611878990426,
+      "loss": 4.4129,
+      "step": 2105
+    },
+    {
+      "epoch": 0.6735293529752744,
+      "grad_norm": 0.3283677101135254,
+      "learning_rate": 0.00015279026185709865,
+      "loss": 4.3738,
+      "step": 2106
+    },
+    {
+      "epoch": 0.67384916748286,
+      "grad_norm": 0.3268563449382782,
+      "learning_rate": 0.0001525194938590966,
+      "loss": 4.5099,
+      "step": 2107
+    },
+    {
+      "epoch": 0.6741689819904455,
+      "grad_norm": 0.3501228094100952,
+      "learning_rate": 0.0001522488841957319,
+      "loss": 4.5102,
+      "step": 2108
+    },
+    {
+      "epoch": 0.6744887964980312,
+      "grad_norm": 0.33682167530059814,
+      "learning_rate": 0.00015197843315753034,
+      "loss": 4.5232,
+      "step": 2109
+    },
+    {
+      "epoch": 0.6748086110056167,
+      "grad_norm": 0.3597559630870819,
+      "learning_rate": 0.00015170814103484747,
+      "loss": 4.4373,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6751284255132023,
+      "grad_norm": 0.340017169713974,
+      "learning_rate": 0.00015143800811786805,
+      "loss": 4.4699,
+      "step": 2111
+    },
+    {
+      "epoch": 0.675448240020788,
+      "grad_norm": 0.332932710647583,
+      "learning_rate": 0.00015116803469660616,
+      "loss": 4.4958,
+      "step": 2112
+    },
+    {
+      "epoch": 0.6757680545283735,
+      "grad_norm": 0.3516872227191925,
+      "learning_rate": 0.00015089822106090418,
+      "loss": 4.5335,
+      "step": 2113
+    },
+    {
+      "epoch": 0.6760878690359592,
+      "grad_norm": 0.35002401471138,
+      "learning_rate": 0.00015062856750043343,
+      "loss": 4.4151,
+      "step": 2114
+    },
+    {
+      "epoch": 0.6764076835435447,
+      "grad_norm": 0.3430773913860321,
+      "learning_rate": 0.00015035907430469304,
+      "loss": 4.4774,
+      "step": 2115
+    },
+    {
+      "epoch": 0.6767274980511303,
+      "grad_norm": 0.3273634910583496,
+      "learning_rate": 0.00015008974176301031,
+      "loss": 4.498,
+      "step": 2116
+    },
+    {
+      "epoch": 0.677047312558716,
+      "grad_norm": 0.3389027416706085,
+      "learning_rate": 0.00014982057016453969,
+      "loss": 4.4551,
+      "step": 2117
+    },
+    {
+      "epoch": 0.6773671270663015,
+      "grad_norm": 0.35352566838264465,
+      "learning_rate": 0.00014955155979826302,
+      "loss": 4.4706,
+      "step": 2118
+    },
+    {
+      "epoch": 0.6776869415738871,
+      "grad_norm": 0.3369107246398926,
+      "learning_rate": 0.00014928271095298912,
+      "loss": 4.3326,
+      "step": 2119
+    },
+    {
+      "epoch": 0.6780067560814728,
+      "grad_norm": 0.3422829508781433,
+      "learning_rate": 0.00014901402391735328,
+      "loss": 4.4644,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6783265705890583,
+      "grad_norm": 0.33077096939086914,
+      "learning_rate": 0.00014874549897981725,
+      "loss": 4.4121,
+      "step": 2121
+    },
+    {
+      "epoch": 0.678646385096644,
+      "grad_norm": 0.33814603090286255,
+      "learning_rate": 0.00014847713642866835,
+      "loss": 4.5341,
+      "step": 2122
+    },
+    {
+      "epoch": 0.6789661996042295,
+      "grad_norm": 0.34842199087142944,
+      "learning_rate": 0.00014820893655201998,
+      "loss": 4.4643,
+      "step": 2123
+    },
+    {
+      "epoch": 0.6792860141118151,
+      "grad_norm": 0.33767151832580566,
+      "learning_rate": 0.0001479408996378107,
+      "loss": 4.424,
+      "step": 2124
+    },
+    {
+      "epoch": 0.6796058286194008,
+      "grad_norm": 0.33817440271377563,
+      "learning_rate": 0.00014767302597380418,
+      "loss": 4.5601,
+      "step": 2125
+    },
+    {
+      "epoch": 0.6799256431269863,
+      "grad_norm": 0.3428094983100891,
+      "learning_rate": 0.0001474053158475889,
+      "loss": 4.4309,
+      "step": 2126
+    },
+    {
+      "epoch": 0.6802454576345719,
+      "grad_norm": 0.33630338311195374,
+      "learning_rate": 0.00014713776954657743,
+      "loss": 4.4203,
+      "step": 2127
+    },
+    {
+      "epoch": 0.6805652721421576,
+      "grad_norm": 0.34329938888549805,
+      "learning_rate": 0.00014687038735800693,
+      "loss": 4.5438,
+      "step": 2128
+    },
+    {
+      "epoch": 0.6808850866497431,
+      "grad_norm": 0.3318649232387543,
+      "learning_rate": 0.0001466031695689378,
+      "loss": 4.4217,
+      "step": 2129
+    },
+    {
+      "epoch": 0.6812049011573288,
+      "grad_norm": 0.3329257071018219,
+      "learning_rate": 0.0001463361164662546,
+      "loss": 4.413,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6815247156649143,
+      "grad_norm": 0.3311763107776642,
+      "learning_rate": 0.00014606922833666476,
+      "loss": 4.4273,
+      "step": 2131
+    },
+    {
+      "epoch": 0.6818445301724999,
+      "grad_norm": 0.3407959043979645,
+      "learning_rate": 0.00014580250546669836,
+      "loss": 4.4846,
+      "step": 2132
+    },
+    {
+      "epoch": 0.6821643446800856,
+      "grad_norm": 0.34549182653427124,
+      "learning_rate": 0.0001455359481427085,
+      "loss": 4.5137,
+      "step": 2133
+    },
+    {
+      "epoch": 0.6824841591876711,
+      "grad_norm": 0.33701956272125244,
+      "learning_rate": 0.00014526955665087013,
+      "loss": 4.403,
+      "step": 2134
+    },
+    {
+      "epoch": 0.6828039736952568,
+      "grad_norm": 0.327288419008255,
+      "learning_rate": 0.00014500333127718035,
+      "loss": 4.4906,
+      "step": 2135
+    },
+    {
+      "epoch": 0.6831237882028424,
+      "grad_norm": 0.33447083830833435,
+      "learning_rate": 0.00014473727230745833,
+      "loss": 4.4476,
+      "step": 2136
+    },
+    {
+      "epoch": 0.6834436027104279,
+      "grad_norm": 0.3489927053451538,
+      "learning_rate": 0.0001444713800273438,
+      "loss": 4.4885,
+      "step": 2137
+    },
+    {
+      "epoch": 0.6837634172180136,
+      "grad_norm": 0.3353646397590637,
+      "learning_rate": 0.0001442056547222982,
+      "loss": 4.496,
+      "step": 2138
+    },
+    {
+      "epoch": 0.6840832317255992,
+      "grad_norm": 0.33472129702568054,
+      "learning_rate": 0.0001439400966776032,
+      "loss": 4.4526,
+      "step": 2139
+    },
+    {
+      "epoch": 0.6844030462331847,
+      "grad_norm": 0.34008723497390747,
+      "learning_rate": 0.00014367470617836117,
+      "loss": 4.5634,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6847228607407704,
+      "grad_norm": 0.3388221859931946,
+      "learning_rate": 0.00014340948350949467,
+      "loss": 4.5802,
+      "step": 2141
+    },
+    {
+      "epoch": 0.6850426752483559,
+      "grad_norm": 0.3305363953113556,
+      "learning_rate": 0.00014314442895574595,
+      "loss": 4.495,
+      "step": 2142
+    },
+    {
+      "epoch": 0.6853624897559416,
+      "grad_norm": 0.3456083834171295,
+      "learning_rate": 0.00014287954280167695,
+      "loss": 4.4121,
+      "step": 2143
+    },
+    {
+      "epoch": 0.6856823042635272,
+      "grad_norm": 0.3401589095592499,
+      "learning_rate": 0.00014261482533166832,
+      "loss": 4.3316,
+      "step": 2144
+    },
+    {
+      "epoch": 0.6860021187711127,
+      "grad_norm": 0.3346431851387024,
+      "learning_rate": 0.0001423502768299202,
+      "loss": 4.4217,
+      "step": 2145
+    },
+    {
+      "epoch": 0.6863219332786984,
+      "grad_norm": 0.3352597951889038,
+      "learning_rate": 0.00014208589758045098,
+      "loss": 4.4556,
+      "step": 2146
+    },
+    {
+      "epoch": 0.686641747786284,
+      "grad_norm": 0.3395027816295624,
+      "learning_rate": 0.00014182168786709755,
+      "loss": 4.4635,
+      "step": 2147
+    },
+    {
+      "epoch": 0.6869615622938695,
+      "grad_norm": 0.3329550623893738,
+      "learning_rate": 0.00014155764797351472,
+      "loss": 4.5404,
+      "step": 2148
+    },
+    {
+      "epoch": 0.6872813768014552,
+      "grad_norm": 0.33801010251045227,
+      "learning_rate": 0.0001412937781831747,
+      "loss": 4.5546,
+      "step": 2149
+    },
+    {
+      "epoch": 0.6876011913090407,
+      "grad_norm": 0.3325575590133667,
+      "learning_rate": 0.0001410300787793675,
+      "loss": 4.4383,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6879210058166264,
+      "grad_norm": 0.33519449830055237,
+      "learning_rate": 0.00014076655004519997,
+      "loss": 4.4429,
+      "step": 2151
+    },
+    {
+      "epoch": 0.688240820324212,
+      "grad_norm": 0.3456709682941437,
+      "learning_rate": 0.00014050319226359593,
+      "loss": 4.5446,
+      "step": 2152
+    },
+    {
+      "epoch": 0.6885606348317975,
+      "grad_norm": 0.3320106565952301,
+      "learning_rate": 0.00014024000571729526,
+      "loss": 4.4131,
+      "step": 2153
+    },
+    {
+      "epoch": 0.6888804493393832,
+      "grad_norm": 0.32259705662727356,
+      "learning_rate": 0.00013997699068885443,
+      "loss": 4.4047,
+      "step": 2154
+    },
+    {
+      "epoch": 0.6892002638469688,
+      "grad_norm": 0.32922127842903137,
+      "learning_rate": 0.00013971414746064554,
+      "loss": 4.4786,
+      "step": 2155
+    },
+    {
+      "epoch": 0.6895200783545543,
+      "grad_norm": 0.3358190655708313,
+      "learning_rate": 0.00013945147631485634,
+      "loss": 4.3915,
+      "step": 2156
+    },
+    {
+      "epoch": 0.68983989286214,
+      "grad_norm": 0.34597843885421753,
+      "learning_rate": 0.00013918897753348991,
+      "loss": 4.4127,
+      "step": 2157
+    },
+    {
+      "epoch": 0.6901597073697255,
+      "grad_norm": 0.34270811080932617,
+      "learning_rate": 0.00013892665139836392,
+      "loss": 4.4276,
+      "step": 2158
+    },
+    {
+      "epoch": 0.6904795218773112,
+      "grad_norm": 0.33038774132728577,
+      "learning_rate": 0.0001386644981911111,
+      "loss": 4.5058,
+      "step": 2159
+    },
+    {
+      "epoch": 0.6907993363848968,
+      "grad_norm": 0.5337318181991577,
+      "learning_rate": 0.00013840251819317832,
+      "loss": 4.437,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6911191508924823,
+      "grad_norm": 0.3278668522834778,
+      "learning_rate": 0.00013814071168582654,
+      "loss": 4.4028,
+      "step": 2161
+    },
+    {
+      "epoch": 0.691438965400068,
+      "grad_norm": 0.3297139108181,
+      "learning_rate": 0.00013787907895013054,
+      "loss": 4.4416,
+      "step": 2162
+    },
+    {
+      "epoch": 0.6917587799076536,
+      "grad_norm": 0.33460527658462524,
+      "learning_rate": 0.0001376176202669783,
+      "loss": 4.3965,
+      "step": 2163
+    },
+    {
+      "epoch": 0.6920785944152391,
+      "grad_norm": 0.33878377079963684,
+      "learning_rate": 0.00013735633591707117,
+      "loss": 4.5049,
+      "step": 2164
+    },
+    {
+      "epoch": 0.6923984089228248,
+      "grad_norm": 0.3428264558315277,
+      "learning_rate": 0.00013709522618092328,
+      "loss": 4.4022,
+      "step": 2165
+    },
+    {
+      "epoch": 0.6927182234304103,
+      "grad_norm": 0.33774280548095703,
+      "learning_rate": 0.00013683429133886122,
+      "loss": 4.4205,
+      "step": 2166
+    },
+    {
+      "epoch": 0.693038037937996,
+      "grad_norm": 0.33137834072113037,
+      "learning_rate": 0.00013657353167102401,
+      "loss": 4.4648,
+      "step": 2167
+    },
+    {
+      "epoch": 0.6933578524455816,
+      "grad_norm": 0.33991193771362305,
+      "learning_rate": 0.00013631294745736227,
+      "loss": 4.4886,
+      "step": 2168
+    },
+    {
+      "epoch": 0.6936776669531671,
+      "grad_norm": 0.3367961049079895,
+      "learning_rate": 0.0001360525389776385,
+      "loss": 4.4017,
+      "step": 2169
+    },
+    {
+      "epoch": 0.6939974814607528,
+      "grad_norm": 0.3382626175880432,
+      "learning_rate": 0.00013579230651142654,
+      "loss": 4.4184,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6943172959683384,
+      "grad_norm": 0.3440368175506592,
+      "learning_rate": 0.00013553225033811114,
+      "loss": 4.4781,
+      "step": 2171
+    },
+    {
+      "epoch": 0.6946371104759239,
+      "grad_norm": 0.3414704501628876,
+      "learning_rate": 0.00013527237073688797,
+      "loss": 4.5359,
+      "step": 2172
+    },
+    {
+      "epoch": 0.6949569249835096,
+      "grad_norm": 0.3314661383628845,
+      "learning_rate": 0.00013501266798676283,
+      "loss": 4.4856,
+      "step": 2173
+    },
+    {
+      "epoch": 0.6952767394910951,
+      "grad_norm": 0.3305390477180481,
+      "learning_rate": 0.000134753142366552,
+      "loss": 4.4384,
+      "step": 2174
+    },
+    {
+      "epoch": 0.6955965539986808,
+      "grad_norm": 0.33603882789611816,
+      "learning_rate": 0.0001344937941548811,
+      "loss": 4.4986,
+      "step": 2175
+    },
+    {
+      "epoch": 0.6959163685062664,
+      "grad_norm": 0.3437805771827698,
+      "learning_rate": 0.00013423462363018604,
+      "loss": 4.3386,
+      "step": 2176
+    },
+    {
+      "epoch": 0.6962361830138519,
+      "grad_norm": 0.328469842672348,
+      "learning_rate": 0.00013397563107071125,
+      "loss": 4.412,
+      "step": 2177
+    },
+    {
+      "epoch": 0.6965559975214376,
+      "grad_norm": 0.3432283103466034,
+      "learning_rate": 0.0001337168167545104,
+      "loss": 4.5509,
+      "step": 2178
+    },
+    {
+      "epoch": 0.6968758120290232,
+      "grad_norm": 0.3334380090236664,
+      "learning_rate": 0.000133458180959446,
+      "loss": 4.4809,
+      "step": 2179
+    },
+    {
+      "epoch": 0.6971956265366087,
+      "grad_norm": 0.32941102981567383,
+      "learning_rate": 0.00013319972396318828,
+      "loss": 4.4263,
+      "step": 2180
+    },
+    {
+      "epoch": 0.6975154410441944,
+      "grad_norm": 0.34028035402297974,
+      "learning_rate": 0.00013294144604321633,
+      "loss": 4.4874,
+      "step": 2181
+    },
+    {
+      "epoch": 0.6978352555517799,
+      "grad_norm": 0.34419766068458557,
+      "learning_rate": 0.00013268334747681626,
+      "loss": 4.4144,
+      "step": 2182
+    },
+    {
+      "epoch": 0.6981550700593656,
+      "grad_norm": 0.32000261545181274,
+      "learning_rate": 0.0001324254285410821,
+      "loss": 4.4725,
+      "step": 2183
+    },
+    {
+      "epoch": 0.6984748845669512,
+      "grad_norm": 0.3259260058403015,
+      "learning_rate": 0.0001321676895129149,
+      "loss": 4.4092,
+      "step": 2184
+    },
+    {
+      "epoch": 0.6987946990745367,
+      "grad_norm": 0.33444148302078247,
+      "learning_rate": 0.0001319101306690222,
+      "loss": 4.3921,
+      "step": 2185
+    },
+    {
+      "epoch": 0.6991145135821224,
+      "grad_norm": 0.3239077627658844,
+      "learning_rate": 0.0001316527522859189,
+      "loss": 4.4585,
+      "step": 2186
+    },
+    {
+      "epoch": 0.699434328089708,
+      "grad_norm": 0.32290899753570557,
+      "learning_rate": 0.00013139555463992527,
+      "loss": 4.3708,
+      "step": 2187
+    },
+    {
+      "epoch": 0.6997541425972935,
+      "grad_norm": 0.33842989802360535,
+      "learning_rate": 0.00013113853800716824,
+      "loss": 4.4469,
+      "step": 2188
+    },
+    {
+      "epoch": 0.7000739571048792,
+      "grad_norm": 0.32952606678009033,
+      "learning_rate": 0.00013088170266357986,
+      "loss": 4.4598,
+      "step": 2189
+    },
+    {
+      "epoch": 0.7003937716124647,
+      "grad_norm": 0.3406091034412384,
+      "learning_rate": 0.00013062504888489788,
+      "loss": 4.484,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7007135861200504,
+      "grad_norm": 0.3301216661930084,
+      "learning_rate": 0.0001303685769466651,
+      "loss": 4.5346,
+      "step": 2191
+    },
+    {
+      "epoch": 0.701033400627636,
+      "grad_norm": 0.34070494771003723,
+      "learning_rate": 0.00013011228712422898,
+      "loss": 4.4581,
+      "step": 2192
+    },
+    {
+      "epoch": 0.7013532151352215,
+      "grad_norm": 0.3262787163257599,
+      "learning_rate": 0.0001298561796927417,
+      "loss": 4.5433,
+      "step": 2193
+    },
+    {
+      "epoch": 0.7016730296428072,
+      "grad_norm": 0.33801570534706116,
+      "learning_rate": 0.00012960025492715914,
+      "loss": 4.4782,
+      "step": 2194
+    },
+    {
+      "epoch": 0.7019928441503928,
+      "grad_norm": 0.3243328630924225,
+      "learning_rate": 0.0001293445131022416,
+      "loss": 4.4121,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7023126586579783,
+      "grad_norm": 0.323103666305542,
+      "learning_rate": 0.00012908895449255262,
+      "loss": 4.3293,
+      "step": 2196
+    },
+    {
+      "epoch": 0.702632473165564,
+      "grad_norm": 0.3357979953289032,
+      "learning_rate": 0.0001288335793724592,
+      "loss": 4.4821,
+      "step": 2197
+    },
+    {
+      "epoch": 0.7029522876731495,
+      "grad_norm": 0.3401612341403961,
+      "learning_rate": 0.00012857838801613153,
+      "loss": 4.4517,
+      "step": 2198
+    },
+    {
+      "epoch": 0.7032721021807352,
+      "grad_norm": 0.3437880873680115,
+      "learning_rate": 0.000128323380697542,
+      "loss": 4.3027,
+      "step": 2199
+    },
+    {
+      "epoch": 0.7035919166883208,
+      "grad_norm": 0.32506972551345825,
+      "learning_rate": 0.0001280685576904658,
+      "loss": 4.4328,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7035919166883208,
+      "eval_loss": 4.454440593719482,
+      "eval_runtime": 96.9049,
+      "eval_samples_per_second": 19.576,
+      "eval_steps_per_second": 4.902,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7039117311959063,
+      "grad_norm": 0.3431197702884674,
+      "learning_rate": 0.0001278139192684802,
+      "loss": 4.4512,
+      "step": 2201
+    },
+    {
+      "epoch": 0.704231545703492,
+      "grad_norm": 0.338541716337204,
+      "learning_rate": 0.00012755946570496427,
+      "loss": 4.4354,
+      "step": 2202
+    },
+    {
+      "epoch": 0.7045513602110776,
+      "grad_norm": 0.3421216905117035,
+      "learning_rate": 0.0001273051972730987,
+      "loss": 4.3548,
+      "step": 2203
+    },
+    {
+      "epoch": 0.7048711747186632,
+      "grad_norm": 0.352405309677124,
+      "learning_rate": 0.00012705111424586512,
+      "loss": 4.5309,
+      "step": 2204
+    },
+    {
+      "epoch": 0.7051909892262488,
+      "grad_norm": 0.34158819913864136,
+      "learning_rate": 0.00012679721689604642,
+      "loss": 4.4749,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7055108037338343,
+      "grad_norm": 0.3365214765071869,
+      "learning_rate": 0.00012654350549622605,
+      "loss": 4.4697,
+      "step": 2206
+    },
+    {
+      "epoch": 0.70583061824142,
+      "grad_norm": 0.3539983034133911,
+      "learning_rate": 0.00012628998031878784,
+      "loss": 4.4178,
+      "step": 2207
+    },
+    {
+      "epoch": 0.7061504327490056,
+      "grad_norm": 0.34218186140060425,
+      "learning_rate": 0.00012603664163591573,
+      "loss": 4.5161,
+      "step": 2208
+    },
+    {
+      "epoch": 0.7064702472565911,
+      "grad_norm": 0.34383469820022583,
+      "learning_rate": 0.00012578348971959324,
+      "loss": 4.3953,
+      "step": 2209
+    },
+    {
+      "epoch": 0.7067900617641768,
+      "grad_norm": 0.3489435911178589,
+      "learning_rate": 0.0001255305248416036,
+      "loss": 4.3898,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7071098762717624,
+      "grad_norm": 0.3475740849971771,
+      "learning_rate": 0.0001252777472735291,
+      "loss": 4.4867,
+      "step": 2211
+    },
+    {
+      "epoch": 0.707429690779348,
+      "grad_norm": 0.33726832270622253,
+      "learning_rate": 0.00012502515728675124,
+      "loss": 4.4221,
+      "step": 2212
+    },
+    {
+      "epoch": 0.7077495052869336,
+      "grad_norm": 0.33398839831352234,
+      "learning_rate": 0.00012477275515244951,
+      "loss": 4.4643,
+      "step": 2213
+    },
+    {
+      "epoch": 0.7080693197945191,
+      "grad_norm": 0.33227816224098206,
+      "learning_rate": 0.00012452054114160232,
+      "loss": 4.4726,
+      "step": 2214
+    },
+    {
+      "epoch": 0.7083891343021048,
+      "grad_norm": 0.348752498626709,
+      "learning_rate": 0.00012426851552498584,
+      "loss": 4.4492,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7087089488096904,
+      "grad_norm": 0.3545939326286316,
+      "learning_rate": 0.00012401667857317406,
+      "loss": 4.4339,
+      "step": 2216
+    },
+    {
+      "epoch": 0.7090287633172759,
+      "grad_norm": 0.3521256148815155,
+      "learning_rate": 0.0001237650305565385,
+      "loss": 4.5827,
+      "step": 2217
+    },
+    {
+      "epoch": 0.7093485778248616,
+      "grad_norm": 0.33230698108673096,
+      "learning_rate": 0.00012351357174524745,
+      "loss": 4.4767,
+      "step": 2218
+    },
+    {
+      "epoch": 0.7096683923324472,
+      "grad_norm": 0.3276318311691284,
+      "learning_rate": 0.00012326230240926653,
+      "loss": 4.5138,
+      "step": 2219
+    },
+    {
+      "epoch": 0.7099882068400328,
+      "grad_norm": 0.3415897488594055,
+      "learning_rate": 0.00012301122281835772,
+      "loss": 4.4965,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7103080213476184,
+      "grad_norm": 0.3361230790615082,
+      "learning_rate": 0.00012276033324207935,
+      "loss": 4.2935,
+      "step": 2221
+    },
+    {
+      "epoch": 0.710627835855204,
+      "grad_norm": 0.32287877798080444,
+      "learning_rate": 0.00012250963394978584,
+      "loss": 4.4713,
+      "step": 2222
+    },
+    {
+      "epoch": 0.7109476503627896,
+      "grad_norm": 0.3255848288536072,
+      "learning_rate": 0.00012225912521062702,
+      "loss": 4.492,
+      "step": 2223
+    },
+    {
+      "epoch": 0.7112674648703752,
+      "grad_norm": 0.3354206681251526,
+      "learning_rate": 0.00012200880729354847,
+      "loss": 4.475,
+      "step": 2224
+    },
+    {
+      "epoch": 0.7115872793779607,
+      "grad_norm": 0.3278037905693054,
+      "learning_rate": 0.0001217586804672905,
+      "loss": 4.4227,
+      "step": 2225
+    },
+    {
+      "epoch": 0.7119070938855464,
+      "grad_norm": 0.3404330611228943,
+      "learning_rate": 0.0001215087450003889,
+      "loss": 4.5188,
+      "step": 2226
+    },
+    {
+      "epoch": 0.712226908393132,
+      "grad_norm": 0.3332688510417938,
+      "learning_rate": 0.00012125900116117357,
+      "loss": 4.2328,
+      "step": 2227
+    },
+    {
+      "epoch": 0.7125467229007176,
+      "grad_norm": 0.33184289932250977,
+      "learning_rate": 0.0001210094492177686,
+      "loss": 4.3894,
+      "step": 2228
+    },
+    {
+      "epoch": 0.7128665374083032,
+      "grad_norm": 0.32943254709243774,
+      "learning_rate": 0.00012076008943809238,
+      "loss": 4.4396,
+      "step": 2229
+    },
+    {
+      "epoch": 0.7131863519158887,
+      "grad_norm": 0.32169508934020996,
+      "learning_rate": 0.00012051092208985671,
+      "loss": 4.452,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7135061664234744,
+      "grad_norm": 0.3315311372280121,
+      "learning_rate": 0.00012026194744056684,
+      "loss": 4.4236,
+      "step": 2231
+    },
+    {
+      "epoch": 0.71382598093106,
+      "grad_norm": 0.33657577633857727,
+      "learning_rate": 0.00012001316575752159,
+      "loss": 4.4091,
+      "step": 2232
+    },
+    {
+      "epoch": 0.7141457954386455,
+      "grad_norm": 0.339559406042099,
+      "learning_rate": 0.00011976457730781191,
+      "loss": 4.5155,
+      "step": 2233
+    },
+    {
+      "epoch": 0.7144656099462312,
+      "grad_norm": 0.3332586884498596,
+      "learning_rate": 0.00011951618235832183,
+      "loss": 4.418,
+      "step": 2234
+    },
+    {
+      "epoch": 0.7147854244538168,
+      "grad_norm": 0.3349529206752777,
+      "learning_rate": 0.00011926798117572722,
+      "loss": 4.5207,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7151052389614024,
+      "grad_norm": 0.35063642263412476,
+      "learning_rate": 0.00011901997402649629,
+      "loss": 4.4715,
+      "step": 2236
+    },
+    {
+      "epoch": 0.715425053468988,
+      "grad_norm": 0.340107262134552,
+      "learning_rate": 0.00011877216117688875,
+      "loss": 4.4252,
+      "step": 2237
+    },
+    {
+      "epoch": 0.7157448679765736,
+      "grad_norm": 0.3303718864917755,
+      "learning_rate": 0.00011852454289295575,
+      "loss": 4.3497,
+      "step": 2238
+    },
+    {
+      "epoch": 0.7160646824841592,
+      "grad_norm": 0.3243018686771393,
+      "learning_rate": 0.00011827711944053962,
+      "loss": 4.3824,
+      "step": 2239
+    },
+    {
+      "epoch": 0.7163844969917448,
+      "grad_norm": 0.3365945816040039,
+      "learning_rate": 0.00011802989108527331,
+      "loss": 4.5189,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7167043114993303,
+      "grad_norm": 0.33397188782691956,
+      "learning_rate": 0.00011778285809258052,
+      "loss": 4.4392,
+      "step": 2241
+    },
+    {
+      "epoch": 0.717024126006916,
+      "grad_norm": 0.33258089423179626,
+      "learning_rate": 0.00011753602072767514,
+      "loss": 4.4494,
+      "step": 2242
+    },
+    {
+      "epoch": 0.7173439405145016,
+      "grad_norm": 0.3365824222564697,
+      "learning_rate": 0.00011728937925556107,
+      "loss": 4.3737,
+      "step": 2243
+    },
+    {
+      "epoch": 0.7176637550220872,
+      "grad_norm": 0.354343980550766,
+      "learning_rate": 0.00011704293394103194,
+      "loss": 4.4178,
+      "step": 2244
+    },
+    {
+      "epoch": 0.7179835695296728,
+      "grad_norm": 0.3422267735004425,
+      "learning_rate": 0.00011679668504867051,
+      "loss": 4.4577,
+      "step": 2245
+    },
+    {
+      "epoch": 0.7183033840372584,
+      "grad_norm": 0.3237408697605133,
+      "learning_rate": 0.00011655063284284901,
+      "loss": 4.356,
+      "step": 2246
+    },
+    {
+      "epoch": 0.718623198544844,
+      "grad_norm": 0.33587223291397095,
+      "learning_rate": 0.0001163047775877283,
+      "loss": 4.3869,
+      "step": 2247
+    },
+    {
+      "epoch": 0.7189430130524296,
+      "grad_norm": 0.3292006552219391,
+      "learning_rate": 0.00011605911954725802,
+      "loss": 4.3966,
+      "step": 2248
+    },
+    {
+      "epoch": 0.7192628275600151,
+      "grad_norm": 0.3411811590194702,
+      "learning_rate": 0.00011581365898517567,
+      "loss": 4.327,
+      "step": 2249
+    },
+    {
+      "epoch": 0.7195826420676008,
+      "grad_norm": 0.3547792434692383,
+      "learning_rate": 0.0001155683961650071,
+      "loss": 4.5197,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7199024565751864,
+      "grad_norm": 0.33441030979156494,
+      "learning_rate": 0.00011532333135006579,
+      "loss": 4.3558,
+      "step": 2251
+    },
+    {
+      "epoch": 0.720222271082772,
+      "grad_norm": 0.3281584680080414,
+      "learning_rate": 0.00011507846480345255,
+      "loss": 4.4791,
+      "step": 2252
+    },
+    {
+      "epoch": 0.7205420855903576,
+      "grad_norm": 0.3454711139202118,
+      "learning_rate": 0.00011483379678805551,
+      "loss": 4.3724,
+      "step": 2253
+    },
+    {
+      "epoch": 0.7208619000979432,
+      "grad_norm": 0.33835241198539734,
+      "learning_rate": 0.00011458932756654938,
+      "loss": 4.4171,
+      "step": 2254
+    },
+    {
+      "epoch": 0.7211817146055288,
+      "grad_norm": 0.3432719111442566,
+      "learning_rate": 0.00011434505740139558,
+      "loss": 4.4025,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7215015291131144,
+      "grad_norm": 0.34123605489730835,
+      "learning_rate": 0.00011410098655484194,
+      "loss": 4.4138,
+      "step": 2256
+    },
+    {
+      "epoch": 0.7218213436206999,
+      "grad_norm": 0.34249627590179443,
+      "learning_rate": 0.00011385711528892216,
+      "loss": 4.4132,
+      "step": 2257
+    },
+    {
+      "epoch": 0.7221411581282856,
+      "grad_norm": 0.3301515579223633,
+      "learning_rate": 0.00011361344386545585,
+      "loss": 4.4961,
+      "step": 2258
+    },
+    {
+      "epoch": 0.7224609726358712,
+      "grad_norm": 0.33210471272468567,
+      "learning_rate": 0.00011336997254604769,
+      "loss": 4.3918,
+      "step": 2259
+    },
+    {
+      "epoch": 0.7227807871434568,
+      "grad_norm": 0.3336414396762848,
+      "learning_rate": 0.0001131267015920879,
+      "loss": 4.4551,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7231006016510424,
+      "grad_norm": 0.34347960352897644,
+      "learning_rate": 0.0001128836312647514,
+      "loss": 4.3648,
+      "step": 2261
+    },
+    {
+      "epoch": 0.723420416158628,
+      "grad_norm": 0.33358657360076904,
+      "learning_rate": 0.00011264076182499787,
+      "loss": 4.4994,
+      "step": 2262
+    },
+    {
+      "epoch": 0.7237402306662136,
+      "grad_norm": 0.32134950160980225,
+      "learning_rate": 0.00011239809353357127,
+      "loss": 4.4902,
+      "step": 2263
+    },
+    {
+      "epoch": 0.7240600451737992,
+      "grad_norm": 0.3383727967739105,
+      "learning_rate": 0.00011215562665099941,
+      "loss": 4.3234,
+      "step": 2264
+    },
+    {
+      "epoch": 0.7243798596813849,
+      "grad_norm": 0.36297017335891724,
+      "learning_rate": 0.00011191336143759417,
+      "loss": 4.4825,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7246996741889704,
+      "grad_norm": 0.3291953206062317,
+      "learning_rate": 0.00011167129815345048,
+      "loss": 4.4349,
+      "step": 2266
+    },
+    {
+      "epoch": 0.725019488696556,
+      "grad_norm": 0.35270607471466064,
+      "learning_rate": 0.0001114294370584471,
+      "loss": 4.4035,
+      "step": 2267
+    },
+    {
+      "epoch": 0.7253393032041416,
+      "grad_norm": 0.3327333927154541,
+      "learning_rate": 0.00011118777841224534,
+      "loss": 4.444,
+      "step": 2268
+    },
+    {
+      "epoch": 0.7256591177117272,
+      "grad_norm": 0.3331069350242615,
+      "learning_rate": 0.00011094632247428907,
+      "loss": 4.3419,
+      "step": 2269
+    },
+    {
+      "epoch": 0.7259789322193128,
+      "grad_norm": 0.33473634719848633,
+      "learning_rate": 0.00011070506950380483,
+      "loss": 4.3794,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7262987467268984,
+      "grad_norm": 0.3344692885875702,
+      "learning_rate": 0.0001104640197598008,
+      "loss": 4.4044,
+      "step": 2271
+    },
+    {
+      "epoch": 0.726618561234484,
+      "grad_norm": 0.3378191888332367,
+      "learning_rate": 0.00011022317350106774,
+      "loss": 4.4291,
+      "step": 2272
+    },
+    {
+      "epoch": 0.7269383757420697,
+      "grad_norm": 0.3254040479660034,
+      "learning_rate": 0.00010998253098617707,
+      "loss": 4.4667,
+      "step": 2273
+    },
+    {
+      "epoch": 0.7272581902496552,
+      "grad_norm": 0.3407931327819824,
+      "learning_rate": 0.00010974209247348211,
+      "loss": 4.439,
+      "step": 2274
+    },
+    {
+      "epoch": 0.7275780047572408,
+      "grad_norm": 0.3590623140335083,
+      "learning_rate": 0.00010950185822111697,
+      "loss": 4.3551,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7278978192648264,
+      "grad_norm": 0.3413101136684418,
+      "learning_rate": 0.00010926182848699613,
+      "loss": 4.4105,
+      "step": 2276
+    },
+    {
+      "epoch": 0.728217633772412,
+      "grad_norm": 0.33191800117492676,
+      "learning_rate": 0.00010902200352881522,
+      "loss": 4.5755,
+      "step": 2277
+    },
+    {
+      "epoch": 0.7285374482799976,
+      "grad_norm": 0.3360489010810852,
+      "learning_rate": 0.00010878238360404934,
+      "loss": 4.383,
+      "step": 2278
+    },
+    {
+      "epoch": 0.7288572627875832,
+      "grad_norm": 0.33308154344558716,
+      "learning_rate": 0.00010854296896995379,
+      "loss": 4.4871,
+      "step": 2279
+    },
+    {
+      "epoch": 0.7291770772951688,
+      "grad_norm": 0.3325072228908539,
+      "learning_rate": 0.00010830375988356354,
+      "loss": 4.4002,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7294968918027545,
+      "grad_norm": 0.32435134053230286,
+      "learning_rate": 0.00010806475660169243,
+      "loss": 4.3895,
+      "step": 2281
+    },
+    {
+      "epoch": 0.72981670631034,
+      "grad_norm": 0.3442571461200714,
+      "learning_rate": 0.00010782595938093417,
+      "loss": 4.4782,
+      "step": 2282
+    },
+    {
+      "epoch": 0.7301365208179256,
+      "grad_norm": 0.3612309992313385,
+      "learning_rate": 0.00010758736847766033,
+      "loss": 4.422,
+      "step": 2283
+    },
+    {
+      "epoch": 0.7304563353255112,
+      "grad_norm": 0.3383840024471283,
+      "learning_rate": 0.00010734898414802169,
+      "loss": 4.4108,
+      "step": 2284
+    },
+    {
+      "epoch": 0.7307761498330968,
+      "grad_norm": 0.3336951434612274,
+      "learning_rate": 0.00010711080664794676,
+      "loss": 4.4591,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7310959643406824,
+      "grad_norm": 0.3335397243499756,
+      "learning_rate": 0.00010687283623314225,
+      "loss": 4.3975,
+      "step": 2286
+    },
+    {
+      "epoch": 0.731415778848268,
+      "grad_norm": 0.39656996726989746,
+      "learning_rate": 0.00010663507315909255,
+      "loss": 4.4759,
+      "step": 2287
+    },
+    {
+      "epoch": 0.7317355933558536,
+      "grad_norm": 0.347331702709198,
+      "learning_rate": 0.00010639751768105936,
+      "loss": 4.445,
+      "step": 2288
+    },
+    {
+      "epoch": 0.7320554078634393,
+      "grad_norm": 0.3473789095878601,
+      "learning_rate": 0.00010616017005408167,
+      "loss": 4.4213,
+      "step": 2289
+    },
+    {
+      "epoch": 0.7323752223710248,
+      "grad_norm": 0.32854992151260376,
+      "learning_rate": 0.00010592303053297499,
+      "loss": 4.3913,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7326950368786104,
+      "grad_norm": 0.35310834646224976,
+      "learning_rate": 0.00010568609937233168,
+      "loss": 4.4185,
+      "step": 2291
+    },
+    {
+      "epoch": 0.733014851386196,
+      "grad_norm": 0.3509746491909027,
+      "learning_rate": 0.00010544937682652035,
+      "loss": 4.4345,
+      "step": 2292
+    },
+    {
+      "epoch": 0.7333346658937816,
+      "grad_norm": 0.3366645276546478,
+      "learning_rate": 0.00010521286314968567,
+      "loss": 4.3897,
+      "step": 2293
+    },
+    {
+      "epoch": 0.7336544804013672,
+      "grad_norm": 0.322303831577301,
+      "learning_rate": 0.00010497655859574809,
+      "loss": 4.4076,
+      "step": 2294
+    },
+    {
+      "epoch": 0.7339742949089528,
+      "grad_norm": 0.3318299949169159,
+      "learning_rate": 0.00010474046341840329,
+      "loss": 4.431,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7342941094165384,
+      "grad_norm": 0.3476223945617676,
+      "learning_rate": 0.00010450457787112246,
+      "loss": 4.4455,
+      "step": 2296
+    },
+    {
+      "epoch": 0.7346139239241241,
+      "grad_norm": 0.34152400493621826,
+      "learning_rate": 0.00010426890220715164,
+      "loss": 4.3992,
+      "step": 2297
+    },
+    {
+      "epoch": 0.7349337384317096,
+      "grad_norm": 0.34011977910995483,
+      "learning_rate": 0.00010403343667951149,
+      "loss": 4.3288,
+      "step": 2298
+    },
+    {
+      "epoch": 0.7352535529392952,
+      "grad_norm": 0.3339643180370331,
+      "learning_rate": 0.00010379818154099724,
+      "loss": 4.4182,
+      "step": 2299
+    },
+    {
+      "epoch": 0.7355733674468808,
+      "grad_norm": 0.33768823742866516,
+      "learning_rate": 0.00010356313704417794,
+      "loss": 4.4182,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7355733674468808,
+      "eval_loss": 4.4368157386779785,
+      "eval_runtime": 89.2711,
+      "eval_samples_per_second": 21.25,
+      "eval_steps_per_second": 5.321,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7358931819544664,
+      "grad_norm": 0.33821696043014526,
+      "learning_rate": 0.0001033283034413967,
+      "loss": 4.3648,
+      "step": 2301
+    },
+    {
+      "epoch": 0.736212996462052,
+      "grad_norm": 0.3329372704029083,
+      "learning_rate": 0.00010309368098477025,
+      "loss": 4.4616,
+      "step": 2302
+    },
+    {
+      "epoch": 0.7365328109696376,
+      "grad_norm": 0.3347349464893341,
+      "learning_rate": 0.00010285926992618855,
+      "loss": 4.374,
+      "step": 2303
+    },
+    {
+      "epoch": 0.7368526254772232,
+      "grad_norm": 0.3423709273338318,
+      "learning_rate": 0.0001026250705173147,
+      "loss": 4.374,
+      "step": 2304
+    },
+    {
+      "epoch": 0.7371724399848089,
+      "grad_norm": 0.3551722466945648,
+      "learning_rate": 0.00010239108300958432,
+      "loss": 4.4644,
+      "step": 2305
+    },
+    {
+      "epoch": 0.7374922544923944,
+      "grad_norm": 0.3246476948261261,
+      "learning_rate": 0.00010215730765420579,
+      "loss": 4.4353,
+      "step": 2306
+    },
+    {
+      "epoch": 0.73781206899998,
+      "grad_norm": 0.3443451523780823,
+      "learning_rate": 0.00010192374470215969,
+      "loss": 4.4584,
+      "step": 2307
+    },
+    {
+      "epoch": 0.7381318835075656,
+      "grad_norm": 0.3336166441440582,
+      "learning_rate": 0.00010169039440419855,
+      "loss": 4.4641,
+      "step": 2308
+    },
+    {
+      "epoch": 0.7384516980151512,
+      "grad_norm": 0.34944331645965576,
+      "learning_rate": 0.00010145725701084643,
+      "loss": 4.3898,
+      "step": 2309
+    },
+    {
+      "epoch": 0.7387715125227368,
+      "grad_norm": 0.34261205792427063,
+      "learning_rate": 0.000101224332772399,
+      "loss": 4.3359,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7390913270303224,
+      "grad_norm": 0.34464573860168457,
+      "learning_rate": 0.00010099162193892303,
+      "loss": 4.4617,
+      "step": 2311
+    },
+    {
+      "epoch": 0.739411141537908,
+      "grad_norm": 0.3316618800163269,
+      "learning_rate": 0.00010075912476025623,
+      "loss": 4.4845,
+      "step": 2312
+    },
+    {
+      "epoch": 0.7397309560454937,
+      "grad_norm": 0.32946211099624634,
+      "learning_rate": 0.000100526841486007,
+      "loss": 4.3392,
+      "step": 2313
+    },
+    {
+      "epoch": 0.7400507705530792,
+      "grad_norm": 0.34942230582237244,
+      "learning_rate": 0.00010029477236555372,
+      "loss": 4.4224,
+      "step": 2314
+    },
+    {
+      "epoch": 0.7403705850606648,
+      "grad_norm": 0.32532206177711487,
+      "learning_rate": 0.00010006291764804523,
+      "loss": 4.4112,
+      "step": 2315
+    },
+    {
+      "epoch": 0.7406903995682504,
+      "grad_norm": 0.3478979766368866,
+      "learning_rate": 9.98312775824001e-05,
+      "loss": 4.3782,
+      "step": 2316
+    },
+    {
+      "epoch": 0.741010214075836,
+      "grad_norm": 0.3328978419303894,
+      "learning_rate": 9.959985241730641e-05,
+      "loss": 4.426,
+      "step": 2317
+    },
+    {
+      "epoch": 0.7413300285834216,
+      "grad_norm": 0.33060571551322937,
+      "learning_rate": 9.936864240122164e-05,
+      "loss": 4.3671,
+      "step": 2318
+    },
+    {
+      "epoch": 0.7416498430910072,
+      "grad_norm": 0.3444348871707916,
+      "learning_rate": 9.913764778237196e-05,
+      "loss": 4.3798,
+      "step": 2319
+    },
+    {
+      "epoch": 0.7419696575985928,
+      "grad_norm": 0.3299075961112976,
+      "learning_rate": 9.890686880875274e-05,
+      "loss": 4.5089,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7422894721061785,
+      "grad_norm": 0.347756028175354,
+      "learning_rate": 9.86763057281273e-05,
+      "loss": 4.3846,
+      "step": 2321
+    },
+    {
+      "epoch": 0.742609286613764,
+      "grad_norm": 0.3371478319168091,
+      "learning_rate": 9.844595878802778e-05,
+      "loss": 4.3443,
+      "step": 2322
+    },
+    {
+      "epoch": 0.7429291011213496,
+      "grad_norm": 0.3344487249851227,
+      "learning_rate": 9.821582823575398e-05,
+      "loss": 4.4405,
+      "step": 2323
+    },
+    {
+      "epoch": 0.7432489156289352,
+      "grad_norm": 0.3376719057559967,
+      "learning_rate": 9.79859143183732e-05,
+      "loss": 4.4945,
+      "step": 2324
+    },
+    {
+      "epoch": 0.7435687301365208,
+      "grad_norm": 0.3353802561759949,
+      "learning_rate": 9.77562172827205e-05,
+      "loss": 4.4184,
+      "step": 2325
+    },
+    {
+      "epoch": 0.7438885446441064,
+      "grad_norm": 0.3342118561267853,
+      "learning_rate": 9.752673737539779e-05,
+      "loss": 4.4159,
+      "step": 2326
+    },
+    {
+      "epoch": 0.744208359151692,
+      "grad_norm": 0.3366515338420868,
+      "learning_rate": 9.729747484277402e-05,
+      "loss": 4.3736,
+      "step": 2327
+    },
+    {
+      "epoch": 0.7445281736592776,
+      "grad_norm": 0.3395729959011078,
+      "learning_rate": 9.706842993098503e-05,
+      "loss": 4.4127,
+      "step": 2328
+    },
+    {
+      "epoch": 0.7448479881668633,
+      "grad_norm": 0.33426910638809204,
+      "learning_rate": 9.683960288593249e-05,
+      "loss": 4.3812,
+      "step": 2329
+    },
+    {
+      "epoch": 0.7451678026744488,
+      "grad_norm": 0.35259851813316345,
+      "learning_rate": 9.661099395328463e-05,
+      "loss": 4.3867,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7454876171820344,
+      "grad_norm": 0.3474106192588806,
+      "learning_rate": 9.638260337847513e-05,
+      "loss": 4.3845,
+      "step": 2331
+    },
+    {
+      "epoch": 0.74580743168962,
+      "grad_norm": 0.331312358379364,
+      "learning_rate": 9.615443140670357e-05,
+      "loss": 4.4059,
+      "step": 2332
+    },
+    {
+      "epoch": 0.7461272461972056,
+      "grad_norm": 0.33790484070777893,
+      "learning_rate": 9.592647828293468e-05,
+      "loss": 4.3753,
+      "step": 2333
+    },
+    {
+      "epoch": 0.7464470607047913,
+      "grad_norm": 0.34516483545303345,
+      "learning_rate": 9.569874425189827e-05,
+      "loss": 4.3413,
+      "step": 2334
+    },
+    {
+      "epoch": 0.7467668752123768,
+      "grad_norm": 0.34916952252388,
+      "learning_rate": 9.547122955808902e-05,
+      "loss": 4.5153,
+      "step": 2335
+    },
+    {
+      "epoch": 0.7470866897199624,
+      "grad_norm": 0.3533429801464081,
+      "learning_rate": 9.524393444576585e-05,
+      "loss": 4.4815,
+      "step": 2336
+    },
+    {
+      "epoch": 0.7474065042275481,
+      "grad_norm": 0.32154449820518494,
+      "learning_rate": 9.501685915895218e-05,
+      "loss": 4.3711,
+      "step": 2337
+    },
+    {
+      "epoch": 0.7477263187351336,
+      "grad_norm": 0.33027729392051697,
+      "learning_rate": 9.479000394143543e-05,
+      "loss": 4.396,
+      "step": 2338
+    },
+    {
+      "epoch": 0.7480461332427192,
+      "grad_norm": 0.3365285098552704,
+      "learning_rate": 9.456336903676666e-05,
+      "loss": 4.4671,
+      "step": 2339
+    },
+    {
+      "epoch": 0.7483659477503048,
+      "grad_norm": 0.33891135454177856,
+      "learning_rate": 9.433695468826055e-05,
+      "loss": 4.4509,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7486857622578904,
+      "grad_norm": 0.3212648332118988,
+      "learning_rate": 9.411076113899465e-05,
+      "loss": 4.4377,
+      "step": 2341
+    },
+    {
+      "epoch": 0.7490055767654761,
+      "grad_norm": 0.33179762959480286,
+      "learning_rate": 9.388478863180982e-05,
+      "loss": 4.4276,
+      "step": 2342
+    },
+    {
+      "epoch": 0.7493253912730616,
+      "grad_norm": 0.3465440273284912,
+      "learning_rate": 9.365903740930947e-05,
+      "loss": 4.4863,
+      "step": 2343
+    },
+    {
+      "epoch": 0.7496452057806472,
+      "grad_norm": 0.32808244228363037,
+      "learning_rate": 9.343350771385957e-05,
+      "loss": 4.4296,
+      "step": 2344
+    },
+    {
+      "epoch": 0.7499650202882329,
+      "grad_norm": 0.3241809010505676,
+      "learning_rate": 9.320819978758787e-05,
+      "loss": 4.3921,
+      "step": 2345
+    },
+    {
+      "epoch": 0.7502848347958184,
+      "grad_norm": 0.3299873173236847,
+      "learning_rate": 9.298311387238449e-05,
+      "loss": 4.484,
+      "step": 2346
+    },
+    {
+      "epoch": 0.750604649303404,
+      "grad_norm": 0.3408411145210266,
+      "learning_rate": 9.275825020990092e-05,
+      "loss": 4.4399,
+      "step": 2347
+    },
+    {
+      "epoch": 0.7509244638109897,
+      "grad_norm": 0.3207140564918518,
+      "learning_rate": 9.25336090415502e-05,
+      "loss": 4.3208,
+      "step": 2348
+    },
+    {
+      "epoch": 0.7512442783185752,
+      "grad_norm": 0.33310171961784363,
+      "learning_rate": 9.230919060850645e-05,
+      "loss": 4.5034,
+      "step": 2349
+    },
+    {
+      "epoch": 0.7515640928261609,
+      "grad_norm": 0.344112366437912,
+      "learning_rate": 9.208499515170451e-05,
+      "loss": 4.3852,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7518839073337464,
+      "grad_norm": 0.33422914147377014,
+      "learning_rate": 9.186102291184003e-05,
+      "loss": 4.4972,
+      "step": 2351
+    },
+    {
+      "epoch": 0.752203721841332,
+      "grad_norm": 0.3382188081741333,
+      "learning_rate": 9.163727412936895e-05,
+      "loss": 4.455,
+      "step": 2352
+    },
+    {
+      "epoch": 0.7525235363489177,
+      "grad_norm": 0.3393407464027405,
+      "learning_rate": 9.141374904450733e-05,
+      "loss": 4.4865,
+      "step": 2353
+    },
+    {
+      "epoch": 0.7528433508565032,
+      "grad_norm": 0.34060433506965637,
+      "learning_rate": 9.119044789723108e-05,
+      "loss": 4.3234,
+      "step": 2354
+    },
+    {
+      "epoch": 0.7531631653640888,
+      "grad_norm": 0.33734777569770813,
+      "learning_rate": 9.09673709272755e-05,
+      "loss": 4.5168,
+      "step": 2355
+    },
+    {
+      "epoch": 0.7534829798716745,
+      "grad_norm": 0.3249700963497162,
+      "learning_rate": 9.07445183741355e-05,
+      "loss": 4.3355,
+      "step": 2356
+    },
+    {
+      "epoch": 0.75380279437926,
+      "grad_norm": 0.3302399516105652,
+      "learning_rate": 9.052189047706484e-05,
+      "loss": 4.3356,
+      "step": 2357
+    },
+    {
+      "epoch": 0.7541226088868457,
+      "grad_norm": 0.3629155158996582,
+      "learning_rate": 9.029948747507627e-05,
+      "loss": 4.3506,
+      "step": 2358
+    },
+    {
+      "epoch": 0.7544424233944312,
+      "grad_norm": 0.34194469451904297,
+      "learning_rate": 9.0077309606941e-05,
+      "loss": 4.4767,
+      "step": 2359
+    },
+    {
+      "epoch": 0.7547622379020168,
+      "grad_norm": 0.3354686498641968,
+      "learning_rate": 8.985535711118844e-05,
+      "loss": 4.4364,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7550820524096025,
+      "grad_norm": 0.34276947379112244,
+      "learning_rate": 8.963363022610623e-05,
+      "loss": 4.3009,
+      "step": 2361
+    },
+    {
+      "epoch": 0.755401866917188,
+      "grad_norm": 0.33784785866737366,
+      "learning_rate": 8.941212918973952e-05,
+      "loss": 4.4614,
+      "step": 2362
+    },
+    {
+      "epoch": 0.7557216814247736,
+      "grad_norm": 0.32760488986968994,
+      "learning_rate": 8.919085423989135e-05,
+      "loss": 4.3488,
+      "step": 2363
+    },
+    {
+      "epoch": 0.7560414959323593,
+      "grad_norm": 0.32915598154067993,
+      "learning_rate": 8.896980561412196e-05,
+      "loss": 4.4392,
+      "step": 2364
+    },
+    {
+      "epoch": 0.7563613104399448,
+      "grad_norm": 0.33573564887046814,
+      "learning_rate": 8.874898354974821e-05,
+      "loss": 4.3963,
+      "step": 2365
+    },
+    {
+      "epoch": 0.7566811249475305,
+      "grad_norm": 0.3680020570755005,
+      "learning_rate": 8.85283882838443e-05,
+      "loss": 4.4194,
+      "step": 2366
+    },
+    {
+      "epoch": 0.757000939455116,
+      "grad_norm": 0.31760096549987793,
+      "learning_rate": 8.830802005324031e-05,
+      "loss": 4.3387,
+      "step": 2367
+    },
+    {
+      "epoch": 0.7573207539627016,
+      "grad_norm": 0.32655471563339233,
+      "learning_rate": 8.808787909452334e-05,
+      "loss": 4.3461,
+      "step": 2368
+    },
+    {
+      "epoch": 0.7576405684702873,
+      "grad_norm": 0.32854411005973816,
+      "learning_rate": 8.786796564403575e-05,
+      "loss": 4.354,
+      "step": 2369
+    },
+    {
+      "epoch": 0.7579603829778728,
+      "grad_norm": 0.35439810156822205,
+      "learning_rate": 8.764827993787613e-05,
+      "loss": 4.4464,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7582801974854584,
+      "grad_norm": 0.34276890754699707,
+      "learning_rate": 8.742882221189844e-05,
+      "loss": 4.4719,
+      "step": 2371
+    },
+    {
+      "epoch": 0.758600011993044,
+      "grad_norm": 0.33621183037757874,
+      "learning_rate": 8.720959270171162e-05,
+      "loss": 4.3776,
+      "step": 2372
+    },
+    {
+      "epoch": 0.7589198265006296,
+      "grad_norm": 0.34009408950805664,
+      "learning_rate": 8.699059164268015e-05,
+      "loss": 4.4472,
+      "step": 2373
+    },
+    {
+      "epoch": 0.7592396410082153,
+      "grad_norm": 0.32150155305862427,
+      "learning_rate": 8.677181926992271e-05,
+      "loss": 4.3584,
+      "step": 2374
+    },
+    {
+      "epoch": 0.7595594555158008,
+      "grad_norm": 0.332084596157074,
+      "learning_rate": 8.655327581831279e-05,
+      "loss": 4.4232,
+      "step": 2375
+    },
+    {
+      "epoch": 0.7598792700233864,
+      "grad_norm": 0.32362791895866394,
+      "learning_rate": 8.633496152247784e-05,
+      "loss": 4.4576,
+      "step": 2376
+    },
+    {
+      "epoch": 0.7601990845309721,
+      "grad_norm": 0.3358210623264313,
+      "learning_rate": 8.611687661679945e-05,
+      "loss": 4.4467,
+      "step": 2377
+    },
+    {
+      "epoch": 0.7605188990385576,
+      "grad_norm": 0.33137738704681396,
+      "learning_rate": 8.589902133541323e-05,
+      "loss": 4.4064,
+      "step": 2378
+    },
+    {
+      "epoch": 0.7608387135461432,
+      "grad_norm": 0.3294398784637451,
+      "learning_rate": 8.568139591220764e-05,
+      "loss": 4.2972,
+      "step": 2379
+    },
+    {
+      "epoch": 0.7611585280537289,
+      "grad_norm": 0.32762813568115234,
+      "learning_rate": 8.546400058082492e-05,
+      "loss": 4.3664,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7614783425613144,
+      "grad_norm": 0.3572172522544861,
+      "learning_rate": 8.524683557465987e-05,
+      "loss": 4.4907,
+      "step": 2381
+    },
+    {
+      "epoch": 0.7617981570689001,
+      "grad_norm": 0.33606159687042236,
+      "learning_rate": 8.502990112686028e-05,
+      "loss": 4.5022,
+      "step": 2382
+    },
+    {
+      "epoch": 0.7621179715764856,
+      "grad_norm": 0.3445335626602173,
+      "learning_rate": 8.481319747032635e-05,
+      "loss": 4.5111,
+      "step": 2383
+    },
+    {
+      "epoch": 0.7624377860840712,
+      "grad_norm": 0.3442610502243042,
+      "learning_rate": 8.459672483771046e-05,
+      "loss": 4.446,
+      "step": 2384
+    },
+    {
+      "epoch": 0.7627576005916569,
+      "grad_norm": 0.3225667476654053,
+      "learning_rate": 8.438048346141713e-05,
+      "loss": 4.3515,
+      "step": 2385
+    },
+    {
+      "epoch": 0.7630774150992424,
+      "grad_norm": 0.3303259313106537,
+      "learning_rate": 8.416447357360224e-05,
+      "loss": 4.4814,
+      "step": 2386
+    },
+    {
+      "epoch": 0.763397229606828,
+      "grad_norm": 0.3265552222728729,
+      "learning_rate": 8.394869540617347e-05,
+      "loss": 4.4112,
+      "step": 2387
+    },
+    {
+      "epoch": 0.7637170441144137,
+      "grad_norm": 0.35281017422676086,
+      "learning_rate": 8.373314919078964e-05,
+      "loss": 4.3664,
+      "step": 2388
+    },
+    {
+      "epoch": 0.7640368586219992,
+      "grad_norm": 0.3419521152973175,
+      "learning_rate": 8.35178351588605e-05,
+      "loss": 4.3578,
+      "step": 2389
+    },
+    {
+      "epoch": 0.7643566731295849,
+      "grad_norm": 0.3403926491737366,
+      "learning_rate": 8.330275354154672e-05,
+      "loss": 4.3963,
+      "step": 2390
+    },
+    {
+      "epoch": 0.7646764876371704,
+      "grad_norm": 0.3328573703765869,
+      "learning_rate": 8.308790456975905e-05,
+      "loss": 4.4007,
+      "step": 2391
+    },
+    {
+      "epoch": 0.764996302144756,
+      "grad_norm": 0.39904558658599854,
+      "learning_rate": 8.28732884741588e-05,
+      "loss": 4.4595,
+      "step": 2392
+    },
+    {
+      "epoch": 0.7653161166523417,
+      "grad_norm": 0.3315344750881195,
+      "learning_rate": 8.265890548515723e-05,
+      "loss": 4.3695,
+      "step": 2393
+    },
+    {
+      "epoch": 0.7656359311599272,
+      "grad_norm": 0.34847211837768555,
+      "learning_rate": 8.244475583291522e-05,
+      "loss": 4.3524,
+      "step": 2394
+    },
+    {
+      "epoch": 0.7659557456675128,
+      "grad_norm": 0.33161666989326477,
+      "learning_rate": 8.223083974734336e-05,
+      "loss": 4.4508,
+      "step": 2395
+    },
+    {
+      "epoch": 0.7662755601750985,
+      "grad_norm": 0.342219740152359,
+      "learning_rate": 8.201715745810112e-05,
+      "loss": 4.4734,
+      "step": 2396
+    },
+    {
+      "epoch": 0.766595374682684,
+      "grad_norm": 0.33304139971733093,
+      "learning_rate": 8.180370919459728e-05,
+      "loss": 4.3801,
+      "step": 2397
+    },
+    {
+      "epoch": 0.7669151891902697,
+      "grad_norm": 0.354841023683548,
+      "learning_rate": 8.159049518598924e-05,
+      "loss": 4.3773,
+      "step": 2398
+    },
+    {
+      "epoch": 0.7672350036978552,
+      "grad_norm": 0.32176217436790466,
+      "learning_rate": 8.137751566118306e-05,
+      "loss": 4.3688,
+      "step": 2399
+    },
+    {
+      "epoch": 0.7675548182054408,
+      "grad_norm": 0.3267159163951874,
+      "learning_rate": 8.11647708488327e-05,
+      "loss": 4.4987,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7675548182054408,
+      "eval_loss": 4.421462059020996,
+      "eval_runtime": 84.969,
+      "eval_samples_per_second": 22.326,
+      "eval_steps_per_second": 5.59,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7678746327130265,
+      "grad_norm": 0.33585166931152344,
+      "learning_rate": 8.09522609773405e-05,
+      "loss": 4.3732,
+      "step": 2401
+    },
+    {
+      "epoch": 0.768194447220612,
+      "grad_norm": 0.34492063522338867,
+      "learning_rate": 8.073998627485641e-05,
+      "loss": 4.4611,
+      "step": 2402
+    },
+    {
+      "epoch": 0.7685142617281977,
+      "grad_norm": 0.3376120626926422,
+      "learning_rate": 8.052794696927796e-05,
+      "loss": 4.3214,
+      "step": 2403
+    },
+    {
+      "epoch": 0.7688340762357833,
+      "grad_norm": 0.3616439998149872,
+      "learning_rate": 8.031614328824998e-05,
+      "loss": 4.3799,
+      "step": 2404
+    },
+    {
+      "epoch": 0.7691538907433688,
+      "grad_norm": 0.3344370722770691,
+      "learning_rate": 8.010457545916408e-05,
+      "loss": 4.4234,
+      "step": 2405
+    },
+    {
+      "epoch": 0.7694737052509545,
+      "grad_norm": 0.32834044098854065,
+      "learning_rate": 7.989324370915899e-05,
+      "loss": 4.4505,
+      "step": 2406
+    },
+    {
+      "epoch": 0.76979351975854,
+      "grad_norm": 0.3269968032836914,
+      "learning_rate": 7.968214826511987e-05,
+      "loss": 4.4195,
+      "step": 2407
+    },
+    {
+      "epoch": 0.7701133342661256,
+      "grad_norm": 0.47119635343551636,
+      "learning_rate": 7.947128935367813e-05,
+      "loss": 4.4604,
+      "step": 2408
+    },
+    {
+      "epoch": 0.7704331487737113,
+      "grad_norm": 0.3617459833621979,
+      "learning_rate": 7.926066720121134e-05,
+      "loss": 4.4275,
+      "step": 2409
+    },
+    {
+      "epoch": 0.7707529632812968,
+      "grad_norm": 0.3359169363975525,
+      "learning_rate": 7.905028203384269e-05,
+      "loss": 4.3953,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7710727777888825,
+      "grad_norm": 0.34564802050590515,
+      "learning_rate": 7.884013407744129e-05,
+      "loss": 4.4501,
+      "step": 2411
+    },
+    {
+      "epoch": 0.7713925922964681,
+      "grad_norm": 0.3374510109424591,
+      "learning_rate": 7.863022355762101e-05,
+      "loss": 4.4675,
+      "step": 2412
+    },
+    {
+      "epoch": 0.7717124068040536,
+      "grad_norm": 0.3311443030834198,
+      "learning_rate": 7.842055069974149e-05,
+      "loss": 4.4123,
+      "step": 2413
+    },
+    {
+      "epoch": 0.7720322213116393,
+      "grad_norm": 0.33511215448379517,
+      "learning_rate": 7.82111157289069e-05,
+      "loss": 4.4288,
+      "step": 2414
+    },
+    {
+      "epoch": 0.7723520358192248,
+      "grad_norm": 0.342142790555954,
+      "learning_rate": 7.800191886996578e-05,
+      "loss": 4.2889,
+      "step": 2415
+    },
+    {
+      "epoch": 0.7726718503268104,
+      "grad_norm": 0.3379803001880646,
+      "learning_rate": 7.779296034751152e-05,
+      "loss": 4.3346,
+      "step": 2416
+    },
+    {
+      "epoch": 0.7729916648343961,
+      "grad_norm": 0.3355945944786072,
+      "learning_rate": 7.75842403858811e-05,
+      "loss": 4.4643,
+      "step": 2417
+    },
+    {
+      "epoch": 0.7733114793419816,
+      "grad_norm": 0.3274465799331665,
+      "learning_rate": 7.737575920915574e-05,
+      "loss": 4.3874,
+      "step": 2418
+    },
+    {
+      "epoch": 0.7736312938495673,
+      "grad_norm": 0.3239230811595917,
+      "learning_rate": 7.716751704116042e-05,
+      "loss": 4.3716,
+      "step": 2419
+    },
+    {
+      "epoch": 0.7739511083571529,
+      "grad_norm": 0.3335186541080475,
+      "learning_rate": 7.695951410546311e-05,
+      "loss": 4.5191,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7742709228647384,
+      "grad_norm": 0.3376699388027191,
+      "learning_rate": 7.67517506253753e-05,
+      "loss": 4.4352,
+      "step": 2421
+    },
+    {
+      "epoch": 0.7745907373723241,
+      "grad_norm": 0.36317139863967896,
+      "learning_rate": 7.654422682395106e-05,
+      "loss": 4.424,
+      "step": 2422
+    },
+    {
+      "epoch": 0.7749105518799096,
+      "grad_norm": 0.3489604592323303,
+      "learning_rate": 7.633694292398745e-05,
+      "loss": 4.3503,
+      "step": 2423
+    },
+    {
+      "epoch": 0.7752303663874952,
+      "grad_norm": 0.33292216062545776,
+      "learning_rate": 7.612989914802383e-05,
+      "loss": 4.4104,
+      "step": 2424
+    },
+    {
+      "epoch": 0.7755501808950809,
+      "grad_norm": 0.3456326425075531,
+      "learning_rate": 7.592309571834179e-05,
+      "loss": 4.3774,
+      "step": 2425
+    },
+    {
+      "epoch": 0.7758699954026664,
+      "grad_norm": 0.3363443613052368,
+      "learning_rate": 7.5716532856965e-05,
+      "loss": 4.3974,
+      "step": 2426
+    },
+    {
+      "epoch": 0.7761898099102521,
+      "grad_norm": 0.3309082090854645,
+      "learning_rate": 7.551021078565857e-05,
+      "loss": 4.3973,
+      "step": 2427
+    },
+    {
+      "epoch": 0.7765096244178377,
+      "grad_norm": 0.3364388644695282,
+      "learning_rate": 7.530412972592928e-05,
+      "loss": 4.4558,
+      "step": 2428
+    },
+    {
+      "epoch": 0.7768294389254232,
+      "grad_norm": 0.33900633454322815,
+      "learning_rate": 7.509828989902525e-05,
+      "loss": 4.425,
+      "step": 2429
+    },
+    {
+      "epoch": 0.7771492534330089,
+      "grad_norm": 0.34174394607543945,
+      "learning_rate": 7.489269152593543e-05,
+      "loss": 4.3907,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7774690679405944,
+      "grad_norm": 0.3305186629295349,
+      "learning_rate": 7.468733482738976e-05,
+      "loss": 4.4916,
+      "step": 2431
+    },
+    {
+      "epoch": 0.77778888244818,
+      "grad_norm": 0.34548285603523254,
+      "learning_rate": 7.44822200238584e-05,
+      "loss": 4.4056,
+      "step": 2432
+    },
+    {
+      "epoch": 0.7781086969557657,
+      "grad_norm": 0.3213754892349243,
+      "learning_rate": 7.42773473355521e-05,
+      "loss": 4.4359,
+      "step": 2433
+    },
+    {
+      "epoch": 0.7784285114633512,
+      "grad_norm": 0.35759109258651733,
+      "learning_rate": 7.407271698242155e-05,
+      "loss": 4.4189,
+      "step": 2434
+    },
+    {
+      "epoch": 0.7787483259709369,
+      "grad_norm": 0.3435339331626892,
+      "learning_rate": 7.386832918415741e-05,
+      "loss": 4.3936,
+      "step": 2435
+    },
+    {
+      "epoch": 0.7790681404785225,
+      "grad_norm": 0.34353944659233093,
+      "learning_rate": 7.366418416018963e-05,
+      "loss": 4.3856,
+      "step": 2436
+    },
+    {
+      "epoch": 0.779387954986108,
+      "grad_norm": 0.3284934461116791,
+      "learning_rate": 7.346028212968778e-05,
+      "loss": 4.3705,
+      "step": 2437
+    },
+    {
+      "epoch": 0.7797077694936937,
+      "grad_norm": 0.3299228549003601,
+      "learning_rate": 7.325662331156049e-05,
+      "loss": 4.3706,
+      "step": 2438
+    },
+    {
+      "epoch": 0.7800275840012793,
+      "grad_norm": 0.3226522207260132,
+      "learning_rate": 7.305320792445532e-05,
+      "loss": 4.3188,
+      "step": 2439
+    },
+    {
+      "epoch": 0.7803473985088648,
+      "grad_norm": 0.3250078558921814,
+      "learning_rate": 7.285003618675842e-05,
+      "loss": 4.3658,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7806672130164505,
+      "grad_norm": 0.3348226249217987,
+      "learning_rate": 7.264710831659426e-05,
+      "loss": 4.3454,
+      "step": 2441
+    },
+    {
+      "epoch": 0.780987027524036,
+      "grad_norm": 0.32369184494018555,
+      "learning_rate": 7.24444245318257e-05,
+      "loss": 4.3334,
+      "step": 2442
+    },
+    {
+      "epoch": 0.7813068420316217,
+      "grad_norm": 0.37221813201904297,
+      "learning_rate": 7.224198505005344e-05,
+      "loss": 4.4436,
+      "step": 2443
+    },
+    {
+      "epoch": 0.7816266565392073,
+      "grad_norm": 0.3341996967792511,
+      "learning_rate": 7.203979008861588e-05,
+      "loss": 4.4195,
+      "step": 2444
+    },
+    {
+      "epoch": 0.7819464710467928,
+      "grad_norm": 0.3328430950641632,
+      "learning_rate": 7.183783986458906e-05,
+      "loss": 4.4021,
+      "step": 2445
+    },
+    {
+      "epoch": 0.7822662855543785,
+      "grad_norm": 0.3489823341369629,
+      "learning_rate": 7.163613459478595e-05,
+      "loss": 4.4735,
+      "step": 2446
+    },
+    {
+      "epoch": 0.782586100061964,
+      "grad_norm": 0.34073081612586975,
+      "learning_rate": 7.143467449575682e-05,
+      "loss": 4.3651,
+      "step": 2447
+    },
+    {
+      "epoch": 0.7829059145695496,
+      "grad_norm": 0.33331355452537537,
+      "learning_rate": 7.12334597837887e-05,
+      "loss": 4.4408,
+      "step": 2448
+    },
+    {
+      "epoch": 0.7832257290771353,
+      "grad_norm": 0.3399198651313782,
+      "learning_rate": 7.103249067490502e-05,
+      "loss": 4.4111,
+      "step": 2449
+    },
+    {
+      "epoch": 0.7835455435847208,
+      "grad_norm": 0.3442201614379883,
+      "learning_rate": 7.083176738486578e-05,
+      "loss": 4.3269,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7838653580923065,
+      "grad_norm": 0.3376154899597168,
+      "learning_rate": 7.063129012916671e-05,
+      "loss": 4.3702,
+      "step": 2451
+    },
+    {
+      "epoch": 0.7841851725998921,
+      "grad_norm": 0.3253526985645294,
+      "learning_rate": 7.04310591230397e-05,
+      "loss": 4.3661,
+      "step": 2452
+    },
+    {
+      "epoch": 0.7845049871074776,
+      "grad_norm": 0.336834192276001,
+      "learning_rate": 7.023107458145214e-05,
+      "loss": 4.4983,
+      "step": 2453
+    },
+    {
+      "epoch": 0.7848248016150633,
+      "grad_norm": 0.32964494824409485,
+      "learning_rate": 7.003133671910688e-05,
+      "loss": 4.4248,
+      "step": 2454
+    },
+    {
+      "epoch": 0.7851446161226489,
+      "grad_norm": 0.32808932662010193,
+      "learning_rate": 6.983184575044199e-05,
+      "loss": 4.3788,
+      "step": 2455
+    },
+    {
+      "epoch": 0.7854644306302344,
+      "grad_norm": 0.3290889859199524,
+      "learning_rate": 6.963260188963016e-05,
+      "loss": 4.3742,
+      "step": 2456
+    },
+    {
+      "epoch": 0.7857842451378201,
+      "grad_norm": 0.3432980179786682,
+      "learning_rate": 6.943360535057926e-05,
+      "loss": 4.4134,
+      "step": 2457
+    },
+    {
+      "epoch": 0.7861040596454056,
+      "grad_norm": 0.34411704540252686,
+      "learning_rate": 6.923485634693109e-05,
+      "loss": 4.4038,
+      "step": 2458
+    },
+    {
+      "epoch": 0.7864238741529913,
+      "grad_norm": 0.3406968116760254,
+      "learning_rate": 6.903635509206234e-05,
+      "loss": 4.4335,
+      "step": 2459
+    },
+    {
+      "epoch": 0.7867436886605769,
+      "grad_norm": 0.3379204273223877,
+      "learning_rate": 6.883810179908315e-05,
+      "loss": 4.3759,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7870635031681624,
+      "grad_norm": 0.335379034280777,
+      "learning_rate": 6.86400966808377e-05,
+      "loss": 4.4072,
+      "step": 2461
+    },
+    {
+      "epoch": 0.7873833176757481,
+      "grad_norm": 0.33987942337989807,
+      "learning_rate": 6.844233994990382e-05,
+      "loss": 4.404,
+      "step": 2462
+    },
+    {
+      "epoch": 0.7877031321833337,
+      "grad_norm": 0.3290582299232483,
+      "learning_rate": 6.824483181859231e-05,
+      "loss": 4.4041,
+      "step": 2463
+    },
+    {
+      "epoch": 0.7880229466909192,
+      "grad_norm": 0.3431604504585266,
+      "learning_rate": 6.804757249894762e-05,
+      "loss": 4.3937,
+      "step": 2464
+    },
+    {
+      "epoch": 0.7883427611985049,
+      "grad_norm": 0.3415065109729767,
+      "learning_rate": 6.785056220274658e-05,
+      "loss": 4.3634,
+      "step": 2465
+    },
+    {
+      "epoch": 0.7886625757060904,
+      "grad_norm": 0.34374210238456726,
+      "learning_rate": 6.765380114149887e-05,
+      "loss": 4.4099,
+      "step": 2466
+    },
+    {
+      "epoch": 0.7889823902136761,
+      "grad_norm": 0.3328897953033447,
+      "learning_rate": 6.745728952644675e-05,
+      "loss": 4.4099,
+      "step": 2467
+    },
+    {
+      "epoch": 0.7893022047212617,
+      "grad_norm": 0.32808518409729004,
+      "learning_rate": 6.726102756856422e-05,
+      "loss": 4.4275,
+      "step": 2468
+    },
+    {
+      "epoch": 0.7896220192288472,
+      "grad_norm": 0.33417269587516785,
+      "learning_rate": 6.706501547855787e-05,
+      "loss": 4.4154,
+      "step": 2469
+    },
+    {
+      "epoch": 0.7899418337364329,
+      "grad_norm": 0.3304135203361511,
+      "learning_rate": 6.686925346686544e-05,
+      "loss": 4.3633,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7902616482440185,
+      "grad_norm": 0.3450307548046112,
+      "learning_rate": 6.667374174365667e-05,
+      "loss": 4.4377,
+      "step": 2471
+    },
+    {
+      "epoch": 0.7905814627516041,
+      "grad_norm": 0.3265368640422821,
+      "learning_rate": 6.647848051883217e-05,
+      "loss": 4.39,
+      "step": 2472
+    },
+    {
+      "epoch": 0.7909012772591897,
+      "grad_norm": 0.33910951018333435,
+      "learning_rate": 6.628347000202381e-05,
+      "loss": 4.3719,
+      "step": 2473
+    },
+    {
+      "epoch": 0.7912210917667752,
+      "grad_norm": 0.33357036113739014,
+      "learning_rate": 6.608871040259457e-05,
+      "loss": 4.38,
+      "step": 2474
+    },
+    {
+      "epoch": 0.7915409062743609,
+      "grad_norm": 0.33957427740097046,
+      "learning_rate": 6.589420192963754e-05,
+      "loss": 4.3611,
+      "step": 2475
+    },
+    {
+      "epoch": 0.7918607207819465,
+      "grad_norm": 0.34905606508255005,
+      "learning_rate": 6.56999447919766e-05,
+      "loss": 4.4632,
+      "step": 2476
+    },
+    {
+      "epoch": 0.792180535289532,
+      "grad_norm": 0.3371109962463379,
+      "learning_rate": 6.550593919816545e-05,
+      "loss": 4.3714,
+      "step": 2477
+    },
+    {
+      "epoch": 0.7925003497971177,
+      "grad_norm": 0.33412280678749084,
+      "learning_rate": 6.531218535648807e-05,
+      "loss": 4.3534,
+      "step": 2478
+    },
+    {
+      "epoch": 0.7928201643047033,
+      "grad_norm": 0.3325359523296356,
+      "learning_rate": 6.511868347495793e-05,
+      "loss": 4.3707,
+      "step": 2479
+    },
+    {
+      "epoch": 0.7931399788122889,
+      "grad_norm": 0.33235684037208557,
+      "learning_rate": 6.492543376131817e-05,
+      "loss": 4.464,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7934597933198745,
+      "grad_norm": 0.3194396495819092,
+      "learning_rate": 6.473243642304114e-05,
+      "loss": 4.4164,
+      "step": 2481
+    },
+    {
+      "epoch": 0.79377960782746,
+      "grad_norm": 0.34822776913642883,
+      "learning_rate": 6.453969166732808e-05,
+      "loss": 4.4403,
+      "step": 2482
+    },
+    {
+      "epoch": 0.7940994223350457,
+      "grad_norm": 0.3341214656829834,
+      "learning_rate": 6.434719970110923e-05,
+      "loss": 4.2944,
+      "step": 2483
+    },
+    {
+      "epoch": 0.7944192368426313,
+      "grad_norm": 0.3367195427417755,
+      "learning_rate": 6.415496073104344e-05,
+      "loss": 4.4819,
+      "step": 2484
+    },
+    {
+      "epoch": 0.7947390513502168,
+      "grad_norm": 0.33086076378822327,
+      "learning_rate": 6.396297496351791e-05,
+      "loss": 4.4394,
+      "step": 2485
+    },
+    {
+      "epoch": 0.7950588658578025,
+      "grad_norm": 0.3196883499622345,
+      "learning_rate": 6.377124260464804e-05,
+      "loss": 4.3904,
+      "step": 2486
+    },
+    {
+      "epoch": 0.7953786803653881,
+      "grad_norm": 0.3320275843143463,
+      "learning_rate": 6.357976386027697e-05,
+      "loss": 4.4522,
+      "step": 2487
+    },
+    {
+      "epoch": 0.7956984948729737,
+      "grad_norm": 0.3390849828720093,
+      "learning_rate": 6.338853893597584e-05,
+      "loss": 4.4185,
+      "step": 2488
+    },
+    {
+      "epoch": 0.7960183093805593,
+      "grad_norm": 0.3309732973575592,
+      "learning_rate": 6.319756803704311e-05,
+      "loss": 4.403,
+      "step": 2489
+    },
+    {
+      "epoch": 0.7963381238881448,
+      "grad_norm": 0.33671632409095764,
+      "learning_rate": 6.300685136850458e-05,
+      "loss": 4.4218,
+      "step": 2490
+    },
+    {
+      "epoch": 0.7966579383957305,
+      "grad_norm": 0.32891610264778137,
+      "learning_rate": 6.281638913511324e-05,
+      "loss": 4.3904,
+      "step": 2491
+    },
+    {
+      "epoch": 0.7969777529033161,
+      "grad_norm": 0.3296626806259155,
+      "learning_rate": 6.262618154134858e-05,
+      "loss": 4.3143,
+      "step": 2492
+    },
+    {
+      "epoch": 0.7972975674109016,
+      "grad_norm": 0.32553809881210327,
+      "learning_rate": 6.2436228791417e-05,
+      "loss": 4.4164,
+      "step": 2493
+    },
+    {
+      "epoch": 0.7976173819184873,
+      "grad_norm": 0.3539281189441681,
+      "learning_rate": 6.224653108925122e-05,
+      "loss": 4.4039,
+      "step": 2494
+    },
+    {
+      "epoch": 0.7979371964260729,
+      "grad_norm": 0.32112589478492737,
+      "learning_rate": 6.205708863851019e-05,
+      "loss": 4.3172,
+      "step": 2495
+    },
+    {
+      "epoch": 0.7982570109336585,
+      "grad_norm": 0.3262506425380707,
+      "learning_rate": 6.186790164257866e-05,
+      "loss": 4.421,
+      "step": 2496
+    },
+    {
+      "epoch": 0.7985768254412441,
+      "grad_norm": 0.3287886679172516,
+      "learning_rate": 6.167897030456725e-05,
+      "loss": 4.4184,
+      "step": 2497
+    },
+    {
+      "epoch": 0.7988966399488296,
+      "grad_norm": 0.3291045129299164,
+      "learning_rate": 6.149029482731211e-05,
+      "loss": 4.3694,
+      "step": 2498
+    },
+    {
+      "epoch": 0.7992164544564153,
+      "grad_norm": 0.3448774814605713,
+      "learning_rate": 6.13018754133747e-05,
+      "loss": 4.4399,
+      "step": 2499
+    },
+    {
+      "epoch": 0.7995362689640009,
+      "grad_norm": 0.3265831470489502,
+      "learning_rate": 6.111371226504162e-05,
+      "loss": 4.4017,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7995362689640009,
+      "eval_loss": 4.408499717712402,
+      "eval_runtime": 96.591,
+      "eval_samples_per_second": 19.64,
+      "eval_steps_per_second": 4.918,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7998560834715864,
+      "grad_norm": 0.3272258937358856,
+      "learning_rate": 6.092580558432409e-05,
+      "loss": 4.4209,
+      "step": 2501
+    },
+    {
+      "epoch": 0.8001758979791721,
+      "grad_norm": 0.3352862596511841,
+      "learning_rate": 6.073815557295827e-05,
+      "loss": 4.3778,
+      "step": 2502
+    },
+    {
+      "epoch": 0.8004957124867577,
+      "grad_norm": 0.3348987400531769,
+      "learning_rate": 6.055076243240467e-05,
+      "loss": 4.3724,
+      "step": 2503
+    },
+    {
+      "epoch": 0.8008155269943433,
+      "grad_norm": 0.3495721220970154,
+      "learning_rate": 6.036362636384798e-05,
+      "loss": 4.3764,
+      "step": 2504
+    },
+    {
+      "epoch": 0.8011353415019289,
+      "grad_norm": 0.3421262204647064,
+      "learning_rate": 6.017674756819705e-05,
+      "loss": 4.334,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8014551560095144,
+      "grad_norm": 0.3377733528614044,
+      "learning_rate": 5.9990126246084204e-05,
+      "loss": 4.3369,
+      "step": 2506
+    },
+    {
+      "epoch": 0.8017749705171001,
+      "grad_norm": 0.336994469165802,
+      "learning_rate": 5.9803762597865745e-05,
+      "loss": 4.3393,
+      "step": 2507
+    },
+    {
+      "epoch": 0.8020947850246857,
+      "grad_norm": 0.34005388617515564,
+      "learning_rate": 5.96176568236209e-05,
+      "loss": 4.4098,
+      "step": 2508
+    },
+    {
+      "epoch": 0.8024145995322712,
+      "grad_norm": 0.32835811376571655,
+      "learning_rate": 5.9431809123152465e-05,
+      "loss": 4.3953,
+      "step": 2509
+    },
+    {
+      "epoch": 0.8027344140398569,
+      "grad_norm": 0.34376657009124756,
+      "learning_rate": 5.924621969598604e-05,
+      "loss": 4.4965,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8030542285474425,
+      "grad_norm": 0.3300655782222748,
+      "learning_rate": 5.906088874136968e-05,
+      "loss": 4.4316,
+      "step": 2511
+    },
+    {
+      "epoch": 0.8033740430550281,
+      "grad_norm": 0.3265095055103302,
+      "learning_rate": 5.887581645827436e-05,
+      "loss": 4.3927,
+      "step": 2512
+    },
+    {
+      "epoch": 0.8036938575626137,
+      "grad_norm": 0.3215543031692505,
+      "learning_rate": 5.869100304539297e-05,
+      "loss": 4.3649,
+      "step": 2513
+    },
+    {
+      "epoch": 0.8040136720701992,
+      "grad_norm": 0.32721421122550964,
+      "learning_rate": 5.850644870114063e-05,
+      "loss": 4.3213,
+      "step": 2514
+    },
+    {
+      "epoch": 0.8043334865777849,
+      "grad_norm": 0.3254854083061218,
+      "learning_rate": 5.832215362365458e-05,
+      "loss": 4.4192,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8046533010853705,
+      "grad_norm": 0.34419766068458557,
+      "learning_rate": 5.813811801079325e-05,
+      "loss": 4.458,
+      "step": 2516
+    },
+    {
+      "epoch": 0.804973115592956,
+      "grad_norm": 0.3316987156867981,
+      "learning_rate": 5.795434206013685e-05,
+      "loss": 4.3995,
+      "step": 2517
+    },
+    {
+      "epoch": 0.8052929301005417,
+      "grad_norm": 0.3386606276035309,
+      "learning_rate": 5.77708259689866e-05,
+      "loss": 4.4321,
+      "step": 2518
+    },
+    {
+      "epoch": 0.8056127446081273,
+      "grad_norm": 0.3319477140903473,
+      "learning_rate": 5.7587569934364896e-05,
+      "loss": 4.4673,
+      "step": 2519
+    },
+    {
+      "epoch": 0.8059325591157129,
+      "grad_norm": 0.3327592611312866,
+      "learning_rate": 5.740457415301486e-05,
+      "loss": 4.4537,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8062523736232985,
+      "grad_norm": 0.331850528717041,
+      "learning_rate": 5.72218388214002e-05,
+      "loss": 4.3952,
+      "step": 2521
+    },
+    {
+      "epoch": 0.806572188130884,
+      "grad_norm": 0.323560893535614,
+      "learning_rate": 5.703936413570519e-05,
+      "loss": 4.3919,
+      "step": 2522
+    },
+    {
+      "epoch": 0.8068920026384697,
+      "grad_norm": 0.4162808954715729,
+      "learning_rate": 5.6857150291833884e-05,
+      "loss": 4.4768,
+      "step": 2523
+    },
+    {
+      "epoch": 0.8072118171460553,
+      "grad_norm": 0.3326115310192108,
+      "learning_rate": 5.667519748541064e-05,
+      "loss": 4.3733,
+      "step": 2524
+    },
+    {
+      "epoch": 0.8075316316536408,
+      "grad_norm": 0.3389906585216522,
+      "learning_rate": 5.649350591177946e-05,
+      "loss": 4.336,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8078514461612265,
+      "grad_norm": 0.32866370677948,
+      "learning_rate": 5.6312075766003876e-05,
+      "loss": 4.4592,
+      "step": 2526
+    },
+    {
+      "epoch": 0.8081712606688121,
+      "grad_norm": 0.32848337292671204,
+      "learning_rate": 5.613090724286681e-05,
+      "loss": 4.3391,
+      "step": 2527
+    },
+    {
+      "epoch": 0.8084910751763977,
+      "grad_norm": 0.3331284523010254,
+      "learning_rate": 5.595000053687014e-05,
+      "loss": 4.4384,
+      "step": 2528
+    },
+    {
+      "epoch": 0.8088108896839833,
+      "grad_norm": 0.33180293440818787,
+      "learning_rate": 5.576935584223482e-05,
+      "loss": 4.3779,
+      "step": 2529
+    },
+    {
+      "epoch": 0.8091307041915689,
+      "grad_norm": 0.3350355625152588,
+      "learning_rate": 5.55889733529005e-05,
+      "loss": 4.4041,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8094505186991545,
+      "grad_norm": 0.3404220640659332,
+      "learning_rate": 5.540885326252531e-05,
+      "loss": 4.537,
+      "step": 2531
+    },
+    {
+      "epoch": 0.8097703332067401,
+      "grad_norm": 0.3256952166557312,
+      "learning_rate": 5.5228995764485564e-05,
+      "loss": 4.3572,
+      "step": 2532
+    },
+    {
+      "epoch": 0.8100901477143256,
+      "grad_norm": 0.3268665075302124,
+      "learning_rate": 5.5049401051875765e-05,
+      "loss": 4.4335,
+      "step": 2533
+    },
+    {
+      "epoch": 0.8104099622219113,
+      "grad_norm": 0.32366156578063965,
+      "learning_rate": 5.487006931750828e-05,
+      "loss": 4.3503,
+      "step": 2534
+    },
+    {
+      "epoch": 0.8107297767294969,
+      "grad_norm": 0.33809319138526917,
+      "learning_rate": 5.469100075391314e-05,
+      "loss": 4.4448,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8110495912370825,
+      "grad_norm": 0.3393336236476898,
+      "learning_rate": 5.451219555333792e-05,
+      "loss": 4.3668,
+      "step": 2536
+    },
+    {
+      "epoch": 0.8113694057446681,
+      "grad_norm": 0.32927775382995605,
+      "learning_rate": 5.4333653907747174e-05,
+      "loss": 4.2971,
+      "step": 2537
+    },
+    {
+      "epoch": 0.8116892202522537,
+      "grad_norm": 0.33429858088493347,
+      "learning_rate": 5.4155376008822805e-05,
+      "loss": 4.3864,
+      "step": 2538
+    },
+    {
+      "epoch": 0.8120090347598393,
+      "grad_norm": 0.3288191258907318,
+      "learning_rate": 5.397736204796337e-05,
+      "loss": 4.4077,
+      "step": 2539
+    },
+    {
+      "epoch": 0.8123288492674249,
+      "grad_norm": 0.336532324552536,
+      "learning_rate": 5.37996122162842e-05,
+      "loss": 4.386,
+      "step": 2540
+    },
+    {
+      "epoch": 0.8126486637750105,
+      "grad_norm": 0.3313378691673279,
+      "learning_rate": 5.362212670461706e-05,
+      "loss": 4.4191,
+      "step": 2541
+    },
+    {
+      "epoch": 0.8129684782825961,
+      "grad_norm": 0.33725032210350037,
+      "learning_rate": 5.3444905703509687e-05,
+      "loss": 4.3506,
+      "step": 2542
+    },
+    {
+      "epoch": 0.8132882927901817,
+      "grad_norm": 0.324349582195282,
+      "learning_rate": 5.3267949403226104e-05,
+      "loss": 4.3942,
+      "step": 2543
+    },
+    {
+      "epoch": 0.8136081072977673,
+      "grad_norm": 0.33053717017173767,
+      "learning_rate": 5.3091257993746115e-05,
+      "loss": 4.3883,
+      "step": 2544
+    },
+    {
+      "epoch": 0.8139279218053529,
+      "grad_norm": 0.3250136971473694,
+      "learning_rate": 5.2914831664765045e-05,
+      "loss": 4.3464,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8142477363129385,
+      "grad_norm": 0.3636740446090698,
+      "learning_rate": 5.2738670605693814e-05,
+      "loss": 4.4609,
+      "step": 2546
+    },
+    {
+      "epoch": 0.8145675508205241,
+      "grad_norm": 0.33198001980781555,
+      "learning_rate": 5.256277500565823e-05,
+      "loss": 4.3835,
+      "step": 2547
+    },
+    {
+      "epoch": 0.8148873653281097,
+      "grad_norm": 0.3311094641685486,
+      "learning_rate": 5.238714505349938e-05,
+      "loss": 4.378,
+      "step": 2548
+    },
+    {
+      "epoch": 0.8152071798356954,
+      "grad_norm": 0.3309457302093506,
+      "learning_rate": 5.221178093777303e-05,
+      "loss": 4.4277,
+      "step": 2549
+    },
+    {
+      "epoch": 0.8155269943432809,
+      "grad_norm": 0.3373015820980072,
+      "learning_rate": 5.2036682846749645e-05,
+      "loss": 4.4029,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8158468088508665,
+      "grad_norm": 0.32732510566711426,
+      "learning_rate": 5.186185096841402e-05,
+      "loss": 4.4289,
+      "step": 2551
+    },
+    {
+      "epoch": 0.8161666233584521,
+      "grad_norm": 0.32910993695259094,
+      "learning_rate": 5.168728549046508e-05,
+      "loss": 4.4051,
+      "step": 2552
+    },
+    {
+      "epoch": 0.8164864378660377,
+      "grad_norm": 0.3277089595794678,
+      "learning_rate": 5.151298660031587e-05,
+      "loss": 4.4015,
+      "step": 2553
+    },
+    {
+      "epoch": 0.8168062523736233,
+      "grad_norm": 0.33765482902526855,
+      "learning_rate": 5.133895448509299e-05,
+      "loss": 4.369,
+      "step": 2554
+    },
+    {
+      "epoch": 0.8171260668812089,
+      "grad_norm": 0.33570003509521484,
+      "learning_rate": 5.116518933163709e-05,
+      "loss": 4.3564,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8174458813887945,
+      "grad_norm": 0.33497217297554016,
+      "learning_rate": 5.099169132650173e-05,
+      "loss": 4.3783,
+      "step": 2556
+    },
+    {
+      "epoch": 0.8177656958963802,
+      "grad_norm": 0.330083429813385,
+      "learning_rate": 5.0818460655953894e-05,
+      "loss": 4.3159,
+      "step": 2557
+    },
+    {
+      "epoch": 0.8180855104039657,
+      "grad_norm": 0.3331679403781891,
+      "learning_rate": 5.0645497505973633e-05,
+      "loss": 4.3378,
+      "step": 2558
+    },
+    {
+      "epoch": 0.8184053249115513,
+      "grad_norm": 0.32330459356307983,
+      "learning_rate": 5.0472802062253426e-05,
+      "loss": 4.3493,
+      "step": 2559
+    },
+    {
+      "epoch": 0.8187251394191369,
+      "grad_norm": 0.32560333609580994,
+      "learning_rate": 5.0300374510198904e-05,
+      "loss": 4.3681,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8190449539267225,
+      "grad_norm": 0.35731586813926697,
+      "learning_rate": 5.012821503492755e-05,
+      "loss": 4.4038,
+      "step": 2561
+    },
+    {
+      "epoch": 0.819364768434308,
+      "grad_norm": 0.34077370166778564,
+      "learning_rate": 4.9956323821269326e-05,
+      "loss": 4.4004,
+      "step": 2562
+    },
+    {
+      "epoch": 0.8196845829418937,
+      "grad_norm": 0.3387181758880615,
+      "learning_rate": 4.978470105376627e-05,
+      "loss": 4.4366,
+      "step": 2563
+    },
+    {
+      "epoch": 0.8200043974494793,
+      "grad_norm": 0.34127897024154663,
+      "learning_rate": 4.961334691667177e-05,
+      "loss": 4.374,
+      "step": 2564
+    },
+    {
+      "epoch": 0.820324211957065,
+      "grad_norm": 0.3408231735229492,
+      "learning_rate": 4.9442261593951496e-05,
+      "loss": 4.4105,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8206440264646505,
+      "grad_norm": 0.33096012473106384,
+      "learning_rate": 4.9271445269281884e-05,
+      "loss": 4.4505,
+      "step": 2566
+    },
+    {
+      "epoch": 0.8209638409722361,
+      "grad_norm": 0.3267434537410736,
+      "learning_rate": 4.910089812605098e-05,
+      "loss": 4.3676,
+      "step": 2567
+    },
+    {
+      "epoch": 0.8212836554798217,
+      "grad_norm": 0.33827266097068787,
+      "learning_rate": 4.893062034735758e-05,
+      "loss": 4.3366,
+      "step": 2568
+    },
+    {
+      "epoch": 0.8216034699874073,
+      "grad_norm": 0.33350881934165955,
+      "learning_rate": 4.8760612116011464e-05,
+      "loss": 4.3751,
+      "step": 2569
+    },
+    {
+      "epoch": 0.8219232844949929,
+      "grad_norm": 0.3247975707054138,
+      "learning_rate": 4.8590873614532956e-05,
+      "loss": 4.3698,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8222430990025785,
+      "grad_norm": 0.3426414728164673,
+      "learning_rate": 4.842140502515282e-05,
+      "loss": 4.3529,
+      "step": 2571
+    },
+    {
+      "epoch": 0.8225629135101641,
+      "grad_norm": 0.3254157602787018,
+      "learning_rate": 4.825220652981211e-05,
+      "loss": 4.3941,
+      "step": 2572
+    },
+    {
+      "epoch": 0.8228827280177498,
+      "grad_norm": 0.3290571868419647,
+      "learning_rate": 4.80832783101617e-05,
+      "loss": 4.3833,
+      "step": 2573
+    },
+    {
+      "epoch": 0.8232025425253353,
+      "grad_norm": 0.33417990803718567,
+      "learning_rate": 4.7914620547562475e-05,
+      "loss": 4.3641,
+      "step": 2574
+    },
+    {
+      "epoch": 0.8235223570329209,
+      "grad_norm": 0.3283510208129883,
+      "learning_rate": 4.7746233423084965e-05,
+      "loss": 4.3072,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8238421715405065,
+      "grad_norm": 0.3323967456817627,
+      "learning_rate": 4.757811711750903e-05,
+      "loss": 4.4042,
+      "step": 2576
+    },
+    {
+      "epoch": 0.8241619860480921,
+      "grad_norm": 0.32422980666160583,
+      "learning_rate": 4.741027181132392e-05,
+      "loss": 4.4286,
+      "step": 2577
+    },
+    {
+      "epoch": 0.8244818005556777,
+      "grad_norm": 0.353149950504303,
+      "learning_rate": 4.724269768472776e-05,
+      "loss": 4.394,
+      "step": 2578
+    },
+    {
+      "epoch": 0.8248016150632633,
+      "grad_norm": 0.3301638066768646,
+      "learning_rate": 4.707539491762767e-05,
+      "loss": 4.4815,
+      "step": 2579
+    },
+    {
+      "epoch": 0.8251214295708489,
+      "grad_norm": 0.32457199692726135,
+      "learning_rate": 4.690836368963945e-05,
+      "loss": 4.4616,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8254412440784346,
+      "grad_norm": 0.33951979875564575,
+      "learning_rate": 4.674160418008728e-05,
+      "loss": 4.3569,
+      "step": 2581
+    },
+    {
+      "epoch": 0.8257610585860201,
+      "grad_norm": 0.33899393677711487,
+      "learning_rate": 4.657511656800381e-05,
+      "loss": 4.4591,
+      "step": 2582
+    },
+    {
+      "epoch": 0.8260808730936057,
+      "grad_norm": 0.32934606075286865,
+      "learning_rate": 4.6408901032129476e-05,
+      "loss": 4.4011,
+      "step": 2583
+    },
+    {
+      "epoch": 0.8264006876011913,
+      "grad_norm": 0.3325296938419342,
+      "learning_rate": 4.624295775091282e-05,
+      "loss": 4.253,
+      "step": 2584
+    },
+    {
+      "epoch": 0.8267205021087769,
+      "grad_norm": 0.35208362340927124,
+      "learning_rate": 4.6077286902510144e-05,
+      "loss": 4.3729,
+      "step": 2585
+    },
+    {
+      "epoch": 0.8270403166163625,
+      "grad_norm": 0.3326524496078491,
+      "learning_rate": 4.591188866478513e-05,
+      "loss": 4.3585,
+      "step": 2586
+    },
+    {
+      "epoch": 0.8273601311239481,
+      "grad_norm": 0.3358716368675232,
+      "learning_rate": 4.574676321530891e-05,
+      "loss": 4.3566,
+      "step": 2587
+    },
+    {
+      "epoch": 0.8276799456315337,
+      "grad_norm": 0.3392607271671295,
+      "learning_rate": 4.558191073135957e-05,
+      "loss": 4.4831,
+      "step": 2588
+    },
+    {
+      "epoch": 0.8279997601391194,
+      "grad_norm": 0.32785317301750183,
+      "learning_rate": 4.541733138992231e-05,
+      "loss": 4.406,
+      "step": 2589
+    },
+    {
+      "epoch": 0.8283195746467049,
+      "grad_norm": 0.3403478264808655,
+      "learning_rate": 4.525302536768901e-05,
+      "loss": 4.4775,
+      "step": 2590
+    },
+    {
+      "epoch": 0.8286393891542905,
+      "grad_norm": 0.332083523273468,
+      "learning_rate": 4.5088992841058214e-05,
+      "loss": 4.3948,
+      "step": 2591
+    },
+    {
+      "epoch": 0.8289592036618761,
+      "grad_norm": 0.333533376455307,
+      "learning_rate": 4.4925233986134614e-05,
+      "loss": 4.4332,
+      "step": 2592
+    },
+    {
+      "epoch": 0.8292790181694617,
+      "grad_norm": 0.34167930483818054,
+      "learning_rate": 4.4761748978729305e-05,
+      "loss": 4.301,
+      "step": 2593
+    },
+    {
+      "epoch": 0.8295988326770473,
+      "grad_norm": 0.33838021755218506,
+      "learning_rate": 4.4598537994359297e-05,
+      "loss": 4.3024,
+      "step": 2594
+    },
+    {
+      "epoch": 0.8299186471846329,
+      "grad_norm": 0.3343757390975952,
+      "learning_rate": 4.443560120824748e-05,
+      "loss": 4.2999,
+      "step": 2595
+    },
+    {
+      "epoch": 0.8302384616922185,
+      "grad_norm": 0.3297710418701172,
+      "learning_rate": 4.427293879532231e-05,
+      "loss": 4.3453,
+      "step": 2596
+    },
+    {
+      "epoch": 0.8305582761998042,
+      "grad_norm": 0.3453871011734009,
+      "learning_rate": 4.411055093021758e-05,
+      "loss": 4.3338,
+      "step": 2597
+    },
+    {
+      "epoch": 0.8308780907073897,
+      "grad_norm": 0.334453821182251,
+      "learning_rate": 4.394843778727247e-05,
+      "loss": 4.368,
+      "step": 2598
+    },
+    {
+      "epoch": 0.8311979052149753,
+      "grad_norm": 0.338049978017807,
+      "learning_rate": 4.3786599540531164e-05,
+      "loss": 4.406,
+      "step": 2599
+    },
+    {
+      "epoch": 0.8315177197225609,
+      "grad_norm": 0.32910865545272827,
+      "learning_rate": 4.362503636374277e-05,
+      "loss": 4.4284,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8315177197225609,
+      "eval_loss": 4.398345947265625,
+      "eval_runtime": 96.1854,
+      "eval_samples_per_second": 19.722,
+      "eval_steps_per_second": 4.938,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8318375342301465,
+      "grad_norm": 0.34660476446151733,
+      "learning_rate": 4.346374843036104e-05,
+      "loss": 4.3165,
+      "step": 2601
+    },
+    {
+      "epoch": 0.8321573487377321,
+      "grad_norm": 0.3316264748573303,
+      "learning_rate": 4.3302735913544174e-05,
+      "loss": 4.3164,
+      "step": 2602
+    },
+    {
+      "epoch": 0.8324771632453177,
+      "grad_norm": 0.3275536894798279,
+      "learning_rate": 4.314199898615481e-05,
+      "loss": 4.3702,
+      "step": 2603
+    },
+    {
+      "epoch": 0.8327969777529033,
+      "grad_norm": 0.3246834874153137,
+      "learning_rate": 4.298153782075946e-05,
+      "loss": 4.3451,
+      "step": 2604
+    },
+    {
+      "epoch": 0.833116792260489,
+      "grad_norm": 0.3249054253101349,
+      "learning_rate": 4.2821352589628944e-05,
+      "loss": 4.3683,
+      "step": 2605
+    },
+    {
+      "epoch": 0.8334366067680745,
+      "grad_norm": 0.34387335181236267,
+      "learning_rate": 4.26614434647377e-05,
+      "loss": 4.4254,
+      "step": 2606
+    },
+    {
+      "epoch": 0.8337564212756601,
+      "grad_norm": 0.3431605100631714,
+      "learning_rate": 4.25018106177635e-05,
+      "loss": 4.416,
+      "step": 2607
+    },
+    {
+      "epoch": 0.8340762357832457,
+      "grad_norm": 0.35249772667884827,
+      "learning_rate": 4.2342454220087855e-05,
+      "loss": 4.3307,
+      "step": 2608
+    },
+    {
+      "epoch": 0.8343960502908313,
+      "grad_norm": 0.34468838572502136,
+      "learning_rate": 4.21833744427952e-05,
+      "loss": 4.3808,
+      "step": 2609
+    },
+    {
+      "epoch": 0.834715864798417,
+      "grad_norm": 0.32654619216918945,
+      "learning_rate": 4.202457145667311e-05,
+      "loss": 4.4191,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8350356793060025,
+      "grad_norm": 0.3252173662185669,
+      "learning_rate": 4.1866045432212214e-05,
+      "loss": 4.4372,
+      "step": 2611
+    },
+    {
+      "epoch": 0.8353554938135881,
+      "grad_norm": 0.3516523540019989,
+      "learning_rate": 4.1707796539605385e-05,
+      "loss": 4.4323,
+      "step": 2612
+    },
+    {
+      "epoch": 0.8356753083211738,
+      "grad_norm": 0.33103737235069275,
+      "learning_rate": 4.154982494874829e-05,
+      "loss": 4.3093,
+      "step": 2613
+    },
+    {
+      "epoch": 0.8359951228287593,
+      "grad_norm": 0.3542514443397522,
+      "learning_rate": 4.139213082923862e-05,
+      "loss": 4.4259,
+      "step": 2614
+    },
+    {
+      "epoch": 0.8363149373363449,
+      "grad_norm": 0.341665118932724,
+      "learning_rate": 4.12347143503764e-05,
+      "loss": 4.4255,
+      "step": 2615
+    },
+    {
+      "epoch": 0.8366347518439305,
+      "grad_norm": 0.32837289571762085,
+      "learning_rate": 4.107757568116352e-05,
+      "loss": 4.405,
+      "step": 2616
+    },
+    {
+      "epoch": 0.8369545663515161,
+      "grad_norm": 0.3300032317638397,
+      "learning_rate": 4.092071499030355e-05,
+      "loss": 4.3612,
+      "step": 2617
+    },
+    {
+      "epoch": 0.8372743808591018,
+      "grad_norm": 0.33197590708732605,
+      "learning_rate": 4.076413244620177e-05,
+      "loss": 4.3632,
+      "step": 2618
+    },
+    {
+      "epoch": 0.8375941953666873,
+      "grad_norm": 0.33739814162254333,
+      "learning_rate": 4.060782821696458e-05,
+      "loss": 4.3407,
+      "step": 2619
+    },
+    {
+      "epoch": 0.8379140098742729,
+      "grad_norm": 0.3367348313331604,
+      "learning_rate": 4.0451802470399805e-05,
+      "loss": 4.4094,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8382338243818586,
+      "grad_norm": 0.33014410734176636,
+      "learning_rate": 4.029605537401623e-05,
+      "loss": 4.3781,
+      "step": 2621
+    },
+    {
+      "epoch": 0.8385536388894441,
+      "grad_norm": 0.3287923336029053,
+      "learning_rate": 4.01405870950235e-05,
+      "loss": 4.3943,
+      "step": 2622
+    },
+    {
+      "epoch": 0.8388734533970297,
+      "grad_norm": 0.33316370844841003,
+      "learning_rate": 3.9985397800331965e-05,
+      "loss": 4.3524,
+      "step": 2623
+    },
+    {
+      "epoch": 0.8391932679046153,
+      "grad_norm": 0.33590611815452576,
+      "learning_rate": 3.983048765655225e-05,
+      "loss": 4.3676,
+      "step": 2624
+    },
+    {
+      "epoch": 0.8395130824122009,
+      "grad_norm": 0.33867955207824707,
+      "learning_rate": 3.9675856829995513e-05,
+      "loss": 4.365,
+      "step": 2625
+    },
+    {
+      "epoch": 0.8398328969197866,
+      "grad_norm": 0.32774361968040466,
+      "learning_rate": 3.95215054866729e-05,
+      "loss": 4.387,
+      "step": 2626
+    },
+    {
+      "epoch": 0.8401527114273721,
+      "grad_norm": 0.3267671465873718,
+      "learning_rate": 3.936743379229572e-05,
+      "loss": 4.3414,
+      "step": 2627
+    },
+    {
+      "epoch": 0.8404725259349577,
+      "grad_norm": 0.3394142687320709,
+      "learning_rate": 3.921364191227466e-05,
+      "loss": 4.4019,
+      "step": 2628
+    },
+    {
+      "epoch": 0.8407923404425434,
+      "grad_norm": 0.32703539729118347,
+      "learning_rate": 3.9060130011720345e-05,
+      "loss": 4.4492,
+      "step": 2629
+    },
+    {
+      "epoch": 0.8411121549501289,
+      "grad_norm": 0.3314916491508484,
+      "learning_rate": 3.890689825544271e-05,
+      "loss": 4.423,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8414319694577145,
+      "grad_norm": 0.3226998746395111,
+      "learning_rate": 3.875394680795092e-05,
+      "loss": 4.344,
+      "step": 2631
+    },
+    {
+      "epoch": 0.8417517839653001,
+      "grad_norm": 0.3287048935890198,
+      "learning_rate": 3.8601275833453224e-05,
+      "loss": 4.3911,
+      "step": 2632
+    },
+    {
+      "epoch": 0.8420715984728857,
+      "grad_norm": 0.3339630365371704,
+      "learning_rate": 3.844888549585662e-05,
+      "loss": 4.4108,
+      "step": 2633
+    },
+    {
+      "epoch": 0.8423914129804714,
+      "grad_norm": 0.33046579360961914,
+      "learning_rate": 3.829677595876699e-05,
+      "loss": 4.3197,
+      "step": 2634
+    },
+    {
+      "epoch": 0.8427112274880569,
+      "grad_norm": 0.34412693977355957,
+      "learning_rate": 3.814494738548871e-05,
+      "loss": 4.3802,
+      "step": 2635
+    },
+    {
+      "epoch": 0.8430310419956425,
+      "grad_norm": 0.33722245693206787,
+      "learning_rate": 3.799339993902446e-05,
+      "loss": 4.3095,
+      "step": 2636
+    },
+    {
+      "epoch": 0.8433508565032282,
+      "grad_norm": 0.324649840593338,
+      "learning_rate": 3.784213378207522e-05,
+      "loss": 4.4115,
+      "step": 2637
+    },
+    {
+      "epoch": 0.8436706710108137,
+      "grad_norm": 0.32171830534935,
+      "learning_rate": 3.769114907703973e-05,
+      "loss": 4.3975,
+      "step": 2638
+    },
+    {
+      "epoch": 0.8439904855183993,
+      "grad_norm": 0.3386608064174652,
+      "learning_rate": 3.7540445986014845e-05,
+      "loss": 4.2348,
+      "step": 2639
+    },
+    {
+      "epoch": 0.844310300025985,
+      "grad_norm": 0.3258531093597412,
+      "learning_rate": 3.739002467079488e-05,
+      "loss": 4.4067,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8446301145335705,
+      "grad_norm": 0.33536481857299805,
+      "learning_rate": 3.723988529287176e-05,
+      "loss": 4.4138,
+      "step": 2641
+    },
+    {
+      "epoch": 0.8449499290411562,
+      "grad_norm": 0.3187597692012787,
+      "learning_rate": 3.709002801343478e-05,
+      "loss": 4.3633,
+      "step": 2642
+    },
+    {
+      "epoch": 0.8452697435487417,
+      "grad_norm": 0.3352622985839844,
+      "learning_rate": 3.6940452993370105e-05,
+      "loss": 4.443,
+      "step": 2643
+    },
+    {
+      "epoch": 0.8455895580563273,
+      "grad_norm": 0.3221001625061035,
+      "learning_rate": 3.679116039326115e-05,
+      "loss": 4.3375,
+      "step": 2644
+    },
+    {
+      "epoch": 0.845909372563913,
+      "grad_norm": 0.3328348398208618,
+      "learning_rate": 3.664215037338785e-05,
+      "loss": 4.3941,
+      "step": 2645
+    },
+    {
+      "epoch": 0.8462291870714985,
+      "grad_norm": 0.3309707045555115,
+      "learning_rate": 3.6493423093727084e-05,
+      "loss": 4.4173,
+      "step": 2646
+    },
+    {
+      "epoch": 0.8465490015790841,
+      "grad_norm": 0.32627466320991516,
+      "learning_rate": 3.634497871395207e-05,
+      "loss": 4.457,
+      "step": 2647
+    },
+    {
+      "epoch": 0.8468688160866698,
+      "grad_norm": 0.3152695298194885,
+      "learning_rate": 3.6196817393432085e-05,
+      "loss": 4.3698,
+      "step": 2648
+    },
+    {
+      "epoch": 0.8471886305942553,
+      "grad_norm": 0.3303844928741455,
+      "learning_rate": 3.604893929123284e-05,
+      "loss": 4.4141,
+      "step": 2649
+    },
+    {
+      "epoch": 0.847508445101841,
+      "grad_norm": 0.32189106941223145,
+      "learning_rate": 3.590134456611562e-05,
+      "loss": 4.3802,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8478282596094265,
+      "grad_norm": 0.3331794738769531,
+      "learning_rate": 3.5754033376537947e-05,
+      "loss": 4.4424,
+      "step": 2651
+    },
+    {
+      "epoch": 0.8481480741170121,
+      "grad_norm": 0.32761675119400024,
+      "learning_rate": 3.560700588065252e-05,
+      "loss": 4.3987,
+      "step": 2652
+    },
+    {
+      "epoch": 0.8484678886245978,
+      "grad_norm": 0.3419148027896881,
+      "learning_rate": 3.5460262236307657e-05,
+      "loss": 4.3915,
+      "step": 2653
+    },
+    {
+      "epoch": 0.8487877031321833,
+      "grad_norm": 0.34858790040016174,
+      "learning_rate": 3.531380260104698e-05,
+      "loss": 4.4045,
+      "step": 2654
+    },
+    {
+      "epoch": 0.8491075176397689,
+      "grad_norm": 0.3266744017601013,
+      "learning_rate": 3.516762713210891e-05,
+      "loss": 4.4382,
+      "step": 2655
+    },
+    {
+      "epoch": 0.8494273321473546,
+      "grad_norm": 0.32765451073646545,
+      "learning_rate": 3.502173598642728e-05,
+      "loss": 4.4018,
+      "step": 2656
+    },
+    {
+      "epoch": 0.8497471466549401,
+      "grad_norm": 0.33169153332710266,
+      "learning_rate": 3.4876129320630196e-05,
+      "loss": 4.3031,
+      "step": 2657
+    },
+    {
+      "epoch": 0.8500669611625258,
+      "grad_norm": 0.3545640707015991,
+      "learning_rate": 3.473080729104062e-05,
+      "loss": 4.4478,
+      "step": 2658
+    },
+    {
+      "epoch": 0.8503867756701113,
+      "grad_norm": 0.3219090402126312,
+      "learning_rate": 3.4585770053675876e-05,
+      "loss": 4.3893,
+      "step": 2659
+    },
+    {
+      "epoch": 0.8507065901776969,
+      "grad_norm": 0.3292368948459625,
+      "learning_rate": 3.444101776424738e-05,
+      "loss": 4.3986,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8510264046852826,
+      "grad_norm": 0.32431018352508545,
+      "learning_rate": 3.429655057816099e-05,
+      "loss": 4.3436,
+      "step": 2661
+    },
+    {
+      "epoch": 0.8513462191928681,
+      "grad_norm": 0.3349042534828186,
+      "learning_rate": 3.415236865051606e-05,
+      "loss": 4.3661,
+      "step": 2662
+    },
+    {
+      "epoch": 0.8516660337004537,
+      "grad_norm": 0.32648709416389465,
+      "learning_rate": 3.4008472136106046e-05,
+      "loss": 4.3795,
+      "step": 2663
+    },
+    {
+      "epoch": 0.8519858482080394,
+      "grad_norm": 0.32753390073776245,
+      "learning_rate": 3.3864861189417636e-05,
+      "loss": 4.4567,
+      "step": 2664
+    },
+    {
+      "epoch": 0.8523056627156249,
+      "grad_norm": 0.3344663977622986,
+      "learning_rate": 3.3721535964631195e-05,
+      "loss": 4.3676,
+      "step": 2665
+    },
+    {
+      "epoch": 0.8526254772232106,
+      "grad_norm": 0.33271554112434387,
+      "learning_rate": 3.3578496615620307e-05,
+      "loss": 4.3831,
+      "step": 2666
+    },
+    {
+      "epoch": 0.8529452917307961,
+      "grad_norm": 0.32605451345443726,
+      "learning_rate": 3.343574329595157e-05,
+      "loss": 4.4372,
+      "step": 2667
+    },
+    {
+      "epoch": 0.8532651062383817,
+      "grad_norm": 0.3346996307373047,
+      "learning_rate": 3.329327615888461e-05,
+      "loss": 4.3936,
+      "step": 2668
+    },
+    {
+      "epoch": 0.8535849207459674,
+      "grad_norm": 0.33022093772888184,
+      "learning_rate": 3.315109535737155e-05,
+      "loss": 4.3627,
+      "step": 2669
+    },
+    {
+      "epoch": 0.8539047352535529,
+      "grad_norm": 0.33286476135253906,
+      "learning_rate": 3.300920104405739e-05,
+      "loss": 4.3585,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8542245497611385,
+      "grad_norm": 0.3309535086154938,
+      "learning_rate": 3.2867593371279434e-05,
+      "loss": 4.4029,
+      "step": 2671
+    },
+    {
+      "epoch": 0.8545443642687242,
+      "grad_norm": 0.33394062519073486,
+      "learning_rate": 3.272627249106724e-05,
+      "loss": 4.3053,
+      "step": 2672
+    },
+    {
+      "epoch": 0.8548641787763097,
+      "grad_norm": 0.3265056908130646,
+      "learning_rate": 3.258523855514258e-05,
+      "loss": 4.2647,
+      "step": 2673
+    },
+    {
+      "epoch": 0.8551839932838954,
+      "grad_norm": 0.32501220703125,
+      "learning_rate": 3.244449171491896e-05,
+      "loss": 4.3984,
+      "step": 2674
+    },
+    {
+      "epoch": 0.8555038077914809,
+      "grad_norm": 0.33929458260536194,
+      "learning_rate": 3.230403212150179e-05,
+      "loss": 4.4622,
+      "step": 2675
+    },
+    {
+      "epoch": 0.8558236222990665,
+      "grad_norm": 0.3200635313987732,
+      "learning_rate": 3.216385992568813e-05,
+      "loss": 4.4147,
+      "step": 2676
+    },
+    {
+      "epoch": 0.8561434368066522,
+      "grad_norm": 0.3386853039264679,
+      "learning_rate": 3.202397527796637e-05,
+      "loss": 4.3412,
+      "step": 2677
+    },
+    {
+      "epoch": 0.8564632513142377,
+      "grad_norm": 0.3138151466846466,
+      "learning_rate": 3.188437832851639e-05,
+      "loss": 4.3613,
+      "step": 2678
+    },
+    {
+      "epoch": 0.8567830658218234,
+      "grad_norm": 0.3236668109893799,
+      "learning_rate": 3.1745069227208894e-05,
+      "loss": 4.3597,
+      "step": 2679
+    },
+    {
+      "epoch": 0.857102880329409,
+      "grad_norm": 0.3210524022579193,
+      "learning_rate": 3.160604812360579e-05,
+      "loss": 4.3687,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8574226948369945,
+      "grad_norm": 0.32683470845222473,
+      "learning_rate": 3.146731516695974e-05,
+      "loss": 4.3062,
+      "step": 2681
+    },
+    {
+      "epoch": 0.8577425093445802,
+      "grad_norm": 0.3700844645500183,
+      "learning_rate": 3.1328870506214044e-05,
+      "loss": 4.3855,
+      "step": 2682
+    },
+    {
+      "epoch": 0.8580623238521657,
+      "grad_norm": 0.32976222038269043,
+      "learning_rate": 3.119071429000254e-05,
+      "loss": 4.4458,
+      "step": 2683
+    },
+    {
+      "epoch": 0.8583821383597513,
+      "grad_norm": 0.32784411311149597,
+      "learning_rate": 3.105284666664918e-05,
+      "loss": 4.3289,
+      "step": 2684
+    },
+    {
+      "epoch": 0.858701952867337,
+      "grad_norm": 0.31430763006210327,
+      "learning_rate": 3.091526778416833e-05,
+      "loss": 4.3308,
+      "step": 2685
+    },
+    {
+      "epoch": 0.8590217673749225,
+      "grad_norm": 0.3267063498497009,
+      "learning_rate": 3.077797779026428e-05,
+      "loss": 4.3948,
+      "step": 2686
+    },
+    {
+      "epoch": 0.8593415818825082,
+      "grad_norm": 0.33419206738471985,
+      "learning_rate": 3.064097683233121e-05,
+      "loss": 4.3861,
+      "step": 2687
+    },
+    {
+      "epoch": 0.8596613963900938,
+      "grad_norm": 0.3286486268043518,
+      "learning_rate": 3.0504265057452815e-05,
+      "loss": 4.2863,
+      "step": 2688
+    },
+    {
+      "epoch": 0.8599812108976793,
+      "grad_norm": 0.33864644169807434,
+      "learning_rate": 3.036784261240255e-05,
+      "loss": 4.2062,
+      "step": 2689
+    },
+    {
+      "epoch": 0.860301025405265,
+      "grad_norm": 0.33045342564582825,
+      "learning_rate": 3.0231709643643086e-05,
+      "loss": 4.3897,
+      "step": 2690
+    },
+    {
+      "epoch": 0.8606208399128505,
+      "grad_norm": 0.33123284578323364,
+      "learning_rate": 3.0095866297326455e-05,
+      "loss": 4.3802,
+      "step": 2691
+    },
+    {
+      "epoch": 0.8609406544204361,
+      "grad_norm": 0.33128055930137634,
+      "learning_rate": 2.996031271929369e-05,
+      "loss": 4.3809,
+      "step": 2692
+    },
+    {
+      "epoch": 0.8612604689280218,
+      "grad_norm": 0.3309783339500427,
+      "learning_rate": 2.982504905507461e-05,
+      "loss": 4.331,
+      "step": 2693
+    },
+    {
+      "epoch": 0.8615802834356073,
+      "grad_norm": 0.3281680941581726,
+      "learning_rate": 2.969007544988793e-05,
+      "loss": 4.2814,
+      "step": 2694
+    },
+    {
+      "epoch": 0.861900097943193,
+      "grad_norm": 0.3542790114879608,
+      "learning_rate": 2.9555392048640924e-05,
+      "loss": 4.3908,
+      "step": 2695
+    },
+    {
+      "epoch": 0.8622199124507786,
+      "grad_norm": 0.3346249461174011,
+      "learning_rate": 2.9420998995929267e-05,
+      "loss": 4.4109,
+      "step": 2696
+    },
+    {
+      "epoch": 0.8625397269583641,
+      "grad_norm": 0.3290999233722687,
+      "learning_rate": 2.9286896436037076e-05,
+      "loss": 4.389,
+      "step": 2697
+    },
+    {
+      "epoch": 0.8628595414659498,
+      "grad_norm": 0.337666779756546,
+      "learning_rate": 2.9153084512936285e-05,
+      "loss": 4.422,
+      "step": 2698
+    },
+    {
+      "epoch": 0.8631793559735353,
+      "grad_norm": 0.3327541947364807,
+      "learning_rate": 2.9019563370287112e-05,
+      "loss": 4.3749,
+      "step": 2699
+    },
+    {
+      "epoch": 0.8634991704811209,
+      "grad_norm": 0.31937623023986816,
+      "learning_rate": 2.8886333151437292e-05,
+      "loss": 4.3105,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8634991704811209,
+      "eval_loss": 4.390111923217773,
+      "eval_runtime": 92.48,
+      "eval_samples_per_second": 20.513,
+      "eval_steps_per_second": 5.136,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8638189849887066,
+      "grad_norm": 0.32369351387023926,
+      "learning_rate": 2.875339399942257e-05,
+      "loss": 4.3694,
+      "step": 2701
+    },
+    {
+      "epoch": 0.8641387994962921,
+      "grad_norm": 0.3404898941516876,
+      "learning_rate": 2.862074605696605e-05,
+      "loss": 4.3563,
+      "step": 2702
+    },
+    {
+      "epoch": 0.8644586140038778,
+      "grad_norm": 0.3288138508796692,
+      "learning_rate": 2.848838946647801e-05,
+      "loss": 4.3834,
+      "step": 2703
+    },
+    {
+      "epoch": 0.8647784285114634,
+      "grad_norm": 0.35545358061790466,
+      "learning_rate": 2.835632437005626e-05,
+      "loss": 4.2707,
+      "step": 2704
+    },
+    {
+      "epoch": 0.8650982430190489,
+      "grad_norm": 0.32958588004112244,
+      "learning_rate": 2.8224550909485344e-05,
+      "loss": 4.4501,
+      "step": 2705
+    },
+    {
+      "epoch": 0.8654180575266346,
+      "grad_norm": 0.32998397946357727,
+      "learning_rate": 2.8093069226236865e-05,
+      "loss": 4.3847,
+      "step": 2706
+    },
+    {
+      "epoch": 0.8657378720342201,
+      "grad_norm": 0.3279590308666229,
+      "learning_rate": 2.796187946146937e-05,
+      "loss": 4.3963,
+      "step": 2707
+    },
+    {
+      "epoch": 0.8660576865418057,
+      "grad_norm": 0.34240469336509705,
+      "learning_rate": 2.7830981756027636e-05,
+      "loss": 4.2706,
+      "step": 2708
+    },
+    {
+      "epoch": 0.8663775010493914,
+      "grad_norm": 0.32866594195365906,
+      "learning_rate": 2.7700376250443147e-05,
+      "loss": 4.3242,
+      "step": 2709
+    },
+    {
+      "epoch": 0.8666973155569769,
+      "grad_norm": 0.3390978276729584,
+      "learning_rate": 2.757006308493347e-05,
+      "loss": 4.3714,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8670171300645626,
+      "grad_norm": 0.3310787081718445,
+      "learning_rate": 2.7440042399402496e-05,
+      "loss": 4.3714,
+      "step": 2711
+    },
+    {
+      "epoch": 0.8673369445721482,
+      "grad_norm": 0.3252509832382202,
+      "learning_rate": 2.7310314333440097e-05,
+      "loss": 4.2953,
+      "step": 2712
+    },
+    {
+      "epoch": 0.8676567590797337,
+      "grad_norm": 0.3651213049888611,
+      "learning_rate": 2.7180879026321866e-05,
+      "loss": 4.4167,
+      "step": 2713
+    },
+    {
+      "epoch": 0.8679765735873194,
+      "grad_norm": 0.3748612105846405,
+      "learning_rate": 2.7051736617009277e-05,
+      "loss": 4.3505,
+      "step": 2714
+    },
+    {
+      "epoch": 0.868296388094905,
+      "grad_norm": 0.34408658742904663,
+      "learning_rate": 2.6922887244149126e-05,
+      "loss": 4.4079,
+      "step": 2715
+    },
+    {
+      "epoch": 0.8686162026024905,
+      "grad_norm": 0.33819928765296936,
+      "learning_rate": 2.6794331046073724e-05,
+      "loss": 4.2749,
+      "step": 2716
+    },
+    {
+      "epoch": 0.8689360171100762,
+      "grad_norm": 0.33545467257499695,
+      "learning_rate": 2.6666068160800702e-05,
+      "loss": 4.4416,
+      "step": 2717
+    },
+    {
+      "epoch": 0.8692558316176617,
+      "grad_norm": 0.3370988667011261,
+      "learning_rate": 2.6538098726032675e-05,
+      "loss": 4.3412,
+      "step": 2718
+    },
+    {
+      "epoch": 0.8695756461252474,
+      "grad_norm": 0.33135709166526794,
+      "learning_rate": 2.6410422879157313e-05,
+      "loss": 4.3847,
+      "step": 2719
+    },
+    {
+      "epoch": 0.869895460632833,
+      "grad_norm": 0.36448410153388977,
+      "learning_rate": 2.628304075724693e-05,
+      "loss": 4.3429,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8702152751404185,
+      "grad_norm": 0.39434826374053955,
+      "learning_rate": 2.6155952497058643e-05,
+      "loss": 4.3748,
+      "step": 2721
+    },
+    {
+      "epoch": 0.8705350896480042,
+      "grad_norm": 0.3282822072505951,
+      "learning_rate": 2.6029158235033997e-05,
+      "loss": 4.4443,
+      "step": 2722
+    },
+    {
+      "epoch": 0.8708549041555897,
+      "grad_norm": 0.3259486258029938,
+      "learning_rate": 2.5902658107299078e-05,
+      "loss": 4.4201,
+      "step": 2723
+    },
+    {
+      "epoch": 0.8711747186631753,
+      "grad_norm": 0.35804739594459534,
+      "learning_rate": 2.5776452249663847e-05,
+      "loss": 4.3938,
+      "step": 2724
+    },
+    {
+      "epoch": 0.871494533170761,
+      "grad_norm": 0.32361823320388794,
+      "learning_rate": 2.5650540797622687e-05,
+      "loss": 4.4667,
+      "step": 2725
+    },
+    {
+      "epoch": 0.8718143476783465,
+      "grad_norm": 0.32772985100746155,
+      "learning_rate": 2.5524923886353697e-05,
+      "loss": 4.3217,
+      "step": 2726
+    },
+    {
+      "epoch": 0.8721341621859322,
+      "grad_norm": 0.3569638729095459,
+      "learning_rate": 2.5399601650718838e-05,
+      "loss": 4.4559,
+      "step": 2727
+    },
+    {
+      "epoch": 0.8724539766935178,
+      "grad_norm": 0.3326680064201355,
+      "learning_rate": 2.5274574225263776e-05,
+      "loss": 4.366,
+      "step": 2728
+    },
+    {
+      "epoch": 0.8727737912011033,
+      "grad_norm": 0.3196980655193329,
+      "learning_rate": 2.5149841744217415e-05,
+      "loss": 4.293,
+      "step": 2729
+    },
+    {
+      "epoch": 0.873093605708689,
+      "grad_norm": 0.331160306930542,
+      "learning_rate": 2.5025404341492327e-05,
+      "loss": 4.3402,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8734134202162745,
+      "grad_norm": 0.33455970883369446,
+      "learning_rate": 2.4901262150684055e-05,
+      "loss": 4.4117,
+      "step": 2731
+    },
+    {
+      "epoch": 0.8737332347238601,
+      "grad_norm": 0.3468061089515686,
+      "learning_rate": 2.4777415305071346e-05,
+      "loss": 4.4355,
+      "step": 2732
+    },
+    {
+      "epoch": 0.8740530492314458,
+      "grad_norm": 0.3290843963623047,
+      "learning_rate": 2.4653863937615813e-05,
+      "loss": 4.4007,
+      "step": 2733
+    },
+    {
+      "epoch": 0.8743728637390313,
+      "grad_norm": 0.3317509889602661,
+      "learning_rate": 2.4530608180961786e-05,
+      "loss": 4.3998,
+      "step": 2734
+    },
+    {
+      "epoch": 0.874692678246617,
+      "grad_norm": 0.3440130949020386,
+      "learning_rate": 2.440764816743631e-05,
+      "loss": 4.362,
+      "step": 2735
+    },
+    {
+      "epoch": 0.8750124927542026,
+      "grad_norm": 0.35315367579460144,
+      "learning_rate": 2.428498402904889e-05,
+      "loss": 4.3674,
+      "step": 2736
+    },
+    {
+      "epoch": 0.8753323072617881,
+      "grad_norm": 0.3442535698413849,
+      "learning_rate": 2.416261589749139e-05,
+      "loss": 4.4137,
+      "step": 2737
+    },
+    {
+      "epoch": 0.8756521217693738,
+      "grad_norm": 0.3287537097930908,
+      "learning_rate": 2.4040543904137942e-05,
+      "loss": 4.3668,
+      "step": 2738
+    },
+    {
+      "epoch": 0.8759719362769594,
+      "grad_norm": 0.32724133133888245,
+      "learning_rate": 2.391876818004452e-05,
+      "loss": 4.3551,
+      "step": 2739
+    },
+    {
+      "epoch": 0.8762917507845449,
+      "grad_norm": 0.3481774628162384,
+      "learning_rate": 2.3797288855949382e-05,
+      "loss": 4.2624,
+      "step": 2740
+    },
+    {
+      "epoch": 0.8766115652921306,
+      "grad_norm": 0.33640459179878235,
+      "learning_rate": 2.3676106062272126e-05,
+      "loss": 4.4033,
+      "step": 2741
+    },
+    {
+      "epoch": 0.8769313797997161,
+      "grad_norm": 0.33367154002189636,
+      "learning_rate": 2.3555219929114454e-05,
+      "loss": 4.4157,
+      "step": 2742
+    },
+    {
+      "epoch": 0.8772511943073018,
+      "grad_norm": 0.32960766553878784,
+      "learning_rate": 2.343463058625932e-05,
+      "loss": 4.3441,
+      "step": 2743
+    },
+    {
+      "epoch": 0.8775710088148874,
+      "grad_norm": 0.33007586002349854,
+      "learning_rate": 2.331433816317102e-05,
+      "loss": 4.4697,
+      "step": 2744
+    },
+    {
+      "epoch": 0.8778908233224729,
+      "grad_norm": 0.32566601037979126,
+      "learning_rate": 2.3194342788995257e-05,
+      "loss": 4.2543,
+      "step": 2745
+    },
+    {
+      "epoch": 0.8782106378300586,
+      "grad_norm": 0.3600383698940277,
+      "learning_rate": 2.307464459255851e-05,
+      "loss": 4.3637,
+      "step": 2746
+    },
+    {
+      "epoch": 0.8785304523376442,
+      "grad_norm": 0.3245023488998413,
+      "learning_rate": 2.2955243702368652e-05,
+      "loss": 4.3556,
+      "step": 2747
+    },
+    {
+      "epoch": 0.8788502668452298,
+      "grad_norm": 0.32192283868789673,
+      "learning_rate": 2.2836140246613977e-05,
+      "loss": 4.3702,
+      "step": 2748
+    },
+    {
+      "epoch": 0.8791700813528154,
+      "grad_norm": 0.32928332686424255,
+      "learning_rate": 2.271733435316363e-05,
+      "loss": 4.4101,
+      "step": 2749
+    },
+    {
+      "epoch": 0.8794898958604009,
+      "grad_norm": 0.3395914137363434,
+      "learning_rate": 2.2598826149567352e-05,
+      "loss": 4.4447,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8798097103679866,
+      "grad_norm": 0.33036407828330994,
+      "learning_rate": 2.2480615763055032e-05,
+      "loss": 4.335,
+      "step": 2751
+    },
+    {
+      "epoch": 0.8801295248755722,
+      "grad_norm": 0.331396222114563,
+      "learning_rate": 2.2362703320537156e-05,
+      "loss": 4.3357,
+      "step": 2752
+    },
+    {
+      "epoch": 0.8804493393831577,
+      "grad_norm": 0.32844099402427673,
+      "learning_rate": 2.2245088948604095e-05,
+      "loss": 4.3891,
+      "step": 2753
+    },
+    {
+      "epoch": 0.8807691538907434,
+      "grad_norm": 0.3302816152572632,
+      "learning_rate": 2.2127772773526342e-05,
+      "loss": 4.3532,
+      "step": 2754
+    },
+    {
+      "epoch": 0.881088968398329,
+      "grad_norm": 0.33271288871765137,
+      "learning_rate": 2.201075492125415e-05,
+      "loss": 4.3602,
+      "step": 2755
+    },
+    {
+      "epoch": 0.8814087829059146,
+      "grad_norm": 0.32084935903549194,
+      "learning_rate": 2.1894035517417486e-05,
+      "loss": 4.3989,
+      "step": 2756
+    },
+    {
+      "epoch": 0.8817285974135002,
+      "grad_norm": 0.33213767409324646,
+      "learning_rate": 2.1777614687326116e-05,
+      "loss": 4.3145,
+      "step": 2757
+    },
+    {
+      "epoch": 0.8820484119210857,
+      "grad_norm": 0.3316733241081238,
+      "learning_rate": 2.166149255596896e-05,
+      "loss": 4.3445,
+      "step": 2758
+    },
+    {
+      "epoch": 0.8823682264286714,
+      "grad_norm": 0.3338378369808197,
+      "learning_rate": 2.154566924801453e-05,
+      "loss": 4.3665,
+      "step": 2759
+    },
+    {
+      "epoch": 0.882688040936257,
+      "grad_norm": 0.32423871755599976,
+      "learning_rate": 2.1430144887810218e-05,
+      "loss": 4.3968,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8830078554438425,
+      "grad_norm": 0.3304431736469269,
+      "learning_rate": 2.131491959938275e-05,
+      "loss": 4.364,
+      "step": 2761
+    },
+    {
+      "epoch": 0.8833276699514282,
+      "grad_norm": 0.3281329572200775,
+      "learning_rate": 2.119999350643764e-05,
+      "loss": 4.3357,
+      "step": 2762
+    },
+    {
+      "epoch": 0.8836474844590138,
+      "grad_norm": 0.3241371512413025,
+      "learning_rate": 2.108536673235922e-05,
+      "loss": 4.3018,
+      "step": 2763
+    },
+    {
+      "epoch": 0.8839672989665994,
+      "grad_norm": 0.3227525055408478,
+      "learning_rate": 2.0971039400210453e-05,
+      "loss": 4.3876,
+      "step": 2764
+    },
+    {
+      "epoch": 0.884287113474185,
+      "grad_norm": 0.31972551345825195,
+      "learning_rate": 2.0857011632732755e-05,
+      "loss": 4.335,
+      "step": 2765
+    },
+    {
+      "epoch": 0.8846069279817705,
+      "grad_norm": 0.3417794704437256,
+      "learning_rate": 2.0743283552346067e-05,
+      "loss": 4.3918,
+      "step": 2766
+    },
+    {
+      "epoch": 0.8849267424893562,
+      "grad_norm": 0.3201994001865387,
+      "learning_rate": 2.062985528114852e-05,
+      "loss": 4.3126,
+      "step": 2767
+    },
+    {
+      "epoch": 0.8852465569969418,
+      "grad_norm": 0.32395732402801514,
+      "learning_rate": 2.0516726940916372e-05,
+      "loss": 4.2857,
+      "step": 2768
+    },
+    {
+      "epoch": 0.8855663715045273,
+      "grad_norm": 0.3288622498512268,
+      "learning_rate": 2.0403898653103867e-05,
+      "loss": 4.4953,
+      "step": 2769
+    },
+    {
+      "epoch": 0.885886186012113,
+      "grad_norm": 0.3226605951786041,
+      "learning_rate": 2.029137053884311e-05,
+      "loss": 4.4309,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8862060005196986,
+      "grad_norm": 0.31975656747817993,
+      "learning_rate": 2.0179142718943964e-05,
+      "loss": 4.3941,
+      "step": 2771
+    },
+    {
+      "epoch": 0.8865258150272842,
+      "grad_norm": 0.32429590821266174,
+      "learning_rate": 2.006721531389388e-05,
+      "loss": 4.4056,
+      "step": 2772
+    },
+    {
+      "epoch": 0.8868456295348698,
+      "grad_norm": 0.3391755521297455,
+      "learning_rate": 1.9955588443857807e-05,
+      "loss": 4.3843,
+      "step": 2773
+    },
+    {
+      "epoch": 0.8871654440424553,
+      "grad_norm": 0.3246627151966095,
+      "learning_rate": 1.9844262228678077e-05,
+      "loss": 4.3636,
+      "step": 2774
+    },
+    {
+      "epoch": 0.887485258550041,
+      "grad_norm": 0.3355211913585663,
+      "learning_rate": 1.9733236787874053e-05,
+      "loss": 4.3891,
+      "step": 2775
+    },
+    {
+      "epoch": 0.8878050730576266,
+      "grad_norm": 0.3338679075241089,
+      "learning_rate": 1.9622512240642386e-05,
+      "loss": 4.3871,
+      "step": 2776
+    },
+    {
+      "epoch": 0.8881248875652121,
+      "grad_norm": 0.3314814567565918,
+      "learning_rate": 1.9512088705856654e-05,
+      "loss": 4.4157,
+      "step": 2777
+    },
+    {
+      "epoch": 0.8884447020727978,
+      "grad_norm": 0.32310590147972107,
+      "learning_rate": 1.9401966302067262e-05,
+      "loss": 4.3733,
+      "step": 2778
+    },
+    {
+      "epoch": 0.8887645165803834,
+      "grad_norm": 0.35283005237579346,
+      "learning_rate": 1.9292145147501204e-05,
+      "loss": 4.3552,
+      "step": 2779
+    },
+    {
+      "epoch": 0.889084331087969,
+      "grad_norm": 0.3279760181903839,
+      "learning_rate": 1.91826253600622e-05,
+      "loss": 4.3163,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8894041455955546,
+      "grad_norm": 0.3331490159034729,
+      "learning_rate": 1.907340705733036e-05,
+      "loss": 4.3924,
+      "step": 2781
+    },
+    {
+      "epoch": 0.8897239601031401,
+      "grad_norm": 0.32731348276138306,
+      "learning_rate": 1.8964490356562155e-05,
+      "loss": 4.3928,
+      "step": 2782
+    },
+    {
+      "epoch": 0.8900437746107258,
+      "grad_norm": 0.32343342900276184,
+      "learning_rate": 1.8855875374690288e-05,
+      "loss": 4.3203,
+      "step": 2783
+    },
+    {
+      "epoch": 0.8903635891183114,
+      "grad_norm": 0.37520188093185425,
+      "learning_rate": 1.8747562228323344e-05,
+      "loss": 4.3195,
+      "step": 2784
+    },
+    {
+      "epoch": 0.8906834036258969,
+      "grad_norm": 0.32275810837745667,
+      "learning_rate": 1.863955103374607e-05,
+      "loss": 4.3777,
+      "step": 2785
+    },
+    {
+      "epoch": 0.8910032181334826,
+      "grad_norm": 0.35362449288368225,
+      "learning_rate": 1.8531841906918976e-05,
+      "loss": 4.3911,
+      "step": 2786
+    },
+    {
+      "epoch": 0.8913230326410682,
+      "grad_norm": 0.3307854235172272,
+      "learning_rate": 1.8424434963478262e-05,
+      "loss": 4.3747,
+      "step": 2787
+    },
+    {
+      "epoch": 0.8916428471486538,
+      "grad_norm": 0.33758893609046936,
+      "learning_rate": 1.8317330318735757e-05,
+      "loss": 4.3987,
+      "step": 2788
+    },
+    {
+      "epoch": 0.8919626616562394,
+      "grad_norm": 0.33458662033081055,
+      "learning_rate": 1.8210528087678577e-05,
+      "loss": 4.3698,
+      "step": 2789
+    },
+    {
+      "epoch": 0.892282476163825,
+      "grad_norm": 0.31996577978134155,
+      "learning_rate": 1.810402838496937e-05,
+      "loss": 4.3393,
+      "step": 2790
+    },
+    {
+      "epoch": 0.8926022906714106,
+      "grad_norm": 0.32762593030929565,
+      "learning_rate": 1.799783132494581e-05,
+      "loss": 4.3347,
+      "step": 2791
+    },
+    {
+      "epoch": 0.8929221051789962,
+      "grad_norm": 0.3323177397251129,
+      "learning_rate": 1.789193702162086e-05,
+      "loss": 4.4891,
+      "step": 2792
+    },
+    {
+      "epoch": 0.8932419196865817,
+      "grad_norm": 0.33863160014152527,
+      "learning_rate": 1.7786345588682317e-05,
+      "loss": 4.4102,
+      "step": 2793
+    },
+    {
+      "epoch": 0.8935617341941674,
+      "grad_norm": 0.3298216462135315,
+      "learning_rate": 1.7681057139492792e-05,
+      "loss": 4.3999,
+      "step": 2794
+    },
+    {
+      "epoch": 0.893881548701753,
+      "grad_norm": 0.33105114102363586,
+      "learning_rate": 1.7576071787089672e-05,
+      "loss": 4.359,
+      "step": 2795
+    },
+    {
+      "epoch": 0.8942013632093386,
+      "grad_norm": 0.32095324993133545,
+      "learning_rate": 1.7471389644184897e-05,
+      "loss": 4.4448,
+      "step": 2796
+    },
+    {
+      "epoch": 0.8945211777169242,
+      "grad_norm": 0.3180572986602783,
+      "learning_rate": 1.7367010823164862e-05,
+      "loss": 4.4066,
+      "step": 2797
+    },
+    {
+      "epoch": 0.8948409922245097,
+      "grad_norm": 0.32465660572052,
+      "learning_rate": 1.726293543609053e-05,
+      "loss": 4.3098,
+      "step": 2798
+    },
+    {
+      "epoch": 0.8951608067320954,
+      "grad_norm": 0.32698917388916016,
+      "learning_rate": 1.7159163594696756e-05,
+      "loss": 4.3551,
+      "step": 2799
+    },
+    {
+      "epoch": 0.895480621239681,
+      "grad_norm": 0.3233741223812103,
+      "learning_rate": 1.7055695410392823e-05,
+      "loss": 4.2949,
+      "step": 2800
+    },
+    {
+      "epoch": 0.895480621239681,
+      "eval_loss": 4.384608268737793,
+      "eval_runtime": 97.7804,
+      "eval_samples_per_second": 19.401,
+      "eval_steps_per_second": 4.858,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8958004357472665,
+      "grad_norm": 0.32978156208992004,
+      "learning_rate": 1.695253099426177e-05,
+      "loss": 4.2676,
+      "step": 2801
+    },
+    {
+      "epoch": 0.8961202502548522,
+      "grad_norm": 0.3305906057357788,
+      "learning_rate": 1.6849670457060605e-05,
+      "loss": 4.353,
+      "step": 2802
+    },
+    {
+      "epoch": 0.8964400647624378,
+      "grad_norm": 0.33238425850868225,
+      "learning_rate": 1.6747113909220155e-05,
+      "loss": 4.374,
+      "step": 2803
+    },
+    {
+      "epoch": 0.8967598792700234,
+      "grad_norm": 0.33963367342948914,
+      "learning_rate": 1.6644861460844782e-05,
+      "loss": 4.2773,
+      "step": 2804
+    },
+    {
+      "epoch": 0.897079693777609,
+      "grad_norm": 0.3328540623188019,
+      "learning_rate": 1.6542913221712506e-05,
+      "loss": 4.4412,
+      "step": 2805
+    },
+    {
+      "epoch": 0.8973995082851945,
+      "grad_norm": 0.3270185887813568,
+      "learning_rate": 1.6441269301274572e-05,
+      "loss": 4.3709,
+      "step": 2806
+    },
+    {
+      "epoch": 0.8977193227927802,
+      "grad_norm": 0.31789711117744446,
+      "learning_rate": 1.633992980865556e-05,
+      "loss": 4.3758,
+      "step": 2807
+    },
+    {
+      "epoch": 0.8980391373003658,
+      "grad_norm": 0.33433958888053894,
+      "learning_rate": 1.6238894852653338e-05,
+      "loss": 4.3488,
+      "step": 2808
+    },
+    {
+      "epoch": 0.8983589518079513,
+      "grad_norm": 0.3263933062553406,
+      "learning_rate": 1.6138164541738674e-05,
+      "loss": 4.4204,
+      "step": 2809
+    },
+    {
+      "epoch": 0.898678766315537,
+      "grad_norm": 0.3347971737384796,
+      "learning_rate": 1.6037738984055425e-05,
+      "loss": 4.34,
+      "step": 2810
+    },
+    {
+      "epoch": 0.8989985808231226,
+      "grad_norm": 0.32892242074012756,
+      "learning_rate": 1.5937618287420052e-05,
+      "loss": 4.3679,
+      "step": 2811
+    },
+    {
+      "epoch": 0.8993183953307082,
+      "grad_norm": 0.3295742869377136,
+      "learning_rate": 1.583780255932193e-05,
+      "loss": 4.3666,
+      "step": 2812
+    },
+    {
+      "epoch": 0.8996382098382938,
+      "grad_norm": 0.33569231629371643,
+      "learning_rate": 1.5738291906922883e-05,
+      "loss": 4.3431,
+      "step": 2813
+    },
+    {
+      "epoch": 0.8999580243458793,
+      "grad_norm": 0.3311055898666382,
+      "learning_rate": 1.5639086437057314e-05,
+      "loss": 4.3241,
+      "step": 2814
+    },
+    {
+      "epoch": 0.900277838853465,
+      "grad_norm": 0.4988267421722412,
+      "learning_rate": 1.5540186256231823e-05,
+      "loss": 4.3952,
+      "step": 2815
+    },
+    {
+      "epoch": 0.9005976533610506,
+      "grad_norm": 0.3264239728450775,
+      "learning_rate": 1.5441591470625414e-05,
+      "loss": 4.333,
+      "step": 2816
+    },
+    {
+      "epoch": 0.9009174678686362,
+      "grad_norm": 0.3201085031032562,
+      "learning_rate": 1.534330218608918e-05,
+      "loss": 4.4191,
+      "step": 2817
+    },
+    {
+      "epoch": 0.9012372823762218,
+      "grad_norm": 0.3359006643295288,
+      "learning_rate": 1.5245318508146175e-05,
+      "loss": 4.3771,
+      "step": 2818
+    },
+    {
+      "epoch": 0.9015570968838074,
+      "grad_norm": 0.3259337544441223,
+      "learning_rate": 1.5147640541991424e-05,
+      "loss": 4.4356,
+      "step": 2819
+    },
+    {
+      "epoch": 0.901876911391393,
+      "grad_norm": 0.3287445306777954,
+      "learning_rate": 1.5050268392491639e-05,
+      "loss": 4.4308,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9021967258989786,
+      "grad_norm": 0.33562859892845154,
+      "learning_rate": 1.4953202164185297e-05,
+      "loss": 4.4679,
+      "step": 2821
+    },
+    {
+      "epoch": 0.9025165404065641,
+      "grad_norm": 0.3279714584350586,
+      "learning_rate": 1.4856441961282472e-05,
+      "loss": 4.4241,
+      "step": 2822
+    },
+    {
+      "epoch": 0.9028363549141498,
+      "grad_norm": 0.3326357305049896,
+      "learning_rate": 1.4759987887664537e-05,
+      "loss": 4.3265,
+      "step": 2823
+    },
+    {
+      "epoch": 0.9031561694217354,
+      "grad_norm": 0.3204142451286316,
+      "learning_rate": 1.4663840046884423e-05,
+      "loss": 4.29,
+      "step": 2824
+    },
+    {
+      "epoch": 0.903475983929321,
+      "grad_norm": 0.32729199528694153,
+      "learning_rate": 1.456799854216606e-05,
+      "loss": 4.3067,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9037957984369066,
+      "grad_norm": 0.3270171880722046,
+      "learning_rate": 1.447246347640464e-05,
+      "loss": 4.3767,
+      "step": 2826
+    },
+    {
+      "epoch": 0.9041156129444922,
+      "grad_norm": 0.3349086344242096,
+      "learning_rate": 1.437723495216635e-05,
+      "loss": 4.2788,
+      "step": 2827
+    },
+    {
+      "epoch": 0.9044354274520778,
+      "grad_norm": 0.3228219449520111,
+      "learning_rate": 1.4282313071688211e-05,
+      "loss": 4.4143,
+      "step": 2828
+    },
+    {
+      "epoch": 0.9047552419596634,
+      "grad_norm": 0.3361509442329407,
+      "learning_rate": 1.4187697936878172e-05,
+      "loss": 4.3191,
+      "step": 2829
+    },
+    {
+      "epoch": 0.905075056467249,
+      "grad_norm": 0.3282942473888397,
+      "learning_rate": 1.4093389649314613e-05,
+      "loss": 4.3967,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9053948709748346,
+      "grad_norm": 0.3477500379085541,
+      "learning_rate": 1.399938831024674e-05,
+      "loss": 4.5111,
+      "step": 2831
+    },
+    {
+      "epoch": 0.9057146854824202,
+      "grad_norm": 0.3260105550289154,
+      "learning_rate": 1.3905694020594093e-05,
+      "loss": 4.4794,
+      "step": 2832
+    },
+    {
+      "epoch": 0.9060344999900058,
+      "grad_norm": 0.3467327952384949,
+      "learning_rate": 1.3812306880946577e-05,
+      "loss": 4.3209,
+      "step": 2833
+    },
+    {
+      "epoch": 0.9063543144975914,
+      "grad_norm": 0.3284205496311188,
+      "learning_rate": 1.3719226991564392e-05,
+      "loss": 4.3277,
+      "step": 2834
+    },
+    {
+      "epoch": 0.906674129005177,
+      "grad_norm": 0.33202722668647766,
+      "learning_rate": 1.3626454452377734e-05,
+      "loss": 4.4188,
+      "step": 2835
+    },
+    {
+      "epoch": 0.9069939435127626,
+      "grad_norm": 0.33401980996131897,
+      "learning_rate": 1.3533989362987063e-05,
+      "loss": 4.4344,
+      "step": 2836
+    },
+    {
+      "epoch": 0.9073137580203482,
+      "grad_norm": 0.330626517534256,
+      "learning_rate": 1.3441831822662441e-05,
+      "loss": 4.3946,
+      "step": 2837
+    },
+    {
+      "epoch": 0.9076335725279338,
+      "grad_norm": 0.3449816107749939,
+      "learning_rate": 1.3349981930344156e-05,
+      "loss": 4.346,
+      "step": 2838
+    },
+    {
+      "epoch": 0.9079533870355194,
+      "grad_norm": 0.33034005761146545,
+      "learning_rate": 1.3258439784641795e-05,
+      "loss": 4.3935,
+      "step": 2839
+    },
+    {
+      "epoch": 0.908273201543105,
+      "grad_norm": 0.33299288153648376,
+      "learning_rate": 1.3167205483834842e-05,
+      "loss": 4.3037,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9085930160506906,
+      "grad_norm": 0.3254457712173462,
+      "learning_rate": 1.307627912587218e-05,
+      "loss": 4.3259,
+      "step": 2841
+    },
+    {
+      "epoch": 0.9089128305582762,
+      "grad_norm": 0.32772693037986755,
+      "learning_rate": 1.2985660808371955e-05,
+      "loss": 4.435,
+      "step": 2842
+    },
+    {
+      "epoch": 0.9092326450658618,
+      "grad_norm": 0.3270621597766876,
+      "learning_rate": 1.2895350628621882e-05,
+      "loss": 4.2919,
+      "step": 2843
+    },
+    {
+      "epoch": 0.9095524595734474,
+      "grad_norm": 0.32787418365478516,
+      "learning_rate": 1.2805348683578598e-05,
+      "loss": 4.3179,
+      "step": 2844
+    },
+    {
+      "epoch": 0.909872274081033,
+      "grad_norm": 0.32550475001335144,
+      "learning_rate": 1.271565506986798e-05,
+      "loss": 4.3153,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9101920885886186,
+      "grad_norm": 0.3306429982185364,
+      "learning_rate": 1.2626269883784834e-05,
+      "loss": 4.3148,
+      "step": 2846
+    },
+    {
+      "epoch": 0.9105119030962042,
+      "grad_norm": 0.32415875792503357,
+      "learning_rate": 1.2537193221292763e-05,
+      "loss": 4.3869,
+      "step": 2847
+    },
+    {
+      "epoch": 0.9108317176037898,
+      "grad_norm": 0.3241727352142334,
+      "learning_rate": 1.2448425178024302e-05,
+      "loss": 4.408,
+      "step": 2848
+    },
+    {
+      "epoch": 0.9111515321113755,
+      "grad_norm": 0.34127408266067505,
+      "learning_rate": 1.2359965849280518e-05,
+      "loss": 4.3713,
+      "step": 2849
+    },
+    {
+      "epoch": 0.911471346618961,
+      "grad_norm": 0.3368206322193146,
+      "learning_rate": 1.2271815330031076e-05,
+      "loss": 4.4106,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9117911611265466,
+      "grad_norm": 0.32262179255485535,
+      "learning_rate": 1.218397371491414e-05,
+      "loss": 4.3404,
+      "step": 2851
+    },
+    {
+      "epoch": 0.9121109756341322,
+      "grad_norm": 0.3209473788738251,
+      "learning_rate": 1.2096441098236108e-05,
+      "loss": 4.3591,
+      "step": 2852
+    },
+    {
+      "epoch": 0.9124307901417178,
+      "grad_norm": 0.33111003041267395,
+      "learning_rate": 1.2009217573971907e-05,
+      "loss": 4.3861,
+      "step": 2853
+    },
+    {
+      "epoch": 0.9127506046493034,
+      "grad_norm": 0.3383842408657074,
+      "learning_rate": 1.1922303235764363e-05,
+      "loss": 4.4078,
+      "step": 2854
+    },
+    {
+      "epoch": 0.913070419156889,
+      "grad_norm": 0.3423125743865967,
+      "learning_rate": 1.1835698176924468e-05,
+      "loss": 4.3925,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9133902336644746,
+      "grad_norm": 0.3296370208263397,
+      "learning_rate": 1.1749402490431148e-05,
+      "loss": 4.364,
+      "step": 2856
+    },
+    {
+      "epoch": 0.9137100481720603,
+      "grad_norm": 0.32709506154060364,
+      "learning_rate": 1.1663416268931192e-05,
+      "loss": 4.3299,
+      "step": 2857
+    },
+    {
+      "epoch": 0.9140298626796458,
+      "grad_norm": 0.3346802294254303,
+      "learning_rate": 1.1577739604739155e-05,
+      "loss": 4.3629,
+      "step": 2858
+    },
+    {
+      "epoch": 0.9143496771872314,
+      "grad_norm": 0.32957184314727783,
+      "learning_rate": 1.1492372589837261e-05,
+      "loss": 4.3879,
+      "step": 2859
+    },
+    {
+      "epoch": 0.914669491694817,
+      "grad_norm": 0.3254009783267975,
+      "learning_rate": 1.1407315315875365e-05,
+      "loss": 4.4061,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9149893062024026,
+      "grad_norm": 0.3332465589046478,
+      "learning_rate": 1.1322567874170552e-05,
+      "loss": 4.3795,
+      "step": 2861
+    },
+    {
+      "epoch": 0.9153091207099882,
+      "grad_norm": 0.31989049911499023,
+      "learning_rate": 1.1238130355707509e-05,
+      "loss": 4.3036,
+      "step": 2862
+    },
+    {
+      "epoch": 0.9156289352175738,
+      "grad_norm": 0.320244699716568,
+      "learning_rate": 1.1154002851138122e-05,
+      "loss": 4.3326,
+      "step": 2863
+    },
+    {
+      "epoch": 0.9159487497251594,
+      "grad_norm": 0.314765065908432,
+      "learning_rate": 1.107018545078141e-05,
+      "loss": 4.2839,
+      "step": 2864
+    },
+    {
+      "epoch": 0.916268564232745,
+      "grad_norm": 0.31837067008018494,
+      "learning_rate": 1.0986678244623526e-05,
+      "loss": 4.308,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9165883787403306,
+      "grad_norm": 0.32831281423568726,
+      "learning_rate": 1.0903481322317486e-05,
+      "loss": 4.3934,
+      "step": 2866
+    },
+    {
+      "epoch": 0.9169081932479162,
+      "grad_norm": 0.320701003074646,
+      "learning_rate": 1.0820594773183278e-05,
+      "loss": 4.3102,
+      "step": 2867
+    },
+    {
+      "epoch": 0.9172280077555018,
+      "grad_norm": 0.37862440943717957,
+      "learning_rate": 1.0738018686207683e-05,
+      "loss": 4.3674,
+      "step": 2868
+    },
+    {
+      "epoch": 0.9175478222630874,
+      "grad_norm": 0.34594613313674927,
+      "learning_rate": 1.0655753150044155e-05,
+      "loss": 4.2845,
+      "step": 2869
+    },
+    {
+      "epoch": 0.917867636770673,
+      "grad_norm": 0.33938634395599365,
+      "learning_rate": 1.0573798253012778e-05,
+      "loss": 4.3945,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9181874512782586,
+      "grad_norm": 0.3470006287097931,
+      "learning_rate": 1.0492154083099968e-05,
+      "loss": 4.4398,
+      "step": 2871
+    },
+    {
+      "epoch": 0.9185072657858442,
+      "grad_norm": 0.3180997371673584,
+      "learning_rate": 1.0410820727958712e-05,
+      "loss": 4.3954,
+      "step": 2872
+    },
+    {
+      "epoch": 0.9188270802934299,
+      "grad_norm": 0.32326364517211914,
+      "learning_rate": 1.0329798274908297e-05,
+      "loss": 4.3679,
+      "step": 2873
+    },
+    {
+      "epoch": 0.9191468948010154,
+      "grad_norm": 0.3454664945602417,
+      "learning_rate": 1.0249086810934204e-05,
+      "loss": 4.3929,
+      "step": 2874
+    },
+    {
+      "epoch": 0.919466709308601,
+      "grad_norm": 0.33620408177375793,
+      "learning_rate": 1.0168686422687921e-05,
+      "loss": 4.3501,
+      "step": 2875
+    },
+    {
+      "epoch": 0.9197865238161866,
+      "grad_norm": 0.335411936044693,
+      "learning_rate": 1.008859719648717e-05,
+      "loss": 4.3344,
+      "step": 2876
+    },
+    {
+      "epoch": 0.9201063383237722,
+      "grad_norm": 0.32632091641426086,
+      "learning_rate": 1.0008819218315434e-05,
+      "loss": 4.3609,
+      "step": 2877
+    },
+    {
+      "epoch": 0.9204261528313578,
+      "grad_norm": 0.3331916630268097,
+      "learning_rate": 9.929352573822203e-06,
+      "loss": 4.287,
+      "step": 2878
+    },
+    {
+      "epoch": 0.9207459673389434,
+      "grad_norm": 0.3235928416252136,
+      "learning_rate": 9.850197348322597e-06,
+      "loss": 4.3305,
+      "step": 2879
+    },
+    {
+      "epoch": 0.921065781846529,
+      "grad_norm": 0.3344513177871704,
+      "learning_rate": 9.771353626797373e-06,
+      "loss": 4.35,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9213855963541147,
+      "grad_norm": 0.33483588695526123,
+      "learning_rate": 9.692821493892988e-06,
+      "loss": 4.4333,
+      "step": 2881
+    },
+    {
+      "epoch": 0.9217054108617002,
+      "grad_norm": 0.32479128241539,
+      "learning_rate": 9.614601033921266e-06,
+      "loss": 4.2633,
+      "step": 2882
+    },
+    {
+      "epoch": 0.9220252253692858,
+      "grad_norm": 0.35839805006980896,
+      "learning_rate": 9.536692330859497e-06,
+      "loss": 4.4617,
+      "step": 2883
+    },
+    {
+      "epoch": 0.9223450398768714,
+      "grad_norm": 0.3326389193534851,
+      "learning_rate": 9.459095468350241e-06,
+      "loss": 4.3522,
+      "step": 2884
+    },
+    {
+      "epoch": 0.922664854384457,
+      "grad_norm": 0.3288221061229706,
+      "learning_rate": 9.381810529701228e-06,
+      "loss": 4.3787,
+      "step": 2885
+    },
+    {
+      "epoch": 0.9229846688920427,
+      "grad_norm": 0.3186630606651306,
+      "learning_rate": 9.30483759788535e-06,
+      "loss": 4.403,
+      "step": 2886
+    },
+    {
+      "epoch": 0.9233044833996282,
+      "grad_norm": 0.3173036277294159,
+      "learning_rate": 9.228176755540506e-06,
+      "loss": 4.3659,
+      "step": 2887
+    },
+    {
+      "epoch": 0.9236242979072138,
+      "grad_norm": 0.3245466649532318,
+      "learning_rate": 9.151828084969593e-06,
+      "loss": 4.3796,
+      "step": 2888
+    },
+    {
+      "epoch": 0.9239441124147995,
+      "grad_norm": 0.31835073232650757,
+      "learning_rate": 9.075791668140308e-06,
+      "loss": 4.4037,
+      "step": 2889
+    },
+    {
+      "epoch": 0.924263926922385,
+      "grad_norm": 0.34545114636421204,
+      "learning_rate": 9.000067586685089e-06,
+      "loss": 4.358,
+      "step": 2890
+    },
+    {
+      "epoch": 0.9245837414299706,
+      "grad_norm": 0.3350619077682495,
+      "learning_rate": 8.924655921901135e-06,
+      "loss": 4.3785,
+      "step": 2891
+    },
+    {
+      "epoch": 0.9249035559375562,
+      "grad_norm": 0.3419342041015625,
+      "learning_rate": 8.849556754750153e-06,
+      "loss": 4.3976,
+      "step": 2892
+    },
+    {
+      "epoch": 0.9252233704451418,
+      "grad_norm": 1.3042224645614624,
+      "learning_rate": 8.774770165858347e-06,
+      "loss": 4.424,
+      "step": 2893
+    },
+    {
+      "epoch": 0.9255431849527275,
+      "grad_norm": 0.3174867331981659,
+      "learning_rate": 8.70029623551649e-06,
+      "loss": 4.3476,
+      "step": 2894
+    },
+    {
+      "epoch": 0.925862999460313,
+      "grad_norm": 0.3302491307258606,
+      "learning_rate": 8.626135043679495e-06,
+      "loss": 4.3753,
+      "step": 2895
+    },
+    {
+      "epoch": 0.9261828139678986,
+      "grad_norm": 0.3348652422428131,
+      "learning_rate": 8.552286669966635e-06,
+      "loss": 4.3164,
+      "step": 2896
+    },
+    {
+      "epoch": 0.9265026284754843,
+      "grad_norm": 0.3382120430469513,
+      "learning_rate": 8.47875119366126e-06,
+      "loss": 4.4176,
+      "step": 2897
+    },
+    {
+      "epoch": 0.9268224429830698,
+      "grad_norm": 0.3329518437385559,
+      "learning_rate": 8.405528693710883e-06,
+      "loss": 4.3972,
+      "step": 2898
+    },
+    {
+      "epoch": 0.9271422574906554,
+      "grad_norm": 0.34764277935028076,
+      "learning_rate": 8.332619248726957e-06,
+      "loss": 4.4171,
+      "step": 2899
+    },
+    {
+      "epoch": 0.927462071998241,
+      "grad_norm": 0.3330550491809845,
+      "learning_rate": 8.260022936984833e-06,
+      "loss": 4.3673,
+      "step": 2900
+    },
+    {
+      "epoch": 0.927462071998241,
+      "eval_loss": 4.38118314743042,
+      "eval_runtime": 96.8272,
+      "eval_samples_per_second": 19.592,
+      "eval_steps_per_second": 4.906,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9277818865058266,
+      "grad_norm": 0.3248373866081238,
+      "learning_rate": 8.187739836423734e-06,
+      "loss": 4.3651,
+      "step": 2901
+    },
+    {
+      "epoch": 0.9281017010134123,
+      "grad_norm": 0.32036662101745605,
+      "learning_rate": 8.115770024646518e-06,
+      "loss": 4.3364,
+      "step": 2902
+    },
+    {
+      "epoch": 0.9284215155209978,
+      "grad_norm": 0.3199023902416229,
+      "learning_rate": 8.044113578919842e-06,
+      "loss": 4.4604,
+      "step": 2903
+    },
+    {
+      "epoch": 0.9287413300285834,
+      "grad_norm": 0.32586199045181274,
+      "learning_rate": 7.97277057617377e-06,
+      "loss": 4.392,
+      "step": 2904
+    },
+    {
+      "epoch": 0.9290611445361691,
+      "grad_norm": 0.34327834844589233,
+      "learning_rate": 7.901741093002002e-06,
+      "loss": 4.3379,
+      "step": 2905
+    },
+    {
+      "epoch": 0.9293809590437546,
+      "grad_norm": 0.3284776508808136,
+      "learning_rate": 7.8310252056616e-06,
+      "loss": 4.3835,
+      "step": 2906
+    },
+    {
+      "epoch": 0.9297007735513402,
+      "grad_norm": 0.3319602608680725,
+      "learning_rate": 7.760622990072873e-06,
+      "loss": 4.4086,
+      "step": 2907
+    },
+    {
+      "epoch": 0.9300205880589258,
+      "grad_norm": 0.3359242379665375,
+      "learning_rate": 7.690534521819458e-06,
+      "loss": 4.3714,
+      "step": 2908
+    },
+    {
+      "epoch": 0.9303404025665114,
+      "grad_norm": 0.3331519067287445,
+      "learning_rate": 7.6207598761481305e-06,
+      "loss": 4.3064,
+      "step": 2909
+    },
+    {
+      "epoch": 0.9306602170740971,
+      "grad_norm": 0.32268157601356506,
+      "learning_rate": 7.5512991279687684e-06,
+      "loss": 4.3458,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9309800315816826,
+      "grad_norm": 0.32395583391189575,
+      "learning_rate": 7.482152351854187e-06,
+      "loss": 4.348,
+      "step": 2911
+    },
+    {
+      "epoch": 0.9312998460892682,
+      "grad_norm": 0.33460599184036255,
+      "learning_rate": 7.413319622040137e-06,
+      "loss": 4.3118,
+      "step": 2912
+    },
+    {
+      "epoch": 0.9316196605968539,
+      "grad_norm": 0.3139401972293854,
+      "learning_rate": 7.344801012425306e-06,
+      "loss": 4.3487,
+      "step": 2913
+    },
+    {
+      "epoch": 0.9319394751044394,
+      "grad_norm": 0.3270988464355469,
+      "learning_rate": 7.276596596571016e-06,
+      "loss": 4.3667,
+      "step": 2914
+    },
+    {
+      "epoch": 0.932259289612025,
+      "grad_norm": 0.32469436526298523,
+      "learning_rate": 7.208706447701395e-06,
+      "loss": 4.3818,
+      "step": 2915
+    },
+    {
+      "epoch": 0.9325791041196106,
+      "grad_norm": 0.3502923250198364,
+      "learning_rate": 7.141130638703041e-06,
+      "loss": 4.3693,
+      "step": 2916
+    },
+    {
+      "epoch": 0.9328989186271962,
+      "grad_norm": 0.33618977665901184,
+      "learning_rate": 7.073869242125152e-06,
+      "loss": 4.4508,
+      "step": 2917
+    },
+    {
+      "epoch": 0.9332187331347819,
+      "grad_norm": 0.3267005980014801,
+      "learning_rate": 7.006922330179398e-06,
+      "loss": 4.4196,
+      "step": 2918
+    },
+    {
+      "epoch": 0.9335385476423674,
+      "grad_norm": 0.32550060749053955,
+      "learning_rate": 6.940289974739754e-06,
+      "loss": 4.2636,
+      "step": 2919
+    },
+    {
+      "epoch": 0.933858362149953,
+      "grad_norm": 0.32697173953056335,
+      "learning_rate": 6.8739722473425295e-06,
+      "loss": 4.2765,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9341781766575387,
+      "grad_norm": 0.3203597068786621,
+      "learning_rate": 6.807969219186271e-06,
+      "loss": 4.3186,
+      "step": 2921
+    },
+    {
+      "epoch": 0.9344979911651242,
+      "grad_norm": 0.32253390550613403,
+      "learning_rate": 6.742280961131563e-06,
+      "loss": 4.4395,
+      "step": 2922
+    },
+    {
+      "epoch": 0.9348178056727098,
+      "grad_norm": 0.3350317180156708,
+      "learning_rate": 6.676907543701227e-06,
+      "loss": 4.4133,
+      "step": 2923
+    },
+    {
+      "epoch": 0.9351376201802954,
+      "grad_norm": 0.35089296102523804,
+      "learning_rate": 6.611849037079886e-06,
+      "loss": 4.3996,
+      "step": 2924
+    },
+    {
+      "epoch": 0.935457434687881,
+      "grad_norm": 0.3362065851688385,
+      "learning_rate": 6.5471055111142035e-06,
+      "loss": 4.3639,
+      "step": 2925
+    },
+    {
+      "epoch": 0.9357772491954667,
+      "grad_norm": 0.33695685863494873,
+      "learning_rate": 6.4826770353126115e-06,
+      "loss": 4.3329,
+      "step": 2926
+    },
+    {
+      "epoch": 0.9360970637030522,
+      "grad_norm": 0.3362276256084442,
+      "learning_rate": 6.418563678845379e-06,
+      "loss": 4.3527,
+      "step": 2927
+    },
+    {
+      "epoch": 0.9364168782106378,
+      "grad_norm": 0.3312215805053711,
+      "learning_rate": 6.354765510544346e-06,
+      "loss": 4.4177,
+      "step": 2928
+    },
+    {
+      "epoch": 0.9367366927182235,
+      "grad_norm": 0.31979092955589294,
+      "learning_rate": 6.291282598903091e-06,
+      "loss": 4.3469,
+      "step": 2929
+    },
+    {
+      "epoch": 0.937056507225809,
+      "grad_norm": 0.32077279686927795,
+      "learning_rate": 6.228115012076729e-06,
+      "loss": 4.4415,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9373763217333946,
+      "grad_norm": 0.3242551386356354,
+      "learning_rate": 6.165262817881678e-06,
+      "loss": 4.3477,
+      "step": 2931
+    },
+    {
+      "epoch": 0.9376961362409802,
+      "grad_norm": 0.32322248816490173,
+      "learning_rate": 6.102726083795961e-06,
+      "loss": 4.3688,
+      "step": 2932
+    },
+    {
+      "epoch": 0.9380159507485658,
+      "grad_norm": 0.323236346244812,
+      "learning_rate": 6.040504876958741e-06,
+      "loss": 4.367,
+      "step": 2933
+    },
+    {
+      "epoch": 0.9383357652561515,
+      "grad_norm": 0.3199426233768463,
+      "learning_rate": 5.978599264170614e-06,
+      "loss": 4.3521,
+      "step": 2934
+    },
+    {
+      "epoch": 0.938655579763737,
+      "grad_norm": 0.3212834298610687,
+      "learning_rate": 5.917009311893217e-06,
+      "loss": 4.3101,
+      "step": 2935
+    },
+    {
+      "epoch": 0.9389753942713226,
+      "grad_norm": 0.3298892080783844,
+      "learning_rate": 5.855735086249358e-06,
+      "loss": 4.4033,
+      "step": 2936
+    },
+    {
+      "epoch": 0.9392952087789083,
+      "grad_norm": 0.320708692073822,
+      "learning_rate": 5.794776653022881e-06,
+      "loss": 4.329,
+      "step": 2937
+    },
+    {
+      "epoch": 0.9396150232864938,
+      "grad_norm": 0.33011531829833984,
+      "learning_rate": 5.7341340776585035e-06,
+      "loss": 4.3612,
+      "step": 2938
+    },
+    {
+      "epoch": 0.9399348377940794,
+      "grad_norm": 0.3417920768260956,
+      "learning_rate": 5.673807425262045e-06,
+      "loss": 4.3838,
+      "step": 2939
+    },
+    {
+      "epoch": 0.940254652301665,
+      "grad_norm": 0.32390668988227844,
+      "learning_rate": 5.613796760599898e-06,
+      "loss": 4.3027,
+      "step": 2940
+    },
+    {
+      "epoch": 0.9405744668092506,
+      "grad_norm": 0.3507520854473114,
+      "learning_rate": 5.554102148099393e-06,
+      "loss": 4.4617,
+      "step": 2941
+    },
+    {
+      "epoch": 0.9408942813168363,
+      "grad_norm": 0.32159173488616943,
+      "learning_rate": 5.494723651848532e-06,
+      "loss": 4.4397,
+      "step": 2942
+    },
+    {
+      "epoch": 0.9412140958244218,
+      "grad_norm": 0.33860674500465393,
+      "learning_rate": 5.435661335595753e-06,
+      "loss": 4.3651,
+      "step": 2943
+    },
+    {
+      "epoch": 0.9415339103320074,
+      "grad_norm": 0.3142724931240082,
+      "learning_rate": 5.376915262750369e-06,
+      "loss": 4.3949,
+      "step": 2944
+    },
+    {
+      "epoch": 0.9418537248395931,
+      "grad_norm": 0.3150574564933777,
+      "learning_rate": 5.3184854963818305e-06,
+      "loss": 4.2867,
+      "step": 2945
+    },
+    {
+      "epoch": 0.9421735393471786,
+      "grad_norm": 0.32347816228866577,
+      "learning_rate": 5.260372099220289e-06,
+      "loss": 4.3839,
+      "step": 2946
+    },
+    {
+      "epoch": 0.9424933538547642,
+      "grad_norm": 0.32392770051956177,
+      "learning_rate": 5.202575133656039e-06,
+      "loss": 4.4329,
+      "step": 2947
+    },
+    {
+      "epoch": 0.9428131683623499,
+      "grad_norm": 0.334563672542572,
+      "learning_rate": 5.145094661739746e-06,
+      "loss": 4.3614,
+      "step": 2948
+    },
+    {
+      "epoch": 0.9431329828699354,
+      "grad_norm": 0.32934969663619995,
+      "learning_rate": 5.087930745182278e-06,
+      "loss": 4.3988,
+      "step": 2949
+    },
+    {
+      "epoch": 0.9434527973775211,
+      "grad_norm": 0.332389771938324,
+      "learning_rate": 5.031083445354644e-06,
+      "loss": 4.3827,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9437726118851066,
+      "grad_norm": 0.3233090341091156,
+      "learning_rate": 4.9745528232879915e-06,
+      "loss": 4.3413,
+      "step": 2951
+    },
+    {
+      "epoch": 0.9440924263926922,
+      "grad_norm": 0.3395127058029175,
+      "learning_rate": 4.918338939673372e-06,
+      "loss": 4.4381,
+      "step": 2952
+    },
+    {
+      "epoch": 0.9444122409002779,
+      "grad_norm": 0.321155846118927,
+      "learning_rate": 4.862441854861809e-06,
+      "loss": 4.3916,
+      "step": 2953
+    },
+    {
+      "epoch": 0.9447320554078634,
+      "grad_norm": 0.33099478483200073,
+      "learning_rate": 4.806861628864333e-06,
+      "loss": 4.4219,
+      "step": 2954
+    },
+    {
+      "epoch": 0.9450518699154491,
+      "grad_norm": 0.33024051785469055,
+      "learning_rate": 4.751598321351679e-06,
+      "loss": 4.3912,
+      "step": 2955
+    },
+    {
+      "epoch": 0.9453716844230347,
+      "grad_norm": 0.3497087359428406,
+      "learning_rate": 4.6966519916543875e-06,
+      "loss": 4.3261,
+      "step": 2956
+    },
+    {
+      "epoch": 0.9456914989306202,
+      "grad_norm": 0.31810522079467773,
+      "learning_rate": 4.642022698762638e-06,
+      "loss": 4.2891,
+      "step": 2957
+    },
+    {
+      "epoch": 0.9460113134382059,
+      "grad_norm": 0.3298070728778839,
+      "learning_rate": 4.5877105013262805e-06,
+      "loss": 4.3088,
+      "step": 2958
+    },
+    {
+      "epoch": 0.9463311279457914,
+      "grad_norm": 0.32999950647354126,
+      "learning_rate": 4.533715457654741e-06,
+      "loss": 4.4058,
+      "step": 2959
+    },
+    {
+      "epoch": 0.946650942453377,
+      "grad_norm": 0.3260367214679718,
+      "learning_rate": 4.480037625716981e-06,
+      "loss": 4.3343,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9469707569609627,
+      "grad_norm": 0.3239196538925171,
+      "learning_rate": 4.4266770631413374e-06,
+      "loss": 4.4208,
+      "step": 2961
+    },
+    {
+      "epoch": 0.9472905714685482,
+      "grad_norm": 0.3258492350578308,
+      "learning_rate": 4.373633827215517e-06,
+      "loss": 4.5037,
+      "step": 2962
+    },
+    {
+      "epoch": 0.9476103859761339,
+      "grad_norm": 0.3244808316230774,
+      "learning_rate": 4.3209079748866e-06,
+      "loss": 4.3016,
+      "step": 2963
+    },
+    {
+      "epoch": 0.9479302004837195,
+      "grad_norm": 0.33479514718055725,
+      "learning_rate": 4.268499562760907e-06,
+      "loss": 4.3761,
+      "step": 2964
+    },
+    {
+      "epoch": 0.948250014991305,
+      "grad_norm": 0.31352558732032776,
+      "learning_rate": 4.216408647103997e-06,
+      "loss": 4.3636,
+      "step": 2965
+    },
+    {
+      "epoch": 0.9485698294988907,
+      "grad_norm": 0.3271879255771637,
+      "learning_rate": 4.164635283840468e-06,
+      "loss": 4.3915,
+      "step": 2966
+    },
+    {
+      "epoch": 0.9488896440064762,
+      "grad_norm": 0.32334235310554504,
+      "learning_rate": 4.113179528554089e-06,
+      "loss": 4.3971,
+      "step": 2967
+    },
+    {
+      "epoch": 0.9492094585140618,
+      "grad_norm": 0.3343781530857086,
+      "learning_rate": 4.062041436487573e-06,
+      "loss": 4.3883,
+      "step": 2968
+    },
+    {
+      "epoch": 0.9495292730216475,
+      "grad_norm": 0.31923946738243103,
+      "learning_rate": 4.011221062542636e-06,
+      "loss": 4.2875,
+      "step": 2969
+    },
+    {
+      "epoch": 0.949849087529233,
+      "grad_norm": 0.334395170211792,
+      "learning_rate": 3.9607184612799325e-06,
+      "loss": 4.3944,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9501689020368187,
+      "grad_norm": 0.33013853430747986,
+      "learning_rate": 3.910533686918826e-06,
+      "loss": 4.3853,
+      "step": 2971
+    },
+    {
+      "epoch": 0.9504887165444043,
+      "grad_norm": 0.3307196795940399,
+      "learning_rate": 3.860666793337585e-06,
+      "loss": 4.3385,
+      "step": 2972
+    },
+    {
+      "epoch": 0.9508085310519898,
+      "grad_norm": 0.3317532241344452,
+      "learning_rate": 3.811117834073152e-06,
+      "loss": 4.3694,
+      "step": 2973
+    },
+    {
+      "epoch": 0.9511283455595755,
+      "grad_norm": 0.3383411467075348,
+      "learning_rate": 3.761886862321173e-06,
+      "loss": 4.3688,
+      "step": 2974
+    },
+    {
+      "epoch": 0.951448160067161,
+      "grad_norm": 0.32362377643585205,
+      "learning_rate": 3.7129739309358362e-06,
+      "loss": 4.3074,
+      "step": 2975
+    },
+    {
+      "epoch": 0.9517679745747466,
+      "grad_norm": 0.32669341564178467,
+      "learning_rate": 3.664379092429903e-06,
+      "loss": 4.3258,
+      "step": 2976
+    },
+    {
+      "epoch": 0.9520877890823323,
+      "grad_norm": 0.32052507996559143,
+      "learning_rate": 3.6161023989747075e-06,
+      "loss": 4.4202,
+      "step": 2977
+    },
+    {
+      "epoch": 0.9524076035899178,
+      "grad_norm": 0.3372294306755066,
+      "learning_rate": 3.5681439023999224e-06,
+      "loss": 4.3761,
+      "step": 2978
+    },
+    {
+      "epoch": 0.9527274180975035,
+      "grad_norm": 0.33484935760498047,
+      "learning_rate": 3.5205036541936626e-06,
+      "loss": 4.3767,
+      "step": 2979
+    },
+    {
+      "epoch": 0.9530472326050891,
+      "grad_norm": 0.3463588356971741,
+      "learning_rate": 3.4731817055023812e-06,
+      "loss": 4.3565,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9533670471126746,
+      "grad_norm": 0.325015664100647,
+      "learning_rate": 3.4261781071307393e-06,
+      "loss": 4.3474,
+      "step": 2981
+    },
+    {
+      "epoch": 0.9536868616202603,
+      "grad_norm": 0.32021352648735046,
+      "learning_rate": 3.3794929095417034e-06,
+      "loss": 4.3902,
+      "step": 2982
+    },
+    {
+      "epoch": 0.9540066761278458,
+      "grad_norm": 0.34606412053108215,
+      "learning_rate": 3.3331261628563145e-06,
+      "loss": 4.2884,
+      "step": 2983
+    },
+    {
+      "epoch": 0.9543264906354314,
+      "grad_norm": 0.3238740861415863,
+      "learning_rate": 3.2870779168538196e-06,
+      "loss": 4.3385,
+      "step": 2984
+    },
+    {
+      "epoch": 0.9546463051430171,
+      "grad_norm": 0.3227710723876953,
+      "learning_rate": 3.2413482209714737e-06,
+      "loss": 4.2974,
+      "step": 2985
+    },
+    {
+      "epoch": 0.9549661196506026,
+      "grad_norm": 0.33221638202667236,
+      "learning_rate": 3.195937124304504e-06,
+      "loss": 4.44,
+      "step": 2986
+    },
+    {
+      "epoch": 0.9552859341581883,
+      "grad_norm": 0.32845816016197205,
+      "learning_rate": 3.150844675606212e-06,
+      "loss": 4.2868,
+      "step": 2987
+    },
+    {
+      "epoch": 0.9556057486657739,
+      "grad_norm": 0.33255472779273987,
+      "learning_rate": 3.10607092328764e-06,
+      "loss": 4.3865,
+      "step": 2988
+    },
+    {
+      "epoch": 0.9559255631733594,
+      "grad_norm": 0.3340242803096771,
+      "learning_rate": 3.0616159154177366e-06,
+      "loss": 4.3735,
+      "step": 2989
+    },
+    {
+      "epoch": 0.9562453776809451,
+      "grad_norm": 0.3314898610115051,
+      "learning_rate": 3.0174796997233908e-06,
+      "loss": 4.4163,
+      "step": 2990
+    },
+    {
+      "epoch": 0.9565651921885306,
+      "grad_norm": 0.33092954754829407,
+      "learning_rate": 2.973662323588999e-06,
+      "loss": 4.3312,
+      "step": 2991
+    },
+    {
+      "epoch": 0.9568850066961162,
+      "grad_norm": 0.3198767900466919,
+      "learning_rate": 2.930163834056831e-06,
+      "loss": 4.4544,
+      "step": 2992
+    },
+    {
+      "epoch": 0.9572048212037019,
+      "grad_norm": 0.3256797194480896,
+      "learning_rate": 2.8869842778266983e-06,
+      "loss": 4.401,
+      "step": 2993
+    },
+    {
+      "epoch": 0.9575246357112874,
+      "grad_norm": 0.32482126355171204,
+      "learning_rate": 2.844123701256051e-06,
+      "loss": 4.3991,
+      "step": 2994
+    },
+    {
+      "epoch": 0.9578444502188731,
+      "grad_norm": 0.34271669387817383,
+      "learning_rate": 2.801582150359882e-06,
+      "loss": 4.3053,
+      "step": 2995
+    },
+    {
+      "epoch": 0.9581642647264587,
+      "grad_norm": 0.32339465618133545,
+      "learning_rate": 2.7593596708106904e-06,
+      "loss": 4.2509,
+      "step": 2996
+    },
+    {
+      "epoch": 0.9584840792340442,
+      "grad_norm": 0.3229231834411621,
+      "learning_rate": 2.717456307938415e-06,
+      "loss": 4.356,
+      "step": 2997
+    },
+    {
+      "epoch": 0.9588038937416299,
+      "grad_norm": 0.3340475559234619,
+      "learning_rate": 2.6758721067303367e-06,
+      "loss": 4.4262,
+      "step": 2998
+    },
+    {
+      "epoch": 0.9591237082492154,
+      "grad_norm": 0.3331576883792877,
+      "learning_rate": 2.634607111831177e-06,
+      "loss": 4.3648,
+      "step": 2999
+    },
+    {
+      "epoch": 0.959443522756801,
+      "grad_norm": 0.3264421820640564,
+      "learning_rate": 2.5936613675428985e-06,
+      "loss": 4.3048,
+      "step": 3000
+    },
+    {
+      "epoch": 0.959443522756801,
+      "eval_loss": 4.379649639129639,
+      "eval_runtime": 97.9927,
+      "eval_samples_per_second": 19.359,
+      "eval_steps_per_second": 4.847,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9597633372643867,
+      "grad_norm": 0.3356820046901703,
+      "learning_rate": 2.5530349178247033e-06,
+      "loss": 4.3879,
+      "step": 3001
+    },
+    {
+      "epoch": 0.9600831517719722,
+      "grad_norm": 0.3238847553730011,
+      "learning_rate": 2.512727806293069e-06,
+      "loss": 4.3802,
+      "step": 3002
+    },
+    {
+      "epoch": 0.9604029662795579,
+      "grad_norm": 0.32261836528778076,
+      "learning_rate": 2.4727400762215798e-06,
+      "loss": 4.3,
+      "step": 3003
+    },
+    {
+      "epoch": 0.9607227807871435,
+      "grad_norm": 0.34502753615379333,
+      "learning_rate": 2.4330717705409287e-06,
+      "loss": 4.2756,
+      "step": 3004
+    },
+    {
+      "epoch": 0.961042595294729,
+      "grad_norm": 0.321431040763855,
+      "learning_rate": 2.393722931838882e-06,
+      "loss": 4.387,
+      "step": 3005
+    },
+    {
+      "epoch": 0.9613624098023147,
+      "grad_norm": 0.33116859197616577,
+      "learning_rate": 2.3546936023603134e-06,
+      "loss": 4.4409,
+      "step": 3006
+    },
+    {
+      "epoch": 0.9616822243099002,
+      "grad_norm": 0.32576796412467957,
+      "learning_rate": 2.315983824006906e-06,
+      "loss": 4.4106,
+      "step": 3007
+    },
+    {
+      "epoch": 0.9620020388174858,
+      "grad_norm": 0.32816702127456665,
+      "learning_rate": 2.277593638337416e-06,
+      "loss": 4.3597,
+      "step": 3008
+    },
+    {
+      "epoch": 0.9623218533250715,
+      "grad_norm": 0.3323589265346527,
+      "learning_rate": 2.2395230865674075e-06,
+      "loss": 4.3409,
+      "step": 3009
+    },
+    {
+      "epoch": 0.962641667832657,
+      "grad_norm": 0.3263186514377594,
+      "learning_rate": 2.201772209569319e-06,
+      "loss": 4.3907,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9629614823402427,
+      "grad_norm": 0.3258233964443207,
+      "learning_rate": 2.164341047872398e-06,
+      "loss": 4.4135,
+      "step": 3011
+    },
+    {
+      "epoch": 0.9632812968478283,
+      "grad_norm": 0.33841806650161743,
+      "learning_rate": 2.127229641662598e-06,
+      "loss": 4.3527,
+      "step": 3012
+    },
+    {
+      "epoch": 0.9636011113554138,
+      "grad_norm": 0.3296872079372406,
+      "learning_rate": 2.0904380307826483e-06,
+      "loss": 4.3534,
+      "step": 3013
+    },
+    {
+      "epoch": 0.9639209258629995,
+      "grad_norm": 0.3302517533302307,
+      "learning_rate": 2.053966254731887e-06,
+      "loss": 4.322,
+      "step": 3014
+    },
+    {
+      "epoch": 0.964240740370585,
+      "grad_norm": 0.33398133516311646,
+      "learning_rate": 2.0178143526663248e-06,
+      "loss": 4.4822,
+      "step": 3015
+    },
+    {
+      "epoch": 0.9645605548781706,
+      "grad_norm": 0.3158261775970459,
+      "learning_rate": 1.981982363398549e-06,
+      "loss": 4.3978,
+      "step": 3016
+    },
+    {
+      "epoch": 0.9648803693857563,
+      "grad_norm": 0.32655230164527893,
+      "learning_rate": 1.9464703253976533e-06,
+      "loss": 4.2977,
+      "step": 3017
+    },
+    {
+      "epoch": 0.9652001838933418,
+      "grad_norm": 0.330531507730484,
+      "learning_rate": 1.911278276789241e-06,
+      "loss": 4.3358,
+      "step": 3018
+    },
+    {
+      "epoch": 0.9655199984009275,
+      "grad_norm": 0.3409190773963928,
+      "learning_rate": 1.8764062553554227e-06,
+      "loss": 4.2726,
+      "step": 3019
+    },
+    {
+      "epoch": 0.9658398129085131,
+      "grad_norm": 0.33117616176605225,
+      "learning_rate": 1.8418542985347174e-06,
+      "loss": 4.4635,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9661596274160986,
+      "grad_norm": 0.3396887183189392,
+      "learning_rate": 1.8076224434219523e-06,
+      "loss": 4.3382,
+      "step": 3021
+    },
+    {
+      "epoch": 0.9664794419236843,
+      "grad_norm": 0.3328210115432739,
+      "learning_rate": 1.773710726768396e-06,
+      "loss": 4.3677,
+      "step": 3022
+    },
+    {
+      "epoch": 0.9667992564312698,
+      "grad_norm": 0.3235970139503479,
+      "learning_rate": 1.7401191849815255e-06,
+      "loss": 4.3789,
+      "step": 3023
+    },
+    {
+      "epoch": 0.9671190709388555,
+      "grad_norm": 0.32505133748054504,
+      "learning_rate": 1.7068478541251263e-06,
+      "loss": 4.3603,
+      "step": 3024
+    },
+    {
+      "epoch": 0.9674388854464411,
+      "grad_norm": 0.327471524477005,
+      "learning_rate": 1.673896769919192e-06,
+      "loss": 4.3199,
+      "step": 3025
+    },
+    {
+      "epoch": 0.9677586999540266,
+      "grad_norm": 0.3187865614891052,
+      "learning_rate": 1.6412659677399908e-06,
+      "loss": 4.3572,
+      "step": 3026
+    },
+    {
+      "epoch": 0.9680785144616123,
+      "grad_norm": 0.3221952021121979,
+      "learning_rate": 1.608955482619767e-06,
+      "loss": 4.369,
+      "step": 3027
+    },
+    {
+      "epoch": 0.9683983289691979,
+      "grad_norm": 0.3291928172111511,
+      "learning_rate": 1.5769653492470057e-06,
+      "loss": 4.3607,
+      "step": 3028
+    },
+    {
+      "epoch": 0.9687181434767834,
+      "grad_norm": 0.3323615491390228,
+      "learning_rate": 1.5452956019661678e-06,
+      "loss": 4.3528,
+      "step": 3029
+    },
+    {
+      "epoch": 0.9690379579843691,
+      "grad_norm": 0.3374388515949249,
+      "learning_rate": 1.5139462747778885e-06,
+      "loss": 4.3617,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9693577724919547,
+      "grad_norm": 0.3264639973640442,
+      "learning_rate": 1.4829174013386126e-06,
+      "loss": 4.3405,
+      "step": 3031
+    },
+    {
+      "epoch": 0.9696775869995403,
+      "grad_norm": 0.33052435517311096,
+      "learning_rate": 1.4522090149609256e-06,
+      "loss": 4.3701,
+      "step": 3032
+    },
+    {
+      "epoch": 0.9699974015071259,
+      "grad_norm": 0.3322097063064575,
+      "learning_rate": 1.4218211486132558e-06,
+      "loss": 4.4074,
+      "step": 3033
+    },
+    {
+      "epoch": 0.9703172160147114,
+      "grad_norm": 0.3176632225513458,
+      "learning_rate": 1.3917538349198731e-06,
+      "loss": 4.3757,
+      "step": 3034
+    },
+    {
+      "epoch": 0.9706370305222971,
+      "grad_norm": 0.32025620341300964,
+      "learning_rate": 1.3620071061609894e-06,
+      "loss": 4.318,
+      "step": 3035
+    },
+    {
+      "epoch": 0.9709568450298827,
+      "grad_norm": 0.3187826871871948,
+      "learning_rate": 1.332580994272625e-06,
+      "loss": 4.2726,
+      "step": 3036
+    },
+    {
+      "epoch": 0.9712766595374682,
+      "grad_norm": 0.3278151750564575,
+      "learning_rate": 1.3034755308465428e-06,
+      "loss": 4.4687,
+      "step": 3037
+    },
+    {
+      "epoch": 0.9715964740450539,
+      "grad_norm": 0.31633511185646057,
+      "learning_rate": 1.2746907471302803e-06,
+      "loss": 4.3152,
+      "step": 3038
+    },
+    {
+      "epoch": 0.9719162885526395,
+      "grad_norm": 0.3345247805118561,
+      "learning_rate": 1.2462266740270843e-06,
+      "loss": 4.3626,
+      "step": 3039
+    },
+    {
+      "epoch": 0.9722361030602251,
+      "grad_norm": 0.33534467220306396,
+      "learning_rate": 1.2180833420959436e-06,
+      "loss": 4.4277,
+      "step": 3040
+    },
+    {
+      "epoch": 0.9725559175678107,
+      "grad_norm": 0.3333605229854584,
+      "learning_rate": 1.190260781551422e-06,
+      "loss": 4.314,
+      "step": 3041
+    },
+    {
+      "epoch": 0.9728757320753962,
+      "grad_norm": 0.32544615864753723,
+      "learning_rate": 1.1627590222637594e-06,
+      "loss": 4.3561,
+      "step": 3042
+    },
+    {
+      "epoch": 0.9731955465829819,
+      "grad_norm": 0.32639381289482117,
+      "learning_rate": 1.1355780937587378e-06,
+      "loss": 4.3612,
+      "step": 3043
+    },
+    {
+      "epoch": 0.9735153610905675,
+      "grad_norm": 0.33417561650276184,
+      "learning_rate": 1.1087180252177475e-06,
+      "loss": 4.3262,
+      "step": 3044
+    },
+    {
+      "epoch": 0.973835175598153,
+      "grad_norm": 0.33775198459625244,
+      "learning_rate": 1.0821788454776548e-06,
+      "loss": 4.4527,
+      "step": 3045
+    },
+    {
+      "epoch": 0.9741549901057387,
+      "grad_norm": 0.32477089762687683,
+      "learning_rate": 1.0559605830308682e-06,
+      "loss": 4.3682,
+      "step": 3046
+    },
+    {
+      "epoch": 0.9744748046133243,
+      "grad_norm": 0.3236415386199951,
+      "learning_rate": 1.030063266025205e-06,
+      "loss": 4.345,
+      "step": 3047
+    },
+    {
+      "epoch": 0.9747946191209099,
+      "grad_norm": 0.32303833961486816,
+      "learning_rate": 1.0044869222639917e-06,
+      "loss": 4.4157,
+      "step": 3048
+    },
+    {
+      "epoch": 0.9751144336284955,
+      "grad_norm": 0.32153502106666565,
+      "learning_rate": 9.79231579205897e-07,
+      "loss": 4.3407,
+      "step": 3049
+    },
+    {
+      "epoch": 0.975434248136081,
+      "grad_norm": 0.3337421715259552,
+      "learning_rate": 9.54297263964965e-07,
+      "loss": 4.2743,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9757540626436667,
+      "grad_norm": 0.3250197768211365,
+      "learning_rate": 9.29684003310649e-07,
+      "loss": 4.4275,
+      "step": 3051
+    },
+    {
+      "epoch": 0.9760738771512523,
+      "grad_norm": 0.3775787949562073,
+      "learning_rate": 9.053918236676116e-07,
+      "loss": 4.4613,
+      "step": 3052
+    },
+    {
+      "epoch": 0.9763936916588378,
+      "grad_norm": 0.3378108739852905,
+      "learning_rate": 8.814207511159243e-07,
+      "loss": 4.4499,
+      "step": 3053
+    },
+    {
+      "epoch": 0.9767135061664235,
+      "grad_norm": 0.3372562527656555,
+      "learning_rate": 8.577708113908011e-07,
+      "loss": 4.48,
+      "step": 3054
+    },
+    {
+      "epoch": 0.977033320674009,
+      "grad_norm": 0.3256985545158386,
+      "learning_rate": 8.344420298827981e-07,
+      "loss": 4.3823,
+      "step": 3055
+    },
+    {
+      "epoch": 0.9773531351815947,
+      "grad_norm": 0.33073222637176514,
+      "learning_rate": 8.114344316376143e-07,
+      "loss": 4.4387,
+      "step": 3056
+    },
+    {
+      "epoch": 0.9776729496891803,
+      "grad_norm": 0.3283195197582245,
+      "learning_rate": 7.887480413561243e-07,
+      "loss": 4.3839,
+      "step": 3057
+    },
+    {
+      "epoch": 0.9779927641967658,
+      "grad_norm": 0.32535162568092346,
+      "learning_rate": 7.663828833943786e-07,
+      "loss": 4.3539,
+      "step": 3058
+    },
+    {
+      "epoch": 0.9783125787043515,
+      "grad_norm": 0.34874841570854187,
+      "learning_rate": 7.443389817635371e-07,
+      "loss": 4.3777,
+      "step": 3059
+    },
+    {
+      "epoch": 0.9786323932119371,
+      "grad_norm": 0.3269280195236206,
+      "learning_rate": 7.226163601298685e-07,
+      "loss": 4.367,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9789522077195226,
+      "grad_norm": 0.3236599266529083,
+      "learning_rate": 7.01215041814751e-07,
+      "loss": 4.4251,
+      "step": 3061
+    },
+    {
+      "epoch": 0.9792720222271083,
+      "grad_norm": 0.31829267740249634,
+      "learning_rate": 6.801350497945391e-07,
+      "loss": 4.3718,
+      "step": 3062
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.32019856572151184,
+      "learning_rate": 6.593764067006624e-07,
+      "loss": 4.2499,
+      "step": 3063
+    },
+    {
+      "epoch": 0.9799116512422795,
+      "grad_norm": 0.318994402885437,
+      "learning_rate": 6.389391348195272e-07,
+      "loss": 4.3874,
+      "step": 3064
+    },
+    {
+      "epoch": 0.9802314657498651,
+      "grad_norm": 0.32636478543281555,
+      "learning_rate": 6.188232560925155e-07,
+      "loss": 4.3053,
+      "step": 3065
+    },
+    {
+      "epoch": 0.9805512802574506,
+      "grad_norm": 0.34945639967918396,
+      "learning_rate": 5.990287921160186e-07,
+      "loss": 4.404,
+      "step": 3066
+    },
+    {
+      "epoch": 0.9808710947650363,
+      "grad_norm": 0.32556045055389404,
+      "learning_rate": 5.79555764141304e-07,
+      "loss": 4.3463,
+      "step": 3067
+    },
+    {
+      "epoch": 0.9811909092726219,
+      "grad_norm": 0.3282671272754669,
+      "learning_rate": 5.604041930745485e-07,
+      "loss": 4.4157,
+      "step": 3068
+    },
+    {
+      "epoch": 0.9815107237802074,
+      "grad_norm": 0.3228117823600769,
+      "learning_rate": 5.415740994768048e-07,
+      "loss": 4.4524,
+      "step": 3069
+    },
+    {
+      "epoch": 0.9818305382877931,
+      "grad_norm": 0.3230701982975006,
+      "learning_rate": 5.230655035640352e-07,
+      "loss": 4.3208,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9821503527953787,
+      "grad_norm": 0.42493200302124023,
+      "learning_rate": 5.048784252069782e-07,
+      "loss": 4.3802,
+      "step": 3071
+    },
+    {
+      "epoch": 0.9824701673029643,
+      "grad_norm": 0.3209701478481293,
+      "learning_rate": 4.870128839312815e-07,
+      "loss": 4.402,
+      "step": 3072
+    },
+    {
+      "epoch": 0.9827899818105499,
+      "grad_norm": 0.3352445363998413,
+      "learning_rate": 4.6946889891726903e-07,
+      "loss": 4.3347,
+      "step": 3073
+    },
+    {
+      "epoch": 0.9831097963181354,
+      "grad_norm": 0.3174760937690735,
+      "learning_rate": 4.5224648900017424e-07,
+      "loss": 4.3444,
+      "step": 3074
+    },
+    {
+      "epoch": 0.9834296108257211,
+      "grad_norm": 0.3161444664001465,
+      "learning_rate": 4.353456726699067e-07,
+      "loss": 4.319,
+      "step": 3075
+    },
+    {
+      "epoch": 0.9837494253333067,
+      "grad_norm": 0.33450281620025635,
+      "learning_rate": 4.1876646807111893e-07,
+      "loss": 4.327,
+      "step": 3076
+    },
+    {
+      "epoch": 0.9840692398408922,
+      "grad_norm": 0.3306039869785309,
+      "learning_rate": 4.025088930031728e-07,
+      "loss": 4.3109,
+      "step": 3077
+    },
+    {
+      "epoch": 0.9843890543484779,
+      "grad_norm": 0.31524422764778137,
+      "learning_rate": 3.8657296492023984e-07,
+      "loss": 4.2918,
+      "step": 3078
+    },
+    {
+      "epoch": 0.9847088688560635,
+      "grad_norm": 0.32267364859580994,
+      "learning_rate": 3.709587009309678e-07,
+      "loss": 4.3028,
+      "step": 3079
+    },
+    {
+      "epoch": 0.9850286833636491,
+      "grad_norm": 0.3193490505218506,
+      "learning_rate": 3.5566611779888066e-07,
+      "loss": 4.2841,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9853484978712347,
+      "grad_norm": 0.32063576579093933,
+      "learning_rate": 3.406952319420453e-07,
+      "loss": 4.165,
+      "step": 3081
+    },
+    {
+      "epoch": 0.9856683123788202,
+      "grad_norm": 0.3379741311073303,
+      "learning_rate": 3.260460594330716e-07,
+      "loss": 4.406,
+      "step": 3082
+    },
+    {
+      "epoch": 0.9859881268864059,
+      "grad_norm": 0.3285538852214813,
+      "learning_rate": 3.1171861599937896e-07,
+      "loss": 4.4421,
+      "step": 3083
+    },
+    {
+      "epoch": 0.9863079413939915,
+      "grad_norm": 0.33019211888313293,
+      "learning_rate": 2.9771291702279655e-07,
+      "loss": 4.4411,
+      "step": 3084
+    },
+    {
+      "epoch": 0.986627755901577,
+      "grad_norm": 0.32944613695144653,
+      "learning_rate": 2.840289775398297e-07,
+      "loss": 4.4086,
+      "step": 3085
+    },
+    {
+      "epoch": 0.9869475704091627,
+      "grad_norm": 0.32963770627975464,
+      "learning_rate": 2.7066681224149344e-07,
+      "loss": 4.387,
+      "step": 3086
+    },
+    {
+      "epoch": 0.9872673849167483,
+      "grad_norm": 0.32858437299728394,
+      "learning_rate": 2.5762643547337924e-07,
+      "loss": 4.3697,
+      "step": 3087
+    },
+    {
+      "epoch": 0.9875871994243339,
+      "grad_norm": 0.33206164836883545,
+      "learning_rate": 2.4490786123562144e-07,
+      "loss": 4.3878,
+      "step": 3088
+    },
+    {
+      "epoch": 0.9879070139319195,
+      "grad_norm": 0.332179993391037,
+      "learning_rate": 2.3251110318283083e-07,
+      "loss": 4.3124,
+      "step": 3089
+    },
+    {
+      "epoch": 0.988226828439505,
+      "grad_norm": 0.32216018438339233,
+      "learning_rate": 2.204361746241279e-07,
+      "loss": 4.3728,
+      "step": 3090
+    },
+    {
+      "epoch": 0.9885466429470907,
+      "grad_norm": 0.33033496141433716,
+      "learning_rate": 2.0868308852310943e-07,
+      "loss": 4.3396,
+      "step": 3091
+    },
+    {
+      "epoch": 0.9888664574546763,
+      "grad_norm": 0.32683953642845154,
+      "learning_rate": 1.9725185749784879e-07,
+      "loss": 4.3295,
+      "step": 3092
+    },
+    {
+      "epoch": 0.9891862719622619,
+      "grad_norm": 0.3205666244029999,
+      "learning_rate": 1.861424938208955e-07,
+      "loss": 4.2917,
+      "step": 3093
+    },
+    {
+      "epoch": 0.9895060864698475,
+      "grad_norm": 0.3305319547653198,
+      "learning_rate": 1.753550094192424e-07,
+      "loss": 4.3785,
+      "step": 3094
+    },
+    {
+      "epoch": 0.9898259009774331,
+      "grad_norm": 0.3250020444393158,
+      "learning_rate": 1.6488941587429193e-07,
+      "loss": 4.3505,
+      "step": 3095
+    },
+    {
+      "epoch": 0.9901457154850187,
+      "grad_norm": 0.33178406953811646,
+      "learning_rate": 1.547457244218564e-07,
+      "loss": 4.3275,
+      "step": 3096
+    },
+    {
+      "epoch": 0.9904655299926043,
+      "grad_norm": 0.32208383083343506,
+      "learning_rate": 1.4492394595219115e-07,
+      "loss": 4.3009,
+      "step": 3097
+    },
+    {
+      "epoch": 0.9907853445001898,
+      "grad_norm": 0.3235897123813629,
+      "learning_rate": 1.3542409100992802e-07,
+      "loss": 4.3555,
+      "step": 3098
+    },
+    {
+      "epoch": 0.9911051590077755,
+      "grad_norm": 0.3169964551925659,
+      "learning_rate": 1.2624616979407532e-07,
+      "loss": 4.388,
+      "step": 3099
+    },
+    {
+      "epoch": 0.9914249735153611,
+      "grad_norm": 0.33814671635627747,
+      "learning_rate": 1.1739019215801781e-07,
+      "loss": 4.4036,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9914249735153611,
+      "eval_loss": 4.37929630279541,
+      "eval_runtime": 85.0817,
+      "eval_samples_per_second": 22.296,
+      "eval_steps_per_second": 5.583,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9917447880229467,
+      "grad_norm": 0.3211912214756012,
+      "learning_rate": 1.0885616760951676e-07,
+      "loss": 4.3015,
+      "step": 3101
+    },
+    {
+      "epoch": 0.9920646025305323,
+      "grad_norm": 0.31432220339775085,
+      "learning_rate": 1.0064410531067657e-07,
+      "loss": 4.3175,
+      "step": 3102
+    },
+    {
+      "epoch": 0.9923844170381179,
+      "grad_norm": 0.34299880266189575,
+      "learning_rate": 9.27540140779448e-08,
+      "loss": 4.4855,
+      "step": 3103
+    },
+    {
+      "epoch": 0.9927042315457035,
+      "grad_norm": 0.33430150151252747,
+      "learning_rate": 8.51859023821122e-08,
+      "loss": 4.4051,
+      "step": 3104
+    },
+    {
+      "epoch": 0.9930240460532891,
+      "grad_norm": 0.3370993137359619,
+      "learning_rate": 7.793977834824605e-08,
+      "loss": 4.3902,
+      "step": 3105
+    },
+    {
+      "epoch": 0.9933438605608746,
+      "grad_norm": 0.32714369893074036,
+      "learning_rate": 7.101564975579011e-08,
+      "loss": 4.3716,
+      "step": 3106
+    },
+    {
+      "epoch": 0.9936636750684603,
+      "grad_norm": 0.3260760009288788,
+      "learning_rate": 6.441352403849798e-08,
+      "loss": 4.3759,
+      "step": 3107
+    },
+    {
+      "epoch": 0.9939834895760459,
+      "grad_norm": 0.33366405963897705,
+      "learning_rate": 5.813340828429991e-08,
+      "loss": 4.3563,
+      "step": 3108
+    },
+    {
+      "epoch": 0.9943033040836315,
+      "grad_norm": 0.32677289843559265,
+      "learning_rate": 5.217530923560254e-08,
+      "loss": 4.3215,
+      "step": 3109
+    },
+    {
+      "epoch": 0.9946231185912171,
+      "grad_norm": 0.3401174545288086,
+      "learning_rate": 4.6539233288955816e-08,
+      "loss": 4.4004,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9949429330988027,
+      "grad_norm": 0.37048086524009705,
+      "learning_rate": 4.122518649525286e-08,
+      "loss": 4.3808,
+      "step": 3111
+    },
+    {
+      "epoch": 0.9952627476063883,
+      "grad_norm": 0.3258591890335083,
+      "learning_rate": 3.623317455959673e-08,
+      "loss": 4.3797,
+      "step": 3112
+    },
+    {
+      "epoch": 0.9955825621139739,
+      "grad_norm": 0.32999947667121887,
+      "learning_rate": 3.156320284146696e-08,
+      "loss": 4.2535,
+      "step": 3113
+    },
+    {
+      "epoch": 0.9959023766215594,
+      "grad_norm": 0.33407601714134216,
+      "learning_rate": 2.7215276354486393e-08,
+      "loss": 4.3589,
+      "step": 3114
+    },
+    {
+      "epoch": 0.9962221911291451,
+      "grad_norm": 0.32332849502563477,
+      "learning_rate": 2.3189399766587735e-08,
+      "loss": 4.316,
+      "step": 3115
+    },
+    {
+      "epoch": 0.9965420056367307,
+      "grad_norm": 0.3339650332927704,
+      "learning_rate": 1.948557739994694e-08,
+      "loss": 4.4301,
+      "step": 3116
+    },
+    {
+      "epoch": 0.9968618201443163,
+      "grad_norm": 0.3142257034778595,
+      "learning_rate": 1.6103813230949892e-08,
+      "loss": 4.3146,
+      "step": 3117
+    },
+    {
+      "epoch": 0.9971816346519019,
+      "grad_norm": 0.31975337862968445,
+      "learning_rate": 1.3044110890292336e-08,
+      "loss": 4.4019,
+      "step": 3118
+    },
+    {
+      "epoch": 0.9975014491594875,
+      "grad_norm": 0.3598925471305847,
+      "learning_rate": 1.0306473662813341e-08,
+      "loss": 4.3411,
+      "step": 3119
+    },
+    {
+      "epoch": 0.9978212636670731,
+      "grad_norm": 0.3364954888820648,
+      "learning_rate": 7.89090448766183e-09,
+      "loss": 4.3687,
+      "step": 3120
+    },
+    {
+      "epoch": 0.9981410781746587,
+      "grad_norm": 0.32584497332572937,
+      "learning_rate": 5.7974059581633595e-09,
+      "loss": 4.3917,
+      "step": 3121
+    },
+    {
+      "epoch": 0.9984608926822442,
+      "grad_norm": 0.32605403661727905,
+      "learning_rate": 4.02598032192003e-09,
+      "loss": 4.4047,
+      "step": 3122
+    },
+    {
+      "epoch": 0.9987807071898299,
+      "grad_norm": 0.31860044598579407,
+      "learning_rate": 2.5766294807438858e-09,
+      "loss": 4.2947,
+      "step": 3123
+    },
+    {
+      "epoch": 0.9991005216974155,
+      "grad_norm": 0.32749542593955994,
+      "learning_rate": 1.4493549905902902e-09,
+      "loss": 4.3667,
+      "step": 3124
+    },
+    {
+      "epoch": 0.9994203362050011,
+      "grad_norm": 0.3176465630531311,
+      "learning_rate": 6.441580617577713e-10,
+      "loss": 4.3476,
+      "step": 3125
+    },
+    {
+      "epoch": 0.9997401507125867,
+      "grad_norm": 0.32228946685791016,
+      "learning_rate": 1.6103955865487407e-10,
+      "loss": 4.3319,
+      "step": 3126
+    },
+    {
+      "epoch": 0.9997401507125867,
+      "step": 3126,
+      "total_flos": 1.503421305716736e+17,
+      "train_loss": 4.916973250231068,
+      "train_runtime": 23463.9242,
+      "train_samples_per_second": 8.529,
+      "train_steps_per_second": 0.133
+    },
+    {
+      "epoch": 0.9997401507125867,
+      "eval_loss": 4.379289150238037,
+      "eval_runtime": 88.5733,
+      "eval_samples_per_second": 21.417,
+      "eval_steps_per_second": 5.363,
+      "step": 3126
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 3126,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.503421305716736e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}