diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,15543 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9983779399837793,
+  "eval_steps": 154,
+  "global_step": 1540,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.0013,
+      "grad_norm": 0.31844592094421387,
+      "learning_rate": 6.493506493506495e-08,
+      "loss": 6.3418,
+      "step": 1
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.0026,
+      "grad_norm": 0.26869267225265503,
+      "learning_rate": 1.298701298701299e-07,
+      "loss": 6.3926,
+      "step": 2
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.00389,
+      "grad_norm": 0.2948790490627289,
+      "learning_rate": 1.948051948051948e-07,
+      "loss": 6.2676,
+      "step": 3
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.00519,
+      "grad_norm": 0.3147197663784027,
+      "learning_rate": 2.597402597402598e-07,
+      "loss": 6.4707,
+      "step": 4
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.00649,
+      "grad_norm": 0.3039565086364746,
+      "learning_rate": 3.2467532467532465e-07,
+      "loss": 7.0977,
+      "step": 5
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.00779,
+      "grad_norm": 0.28105345368385315,
+      "learning_rate": 3.896103896103896e-07,
+      "loss": 6.8184,
+      "step": 6
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8189,
+      "epoch": 0.00908,
+      "grad_norm": 0.2680772840976715,
+      "learning_rate": 4.5454545454545457e-07,
+      "loss": 6.5508,
+      "step": 7
+    },
+    {
+      "batch_num_effect_tokens": 7887,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8105,
+      "epoch": 0.01038,
+      "grad_norm": 0.3874300718307495,
+      "learning_rate": 5.194805194805196e-07,
+      "loss": 6.1758,
+      "step": 8
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.01168,
+      "grad_norm": 0.29857560992240906,
+      "learning_rate": 5.844155844155845e-07,
+      "loss": 6.7598,
+      "step": 9
+    },
+    {
+      "batch_num_effect_tokens": 7892,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8119,
+      "epoch": 0.01298,
+      "grad_norm": 0.3115284740924835,
+      "learning_rate": 6.493506493506493e-07,
+      "loss": 6.3301,
+      "step": 10
+    },
+    {
+      "batch_num_effect_tokens": 7943,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 0.01427,
+      "grad_norm": 0.38096851110458374,
+      "learning_rate": 7.142857142857143e-07,
+      "loss": 6.2168,
+      "step": 11
+    },
+    {
+      "batch_num_effect_tokens": 8074,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.01557,
+      "grad_norm": 0.24052490293979645,
+      "learning_rate": 7.792207792207792e-07,
+      "loss": 6.0957,
+      "step": 12
+    },
+    {
+      "batch_num_effect_tokens": 7983,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 0.01687,
+      "grad_norm": 0.2261405885219574,
+      "learning_rate": 8.441558441558442e-07,
+      "loss": 5.8008,
+      "step": 13
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.01817,
+      "grad_norm": 0.25014105439186096,
+      "learning_rate": 9.090909090909091e-07,
+      "loss": 6.459,
+      "step": 14
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8177,
+      "epoch": 0.01946,
+      "grad_norm": 0.2601844370365143,
+      "learning_rate": 9.740259740259742e-07,
+      "loss": 6.2109,
+      "step": 15
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.02076,
+      "grad_norm": 0.2764773368835449,
+      "learning_rate": 1.0389610389610392e-06,
+      "loss": 6.0508,
+      "step": 16
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 0.02206,
+      "grad_norm": 0.23783260583877563,
+      "learning_rate": 1.103896103896104e-06,
+      "loss": 6.3242,
+      "step": 17
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.02336,
+      "grad_norm": 0.21356581151485443,
+      "learning_rate": 1.168831168831169e-06,
+      "loss": 6.1641,
+      "step": 18
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.02466,
+      "grad_norm": 0.2238253653049469,
+      "learning_rate": 1.2337662337662338e-06,
+      "loss": 5.8047,
+      "step": 19
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8170,
+      "epoch": 0.02595,
+      "grad_norm": 0.21132396161556244,
+      "learning_rate": 1.2987012987012986e-06,
+      "loss": 5.7354,
+      "step": 20
+    },
+    {
+      "batch_num_effect_tokens": 7901,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8128,
+      "epoch": 0.02725,
+      "grad_norm": 0.2627267837524414,
+      "learning_rate": 1.3636363636363636e-06,
+      "loss": 6.2051,
+      "step": 21
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8165,
+      "epoch": 0.02855,
+      "grad_norm": 0.23212876915931702,
+      "learning_rate": 1.4285714285714286e-06,
+      "loss": 6.1699,
+      "step": 22
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.02985,
+      "grad_norm": 0.20811595022678375,
+      "learning_rate": 1.4935064935064936e-06,
+      "loss": 5.6621,
+      "step": 23
+    },
+    {
+      "batch_num_effect_tokens": 8066,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.03114,
+      "grad_norm": 0.205363467335701,
+      "learning_rate": 1.5584415584415584e-06,
+      "loss": 5.7109,
+      "step": 24
+    },
+    {
+      "batch_num_effect_tokens": 7906,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8091,
+      "epoch": 0.03244,
+      "grad_norm": 0.248266339302063,
+      "learning_rate": 1.6233766233766235e-06,
+      "loss": 5.8008,
+      "step": 25
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.03374,
+      "grad_norm": 0.22567051649093628,
+      "learning_rate": 1.6883116883116885e-06,
+      "loss": 5.668,
+      "step": 26
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.03504,
+      "grad_norm": 0.21363011002540588,
+      "learning_rate": 1.7532467532467535e-06,
+      "loss": 5.9082,
+      "step": 27
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 0.03633,
+      "grad_norm": 0.17314466834068298,
+      "learning_rate": 1.8181818181818183e-06,
+      "loss": 5.8809,
+      "step": 28
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.03763,
+      "grad_norm": 0.1672496497631073,
+      "learning_rate": 1.8831168831168833e-06,
+      "loss": 5.2207,
+      "step": 29
+    },
+    {
+      "batch_num_effect_tokens": 7987,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8155,
+      "epoch": 0.03893,
+      "grad_norm": 0.17449834942817688,
+      "learning_rate": 1.9480519480519483e-06,
+      "loss": 5.7324,
+      "step": 30
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.04023,
+      "grad_norm": 0.1806560754776001,
+      "learning_rate": 2.012987012987013e-06,
+      "loss": 5.2949,
+      "step": 31
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.04152,
+      "grad_norm": 0.1780816614627838,
+      "learning_rate": 2.0779220779220784e-06,
+      "loss": 5.6357,
+      "step": 32
+    },
+    {
+      "batch_num_effect_tokens": 7915,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8090,
+      "epoch": 0.04282,
+      "grad_norm": 0.1659129112958908,
+      "learning_rate": 2.1428571428571427e-06,
+      "loss": 5.2832,
+      "step": 33
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.04412,
+      "grad_norm": 0.18337897956371307,
+      "learning_rate": 2.207792207792208e-06,
+      "loss": 5.4307,
+      "step": 34
+    },
+    {
+      "batch_num_effect_tokens": 7927,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8154,
+      "epoch": 0.04542,
+      "grad_norm": 0.20109063386917114,
+      "learning_rate": 2.2727272727272728e-06,
+      "loss": 5.876,
+      "step": 35
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8176,
+      "epoch": 0.04672,
+      "grad_norm": 0.15896430611610413,
+      "learning_rate": 2.337662337662338e-06,
+      "loss": 5.251,
+      "step": 36
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8182,
+      "epoch": 0.04801,
+      "grad_norm": 0.15734650194644928,
+      "learning_rate": 2.402597402597403e-06,
+      "loss": 4.8984,
+      "step": 37
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.04931,
+      "grad_norm": 0.17897239327430725,
+      "learning_rate": 2.4675324675324676e-06,
+      "loss": 5.3438,
+      "step": 38
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.05061,
+      "grad_norm": 0.1797408014535904,
+      "learning_rate": 2.5324675324675324e-06,
+      "loss": 5.3223,
+      "step": 39
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8156,
+      "epoch": 0.05191,
+      "grad_norm": 0.16180779039859772,
+      "learning_rate": 2.597402597402597e-06,
+      "loss": 5.0254,
+      "step": 40
+    },
+    {
+      "batch_num_effect_tokens": 7865,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8029,
+      "epoch": 0.0532,
+      "grad_norm": 0.1604710966348648,
+      "learning_rate": 2.6623376623376624e-06,
+      "loss": 5.3867,
+      "step": 41
+    },
+    {
+      "batch_num_effect_tokens": 7967,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.0545,
+      "grad_norm": 0.15748171508312225,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 4.9678,
+      "step": 42
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.0558,
+      "grad_norm": 0.163799449801445,
+      "learning_rate": 2.7922077922077925e-06,
+      "loss": 5.1172,
+      "step": 43
+    },
+    {
+      "batch_num_effect_tokens": 7880,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8102,
+      "epoch": 0.0571,
+      "grad_norm": 0.1661587953567505,
+      "learning_rate": 2.8571428571428573e-06,
+      "loss": 4.8545,
+      "step": 44
+    },
+    {
+      "batch_num_effect_tokens": 7964,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8143,
+      "epoch": 0.05839,
+      "grad_norm": 0.1683725267648697,
+      "learning_rate": 2.922077922077922e-06,
+      "loss": 5.2148,
+      "step": 45
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8164,
+      "epoch": 0.05969,
+      "grad_norm": 0.15301677584648132,
+      "learning_rate": 2.9870129870129873e-06,
+      "loss": 5.0928,
+      "step": 46
+    },
+    {
+      "batch_num_effect_tokens": 7915,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8087,
+      "epoch": 0.06099,
+      "grad_norm": 0.14993946254253387,
+      "learning_rate": 3.051948051948052e-06,
+      "loss": 5.0557,
+      "step": 47
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 0.06229,
+      "grad_norm": 0.15920305252075195,
+      "learning_rate": 3.116883116883117e-06,
+      "loss": 5.209,
+      "step": 48
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.06358,
+      "grad_norm": 0.1530810445547104,
+      "learning_rate": 3.181818181818182e-06,
+      "loss": 5.3164,
+      "step": 49
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8121,
+      "epoch": 0.06488,
+      "grad_norm": 0.14603291451931,
+      "learning_rate": 3.246753246753247e-06,
+      "loss": 5.1729,
+      "step": 50
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.06618,
+      "grad_norm": 0.14708639681339264,
+      "learning_rate": 3.311688311688312e-06,
+      "loss": 4.7744,
+      "step": 51
+    },
+    {
+      "batch_num_effect_tokens": 8067,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.06748,
+      "grad_norm": 0.14484994113445282,
+      "learning_rate": 3.376623376623377e-06,
+      "loss": 4.9619,
+      "step": 52
+    },
+    {
+      "batch_num_effect_tokens": 7904,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.06878,
+      "grad_norm": 0.1400662362575531,
+      "learning_rate": 3.4415584415584418e-06,
+      "loss": 4.7002,
+      "step": 53
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.07007,
+      "grad_norm": 0.14303331077098846,
+      "learning_rate": 3.506493506493507e-06,
+      "loss": 4.6895,
+      "step": 54
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.07137,
+      "grad_norm": 0.14038386940956116,
+      "learning_rate": 3.5714285714285718e-06,
+      "loss": 5.3848,
+      "step": 55
+    },
+    {
+      "batch_num_effect_tokens": 7905,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8075,
+      "epoch": 0.07267,
+      "grad_norm": 0.1472426950931549,
+      "learning_rate": 3.6363636363636366e-06,
+      "loss": 4.9775,
+      "step": 56
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.07397,
+      "grad_norm": 0.15343016386032104,
+      "learning_rate": 3.701298701298702e-06,
+      "loss": 5.2734,
+      "step": 57
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.07526,
+      "grad_norm": 0.1423100382089615,
+      "learning_rate": 3.7662337662337666e-06,
+      "loss": 5.3379,
+      "step": 58
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.07656,
+      "grad_norm": 0.14533978700637817,
+      "learning_rate": 3.831168831168831e-06,
+      "loss": 4.8135,
+      "step": 59
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.07786,
+      "grad_norm": 0.14846143126487732,
+      "learning_rate": 3.896103896103897e-06,
+      "loss": 4.9902,
+      "step": 60
+    },
+    {
+      "batch_num_effect_tokens": 7928,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8113,
+      "epoch": 0.07916,
+      "grad_norm": 0.1590896099805832,
+      "learning_rate": 3.961038961038962e-06,
+      "loss": 5.0049,
+      "step": 61
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08045,
+      "grad_norm": 0.1540524810552597,
+      "learning_rate": 4.025974025974026e-06,
+      "loss": 4.8965,
+      "step": 62
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08175,
+      "grad_norm": 0.13902273774147034,
+      "learning_rate": 4.0909090909090915e-06,
+      "loss": 4.9961,
+      "step": 63
+    },
+    {
+      "batch_num_effect_tokens": 7977,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08305,
+      "grad_norm": 0.14425402879714966,
+      "learning_rate": 4.155844155844157e-06,
+      "loss": 5.0908,
+      "step": 64
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08435,
+      "grad_norm": 0.15884140133857727,
+      "learning_rate": 4.220779220779221e-06,
+      "loss": 5.2842,
+      "step": 65
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08564,
+      "grad_norm": 0.14033250510692596,
+      "learning_rate": 4.2857142857142855e-06,
+      "loss": 5.0371,
+      "step": 66
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08694,
+      "grad_norm": 0.15526610612869263,
+      "learning_rate": 4.350649350649351e-06,
+      "loss": 4.6426,
+      "step": 67
+    },
+    {
+      "batch_num_effect_tokens": 7806,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 7994,
+      "epoch": 0.08824,
+      "grad_norm": 0.1523507982492447,
+      "learning_rate": 4.415584415584416e-06,
+      "loss": 5.1045,
+      "step": 68
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.08954,
+      "grad_norm": 0.15373018383979797,
+      "learning_rate": 4.48051948051948e-06,
+      "loss": 5.3213,
+      "step": 69
+    },
+    {
+      "batch_num_effect_tokens": 7953,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8076,
+      "epoch": 0.09084,
+      "grad_norm": 0.16291280090808868,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 4.9736,
+      "step": 70
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.09213,
+      "grad_norm": 0.135183647274971,
+      "learning_rate": 4.610389610389611e-06,
+      "loss": 5.1475,
+      "step": 71
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8181,
+      "epoch": 0.09343,
+      "grad_norm": 0.13779762387275696,
+      "learning_rate": 4.675324675324676e-06,
+      "loss": 4.999,
+      "step": 72
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8152,
+      "epoch": 0.09473,
+      "grad_norm": 0.1394745260477066,
+      "learning_rate": 4.74025974025974e-06,
+      "loss": 4.9697,
+      "step": 73
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 0.09603,
+      "grad_norm": 0.14722809195518494,
+      "learning_rate": 4.805194805194806e-06,
+      "loss": 5.4229,
+      "step": 74
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.09732,
+      "grad_norm": 0.1443023979663849,
+      "learning_rate": 4.870129870129871e-06,
+      "loss": 4.8057,
+      "step": 75
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8171,
+      "epoch": 0.09862,
+      "grad_norm": 0.1304166615009308,
+      "learning_rate": 4.935064935064935e-06,
+      "loss": 5.1406,
+      "step": 76
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8176,
+      "epoch": 0.09992,
+      "grad_norm": 0.14469240605831146,
+      "learning_rate": 5e-06,
+      "loss": 4.9717,
+      "step": 77
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.10122,
+      "grad_norm": 0.14036931097507477,
+      "learning_rate": 5.064935064935065e-06,
+      "loss": 4.6973,
+      "step": 78
+    },
+    {
+      "batch_num_effect_tokens": 7970,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8176,
+      "epoch": 0.10251,
+      "grad_norm": 0.1444014012813568,
+      "learning_rate": 5.12987012987013e-06,
+      "loss": 4.8516,
+      "step": 79
+    },
+    {
+      "batch_num_effect_tokens": 7878,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8114,
+      "epoch": 0.10381,
+      "grad_norm": 0.15033836662769318,
+      "learning_rate": 5.194805194805194e-06,
+      "loss": 4.6299,
+      "step": 80
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.10511,
+      "grad_norm": 0.1547461301088333,
+      "learning_rate": 5.2597402597402605e-06,
+      "loss": 5.0889,
+      "step": 81
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.10641,
+      "grad_norm": 0.14421503245830536,
+      "learning_rate": 5.324675324675325e-06,
+      "loss": 4.9414,
+      "step": 82
+    },
+    {
+      "batch_num_effect_tokens": 8060,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.1077,
+      "grad_norm": 0.13420124351978302,
+      "learning_rate": 5.38961038961039e-06,
+      "loss": 4.8047,
+      "step": 83
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.109,
+      "grad_norm": 0.1364876627922058,
+      "learning_rate": 5.4545454545454545e-06,
+      "loss": 5.0352,
+      "step": 84
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.1103,
+      "grad_norm": 0.13382357358932495,
+      "learning_rate": 5.5194805194805205e-06,
+      "loss": 4.8984,
+      "step": 85
+    },
+    {
+      "batch_num_effect_tokens": 7784,
+      "batch_num_samples": 28,
+      "batch_num_tokens": 8032,
+      "epoch": 0.1116,
+      "grad_norm": 0.1488446742296219,
+      "learning_rate": 5.584415584415585e-06,
+      "loss": 5.2764,
+      "step": 86
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.1129,
+      "grad_norm": 0.15475524961948395,
+      "learning_rate": 5.64935064935065e-06,
+      "loss": 5.1055,
+      "step": 87
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.11419,
+      "grad_norm": 0.1377478688955307,
+      "learning_rate": 5.7142857142857145e-06,
+      "loss": 4.835,
+      "step": 88
+    },
+    {
+      "batch_num_effect_tokens": 7766,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8030,
+      "epoch": 0.11549,
+      "grad_norm": 0.1429220736026764,
+      "learning_rate": 5.77922077922078e-06,
+      "loss": 4.8867,
+      "step": 89
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8144,
+      "epoch": 0.11679,
+      "grad_norm": 0.14639155566692352,
+      "learning_rate": 5.844155844155844e-06,
+      "loss": 5.1807,
+      "step": 90
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.11809,
+      "grad_norm": 0.1477179378271103,
+      "learning_rate": 5.90909090909091e-06,
+      "loss": 4.6084,
+      "step": 91
+    },
+    {
+      "batch_num_effect_tokens": 7969,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8136,
+      "epoch": 0.11938,
+      "grad_norm": 0.1457316130399704,
+      "learning_rate": 5.9740259740259746e-06,
+      "loss": 5.335,
+      "step": 92
+    },
+    {
+      "batch_num_effect_tokens": 7885,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.12068,
+      "grad_norm": 0.1372513622045517,
+      "learning_rate": 6.03896103896104e-06,
+      "loss": 4.6279,
+      "step": 93
+    },
+    {
+      "batch_num_effect_tokens": 7953,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8107,
+      "epoch": 0.12198,
+      "grad_norm": 0.14941509068012238,
+      "learning_rate": 6.103896103896104e-06,
+      "loss": 4.7344,
+      "step": 94
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.12328,
+      "grad_norm": 0.14506636559963226,
+      "learning_rate": 6.168831168831169e-06,
+      "loss": 5.2168,
+      "step": 95
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.12457,
+      "grad_norm": 0.15786777436733246,
+      "learning_rate": 6.233766233766234e-06,
+      "loss": 5.1094,
+      "step": 96
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 0.12587,
+      "grad_norm": 0.14335079491138458,
+      "learning_rate": 6.2987012987013e-06,
+      "loss": 4.7344,
+      "step": 97
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.12717,
+      "grad_norm": 0.1454756110906601,
+      "learning_rate": 6.363636363636364e-06,
+      "loss": 4.9893,
+      "step": 98
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.12847,
+      "grad_norm": 0.13855211436748505,
+      "learning_rate": 6.4285714285714295e-06,
+      "loss": 4.6758,
+      "step": 99
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.12976,
+      "grad_norm": 0.1334666609764099,
+      "learning_rate": 6.493506493506494e-06,
+      "loss": 5.1504,
+      "step": 100
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.13106,
+      "grad_norm": 0.14907817542552948,
+      "learning_rate": 6.55844155844156e-06,
+      "loss": 4.8926,
+      "step": 101
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.13236,
+      "grad_norm": 0.134397953748703,
+      "learning_rate": 6.623376623376624e-06,
+      "loss": 4.7324,
+      "step": 102
+    },
+    {
+      "batch_num_effect_tokens": 7971,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.13366,
+      "grad_norm": 0.13027189671993256,
+      "learning_rate": 6.688311688311689e-06,
+      "loss": 4.7451,
+      "step": 103
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8144,
+      "epoch": 0.13496,
+      "grad_norm": 0.1361909955739975,
+      "learning_rate": 6.753246753246754e-06,
+      "loss": 5.1758,
+      "step": 104
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8190,
+      "epoch": 0.13625,
+      "grad_norm": 0.14071358740329742,
+      "learning_rate": 6.818181818181818e-06,
+      "loss": 4.8379,
+      "step": 105
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.13755,
+      "grad_norm": 0.14475062489509583,
+      "learning_rate": 6.8831168831168835e-06,
+      "loss": 4.9668,
+      "step": 106
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 0.13885,
+      "grad_norm": 0.1445104032754898,
+      "learning_rate": 6.948051948051948e-06,
+      "loss": 4.8447,
+      "step": 107
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.14015,
+      "grad_norm": 0.14156053960323334,
+      "learning_rate": 7.012987012987014e-06,
+      "loss": 5.1055,
+      "step": 108
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8191,
+      "epoch": 0.14144,
+      "grad_norm": 0.1461244523525238,
+      "learning_rate": 7.077922077922078e-06,
+      "loss": 5.0107,
+      "step": 109
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.14274,
+      "grad_norm": 0.14059850573539734,
+      "learning_rate": 7.1428571428571436e-06,
+      "loss": 5.1133,
+      "step": 110
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8140,
+      "epoch": 0.14404,
+      "grad_norm": 0.14305299520492554,
+      "learning_rate": 7.207792207792208e-06,
+      "loss": 4.6953,
+      "step": 111
+    },
+    {
+      "batch_num_effect_tokens": 7981,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8144,
+      "epoch": 0.14534,
+      "grad_norm": 0.1341462880373001,
+      "learning_rate": 7.272727272727273e-06,
+      "loss": 4.8037,
+      "step": 112
+    },
+    {
+      "batch_num_effect_tokens": 7913,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8098,
+      "epoch": 0.14663,
+      "grad_norm": 0.14198477566242218,
+      "learning_rate": 7.3376623376623375e-06,
+      "loss": 4.8252,
+      "step": 113
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.14793,
+      "grad_norm": 0.1437690556049347,
+      "learning_rate": 7.402597402597404e-06,
+      "loss": 5.0996,
+      "step": 114
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8152,
+      "epoch": 0.14923,
+      "grad_norm": 0.13746197521686554,
+      "learning_rate": 7.467532467532468e-06,
+      "loss": 4.8242,
+      "step": 115
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15053,
+      "grad_norm": 0.1328958421945572,
+      "learning_rate": 7.532467532467533e-06,
+      "loss": 5.4316,
+      "step": 116
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15182,
+      "grad_norm": 0.14948073029518127,
+      "learning_rate": 7.597402597402598e-06,
+      "loss": 5.1084,
+      "step": 117
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8139,
+      "epoch": 0.15312,
+      "grad_norm": 0.14273382723331451,
+      "learning_rate": 7.662337662337663e-06,
+      "loss": 5.0645,
+      "step": 118
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15442,
+      "grad_norm": 0.13271141052246094,
+      "learning_rate": 7.727272727272727e-06,
+      "loss": 4.7783,
+      "step": 119
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15572,
+      "grad_norm": 0.14579838514328003,
+      "learning_rate": 7.792207792207793e-06,
+      "loss": 5.0703,
+      "step": 120
+    },
+    {
+      "batch_num_effect_tokens": 7983,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15702,
+      "grad_norm": 0.1460658609867096,
+      "learning_rate": 7.857142857142858e-06,
+      "loss": 4.7344,
+      "step": 121
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15831,
+      "grad_norm": 0.1394878476858139,
+      "learning_rate": 7.922077922077924e-06,
+      "loss": 5.0918,
+      "step": 122
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.15961,
+      "grad_norm": 0.1350386142730713,
+      "learning_rate": 7.987012987012988e-06,
+      "loss": 5.291,
+      "step": 123
+    },
+    {
+      "batch_num_effect_tokens": 7925,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8094,
+      "epoch": 0.16091,
+      "grad_norm": 0.1350218653678894,
+      "learning_rate": 8.051948051948052e-06,
+      "loss": 5.0156,
+      "step": 124
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.16221,
+      "grad_norm": 0.1472414880990982,
+      "learning_rate": 8.116883116883117e-06,
+      "loss": 4.998,
+      "step": 125
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.1635,
+      "grad_norm": 0.14445587992668152,
+      "learning_rate": 8.181818181818183e-06,
+      "loss": 5.0547,
+      "step": 126
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8177,
+      "epoch": 0.1648,
+      "grad_norm": 0.14494451880455017,
+      "learning_rate": 8.246753246753247e-06,
+      "loss": 4.6123,
+      "step": 127
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.1661,
+      "grad_norm": 0.14212098717689514,
+      "learning_rate": 8.311688311688313e-06,
+      "loss": 4.6992,
+      "step": 128
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.1674,
+      "grad_norm": 0.1644868403673172,
+      "learning_rate": 8.376623376623378e-06,
+      "loss": 5.2734,
+      "step": 129
+    },
+    {
+      "batch_num_effect_tokens": 7962,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8158,
+      "epoch": 0.16869,
+      "grad_norm": 0.13638760149478912,
+      "learning_rate": 8.441558441558442e-06,
+      "loss": 4.7773,
+      "step": 130
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8101,
+      "epoch": 0.16999,
+      "grad_norm": 0.14820170402526855,
+      "learning_rate": 8.506493506493507e-06,
+      "loss": 4.8438,
+      "step": 131
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.17129,
+      "grad_norm": 0.1421528309583664,
+      "learning_rate": 8.571428571428571e-06,
+      "loss": 5.0254,
+      "step": 132
+    },
+    {
+      "batch_num_effect_tokens": 7936,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.17259,
+      "grad_norm": 0.13923408091068268,
+      "learning_rate": 8.636363636363637e-06,
+      "loss": 4.7812,
+      "step": 133
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.17388,
+      "grad_norm": 0.15315015614032745,
+      "learning_rate": 8.701298701298701e-06,
+      "loss": 4.834,
+      "step": 134
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.17518,
+      "grad_norm": 0.1589311957359314,
+      "learning_rate": 8.766233766233767e-06,
+      "loss": 4.9219,
+      "step": 135
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8160,
+      "epoch": 0.17648,
+      "grad_norm": 0.14829252660274506,
+      "learning_rate": 8.831168831168832e-06,
+      "loss": 4.9746,
+      "step": 136
+    },
+    {
+      "batch_num_effect_tokens": 7960,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 0.17778,
+      "grad_norm": 0.14853787422180176,
+      "learning_rate": 8.896103896103896e-06,
+      "loss": 5.0918,
+      "step": 137
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.17908,
+      "grad_norm": 0.13866592943668365,
+      "learning_rate": 8.96103896103896e-06,
+      "loss": 4.8525,
+      "step": 138
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8179,
+      "epoch": 0.18037,
+      "grad_norm": 0.1399109810590744,
+      "learning_rate": 9.025974025974027e-06,
+      "loss": 4.9492,
+      "step": 139
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.18167,
+      "grad_norm": 0.14804702997207642,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 4.8105,
+      "step": 140
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.18297,
+      "grad_norm": 0.13978514075279236,
+      "learning_rate": 9.155844155844157e-06,
+      "loss": 4.6553,
+      "step": 141
+    },
+    {
+      "batch_num_effect_tokens": 7962,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8119,
+      "epoch": 0.18427,
+      "grad_norm": 0.1482185572385788,
+      "learning_rate": 9.220779220779221e-06,
+      "loss": 4.8184,
+      "step": 142
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8189,
+      "epoch": 0.18556,
+      "grad_norm": 0.1415347456932068,
+      "learning_rate": 9.285714285714288e-06,
+      "loss": 4.8008,
+      "step": 143
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 0.18686,
+      "grad_norm": 0.15562182664871216,
+      "learning_rate": 9.350649350649352e-06,
+      "loss": 5.4648,
+      "step": 144
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.18816,
+      "grad_norm": 0.14722460508346558,
+      "learning_rate": 9.415584415584416e-06,
+      "loss": 4.7441,
+      "step": 145
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8146,
+      "epoch": 0.18946,
+      "grad_norm": 0.13538284599781036,
+      "learning_rate": 9.48051948051948e-06,
+      "loss": 4.5645,
+      "step": 146
+    },
+    {
+      "batch_num_effect_tokens": 7967,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8176,
+      "epoch": 0.19075,
+      "grad_norm": 0.1557544320821762,
+      "learning_rate": 9.545454545454547e-06,
+      "loss": 4.9785,
+      "step": 147
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19205,
+      "grad_norm": 0.15202026069164276,
+      "learning_rate": 9.610389610389611e-06,
+      "loss": 4.8027,
+      "step": 148
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19335,
+      "grad_norm": 0.14528276026248932,
+      "learning_rate": 9.675324675324677e-06,
+      "loss": 4.7441,
+      "step": 149
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19465,
+      "grad_norm": 0.15638510882854462,
+      "learning_rate": 9.740259740259742e-06,
+      "loss": 4.9463,
+      "step": 150
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19594,
+      "grad_norm": 0.14898745715618134,
+      "learning_rate": 9.805194805194806e-06,
+      "loss": 4.9248,
+      "step": 151
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19724,
+      "grad_norm": 0.13369755446910858,
+      "learning_rate": 9.87012987012987e-06,
+      "loss": 4.9199,
+      "step": 152
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19854,
+      "grad_norm": 0.14535720646381378,
+      "learning_rate": 9.935064935064936e-06,
+      "loss": 4.9004,
+      "step": 153
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19984,
+      "grad_norm": 0.1457190215587616,
+      "learning_rate": 1e-05,
+      "loss": 5.2598,
+      "step": 154
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.19984,
+      "eval_eval_loss": 0.6176656484603882,
+      "eval_eval_runtime": 115.0266,
+      "eval_eval_samples_per_second": 43.468,
+      "eval_eval_steps_per_second": 2.721,
+      "step": 154
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8126,
+      "epoch": 0.20114,
+      "grad_norm": 0.14440381526947021,
+      "learning_rate": 9.999987155621127e-06,
+      "loss": 5.0732,
+      "step": 155
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8170,
+      "epoch": 0.20243,
+      "grad_norm": 0.14386983215808868,
+      "learning_rate": 9.999948622550497e-06,
+      "loss": 4.9297,
+      "step": 156
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.20373,
+      "grad_norm": 0.14097128808498383,
+      "learning_rate": 9.999884400986087e-06,
+      "loss": 5.0283,
+      "step": 157
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.20503,
+      "grad_norm": 0.14975176751613617,
+      "learning_rate": 9.999794491257846e-06,
+      "loss": 4.6611,
+      "step": 158
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.20633,
+      "grad_norm": 0.13971513509750366,
+      "learning_rate": 9.999678893827711e-06,
+      "loss": 4.9727,
+      "step": 159
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.20762,
+      "grad_norm": 0.12843751907348633,
+      "learning_rate": 9.999537609289592e-06,
+      "loss": 4.9268,
+      "step": 160
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.20892,
+      "grad_norm": 0.13939853012561798,
+      "learning_rate": 9.999370638369377e-06,
+      "loss": 4.709,
+      "step": 161
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.21022,
+      "grad_norm": 0.14971594512462616,
+      "learning_rate": 9.999177981924915e-06,
+      "loss": 4.7676,
+      "step": 162
+    },
+    {
+      "batch_num_effect_tokens": 7874,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8091,
+      "epoch": 0.21152,
+      "grad_norm": 0.14274117350578308,
+      "learning_rate": 9.998959640946033e-06,
+      "loss": 4.8418,
+      "step": 163
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.21281,
+      "grad_norm": 0.14646124839782715,
+      "learning_rate": 9.998715616554509e-06,
+      "loss": 4.6113,
+      "step": 164
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.21411,
+      "grad_norm": 0.13392624258995056,
+      "learning_rate": 9.998445910004082e-06,
+      "loss": 4.6914,
+      "step": 165
+    },
+    {
+      "batch_num_effect_tokens": 7927,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8144,
+      "epoch": 0.21541,
+      "grad_norm": 0.14332374930381775,
+      "learning_rate": 9.998150522680437e-06,
+      "loss": 5.0967,
+      "step": 166
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.21671,
+      "grad_norm": 0.15254738926887512,
+      "learning_rate": 9.997829456101196e-06,
+      "loss": 4.7773,
+      "step": 167
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8092,
+      "epoch": 0.218,
+      "grad_norm": 0.14264167845249176,
+      "learning_rate": 9.997482711915926e-06,
+      "loss": 4.7695,
+      "step": 168
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.2193,
+      "grad_norm": 0.1465001106262207,
+      "learning_rate": 9.997110291906109e-06,
+      "loss": 4.7012,
+      "step": 169
+    },
+    {
+      "batch_num_effect_tokens": 7952,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8110,
+      "epoch": 0.2206,
+      "grad_norm": 0.14456188678741455,
+      "learning_rate": 9.996712197985147e-06,
+      "loss": 4.7178,
+      "step": 170
+    },
+    {
+      "batch_num_effect_tokens": 7862,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8106,
+      "epoch": 0.2219,
+      "grad_norm": 0.1434151530265808,
+      "learning_rate": 9.99628843219835e-06,
+      "loss": 5.1191,
+      "step": 171
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.2232,
+      "grad_norm": 0.14123517274856567,
+      "learning_rate": 9.995838996722916e-06,
+      "loss": 4.9141,
+      "step": 172
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.22449,
+      "grad_norm": 0.1379905641078949,
+      "learning_rate": 9.995363893867935e-06,
+      "loss": 4.8369,
+      "step": 173
+    },
+    {
+      "batch_num_effect_tokens": 8079,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.22579,
+      "grad_norm": 0.13364200294017792,
+      "learning_rate": 9.994863126074371e-06,
+      "loss": 5.0586,
+      "step": 174
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.22709,
+      "grad_norm": 0.13714580237865448,
+      "learning_rate": 9.994336695915041e-06,
+      "loss": 4.9736,
+      "step": 175
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.22839,
+      "grad_norm": 0.13936279714107513,
+      "learning_rate": 9.993784606094612e-06,
+      "loss": 5.0059,
+      "step": 176
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.22968,
+      "grad_norm": 0.1406031847000122,
+      "learning_rate": 9.993206859449587e-06,
+      "loss": 4.8916,
+      "step": 177
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 0.23098,
+      "grad_norm": 0.1409338116645813,
+      "learning_rate": 9.992603458948282e-06,
+      "loss": 5.2207,
+      "step": 178
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.23228,
+      "grad_norm": 0.16277164220809937,
+      "learning_rate": 9.99197440769082e-06,
+      "loss": 4.9805,
+      "step": 179
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 0.23358,
+      "grad_norm": 0.1462642252445221,
+      "learning_rate": 9.991319708909113e-06,
+      "loss": 4.6992,
+      "step": 180
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.23487,
+      "grad_norm": 0.14574594795703888,
+      "learning_rate": 9.990639365966835e-06,
+      "loss": 5.0459,
+      "step": 181
+    },
+    {
+      "batch_num_effect_tokens": 7967,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8112,
+      "epoch": 0.23617,
+      "grad_norm": 0.14711208641529083,
+      "learning_rate": 9.989933382359423e-06,
+      "loss": 5.1992,
+      "step": 182
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 0.23747,
+      "grad_norm": 0.1444520354270935,
+      "learning_rate": 9.989201761714043e-06,
+      "loss": 5.2109,
+      "step": 183
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.23877,
+      "grad_norm": 0.13259008526802063,
+      "learning_rate": 9.988444507789584e-06,
+      "loss": 4.9014,
+      "step": 184
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.24006,
+      "grad_norm": 0.1382356435060501,
+      "learning_rate": 9.987661624476624e-06,
+      "loss": 4.876,
+      "step": 185
+    },
+    {
+      "batch_num_effect_tokens": 8060,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.24136,
+      "grad_norm": 0.13415783643722534,
+      "learning_rate": 9.986853115797424e-06,
+      "loss": 4.7227,
+      "step": 186
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.24266,
+      "grad_norm": 0.13997548818588257,
+      "learning_rate": 9.986018985905901e-06,
+      "loss": 5.1807,
+      "step": 187
+    },
+    {
+      "batch_num_effect_tokens": 7933,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8086,
+      "epoch": 0.24396,
+      "grad_norm": 0.14471964538097382,
+      "learning_rate": 9.98515923908761e-06,
+      "loss": 4.7754,
+      "step": 188
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.24526,
+      "grad_norm": 0.15303927659988403,
+      "learning_rate": 9.984273879759713e-06,
+      "loss": 4.8975,
+      "step": 189
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8101,
+      "epoch": 0.24655,
+      "grad_norm": 0.13627475500106812,
+      "learning_rate": 9.983362912470967e-06,
+      "loss": 4.9385,
+      "step": 190
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.24785,
+      "grad_norm": 0.1349112093448639,
+      "learning_rate": 9.982426341901697e-06,
+      "loss": 5.1465,
+      "step": 191
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.24915,
+      "grad_norm": 0.13799749314785004,
+      "learning_rate": 9.981464172863769e-06,
+      "loss": 5.0508,
+      "step": 192
+    },
+    {
+      "batch_num_effect_tokens": 7856,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8040,
+      "epoch": 0.25045,
+      "grad_norm": 0.14057736098766327,
+      "learning_rate": 9.980476410300567e-06,
+      "loss": 4.9756,
+      "step": 193
+    },
+    {
+      "batch_num_effect_tokens": 7958,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8176,
+      "epoch": 0.25174,
+      "grad_norm": 0.14482036232948303,
+      "learning_rate": 9.979463059286972e-06,
+      "loss": 4.8223,
+      "step": 194
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8176,
+      "epoch": 0.25304,
+      "grad_norm": 0.1419980227947235,
+      "learning_rate": 9.978424125029329e-06,
+      "loss": 4.875,
+      "step": 195
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.25434,
+      "grad_norm": 0.1462613195180893,
+      "learning_rate": 9.977359612865424e-06,
+      "loss": 4.9316,
+      "step": 196
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.25564,
+      "grad_norm": 0.13534201681613922,
+      "learning_rate": 9.976269528264456e-06,
+      "loss": 4.7822,
+      "step": 197
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.25693,
+      "grad_norm": 0.13299311697483063,
+      "learning_rate": 9.975153876827008e-06,
+      "loss": 4.9941,
+      "step": 198
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 0.25823,
+      "grad_norm": 0.13966509699821472,
+      "learning_rate": 9.97401266428502e-06,
+      "loss": 5.2793,
+      "step": 199
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.25953,
+      "grad_norm": 0.14264234900474548,
+      "learning_rate": 9.972845896501762e-06,
+      "loss": 4.8848,
+      "step": 200
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.26083,
+      "grad_norm": 0.1481478363275528,
+      "learning_rate": 9.971653579471791e-06,
+      "loss": 5.0264,
+      "step": 201
+    },
+    {
+      "batch_num_effect_tokens": 7957,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8119,
+      "epoch": 0.26212,
+      "grad_norm": 0.14331968128681183,
+      "learning_rate": 9.97043571932094e-06,
+      "loss": 4.7031,
+      "step": 202
+    },
+    {
+      "batch_num_effect_tokens": 7925,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8123,
+      "epoch": 0.26342,
+      "grad_norm": 0.15035419166088104,
+      "learning_rate": 9.969192322306271e-06,
+      "loss": 4.6455,
+      "step": 203
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8188,
+      "epoch": 0.26472,
+      "grad_norm": 0.1354558765888214,
+      "learning_rate": 9.96792339481605e-06,
+      "loss": 4.6113,
+      "step": 204
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8119,
+      "epoch": 0.26602,
+      "grad_norm": 0.14034751057624817,
+      "learning_rate": 9.966628943369708e-06,
+      "loss": 5.1328,
+      "step": 205
+    },
+    {
+      "batch_num_effect_tokens": 8073,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.26732,
+      "grad_norm": 0.13551993668079376,
+      "learning_rate": 9.965308974617816e-06,
+      "loss": 5.0332,
+      "step": 206
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8171,
+      "epoch": 0.26861,
+      "grad_norm": 0.14565272629261017,
+      "learning_rate": 9.963963495342049e-06,
+      "loss": 4.8906,
+      "step": 207
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.26991,
+      "grad_norm": 0.1394621729850769,
+      "learning_rate": 9.96259251245514e-06,
+      "loss": 4.835,
+      "step": 208
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.27121,
+      "grad_norm": 0.13364410400390625,
+      "learning_rate": 9.961196033000862e-06,
+      "loss": 4.9238,
+      "step": 209
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.27251,
+      "grad_norm": 0.13798867166042328,
+      "learning_rate": 9.959774064153977e-06,
+      "loss": 4.9531,
+      "step": 210
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8185,
+      "epoch": 0.2738,
+      "grad_norm": 0.13652759790420532,
+      "learning_rate": 9.95832661322021e-06,
+      "loss": 4.7188,
+      "step": 211
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.2751,
+      "grad_norm": 0.14416304230690002,
+      "learning_rate": 9.956853687636203e-06,
+      "loss": 5.21,
+      "step": 212
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.2764,
+      "grad_norm": 0.15409010648727417,
+      "learning_rate": 9.955355294969483e-06,
+      "loss": 4.8691,
+      "step": 213
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8085,
+      "epoch": 0.2777,
+      "grad_norm": 0.13492988049983978,
+      "learning_rate": 9.953831442918418e-06,
+      "loss": 4.9668,
+      "step": 214
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8190,
+      "epoch": 0.27899,
+      "grad_norm": 0.13695424795150757,
+      "learning_rate": 9.952282139312182e-06,
+      "loss": 4.749,
+      "step": 215
+    },
+    {
+      "batch_num_effect_tokens": 7980,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8155,
+      "epoch": 0.28029,
+      "grad_norm": 0.14915227890014648,
+      "learning_rate": 9.95070739211071e-06,
+      "loss": 5.1152,
+      "step": 216
+    },
+    {
+      "batch_num_effect_tokens": 7958,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8114,
+      "epoch": 0.28159,
+      "grad_norm": 0.13804367184638977,
+      "learning_rate": 9.949107209404664e-06,
+      "loss": 4.7793,
+      "step": 217
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.28289,
+      "grad_norm": 0.1392471045255661,
+      "learning_rate": 9.947481599415385e-06,
+      "loss": 5.0469,
+      "step": 218
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.28418,
+      "grad_norm": 0.14468149840831757,
+      "learning_rate": 9.945830570494851e-06,
+      "loss": 4.8887,
+      "step": 219
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.28548,
+      "grad_norm": 0.1316855400800705,
+      "learning_rate": 9.944154131125643e-06,
+      "loss": 4.9443,
+      "step": 220
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.28678,
+      "grad_norm": 0.15862536430358887,
+      "learning_rate": 9.942452289920886e-06,
+      "loss": 4.8623,
+      "step": 221
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.28808,
+      "grad_norm": 0.13182777166366577,
+      "learning_rate": 9.940725055624218e-06,
+      "loss": 5.0381,
+      "step": 222
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.28938,
+      "grad_norm": 0.1340331733226776,
+      "learning_rate": 9.938972437109742e-06,
+      "loss": 4.7461,
+      "step": 223
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 0.29067,
+      "grad_norm": 0.1430303007364273,
+      "learning_rate": 9.937194443381972e-06,
+      "loss": 4.8057,
+      "step": 224
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8191,
+      "epoch": 0.29197,
+      "grad_norm": 0.14733895659446716,
+      "learning_rate": 9.935391083575803e-06,
+      "loss": 4.7725,
+      "step": 225
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.29327,
+      "grad_norm": 0.13456237316131592,
+      "learning_rate": 9.933562366956445e-06,
+      "loss": 4.5049,
+      "step": 226
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.29457,
+      "grad_norm": 0.13491599261760712,
+      "learning_rate": 9.931708302919394e-06,
+      "loss": 5.0586,
+      "step": 227
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.29586,
+      "grad_norm": 0.13626770675182343,
+      "learning_rate": 9.929828900990367e-06,
+      "loss": 4.7988,
+      "step": 228
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.29716,
+      "grad_norm": 0.14051003754138947,
+      "learning_rate": 9.927924170825266e-06,
+      "loss": 5.0586,
+      "step": 229
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.29846,
+      "grad_norm": 0.13720320165157318,
+      "learning_rate": 9.92599412221012e-06,
+      "loss": 4.8613,
+      "step": 230
+    },
+    {
+      "batch_num_effect_tokens": 7980,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8113,
+      "epoch": 0.29976,
+      "grad_norm": 0.15138697624206543,
+      "learning_rate": 9.924038765061042e-06,
+      "loss": 4.7715,
+      "step": 231
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.30105,
+      "grad_norm": 0.1236131489276886,
+      "learning_rate": 9.922058109424168e-06,
+      "loss": 4.916,
+      "step": 232
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8105,
+      "epoch": 0.30235,
+      "grad_norm": 0.1621561050415039,
+      "learning_rate": 9.920052165475615e-06,
+      "loss": 5.0439,
+      "step": 233
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.30365,
+      "grad_norm": 0.14631430804729462,
+      "learning_rate": 9.918020943521427e-06,
+      "loss": 4.79,
+      "step": 234
+    },
+    {
+      "batch_num_effect_tokens": 7846,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8080,
+      "epoch": 0.30495,
+      "grad_norm": 0.13547104597091675,
+      "learning_rate": 9.915964453997516e-06,
+      "loss": 4.9248,
+      "step": 235
+    },
+    {
+      "batch_num_effect_tokens": 7915,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 0.30624,
+      "grad_norm": 0.13369937241077423,
+      "learning_rate": 9.913882707469615e-06,
+      "loss": 4.9131,
+      "step": 236
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.30754,
+      "grad_norm": 0.14017365872859955,
+      "learning_rate": 9.911775714633218e-06,
+      "loss": 4.5908,
+      "step": 237
+    },
+    {
+      "batch_num_effect_tokens": 7915,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8078,
+      "epoch": 0.30884,
+      "grad_norm": 0.13819383084774017,
+      "learning_rate": 9.909643486313533e-06,
+      "loss": 4.9268,
+      "step": 238
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.31014,
+      "grad_norm": 0.14381656050682068,
+      "learning_rate": 9.907486033465421e-06,
+      "loss": 4.8018,
+      "step": 239
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.31144,
+      "grad_norm": 0.12868614494800568,
+      "learning_rate": 9.905303367173336e-06,
+      "loss": 4.8428,
+      "step": 240
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8166,
+      "epoch": 0.31273,
+      "grad_norm": 0.13400490581989288,
+      "learning_rate": 9.903095498651276e-06,
+      "loss": 4.8477,
+      "step": 241
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.31403,
+      "grad_norm": 0.14183661341667175,
+      "learning_rate": 9.900862439242719e-06,
+      "loss": 4.709,
+      "step": 242
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.31533,
+      "grad_norm": 0.1373617798089981,
+      "learning_rate": 9.898604200420573e-06,
+      "loss": 5.0449,
+      "step": 243
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.31663,
+      "grad_norm": 0.1389341652393341,
+      "learning_rate": 9.896320793787106e-06,
+      "loss": 4.9932,
+      "step": 244
+    },
+    {
+      "batch_num_effect_tokens": 7900,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8092,
+      "epoch": 0.31792,
+      "grad_norm": 0.1446686089038849,
+      "learning_rate": 9.894012231073895e-06,
+      "loss": 4.9473,
+      "step": 245
+    },
+    {
+      "batch_num_effect_tokens": 7948,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8080,
+      "epoch": 0.31922,
+      "grad_norm": 0.1249702200293541,
+      "learning_rate": 9.891678524141759e-06,
+      "loss": 4.7959,
+      "step": 246
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8161,
+      "epoch": 0.32052,
+      "grad_norm": 0.14360341429710388,
+      "learning_rate": 9.889319684980707e-06,
+      "loss": 5.1543,
+      "step": 247
+    },
+    {
+      "batch_num_effect_tokens": 7880,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8062,
+      "epoch": 0.32182,
+      "grad_norm": 0.1445484608411789,
+      "learning_rate": 9.886935725709868e-06,
+      "loss": 4.9531,
+      "step": 248
+    },
+    {
+      "batch_num_effect_tokens": 7734,
+      "batch_num_samples": 28,
+      "batch_num_tokens": 8008,
+      "epoch": 0.32311,
+      "grad_norm": 0.1418876349925995,
+      "learning_rate": 9.884526658577433e-06,
+      "loss": 4.9629,
+      "step": 249
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.32441,
+      "grad_norm": 0.14063134789466858,
+      "learning_rate": 9.882092495960589e-06,
+      "loss": 5.0117,
+      "step": 250
+    },
+    {
+      "batch_num_effect_tokens": 7890,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8086,
+      "epoch": 0.32571,
+      "grad_norm": 0.13051624596118927,
+      "learning_rate": 9.87963325036546e-06,
+      "loss": 4.5283,
+      "step": 251
+    },
+    {
+      "batch_num_effect_tokens": 8060,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.32701,
+      "grad_norm": 0.14672017097473145,
+      "learning_rate": 9.877148934427037e-06,
+      "loss": 4.4834,
+      "step": 252
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.3283,
+      "grad_norm": 0.13878877460956573,
+      "learning_rate": 9.874639560909118e-06,
+      "loss": 4.6523,
+      "step": 253
+    },
+    {
+      "batch_num_effect_tokens": 7886,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8100,
+      "epoch": 0.3296,
+      "grad_norm": 0.13505133986473083,
+      "learning_rate": 9.872105142704245e-06,
+      "loss": 4.8672,
+      "step": 254
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.3309,
+      "grad_norm": 0.1461716592311859,
+      "learning_rate": 9.869545692833624e-06,
+      "loss": 4.5898,
+      "step": 255
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.3322,
+      "grad_norm": 0.16102413833141327,
+      "learning_rate": 9.866961224447076e-06,
+      "loss": 4.7529,
+      "step": 256
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.3335,
+      "grad_norm": 0.13654085993766785,
+      "learning_rate": 9.864351750822957e-06,
+      "loss": 4.6143,
+      "step": 257
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.33479,
+      "grad_norm": 0.13663263618946075,
+      "learning_rate": 9.86171728536809e-06,
+      "loss": 4.9082,
+      "step": 258
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.33609,
+      "grad_norm": 0.13321144878864288,
+      "learning_rate": 9.859057841617709e-06,
+      "loss": 5.084,
+      "step": 259
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.33739,
+      "grad_norm": 0.13456861674785614,
+      "learning_rate": 9.856373433235373e-06,
+      "loss": 4.8818,
+      "step": 260
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.33869,
+      "grad_norm": 0.13602375984191895,
+      "learning_rate": 9.853664074012907e-06,
+      "loss": 5.0449,
+      "step": 261
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.33998,
+      "grad_norm": 0.13534726202487946,
+      "learning_rate": 9.850929777870324e-06,
+      "loss": 4.9688,
+      "step": 262
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8160,
+      "epoch": 0.34128,
+      "grad_norm": 0.14672520756721497,
+      "learning_rate": 9.848170558855757e-06,
+      "loss": 4.6787,
+      "step": 263
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.34258,
+      "grad_norm": 0.12731723487377167,
+      "learning_rate": 9.84538643114539e-06,
+      "loss": 4.998,
+      "step": 264
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.34388,
+      "grad_norm": 0.12880627810955048,
+      "learning_rate": 9.84257740904338e-06,
+      "loss": 4.9824,
+      "step": 265
+    },
+    {
+      "batch_num_effect_tokens": 7977,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 0.34517,
+      "grad_norm": 0.13919463753700256,
+      "learning_rate": 9.839743506981783e-06,
+      "loss": 4.5137,
+      "step": 266
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.34647,
+      "grad_norm": 0.14044621586799622,
+      "learning_rate": 9.836884739520482e-06,
+      "loss": 4.8906,
+      "step": 267
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 0.34777,
+      "grad_norm": 0.14072751998901367,
+      "learning_rate": 9.83400112134712e-06,
+      "loss": 5.084,
+      "step": 268
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.34907,
+      "grad_norm": 0.13198843598365784,
+      "learning_rate": 9.831092667277002e-06,
+      "loss": 4.7402,
+      "step": 269
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.35036,
+      "grad_norm": 0.14157812297344208,
+      "learning_rate": 9.828159392253051e-06,
+      "loss": 4.8887,
+      "step": 270
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.35166,
+      "grad_norm": 0.13811397552490234,
+      "learning_rate": 9.8252013113457e-06,
+      "loss": 4.8408,
+      "step": 271
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.35296,
+      "grad_norm": 0.141897514462471,
+      "learning_rate": 9.822218439752835e-06,
+      "loss": 4.8301,
+      "step": 272
+    },
+    {
+      "batch_num_effect_tokens": 7977,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 0.35426,
+      "grad_norm": 0.1375928372144699,
+      "learning_rate": 9.819210792799711e-06,
+      "loss": 4.8154,
+      "step": 273
+    },
+    {
+      "batch_num_effect_tokens": 7896,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8096,
+      "epoch": 0.35556,
+      "grad_norm": 0.14474721252918243,
+      "learning_rate": 9.816178385938867e-06,
+      "loss": 4.959,
+      "step": 274
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.35685,
+      "grad_norm": 0.14300163090229034,
+      "learning_rate": 9.81312123475006e-06,
+      "loss": 4.9746,
+      "step": 275
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8173,
+      "epoch": 0.35815,
+      "grad_norm": 0.1514219492673874,
+      "learning_rate": 9.810039354940172e-06,
+      "loss": 4.9414,
+      "step": 276
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8145,
+      "epoch": 0.35945,
+      "grad_norm": 0.13827340304851532,
+      "learning_rate": 9.806932762343136e-06,
+      "loss": 4.9424,
+      "step": 277
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8152,
+      "epoch": 0.36075,
+      "grad_norm": 0.14099182188510895,
+      "learning_rate": 9.80380147291985e-06,
+      "loss": 5.1465,
+      "step": 278
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.36204,
+      "grad_norm": 0.12410301715135574,
+      "learning_rate": 9.800645502758104e-06,
+      "loss": 4.9053,
+      "step": 279
+    },
+    {
+      "batch_num_effect_tokens": 7958,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8101,
+      "epoch": 0.36334,
+      "grad_norm": 0.1367730349302292,
+      "learning_rate": 9.797464868072489e-06,
+      "loss": 4.6543,
+      "step": 280
+    },
+    {
+      "batch_num_effect_tokens": 7939,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8146,
+      "epoch": 0.36464,
+      "grad_norm": 0.1414947360754013,
+      "learning_rate": 9.794259585204313e-06,
+      "loss": 4.9229,
+      "step": 281
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.36594,
+      "grad_norm": 0.12790058553218842,
+      "learning_rate": 9.791029670621525e-06,
+      "loss": 4.9121,
+      "step": 282
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.36723,
+      "grad_norm": 0.13167431950569153,
+      "learning_rate": 9.787775140918625e-06,
+      "loss": 4.918,
+      "step": 283
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.36853,
+      "grad_norm": 0.14066706597805023,
+      "learning_rate": 9.784496012816574e-06,
+      "loss": 4.8828,
+      "step": 284
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 0.36983,
+      "grad_norm": 0.14358888566493988,
+      "learning_rate": 9.781192303162721e-06,
+      "loss": 4.7529,
+      "step": 285
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.37113,
+      "grad_norm": 0.14802835881710052,
+      "learning_rate": 9.777864028930705e-06,
+      "loss": 4.8633,
+      "step": 286
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8146,
+      "epoch": 0.37242,
+      "grad_norm": 0.14124155044555664,
+      "learning_rate": 9.774511207220369e-06,
+      "loss": 4.8584,
+      "step": 287
+    },
+    {
+      "batch_num_effect_tokens": 7869,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 0.37372,
+      "grad_norm": 0.13983023166656494,
+      "learning_rate": 9.771133855257684e-06,
+      "loss": 5.127,
+      "step": 288
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.37502,
+      "grad_norm": 0.12876510620117188,
+      "learning_rate": 9.767731990394638e-06,
+      "loss": 4.8506,
+      "step": 289
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.37632,
+      "grad_norm": 0.13428495824337006,
+      "learning_rate": 9.764305630109174e-06,
+      "loss": 4.8955,
+      "step": 290
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8155,
+      "epoch": 0.37762,
+      "grad_norm": 0.12863335013389587,
+      "learning_rate": 9.760854792005075e-06,
+      "loss": 5.1689,
+      "step": 291
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.37891,
+      "grad_norm": 0.13183218240737915,
+      "learning_rate": 9.757379493811892e-06,
+      "loss": 4.8193,
+      "step": 292
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.38021,
+      "grad_norm": 0.1384708732366562,
+      "learning_rate": 9.753879753384845e-06,
+      "loss": 4.8105,
+      "step": 293
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.38151,
+      "grad_norm": 0.1356097161769867,
+      "learning_rate": 9.750355588704728e-06,
+      "loss": 4.9473,
+      "step": 294
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.38281,
+      "grad_norm": 0.13910697400569916,
+      "learning_rate": 9.746807017877823e-06,
+      "loss": 4.9854,
+      "step": 295
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 0.3841,
+      "grad_norm": 0.13383249938488007,
+      "learning_rate": 9.743234059135812e-06,
+      "loss": 4.8418,
+      "step": 296
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8164,
+      "epoch": 0.3854,
+      "grad_norm": 0.1411924511194229,
+      "learning_rate": 9.73963673083566e-06,
+      "loss": 4.9199,
+      "step": 297
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.3867,
+      "grad_norm": 0.1354569047689438,
+      "learning_rate": 9.736015051459551e-06,
+      "loss": 4.6748,
+      "step": 298
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.388,
+      "grad_norm": 0.138069286942482,
+      "learning_rate": 9.732369039614774e-06,
+      "loss": 4.8672,
+      "step": 299
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.38929,
+      "grad_norm": 0.137836754322052,
+      "learning_rate": 9.728698714033631e-06,
+      "loss": 5.0059,
+      "step": 300
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.39059,
+      "grad_norm": 0.14079798758029938,
+      "learning_rate": 9.725004093573343e-06,
+      "loss": 5.0039,
+      "step": 301
+    },
+    {
+      "batch_num_effect_tokens": 7842,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 7989,
+      "epoch": 0.39189,
+      "grad_norm": 0.13341772556304932,
+      "learning_rate": 9.721285197215954e-06,
+      "loss": 4.8281,
+      "step": 302
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 0.39319,
+      "grad_norm": 0.14130514860153198,
+      "learning_rate": 9.717542044068224e-06,
+      "loss": 4.6729,
+      "step": 303
+    },
+    {
+      "batch_num_effect_tokens": 7882,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8105,
+      "epoch": 0.39448,
+      "grad_norm": 0.1398962289094925,
+      "learning_rate": 9.71377465336155e-06,
+      "loss": 4.6582,
+      "step": 304
+    },
+    {
+      "batch_num_effect_tokens": 7844,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8080,
+      "epoch": 0.39578,
+      "grad_norm": 0.1334947943687439,
+      "learning_rate": 9.709983044451847e-06,
+      "loss": 4.6211,
+      "step": 305
+    },
+    {
+      "batch_num_effect_tokens": 7944,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 0.39708,
+      "grad_norm": 0.1320602148771286,
+      "learning_rate": 9.70616723681946e-06,
+      "loss": 4.7031,
+      "step": 306
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.39838,
+      "grad_norm": 0.13198183476924896,
+      "learning_rate": 9.702327250069058e-06,
+      "loss": 4.5635,
+      "step": 307
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.39968,
+      "grad_norm": 0.13476605713367462,
+      "learning_rate": 9.698463103929542e-06,
+      "loss": 4.9883,
+      "step": 308
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.39968,
+      "eval_eval_loss": 0.6101124882698059,
+      "eval_eval_runtime": 115.2852,
+      "eval_eval_samples_per_second": 43.371,
+      "eval_eval_steps_per_second": 2.715,
+      "step": 308
+    },
+    {
+      "batch_num_effect_tokens": 7840,
+      "batch_num_samples": 28,
+      "batch_num_tokens": 8104,
+      "epoch": 0.40097,
+      "grad_norm": 0.13941439986228943,
+      "learning_rate": 9.694574818253935e-06,
+      "loss": 4.916,
+      "step": 309
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.40227,
+      "grad_norm": 0.13381004333496094,
+      "learning_rate": 9.69066241301928e-06,
+      "loss": 5.0781,
+      "step": 310
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.40357,
+      "grad_norm": 0.14835673570632935,
+      "learning_rate": 9.686725908326547e-06,
+      "loss": 4.8125,
+      "step": 311
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.40487,
+      "grad_norm": 0.1386537253856659,
+      "learning_rate": 9.682765324400514e-06,
+      "loss": 4.7021,
+      "step": 312
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8137,
+      "epoch": 0.40616,
+      "grad_norm": 0.13303305208683014,
+      "learning_rate": 9.67878068158968e-06,
+      "loss": 4.6924,
+      "step": 313
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.40746,
+      "grad_norm": 0.13811469078063965,
+      "learning_rate": 9.674772000366151e-06,
+      "loss": 4.8867,
+      "step": 314
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8176,
+      "epoch": 0.40876,
+      "grad_norm": 0.13623467087745667,
+      "learning_rate": 9.670739301325534e-06,
+      "loss": 4.7764,
+      "step": 315
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.41006,
+      "grad_norm": 0.13184677064418793,
+      "learning_rate": 9.666682605186834e-06,
+      "loss": 4.6846,
+      "step": 316
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8190,
+      "epoch": 0.41135,
+      "grad_norm": 0.13243718445301056,
+      "learning_rate": 9.662601932792349e-06,
+      "loss": 4.5635,
+      "step": 317
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8104,
+      "epoch": 0.41265,
+      "grad_norm": 0.13839715719223022,
+      "learning_rate": 9.658497305107559e-06,
+      "loss": 4.8477,
+      "step": 318
+    },
+    {
+      "batch_num_effect_tokens": 7927,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8086,
+      "epoch": 0.41395,
+      "grad_norm": 0.1342511624097824,
+      "learning_rate": 9.654368743221022e-06,
+      "loss": 4.8398,
+      "step": 319
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.41525,
+      "grad_norm": 0.1273842304944992,
+      "learning_rate": 9.650216268344263e-06,
+      "loss": 4.4043,
+      "step": 320
+    },
+    {
+      "batch_num_effect_tokens": 7888,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8086,
+      "epoch": 0.41655,
+      "grad_norm": 0.12917739152908325,
+      "learning_rate": 9.646039901811666e-06,
+      "loss": 4.6807,
+      "step": 321
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 0.41784,
+      "grad_norm": 0.13067729771137238,
+      "learning_rate": 9.641839665080363e-06,
+      "loss": 4.6602,
+      "step": 322
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8181,
+      "epoch": 0.41914,
+      "grad_norm": 0.13833428919315338,
+      "learning_rate": 9.63761557973013e-06,
+      "loss": 4.6309,
+      "step": 323
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8176,
+      "epoch": 0.42044,
+      "grad_norm": 0.14830631017684937,
+      "learning_rate": 9.633367667463267e-06,
+      "loss": 5.5645,
+      "step": 324
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.42174,
+      "grad_norm": 0.13653573393821716,
+      "learning_rate": 9.62909595010449e-06,
+      "loss": 5.4238,
+      "step": 325
+    },
+    {
+      "batch_num_effect_tokens": 7962,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8120,
+      "epoch": 0.42303,
+      "grad_norm": 0.13599801063537598,
+      "learning_rate": 9.624800449600826e-06,
+      "loss": 5.1523,
+      "step": 326
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.42433,
+      "grad_norm": 0.1389356106519699,
+      "learning_rate": 9.620481188021484e-06,
+      "loss": 4.9199,
+      "step": 327
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8166,
+      "epoch": 0.42563,
+      "grad_norm": 0.1340194195508957,
+      "learning_rate": 9.616138187557758e-06,
+      "loss": 4.7656,
+      "step": 328
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.42693,
+      "grad_norm": 0.1351163685321808,
+      "learning_rate": 9.611771470522908e-06,
+      "loss": 5.2266,
+      "step": 329
+    },
+    {
+      "batch_num_effect_tokens": 7892,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8137,
+      "epoch": 0.42822,
+      "grad_norm": 0.13661961257457733,
+      "learning_rate": 9.60738105935204e-06,
+      "loss": 5.0176,
+      "step": 330
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.42952,
+      "grad_norm": 0.1298064887523651,
+      "learning_rate": 9.602966976601995e-06,
+      "loss": 4.6572,
+      "step": 331
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8128,
+      "epoch": 0.43082,
+      "grad_norm": 0.13714361190795898,
+      "learning_rate": 9.598529244951233e-06,
+      "loss": 4.959,
+      "step": 332
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8186,
+      "epoch": 0.43212,
+      "grad_norm": 0.13826780021190643,
+      "learning_rate": 9.594067887199719e-06,
+      "loss": 4.875,
+      "step": 333
+    },
+    {
+      "batch_num_effect_tokens": 7877,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8100,
+      "epoch": 0.43341,
+      "grad_norm": 0.1306610256433487,
+      "learning_rate": 9.589582926268798e-06,
+      "loss": 4.2568,
+      "step": 334
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8079,
+      "epoch": 0.43471,
+      "grad_norm": 0.12845724821090698,
+      "learning_rate": 9.585074385201087e-06,
+      "loss": 4.8184,
+      "step": 335
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.43601,
+      "grad_norm": 0.13526244461536407,
+      "learning_rate": 9.580542287160348e-06,
+      "loss": 4.8398,
+      "step": 336
+    },
+    {
+      "batch_num_effect_tokens": 7987,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8186,
+      "epoch": 0.43731,
+      "grad_norm": 0.1280571073293686,
+      "learning_rate": 9.575986655431377e-06,
+      "loss": 4.7578,
+      "step": 337
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.43861,
+      "grad_norm": 0.12609054148197174,
+      "learning_rate": 9.571407513419878e-06,
+      "loss": 4.6699,
+      "step": 338
+    },
+    {
+      "batch_num_effect_tokens": 7953,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8104,
+      "epoch": 0.4399,
+      "grad_norm": 0.12861526012420654,
+      "learning_rate": 9.566804884652342e-06,
+      "loss": 4.9395,
+      "step": 339
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8191,
+      "epoch": 0.4412,
+      "grad_norm": 0.15593406558036804,
+      "learning_rate": 9.562178792775936e-06,
+      "loss": 4.7197,
+      "step": 340
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 0.4425,
+      "grad_norm": 0.1305861622095108,
+      "learning_rate": 9.557529261558367e-06,
+      "loss": 4.8428,
+      "step": 341
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.4438,
+      "grad_norm": 0.13278287649154663,
+      "learning_rate": 9.552856314887772e-06,
+      "loss": 4.7871,
+      "step": 342
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.44509,
+      "grad_norm": 0.12788629531860352,
+      "learning_rate": 9.548159976772593e-06,
+      "loss": 4.709,
+      "step": 343
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.44639,
+      "grad_norm": 0.12882810831069946,
+      "learning_rate": 9.543440271341445e-06,
+      "loss": 4.9229,
+      "step": 344
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.44769,
+      "grad_norm": 0.1309846192598343,
+      "learning_rate": 9.538697222843004e-06,
+      "loss": 4.8623,
+      "step": 345
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.44899,
+      "grad_norm": 0.12217242270708084,
+      "learning_rate": 9.533930855645872e-06,
+      "loss": 4.7715,
+      "step": 346
+    },
+    {
+      "batch_num_effect_tokens": 7944,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8112,
+      "epoch": 0.45028,
+      "grad_norm": 0.13333797454833984,
+      "learning_rate": 9.529141194238462e-06,
+      "loss": 4.8975,
+      "step": 347
+    },
+    {
+      "batch_num_effect_tokens": 7923,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8110,
+      "epoch": 0.45158,
+      "grad_norm": 0.12796379625797272,
+      "learning_rate": 9.524328263228866e-06,
+      "loss": 4.8311,
+      "step": 348
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.45288,
+      "grad_norm": 0.13038370013237,
+      "learning_rate": 9.519492087344724e-06,
+      "loss": 4.708,
+      "step": 349
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.45418,
+      "grad_norm": 0.14246924221515656,
+      "learning_rate": 9.514632691433108e-06,
+      "loss": 4.5479,
+      "step": 350
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.45547,
+      "grad_norm": 0.13306821882724762,
+      "learning_rate": 9.509750100460384e-06,
+      "loss": 4.7334,
+      "step": 351
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.45677,
+      "grad_norm": 0.12792479991912842,
+      "learning_rate": 9.504844339512096e-06,
+      "loss": 4.626,
+      "step": 352
+    },
+    {
+      "batch_num_effect_tokens": 8073,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.45807,
+      "grad_norm": 0.13247136771678925,
+      "learning_rate": 9.499915433792823e-06,
+      "loss": 4.9121,
+      "step": 353
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.45937,
+      "grad_norm": 0.13608035445213318,
+      "learning_rate": 9.494963408626056e-06,
+      "loss": 5.0977,
+      "step": 354
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.46067,
+      "grad_norm": 0.1288149058818817,
+      "learning_rate": 9.489988289454073e-06,
+      "loss": 4.7832,
+      "step": 355
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.46196,
+      "grad_norm": 0.13947491347789764,
+      "learning_rate": 9.484990101837798e-06,
+      "loss": 4.625,
+      "step": 356
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.46326,
+      "grad_norm": 0.13044162094593048,
+      "learning_rate": 9.47996887145668e-06,
+      "loss": 4.6738,
+      "step": 357
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8144,
+      "epoch": 0.46456,
+      "grad_norm": 0.13908232748508453,
+      "learning_rate": 9.47492462410855e-06,
+      "loss": 4.6367,
+      "step": 358
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 0.46586,
+      "grad_norm": 0.13168592751026154,
+      "learning_rate": 9.469857385709498e-06,
+      "loss": 4.7568,
+      "step": 359
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8107,
+      "epoch": 0.46715,
+      "grad_norm": 0.13281431794166565,
+      "learning_rate": 9.46476718229374e-06,
+      "loss": 4.833,
+      "step": 360
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.46845,
+      "grad_norm": 0.13974054157733917,
+      "learning_rate": 9.45965404001347e-06,
+      "loss": 4.6826,
+      "step": 361
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.46975,
+      "grad_norm": 0.1310376226902008,
+      "learning_rate": 9.454517985138748e-06,
+      "loss": 4.626,
+      "step": 362
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8128,
+      "epoch": 0.47105,
+      "grad_norm": 0.12379533797502518,
+      "learning_rate": 9.449359044057344e-06,
+      "loss": 4.4814,
+      "step": 363
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 0.47234,
+      "grad_norm": 0.13945648074150085,
+      "learning_rate": 9.444177243274619e-06,
+      "loss": 4.6367,
+      "step": 364
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.47364,
+      "grad_norm": 0.12774516642093658,
+      "learning_rate": 9.438972609413376e-06,
+      "loss": 4.7061,
+      "step": 365
+    },
+    {
+      "batch_num_effect_tokens": 7941,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8162,
+      "epoch": 0.47494,
+      "grad_norm": 0.12300989776849747,
+      "learning_rate": 9.433745169213729e-06,
+      "loss": 4.6963,
+      "step": 366
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8176,
+      "epoch": 0.47624,
+      "grad_norm": 0.1422237604856491,
+      "learning_rate": 9.428494949532972e-06,
+      "loss": 5.0645,
+      "step": 367
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8186,
+      "epoch": 0.47753,
+      "grad_norm": 0.14680571854114532,
+      "learning_rate": 9.423221977345425e-06,
+      "loss": 4.583,
+      "step": 368
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.47883,
+      "grad_norm": 0.14597126841545105,
+      "learning_rate": 9.41792627974231e-06,
+      "loss": 4.7686,
+      "step": 369
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.48013,
+      "grad_norm": 0.13670004904270172,
+      "learning_rate": 9.412607883931608e-06,
+      "loss": 4.7676,
+      "step": 370
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.48143,
+      "grad_norm": 0.1419769525527954,
+      "learning_rate": 9.40726681723791e-06,
+      "loss": 4.9912,
+      "step": 371
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8184,
+      "epoch": 0.48273,
+      "grad_norm": 0.13378414511680603,
+      "learning_rate": 9.401903107102295e-06,
+      "loss": 4.8301,
+      "step": 372
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.48402,
+      "grad_norm": 0.12032909691333771,
+      "learning_rate": 9.396516781082172e-06,
+      "loss": 4.9736,
+      "step": 373
+    },
+    {
+      "batch_num_effect_tokens": 7962,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8181,
+      "epoch": 0.48532,
+      "grad_norm": 0.14288964867591858,
+      "learning_rate": 9.391107866851143e-06,
+      "loss": 5.4004,
+      "step": 374
+    },
+    {
+      "batch_num_effect_tokens": 7887,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8086,
+      "epoch": 0.48662,
+      "grad_norm": 0.13049355149269104,
+      "learning_rate": 9.385676392198869e-06,
+      "loss": 4.8486,
+      "step": 375
+    },
+    {
+      "batch_num_effect_tokens": 7965,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8177,
+      "epoch": 0.48792,
+      "grad_norm": 0.12765397131443024,
+      "learning_rate": 9.380222385030916e-06,
+      "loss": 4.8682,
+      "step": 376
+    },
+    {
+      "batch_num_effect_tokens": 7958,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8133,
+      "epoch": 0.48921,
+      "grad_norm": 0.12337980419397354,
+      "learning_rate": 9.374745873368614e-06,
+      "loss": 4.6826,
+      "step": 377
+    },
+    {
+      "batch_num_effect_tokens": 8071,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.49051,
+      "grad_norm": 0.12957154214382172,
+      "learning_rate": 9.369246885348926e-06,
+      "loss": 5.0645,
+      "step": 378
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8155,
+      "epoch": 0.49181,
+      "grad_norm": 0.13127748668193817,
+      "learning_rate": 9.363725449224281e-06,
+      "loss": 4.9531,
+      "step": 379
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.49311,
+      "grad_norm": 0.13518795371055603,
+      "learning_rate": 9.35818159336245e-06,
+      "loss": 4.9785,
+      "step": 380
+    },
+    {
+      "batch_num_effect_tokens": 7961,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8108,
+      "epoch": 0.4944,
+      "grad_norm": 0.13270984590053558,
+      "learning_rate": 9.352615346246383e-06,
+      "loss": 4.8457,
+      "step": 381
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.4957,
+      "grad_norm": 0.13498768210411072,
+      "learning_rate": 9.347026736474077e-06,
+      "loss": 4.6934,
+      "step": 382
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.497,
+      "grad_norm": 0.1299462616443634,
+      "learning_rate": 9.341415792758421e-06,
+      "loss": 5.041,
+      "step": 383
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8178,
+      "epoch": 0.4983,
+      "grad_norm": 0.14259278774261475,
+      "learning_rate": 9.33578254392705e-06,
+      "loss": 4.8867,
+      "step": 384
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.49959,
+      "grad_norm": 0.142298623919487,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 5.1357,
+      "step": 385
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.50089,
+      "grad_norm": 0.1336406022310257,
+      "learning_rate": 9.324449246800538e-06,
+      "loss": 4.9541,
+      "step": 386
+    },
+    {
+      "batch_num_effect_tokens": 8067,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.50219,
+      "grad_norm": 0.13553744554519653,
+      "learning_rate": 9.318749256733064e-06,
+      "loss": 5.0166,
+      "step": 387
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.50349,
+      "grad_norm": 0.12613889575004578,
+      "learning_rate": 9.313027078004903e-06,
+      "loss": 4.8721,
+      "step": 388
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8126,
+      "epoch": 0.50479,
+      "grad_norm": 0.1368468850851059,
+      "learning_rate": 9.307282740015192e-06,
+      "loss": 5.2559,
+      "step": 389
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 0.50608,
+      "grad_norm": 0.14101892709732056,
+      "learning_rate": 9.301516272276907e-06,
+      "loss": 4.7598,
+      "step": 390
+    },
+    {
+      "batch_num_effect_tokens": 7966,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8101,
+      "epoch": 0.50738,
+      "grad_norm": 0.1324855089187622,
+      "learning_rate": 9.295727704416731e-06,
+      "loss": 5.0908,
+      "step": 391
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.50868,
+      "grad_norm": 0.13836555182933807,
+      "learning_rate": 9.289917066174887e-06,
+      "loss": 4.9092,
+      "step": 392
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.50998,
+      "grad_norm": 0.1290932595729828,
+      "learning_rate": 9.284084387404985e-06,
+      "loss": 5.0156,
+      "step": 393
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 0.51127,
+      "grad_norm": 0.13208125531673431,
+      "learning_rate": 9.278229698073889e-06,
+      "loss": 4.7783,
+      "step": 394
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8188,
+      "epoch": 0.51257,
+      "grad_norm": 0.12734545767307281,
+      "learning_rate": 9.27235302826153e-06,
+      "loss": 4.6992,
+      "step": 395
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.51387,
+      "grad_norm": 0.1279255449771881,
+      "learning_rate": 9.266454408160779e-06,
+      "loss": 4.3008,
+      "step": 396
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.51517,
+      "grad_norm": 0.13789525628089905,
+      "learning_rate": 9.260533868077283e-06,
+      "loss": 4.7852,
+      "step": 397
+    },
+    {
+      "batch_num_effect_tokens": 7919,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8076,
+      "epoch": 0.51646,
+      "grad_norm": 0.12028060853481293,
+      "learning_rate": 9.254591438429305e-06,
+      "loss": 4.7539,
+      "step": 398
+    },
+    {
+      "batch_num_effect_tokens": 8069,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.51776,
+      "grad_norm": 0.12482751905918121,
+      "learning_rate": 9.248627149747573e-06,
+      "loss": 4.7402,
+      "step": 399
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.51906,
+      "grad_norm": 0.13144247233867645,
+      "learning_rate": 9.242641032675118e-06,
+      "loss": 4.7803,
+      "step": 400
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.52036,
+      "grad_norm": 0.13110798597335815,
+      "learning_rate": 9.236633117967125e-06,
+      "loss": 4.6787,
+      "step": 401
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8136,
+      "epoch": 0.52165,
+      "grad_norm": 0.13168026506900787,
+      "learning_rate": 9.230603436490764e-06,
+      "loss": 4.8691,
+      "step": 402
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.52295,
+      "grad_norm": 0.14049184322357178,
+      "learning_rate": 9.224552019225044e-06,
+      "loss": 4.9766,
+      "step": 403
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.52425,
+      "grad_norm": 0.13541923463344574,
+      "learning_rate": 9.21847889726064e-06,
+      "loss": 4.6348,
+      "step": 404
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8158,
+      "epoch": 0.52555,
+      "grad_norm": 0.13659295439720154,
+      "learning_rate": 9.212384101799748e-06,
+      "loss": 5.1406,
+      "step": 405
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8188,
+      "epoch": 0.52685,
+      "grad_norm": 0.1369631290435791,
+      "learning_rate": 9.206267664155906e-06,
+      "loss": 4.7207,
+      "step": 406
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8128,
+      "epoch": 0.52814,
+      "grad_norm": 0.12341079860925674,
+      "learning_rate": 9.200129615753858e-06,
+      "loss": 4.6543,
+      "step": 407
+    },
+    {
+      "batch_num_effect_tokens": 7884,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8080,
+      "epoch": 0.52944,
+      "grad_norm": 0.1462087631225586,
+      "learning_rate": 9.193969988129367e-06,
+      "loss": 4.8408,
+      "step": 408
+    },
+    {
+      "batch_num_effect_tokens": 7955,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8119,
+      "epoch": 0.53074,
+      "grad_norm": 0.136996328830719,
+      "learning_rate": 9.187788812929074e-06,
+      "loss": 4.9297,
+      "step": 409
+    },
+    {
+      "batch_num_effect_tokens": 7812,
+      "batch_num_samples": 32,
+      "batch_num_tokens": 8076,
+      "epoch": 0.53204,
+      "grad_norm": 0.1477639228105545,
+      "learning_rate": 9.181586121910317e-06,
+      "loss": 4.9512,
+      "step": 410
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8136,
+      "epoch": 0.53333,
+      "grad_norm": 0.13376633822917938,
+      "learning_rate": 9.175361946940983e-06,
+      "loss": 4.9346,
+      "step": 411
+    },
+    {
+      "batch_num_effect_tokens": 7993,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.53463,
+      "grad_norm": 0.12200411409139633,
+      "learning_rate": 9.169116319999336e-06,
+      "loss": 4.5762,
+      "step": 412
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.53593,
+      "grad_norm": 0.14485883712768555,
+      "learning_rate": 9.162849273173857e-06,
+      "loss": 4.7148,
+      "step": 413
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.53723,
+      "grad_norm": 0.12444700300693512,
+      "learning_rate": 9.156560838663076e-06,
+      "loss": 4.5879,
+      "step": 414
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.53852,
+      "grad_norm": 0.13118426501750946,
+      "learning_rate": 9.150251048775403e-06,
+      "loss": 4.6113,
+      "step": 415
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.53982,
+      "grad_norm": 0.12771986424922943,
+      "learning_rate": 9.143919935928975e-06,
+      "loss": 4.8223,
+      "step": 416
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.54112,
+      "grad_norm": 0.13158395886421204,
+      "learning_rate": 9.137567532651477e-06,
+      "loss": 4.6729,
+      "step": 417
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.54242,
+      "grad_norm": 0.13526186347007751,
+      "learning_rate": 9.131193871579975e-06,
+      "loss": 4.4736,
+      "step": 418
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8100,
+      "epoch": 0.54371,
+      "grad_norm": 0.14715400338172913,
+      "learning_rate": 9.124798985460759e-06,
+      "loss": 4.917,
+      "step": 419
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8101,
+      "epoch": 0.54501,
+      "grad_norm": 0.12297821789979935,
+      "learning_rate": 9.118382907149164e-06,
+      "loss": 4.8252,
+      "step": 420
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.54631,
+      "grad_norm": 0.14057192206382751,
+      "learning_rate": 9.111945669609408e-06,
+      "loss": 4.5547,
+      "step": 421
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8181,
+      "epoch": 0.54761,
+      "grad_norm": 0.1264130175113678,
+      "learning_rate": 9.105487305914415e-06,
+      "loss": 4.6621,
+      "step": 422
+    },
+    {
+      "batch_num_effect_tokens": 7898,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8126,
+      "epoch": 0.54891,
+      "grad_norm": 0.12750910222530365,
+      "learning_rate": 9.099007849245656e-06,
+      "loss": 4.7354,
+      "step": 423
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.5502,
+      "grad_norm": 0.12850888073444366,
+      "learning_rate": 9.092507332892968e-06,
+      "loss": 4.5928,
+      "step": 424
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.5515,
+      "grad_norm": 0.1398119032382965,
+      "learning_rate": 9.08598579025439e-06,
+      "loss": 5.082,
+      "step": 425
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.5528,
+      "grad_norm": 0.12561438977718353,
+      "learning_rate": 9.079443254835987e-06,
+      "loss": 4.8418,
+      "step": 426
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8177,
+      "epoch": 0.5541,
+      "grad_norm": 0.13627532124519348,
+      "learning_rate": 9.07287976025168e-06,
+      "loss": 4.748,
+      "step": 427
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.55539,
+      "grad_norm": 0.1425987184047699,
+      "learning_rate": 9.066295340223073e-06,
+      "loss": 4.8652,
+      "step": 428
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.55669,
+      "grad_norm": 0.12989814579486847,
+      "learning_rate": 9.059690028579285e-06,
+      "loss": 4.5225,
+      "step": 429
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.55799,
+      "grad_norm": 0.12423403561115265,
+      "learning_rate": 9.05306385925676e-06,
+      "loss": 4.8164,
+      "step": 430
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.55929,
+      "grad_norm": 0.1329265534877777,
+      "learning_rate": 9.04641686629911e-06,
+      "loss": 4.667,
+      "step": 431
+    },
+    {
+      "batch_num_effect_tokens": 7906,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8086,
+      "epoch": 0.56058,
+      "grad_norm": 0.13709376752376556,
+      "learning_rate": 9.039749083856938e-06,
+      "loss": 4.6504,
+      "step": 432
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.56188,
+      "grad_norm": 0.1327197253704071,
+      "learning_rate": 9.033060546187651e-06,
+      "loss": 4.9004,
+      "step": 433
+    },
+    {
+      "batch_num_effect_tokens": 7986,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.56318,
+      "grad_norm": 0.12089107930660248,
+      "learning_rate": 9.026351287655294e-06,
+      "loss": 4.6582,
+      "step": 434
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.56448,
+      "grad_norm": 0.12628640234470367,
+      "learning_rate": 9.019621342730369e-06,
+      "loss": 4.7559,
+      "step": 435
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.56577,
+      "grad_norm": 0.12535719573497772,
+      "learning_rate": 9.012870745989663e-06,
+      "loss": 4.7764,
+      "step": 436
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.56707,
+      "grad_norm": 0.12927192449569702,
+      "learning_rate": 9.006099532116066e-06,
+      "loss": 4.6074,
+      "step": 437
+    },
+    {
+      "batch_num_effect_tokens": 7960,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8146,
+      "epoch": 0.56837,
+      "grad_norm": 0.12784621119499207,
+      "learning_rate": 8.999307735898389e-06,
+      "loss": 4.3076,
+      "step": 438
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8186,
+      "epoch": 0.56967,
+      "grad_norm": 0.12837977707386017,
+      "learning_rate": 8.992495392231195e-06,
+      "loss": 4.6992,
+      "step": 439
+    },
+    {
+      "batch_num_effect_tokens": 7883,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8091,
+      "epoch": 0.57097,
+      "grad_norm": 0.1314186155796051,
+      "learning_rate": 8.985662536114614e-06,
+      "loss": 4.6367,
+      "step": 440
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8144,
+      "epoch": 0.57226,
+      "grad_norm": 0.13375988602638245,
+      "learning_rate": 8.978809202654161e-06,
+      "loss": 4.8691,
+      "step": 441
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.57356,
+      "grad_norm": 0.1275053322315216,
+      "learning_rate": 8.971935427060563e-06,
+      "loss": 4.6514,
+      "step": 442
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8169,
+      "epoch": 0.57486,
+      "grad_norm": 0.14429806172847748,
+      "learning_rate": 8.965041244649572e-06,
+      "loss": 5.0264,
+      "step": 443
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8093,
+      "epoch": 0.57616,
+      "grad_norm": 0.13653282821178436,
+      "learning_rate": 8.95812669084178e-06,
+      "loss": 4.5127,
+      "step": 444
+    },
+    {
+      "batch_num_effect_tokens": 7938,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8093,
+      "epoch": 0.57745,
+      "grad_norm": 0.1339128315448761,
+      "learning_rate": 8.951191801162453e-06,
+      "loss": 4.4707,
+      "step": 445
+    },
+    {
+      "batch_num_effect_tokens": 7966,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.57875,
+      "grad_norm": 0.12954694032669067,
+      "learning_rate": 8.944236611241323e-06,
+      "loss": 4.8291,
+      "step": 446
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.58005,
+      "grad_norm": 0.13166210055351257,
+      "learning_rate": 8.937261156812436e-06,
+      "loss": 4.7471,
+      "step": 447
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.58135,
+      "grad_norm": 0.12804020941257477,
+      "learning_rate": 8.930265473713939e-06,
+      "loss": 4.7012,
+      "step": 448
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 0.58264,
+      "grad_norm": 0.13141286373138428,
+      "learning_rate": 8.923249597887913e-06,
+      "loss": 4.7891,
+      "step": 449
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.58394,
+      "grad_norm": 0.12595658004283905,
+      "learning_rate": 8.916213565380188e-06,
+      "loss": 5.0732,
+      "step": 450
+    },
+    {
+      "batch_num_effect_tokens": 8005,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.58524,
+      "grad_norm": 0.1271679848432541,
+      "learning_rate": 8.90915741234015e-06,
+      "loss": 4.5,
+      "step": 451
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.58654,
+      "grad_norm": 0.13062655925750732,
+      "learning_rate": 8.902081175020558e-06,
+      "loss": 4.8711,
+      "step": 452
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8165,
+      "epoch": 0.58783,
+      "grad_norm": 0.12481129914522171,
+      "learning_rate": 8.894984889777365e-06,
+      "loss": 4.8623,
+      "step": 453
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 0.58913,
+      "grad_norm": 0.1350601315498352,
+      "learning_rate": 8.88786859306952e-06,
+      "loss": 4.916,
+      "step": 454
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.59043,
+      "grad_norm": 0.12822680175304413,
+      "learning_rate": 8.880732321458785e-06,
+      "loss": 4.4541,
+      "step": 455
+    },
+    {
+      "batch_num_effect_tokens": 7981,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8160,
+      "epoch": 0.59173,
+      "grad_norm": 0.1304287612438202,
+      "learning_rate": 8.873576111609552e-06,
+      "loss": 4.8018,
+      "step": 456
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8155,
+      "epoch": 0.59303,
+      "grad_norm": 0.13626275956630707,
+      "learning_rate": 8.866400000288652e-06,
+      "loss": 4.9375,
+      "step": 457
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8173,
+      "epoch": 0.59432,
+      "grad_norm": 0.13052628934383392,
+      "learning_rate": 8.85920402436516e-06,
+      "loss": 4.9404,
+      "step": 458
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8111,
+      "epoch": 0.59562,
+      "grad_norm": 0.12803995609283447,
+      "learning_rate": 8.85198822081021e-06,
+      "loss": 4.7695,
+      "step": 459
+    },
+    {
+      "batch_num_effect_tokens": 7900,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8074,
+      "epoch": 0.59692,
+      "grad_norm": 0.12911297380924225,
+      "learning_rate": 8.84475262669681e-06,
+      "loss": 4.6729,
+      "step": 460
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.59822,
+      "grad_norm": 0.12205401062965393,
+      "learning_rate": 8.837497279199647e-06,
+      "loss": 4.5557,
+      "step": 461
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8143,
+      "epoch": 0.59951,
+      "grad_norm": 0.1380755603313446,
+      "learning_rate": 8.83022221559489e-06,
+      "loss": 5.0176,
+      "step": 462
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8143,
+      "epoch": 0.59951,
+      "eval_eval_loss": 0.597096860408783,
+      "eval_eval_runtime": 114.9903,
+      "eval_eval_samples_per_second": 43.482,
+      "eval_eval_steps_per_second": 2.722,
+      "step": 462
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.60081,
+      "grad_norm": 0.12681666016578674,
+      "learning_rate": 8.822927473260012e-06,
+      "loss": 4.998,
+      "step": 463
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.60211,
+      "grad_norm": 0.1322617530822754,
+      "learning_rate": 8.815613089673584e-06,
+      "loss": 4.9268,
+      "step": 464
+    },
+    {
+      "batch_num_effect_tokens": 7983,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8135,
+      "epoch": 0.60341,
+      "grad_norm": 0.13114838302135468,
+      "learning_rate": 8.808279102415093e-06,
+      "loss": 4.6543,
+      "step": 465
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8158,
+      "epoch": 0.6047,
+      "grad_norm": 0.12885698676109314,
+      "learning_rate": 8.800925549164742e-06,
+      "loss": 4.6309,
+      "step": 466
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8123,
+      "epoch": 0.606,
+      "grad_norm": 0.14466021955013275,
+      "learning_rate": 8.79355246770326e-06,
+      "loss": 4.5459,
+      "step": 467
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.6073,
+      "grad_norm": 0.1357675939798355,
+      "learning_rate": 8.786159895911712e-06,
+      "loss": 4.873,
+      "step": 468
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.6086,
+      "grad_norm": 0.1205708235502243,
+      "learning_rate": 8.778747871771293e-06,
+      "loss": 4.8271,
+      "step": 469
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.60989,
+      "grad_norm": 0.137266144156456,
+      "learning_rate": 8.771316433363139e-06,
+      "loss": 4.9111,
+      "step": 470
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.61119,
+      "grad_norm": 0.1284547597169876,
+      "learning_rate": 8.763865618868136e-06,
+      "loss": 4.7998,
+      "step": 471
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.61249,
+      "grad_norm": 0.12678979337215424,
+      "learning_rate": 8.756395466566718e-06,
+      "loss": 4.707,
+      "step": 472
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.61379,
+      "grad_norm": 0.1220792606472969,
+      "learning_rate": 8.748906014838672e-06,
+      "loss": 4.6953,
+      "step": 473
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8086,
+      "epoch": 0.61509,
+      "grad_norm": 0.1242067739367485,
+      "learning_rate": 8.74139730216294e-06,
+      "loss": 4.7539,
+      "step": 474
+    },
+    {
+      "batch_num_effect_tokens": 8079,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.61638,
+      "grad_norm": 0.1328115165233612,
+      "learning_rate": 8.73386936711742e-06,
+      "loss": 4.8418,
+      "step": 475
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8180,
+      "epoch": 0.61768,
+      "grad_norm": 0.1341410130262375,
+      "learning_rate": 8.726322248378775e-06,
+      "loss": 4.6699,
+      "step": 476
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.61898,
+      "grad_norm": 0.13784657418727875,
+      "learning_rate": 8.718755984722224e-06,
+      "loss": 4.7334,
+      "step": 477
+    },
+    {
+      "batch_num_effect_tokens": 7959,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8168,
+      "epoch": 0.62028,
+      "grad_norm": 0.1370074301958084,
+      "learning_rate": 8.71117061502135e-06,
+      "loss": 4.7334,
+      "step": 478
+    },
+    {
+      "batch_num_effect_tokens": 7899,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8078,
+      "epoch": 0.62157,
+      "grad_norm": 0.12483939528465271,
+      "learning_rate": 8.7035661782479e-06,
+      "loss": 4.6084,
+      "step": 479
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.62287,
+      "grad_norm": 0.14714112877845764,
+      "learning_rate": 8.695942713471578e-06,
+      "loss": 5.0137,
+      "step": 480
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.62417,
+      "grad_norm": 0.12480289489030838,
+      "learning_rate": 8.688300259859855e-06,
+      "loss": 4.5625,
+      "step": 481
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8177,
+      "epoch": 0.62547,
+      "grad_norm": 0.132341668009758,
+      "learning_rate": 8.680638856677754e-06,
+      "loss": 4.8096,
+      "step": 482
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8190,
+      "epoch": 0.62676,
+      "grad_norm": 0.1296975016593933,
+      "learning_rate": 8.672958543287666e-06,
+      "loss": 4.499,
+      "step": 483
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8146,
+      "epoch": 0.62806,
+      "grad_norm": 0.12684115767478943,
+      "learning_rate": 8.665259359149132e-06,
+      "loss": 4.9092,
+      "step": 484
+    },
+    {
+      "batch_num_effect_tokens": 7810,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 7989,
+      "epoch": 0.62936,
+      "grad_norm": 0.12434987723827362,
+      "learning_rate": 8.657541343818646e-06,
+      "loss": 4.5098,
+      "step": 485
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.63066,
+      "grad_norm": 0.13358746469020844,
+      "learning_rate": 8.649804536949453e-06,
+      "loss": 4.875,
+      "step": 486
+    },
+    {
+      "batch_num_effect_tokens": 7944,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8156,
+      "epoch": 0.63195,
+      "grad_norm": 0.13076664507389069,
+      "learning_rate": 8.642048978291347e-06,
+      "loss": 4.8301,
+      "step": 487
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.63325,
+      "grad_norm": 0.12482704967260361,
+      "learning_rate": 8.634274707690458e-06,
+      "loss": 5.208,
+      "step": 488
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 0.63455,
+      "grad_norm": 0.13764898478984833,
+      "learning_rate": 8.626481765089058e-06,
+      "loss": 5.4395,
+      "step": 489
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8155,
+      "epoch": 0.63585,
+      "grad_norm": 0.12434025853872299,
+      "learning_rate": 8.61867019052535e-06,
+      "loss": 4.8242,
+      "step": 490
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.63715,
+      "grad_norm": 0.13372331857681274,
+      "learning_rate": 8.610840024133266e-06,
+      "loss": 4.9395,
+      "step": 491
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.63844,
+      "grad_norm": 0.12320306152105331,
+      "learning_rate": 8.602991306142252e-06,
+      "loss": 4.4512,
+      "step": 492
+    },
+    {
+      "batch_num_effect_tokens": 7924,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8100,
+      "epoch": 0.63974,
+      "grad_norm": 0.13152460753917694,
+      "learning_rate": 8.595124076877074e-06,
+      "loss": 4.8301,
+      "step": 493
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.64104,
+      "grad_norm": 0.1355685591697693,
+      "learning_rate": 8.587238376757597e-06,
+      "loss": 4.7451,
+      "step": 494
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8100,
+      "epoch": 0.64234,
+      "grad_norm": 0.1284026801586151,
+      "learning_rate": 8.579334246298593e-06,
+      "loss": 4.9092,
+      "step": 495
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8130,
+      "epoch": 0.64363,
+      "grad_norm": 0.13228504359722137,
+      "learning_rate": 8.571411726109518e-06,
+      "loss": 4.5225,
+      "step": 496
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.64493,
+      "grad_norm": 0.14529111981391907,
+      "learning_rate": 8.563470856894316e-06,
+      "loss": 4.9707,
+      "step": 497
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8191,
+      "epoch": 0.64623,
+      "grad_norm": 0.13738293945789337,
+      "learning_rate": 8.555511679451197e-06,
+      "loss": 4.6738,
+      "step": 498
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.64753,
+      "grad_norm": 0.13121268153190613,
+      "learning_rate": 8.547534234672435e-06,
+      "loss": 4.4697,
+      "step": 499
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.64882,
+      "grad_norm": 0.1251576542854309,
+      "learning_rate": 8.539538563544165e-06,
+      "loss": 4.7764,
+      "step": 500
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.65012,
+      "grad_norm": 0.13326169550418854,
+      "learning_rate": 8.531524707146154e-06,
+      "loss": 4.6836,
+      "step": 501
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8144,
+      "epoch": 0.65142,
+      "grad_norm": 0.13999317586421967,
+      "learning_rate": 8.523492706651607e-06,
+      "loss": 4.9355,
+      "step": 502
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.65272,
+      "grad_norm": 0.12973648309707642,
+      "learning_rate": 8.515442603326948e-06,
+      "loss": 4.7969,
+      "step": 503
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.65401,
+      "grad_norm": 0.1382308453321457,
+      "learning_rate": 8.507374438531606e-06,
+      "loss": 5.1699,
+      "step": 504
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8152,
+      "epoch": 0.65531,
+      "grad_norm": 0.12315783649682999,
+      "learning_rate": 8.49928825371781e-06,
+      "loss": 4.8672,
+      "step": 505
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.65661,
+      "grad_norm": 0.14307758212089539,
+      "learning_rate": 8.491184090430365e-06,
+      "loss": 4.5527,
+      "step": 506
+    },
+    {
+      "batch_num_effect_tokens": 7951,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8091,
+      "epoch": 0.65791,
+      "grad_norm": 0.12455622851848602,
+      "learning_rate": 8.483061990306451e-06,
+      "loss": 4.9229,
+      "step": 507
+    },
+    {
+      "batch_num_effect_tokens": 7904,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8086,
+      "epoch": 0.65921,
+      "grad_norm": 0.13688203692436218,
+      "learning_rate": 8.474921995075399e-06,
+      "loss": 4.5957,
+      "step": 508
+    },
+    {
+      "batch_num_effect_tokens": 7943,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8100,
+      "epoch": 0.6605,
+      "grad_norm": 0.1287027895450592,
+      "learning_rate": 8.466764146558482e-06,
+      "loss": 4.9189,
+      "step": 509
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.6618,
+      "grad_norm": 0.12565134465694427,
+      "learning_rate": 8.4585884866687e-06,
+      "loss": 5.2285,
+      "step": 510
+    },
+    {
+      "batch_num_effect_tokens": 7930,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8089,
+      "epoch": 0.6631,
+      "grad_norm": 0.11928825825452805,
+      "learning_rate": 8.450395057410561e-06,
+      "loss": 4.7051,
+      "step": 511
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.6644,
+      "grad_norm": 0.1257006675004959,
+      "learning_rate": 8.44218390087987e-06,
+      "loss": 4.7148,
+      "step": 512
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.66569,
+      "grad_norm": 0.14077560603618622,
+      "learning_rate": 8.433955059263508e-06,
+      "loss": 4.8691,
+      "step": 513
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.66699,
+      "grad_norm": 0.1343725621700287,
+      "learning_rate": 8.425708574839221e-06,
+      "loss": 4.8486,
+      "step": 514
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.66829,
+      "grad_norm": 0.1511705070734024,
+      "learning_rate": 8.417444489975396e-06,
+      "loss": 4.584,
+      "step": 515
+    },
+    {
+      "batch_num_effect_tokens": 8077,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.66959,
+      "grad_norm": 0.15018604695796967,
+      "learning_rate": 8.409162847130847e-06,
+      "loss": 5.0859,
+      "step": 516
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.67088,
+      "grad_norm": 0.13522522151470184,
+      "learning_rate": 8.400863688854598e-06,
+      "loss": 4.9492,
+      "step": 517
+    },
+    {
+      "batch_num_effect_tokens": 7935,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8092,
+      "epoch": 0.67218,
+      "grad_norm": 0.1357102394104004,
+      "learning_rate": 8.392547057785662e-06,
+      "loss": 4.4229,
+      "step": 518
+    },
+    {
+      "batch_num_effect_tokens": 7896,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8140,
+      "epoch": 0.67348,
+      "grad_norm": 0.12707726657390594,
+      "learning_rate": 8.384212996652823e-06,
+      "loss": 4.5303,
+      "step": 519
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8135,
+      "epoch": 0.67478,
+      "grad_norm": 0.127869114279747,
+      "learning_rate": 8.375861548274417e-06,
+      "loss": 4.6426,
+      "step": 520
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8192,
+      "epoch": 0.67607,
+      "grad_norm": 0.13051429390907288,
+      "learning_rate": 8.367492755558111e-06,
+      "loss": 4.9316,
+      "step": 521
+    },
+    {
+      "batch_num_effect_tokens": 7906,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8112,
+      "epoch": 0.67737,
+      "grad_norm": 0.13462385535240173,
+      "learning_rate": 8.359106661500683e-06,
+      "loss": 4.7568,
+      "step": 522
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.67867,
+      "grad_norm": 0.1249893382191658,
+      "learning_rate": 8.3507033091878e-06,
+      "loss": 4.5117,
+      "step": 523
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8168,
+      "epoch": 0.67997,
+      "grad_norm": 0.13060161471366882,
+      "learning_rate": 8.342282741793797e-06,
+      "loss": 4.8574,
+      "step": 524
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.68127,
+      "grad_norm": 0.11890089511871338,
+      "learning_rate": 8.33384500258146e-06,
+      "loss": 4.6885,
+      "step": 525
+    },
+    {
+      "batch_num_effect_tokens": 7939,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8076,
+      "epoch": 0.68256,
+      "grad_norm": 0.12186378985643387,
+      "learning_rate": 8.325390134901794e-06,
+      "loss": 4.4736,
+      "step": 526
+    },
+    {
+      "batch_num_effect_tokens": 7961,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8118,
+      "epoch": 0.68386,
+      "grad_norm": 0.140080064535141,
+      "learning_rate": 8.316918182193811e-06,
+      "loss": 4.8838,
+      "step": 527
+    },
+    {
+      "batch_num_effect_tokens": 7969,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8176,
+      "epoch": 0.68516,
+      "grad_norm": 0.1309884935617447,
+      "learning_rate": 8.308429187984298e-06,
+      "loss": 4.8018,
+      "step": 528
+    },
+    {
+      "batch_num_effect_tokens": 7935,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 0.68646,
+      "grad_norm": 0.1284397542476654,
+      "learning_rate": 8.299923195887599e-06,
+      "loss": 4.4141,
+      "step": 529
+    },
+    {
+      "batch_num_effect_tokens": 7875,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8074,
+      "epoch": 0.68775,
+      "grad_norm": 0.12711189687252045,
+      "learning_rate": 8.291400249605387e-06,
+      "loss": 4.6455,
+      "step": 530
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.68905,
+      "grad_norm": 0.13765956461429596,
+      "learning_rate": 8.282860392926442e-06,
+      "loss": 4.4688,
+      "step": 531
+    },
+    {
+      "batch_num_effect_tokens": 7934,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8146,
+      "epoch": 0.69035,
+      "grad_norm": 0.13526172935962677,
+      "learning_rate": 8.274303669726427e-06,
+      "loss": 4.6934,
+      "step": 532
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8160,
+      "epoch": 0.69165,
+      "grad_norm": 0.13791659474372864,
+      "learning_rate": 8.26573012396766e-06,
+      "loss": 4.8594,
+      "step": 533
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.69294,
+      "grad_norm": 0.1339079588651657,
+      "learning_rate": 8.257139799698887e-06,
+      "loss": 5.1318,
+      "step": 534
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8120,
+      "epoch": 0.69424,
+      "grad_norm": 0.13170425593852997,
+      "learning_rate": 8.248532741055061e-06,
+      "loss": 4.5645,
+      "step": 535
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8100,
+      "epoch": 0.69554,
+      "grad_norm": 0.1507280021905899,
+      "learning_rate": 8.239908992257114e-06,
+      "loss": 4.7578,
+      "step": 536
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.69684,
+      "grad_norm": 0.13211563229560852,
+      "learning_rate": 8.231268597611722e-06,
+      "loss": 5.0664,
+      "step": 537
+    },
+    {
+      "batch_num_effect_tokens": 8060,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.69813,
+      "grad_norm": 0.135163813829422,
+      "learning_rate": 8.222611601511084e-06,
+      "loss": 4.5693,
+      "step": 538
+    },
+    {
+      "batch_num_effect_tokens": 7743,
+      "batch_num_samples": 29,
+      "batch_num_tokens": 8007,
+      "epoch": 0.69943,
+      "grad_norm": 0.13044171035289764,
+      "learning_rate": 8.213938048432697e-06,
+      "loss": 4.8115,
+      "step": 539
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.70073,
+      "grad_norm": 0.13288696110248566,
+      "learning_rate": 8.205247982939124e-06,
+      "loss": 4.7236,
+      "step": 540
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.70203,
+      "grad_norm": 0.12817522883415222,
+      "learning_rate": 8.196541449677758e-06,
+      "loss": 4.7334,
+      "step": 541
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.70333,
+      "grad_norm": 0.1278115212917328,
+      "learning_rate": 8.187818493380607e-06,
+      "loss": 4.6318,
+      "step": 542
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.70462,
+      "grad_norm": 0.12950873374938965,
+      "learning_rate": 8.179079158864053e-06,
+      "loss": 4.8809,
+      "step": 543
+    },
+    {
+      "batch_num_effect_tokens": 7919,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8083,
+      "epoch": 0.70592,
+      "grad_norm": 0.1247512623667717,
+      "learning_rate": 8.170323491028625e-06,
+      "loss": 4.9658,
+      "step": 544
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.70722,
+      "grad_norm": 0.12909561395645142,
+      "learning_rate": 8.161551534858767e-06,
+      "loss": 4.7041,
+      "step": 545
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 0.70852,
+      "grad_norm": 0.12214743345975876,
+      "learning_rate": 8.152763335422612e-06,
+      "loss": 5.0234,
+      "step": 546
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.70981,
+      "grad_norm": 0.12689444422721863,
+      "learning_rate": 8.143958937871748e-06,
+      "loss": 4.8711,
+      "step": 547
+    },
+    {
+      "batch_num_effect_tokens": 7902,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8080,
+      "epoch": 0.71111,
+      "grad_norm": 0.12359411269426346,
+      "learning_rate": 8.135138387440978e-06,
+      "loss": 4.877,
+      "step": 548
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8086,
+      "epoch": 0.71241,
+      "grad_norm": 0.1347743421792984,
+      "learning_rate": 8.126301729448101e-06,
+      "loss": 4.8076,
+      "step": 549
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.71371,
+      "grad_norm": 0.12478054314851761,
+      "learning_rate": 8.117449009293668e-06,
+      "loss": 4.6523,
+      "step": 550
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.715,
+      "grad_norm": 0.12902334332466125,
+      "learning_rate": 8.108580272460759e-06,
+      "loss": 4.6719,
+      "step": 551
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.7163,
+      "grad_norm": 0.1194981262087822,
+      "learning_rate": 8.099695564514738e-06,
+      "loss": 4.6465,
+      "step": 552
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.7176,
+      "grad_norm": 0.12877826392650604,
+      "learning_rate": 8.090794931103026e-06,
+      "loss": 4.7939,
+      "step": 553
+    },
+    {
+      "batch_num_effect_tokens": 8069,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.7189,
+      "grad_norm": 0.13682805001735687,
+      "learning_rate": 8.08187841795487e-06,
+      "loss": 4.7344,
+      "step": 554
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.72019,
+      "grad_norm": 0.13331077992916107,
+      "learning_rate": 8.072946070881095e-06,
+      "loss": 5.084,
+      "step": 555
+    },
+    {
+      "batch_num_effect_tokens": 7942,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 0.72149,
+      "grad_norm": 0.13391903042793274,
+      "learning_rate": 8.063997935773885e-06,
+      "loss": 4.6699,
+      "step": 556
+    },
+    {
+      "batch_num_effect_tokens": 7917,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8078,
+      "epoch": 0.72279,
+      "grad_norm": 0.1353442519903183,
+      "learning_rate": 8.055034058606533e-06,
+      "loss": 4.7354,
+      "step": 557
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 0.72409,
+      "grad_norm": 0.13279181718826294,
+      "learning_rate": 8.046054485433211e-06,
+      "loss": 4.7617,
+      "step": 558
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8165,
+      "epoch": 0.72539,
+      "grad_norm": 0.13407373428344727,
+      "learning_rate": 8.03705926238874e-06,
+      "loss": 4.667,
+      "step": 559
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.72668,
+      "grad_norm": 0.12700553238391876,
+      "learning_rate": 8.028048435688333e-06,
+      "loss": 4.4395,
+      "step": 560
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.72798,
+      "grad_norm": 0.1360039860010147,
+      "learning_rate": 8.019022051627387e-06,
+      "loss": 4.7686,
+      "step": 561
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.72928,
+      "grad_norm": 0.121078722178936,
+      "learning_rate": 8.009980156581218e-06,
+      "loss": 4.6289,
+      "step": 562
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73058,
+      "grad_norm": 0.12270597368478775,
+      "learning_rate": 8.000922797004835e-06,
+      "loss": 4.5605,
+      "step": 563
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73187,
+      "grad_norm": 0.13452237844467163,
+      "learning_rate": 7.991850019432701e-06,
+      "loss": 4.6885,
+      "step": 564
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73317,
+      "grad_norm": 0.12567798793315887,
+      "learning_rate": 7.982761870478495e-06,
+      "loss": 4.8379,
+      "step": 565
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73447,
+      "grad_norm": 0.1310405284166336,
+      "learning_rate": 7.973658396834868e-06,
+      "loss": 4.6504,
+      "step": 566
+    },
+    {
+      "batch_num_effect_tokens": 7962,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73577,
+      "grad_norm": 0.13182175159454346,
+      "learning_rate": 7.964539645273204e-06,
+      "loss": 4.7881,
+      "step": 567
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73706,
+      "grad_norm": 0.12408516556024551,
+      "learning_rate": 7.955405662643384e-06,
+      "loss": 4.458,
+      "step": 568
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73836,
+      "grad_norm": 0.12731651961803436,
+      "learning_rate": 7.946256495873542e-06,
+      "loss": 5.0205,
+      "step": 569
+    },
+    {
+      "batch_num_effect_tokens": 8067,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.73966,
+      "grad_norm": 0.13854657113552094,
+      "learning_rate": 7.937092191969821e-06,
+      "loss": 4.6074,
+      "step": 570
+    },
+    {
+      "batch_num_effect_tokens": 7958,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.74096,
+      "grad_norm": 0.127966970205307,
+      "learning_rate": 7.927912798016144e-06,
+      "loss": 5.0039,
+      "step": 571
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.74225,
+      "grad_norm": 0.1232946589589119,
+      "learning_rate": 7.918718361173951e-06,
+      "loss": 4.749,
+      "step": 572
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8181,
+      "epoch": 0.74355,
+      "grad_norm": 0.12766174972057343,
+      "learning_rate": 7.909508928681975e-06,
+      "loss": 5.0156,
+      "step": 573
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.74485,
+      "grad_norm": 0.1258310079574585,
+      "learning_rate": 7.900284547855992e-06,
+      "loss": 4.4893,
+      "step": 574
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.74615,
+      "grad_norm": 0.13721963763237,
+      "learning_rate": 7.89104526608858e-06,
+      "loss": 4.7783,
+      "step": 575
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.74745,
+      "grad_norm": 0.11561396718025208,
+      "learning_rate": 7.881791130848872e-06,
+      "loss": 4.6162,
+      "step": 576
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 0.74874,
+      "grad_norm": 0.11618711799383163,
+      "learning_rate": 7.872522189682318e-06,
+      "loss": 4.541,
+      "step": 577
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.75004,
+      "grad_norm": 0.1265118569135666,
+      "learning_rate": 7.863238490210432e-06,
+      "loss": 4.6934,
+      "step": 578
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.75134,
+      "grad_norm": 0.11919250339269638,
+      "learning_rate": 7.853940080130556e-06,
+      "loss": 4.4326,
+      "step": 579
+    },
+    {
+      "batch_num_effect_tokens": 7886,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8110,
+      "epoch": 0.75264,
+      "grad_norm": 0.13348767161369324,
+      "learning_rate": 7.844627007215613e-06,
+      "loss": 4.9668,
+      "step": 580
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.75393,
+      "grad_norm": 0.12527728080749512,
+      "learning_rate": 7.835299319313854e-06,
+      "loss": 4.8496,
+      "step": 581
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8076,
+      "epoch": 0.75523,
+      "grad_norm": 0.12182778120040894,
+      "learning_rate": 7.825957064348625e-06,
+      "loss": 4.6016,
+      "step": 582
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8165,
+      "epoch": 0.75653,
+      "grad_norm": 0.11762472242116928,
+      "learning_rate": 7.81660029031811e-06,
+      "loss": 4.6768,
+      "step": 583
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8176,
+      "epoch": 0.75783,
+      "grad_norm": 0.1293763965368271,
+      "learning_rate": 7.80722904529509e-06,
+      "loss": 4.7266,
+      "step": 584
+    },
+    {
+      "batch_num_effect_tokens": 7942,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8191,
+      "epoch": 0.75912,
+      "grad_norm": 0.12573941051959991,
+      "learning_rate": 7.797843377426693e-06,
+      "loss": 4.6338,
+      "step": 585
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.76042,
+      "grad_norm": 0.1293158084154129,
+      "learning_rate": 7.788443334934148e-06,
+      "loss": 4.5762,
+      "step": 586
+    },
+    {
+      "batch_num_effect_tokens": 7907,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8098,
+      "epoch": 0.76172,
+      "grad_norm": 0.14639928936958313,
+      "learning_rate": 7.779028966112538e-06,
+      "loss": 5.0459,
+      "step": 587
+    },
+    {
+      "batch_num_effect_tokens": 7981,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.76302,
+      "grad_norm": 0.1278136819601059,
+      "learning_rate": 7.769600319330553e-06,
+      "loss": 4.585,
+      "step": 588
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.76431,
+      "grad_norm": 0.12693235278129578,
+      "learning_rate": 7.760157443030234e-06,
+      "loss": 4.7744,
+      "step": 589
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.76561,
+      "grad_norm": 0.11962468177080154,
+      "learning_rate": 7.750700385726736e-06,
+      "loss": 4.6338,
+      "step": 590
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.76691,
+      "grad_norm": 0.13058899343013763,
+      "learning_rate": 7.741229196008068e-06,
+      "loss": 4.9893,
+      "step": 591
+    },
+    {
+      "batch_num_effect_tokens": 7970,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8123,
+      "epoch": 0.76821,
+      "grad_norm": 0.1270352005958557,
+      "learning_rate": 7.731743922534854e-06,
+      "loss": 4.5371,
+      "step": 592
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.76951,
+      "grad_norm": 0.1201217994093895,
+      "learning_rate": 7.722244614040068e-06,
+      "loss": 4.3867,
+      "step": 593
+    },
+    {
+      "batch_num_effect_tokens": 7950,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8122,
+      "epoch": 0.7708,
+      "grad_norm": 0.11798641830682755,
+      "learning_rate": 7.712731319328798e-06,
+      "loss": 4.873,
+      "step": 594
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8187,
+      "epoch": 0.7721,
+      "grad_norm": 0.1351042240858078,
+      "learning_rate": 7.703204087277989e-06,
+      "loss": 4.4766,
+      "step": 595
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 0.7734,
+      "grad_norm": 0.12383313477039337,
+      "learning_rate": 7.693662966836191e-06,
+      "loss": 4.6631,
+      "step": 596
+    },
+    {
+      "batch_num_effect_tokens": 7880,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8062,
+      "epoch": 0.7747,
+      "grad_norm": 0.12391753494739532,
+      "learning_rate": 7.684108007023313e-06,
+      "loss": 4.5283,
+      "step": 597
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 0.77599,
+      "grad_norm": 0.12056957185268402,
+      "learning_rate": 7.674539256930364e-06,
+      "loss": 4.5322,
+      "step": 598
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.77729,
+      "grad_norm": 0.11704017966985703,
+      "learning_rate": 7.6649567657192e-06,
+      "loss": 4.6699,
+      "step": 599
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.77859,
+      "grad_norm": 0.12692949175834656,
+      "learning_rate": 7.655360582622287e-06,
+      "loss": 4.5049,
+      "step": 600
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.77989,
+      "grad_norm": 0.1289125233888626,
+      "learning_rate": 7.645750756942425e-06,
+      "loss": 4.8818,
+      "step": 601
+    },
+    {
+      "batch_num_effect_tokens": 7924,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8078,
+      "epoch": 0.78118,
+      "grad_norm": 0.12068326771259308,
+      "learning_rate": 7.636127338052513e-06,
+      "loss": 4.7988,
+      "step": 602
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.78248,
+      "grad_norm": 0.1243966817855835,
+      "learning_rate": 7.626490375395286e-06,
+      "loss": 4.6328,
+      "step": 603
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.78378,
+      "grad_norm": 0.12233472615480423,
+      "learning_rate": 7.616839918483061e-06,
+      "loss": 4.5869,
+      "step": 604
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8166,
+      "epoch": 0.78508,
+      "grad_norm": 0.11933887749910355,
+      "learning_rate": 7.607176016897491e-06,
+      "loss": 4.7559,
+      "step": 605
+    },
+    {
+      "batch_num_effect_tokens": 7950,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 0.78637,
+      "grad_norm": 0.11429915577173233,
+      "learning_rate": 7.597498720289302e-06,
+      "loss": 4.4414,
+      "step": 606
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.78767,
+      "grad_norm": 0.12575142085552216,
+      "learning_rate": 7.587808078378036e-06,
+      "loss": 4.5176,
+      "step": 607
+    },
+    {
+      "batch_num_effect_tokens": 7824,
+      "batch_num_samples": 29,
+      "batch_num_tokens": 8092,
+      "epoch": 0.78897,
+      "grad_norm": 0.1377941220998764,
+      "learning_rate": 7.578104140951806e-06,
+      "loss": 4.6582,
+      "step": 608
+    },
+    {
+      "batch_num_effect_tokens": 7923,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8081,
+      "epoch": 0.79027,
+      "grad_norm": 0.12251248210668564,
+      "learning_rate": 7.568386957867033e-06,
+      "loss": 4.5859,
+      "step": 609
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.79157,
+      "grad_norm": 0.12789161503314972,
+      "learning_rate": 7.5586565790481855e-06,
+      "loss": 4.7432,
+      "step": 610
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.79286,
+      "grad_norm": 0.14461970329284668,
+      "learning_rate": 7.548913054487537e-06,
+      "loss": 4.7646,
+      "step": 611
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.79416,
+      "grad_norm": 0.12491682916879654,
+      "learning_rate": 7.539156434244892e-06,
+      "loss": 4.6553,
+      "step": 612
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8178,
+      "epoch": 0.79546,
+      "grad_norm": 0.12910866737365723,
+      "learning_rate": 7.529386768447342e-06,
+      "loss": 4.9033,
+      "step": 613
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.79676,
+      "grad_norm": 0.1255904883146286,
+      "learning_rate": 7.519604107289004e-06,
+      "loss": 4.7559,
+      "step": 614
+    },
+    {
+      "batch_num_effect_tokens": 8073,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.79805,
+      "grad_norm": 0.12721942365169525,
+      "learning_rate": 7.50980850103076e-06,
+      "loss": 4.5977,
+      "step": 615
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.79935,
+      "grad_norm": 0.12462284415960312,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 4.3994,
+      "step": 616
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.79935,
+      "eval_eval_loss": 0.5898093581199646,
+      "eval_eval_runtime": 115.3418,
+      "eval_eval_samples_per_second": 43.349,
+      "eval_eval_steps_per_second": 2.714,
+      "step": 616
+    },
+    {
+      "batch_num_effect_tokens": 8079,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80065,
+      "grad_norm": 0.12710371613502502,
+      "learning_rate": 7.490178654590367e-06,
+      "loss": 4.9082,
+      "step": 617
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80195,
+      "grad_norm": 0.12795433402061462,
+      "learning_rate": 7.480344515261495e-06,
+      "loss": 4.6973,
+      "step": 618
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 0.80324,
+      "grad_norm": 0.13104230165481567,
+      "learning_rate": 7.470497632538743e-06,
+      "loss": 4.9326,
+      "step": 619
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80454,
+      "grad_norm": 0.12019386142492294,
+      "learning_rate": 7.460638057012956e-06,
+      "loss": 4.665,
+      "step": 620
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80584,
+      "grad_norm": 0.1315028965473175,
+      "learning_rate": 7.450765839340175e-06,
+      "loss": 4.9375,
+      "step": 621
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80714,
+      "grad_norm": 0.13341425359249115,
+      "learning_rate": 7.440881030241407e-06,
+      "loss": 4.7939,
+      "step": 622
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80843,
+      "grad_norm": 0.13055460155010223,
+      "learning_rate": 7.430983680502344e-06,
+      "loss": 4.9736,
+      "step": 623
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.80973,
+      "grad_norm": 0.12852461636066437,
+      "learning_rate": 7.4210738409731095e-06,
+      "loss": 4.6982,
+      "step": 624
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8164,
+      "epoch": 0.81103,
+      "grad_norm": 0.13344988226890564,
+      "learning_rate": 7.411151562567999e-06,
+      "loss": 4.7471,
+      "step": 625
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8138,
+      "epoch": 0.81233,
+      "grad_norm": 0.13381238281726837,
+      "learning_rate": 7.401216896265208e-06,
+      "loss": 4.6709,
+      "step": 626
+    },
+    {
+      "batch_num_effect_tokens": 7977,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8181,
+      "epoch": 0.81363,
+      "grad_norm": 0.13180480897426605,
+      "learning_rate": 7.391269893106592e-06,
+      "loss": 4.8457,
+      "step": 627
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.81492,
+      "grad_norm": 0.12136835604906082,
+      "learning_rate": 7.381310604197375e-06,
+      "loss": 4.8252,
+      "step": 628
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8118,
+      "epoch": 0.81622,
+      "grad_norm": 0.1308746188879013,
+      "learning_rate": 7.371339080705913e-06,
+      "loss": 4.5479,
+      "step": 629
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.81752,
+      "grad_norm": 0.13940449059009552,
+      "learning_rate": 7.361355373863415e-06,
+      "loss": 4.7676,
+      "step": 630
+    },
+    {
+      "batch_num_effect_tokens": 8076,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.81882,
+      "grad_norm": 0.1301494687795639,
+      "learning_rate": 7.351359534963684e-06,
+      "loss": 4.4824,
+      "step": 631
+    },
+    {
+      "batch_num_effect_tokens": 8070,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.82011,
+      "grad_norm": 0.12944510579109192,
+      "learning_rate": 7.3413516153628605e-06,
+      "loss": 4.8672,
+      "step": 632
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.82141,
+      "grad_norm": 0.13300953805446625,
+      "learning_rate": 7.331331666479149e-06,
+      "loss": 4.9189,
+      "step": 633
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.82271,
+      "grad_norm": 0.12441015988588333,
+      "learning_rate": 7.321299739792553e-06,
+      "loss": 4.4316,
+      "step": 634
+    },
+    {
+      "batch_num_effect_tokens": 7972,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.82401,
+      "grad_norm": 0.12126115709543228,
+      "learning_rate": 7.311255886844624e-06,
+      "loss": 4.5771,
+      "step": 635
+    },
+    {
+      "batch_num_effect_tokens": 7986,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8135,
+      "epoch": 0.8253,
+      "grad_norm": 0.12674327194690704,
+      "learning_rate": 7.30120015923818e-06,
+      "loss": 4.8574,
+      "step": 636
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.8266,
+      "grad_norm": 0.12705481052398682,
+      "learning_rate": 7.291132608637053e-06,
+      "loss": 4.334,
+      "step": 637
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.8279,
+      "grad_norm": 0.12920741736888885,
+      "learning_rate": 7.281053286765816e-06,
+      "loss": 4.6611,
+      "step": 638
+    },
+    {
+      "batch_num_effect_tokens": 7761,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8034,
+      "epoch": 0.8292,
+      "grad_norm": 0.1230921670794487,
+      "learning_rate": 7.27096224540952e-06,
+      "loss": 4.5332,
+      "step": 639
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.83049,
+      "grad_norm": 0.12442672997713089,
+      "learning_rate": 7.260859536413429e-06,
+      "loss": 4.7666,
+      "step": 640
+    },
+    {
+      "batch_num_effect_tokens": 7965,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8120,
+      "epoch": 0.83179,
+      "grad_norm": 0.13749492168426514,
+      "learning_rate": 7.250745211682752e-06,
+      "loss": 4.9414,
+      "step": 641
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.83309,
+      "grad_norm": 0.1354299634695053,
+      "learning_rate": 7.240619323182378e-06,
+      "loss": 4.9287,
+      "step": 642
+    },
+    {
+      "batch_num_effect_tokens": 7934,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8100,
+      "epoch": 0.83439,
+      "grad_norm": 0.1304435133934021,
+      "learning_rate": 7.2304819229366015e-06,
+      "loss": 4.4697,
+      "step": 643
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 0.83569,
+      "grad_norm": 0.12707528471946716,
+      "learning_rate": 7.2203330630288714e-06,
+      "loss": 5.0391,
+      "step": 644
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8100,
+      "epoch": 0.83698,
+      "grad_norm": 0.12164284288883209,
+      "learning_rate": 7.210172795601506e-06,
+      "loss": 4.2236,
+      "step": 645
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.83828,
+      "grad_norm": 0.13518215715885162,
+      "learning_rate": 7.200001172855436e-06,
+      "loss": 4.7686,
+      "step": 646
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.83958,
+      "grad_norm": 0.12766359746456146,
+      "learning_rate": 7.189818247049931e-06,
+      "loss": 4.5146,
+      "step": 647
+    },
+    {
+      "batch_num_effect_tokens": 7961,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 0.84088,
+      "grad_norm": 0.12365400791168213,
+      "learning_rate": 7.179624070502334e-06,
+      "loss": 4.9824,
+      "step": 648
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8123,
+      "epoch": 0.84217,
+      "grad_norm": 0.13238734006881714,
+      "learning_rate": 7.169418695587791e-06,
+      "loss": 4.9043,
+      "step": 649
+    },
+    {
+      "batch_num_effect_tokens": 7933,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8122,
+      "epoch": 0.84347,
+      "grad_norm": 0.11986955255270004,
+      "learning_rate": 7.159202174738984e-06,
+      "loss": 4.3682,
+      "step": 650
+    },
+    {
+      "batch_num_effect_tokens": 7899,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8085,
+      "epoch": 0.84477,
+      "grad_norm": 0.1329929381608963,
+      "learning_rate": 7.148974560445859e-06,
+      "loss": 4.6943,
+      "step": 651
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.84607,
+      "grad_norm": 0.1225806325674057,
+      "learning_rate": 7.138735905255355e-06,
+      "loss": 4.8477,
+      "step": 652
+    },
+    {
+      "batch_num_effect_tokens": 7858,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8108,
+      "epoch": 0.84736,
+      "grad_norm": 0.12441520392894745,
+      "learning_rate": 7.128486261771142e-06,
+      "loss": 4.5928,
+      "step": 653
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8133,
+      "epoch": 0.84866,
+      "grad_norm": 0.11729971319437027,
+      "learning_rate": 7.1182256826533365e-06,
+      "loss": 4.8398,
+      "step": 654
+    },
+    {
+      "batch_num_effect_tokens": 7973,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8159,
+      "epoch": 0.84996,
+      "grad_norm": 0.1261633038520813,
+      "learning_rate": 7.107954220618251e-06,
+      "loss": 4.9746,
+      "step": 655
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.85126,
+      "grad_norm": 0.13062052428722382,
+      "learning_rate": 7.097671928438101e-06,
+      "loss": 4.6182,
+      "step": 656
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.85255,
+      "grad_norm": 0.12817202508449554,
+      "learning_rate": 7.08737885894075e-06,
+      "loss": 4.6768,
+      "step": 657
+    },
+    {
+      "batch_num_effect_tokens": 7970,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8139,
+      "epoch": 0.85385,
+      "grad_norm": 0.13156555593013763,
+      "learning_rate": 7.0770750650094335e-06,
+      "loss": 4.7109,
+      "step": 658
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.85515,
+      "grad_norm": 0.13155829906463623,
+      "learning_rate": 7.066760599582481e-06,
+      "loss": 4.9395,
+      "step": 659
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8182,
+      "epoch": 0.85645,
+      "grad_norm": 0.12426735460758209,
+      "learning_rate": 7.056435515653059e-06,
+      "loss": 4.5059,
+      "step": 660
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.85775,
+      "grad_norm": 0.11923466622829437,
+      "learning_rate": 7.046099866268878e-06,
+      "loss": 4.6162,
+      "step": 661
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.85904,
+      "grad_norm": 0.12440145760774612,
+      "learning_rate": 7.03575370453194e-06,
+      "loss": 4.6377,
+      "step": 662
+    },
+    {
+      "batch_num_effect_tokens": 7967,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8181,
+      "epoch": 0.86034,
+      "grad_norm": 0.12746423482894897,
+      "learning_rate": 7.025397083598251e-06,
+      "loss": 4.7217,
+      "step": 663
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.86164,
+      "grad_norm": 0.1230856403708458,
+      "learning_rate": 7.015030056677559e-06,
+      "loss": 4.3516,
+      "step": 664
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8128,
+      "epoch": 0.86294,
+      "grad_norm": 0.14013619720935822,
+      "learning_rate": 7.004652677033069e-06,
+      "loss": 4.9609,
+      "step": 665
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.86423,
+      "grad_norm": 0.1289435476064682,
+      "learning_rate": 6.9942649979811836e-06,
+      "loss": 4.7041,
+      "step": 666
+    },
+    {
+      "batch_num_effect_tokens": 7973,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 0.86553,
+      "grad_norm": 0.11872459203004837,
+      "learning_rate": 6.983867072891213e-06,
+      "loss": 4.5059,
+      "step": 667
+    },
+    {
+      "batch_num_effect_tokens": 7942,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8074,
+      "epoch": 0.86683,
+      "grad_norm": 0.11841870099306107,
+      "learning_rate": 6.973458955185116e-06,
+      "loss": 4.834,
+      "step": 668
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.86813,
+      "grad_norm": 0.13256226480007172,
+      "learning_rate": 6.963040698337215e-06,
+      "loss": 4.7764,
+      "step": 669
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.86942,
+      "grad_norm": 0.12503276765346527,
+      "learning_rate": 6.952612355873922e-06,
+      "loss": 4.5635,
+      "step": 670
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.87072,
+      "grad_norm": 0.13926607370376587,
+      "learning_rate": 6.942173981373474e-06,
+      "loss": 4.9668,
+      "step": 671
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.87202,
+      "grad_norm": 0.1426723152399063,
+      "learning_rate": 6.931725628465643e-06,
+      "loss": 4.9531,
+      "step": 672
+    },
+    {
+      "batch_num_effect_tokens": 7934,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8079,
+      "epoch": 0.87332,
+      "grad_norm": 0.12457671016454697,
+      "learning_rate": 6.9212673508314734e-06,
+      "loss": 4.6992,
+      "step": 673
+    },
+    {
+      "batch_num_effect_tokens": 7773,
+      "batch_num_samples": 30,
+      "batch_num_tokens": 8057,
+      "epoch": 0.87461,
+      "grad_norm": 0.1329599916934967,
+      "learning_rate": 6.910799202202993e-06,
+      "loss": 4.7793,
+      "step": 674
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.87591,
+      "grad_norm": 0.1251561939716339,
+      "learning_rate": 6.900321236362952e-06,
+      "loss": 4.7969,
+      "step": 675
+    },
+    {
+      "batch_num_effect_tokens": 7908,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 0.87721,
+      "grad_norm": 0.12417805939912796,
+      "learning_rate": 6.889833507144534e-06,
+      "loss": 4.9121,
+      "step": 676
+    },
+    {
+      "batch_num_effect_tokens": 8005,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8156,
+      "epoch": 0.87851,
+      "grad_norm": 0.1217103824019432,
+      "learning_rate": 6.879336068431086e-06,
+      "loss": 4.8389,
+      "step": 677
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.87981,
+      "grad_norm": 0.1301388144493103,
+      "learning_rate": 6.868828974155841e-06,
+      "loss": 5.5527,
+      "step": 678
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.8811,
+      "grad_norm": 0.12873168289661407,
+      "learning_rate": 6.858312278301638e-06,
+      "loss": 4.6826,
+      "step": 679
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8163,
+      "epoch": 0.8824,
+      "grad_norm": 0.12014926224946976,
+      "learning_rate": 6.847786034900648e-06,
+      "loss": 4.4951,
+      "step": 680
+    },
+    {
+      "batch_num_effect_tokens": 7913,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 0.8837,
+      "grad_norm": 0.12555824220180511,
+      "learning_rate": 6.837250298034095e-06,
+      "loss": 4.5303,
+      "step": 681
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8158,
+      "epoch": 0.885,
+      "grad_norm": 0.12343177944421768,
+      "learning_rate": 6.8267051218319766e-06,
+      "loss": 4.4834,
+      "step": 682
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.88629,
+      "grad_norm": 0.12695080041885376,
+      "learning_rate": 6.816150560472787e-06,
+      "loss": 4.4951,
+      "step": 683
+    },
+    {
+      "batch_num_effect_tokens": 7893,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8085,
+      "epoch": 0.88759,
+      "grad_norm": 0.12869073450565338,
+      "learning_rate": 6.805586668183242e-06,
+      "loss": 4.7705,
+      "step": 684
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 0.88889,
+      "grad_norm": 0.12758734822273254,
+      "learning_rate": 6.7950134992379935e-06,
+      "loss": 4.4922,
+      "step": 685
+    },
+    {
+      "batch_num_effect_tokens": 7884,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8087,
+      "epoch": 0.89019,
+      "grad_norm": 0.12447866797447205,
+      "learning_rate": 6.78443110795936e-06,
+      "loss": 4.5645,
+      "step": 686
+    },
+    {
+      "batch_num_effect_tokens": 8066,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89148,
+      "grad_norm": 0.1174447238445282,
+      "learning_rate": 6.773839548717036e-06,
+      "loss": 4.6055,
+      "step": 687
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89278,
+      "grad_norm": 0.1315164715051651,
+      "learning_rate": 6.7632388759278225e-06,
+      "loss": 4.4893,
+      "step": 688
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89408,
+      "grad_norm": 0.119374580681324,
+      "learning_rate": 6.752629144055342e-06,
+      "loss": 4.4414,
+      "step": 689
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89538,
+      "grad_norm": 0.12017811089754105,
+      "learning_rate": 6.742010407609759e-06,
+      "loss": 4.8516,
+      "step": 690
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89667,
+      "grad_norm": 0.11169980466365814,
+      "learning_rate": 6.731382721147509e-06,
+      "loss": 4.6357,
+      "step": 691
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89797,
+      "grad_norm": 0.12770824134349823,
+      "learning_rate": 6.720746139270997e-06,
+      "loss": 4.7705,
+      "step": 692
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.89927,
+      "grad_norm": 0.12186800688505173,
+      "learning_rate": 6.710100716628345e-06,
+      "loss": 4.2812,
+      "step": 693
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.90057,
+      "grad_norm": 0.12909391522407532,
+      "learning_rate": 6.699446507913083e-06,
+      "loss": 4.7236,
+      "step": 694
+    },
+    {
+      "batch_num_effect_tokens": 7907,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8128,
+      "epoch": 0.90187,
+      "grad_norm": 0.1295677274465561,
+      "learning_rate": 6.6887835678638944e-06,
+      "loss": 4.6318,
+      "step": 695
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.90316,
+      "grad_norm": 0.12642303109169006,
+      "learning_rate": 6.6781119512643136e-06,
+      "loss": 4.3809,
+      "step": 696
+    },
+    {
+      "batch_num_effect_tokens": 7971,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.90446,
+      "grad_norm": 0.12434201687574387,
+      "learning_rate": 6.6674317129424535e-06,
+      "loss": 4.5703,
+      "step": 697
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.90576,
+      "grad_norm": 0.11948221176862717,
+      "learning_rate": 6.656742907770728e-06,
+      "loss": 4.6201,
+      "step": 698
+    },
+    {
+      "batch_num_effect_tokens": 7950,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8131,
+      "epoch": 0.90706,
+      "grad_norm": 0.12394808977842331,
+      "learning_rate": 6.6460455906655595e-06,
+      "loss": 4.4463,
+      "step": 699
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.90835,
+      "grad_norm": 0.1280088871717453,
+      "learning_rate": 6.635339816587109e-06,
+      "loss": 4.7422,
+      "step": 700
+    },
+    {
+      "batch_num_effect_tokens": 7911,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 0.90965,
+      "grad_norm": 0.1304391473531723,
+      "learning_rate": 6.6246256405389805e-06,
+      "loss": 4.2695,
+      "step": 701
+    },
+    {
+      "batch_num_effect_tokens": 7914,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8164,
+      "epoch": 0.91095,
+      "grad_norm": 0.1250067502260208,
+      "learning_rate": 6.613903117567951e-06,
+      "loss": 4.5664,
+      "step": 702
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.91225,
+      "grad_norm": 0.12943719327449799,
+      "learning_rate": 6.6031723027636775e-06,
+      "loss": 4.5186,
+      "step": 703
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.91354,
+      "grad_norm": 0.11747249215841293,
+      "learning_rate": 6.592433251258423e-06,
+      "loss": 4.7568,
+      "step": 704
+    },
+    {
+      "batch_num_effect_tokens": 8066,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.91484,
+      "grad_norm": 0.13342788815498352,
+      "learning_rate": 6.581686018226764e-06,
+      "loss": 4.6963,
+      "step": 705
+    },
+    {
+      "batch_num_effect_tokens": 7763,
+      "batch_num_samples": 28,
+      "batch_num_tokens": 8008,
+      "epoch": 0.91614,
+      "grad_norm": 0.12844344973564148,
+      "learning_rate": 6.570930658885314e-06,
+      "loss": 4.6582,
+      "step": 706
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.91744,
+      "grad_norm": 0.14021001756191254,
+      "learning_rate": 6.560167228492436e-06,
+      "loss": 4.8984,
+      "step": 707
+    },
+    {
+      "batch_num_effect_tokens": 7956,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8114,
+      "epoch": 0.91873,
+      "grad_norm": 0.1323830634355545,
+      "learning_rate": 6.549395782347963e-06,
+      "loss": 4.7314,
+      "step": 708
+    },
+    {
+      "batch_num_effect_tokens": 7965,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8157,
+      "epoch": 0.92003,
+      "grad_norm": 0.12167170643806458,
+      "learning_rate": 6.53861637579291e-06,
+      "loss": 4.9531,
+      "step": 709
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8156,
+      "epoch": 0.92133,
+      "grad_norm": 0.12227673083543777,
+      "learning_rate": 6.527829064209187e-06,
+      "loss": 4.8438,
+      "step": 710
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.92263,
+      "grad_norm": 0.12005326896905899,
+      "learning_rate": 6.517033903019323e-06,
+      "loss": 5.04,
+      "step": 711
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.92393,
+      "grad_norm": 0.11777986586093903,
+      "learning_rate": 6.5062309476861714e-06,
+      "loss": 4.9141,
+      "step": 712
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.92522,
+      "grad_norm": 0.12427664548158646,
+      "learning_rate": 6.495420253712636e-06,
+      "loss": 5.0312,
+      "step": 713
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.92652,
+      "grad_norm": 0.12834517657756805,
+      "learning_rate": 6.484601876641375e-06,
+      "loss": 4.7354,
+      "step": 714
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 0.92782,
+      "grad_norm": 0.13113918900489807,
+      "learning_rate": 6.473775872054522e-06,
+      "loss": 4.6543,
+      "step": 715
+    },
+    {
+      "batch_num_effect_tokens": 7916,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8087,
+      "epoch": 0.92912,
+      "grad_norm": 0.1328577995300293,
+      "learning_rate": 6.4629422955733975e-06,
+      "loss": 4.9062,
+      "step": 716
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 0.93041,
+      "grad_norm": 0.11206276714801788,
+      "learning_rate": 6.452101202858229e-06,
+      "loss": 4.6455,
+      "step": 717
+    },
+    {
+      "batch_num_effect_tokens": 7956,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8164,
+      "epoch": 0.93171,
+      "grad_norm": 0.12403670698404312,
+      "learning_rate": 6.4412526496078555e-06,
+      "loss": 4.5957,
+      "step": 718
+    },
+    {
+      "batch_num_effect_tokens": 7954,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8125,
+      "epoch": 0.93301,
+      "grad_norm": 0.142649307847023,
+      "learning_rate": 6.430396691559446e-06,
+      "loss": 4.876,
+      "step": 719
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.93431,
+      "grad_norm": 0.12546393275260925,
+      "learning_rate": 6.419533384488221e-06,
+      "loss": 4.5439,
+      "step": 720
+    },
+    {
+      "batch_num_effect_tokens": 7889,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8108,
+      "epoch": 0.9356,
+      "grad_norm": 0.12057095021009445,
+      "learning_rate": 6.408662784207149e-06,
+      "loss": 4.6611,
+      "step": 721
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.9369,
+      "grad_norm": 0.1193976029753685,
+      "learning_rate": 6.397784946566676e-06,
+      "loss": 4.7529,
+      "step": 722
+    },
+    {
+      "batch_num_effect_tokens": 7955,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8121,
+      "epoch": 0.9382,
+      "grad_norm": 0.12544573843479156,
+      "learning_rate": 6.3868999274544264e-06,
+      "loss": 4.9453,
+      "step": 723
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8192,
+      "epoch": 0.9395,
+      "grad_norm": 0.1267952173948288,
+      "learning_rate": 6.376007782794926e-06,
+      "loss": 4.7207,
+      "step": 724
+    },
+    {
+      "batch_num_effect_tokens": 7882,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8096,
+      "epoch": 0.94079,
+      "grad_norm": 0.14192216098308563,
+      "learning_rate": 6.365108568549308e-06,
+      "loss": 5.0576,
+      "step": 725
+    },
+    {
+      "batch_num_effect_tokens": 7956,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.94209,
+      "grad_norm": 0.12729580700397491,
+      "learning_rate": 6.354202340715027e-06,
+      "loss": 4.6826,
+      "step": 726
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8188,
+      "epoch": 0.94339,
+      "grad_norm": 0.12701718509197235,
+      "learning_rate": 6.34328915532557e-06,
+      "loss": 4.8945,
+      "step": 727
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.94469,
+      "grad_norm": 0.13189218938350677,
+      "learning_rate": 6.332369068450175e-06,
+      "loss": 4.8848,
+      "step": 728
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.94599,
+      "grad_norm": 0.12214604765176773,
+      "learning_rate": 6.321442136193535e-06,
+      "loss": 4.6484,
+      "step": 729
+    },
+    {
+      "batch_num_effect_tokens": 7778,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 7989,
+      "epoch": 0.94728,
+      "grad_norm": 0.12111053615808487,
+      "learning_rate": 6.310508414695511e-06,
+      "loss": 4.7383,
+      "step": 730
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.94858,
+      "grad_norm": 0.12662427127361298,
+      "learning_rate": 6.29956796013085e-06,
+      "loss": 4.6602,
+      "step": 731
+    },
+    {
+      "batch_num_effect_tokens": 8073,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.94988,
+      "grad_norm": 0.11730318516492844,
+      "learning_rate": 6.288620828708888e-06,
+      "loss": 4.5,
+      "step": 732
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.95118,
+      "grad_norm": 0.12332963943481445,
+      "learning_rate": 6.277667076673266e-06,
+      "loss": 4.7041,
+      "step": 733
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8167,
+      "epoch": 0.95247,
+      "grad_norm": 0.12476927042007446,
+      "learning_rate": 6.266706760301641e-06,
+      "loss": 4.5742,
+      "step": 734
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.95377,
+      "grad_norm": 0.13098326325416565,
+      "learning_rate": 6.255739935905396e-06,
+      "loss": 4.4307,
+      "step": 735
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.95507,
+      "grad_norm": 0.13105835020542145,
+      "learning_rate": 6.244766659829351e-06,
+      "loss": 4.8428,
+      "step": 736
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.95637,
+      "grad_norm": 0.1244327500462532,
+      "learning_rate": 6.233786988451468e-06,
+      "loss": 4.3555,
+      "step": 737
+    },
+    {
+      "batch_num_effect_tokens": 7967,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 0.95766,
+      "grad_norm": 0.12690576910972595,
+      "learning_rate": 6.222800978182576e-06,
+      "loss": 4.7607,
+      "step": 738
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 0.95896,
+      "grad_norm": 0.12604181468486786,
+      "learning_rate": 6.211808685466063e-06,
+      "loss": 4.9053,
+      "step": 739
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8176,
+      "epoch": 0.96026,
+      "grad_norm": 0.11882328987121582,
+      "learning_rate": 6.200810166777598e-06,
+      "loss": 4.167,
+      "step": 740
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 0.96156,
+      "grad_norm": 0.12758396565914154,
+      "learning_rate": 6.189805478624838e-06,
+      "loss": 4.5254,
+      "step": 741
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.96285,
+      "grad_norm": 0.12387480586767197,
+      "learning_rate": 6.178794677547138e-06,
+      "loss": 4.5957,
+      "step": 742
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.96415,
+      "grad_norm": 0.1272263377904892,
+      "learning_rate": 6.167777820115254e-06,
+      "loss": 4.5576,
+      "step": 743
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 0.96545,
+      "grad_norm": 0.1208052784204483,
+      "learning_rate": 6.156754962931069e-06,
+      "loss": 4.4629,
+      "step": 744
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.96675,
+      "grad_norm": 0.11877097934484482,
+      "learning_rate": 6.145726162627278e-06,
+      "loss": 4.6768,
+      "step": 745
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8114,
+      "epoch": 0.96805,
+      "grad_norm": 0.12087776511907578,
+      "learning_rate": 6.134691475867122e-06,
+      "loss": 4.6719,
+      "step": 746
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8079,
+      "epoch": 0.96934,
+      "grad_norm": 0.1275898814201355,
+      "learning_rate": 6.123650959344075e-06,
+      "loss": 4.666,
+      "step": 747
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.97064,
+      "grad_norm": 0.11717364192008972,
+      "learning_rate": 6.112604669781572e-06,
+      "loss": 4.6748,
+      "step": 748
+    },
+    {
+      "batch_num_effect_tokens": 8079,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.97194,
+      "grad_norm": 0.1246257945895195,
+      "learning_rate": 6.101552663932704e-06,
+      "loss": 4.5859,
+      "step": 749
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.97324,
+      "grad_norm": 0.12895013391971588,
+      "learning_rate": 6.090494998579929e-06,
+      "loss": 4.7861,
+      "step": 750
+    },
+    {
+      "batch_num_effect_tokens": 7823,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8091,
+      "epoch": 0.97453,
+      "grad_norm": 0.13118872046470642,
+      "learning_rate": 6.079431730534786e-06,
+      "loss": 4.7031,
+      "step": 751
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 0.97583,
+      "grad_norm": 0.1227671355009079,
+      "learning_rate": 6.0683629166375955e-06,
+      "loss": 4.5049,
+      "step": 752
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 0.97713,
+      "grad_norm": 0.12133854627609253,
+      "learning_rate": 6.057288613757178e-06,
+      "loss": 4.7334,
+      "step": 753
+    },
+    {
+      "batch_num_effect_tokens": 8074,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.97843,
+      "grad_norm": 0.12346214056015015,
+      "learning_rate": 6.046208878790543e-06,
+      "loss": 4.7197,
+      "step": 754
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8191,
+      "epoch": 0.97972,
+      "grad_norm": 0.11389555037021637,
+      "learning_rate": 6.035123768662622e-06,
+      "loss": 4.7832,
+      "step": 755
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 0.98102,
+      "grad_norm": 0.1279493272304535,
+      "learning_rate": 6.024033340325954e-06,
+      "loss": 4.7656,
+      "step": 756
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 0.98232,
+      "grad_norm": 0.12431464344263077,
+      "learning_rate": 6.012937650760406e-06,
+      "loss": 4.96,
+      "step": 757
+    },
+    {
+      "batch_num_effect_tokens": 8076,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 0.98362,
+      "grad_norm": 0.126203715801239,
+      "learning_rate": 6.001836756972873e-06,
+      "loss": 4.752,
+      "step": 758
+    },
+    {
+      "batch_num_effect_tokens": 7894,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8143,
+      "epoch": 0.98491,
+      "grad_norm": 0.12350256741046906,
+      "learning_rate": 5.990730715996989e-06,
+      "loss": 4.6533,
+      "step": 759
+    },
+    {
+      "batch_num_effect_tokens": 7952,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.98621,
+      "grad_norm": 0.12190555781126022,
+      "learning_rate": 5.979619584892834e-06,
+      "loss": 4.6904,
+      "step": 760
+    },
+    {
+      "batch_num_effect_tokens": 7993,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 0.98751,
+      "grad_norm": 0.12734700739383698,
+      "learning_rate": 5.968503420746638e-06,
+      "loss": 5.4248,
+      "step": 761
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.98881,
+      "grad_norm": 0.12115947902202606,
+      "learning_rate": 5.957382280670494e-06,
+      "loss": 4.6416,
+      "step": 762
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8152,
+      "epoch": 0.99011,
+      "grad_norm": 0.12552732229232788,
+      "learning_rate": 5.946256221802052e-06,
+      "loss": 4.4473,
+      "step": 763
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 0.9914,
+      "grad_norm": 0.129593163728714,
+      "learning_rate": 5.935125301304241e-06,
+      "loss": 4.9512,
+      "step": 764
+    },
+    {
+      "batch_num_effect_tokens": 7873,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8120,
+      "epoch": 0.9927,
+      "grad_norm": 0.1321675032377243,
+      "learning_rate": 5.9239895763649635e-06,
+      "loss": 4.8701,
+      "step": 765
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 0.994,
+      "grad_norm": 0.12967081367969513,
+      "learning_rate": 5.91284910419681e-06,
+      "loss": 4.4424,
+      "step": 766
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 0.9953,
+      "grad_norm": 0.12642012536525726,
+      "learning_rate": 5.901703942036755e-06,
+      "loss": 5.1172,
+      "step": 767
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8142,
+      "epoch": 0.99659,
+      "grad_norm": 0.13420720398426056,
+      "learning_rate": 5.890554147145875e-06,
+      "loss": 4.7734,
+      "step": 768
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8090,
+      "epoch": 0.99789,
+      "grad_norm": 0.12632231414318085,
+      "learning_rate": 5.879399776809047e-06,
+      "loss": 4.4863,
+      "step": 769
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.99919,
+      "grad_norm": 0.12234952300786972,
+      "learning_rate": 5.8682408883346535e-06,
+      "loss": 4.6025,
+      "step": 770
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 0.99919,
+      "eval_eval_loss": 0.5797469019889832,
+      "eval_eval_runtime": 114.9751,
+      "eval_eval_samples_per_second": 43.488,
+      "eval_eval_steps_per_second": 2.722,
+      "step": 770
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.00049,
+      "grad_norm": 0.13404805958271027,
+      "learning_rate": 5.857077539054289e-06,
+      "loss": 4.9434,
+      "step": 771
+    },
+    {
+      "batch_num_effect_tokens": 7897,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8142,
+      "epoch": 1.00178,
+      "grad_norm": 0.1201479434967041,
+      "learning_rate": 5.8459097863224705e-06,
+      "loss": 4.8154,
+      "step": 772
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.00308,
+      "grad_norm": 0.12970799207687378,
+      "learning_rate": 5.834737687516336e-06,
+      "loss": 4.8105,
+      "step": 773
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.00438,
+      "grad_norm": 0.12445977330207825,
+      "learning_rate": 5.823561300035355e-06,
+      "loss": 4.2812,
+      "step": 774
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.00568,
+      "grad_norm": 0.14487802982330322,
+      "learning_rate": 5.812380681301031e-06,
+      "loss": 4.6328,
+      "step": 775
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8166,
+      "epoch": 1.00697,
+      "grad_norm": 0.12720316648483276,
+      "learning_rate": 5.8011958887565986e-06,
+      "loss": 4.6025,
+      "step": 776
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.00827,
+      "grad_norm": 0.13001519441604614,
+      "learning_rate": 5.79000697986675e-06,
+      "loss": 4.4268,
+      "step": 777
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 1.00957,
+      "grad_norm": 0.12651684880256653,
+      "learning_rate": 5.778814012117315e-06,
+      "loss": 4.4297,
+      "step": 778
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8128,
+      "epoch": 1.01087,
+      "grad_norm": 0.13454923033714294,
+      "learning_rate": 5.767617043014985e-06,
+      "loss": 4.3477,
+      "step": 779
+    },
+    {
+      "batch_num_effect_tokens": 7983,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.01217,
+      "grad_norm": 0.14765796065330505,
+      "learning_rate": 5.756416130087002e-06,
+      "loss": 4.8281,
+      "step": 780
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8186,
+      "epoch": 1.01346,
+      "grad_norm": 0.14546914398670197,
+      "learning_rate": 5.745211330880872e-06,
+      "loss": 4.3789,
+      "step": 781
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.01476,
+      "grad_norm": 0.11960723996162415,
+      "learning_rate": 5.7340027029640755e-06,
+      "loss": 4.3809,
+      "step": 782
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.01606,
+      "grad_norm": 0.14267292618751526,
+      "learning_rate": 5.7227903039237535e-06,
+      "loss": 4.5361,
+      "step": 783
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8155,
+      "epoch": 1.01736,
+      "grad_norm": 0.1270764023065567,
+      "learning_rate": 5.711574191366427e-06,
+      "loss": 4.5635,
+      "step": 784
+    },
+    {
+      "batch_num_effect_tokens": 8066,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.01865,
+      "grad_norm": 0.14421948790550232,
+      "learning_rate": 5.7003544229176955e-06,
+      "loss": 5.0654,
+      "step": 785
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.01995,
+      "grad_norm": 0.12795937061309814,
+      "learning_rate": 5.689131056221944e-06,
+      "loss": 4.3809,
+      "step": 786
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 1.02125,
+      "grad_norm": 0.1386982649564743,
+      "learning_rate": 5.677904148942039e-06,
+      "loss": 4.5127,
+      "step": 787
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8190,
+      "epoch": 1.02255,
+      "grad_norm": 0.1462188959121704,
+      "learning_rate": 5.666673758759045e-06,
+      "loss": 4.3438,
+      "step": 788
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.02384,
+      "grad_norm": 0.16186738014221191,
+      "learning_rate": 5.655439943371912e-06,
+      "loss": 4.7744,
+      "step": 789
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.02514,
+      "grad_norm": 0.12250373512506485,
+      "learning_rate": 5.644202760497195e-06,
+      "loss": 4.2549,
+      "step": 790
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8146,
+      "epoch": 1.02644,
+      "grad_norm": 0.17847490310668945,
+      "learning_rate": 5.632962267868747e-06,
+      "loss": 4.7627,
+      "step": 791
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.02774,
+      "grad_norm": 0.1615590900182724,
+      "learning_rate": 5.621718523237427e-06,
+      "loss": 4.8848,
+      "step": 792
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.02903,
+      "grad_norm": 0.13113276660442352,
+      "learning_rate": 5.6104715843708e-06,
+      "loss": 4.4883,
+      "step": 793
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 1.03033,
+      "grad_norm": 0.13856875896453857,
+      "learning_rate": 5.599221509052844e-06,
+      "loss": 4.5146,
+      "step": 794
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.03163,
+      "grad_norm": 0.13490043580532074,
+      "learning_rate": 5.587968355083654e-06,
+      "loss": 4.5322,
+      "step": 795
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.03293,
+      "grad_norm": 0.12667182087898254,
+      "learning_rate": 5.576712180279134e-06,
+      "loss": 4.5234,
+      "step": 796
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.03423,
+      "grad_norm": 0.12577715516090393,
+      "learning_rate": 5.565453042470717e-06,
+      "loss": 4.5273,
+      "step": 797
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.03552,
+      "grad_norm": 0.1268448680639267,
+      "learning_rate": 5.5541909995050554e-06,
+      "loss": 4.7002,
+      "step": 798
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8140,
+      "epoch": 1.03682,
+      "grad_norm": 0.12386433035135269,
+      "learning_rate": 5.542926109243727e-06,
+      "loss": 4.5459,
+      "step": 799
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.03812,
+      "grad_norm": 0.13546870648860931,
+      "learning_rate": 5.53165842956294e-06,
+      "loss": 4.6445,
+      "step": 800
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.03942,
+      "grad_norm": 0.12320155650377274,
+      "learning_rate": 5.520388018353233e-06,
+      "loss": 4.2441,
+      "step": 801
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 1.04071,
+      "grad_norm": 0.14361971616744995,
+      "learning_rate": 5.509114933519179e-06,
+      "loss": 4.4756,
+      "step": 802
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.04201,
+      "grad_norm": 0.13132131099700928,
+      "learning_rate": 5.497839232979084e-06,
+      "loss": 4.2627,
+      "step": 803
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.04331,
+      "grad_norm": 0.1350114941596985,
+      "learning_rate": 5.4865609746647e-06,
+      "loss": 4.957,
+      "step": 804
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.04461,
+      "grad_norm": 0.12061459571123123,
+      "learning_rate": 5.475280216520913e-06,
+      "loss": 4.4453,
+      "step": 805
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.0459,
+      "grad_norm": 0.12335465103387833,
+      "learning_rate": 5.463997016505459e-06,
+      "loss": 4.1699,
+      "step": 806
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 1.0472,
+      "grad_norm": 0.12723498046398163,
+      "learning_rate": 5.4527114325886145e-06,
+      "loss": 4.1455,
+      "step": 807
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.0485,
+      "grad_norm": 0.13448134064674377,
+      "learning_rate": 5.441423522752904e-06,
+      "loss": 4.625,
+      "step": 808
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.0498,
+      "grad_norm": 0.12482011318206787,
+      "learning_rate": 5.430133344992807e-06,
+      "loss": 4.5391,
+      "step": 809
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8190,
+      "epoch": 1.05109,
+      "grad_norm": 0.12930616736412048,
+      "learning_rate": 5.418840957314451e-06,
+      "loss": 4.1719,
+      "step": 810
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8189,
+      "epoch": 1.05239,
+      "grad_norm": 0.13999834656715393,
+      "learning_rate": 5.4075464177353165e-06,
+      "loss": 4.7783,
+      "step": 811
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.05369,
+      "grad_norm": 0.13308821618556976,
+      "learning_rate": 5.396249784283943e-06,
+      "loss": 4.3184,
+      "step": 812
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 1.05499,
+      "grad_norm": 0.12184661626815796,
+      "learning_rate": 5.3849511149996255e-06,
+      "loss": 4.5986,
+      "step": 813
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 1.05629,
+      "grad_norm": 0.13552594184875488,
+      "learning_rate": 5.373650467932122e-06,
+      "loss": 4.4873,
+      "step": 814
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.05758,
+      "grad_norm": 0.12386645376682281,
+      "learning_rate": 5.362347901141348e-06,
+      "loss": 4.4834,
+      "step": 815
+    },
+    {
+      "batch_num_effect_tokens": 8005,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.05888,
+      "grad_norm": 0.12549902498722076,
+      "learning_rate": 5.351043472697082e-06,
+      "loss": 4.9111,
+      "step": 816
+    },
+    {
+      "batch_num_effect_tokens": 7901,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8128,
+      "epoch": 1.06018,
+      "grad_norm": 0.12777559459209442,
+      "learning_rate": 5.339737240678671e-06,
+      "loss": 4.4678,
+      "step": 817
+    },
+    {
+      "batch_num_effect_tokens": 7908,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8112,
+      "epoch": 1.06148,
+      "grad_norm": 0.14045743644237518,
+      "learning_rate": 5.328429263174725e-06,
+      "loss": 4.4395,
+      "step": 818
+    },
+    {
+      "batch_num_effect_tokens": 7903,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8125,
+      "epoch": 1.06277,
+      "grad_norm": 0.13065417110919952,
+      "learning_rate": 5.317119598282823e-06,
+      "loss": 4.668,
+      "step": 819
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8074,
+      "epoch": 1.06407,
+      "grad_norm": 0.1203104555606842,
+      "learning_rate": 5.3058083041092145e-06,
+      "loss": 4.0527,
+      "step": 820
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8192,
+      "epoch": 1.06537,
+      "grad_norm": 0.1304715871810913,
+      "learning_rate": 5.294495438768517e-06,
+      "loss": 4.2881,
+      "step": 821
+    },
+    {
+      "batch_num_effect_tokens": 7990,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8155,
+      "epoch": 1.06667,
+      "grad_norm": 0.1247849240899086,
+      "learning_rate": 5.283181060383423e-06,
+      "loss": 4.2393,
+      "step": 822
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8166,
+      "epoch": 1.06796,
+      "grad_norm": 0.11411383748054504,
+      "learning_rate": 5.271865227084397e-06,
+      "loss": 4.7168,
+      "step": 823
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.06926,
+      "grad_norm": 0.1270046830177307,
+      "learning_rate": 5.260547997009379e-06,
+      "loss": 4.5264,
+      "step": 824
+    },
+    {
+      "batch_num_effect_tokens": 7917,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8079,
+      "epoch": 1.07056,
+      "grad_norm": 0.12649409472942352,
+      "learning_rate": 5.249229428303486e-06,
+      "loss": 4.5293,
+      "step": 825
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.07186,
+      "grad_norm": 0.1309773325920105,
+      "learning_rate": 5.237909579118713e-06,
+      "loss": 4.2207,
+      "step": 826
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.07315,
+      "grad_norm": 0.12407130748033524,
+      "learning_rate": 5.226588507613629e-06,
+      "loss": 4.4414,
+      "step": 827
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8152,
+      "epoch": 1.07445,
+      "grad_norm": 0.12215977907180786,
+      "learning_rate": 5.21526627195309e-06,
+      "loss": 4.9326,
+      "step": 828
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8126,
+      "epoch": 1.07575,
+      "grad_norm": 0.12580524384975433,
+      "learning_rate": 5.2039429303079294e-06,
+      "loss": 4.4629,
+      "step": 829
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.07705,
+      "grad_norm": 0.12904728949069977,
+      "learning_rate": 5.1926185408546604e-06,
+      "loss": 4.3467,
+      "step": 830
+    },
+    {
+      "batch_num_effect_tokens": 7957,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 1.07835,
+      "grad_norm": 0.13490159809589386,
+      "learning_rate": 5.181293161775186e-06,
+      "loss": 4.4609,
+      "step": 831
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8133,
+      "epoch": 1.07964,
+      "grad_norm": 0.13504654169082642,
+      "learning_rate": 5.169966851256489e-06,
+      "loss": 4.2334,
+      "step": 832
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.08094,
+      "grad_norm": 0.1325363963842392,
+      "learning_rate": 5.15863966749034e-06,
+      "loss": 4.5322,
+      "step": 833
+    },
+    {
+      "batch_num_effect_tokens": 8060,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.08224,
+      "grad_norm": 0.13651366531848907,
+      "learning_rate": 5.147311668672991e-06,
+      "loss": 4.6211,
+      "step": 834
+    },
+    {
+      "batch_num_effect_tokens": 7933,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8092,
+      "epoch": 1.08354,
+      "grad_norm": 0.15273743867874146,
+      "learning_rate": 5.135982913004889e-06,
+      "loss": 4.9326,
+      "step": 835
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.08483,
+      "grad_norm": 0.12992307543754578,
+      "learning_rate": 5.1246534586903655e-06,
+      "loss": 4.623,
+      "step": 836
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.08613,
+      "grad_norm": 0.13165108859539032,
+      "learning_rate": 5.11332336393734e-06,
+      "loss": 4.5762,
+      "step": 837
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.08743,
+      "grad_norm": 0.125563845038414,
+      "learning_rate": 5.101992686957028e-06,
+      "loss": 4.0518,
+      "step": 838
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.08873,
+      "grad_norm": 0.12698976695537567,
+      "learning_rate": 5.090661485963628e-06,
+      "loss": 4.3701,
+      "step": 839
+    },
+    {
+      "batch_num_effect_tokens": 7922,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8080,
+      "epoch": 1.09002,
+      "grad_norm": 0.12403866648674011,
+      "learning_rate": 5.07932981917404e-06,
+      "loss": 4.1211,
+      "step": 840
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.09132,
+      "grad_norm": 0.12252990156412125,
+      "learning_rate": 5.06799774480755e-06,
+      "loss": 4.2109,
+      "step": 841
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8189,
+      "epoch": 1.09262,
+      "grad_norm": 0.11591480672359467,
+      "learning_rate": 5.056665321085542e-06,
+      "loss": 4.6582,
+      "step": 842
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.09392,
+      "grad_norm": 0.12528832256793976,
+      "learning_rate": 5.045332606231191e-06,
+      "loss": 4.5166,
+      "step": 843
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.09521,
+      "grad_norm": 0.1252938210964203,
+      "learning_rate": 5.033999658469174e-06,
+      "loss": 4.1709,
+      "step": 844
+    },
+    {
+      "batch_num_effect_tokens": 7936,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 1.09651,
+      "grad_norm": 0.12733793258666992,
+      "learning_rate": 5.022666536025359e-06,
+      "loss": 4.4521,
+      "step": 845
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.09781,
+      "grad_norm": 0.12023656815290451,
+      "learning_rate": 5.011333297126513e-06,
+      "loss": 4.3408,
+      "step": 846
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8171,
+      "epoch": 1.09911,
+      "grad_norm": 0.11743319034576416,
+      "learning_rate": 5e-06,
+      "loss": 4.5605,
+      "step": 847
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.10041,
+      "grad_norm": 0.12908881902694702,
+      "learning_rate": 4.98866670287349e-06,
+      "loss": 4.5234,
+      "step": 848
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8152,
+      "epoch": 1.1017,
+      "grad_norm": 0.1330942064523697,
+      "learning_rate": 4.977333463974643e-06,
+      "loss": 4.6748,
+      "step": 849
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 1.103,
+      "grad_norm": 0.13068662583827972,
+      "learning_rate": 4.966000341530827e-06,
+      "loss": 4.543,
+      "step": 850
+    },
+    {
+      "batch_num_effect_tokens": 7926,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8074,
+      "epoch": 1.1043,
+      "grad_norm": 0.12920166552066803,
+      "learning_rate": 4.9546673937688086e-06,
+      "loss": 4.1533,
+      "step": 851
+    },
+    {
+      "batch_num_effect_tokens": 7905,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8083,
+      "epoch": 1.1056,
+      "grad_norm": 0.12737612426280975,
+      "learning_rate": 4.94333467891446e-06,
+      "loss": 4.5225,
+      "step": 852
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.10689,
+      "grad_norm": 0.11825668811798096,
+      "learning_rate": 4.932002255192452e-06,
+      "loss": 4.542,
+      "step": 853
+    },
+    {
+      "batch_num_effect_tokens": 7872,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8086,
+      "epoch": 1.10819,
+      "grad_norm": 0.1468936651945114,
+      "learning_rate": 4.9206701808259605e-06,
+      "loss": 5.1279,
+      "step": 854
+    },
+    {
+      "batch_num_effect_tokens": 7952,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 1.10949,
+      "grad_norm": 0.1266355961561203,
+      "learning_rate": 4.909338514036373e-06,
+      "loss": 4.3125,
+      "step": 855
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.11079,
+      "grad_norm": 0.13342653214931488,
+      "learning_rate": 4.898007313042975e-06,
+      "loss": 4.3066,
+      "step": 856
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.11208,
+      "grad_norm": 0.12773488461971283,
+      "learning_rate": 4.8866766360626615e-06,
+      "loss": 4.4775,
+      "step": 857
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.11338,
+      "grad_norm": 0.13515350222587585,
+      "learning_rate": 4.875346541309637e-06,
+      "loss": 4.3096,
+      "step": 858
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8139,
+      "epoch": 1.11468,
+      "grad_norm": 0.13417378067970276,
+      "learning_rate": 4.864017086995112e-06,
+      "loss": 4.7119,
+      "step": 859
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.11598,
+      "grad_norm": 0.13580955564975739,
+      "learning_rate": 4.852688331327011e-06,
+      "loss": 4.8125,
+      "step": 860
+    },
+    {
+      "batch_num_effect_tokens": 7911,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8144,
+      "epoch": 1.11727,
+      "grad_norm": 0.12650568783283234,
+      "learning_rate": 4.841360332509663e-06,
+      "loss": 4.5361,
+      "step": 861
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.11857,
+      "grad_norm": 0.1183309480547905,
+      "learning_rate": 4.830033148743512e-06,
+      "loss": 4.0146,
+      "step": 862
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 1.11987,
+      "grad_norm": 0.126793771982193,
+      "learning_rate": 4.818706838224815e-06,
+      "loss": 4.502,
+      "step": 863
+    },
+    {
+      "batch_num_effect_tokens": 7941,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8073,
+      "epoch": 1.12117,
+      "grad_norm": 0.125539630651474,
+      "learning_rate": 4.8073814591453395e-06,
+      "loss": 4.2773,
+      "step": 864
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8176,
+      "epoch": 1.12247,
+      "grad_norm": 0.13130030035972595,
+      "learning_rate": 4.796057069692073e-06,
+      "loss": 4.6611,
+      "step": 865
+    },
+    {
+      "batch_num_effect_tokens": 7915,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8090,
+      "epoch": 1.12376,
+      "grad_norm": 0.12117452919483185,
+      "learning_rate": 4.784733728046912e-06,
+      "loss": 4.3535,
+      "step": 866
+    },
+    {
+      "batch_num_effect_tokens": 7944,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8096,
+      "epoch": 1.12506,
+      "grad_norm": 0.12501753866672516,
+      "learning_rate": 4.773411492386372e-06,
+      "loss": 4.7051,
+      "step": 867
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.12636,
+      "grad_norm": 0.12537001073360443,
+      "learning_rate": 4.762090420881289e-06,
+      "loss": 4.7852,
+      "step": 868
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.12766,
+      "grad_norm": 0.11610813438892365,
+      "learning_rate": 4.750770571696514e-06,
+      "loss": 4.1914,
+      "step": 869
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.12895,
+      "grad_norm": 0.12504693865776062,
+      "learning_rate": 4.739452002990621e-06,
+      "loss": 3.9814,
+      "step": 870
+    },
+    {
+      "batch_num_effect_tokens": 7964,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8143,
+      "epoch": 1.13025,
+      "grad_norm": 0.122451052069664,
+      "learning_rate": 4.728134772915605e-06,
+      "loss": 4.2246,
+      "step": 871
+    },
+    {
+      "batch_num_effect_tokens": 7910,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8089,
+      "epoch": 1.13155,
+      "grad_norm": 0.1333150565624237,
+      "learning_rate": 4.716818939616578e-06,
+      "loss": 4.5938,
+      "step": 872
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.13285,
+      "grad_norm": 0.13619881868362427,
+      "learning_rate": 4.705504561231485e-06,
+      "loss": 4.6348,
+      "step": 873
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.13414,
+      "grad_norm": 0.1358458697795868,
+      "learning_rate": 4.694191695890788e-06,
+      "loss": 4.4453,
+      "step": 874
+    },
+    {
+      "batch_num_effect_tokens": 7844,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8080,
+      "epoch": 1.13544,
+      "grad_norm": 0.12052467465400696,
+      "learning_rate": 4.682880401717178e-06,
+      "loss": 4.4482,
+      "step": 875
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.13674,
+      "grad_norm": 0.11595940589904785,
+      "learning_rate": 4.671570736825277e-06,
+      "loss": 4.1816,
+      "step": 876
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.13804,
+      "grad_norm": 0.1310449242591858,
+      "learning_rate": 4.660262759321331e-06,
+      "loss": 4.4414,
+      "step": 877
+    },
+    {
+      "batch_num_effect_tokens": 7966,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8092,
+      "epoch": 1.13933,
+      "grad_norm": 0.1393718123435974,
+      "learning_rate": 4.6489565273029196e-06,
+      "loss": 4.3857,
+      "step": 878
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.14063,
+      "grad_norm": 0.13982267677783966,
+      "learning_rate": 4.637652098858655e-06,
+      "loss": 4.4727,
+      "step": 879
+    },
+    {
+      "batch_num_effect_tokens": 7863,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8073,
+      "epoch": 1.14193,
+      "grad_norm": 0.14085061848163605,
+      "learning_rate": 4.626349532067879e-06,
+      "loss": 4.707,
+      "step": 880
+    },
+    {
+      "batch_num_effect_tokens": 8078,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.14323,
+      "grad_norm": 0.13555698096752167,
+      "learning_rate": 4.615048885000375e-06,
+      "loss": 4.4424,
+      "step": 881
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8188,
+      "epoch": 1.14453,
+      "grad_norm": 0.13564865291118622,
+      "learning_rate": 4.603750215716057e-06,
+      "loss": 4.2158,
+      "step": 882
+    },
+    {
+      "batch_num_effect_tokens": 8066,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.14582,
+      "grad_norm": 0.13119758665561676,
+      "learning_rate": 4.592453582264684e-06,
+      "loss": 4.6748,
+      "step": 883
+    },
+    {
+      "batch_num_effect_tokens": 7983,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8137,
+      "epoch": 1.14712,
+      "grad_norm": 0.1268366128206253,
+      "learning_rate": 4.581159042685552e-06,
+      "loss": 4.3467,
+      "step": 884
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8161,
+      "epoch": 1.14842,
+      "grad_norm": 0.12466870993375778,
+      "learning_rate": 4.569866655007193e-06,
+      "loss": 4.7676,
+      "step": 885
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.14972,
+      "grad_norm": 0.13537685573101044,
+      "learning_rate": 4.558576477247097e-06,
+      "loss": 4.1719,
+      "step": 886
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8114,
+      "epoch": 1.15101,
+      "grad_norm": 0.12638984620571136,
+      "learning_rate": 4.547288567411388e-06,
+      "loss": 4.3047,
+      "step": 887
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.15231,
+      "grad_norm": 0.12655675411224365,
+      "learning_rate": 4.5360029834945425e-06,
+      "loss": 4.5449,
+      "step": 888
+    },
+    {
+      "batch_num_effect_tokens": 8071,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.15361,
+      "grad_norm": 0.13120026886463165,
+      "learning_rate": 4.524719783479088e-06,
+      "loss": 4.3848,
+      "step": 889
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.15491,
+      "grad_norm": 0.12158391624689102,
+      "learning_rate": 4.513439025335302e-06,
+      "loss": 4.4844,
+      "step": 890
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 1.1562,
+      "grad_norm": 0.13134336471557617,
+      "learning_rate": 4.502160767020918e-06,
+      "loss": 5.5039,
+      "step": 891
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.1575,
+      "grad_norm": 0.12917788326740265,
+      "learning_rate": 4.4908850664808245e-06,
+      "loss": 4.6094,
+      "step": 892
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8188,
+      "epoch": 1.1588,
+      "grad_norm": 0.13100962340831757,
+      "learning_rate": 4.4796119816467685e-06,
+      "loss": 4.3867,
+      "step": 893
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.1601,
+      "grad_norm": 0.13368336856365204,
+      "learning_rate": 4.468341570437061e-06,
+      "loss": 4.4697,
+      "step": 894
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.16139,
+      "grad_norm": 0.12571601569652557,
+      "learning_rate": 4.457073890756273e-06,
+      "loss": 4.249,
+      "step": 895
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.16269,
+      "grad_norm": 0.1370954066514969,
+      "learning_rate": 4.445809000494945e-06,
+      "loss": 4.5898,
+      "step": 896
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8163,
+      "epoch": 1.16399,
+      "grad_norm": 0.12560419738292694,
+      "learning_rate": 4.434546957529283e-06,
+      "loss": 4.3018,
+      "step": 897
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.16529,
+      "grad_norm": 0.1281665414571762,
+      "learning_rate": 4.423287819720866e-06,
+      "loss": 4.2051,
+      "step": 898
+    },
+    {
+      "batch_num_effect_tokens": 8076,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.16659,
+      "grad_norm": 0.13026204705238342,
+      "learning_rate": 4.412031644916348e-06,
+      "loss": 4.3184,
+      "step": 899
+    },
+    {
+      "batch_num_effect_tokens": 7923,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8124,
+      "epoch": 1.16788,
+      "grad_norm": 0.1349964737892151,
+      "learning_rate": 4.400778490947157e-06,
+      "loss": 4.7354,
+      "step": 900
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.16918,
+      "grad_norm": 0.13438257575035095,
+      "learning_rate": 4.389528415629201e-06,
+      "loss": 4.5127,
+      "step": 901
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.17048,
+      "grad_norm": 0.12244999408721924,
+      "learning_rate": 4.3782814767625755e-06,
+      "loss": 4.752,
+      "step": 902
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.17178,
+      "grad_norm": 0.12824036180973053,
+      "learning_rate": 4.367037732131254e-06,
+      "loss": 4.1963,
+      "step": 903
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.17307,
+      "grad_norm": 0.13308383524417877,
+      "learning_rate": 4.355797239502807e-06,
+      "loss": 4.3115,
+      "step": 904
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.17437,
+      "grad_norm": 0.12866966426372528,
+      "learning_rate": 4.34456005662809e-06,
+      "loss": 4.7246,
+      "step": 905
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.17567,
+      "grad_norm": 0.12427198141813278,
+      "learning_rate": 4.3333262412409575e-06,
+      "loss": 4.4072,
+      "step": 906
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.17697,
+      "grad_norm": 0.12111925333738327,
+      "learning_rate": 4.322095851057962e-06,
+      "loss": 4.4688,
+      "step": 907
+    },
+    {
+      "batch_num_effect_tokens": 8005,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8170,
+      "epoch": 1.17826,
+      "grad_norm": 0.13822191953659058,
+      "learning_rate": 4.310868943778057e-06,
+      "loss": 4.6992,
+      "step": 908
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8100,
+      "epoch": 1.17956,
+      "grad_norm": 0.13831211626529694,
+      "learning_rate": 4.299645577082305e-06,
+      "loss": 4.749,
+      "step": 909
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.18086,
+      "grad_norm": 0.11694054305553436,
+      "learning_rate": 4.2884258086335755e-06,
+      "loss": 4.2002,
+      "step": 910
+    },
+    {
+      "batch_num_effect_tokens": 7919,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 1.18216,
+      "grad_norm": 0.13560707867145538,
+      "learning_rate": 4.277209696076248e-06,
+      "loss": 4.751,
+      "step": 911
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.18345,
+      "grad_norm": 0.11765341460704803,
+      "learning_rate": 4.265997297035926e-06,
+      "loss": 4.2842,
+      "step": 912
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8190,
+      "epoch": 1.18475,
+      "grad_norm": 0.13359101116657257,
+      "learning_rate": 4.254788669119127e-06,
+      "loss": 4.832,
+      "step": 913
+    },
+    {
+      "batch_num_effect_tokens": 8069,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.18605,
+      "grad_norm": 0.13347217440605164,
+      "learning_rate": 4.243583869913e-06,
+      "loss": 4.5137,
+      "step": 914
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.18735,
+      "grad_norm": 0.12483620643615723,
+      "learning_rate": 4.232382956985017e-06,
+      "loss": 4.1416,
+      "step": 915
+    },
+    {
+      "batch_num_effect_tokens": 7913,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8089,
+      "epoch": 1.18865,
+      "grad_norm": 0.1381087601184845,
+      "learning_rate": 4.221185987882684e-06,
+      "loss": 4.5977,
+      "step": 916
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.18994,
+      "grad_norm": 0.15423673391342163,
+      "learning_rate": 4.209993020133251e-06,
+      "loss": 4.1279,
+      "step": 917
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8122,
+      "epoch": 1.19124,
+      "grad_norm": 0.14219102263450623,
+      "learning_rate": 4.198804111243403e-06,
+      "loss": 4.1191,
+      "step": 918
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8144,
+      "epoch": 1.19254,
+      "grad_norm": 0.1339540034532547,
+      "learning_rate": 4.187619318698971e-06,
+      "loss": 4.3799,
+      "step": 919
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.19384,
+      "grad_norm": 0.1272207647562027,
+      "learning_rate": 4.176438699964646e-06,
+      "loss": 4.2256,
+      "step": 920
+    },
+    {
+      "batch_num_effect_tokens": 7961,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8122,
+      "epoch": 1.19513,
+      "grad_norm": 0.1126057505607605,
+      "learning_rate": 4.165262312483664e-06,
+      "loss": 4.7441,
+      "step": 921
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8184,
+      "epoch": 1.19643,
+      "grad_norm": 0.13080507516860962,
+      "learning_rate": 4.154090213677531e-06,
+      "loss": 4.5449,
+      "step": 922
+    },
+    {
+      "batch_num_effect_tokens": 7840,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8006,
+      "epoch": 1.19773,
+      "grad_norm": 0.13032877445220947,
+      "learning_rate": 4.1429224609457135e-06,
+      "loss": 4.7578,
+      "step": 923
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8114,
+      "epoch": 1.19903,
+      "grad_norm": 0.12347196787595749,
+      "learning_rate": 4.131759111665349e-06,
+      "loss": 4.6602,
+      "step": 924
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8114,
+      "epoch": 1.19903,
+      "eval_eval_loss": 0.5730312466621399,
+      "eval_eval_runtime": 115.3354,
+      "eval_eval_samples_per_second": 43.352,
+      "eval_eval_steps_per_second": 2.714,
+      "step": 924
+    },
+    {
+      "batch_num_effect_tokens": 7754,
+      "batch_num_samples": 30,
+      "batch_num_tokens": 8011,
+      "epoch": 1.20032,
+      "grad_norm": 0.12765365839004517,
+      "learning_rate": 4.120600223190955e-06,
+      "loss": 4.1592,
+      "step": 925
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.20162,
+      "grad_norm": 0.13900645077228546,
+      "learning_rate": 4.109445852854125e-06,
+      "loss": 4.457,
+      "step": 926
+    },
+    {
+      "batch_num_effect_tokens": 7980,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8134,
+      "epoch": 1.20292,
+      "grad_norm": 0.12310691922903061,
+      "learning_rate": 4.098296057963246e-06,
+      "loss": 4.6924,
+      "step": 927
+    },
+    {
+      "batch_num_effect_tokens": 7909,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8087,
+      "epoch": 1.20422,
+      "grad_norm": 0.1377318799495697,
+      "learning_rate": 4.087150895803192e-06,
+      "loss": 4.085,
+      "step": 928
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8188,
+      "epoch": 1.20552,
+      "grad_norm": 0.14090469479560852,
+      "learning_rate": 4.076010423635037e-06,
+      "loss": 4.8594,
+      "step": 929
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.20681,
+      "grad_norm": 0.12155922502279282,
+      "learning_rate": 4.064874698695761e-06,
+      "loss": 4.3281,
+      "step": 930
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.20811,
+      "grad_norm": 0.13325005769729614,
+      "learning_rate": 4.053743778197951e-06,
+      "loss": 4.668,
+      "step": 931
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.20941,
+      "grad_norm": 0.1313610076904297,
+      "learning_rate": 4.042617719329507e-06,
+      "loss": 4.7402,
+      "step": 932
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.21071,
+      "grad_norm": 0.1211569532752037,
+      "learning_rate": 4.0314965792533635e-06,
+      "loss": 4.4873,
+      "step": 933
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8191,
+      "epoch": 1.212,
+      "grad_norm": 0.12477646768093109,
+      "learning_rate": 4.020380415107167e-06,
+      "loss": 4.4355,
+      "step": 934
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.2133,
+      "grad_norm": 0.1258799135684967,
+      "learning_rate": 4.009269284003014e-06,
+      "loss": 4.6689,
+      "step": 935
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 1.2146,
+      "grad_norm": 0.1279737502336502,
+      "learning_rate": 3.99816324302713e-06,
+      "loss": 4.0469,
+      "step": 936
+    },
+    {
+      "batch_num_effect_tokens": 7933,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8122,
+      "epoch": 1.2159,
+      "grad_norm": 0.12206988781690598,
+      "learning_rate": 3.987062349239596e-06,
+      "loss": 4.1738,
+      "step": 937
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.21719,
+      "grad_norm": 0.1263066828250885,
+      "learning_rate": 3.975966659674048e-06,
+      "loss": 4.3438,
+      "step": 938
+    },
+    {
+      "batch_num_effect_tokens": 8066,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.21849,
+      "grad_norm": 0.1253357082605362,
+      "learning_rate": 3.964876231337379e-06,
+      "loss": 4.2725,
+      "step": 939
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8128,
+      "epoch": 1.21979,
+      "grad_norm": 0.146712064743042,
+      "learning_rate": 3.953791121209458e-06,
+      "loss": 4.1543,
+      "step": 940
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8155,
+      "epoch": 1.22109,
+      "grad_norm": 0.13425204157829285,
+      "learning_rate": 3.942711386242826e-06,
+      "loss": 4.8652,
+      "step": 941
+    },
+    {
+      "batch_num_effect_tokens": 7906,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8079,
+      "epoch": 1.22238,
+      "grad_norm": 0.13137395679950714,
+      "learning_rate": 3.931637083362405e-06,
+      "loss": 4.0664,
+      "step": 942
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.22368,
+      "grad_norm": 0.12695470452308655,
+      "learning_rate": 3.920568269465216e-06,
+      "loss": 4.3643,
+      "step": 943
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8144,
+      "epoch": 1.22498,
+      "grad_norm": 0.12561850249767303,
+      "learning_rate": 3.909505001420072e-06,
+      "loss": 4.4287,
+      "step": 944
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.22628,
+      "grad_norm": 0.13399529457092285,
+      "learning_rate": 3.898447336067297e-06,
+      "loss": 4.2666,
+      "step": 945
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8182,
+      "epoch": 1.22758,
+      "grad_norm": 0.1406509280204773,
+      "learning_rate": 3.887395330218429e-06,
+      "loss": 4.4023,
+      "step": 946
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.22887,
+      "grad_norm": 0.12866802513599396,
+      "learning_rate": 3.876349040655925e-06,
+      "loss": 4.2334,
+      "step": 947
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8163,
+      "epoch": 1.23017,
+      "grad_norm": 0.1247694343328476,
+      "learning_rate": 3.86530852413288e-06,
+      "loss": 4.5361,
+      "step": 948
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.23147,
+      "grad_norm": 0.12325099110603333,
+      "learning_rate": 3.854273837372724e-06,
+      "loss": 4.2549,
+      "step": 949
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.23277,
+      "grad_norm": 0.1377268135547638,
+      "learning_rate": 3.843245037068932e-06,
+      "loss": 4.5361,
+      "step": 950
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.23406,
+      "grad_norm": 0.1305629014968872,
+      "learning_rate": 3.832222179884747e-06,
+      "loss": 4.4932,
+      "step": 951
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.23536,
+      "grad_norm": 0.1270604133605957,
+      "learning_rate": 3.821205322452863e-06,
+      "loss": 4.877,
+      "step": 952
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8106,
+      "epoch": 1.23666,
+      "grad_norm": 0.1398427039384842,
+      "learning_rate": 3.8101945213751635e-06,
+      "loss": 4.583,
+      "step": 953
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.23796,
+      "grad_norm": 0.12297014147043228,
+      "learning_rate": 3.799189833222404e-06,
+      "loss": 4.4424,
+      "step": 954
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8107,
+      "epoch": 1.23925,
+      "grad_norm": 0.1305890530347824,
+      "learning_rate": 3.7881913145339387e-06,
+      "loss": 4.5752,
+      "step": 955
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.24055,
+      "grad_norm": 0.1272619217634201,
+      "learning_rate": 3.777199021817426e-06,
+      "loss": 4.9912,
+      "step": 956
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.24185,
+      "grad_norm": 0.13338087499141693,
+      "learning_rate": 3.7662130115485317e-06,
+      "loss": 4.6572,
+      "step": 957
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.24315,
+      "grad_norm": 0.13084988296031952,
+      "learning_rate": 3.7552333401706508e-06,
+      "loss": 4.0723,
+      "step": 958
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.24444,
+      "grad_norm": 0.12794189155101776,
+      "learning_rate": 3.7442600640946045e-06,
+      "loss": 4.2676,
+      "step": 959
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.24574,
+      "grad_norm": 0.1372474730014801,
+      "learning_rate": 3.733293239698359e-06,
+      "loss": 4.6777,
+      "step": 960
+    },
+    {
+      "batch_num_effect_tokens": 7919,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8081,
+      "epoch": 1.24704,
+      "grad_norm": 0.14294537901878357,
+      "learning_rate": 3.7223329233267354e-06,
+      "loss": 4.585,
+      "step": 961
+    },
+    {
+      "batch_num_effect_tokens": 7930,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8086,
+      "epoch": 1.24834,
+      "grad_norm": 0.13273997604846954,
+      "learning_rate": 3.711379171291115e-06,
+      "loss": 4.2793,
+      "step": 962
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.24964,
+      "grad_norm": 0.13120055198669434,
+      "learning_rate": 3.7004320398691507e-06,
+      "loss": 4.5068,
+      "step": 963
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.25093,
+      "grad_norm": 0.12945452332496643,
+      "learning_rate": 3.689491585304491e-06,
+      "loss": 4.873,
+      "step": 964
+    },
+    {
+      "batch_num_effect_tokens": 7897,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8112,
+      "epoch": 1.25223,
+      "grad_norm": 0.13530410826206207,
+      "learning_rate": 3.6785578638064655e-06,
+      "loss": 4.6064,
+      "step": 965
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.25353,
+      "grad_norm": 0.12716589868068695,
+      "learning_rate": 3.667630931549826e-06,
+      "loss": 4.0967,
+      "step": 966
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.25483,
+      "grad_norm": 0.12516002357006073,
+      "learning_rate": 3.6567108446744314e-06,
+      "loss": 4.5811,
+      "step": 967
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.25612,
+      "grad_norm": 0.13172359764575958,
+      "learning_rate": 3.6457976592849753e-06,
+      "loss": 4.4248,
+      "step": 968
+    },
+    {
+      "batch_num_effect_tokens": 7955,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8167,
+      "epoch": 1.25742,
+      "grad_norm": 0.12710091471672058,
+      "learning_rate": 3.6348914314506944e-06,
+      "loss": 4.167,
+      "step": 969
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.25872,
+      "grad_norm": 0.1424860805273056,
+      "learning_rate": 3.623992217205075e-06,
+      "loss": 4.3926,
+      "step": 970
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.26002,
+      "grad_norm": 0.12454716116189957,
+      "learning_rate": 3.6131000725455756e-06,
+      "loss": 4.5156,
+      "step": 971
+    },
+    {
+      "batch_num_effect_tokens": 7839,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8080,
+      "epoch": 1.26131,
+      "grad_norm": 0.13083425164222717,
+      "learning_rate": 3.6022150534333267e-06,
+      "loss": 4.667,
+      "step": 972
+    },
+    {
+      "batch_num_effect_tokens": 7960,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8160,
+      "epoch": 1.26261,
+      "grad_norm": 0.13552185893058777,
+      "learning_rate": 3.5913372157928515e-06,
+      "loss": 4.8428,
+      "step": 973
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8128,
+      "epoch": 1.26391,
+      "grad_norm": 0.119549959897995,
+      "learning_rate": 3.5804666155117807e-06,
+      "loss": 4.6348,
+      "step": 974
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.26521,
+      "grad_norm": 0.12790077924728394,
+      "learning_rate": 3.5696033084405535e-06,
+      "loss": 4.1328,
+      "step": 975
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.2665,
+      "grad_norm": 0.12213937938213348,
+      "learning_rate": 3.558747350392146e-06,
+      "loss": 4.584,
+      "step": 976
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.2678,
+      "grad_norm": 0.12294816225767136,
+      "learning_rate": 3.5478987971417723e-06,
+      "loss": 4.5674,
+      "step": 977
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8184,
+      "epoch": 1.2691,
+      "grad_norm": 0.12941895425319672,
+      "learning_rate": 3.537057704426602e-06,
+      "loss": 4.6514,
+      "step": 978
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.2704,
+      "grad_norm": 0.11868005990982056,
+      "learning_rate": 3.526224127945479e-06,
+      "loss": 4.3252,
+      "step": 979
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 1.2717,
+      "grad_norm": 0.13335098326206207,
+      "learning_rate": 3.5153981233586277e-06,
+      "loss": 4.7188,
+      "step": 980
+    },
+    {
+      "batch_num_effect_tokens": 7880,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8062,
+      "epoch": 1.27299,
+      "grad_norm": 0.11686256527900696,
+      "learning_rate": 3.5045797462873643e-06,
+      "loss": 4.2773,
+      "step": 981
+    },
+    {
+      "batch_num_effect_tokens": 7808,
+      "batch_num_samples": 30,
+      "batch_num_tokens": 8082,
+      "epoch": 1.27429,
+      "grad_norm": 0.13790954649448395,
+      "learning_rate": 3.4937690523138302e-06,
+      "loss": 4.5752,
+      "step": 982
+    },
+    {
+      "batch_num_effect_tokens": 8003,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.27559,
+      "grad_norm": 0.1367255300283432,
+      "learning_rate": 3.4829660969806776e-06,
+      "loss": 4.6543,
+      "step": 983
+    },
+    {
+      "batch_num_effect_tokens": 7947,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8116,
+      "epoch": 1.27689,
+      "grad_norm": 0.14571502804756165,
+      "learning_rate": 3.4721709357908146e-06,
+      "loss": 4.248,
+      "step": 984
+    },
+    {
+      "batch_num_effect_tokens": 7961,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8108,
+      "epoch": 1.27818,
+      "grad_norm": 0.12180175632238388,
+      "learning_rate": 3.461383624207092e-06,
+      "loss": 4.6895,
+      "step": 985
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8176,
+      "epoch": 1.27948,
+      "grad_norm": 0.1343916952610016,
+      "learning_rate": 3.4506042176520375e-06,
+      "loss": 4.3574,
+      "step": 986
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.28078,
+      "grad_norm": 0.13950027525424957,
+      "learning_rate": 3.439832771507565e-06,
+      "loss": 4.165,
+      "step": 987
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8146,
+      "epoch": 1.28208,
+      "grad_norm": 0.1476665586233139,
+      "learning_rate": 3.4290693411146882e-06,
+      "loss": 4.8047,
+      "step": 988
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8186,
+      "epoch": 1.28337,
+      "grad_norm": 0.13402019441127777,
+      "learning_rate": 3.418313981773238e-06,
+      "loss": 4.5215,
+      "step": 989
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8101,
+      "epoch": 1.28467,
+      "grad_norm": 0.1343563050031662,
+      "learning_rate": 3.4075667487415785e-06,
+      "loss": 4.3604,
+      "step": 990
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.28597,
+      "grad_norm": 0.13799680769443512,
+      "learning_rate": 3.3968276972363224e-06,
+      "loss": 4.3262,
+      "step": 991
+    },
+    {
+      "batch_num_effect_tokens": 7924,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8192,
+      "epoch": 1.28727,
+      "grad_norm": 0.13754983246326447,
+      "learning_rate": 3.3860968824320507e-06,
+      "loss": 4.3242,
+      "step": 992
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8188,
+      "epoch": 1.28856,
+      "grad_norm": 0.13641497492790222,
+      "learning_rate": 3.3753743594610216e-06,
+      "loss": 4.498,
+      "step": 993
+    },
+    {
+      "batch_num_effect_tokens": 8067,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.28986,
+      "grad_norm": 0.1407681405544281,
+      "learning_rate": 3.3646601834128924e-06,
+      "loss": 4.7393,
+      "step": 994
+    },
+    {
+      "batch_num_effect_tokens": 7816,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8041,
+      "epoch": 1.29116,
+      "grad_norm": 0.14411447942256927,
+      "learning_rate": 3.353954409334442e-06,
+      "loss": 4.3535,
+      "step": 995
+    },
+    {
+      "batch_num_effect_tokens": 7919,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8119,
+      "epoch": 1.29246,
+      "grad_norm": 0.13188710808753967,
+      "learning_rate": 3.3432570922292728e-06,
+      "loss": 4.2559,
+      "step": 996
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8186,
+      "epoch": 1.29376,
+      "grad_norm": 0.12662333250045776,
+      "learning_rate": 3.3325682870575478e-06,
+      "loss": 4.0684,
+      "step": 997
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8179,
+      "epoch": 1.29505,
+      "grad_norm": 0.12670519948005676,
+      "learning_rate": 3.3218880487356885e-06,
+      "loss": 4.5381,
+      "step": 998
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.29635,
+      "grad_norm": 0.13219594955444336,
+      "learning_rate": 3.3112164321361064e-06,
+      "loss": 4.7129,
+      "step": 999
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.29765,
+      "grad_norm": 0.12541687488555908,
+      "learning_rate": 3.3005534920869175e-06,
+      "loss": 4.333,
+      "step": 1000
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8160,
+      "epoch": 1.29895,
+      "grad_norm": 0.12586721777915955,
+      "learning_rate": 3.289899283371657e-06,
+      "loss": 4.4082,
+      "step": 1001
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.30024,
+      "grad_norm": 0.11721961200237274,
+      "learning_rate": 3.2792538607290036e-06,
+      "loss": 4.3291,
+      "step": 1002
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8130,
+      "epoch": 1.30154,
+      "grad_norm": 0.14944490790367126,
+      "learning_rate": 3.268617278852494e-06,
+      "loss": 4.0322,
+      "step": 1003
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8180,
+      "epoch": 1.30284,
+      "grad_norm": 0.13059593737125397,
+      "learning_rate": 3.257989592390241e-06,
+      "loss": 4.1982,
+      "step": 1004
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.30414,
+      "grad_norm": 0.12026825547218323,
+      "learning_rate": 3.2473708559446606e-06,
+      "loss": 4.377,
+      "step": 1005
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.30543,
+      "grad_norm": 0.1340772956609726,
+      "learning_rate": 3.2367611240721796e-06,
+      "loss": 4.5195,
+      "step": 1006
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 1.30673,
+      "grad_norm": 0.13199454545974731,
+      "learning_rate": 3.226160451282965e-06,
+      "loss": 4.3545,
+      "step": 1007
+    },
+    {
+      "batch_num_effect_tokens": 7976,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.30803,
+      "grad_norm": 0.11274772882461548,
+      "learning_rate": 3.2155688920406415e-06,
+      "loss": 4.1318,
+      "step": 1008
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.30933,
+      "grad_norm": 0.13221722841262817,
+      "learning_rate": 3.204986500762006e-06,
+      "loss": 4.499,
+      "step": 1009
+    },
+    {
+      "batch_num_effect_tokens": 7972,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8126,
+      "epoch": 1.31062,
+      "grad_norm": 0.13470391929149628,
+      "learning_rate": 3.194413331816759e-06,
+      "loss": 4.7568,
+      "step": 1010
+    },
+    {
+      "batch_num_effect_tokens": 7829,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8112,
+      "epoch": 1.31192,
+      "grad_norm": 0.1261770874261856,
+      "learning_rate": 3.1838494395272155e-06,
+      "loss": 4.3809,
+      "step": 1011
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.31322,
+      "grad_norm": 0.13255248963832855,
+      "learning_rate": 3.173294878168025e-06,
+      "loss": 4.3428,
+      "step": 1012
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8076,
+      "epoch": 1.31452,
+      "grad_norm": 0.13333898782730103,
+      "learning_rate": 3.162749701965907e-06,
+      "loss": 4.5859,
+      "step": 1013
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.31582,
+      "grad_norm": 0.13349847495555878,
+      "learning_rate": 3.152213965099352e-06,
+      "loss": 4.8818,
+      "step": 1014
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 1.31711,
+      "grad_norm": 0.12797629833221436,
+      "learning_rate": 3.141687721698363e-06,
+      "loss": 4.6836,
+      "step": 1015
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 1.31841,
+      "grad_norm": 0.1437963992357254,
+      "learning_rate": 3.1311710258441607e-06,
+      "loss": 4.3955,
+      "step": 1016
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 1.31971,
+      "grad_norm": 0.12287195771932602,
+      "learning_rate": 3.1206639315689154e-06,
+      "loss": 3.877,
+      "step": 1017
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.32101,
+      "grad_norm": 0.1235748901963234,
+      "learning_rate": 3.110166492855468e-06,
+      "loss": 4.1572,
+      "step": 1018
+    },
+    {
+      "batch_num_effect_tokens": 8071,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.3223,
+      "grad_norm": 0.1256420761346817,
+      "learning_rate": 3.0996787636370495e-06,
+      "loss": 4.1611,
+      "step": 1019
+    },
+    {
+      "batch_num_effect_tokens": 8071,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.3236,
+      "grad_norm": 0.13278773427009583,
+      "learning_rate": 3.0892007977970083e-06,
+      "loss": 4.5078,
+      "step": 1020
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8188,
+      "epoch": 1.3249,
+      "grad_norm": 0.130946084856987,
+      "learning_rate": 3.0787326491685287e-06,
+      "loss": 4.6865,
+      "step": 1021
+    },
+    {
+      "batch_num_effect_tokens": 7960,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.3262,
+      "grad_norm": 0.130752831697464,
+      "learning_rate": 3.0682743715343565e-06,
+      "loss": 4.5918,
+      "step": 1022
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8192,
+      "epoch": 1.32749,
+      "grad_norm": 0.13391391932964325,
+      "learning_rate": 3.057826018626527e-06,
+      "loss": 4.5469,
+      "step": 1023
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.32879,
+      "grad_norm": 0.13457229733467102,
+      "learning_rate": 3.0473876441260786e-06,
+      "loss": 4.7383,
+      "step": 1024
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.33009,
+      "grad_norm": 0.13133271038532257,
+      "learning_rate": 3.0369593016627867e-06,
+      "loss": 4.1289,
+      "step": 1025
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8098,
+      "epoch": 1.33139,
+      "grad_norm": 0.13519789278507233,
+      "learning_rate": 3.026541044814885e-06,
+      "loss": 4.1523,
+      "step": 1026
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8081,
+      "epoch": 1.33268,
+      "grad_norm": 0.137874037027359,
+      "learning_rate": 3.016132927108787e-06,
+      "loss": 4.8057,
+      "step": 1027
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8163,
+      "epoch": 1.33398,
+      "grad_norm": 0.11607379466295242,
+      "learning_rate": 3.005735002018818e-06,
+      "loss": 4.1748,
+      "step": 1028
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.33528,
+      "grad_norm": 0.12126877903938293,
+      "learning_rate": 2.995347322966933e-06,
+      "loss": 4.4434,
+      "step": 1029
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.33658,
+      "grad_norm": 0.12571828067302704,
+      "learning_rate": 2.9849699433224423e-06,
+      "loss": 4.2344,
+      "step": 1030
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.33788,
+      "grad_norm": 0.134991854429245,
+      "learning_rate": 2.974602916401751e-06,
+      "loss": 4.3867,
+      "step": 1031
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.33917,
+      "grad_norm": 0.13775356113910675,
+      "learning_rate": 2.9642462954680605e-06,
+      "loss": 4.7285,
+      "step": 1032
+    },
+    {
+      "batch_num_effect_tokens": 7844,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 7989,
+      "epoch": 1.34047,
+      "grad_norm": 0.12610451877117157,
+      "learning_rate": 2.9539001337311234e-06,
+      "loss": 4.1611,
+      "step": 1033
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.34177,
+      "grad_norm": 0.13606472313404083,
+      "learning_rate": 2.9435644843469434e-06,
+      "loss": 4.2061,
+      "step": 1034
+    },
+    {
+      "batch_num_effect_tokens": 7909,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8080,
+      "epoch": 1.34307,
+      "grad_norm": 0.12449389696121216,
+      "learning_rate": 2.933239400417519e-06,
+      "loss": 4.1045,
+      "step": 1035
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.34436,
+      "grad_norm": 0.12599188089370728,
+      "learning_rate": 2.9229249349905686e-06,
+      "loss": 4.1924,
+      "step": 1036
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8097,
+      "epoch": 1.34566,
+      "grad_norm": 0.1278277039527893,
+      "learning_rate": 2.9126211410592527e-06,
+      "loss": 4.5273,
+      "step": 1037
+    },
+    {
+      "batch_num_effect_tokens": 7967,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8126,
+      "epoch": 1.34696,
+      "grad_norm": 0.13065114617347717,
+      "learning_rate": 2.9023280715619005e-06,
+      "loss": 4.5469,
+      "step": 1038
+    },
+    {
+      "batch_num_effect_tokens": 8077,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.34826,
+      "grad_norm": 0.14062908291816711,
+      "learning_rate": 2.8920457793817507e-06,
+      "loss": 4.5078,
+      "step": 1039
+    },
+    {
+      "batch_num_effect_tokens": 7943,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8101,
+      "epoch": 1.34955,
+      "grad_norm": 0.13684336841106415,
+      "learning_rate": 2.881774317346664e-06,
+      "loss": 4.6104,
+      "step": 1040
+    },
+    {
+      "batch_num_effect_tokens": 7958,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8114,
+      "epoch": 1.35085,
+      "grad_norm": 0.12485930323600769,
+      "learning_rate": 2.871513738228861e-06,
+      "loss": 4.4727,
+      "step": 1041
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.35215,
+      "grad_norm": 0.13105593621730804,
+      "learning_rate": 2.861264094744647e-06,
+      "loss": 4.6553,
+      "step": 1042
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.35345,
+      "grad_norm": 0.14259664714336395,
+      "learning_rate": 2.851025439554142e-06,
+      "loss": 4.5693,
+      "step": 1043
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 1.35474,
+      "grad_norm": 0.12127009779214859,
+      "learning_rate": 2.840797825261017e-06,
+      "loss": 4.0859,
+      "step": 1044
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.35604,
+      "grad_norm": 0.13202065229415894,
+      "learning_rate": 2.83058130441221e-06,
+      "loss": 4.2617,
+      "step": 1045
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.35734,
+      "grad_norm": 0.13531279563903809,
+      "learning_rate": 2.8203759294976687e-06,
+      "loss": 4.2031,
+      "step": 1046
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8182,
+      "epoch": 1.35864,
+      "grad_norm": 0.13501350581645966,
+      "learning_rate": 2.810181752950072e-06,
+      "loss": 4.1162,
+      "step": 1047
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.35994,
+      "grad_norm": 0.1449299156665802,
+      "learning_rate": 2.7999988271445643e-06,
+      "loss": 4.2139,
+      "step": 1048
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8101,
+      "epoch": 1.36123,
+      "grad_norm": 0.15028077363967896,
+      "learning_rate": 2.7898272043984947e-06,
+      "loss": 4.8975,
+      "step": 1049
+    },
+    {
+      "batch_num_effect_tokens": 7871,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8074,
+      "epoch": 1.36253,
+      "grad_norm": 0.1480277180671692,
+      "learning_rate": 2.7796669369711294e-06,
+      "loss": 4.4062,
+      "step": 1050
+    },
+    {
+      "batch_num_effect_tokens": 7916,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8126,
+      "epoch": 1.36383,
+      "grad_norm": 0.14486616849899292,
+      "learning_rate": 2.7695180770633993e-06,
+      "loss": 4.582,
+      "step": 1051
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.36513,
+      "grad_norm": 0.13963328301906586,
+      "learning_rate": 2.7593806768176244e-06,
+      "loss": 4.0654,
+      "step": 1052
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8078,
+      "epoch": 1.36642,
+      "grad_norm": 0.1411132663488388,
+      "learning_rate": 2.7492547883172473e-06,
+      "loss": 4.2812,
+      "step": 1053
+    },
+    {
+      "batch_num_effect_tokens": 7897,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8080,
+      "epoch": 1.36772,
+      "grad_norm": 0.14485670626163483,
+      "learning_rate": 2.7391404635865725e-06,
+      "loss": 4.6309,
+      "step": 1054
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.36902,
+      "grad_norm": 0.13264106214046478,
+      "learning_rate": 2.7290377545904823e-06,
+      "loss": 4.4268,
+      "step": 1055
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.37032,
+      "grad_norm": 0.12862573564052582,
+      "learning_rate": 2.718946713234185e-06,
+      "loss": 4.1699,
+      "step": 1056
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.37161,
+      "grad_norm": 0.12804825603961945,
+      "learning_rate": 2.708867391362948e-06,
+      "loss": 4.5527,
+      "step": 1057
+    },
+    {
+      "batch_num_effect_tokens": 7860,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8062,
+      "epoch": 1.37291,
+      "grad_norm": 0.14360636472702026,
+      "learning_rate": 2.6987998407618216e-06,
+      "loss": 4.084,
+      "step": 1058
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.37421,
+      "grad_norm": 0.13846245408058167,
+      "learning_rate": 2.688744113155378e-06,
+      "loss": 4.0527,
+      "step": 1059
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.37551,
+      "grad_norm": 0.1346784085035324,
+      "learning_rate": 2.678700260207449e-06,
+      "loss": 4.7793,
+      "step": 1060
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 1.3768,
+      "grad_norm": 0.14536811411380768,
+      "learning_rate": 2.6686683335208526e-06,
+      "loss": 4.9219,
+      "step": 1061
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.3781,
+      "grad_norm": 0.13124246895313263,
+      "learning_rate": 2.65864838463714e-06,
+      "loss": 4.498,
+      "step": 1062
+    },
+    {
+      "batch_num_effect_tokens": 7892,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8091,
+      "epoch": 1.3794,
+      "grad_norm": 0.12533849477767944,
+      "learning_rate": 2.648640465036316e-06,
+      "loss": 4.3086,
+      "step": 1063
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.3807,
+      "grad_norm": 0.13352860510349274,
+      "learning_rate": 2.6386446261365874e-06,
+      "loss": 4.4902,
+      "step": 1064
+    },
+    {
+      "batch_num_effect_tokens": 7728,
+      "batch_num_samples": 33,
+      "batch_num_tokens": 7999,
+      "epoch": 1.382,
+      "grad_norm": 0.141183003783226,
+      "learning_rate": 2.6286609192940887e-06,
+      "loss": 4.1797,
+      "step": 1065
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.38329,
+      "grad_norm": 0.12944729626178741,
+      "learning_rate": 2.6186893958026245e-06,
+      "loss": 4.6572,
+      "step": 1066
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.38459,
+      "grad_norm": 0.1253744512796402,
+      "learning_rate": 2.608730106893411e-06,
+      "loss": 4.2881,
+      "step": 1067
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.38589,
+      "grad_norm": 0.14108753204345703,
+      "learning_rate": 2.5987831037347933e-06,
+      "loss": 4.2412,
+      "step": 1068
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.38719,
+      "grad_norm": 0.12252593785524368,
+      "learning_rate": 2.5888484374320033e-06,
+      "loss": 4.2715,
+      "step": 1069
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.38848,
+      "grad_norm": 0.1265387237071991,
+      "learning_rate": 2.578926159026891e-06,
+      "loss": 4.2881,
+      "step": 1070
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.38978,
+      "grad_norm": 0.13131491839885712,
+      "learning_rate": 2.5690163194976576e-06,
+      "loss": 4.3721,
+      "step": 1071
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.39108,
+      "grad_norm": 0.13168731331825256,
+      "learning_rate": 2.559118969758595e-06,
+      "loss": 4.291,
+      "step": 1072
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8171,
+      "epoch": 1.39238,
+      "grad_norm": 0.1283009946346283,
+      "learning_rate": 2.549234160659827e-06,
+      "loss": 4.3809,
+      "step": 1073
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.39367,
+      "grad_norm": 0.13365747034549713,
+      "learning_rate": 2.539361942987046e-06,
+      "loss": 4.8643,
+      "step": 1074
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 1.39497,
+      "grad_norm": 0.13202007114887238,
+      "learning_rate": 2.5295023674612568e-06,
+      "loss": 4.3945,
+      "step": 1075
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8156,
+      "epoch": 1.39627,
+      "grad_norm": 0.13004064559936523,
+      "learning_rate": 2.519655484738507e-06,
+      "loss": 4.4951,
+      "step": 1076
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.39757,
+      "grad_norm": 0.1316070407629013,
+      "learning_rate": 2.509821345409633e-06,
+      "loss": 4.7695,
+      "step": 1077
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.39886,
+      "grad_norm": 0.12465286254882812,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 4.4688,
+      "step": 1078
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.39886,
+      "eval_eval_loss": 0.5671281218528748,
+      "eval_eval_runtime": 115.3241,
+      "eval_eval_samples_per_second": 43.356,
+      "eval_eval_steps_per_second": 2.714,
+      "step": 1078
+    },
+    {
+      "batch_num_effect_tokens": 7894,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.40016,
+      "grad_norm": 0.1325971782207489,
+      "learning_rate": 2.4901914989692405e-06,
+      "loss": 4.2979,
+      "step": 1079
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.40146,
+      "grad_norm": 0.12942376732826233,
+      "learning_rate": 2.480395892710997e-06,
+      "loss": 4.3652,
+      "step": 1080
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 1.40276,
+      "grad_norm": 0.12413739413022995,
+      "learning_rate": 2.470613231552661e-06,
+      "loss": 4.5146,
+      "step": 1081
+    },
+    {
+      "batch_num_effect_tokens": 7966,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 1.40406,
+      "grad_norm": 0.129132479429245,
+      "learning_rate": 2.46084356575511e-06,
+      "loss": 3.9941,
+      "step": 1082
+    },
+    {
+      "batch_num_effect_tokens": 7873,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8106,
+      "epoch": 1.40535,
+      "grad_norm": 0.126277893781662,
+      "learning_rate": 2.451086945512465e-06,
+      "loss": 4.3438,
+      "step": 1083
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.40665,
+      "grad_norm": 0.1285809427499771,
+      "learning_rate": 2.4413434209518137e-06,
+      "loss": 4.2676,
+      "step": 1084
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 1.40795,
+      "grad_norm": 0.1211712658405304,
+      "learning_rate": 2.4316130421329696e-06,
+      "loss": 4.6133,
+      "step": 1085
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8177,
+      "epoch": 1.40925,
+      "grad_norm": 0.12398537993431091,
+      "learning_rate": 2.421895859048196e-06,
+      "loss": 4.0205,
+      "step": 1086
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 1.41054,
+      "grad_norm": 0.12769639492034912,
+      "learning_rate": 2.4121919216219646e-06,
+      "loss": 4.2617,
+      "step": 1087
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.41184,
+      "grad_norm": 0.1281779408454895,
+      "learning_rate": 2.4025012797107e-06,
+      "loss": 4.3457,
+      "step": 1088
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.41314,
+      "grad_norm": 0.1288890838623047,
+      "learning_rate": 2.39282398310251e-06,
+      "loss": 4.0811,
+      "step": 1089
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.41444,
+      "grad_norm": 0.1370052844285965,
+      "learning_rate": 2.383160081516941e-06,
+      "loss": 4.4746,
+      "step": 1090
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.41573,
+      "grad_norm": 0.11726196855306625,
+      "learning_rate": 2.373509624604717e-06,
+      "loss": 4.4619,
+      "step": 1091
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8158,
+      "epoch": 1.41703,
+      "grad_norm": 0.12008494138717651,
+      "learning_rate": 2.363872661947488e-06,
+      "loss": 4.2627,
+      "step": 1092
+    },
+    {
+      "batch_num_effect_tokens": 7961,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8137,
+      "epoch": 1.41833,
+      "grad_norm": 0.12784817814826965,
+      "learning_rate": 2.3542492430575752e-06,
+      "loss": 4.4648,
+      "step": 1093
+    },
+    {
+      "batch_num_effect_tokens": 7973,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.41963,
+      "grad_norm": 0.14390228688716888,
+      "learning_rate": 2.344639417377714e-06,
+      "loss": 4.4814,
+      "step": 1094
+    },
+    {
+      "batch_num_effect_tokens": 7863,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8023,
+      "epoch": 1.42092,
+      "grad_norm": 0.13291668891906738,
+      "learning_rate": 2.3350432342808003e-06,
+      "loss": 4.4844,
+      "step": 1095
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8188,
+      "epoch": 1.42222,
+      "grad_norm": 0.13262400031089783,
+      "learning_rate": 2.3254607430696393e-06,
+      "loss": 4.3721,
+      "step": 1096
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.42352,
+      "grad_norm": 0.12368138134479523,
+      "learning_rate": 2.315891992976687e-06,
+      "loss": 4.29,
+      "step": 1097
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8173,
+      "epoch": 1.42482,
+      "grad_norm": 0.13449449837207794,
+      "learning_rate": 2.3063370331638084e-06,
+      "loss": 4.1885,
+      "step": 1098
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8114,
+      "epoch": 1.42612,
+      "grad_norm": 0.12767821550369263,
+      "learning_rate": 2.296795912722014e-06,
+      "loss": 4.3281,
+      "step": 1099
+    },
+    {
+      "batch_num_effect_tokens": 7935,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8104,
+      "epoch": 1.42741,
+      "grad_norm": 0.13633497059345245,
+      "learning_rate": 2.2872686806712037e-06,
+      "loss": 4.5225,
+      "step": 1100
+    },
+    {
+      "batch_num_effect_tokens": 7834,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8072,
+      "epoch": 1.42871,
+      "grad_norm": 0.13452018797397614,
+      "learning_rate": 2.277755385959934e-06,
+      "loss": 4.4902,
+      "step": 1101
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.43001,
+      "grad_norm": 0.1304135024547577,
+      "learning_rate": 2.2682560774651458e-06,
+      "loss": 4.3936,
+      "step": 1102
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.43131,
+      "grad_norm": 0.1268252283334732,
+      "learning_rate": 2.258770803991932e-06,
+      "loss": 4.2588,
+      "step": 1103
+    },
+    {
+      "batch_num_effect_tokens": 7986,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8188,
+      "epoch": 1.4326,
+      "grad_norm": 0.1350371092557907,
+      "learning_rate": 2.249299614273266e-06,
+      "loss": 4.2285,
+      "step": 1104
+    },
+    {
+      "batch_num_effect_tokens": 7901,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8074,
+      "epoch": 1.4339,
+      "grad_norm": 0.12892520427703857,
+      "learning_rate": 2.2398425569697667e-06,
+      "loss": 4.3389,
+      "step": 1105
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.4352,
+      "grad_norm": 0.11750482767820358,
+      "learning_rate": 2.230399680669449e-06,
+      "loss": 4.2197,
+      "step": 1106
+    },
+    {
+      "batch_num_effect_tokens": 7980,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 1.4365,
+      "grad_norm": 0.12220905721187592,
+      "learning_rate": 2.220971033887463e-06,
+      "loss": 4.3984,
+      "step": 1107
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.43779,
+      "grad_norm": 0.12369371205568314,
+      "learning_rate": 2.211556665065854e-06,
+      "loss": 4.5498,
+      "step": 1108
+    },
+    {
+      "batch_num_effect_tokens": 7932,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8076,
+      "epoch": 1.43909,
+      "grad_norm": 0.13513018190860748,
+      "learning_rate": 2.2021566225733094e-06,
+      "loss": 4.3604,
+      "step": 1109
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.44039,
+      "grad_norm": 0.12401560693979263,
+      "learning_rate": 2.1927709547049096e-06,
+      "loss": 4.2617,
+      "step": 1110
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8086,
+      "epoch": 1.44169,
+      "grad_norm": 0.12758344411849976,
+      "learning_rate": 2.1833997096818897e-06,
+      "loss": 4.7422,
+      "step": 1111
+    },
+    {
+      "batch_num_effect_tokens": 7989,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8191,
+      "epoch": 1.44298,
+      "grad_norm": 0.12552611529827118,
+      "learning_rate": 2.174042935651377e-06,
+      "loss": 4.3584,
+      "step": 1112
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.44428,
+      "grad_norm": 0.12584245204925537,
+      "learning_rate": 2.1647006806861472e-06,
+      "loss": 4.2412,
+      "step": 1113
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.44558,
+      "grad_norm": 0.13394629955291748,
+      "learning_rate": 2.1553729927843894e-06,
+      "loss": 4.0078,
+      "step": 1114
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8186,
+      "epoch": 1.44688,
+      "grad_norm": 0.12964493036270142,
+      "learning_rate": 2.146059919869444e-06,
+      "loss": 4.3604,
+      "step": 1115
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8173,
+      "epoch": 1.44818,
+      "grad_norm": 0.12681354582309723,
+      "learning_rate": 2.1367615097895707e-06,
+      "loss": 3.9531,
+      "step": 1116
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.44947,
+      "grad_norm": 0.13806065917015076,
+      "learning_rate": 2.1274778103176854e-06,
+      "loss": 4.4346,
+      "step": 1117
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8156,
+      "epoch": 1.45077,
+      "grad_norm": 0.14222683012485504,
+      "learning_rate": 2.1182088691511287e-06,
+      "loss": 4.541,
+      "step": 1118
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8142,
+      "epoch": 1.45207,
+      "grad_norm": 0.14069655537605286,
+      "learning_rate": 2.1089547339114215e-06,
+      "loss": 4.3574,
+      "step": 1119
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8164,
+      "epoch": 1.45337,
+      "grad_norm": 0.13768544793128967,
+      "learning_rate": 2.09971545214401e-06,
+      "loss": 4.7539,
+      "step": 1120
+    },
+    {
+      "batch_num_effect_tokens": 7943,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8101,
+      "epoch": 1.45466,
+      "grad_norm": 0.13390763103961945,
+      "learning_rate": 2.0904910713180275e-06,
+      "loss": 4.2021,
+      "step": 1121
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.45596,
+      "grad_norm": 0.1425563544034958,
+      "learning_rate": 2.081281638826052e-06,
+      "loss": 4.1191,
+      "step": 1122
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.45726,
+      "grad_norm": 0.13213439285755157,
+      "learning_rate": 2.072087201983857e-06,
+      "loss": 4.6445,
+      "step": 1123
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.45856,
+      "grad_norm": 0.13721303641796112,
+      "learning_rate": 2.0629078080301782e-06,
+      "loss": 4.1318,
+      "step": 1124
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8188,
+      "epoch": 1.45985,
+      "grad_norm": 0.1321757435798645,
+      "learning_rate": 2.0537435041264597e-06,
+      "loss": 4.1426,
+      "step": 1125
+    },
+    {
+      "batch_num_effect_tokens": 7970,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8126,
+      "epoch": 1.46115,
+      "grad_norm": 0.13200123608112335,
+      "learning_rate": 2.0445943373566178e-06,
+      "loss": 4.5459,
+      "step": 1126
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.46245,
+      "grad_norm": 0.1399904489517212,
+      "learning_rate": 2.0354603547267985e-06,
+      "loss": 4.2383,
+      "step": 1127
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.46375,
+      "grad_norm": 0.14394307136535645,
+      "learning_rate": 2.0263416031651335e-06,
+      "loss": 5.0156,
+      "step": 1128
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.46504,
+      "grad_norm": 0.12676626443862915,
+      "learning_rate": 2.017238129521506e-06,
+      "loss": 4.1953,
+      "step": 1129
+    },
+    {
+      "batch_num_effect_tokens": 7999,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.46634,
+      "grad_norm": 0.12323788553476334,
+      "learning_rate": 2.0081499805673015e-06,
+      "loss": 4.3027,
+      "step": 1130
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8170,
+      "epoch": 1.46764,
+      "grad_norm": 0.12607906758785248,
+      "learning_rate": 1.9990772029951665e-06,
+      "loss": 3.9326,
+      "step": 1131
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.46894,
+      "grad_norm": 0.12812362611293793,
+      "learning_rate": 1.9900198434187838e-06,
+      "loss": 4.7461,
+      "step": 1132
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.47024,
+      "grad_norm": 0.13349904119968414,
+      "learning_rate": 1.980977948372612e-06,
+      "loss": 4.3418,
+      "step": 1133
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.47153,
+      "grad_norm": 0.12421982735395432,
+      "learning_rate": 1.971951564311668e-06,
+      "loss": 4.1846,
+      "step": 1134
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.47283,
+      "grad_norm": 0.1404682993888855,
+      "learning_rate": 1.962940737611264e-06,
+      "loss": 4.3447,
+      "step": 1135
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 1.47413,
+      "grad_norm": 0.1217743456363678,
+      "learning_rate": 1.953945514566789e-06,
+      "loss": 3.9229,
+      "step": 1136
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.47543,
+      "grad_norm": 0.13491998612880707,
+      "learning_rate": 1.9449659413934684e-06,
+      "loss": 4.1543,
+      "step": 1137
+    },
+    {
+      "batch_num_effect_tokens": 8078,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.47672,
+      "grad_norm": 0.12323690205812454,
+      "learning_rate": 1.9360020642261155e-06,
+      "loss": 4.2627,
+      "step": 1138
+    },
+    {
+      "batch_num_effect_tokens": 7875,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8108,
+      "epoch": 1.47802,
+      "grad_norm": 0.13586825132369995,
+      "learning_rate": 1.9270539291189054e-06,
+      "loss": 4.5137,
+      "step": 1139
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.47932,
+      "grad_norm": 0.12245763093233109,
+      "learning_rate": 1.918121582045132e-06,
+      "loss": 4.5752,
+      "step": 1140
+    },
+    {
+      "batch_num_effect_tokens": 7842,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8048,
+      "epoch": 1.48062,
+      "grad_norm": 0.12825018167495728,
+      "learning_rate": 1.9092050688969736e-06,
+      "loss": 4.5674,
+      "step": 1141
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.48191,
+      "grad_norm": 0.12709911167621613,
+      "learning_rate": 1.9003044354852634e-06,
+      "loss": 4.3623,
+      "step": 1142
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.48321,
+      "grad_norm": 0.11626652628183365,
+      "learning_rate": 1.8914197275392444e-06,
+      "loss": 4.0283,
+      "step": 1143
+    },
+    {
+      "batch_num_effect_tokens": 7904,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8088,
+      "epoch": 1.48451,
+      "grad_norm": 0.12491626292467117,
+      "learning_rate": 1.8825509907063328e-06,
+      "loss": 4.376,
+      "step": 1144
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8188,
+      "epoch": 1.48581,
+      "grad_norm": 0.1228693351149559,
+      "learning_rate": 1.8736982705519013e-06,
+      "loss": 4.1221,
+      "step": 1145
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.4871,
+      "grad_norm": 0.13609446585178375,
+      "learning_rate": 1.8648616125590218e-06,
+      "loss": 3.9092,
+      "step": 1146
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.4884,
+      "grad_norm": 0.13068844377994537,
+      "learning_rate": 1.8560410621282543e-06,
+      "loss": 4.2646,
+      "step": 1147
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8086,
+      "epoch": 1.4897,
+      "grad_norm": 0.12820830941200256,
+      "learning_rate": 1.8472366645773892e-06,
+      "loss": 4.3457,
+      "step": 1148
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8188,
+      "epoch": 1.491,
+      "grad_norm": 0.12200096249580383,
+      "learning_rate": 1.8384484651412338e-06,
+      "loss": 4.3672,
+      "step": 1149
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.4923,
+      "grad_norm": 0.13038086891174316,
+      "learning_rate": 1.829676508971377e-06,
+      "loss": 4.2734,
+      "step": 1150
+    },
+    {
+      "batch_num_effect_tokens": 7874,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8091,
+      "epoch": 1.49359,
+      "grad_norm": 0.11553595215082169,
+      "learning_rate": 1.8209208411359485e-06,
+      "loss": 4.3574,
+      "step": 1151
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.49489,
+      "grad_norm": 0.12026538699865341,
+      "learning_rate": 1.8121815066193944e-06,
+      "loss": 4.4014,
+      "step": 1152
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.49619,
+      "grad_norm": 0.12895485758781433,
+      "learning_rate": 1.8034585503222441e-06,
+      "loss": 4.7461,
+      "step": 1153
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.49749,
+      "grad_norm": 0.1211390346288681,
+      "learning_rate": 1.7947520170608774e-06,
+      "loss": 4.3555,
+      "step": 1154
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8182,
+      "epoch": 1.49878,
+      "grad_norm": 0.1284831017255783,
+      "learning_rate": 1.7860619515673034e-06,
+      "loss": 4.2139,
+      "step": 1155
+    },
+    {
+      "batch_num_effect_tokens": 7974,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8098,
+      "epoch": 1.50008,
+      "grad_norm": 0.12839344143867493,
+      "learning_rate": 1.7773883984889178e-06,
+      "loss": 4.4795,
+      "step": 1156
+    },
+    {
+      "batch_num_effect_tokens": 7808,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8052,
+      "epoch": 1.50138,
+      "grad_norm": 0.13333484530448914,
+      "learning_rate": 1.7687314023882806e-06,
+      "loss": 4.5332,
+      "step": 1157
+    },
+    {
+      "batch_num_effect_tokens": 7969,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8156,
+      "epoch": 1.50268,
+      "grad_norm": 0.1345791220664978,
+      "learning_rate": 1.760091007742888e-06,
+      "loss": 4.5273,
+      "step": 1158
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.50397,
+      "grad_norm": 0.12442679703235626,
+      "learning_rate": 1.7514672589449378e-06,
+      "loss": 4.248,
+      "step": 1159
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 1.50527,
+      "grad_norm": 0.1204453706741333,
+      "learning_rate": 1.7428602003011136e-06,
+      "loss": 4.6221,
+      "step": 1160
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.50657,
+      "grad_norm": 0.13553136587142944,
+      "learning_rate": 1.734269876032344e-06,
+      "loss": 4.1123,
+      "step": 1161
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.50787,
+      "grad_norm": 0.13123448193073273,
+      "learning_rate": 1.7256963302735752e-06,
+      "loss": 4.3574,
+      "step": 1162
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8173,
+      "epoch": 1.50916,
+      "grad_norm": 0.12561143934726715,
+      "learning_rate": 1.7171396070735602e-06,
+      "loss": 4.0078,
+      "step": 1163
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.51046,
+      "grad_norm": 0.13120131194591522,
+      "learning_rate": 1.7085997503946144e-06,
+      "loss": 4.377,
+      "step": 1164
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.51176,
+      "grad_norm": 0.1411786675453186,
+      "learning_rate": 1.7000768041124038e-06,
+      "loss": 4.5732,
+      "step": 1165
+    },
+    {
+      "batch_num_effect_tokens": 7977,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8152,
+      "epoch": 1.51306,
+      "grad_norm": 0.12541553378105164,
+      "learning_rate": 1.6915708120157042e-06,
+      "loss": 4.3467,
+      "step": 1166
+    },
+    {
+      "batch_num_effect_tokens": 7889,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8163,
+      "epoch": 1.51436,
+      "grad_norm": 0.12473280727863312,
+      "learning_rate": 1.6830818178061897e-06,
+      "loss": 4.1572,
+      "step": 1167
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.51565,
+      "grad_norm": 0.14233461022377014,
+      "learning_rate": 1.6746098650982072e-06,
+      "loss": 4.6309,
+      "step": 1168
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.51695,
+      "grad_norm": 0.12509174644947052,
+      "learning_rate": 1.6661549974185426e-06,
+      "loss": 4.5986,
+      "step": 1169
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.51825,
+      "grad_norm": 0.13657966256141663,
+      "learning_rate": 1.657717258206205e-06,
+      "loss": 4.9082,
+      "step": 1170
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.51955,
+      "grad_norm": 0.12476794421672821,
+      "learning_rate": 1.6492966908122033e-06,
+      "loss": 4.1396,
+      "step": 1171
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8139,
+      "epoch": 1.52084,
+      "grad_norm": 0.12963926792144775,
+      "learning_rate": 1.6408933384993187e-06,
+      "loss": 4.4238,
+      "step": 1172
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.52214,
+      "grad_norm": 0.12490490823984146,
+      "learning_rate": 1.63250724444189e-06,
+      "loss": 4.6895,
+      "step": 1173
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8121,
+      "epoch": 1.52344,
+      "grad_norm": 0.1316630095243454,
+      "learning_rate": 1.6241384517255854e-06,
+      "loss": 4.5205,
+      "step": 1174
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8128,
+      "epoch": 1.52474,
+      "grad_norm": 0.1253131479024887,
+      "learning_rate": 1.6157870033471785e-06,
+      "loss": 4.4424,
+      "step": 1175
+    },
+    {
+      "batch_num_effect_tokens": 7782,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8010,
+      "epoch": 1.52603,
+      "grad_norm": 0.12522853910923004,
+      "learning_rate": 1.6074529422143398e-06,
+      "loss": 4.3564,
+      "step": 1176
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.52733,
+      "grad_norm": 0.12250207364559174,
+      "learning_rate": 1.5991363111454023e-06,
+      "loss": 4.7217,
+      "step": 1177
+    },
+    {
+      "batch_num_effect_tokens": 8074,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.52863,
+      "grad_norm": 0.12559637427330017,
+      "learning_rate": 1.5908371528691553e-06,
+      "loss": 4.4688,
+      "step": 1178
+    },
+    {
+      "batch_num_effect_tokens": 7855,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8023,
+      "epoch": 1.52993,
+      "grad_norm": 0.13404156267642975,
+      "learning_rate": 1.5825555100246066e-06,
+      "loss": 4.5176,
+      "step": 1179
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.53122,
+      "grad_norm": 0.12473981827497482,
+      "learning_rate": 1.5742914251607794e-06,
+      "loss": 4.2764,
+      "step": 1180
+    },
+    {
+      "batch_num_effect_tokens": 7951,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8092,
+      "epoch": 1.53252,
+      "grad_norm": 0.1322951763868332,
+      "learning_rate": 1.5660449407364919e-06,
+      "loss": 4.5439,
+      "step": 1181
+    },
+    {
+      "batch_num_effect_tokens": 7907,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8128,
+      "epoch": 1.53382,
+      "grad_norm": 0.13409925997257233,
+      "learning_rate": 1.5578160991201313e-06,
+      "loss": 4.3096,
+      "step": 1182
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.53512,
+      "grad_norm": 0.11892995983362198,
+      "learning_rate": 1.549604942589441e-06,
+      "loss": 4.3701,
+      "step": 1183
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.53642,
+      "grad_norm": 0.13523681461811066,
+      "learning_rate": 1.5414115133313029e-06,
+      "loss": 4.2051,
+      "step": 1184
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 1.53771,
+      "grad_norm": 0.12341441959142685,
+      "learning_rate": 1.5332358534415192e-06,
+      "loss": 4.4072,
+      "step": 1185
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.53901,
+      "grad_norm": 0.12092790752649307,
+      "learning_rate": 1.5250780049246028e-06,
+      "loss": 4.498,
+      "step": 1186
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.54031,
+      "grad_norm": 0.12938234210014343,
+      "learning_rate": 1.516938009693551e-06,
+      "loss": 4.6123,
+      "step": 1187
+    },
+    {
+      "batch_num_effect_tokens": 7905,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 1.54161,
+      "grad_norm": 0.1272503286600113,
+      "learning_rate": 1.5088159095696365e-06,
+      "loss": 4.2402,
+      "step": 1188
+    },
+    {
+      "batch_num_effect_tokens": 7971,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.5429,
+      "grad_norm": 0.14063680171966553,
+      "learning_rate": 1.500711746282192e-06,
+      "loss": 4.2334,
+      "step": 1189
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.5442,
+      "grad_norm": 0.11090775579214096,
+      "learning_rate": 1.4926255614683931e-06,
+      "loss": 4.2617,
+      "step": 1190
+    },
+    {
+      "batch_num_effect_tokens": 7936,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8162,
+      "epoch": 1.5455,
+      "grad_norm": 0.12592093646526337,
+      "learning_rate": 1.484557396673052e-06,
+      "loss": 4.1523,
+      "step": 1191
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.5468,
+      "grad_norm": 0.12234609574079514,
+      "learning_rate": 1.4765072933483949e-06,
+      "loss": 4.2314,
+      "step": 1192
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8168,
+      "epoch": 1.54809,
+      "grad_norm": 0.11718329787254333,
+      "learning_rate": 1.468475292853847e-06,
+      "loss": 4.0645,
+      "step": 1193
+    },
+    {
+      "batch_num_effect_tokens": 7944,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8133,
+      "epoch": 1.54939,
+      "grad_norm": 0.13274738192558289,
+      "learning_rate": 1.4604614364558372e-06,
+      "loss": 4.4199,
+      "step": 1194
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8166,
+      "epoch": 1.55069,
+      "grad_norm": 0.12475960701704025,
+      "learning_rate": 1.4524657653275653e-06,
+      "loss": 4.043,
+      "step": 1195
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.55199,
+      "grad_norm": 0.12034016847610474,
+      "learning_rate": 1.444488320548807e-06,
+      "loss": 4.0186,
+      "step": 1196
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.55328,
+      "grad_norm": 0.12400320917367935,
+      "learning_rate": 1.4365291431056871e-06,
+      "loss": 4.248,
+      "step": 1197
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8173,
+      "epoch": 1.55458,
+      "grad_norm": 0.12639890611171722,
+      "learning_rate": 1.4285882738904822e-06,
+      "loss": 4.6426,
+      "step": 1198
+    },
+    {
+      "batch_num_effect_tokens": 7946,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8123,
+      "epoch": 1.55588,
+      "grad_norm": 0.12510213255882263,
+      "learning_rate": 1.4206657537014078e-06,
+      "loss": 4.248,
+      "step": 1199
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.55718,
+      "grad_norm": 0.11327057331800461,
+      "learning_rate": 1.4127616232424042e-06,
+      "loss": 3.8843,
+      "step": 1200
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.55848,
+      "grad_norm": 0.13065361976623535,
+      "learning_rate": 1.404875923122928e-06,
+      "loss": 4.3848,
+      "step": 1201
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8170,
+      "epoch": 1.55977,
+      "grad_norm": 0.1324455589056015,
+      "learning_rate": 1.3970086938577492e-06,
+      "loss": 4.3975,
+      "step": 1202
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.56107,
+      "grad_norm": 0.12078768759965897,
+      "learning_rate": 1.389159975866734e-06,
+      "loss": 4.0654,
+      "step": 1203
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.56237,
+      "grad_norm": 0.1465671807527542,
+      "learning_rate": 1.3813298094746491e-06,
+      "loss": 4.2598,
+      "step": 1204
+    },
+    {
+      "batch_num_effect_tokens": 7993,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.56367,
+      "grad_norm": 0.13225287199020386,
+      "learning_rate": 1.3735182349109428e-06,
+      "loss": 4.4785,
+      "step": 1205
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.56496,
+      "grad_norm": 0.1285356879234314,
+      "learning_rate": 1.3657252923095437e-06,
+      "loss": 4.54,
+      "step": 1206
+    },
+    {
+      "batch_num_effect_tokens": 7879,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8090,
+      "epoch": 1.56626,
+      "grad_norm": 0.13167142868041992,
+      "learning_rate": 1.357951021708655e-06,
+      "loss": 4.0176,
+      "step": 1207
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.56756,
+      "grad_norm": 0.13281382620334625,
+      "learning_rate": 1.3501954630505464e-06,
+      "loss": 4.2588,
+      "step": 1208
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8079,
+      "epoch": 1.56886,
+      "grad_norm": 0.12919877469539642,
+      "learning_rate": 1.342458656181354e-06,
+      "loss": 4.5977,
+      "step": 1209
+    },
+    {
+      "batch_num_effect_tokens": 8021,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.57015,
+      "grad_norm": 0.13256850838661194,
+      "learning_rate": 1.3347406408508695e-06,
+      "loss": 4.1484,
+      "step": 1210
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.57145,
+      "grad_norm": 0.13196738064289093,
+      "learning_rate": 1.3270414567123342e-06,
+      "loss": 4.1807,
+      "step": 1211
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.57275,
+      "grad_norm": 0.1331268548965454,
+      "learning_rate": 1.3193611433222465e-06,
+      "loss": 4.4814,
+      "step": 1212
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8182,
+      "epoch": 1.57405,
+      "grad_norm": 0.1267508864402771,
+      "learning_rate": 1.311699740140146e-06,
+      "loss": 4.3477,
+      "step": 1213
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.57534,
+      "grad_norm": 0.1376553326845169,
+      "learning_rate": 1.3040572865284234e-06,
+      "loss": 4.5098,
+      "step": 1214
+    },
+    {
+      "batch_num_effect_tokens": 7972,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8100,
+      "epoch": 1.57664,
+      "grad_norm": 0.13548849523067474,
+      "learning_rate": 1.2964338217521021e-06,
+      "loss": 4.3359,
+      "step": 1215
+    },
+    {
+      "batch_num_effect_tokens": 8069,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.57794,
+      "grad_norm": 0.13228566944599152,
+      "learning_rate": 1.2888293849786503e-06,
+      "loss": 4.4883,
+      "step": 1216
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.57924,
+      "grad_norm": 0.1392965018749237,
+      "learning_rate": 1.2812440152777773e-06,
+      "loss": 4.5889,
+      "step": 1217
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.58054,
+      "grad_norm": 0.12365755438804626,
+      "learning_rate": 1.2736777516212267e-06,
+      "loss": 4.541,
+      "step": 1218
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.58183,
+      "grad_norm": 0.12618915736675262,
+      "learning_rate": 1.2661306328825818e-06,
+      "loss": 4.1309,
+      "step": 1219
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8163,
+      "epoch": 1.58313,
+      "grad_norm": 0.12724487483501434,
+      "learning_rate": 1.258602697837063e-06,
+      "loss": 4.7012,
+      "step": 1220
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 1.58443,
+      "grad_norm": 0.12507867813110352,
+      "learning_rate": 1.2510939851613285e-06,
+      "loss": 4.6133,
+      "step": 1221
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8190,
+      "epoch": 1.58573,
+      "grad_norm": 0.124232716858387,
+      "learning_rate": 1.2436045334332824e-06,
+      "loss": 4.6475,
+      "step": 1222
+    },
+    {
+      "batch_num_effect_tokens": 7957,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.58702,
+      "grad_norm": 0.13959231972694397,
+      "learning_rate": 1.2361343811318665e-06,
+      "loss": 4.6484,
+      "step": 1223
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.58832,
+      "grad_norm": 0.12538942694664001,
+      "learning_rate": 1.2286835666368623e-06,
+      "loss": 4.4883,
+      "step": 1224
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.58962,
+      "grad_norm": 0.12452027201652527,
+      "learning_rate": 1.2212521282287093e-06,
+      "loss": 4.0713,
+      "step": 1225
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.59092,
+      "grad_norm": 0.13691964745521545,
+      "learning_rate": 1.2138401040882874e-06,
+      "loss": 4.5684,
+      "step": 1226
+    },
+    {
+      "batch_num_effect_tokens": 7894,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8096,
+      "epoch": 1.59221,
+      "grad_norm": 0.12278582155704498,
+      "learning_rate": 1.20644753229674e-06,
+      "loss": 4.1816,
+      "step": 1227
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.59351,
+      "grad_norm": 0.12262444198131561,
+      "learning_rate": 1.1990744508352604e-06,
+      "loss": 4.0322,
+      "step": 1228
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8106,
+      "epoch": 1.59481,
+      "grad_norm": 0.13073332607746124,
+      "learning_rate": 1.191720897584908e-06,
+      "loss": 4.5508,
+      "step": 1229
+    },
+    {
+      "batch_num_effect_tokens": 7928,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8113,
+      "epoch": 1.59611,
+      "grad_norm": 0.13038553297519684,
+      "learning_rate": 1.1843869103264173e-06,
+      "loss": 4.6455,
+      "step": 1230
+    },
+    {
+      "batch_num_effect_tokens": 8026,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8187,
+      "epoch": 1.5974,
+      "grad_norm": 0.11809444427490234,
+      "learning_rate": 1.1770725267399892e-06,
+      "loss": 4.1328,
+      "step": 1231
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8189,
+      "epoch": 1.5987,
+      "grad_norm": 0.11818146705627441,
+      "learning_rate": 1.1697777844051105e-06,
+      "loss": 4.2168,
+      "step": 1232
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8189,
+      "epoch": 1.5987,
+      "eval_eval_loss": 0.5610187649726868,
+      "eval_eval_runtime": 115.0388,
+      "eval_eval_samples_per_second": 43.464,
+      "eval_eval_steps_per_second": 2.721,
+      "step": 1232
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.6,
+      "grad_norm": 0.12877851724624634,
+      "learning_rate": 1.1625027208003547e-06,
+      "loss": 4.4346,
+      "step": 1233
+    },
+    {
+      "batch_num_effect_tokens": 7934,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8104,
+      "epoch": 1.6013,
+      "grad_norm": 0.13281628489494324,
+      "learning_rate": 1.1552473733031893e-06,
+      "loss": 4.0088,
+      "step": 1234
+    },
+    {
+      "batch_num_effect_tokens": 7990,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8182,
+      "epoch": 1.6026,
+      "grad_norm": 0.133657768368721,
+      "learning_rate": 1.148011779189791e-06,
+      "loss": 4.5449,
+      "step": 1235
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.60389,
+      "grad_norm": 0.11349672079086304,
+      "learning_rate": 1.1407959756348424e-06,
+      "loss": 4.1943,
+      "step": 1236
+    },
+    {
+      "batch_num_effect_tokens": 7980,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.60519,
+      "grad_norm": 0.14057059586048126,
+      "learning_rate": 1.133599999711349e-06,
+      "loss": 4.209,
+      "step": 1237
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.60649,
+      "grad_norm": 0.13591386377811432,
+      "learning_rate": 1.1264238883904483e-06,
+      "loss": 4.25,
+      "step": 1238
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8096,
+      "epoch": 1.60779,
+      "grad_norm": 0.1403154879808426,
+      "learning_rate": 1.1192676785412154e-06,
+      "loss": 4.5283,
+      "step": 1239
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.60908,
+      "grad_norm": 0.12656299769878387,
+      "learning_rate": 1.112131406930481e-06,
+      "loss": 4.4707,
+      "step": 1240
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.61038,
+      "grad_norm": 0.1420130729675293,
+      "learning_rate": 1.1050151102226369e-06,
+      "loss": 4.4678,
+      "step": 1241
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.61168,
+      "grad_norm": 0.12833444774150848,
+      "learning_rate": 1.097918824979442e-06,
+      "loss": 4.2129,
+      "step": 1242
+    },
+    {
+      "batch_num_effect_tokens": 7962,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.61298,
+      "grad_norm": 0.13786719739437103,
+      "learning_rate": 1.0908425876598512e-06,
+      "loss": 4.3408,
+      "step": 1243
+    },
+    {
+      "batch_num_effect_tokens": 7948,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8116,
+      "epoch": 1.61427,
+      "grad_norm": 0.13015086948871613,
+      "learning_rate": 1.0837864346198117e-06,
+      "loss": 4.4014,
+      "step": 1244
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.61557,
+      "grad_norm": 0.1163618341088295,
+      "learning_rate": 1.0767504021120884e-06,
+      "loss": 3.9277,
+      "step": 1245
+    },
+    {
+      "batch_num_effect_tokens": 7856,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8040,
+      "epoch": 1.61687,
+      "grad_norm": 0.12296216189861298,
+      "learning_rate": 1.0697345262860638e-06,
+      "loss": 4.5225,
+      "step": 1246
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8136,
+      "epoch": 1.61817,
+      "grad_norm": 0.1348690241575241,
+      "learning_rate": 1.062738843187565e-06,
+      "loss": 4.3125,
+      "step": 1247
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.61946,
+      "grad_norm": 0.12651218473911285,
+      "learning_rate": 1.0557633887586765e-06,
+      "loss": 4.2842,
+      "step": 1248
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.62076,
+      "grad_norm": 0.12578874826431274,
+      "learning_rate": 1.0488081988375493e-06,
+      "loss": 4.0039,
+      "step": 1249
+    },
+    {
+      "batch_num_effect_tokens": 8064,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.62206,
+      "grad_norm": 0.1441895216703415,
+      "learning_rate": 1.04187330915822e-06,
+      "loss": 4.1748,
+      "step": 1250
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8146,
+      "epoch": 1.62336,
+      "grad_norm": 0.13080251216888428,
+      "learning_rate": 1.0349587553504298e-06,
+      "loss": 4.6592,
+      "step": 1251
+    },
+    {
+      "batch_num_effect_tokens": 7926,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 1.62466,
+      "grad_norm": 0.1321391612291336,
+      "learning_rate": 1.0280645729394368e-06,
+      "loss": 4.3711,
+      "step": 1252
+    },
+    {
+      "batch_num_effect_tokens": 7936,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8083,
+      "epoch": 1.62595,
+      "grad_norm": 0.14101214706897736,
+      "learning_rate": 1.0211907973458391e-06,
+      "loss": 4.248,
+      "step": 1253
+    },
+    {
+      "batch_num_effect_tokens": 8074,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.62725,
+      "grad_norm": 0.13918721675872803,
+      "learning_rate": 1.0143374638853892e-06,
+      "loss": 4.5293,
+      "step": 1254
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.62855,
+      "grad_norm": 0.1319819986820221,
+      "learning_rate": 1.0075046077688067e-06,
+      "loss": 4.3057,
+      "step": 1255
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8101,
+      "epoch": 1.62985,
+      "grad_norm": 0.11569049954414368,
+      "learning_rate": 1.0006922641016131e-06,
+      "loss": 4.0488,
+      "step": 1256
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.63114,
+      "grad_norm": 0.13643518090248108,
+      "learning_rate": 9.939004678839348e-07,
+      "loss": 4.96,
+      "step": 1257
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.63244,
+      "grad_norm": 0.1374160200357437,
+      "learning_rate": 9.871292540103377e-07,
+      "loss": 4.2441,
+      "step": 1258
+    },
+    {
+      "batch_num_effect_tokens": 8060,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.63374,
+      "grad_norm": 0.12853796780109406,
+      "learning_rate": 9.803786572696321e-07,
+      "loss": 4.377,
+      "step": 1259
+    },
+    {
+      "batch_num_effect_tokens": 7908,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8086,
+      "epoch": 1.63504,
+      "grad_norm": 0.14067308604717255,
+      "learning_rate": 9.73648712344707e-07,
+      "loss": 4.4121,
+      "step": 1260
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.63633,
+      "grad_norm": 0.1275281310081482,
+      "learning_rate": 9.6693945381235e-07,
+      "loss": 4.5293,
+      "step": 1261
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.63763,
+      "grad_norm": 0.12822289764881134,
+      "learning_rate": 9.602509161430628e-07,
+      "loss": 4.0166,
+      "step": 1262
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.63893,
+      "grad_norm": 0.13135388493537903,
+      "learning_rate": 9.53583133700891e-07,
+      "loss": 4.1982,
+      "step": 1263
+    },
+    {
+      "batch_num_effect_tokens": 7980,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.64023,
+      "grad_norm": 0.13726817071437836,
+      "learning_rate": 9.469361407432431e-07,
+      "loss": 4.4258,
+      "step": 1264
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.64152,
+      "grad_norm": 0.1434670388698578,
+      "learning_rate": 9.403099714207175e-07,
+      "loss": 4.5791,
+      "step": 1265
+    },
+    {
+      "batch_num_effect_tokens": 7957,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8110,
+      "epoch": 1.64282,
+      "grad_norm": 0.12421970069408417,
+      "learning_rate": 9.337046597769272e-07,
+      "loss": 4.0225,
+      "step": 1266
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 1.64412,
+      "grad_norm": 0.13223737478256226,
+      "learning_rate": 9.271202397483214e-07,
+      "loss": 3.9521,
+      "step": 1267
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.64542,
+      "grad_norm": 0.12750263512134552,
+      "learning_rate": 9.205567451640151e-07,
+      "loss": 4.0049,
+      "step": 1268
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.64672,
+      "grad_norm": 0.12239693850278854,
+      "learning_rate": 9.140142097456117e-07,
+      "loss": 4.2539,
+      "step": 1269
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8179,
+      "epoch": 1.64801,
+      "grad_norm": 0.1410730630159378,
+      "learning_rate": 9.074926671070322e-07,
+      "loss": 4.3662,
+      "step": 1270
+    },
+    {
+      "batch_num_effect_tokens": 7938,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8085,
+      "epoch": 1.64931,
+      "grad_norm": 0.12114191055297852,
+      "learning_rate": 9.009921507543445e-07,
+      "loss": 4.6211,
+      "step": 1271
+    },
+    {
+      "batch_num_effect_tokens": 7922,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8092,
+      "epoch": 1.65061,
+      "grad_norm": 0.12979581952095032,
+      "learning_rate": 8.945126940855864e-07,
+      "loss": 4.6465,
+      "step": 1272
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.65191,
+      "grad_norm": 0.12733100354671478,
+      "learning_rate": 8.880543303905931e-07,
+      "loss": 4.082,
+      "step": 1273
+    },
+    {
+      "batch_num_effect_tokens": 7917,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8089,
+      "epoch": 1.6532,
+      "grad_norm": 0.11588294804096222,
+      "learning_rate": 8.816170928508367e-07,
+      "loss": 4.1299,
+      "step": 1274
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.6545,
+      "grad_norm": 0.11835164576768875,
+      "learning_rate": 8.752010145392408e-07,
+      "loss": 4.1543,
+      "step": 1275
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.6558,
+      "grad_norm": 0.13759027421474457,
+      "learning_rate": 8.688061284200266e-07,
+      "loss": 4.5654,
+      "step": 1276
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.6571,
+      "grad_norm": 0.13586406409740448,
+      "learning_rate": 8.624324673485252e-07,
+      "loss": 4.3682,
+      "step": 1277
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8164,
+      "epoch": 1.65839,
+      "grad_norm": 0.13176386058330536,
+      "learning_rate": 8.560800640710248e-07,
+      "loss": 4.3643,
+      "step": 1278
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.65969,
+      "grad_norm": 0.1281612068414688,
+      "learning_rate": 8.497489512245971e-07,
+      "loss": 4.0146,
+      "step": 1279
+    },
+    {
+      "batch_num_effect_tokens": 7869,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 1.66099,
+      "grad_norm": 0.11786817759275436,
+      "learning_rate": 8.434391613369258e-07,
+      "loss": 3.9717,
+      "step": 1280
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.66229,
+      "grad_norm": 0.13192172348499298,
+      "learning_rate": 8.371507268261436e-07,
+      "loss": 4.3145,
+      "step": 1281
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.66358,
+      "grad_norm": 0.13585922122001648,
+      "learning_rate": 8.308836800006648e-07,
+      "loss": 4.2051,
+      "step": 1282
+    },
+    {
+      "batch_num_effect_tokens": 8044,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.66488,
+      "grad_norm": 0.13544043898582458,
+      "learning_rate": 8.246380530590175e-07,
+      "loss": 4.2764,
+      "step": 1283
+    },
+    {
+      "batch_num_effect_tokens": 7965,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8110,
+      "epoch": 1.66618,
+      "grad_norm": 0.1333230584859848,
+      "learning_rate": 8.184138780896839e-07,
+      "loss": 4.2051,
+      "step": 1284
+    },
+    {
+      "batch_num_effect_tokens": 7949,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 1.66748,
+      "grad_norm": 0.1322176456451416,
+      "learning_rate": 8.122111870709287e-07,
+      "loss": 4.2715,
+      "step": 1285
+    },
+    {
+      "batch_num_effect_tokens": 7749,
+      "batch_num_samples": 30,
+      "batch_num_tokens": 8018,
+      "epoch": 1.66878,
+      "grad_norm": 0.13512325286865234,
+      "learning_rate": 8.060300118706327e-07,
+      "loss": 3.9844,
+      "step": 1286
+    },
+    {
+      "batch_num_effect_tokens": 7993,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8119,
+      "epoch": 1.67007,
+      "grad_norm": 0.13298512995243073,
+      "learning_rate": 7.99870384246143e-07,
+      "loss": 4.3496,
+      "step": 1287
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.67137,
+      "grad_norm": 0.134169802069664,
+      "learning_rate": 7.937323358440935e-07,
+      "loss": 4.5732,
+      "step": 1288
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.67267,
+      "grad_norm": 0.139683797955513,
+      "learning_rate": 7.876158982002552e-07,
+      "loss": 4.2637,
+      "step": 1289
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.67397,
+      "grad_norm": 0.13284529745578766,
+      "learning_rate": 7.815211027393616e-07,
+      "loss": 4.3613,
+      "step": 1290
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8165,
+      "epoch": 1.67526,
+      "grad_norm": 0.1360265165567398,
+      "learning_rate": 7.754479807749571e-07,
+      "loss": 3.9727,
+      "step": 1291
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.67656,
+      "grad_norm": 0.13193875551223755,
+      "learning_rate": 7.693965635092365e-07,
+      "loss": 4.5273,
+      "step": 1292
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.67786,
+      "grad_norm": 0.1406845897436142,
+      "learning_rate": 7.633668820328765e-07,
+      "loss": 4.4824,
+      "step": 1293
+    },
+    {
+      "batch_num_effect_tokens": 7877,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8100,
+      "epoch": 1.67916,
+      "grad_norm": 0.12547780573368073,
+      "learning_rate": 7.573589673248833e-07,
+      "loss": 4.3389,
+      "step": 1294
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8175,
+      "epoch": 1.68045,
+      "grad_norm": 0.12817974388599396,
+      "learning_rate": 7.513728502524286e-07,
+      "loss": 4.0254,
+      "step": 1295
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.68175,
+      "grad_norm": 0.1354372501373291,
+      "learning_rate": 7.454085615706951e-07,
+      "loss": 4.4277,
+      "step": 1296
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.68305,
+      "grad_norm": 0.12303349375724792,
+      "learning_rate": 7.394661319227175e-07,
+      "loss": 4.6855,
+      "step": 1297
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.68435,
+      "grad_norm": 0.11604474484920502,
+      "learning_rate": 7.33545591839222e-07,
+      "loss": 4.2285,
+      "step": 1298
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.68564,
+      "grad_norm": 0.12446524202823639,
+      "learning_rate": 7.276469717384726e-07,
+      "loss": 4.1304,
+      "step": 1299
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.68694,
+      "grad_norm": 0.1320941001176834,
+      "learning_rate": 7.217703019261135e-07,
+      "loss": 4.4512,
+      "step": 1300
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.68824,
+      "grad_norm": 0.13994361460208893,
+      "learning_rate": 7.15915612595014e-07,
+      "loss": 4.5732,
+      "step": 1301
+    },
+    {
+      "batch_num_effect_tokens": 7981,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8121,
+      "epoch": 1.68954,
+      "grad_norm": 0.1323327273130417,
+      "learning_rate": 7.100829338251147e-07,
+      "loss": 4.2119,
+      "step": 1302
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8152,
+      "epoch": 1.69084,
+      "grad_norm": 0.12983821332454681,
+      "learning_rate": 7.042722955832703e-07,
+      "loss": 4.5225,
+      "step": 1303
+    },
+    {
+      "batch_num_effect_tokens": 7942,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8116,
+      "epoch": 1.69213,
+      "grad_norm": 0.13648320734500885,
+      "learning_rate": 6.984837277230927e-07,
+      "loss": 4.3262,
+      "step": 1304
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.69343,
+      "grad_norm": 0.13606522977352142,
+      "learning_rate": 6.927172599848092e-07,
+      "loss": 4.4053,
+      "step": 1305
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8185,
+      "epoch": 1.69473,
+      "grad_norm": 0.1341053992509842,
+      "learning_rate": 6.86972921995096e-07,
+      "loss": 4.0566,
+      "step": 1306
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.69603,
+      "grad_norm": 0.13478976488113403,
+      "learning_rate": 6.812507432669374e-07,
+      "loss": 4.1006,
+      "step": 1307
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.69732,
+      "grad_norm": 0.12561902403831482,
+      "learning_rate": 6.755507531994637e-07,
+      "loss": 4.1797,
+      "step": 1308
+    },
+    {
+      "batch_num_effect_tokens": 7813,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 7989,
+      "epoch": 1.69862,
+      "grad_norm": 0.13221710920333862,
+      "learning_rate": 6.698729810778065e-07,
+      "loss": 4.3721,
+      "step": 1309
+    },
+    {
+      "batch_num_effect_tokens": 7902,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8090,
+      "epoch": 1.69992,
+      "grad_norm": 0.1371319591999054,
+      "learning_rate": 6.642174560729514e-07,
+      "loss": 4.084,
+      "step": 1310
+    },
+    {
+      "batch_num_effect_tokens": 7862,
+      "batch_num_samples": 29,
+      "batch_num_tokens": 8092,
+      "epoch": 1.70122,
+      "grad_norm": 0.12322220206260681,
+      "learning_rate": 6.585842072415799e-07,
+      "loss": 4.2988,
+      "step": 1311
+    },
+    {
+      "batch_num_effect_tokens": 7981,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8110,
+      "epoch": 1.70251,
+      "grad_norm": 0.13426773250102997,
+      "learning_rate": 6.529732635259234e-07,
+      "loss": 4.7168,
+      "step": 1312
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.70381,
+      "grad_norm": 0.12802381813526154,
+      "learning_rate": 6.473846537536183e-07,
+      "loss": 4.4238,
+      "step": 1313
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8189,
+      "epoch": 1.70511,
+      "grad_norm": 0.1425420641899109,
+      "learning_rate": 6.41818406637551e-07,
+      "loss": 4.29,
+      "step": 1314
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8190,
+      "epoch": 1.70641,
+      "grad_norm": 0.13316959142684937,
+      "learning_rate": 6.36274550775719e-07,
+      "loss": 4.2227,
+      "step": 1315
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.7077,
+      "grad_norm": 0.1335574984550476,
+      "learning_rate": 6.307531146510754e-07,
+      "loss": 4.5928,
+      "step": 1316
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.709,
+      "grad_norm": 0.1302870213985443,
+      "learning_rate": 6.252541266313866e-07,
+      "loss": 4.0986,
+      "step": 1317
+    },
+    {
+      "batch_num_effect_tokens": 8042,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.7103,
+      "grad_norm": 0.13204161822795868,
+      "learning_rate": 6.197776149690871e-07,
+      "loss": 4.5967,
+      "step": 1318
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.7116,
+      "grad_norm": 0.12694908678531647,
+      "learning_rate": 6.143236078011317e-07,
+      "loss": 4.2031,
+      "step": 1319
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8086,
+      "epoch": 1.7129,
+      "grad_norm": 0.12293847650289536,
+      "learning_rate": 6.088921331488568e-07,
+      "loss": 3.9863,
+      "step": 1320
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.71419,
+      "grad_norm": 0.12406273186206818,
+      "learning_rate": 6.034832189178302e-07,
+      "loss": 4.2266,
+      "step": 1321
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.71549,
+      "grad_norm": 0.12104199081659317,
+      "learning_rate": 5.980968928977049e-07,
+      "loss": 4.2158,
+      "step": 1322
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.71679,
+      "grad_norm": 0.13361838459968567,
+      "learning_rate": 5.927331827620902e-07,
+      "loss": 4.3867,
+      "step": 1323
+    },
+    {
+      "batch_num_effect_tokens": 8051,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.71809,
+      "grad_norm": 0.1261344850063324,
+      "learning_rate": 5.873921160683943e-07,
+      "loss": 4.332,
+      "step": 1324
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.71938,
+      "grad_norm": 0.127448171377182,
+      "learning_rate": 5.820737202576909e-07,
+      "loss": 4.499,
+      "step": 1325
+    },
+    {
+      "batch_num_effect_tokens": 7944,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8112,
+      "epoch": 1.72068,
+      "grad_norm": 0.12296268343925476,
+      "learning_rate": 5.767780226545766e-07,
+      "loss": 4.0928,
+      "step": 1326
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.72198,
+      "grad_norm": 0.1200428232550621,
+      "learning_rate": 5.715050504670288e-07,
+      "loss": 4.5107,
+      "step": 1327
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.72328,
+      "grad_norm": 0.1229812353849411,
+      "learning_rate": 5.662548307862714e-07,
+      "loss": 4.6191,
+      "step": 1328
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.72457,
+      "grad_norm": 0.1287735551595688,
+      "learning_rate": 5.61027390586626e-07,
+      "loss": 4.6602,
+      "step": 1329
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.72587,
+      "grad_norm": 0.12394028156995773,
+      "learning_rate": 5.558227567253832e-07,
+      "loss": 4.3076,
+      "step": 1330
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8149,
+      "epoch": 1.72717,
+      "grad_norm": 0.12008914351463318,
+      "learning_rate": 5.506409559426573e-07,
+      "loss": 4.373,
+      "step": 1331
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.72847,
+      "grad_norm": 0.12099087238311768,
+      "learning_rate": 5.454820148612533e-07,
+      "loss": 4.0195,
+      "step": 1332
+    },
+    {
+      "batch_num_effect_tokens": 7910,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8096,
+      "epoch": 1.72976,
+      "grad_norm": 0.13755109906196594,
+      "learning_rate": 5.403459599865307e-07,
+      "loss": 4.459,
+      "step": 1333
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.73106,
+      "grad_norm": 0.12216237932443619,
+      "learning_rate": 5.352328177062626e-07,
+      "loss": 4.0791,
+      "step": 1334
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8105,
+      "epoch": 1.73236,
+      "grad_norm": 0.14445650577545166,
+      "learning_rate": 5.301426142905019e-07,
+      "loss": 4.2441,
+      "step": 1335
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.73366,
+      "grad_norm": 0.12957431375980377,
+      "learning_rate": 5.250753758914506e-07,
+      "loss": 4.7578,
+      "step": 1336
+    },
+    {
+      "batch_num_effect_tokens": 7838,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8114,
+      "epoch": 1.73496,
+      "grad_norm": 0.12466391175985336,
+      "learning_rate": 5.200311285433213e-07,
+      "loss": 4.2861,
+      "step": 1337
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.73625,
+      "grad_norm": 0.12449593842029572,
+      "learning_rate": 5.15009898162202e-07,
+      "loss": 4.5078,
+      "step": 1338
+    },
+    {
+      "batch_num_effect_tokens": 8008,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.73755,
+      "grad_norm": 0.12438397109508514,
+      "learning_rate": 5.100117105459279e-07,
+      "loss": 4.2246,
+      "step": 1339
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.73885,
+      "grad_norm": 0.1266915500164032,
+      "learning_rate": 5.050365913739441e-07,
+      "loss": 3.8174,
+      "step": 1340
+    },
+    {
+      "batch_num_effect_tokens": 8032,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74015,
+      "grad_norm": 0.12431465089321136,
+      "learning_rate": 5.000845662071779e-07,
+      "loss": 4.334,
+      "step": 1341
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74144,
+      "grad_norm": 0.12137281149625778,
+      "learning_rate": 4.951556604879049e-07,
+      "loss": 4.001,
+      "step": 1342
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74274,
+      "grad_norm": 0.12117177993059158,
+      "learning_rate": 4.902498995396166e-07,
+      "loss": 4.2422,
+      "step": 1343
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74404,
+      "grad_norm": 0.12428870052099228,
+      "learning_rate": 4.853673085668947e-07,
+      "loss": 4.1943,
+      "step": 1344
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74534,
+      "grad_norm": 0.12221790850162506,
+      "learning_rate": 4.80507912655277e-07,
+      "loss": 4.5078,
+      "step": 1345
+    },
+    {
+      "batch_num_effect_tokens": 7997,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74663,
+      "grad_norm": 0.12552009522914886,
+      "learning_rate": 4.75671736771135e-07,
+      "loss": 4.2559,
+      "step": 1346
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74793,
+      "grad_norm": 0.11707472056150436,
+      "learning_rate": 4.7085880576153765e-07,
+      "loss": 3.9307,
+      "step": 1347
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.74923,
+      "grad_norm": 0.11228827387094498,
+      "learning_rate": 4.660691443541282e-07,
+      "loss": 4.1211,
+      "step": 1348
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.75053,
+      "grad_norm": 0.12909752130508423,
+      "learning_rate": 4.6130277715699777e-07,
+      "loss": 4.2334,
+      "step": 1349
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.75182,
+      "grad_norm": 0.1351223587989807,
+      "learning_rate": 4.565597286585555e-07,
+      "loss": 4.5566,
+      "step": 1350
+    },
+    {
+      "batch_num_effect_tokens": 7985,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8167,
+      "epoch": 1.75312,
+      "grad_norm": 0.13169695436954498,
+      "learning_rate": 4.5184002322740784e-07,
+      "loss": 4.4697,
+      "step": 1351
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.75442,
+      "grad_norm": 0.12474015355110168,
+      "learning_rate": 4.4714368511222905e-07,
+      "loss": 3.9141,
+      "step": 1352
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.75572,
+      "grad_norm": 0.1359989494085312,
+      "learning_rate": 4.4247073844163434e-07,
+      "loss": 4.4863,
+      "step": 1353
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.75702,
+      "grad_norm": 0.13669313490390778,
+      "learning_rate": 4.3782120722406565e-07,
+      "loss": 4.3809,
+      "step": 1354
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.75831,
+      "grad_norm": 0.13616794347763062,
+      "learning_rate": 4.331951153476588e-07,
+      "loss": 4.0215,
+      "step": 1355
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8191,
+      "epoch": 1.75961,
+      "grad_norm": 0.13854430615901947,
+      "learning_rate": 4.285924865801233e-07,
+      "loss": 4.5312,
+      "step": 1356
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.76091,
+      "grad_norm": 0.13246670365333557,
+      "learning_rate": 4.2401334456862344e-07,
+      "loss": 4.4092,
+      "step": 1357
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.76221,
+      "grad_norm": 0.12861448526382446,
+      "learning_rate": 4.194577128396521e-07,
+      "loss": 4.2188,
+      "step": 1358
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.7635,
+      "grad_norm": 0.1204322949051857,
+      "learning_rate": 4.149256147989139e-07,
+      "loss": 4.2617,
+      "step": 1359
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 1.7648,
+      "grad_norm": 0.14009417593479156,
+      "learning_rate": 4.1041707373120354e-07,
+      "loss": 4.0459,
+      "step": 1360
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.7661,
+      "grad_norm": 0.13464143872261047,
+      "learning_rate": 4.05932112800283e-07,
+      "loss": 4.4492,
+      "step": 1361
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8192,
+      "epoch": 1.7674,
+      "grad_norm": 0.12960170209407806,
+      "learning_rate": 4.0147075504876844e-07,
+      "loss": 4.3623,
+      "step": 1362
+    },
+    {
+      "batch_num_effect_tokens": 8084,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.76869,
+      "grad_norm": 0.1406746506690979,
+      "learning_rate": 3.9703302339800687e-07,
+      "loss": 4.4395,
+      "step": 1363
+    },
+    {
+      "batch_num_effect_tokens": 8039,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.76999,
+      "grad_norm": 0.12615273892879486,
+      "learning_rate": 3.9261894064796136e-07,
+      "loss": 4.4746,
+      "step": 1364
+    },
+    {
+      "batch_num_effect_tokens": 8056,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.77129,
+      "grad_norm": 0.13323578238487244,
+      "learning_rate": 3.882285294770938e-07,
+      "loss": 4.1318,
+      "step": 1365
+    },
+    {
+      "batch_num_effect_tokens": 7836,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8011,
+      "epoch": 1.77259,
+      "grad_norm": 0.12410898506641388,
+      "learning_rate": 3.8386181244224274e-07,
+      "loss": 4.2305,
+      "step": 1366
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.77388,
+      "grad_norm": 0.1302991360425949,
+      "learning_rate": 3.7951881197851816e-07,
+      "loss": 4.3486,
+      "step": 1367
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8081,
+      "epoch": 1.77518,
+      "grad_norm": 0.13446044921875,
+      "learning_rate": 3.751995503991762e-07,
+      "loss": 4.4795,
+      "step": 1368
+    },
+    {
+      "batch_num_effect_tokens": 8090,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.77648,
+      "grad_norm": 0.1244499534368515,
+      "learning_rate": 3.709040498955102e-07,
+      "loss": 4.1152,
+      "step": 1369
+    },
+    {
+      "batch_num_effect_tokens": 7669,
+      "batch_num_samples": 32,
+      "batch_num_tokens": 7981,
+      "epoch": 1.77778,
+      "grad_norm": 0.13350388407707214,
+      "learning_rate": 3.666323325367344e-07,
+      "loss": 4.3936,
+      "step": 1370
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8108,
+      "epoch": 1.77908,
+      "grad_norm": 0.1298922747373581,
+      "learning_rate": 3.623844202698701e-07,
+      "loss": 4.291,
+      "step": 1371
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8166,
+      "epoch": 1.78037,
+      "grad_norm": 0.12909585237503052,
+      "learning_rate": 3.581603349196372e-07,
+      "loss": 4.3447,
+      "step": 1372
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.78167,
+      "grad_norm": 0.1274796426296234,
+      "learning_rate": 3.5396009818833567e-07,
+      "loss": 4.3516,
+      "step": 1373
+    },
+    {
+      "batch_num_effect_tokens": 8005,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8184,
+      "epoch": 1.78297,
+      "grad_norm": 0.11704126745462418,
+      "learning_rate": 3.497837316557384e-07,
+      "loss": 4.2861,
+      "step": 1374
+    },
+    {
+      "batch_num_effect_tokens": 7885,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.78427,
+      "grad_norm": 0.11343449354171753,
+      "learning_rate": 3.4563125677897936e-07,
+      "loss": 4.2314,
+      "step": 1375
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.78556,
+      "grad_norm": 0.12782780826091766,
+      "learning_rate": 3.41502694892441e-07,
+      "loss": 4.335,
+      "step": 1376
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.78686,
+      "grad_norm": 0.12656031548976898,
+      "learning_rate": 3.373980672076516e-07,
+      "loss": 4.3848,
+      "step": 1377
+    },
+    {
+      "batch_num_effect_tokens": 8057,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.78816,
+      "grad_norm": 0.12621362507343292,
+      "learning_rate": 3.333173948131663e-07,
+      "loss": 4.1943,
+      "step": 1378
+    },
+    {
+      "batch_num_effect_tokens": 8012,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.78946,
+      "grad_norm": 0.12268788367509842,
+      "learning_rate": 3.2926069867446673e-07,
+      "loss": 4.7314,
+      "step": 1379
+    },
+    {
+      "batch_num_effect_tokens": 7885,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8124,
+      "epoch": 1.79075,
+      "grad_norm": 0.12487363070249557,
+      "learning_rate": 3.252279996338492e-07,
+      "loss": 4.0225,
+      "step": 1380
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.79205,
+      "grad_norm": 0.13526642322540283,
+      "learning_rate": 3.212193184103196e-07,
+      "loss": 4.2959,
+      "step": 1381
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8146,
+      "epoch": 1.79335,
+      "grad_norm": 0.13416649401187897,
+      "learning_rate": 3.172346755994865e-07,
+      "loss": 4.4658,
+      "step": 1382
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8133,
+      "epoch": 1.79465,
+      "grad_norm": 0.11787126213312149,
+      "learning_rate": 3.132740916734556e-07,
+      "loss": 4.0879,
+      "step": 1383
+    },
+    {
+      "batch_num_effect_tokens": 7981,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8192,
+      "epoch": 1.79594,
+      "grad_norm": 0.13434545695781708,
+      "learning_rate": 3.0933758698072023e-07,
+      "loss": 4.5879,
+      "step": 1384
+    },
+    {
+      "batch_num_effect_tokens": 8062,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.79724,
+      "grad_norm": 0.13060152530670166,
+      "learning_rate": 3.054251817460663e-07,
+      "loss": 4.4268,
+      "step": 1385
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.79854,
+      "grad_norm": 0.12576042115688324,
+      "learning_rate": 3.015368960704584e-07,
+      "loss": 4.0508,
+      "step": 1386
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.79854,
+      "eval_eval_loss": 0.5592343807220459,
+      "eval_eval_runtime": 115.3013,
+      "eval_eval_samples_per_second": 43.365,
+      "eval_eval_steps_per_second": 2.715,
+      "step": 1386
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8189,
+      "epoch": 1.79984,
+      "grad_norm": 0.13307738304138184,
+      "learning_rate": 2.9767274993094285e-07,
+      "loss": 4.0938,
+      "step": 1387
+    },
+    {
+      "batch_num_effect_tokens": 7991,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8168,
+      "epoch": 1.80114,
+      "grad_norm": 0.12784725427627563,
+      "learning_rate": 2.938327631805421e-07,
+      "loss": 4.2842,
+      "step": 1388
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.80243,
+      "grad_norm": 0.12473003566265106,
+      "learning_rate": 2.900169555481536e-07,
+      "loss": 4.373,
+      "step": 1389
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.80373,
+      "grad_norm": 0.12554599344730377,
+      "learning_rate": 2.862253466384507e-07,
+      "loss": 4.249,
+      "step": 1390
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.80503,
+      "grad_norm": 0.12785503268241882,
+      "learning_rate": 2.8245795593177637e-07,
+      "loss": 4.376,
+      "step": 1391
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.80633,
+      "grad_norm": 0.13512638211250305,
+      "learning_rate": 2.787148027840486e-07,
+      "loss": 4.3232,
+      "step": 1392
+    },
+    {
+      "batch_num_effect_tokens": 8052,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.80762,
+      "grad_norm": 0.1361304223537445,
+      "learning_rate": 2.7499590642665773e-07,
+      "loss": 4.4668,
+      "step": 1393
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.80892,
+      "grad_norm": 0.1287328451871872,
+      "learning_rate": 2.713012859663694e-07,
+      "loss": 4.2832,
+      "step": 1394
+    },
+    {
+      "batch_num_effect_tokens": 8080,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.81022,
+      "grad_norm": 0.13318751752376556,
+      "learning_rate": 2.6763096038522673e-07,
+      "loss": 4.5,
+      "step": 1395
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8107,
+      "epoch": 1.81152,
+      "grad_norm": 0.12792308628559113,
+      "learning_rate": 2.6398494854045055e-07,
+      "loss": 4.0791,
+      "step": 1396
+    },
+    {
+      "batch_num_effect_tokens": 8067,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.81281,
+      "grad_norm": 0.13105538487434387,
+      "learning_rate": 2.6036326916434153e-07,
+      "loss": 4.4668,
+      "step": 1397
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.81411,
+      "grad_norm": 0.1282477080821991,
+      "learning_rate": 2.5676594086419037e-07,
+      "loss": 4.124,
+      "step": 1398
+    },
+    {
+      "batch_num_effect_tokens": 7992,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8144,
+      "epoch": 1.81541,
+      "grad_norm": 0.13009123504161835,
+      "learning_rate": 2.531929821221768e-07,
+      "loss": 4.4424,
+      "step": 1399
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.81671,
+      "grad_norm": 0.14188528060913086,
+      "learning_rate": 2.4964441129527337e-07,
+      "loss": 4.3438,
+      "step": 1400
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.818,
+      "grad_norm": 0.13717369735240936,
+      "learning_rate": 2.4612024661515686e-07,
+      "loss": 4.4883,
+      "step": 1401
+    },
+    {
+      "batch_num_effect_tokens": 8076,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8193,
+      "grad_norm": 0.13617396354675293,
+      "learning_rate": 2.426205061881082e-07,
+      "loss": 5.0674,
+      "step": 1402
+    },
+    {
+      "batch_num_effect_tokens": 7924,
+      "batch_num_samples": 25,
+      "batch_num_tokens": 8156,
+      "epoch": 1.8206,
+      "grad_norm": 0.1338592916727066,
+      "learning_rate": 2.3914520799492527e-07,
+      "loss": 4.501,
+      "step": 1403
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8219,
+      "grad_norm": 0.11929450929164886,
+      "learning_rate": 2.3569436989082705e-07,
+      "loss": 4.0938,
+      "step": 1404
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8232,
+      "grad_norm": 0.1299748420715332,
+      "learning_rate": 2.32268009605362e-07,
+      "loss": 4.1816,
+      "step": 1405
+    },
+    {
+      "batch_num_effect_tokens": 8020,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8187,
+      "epoch": 1.82449,
+      "grad_norm": 0.13757820427417755,
+      "learning_rate": 2.2886614474231794e-07,
+      "loss": 4.4873,
+      "step": 1406
+    },
+    {
+      "batch_num_effect_tokens": 8046,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8164,
+      "epoch": 1.82579,
+      "grad_norm": 0.1272607445716858,
+      "learning_rate": 2.2548879277963065e-07,
+      "loss": 4.7734,
+      "step": 1407
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8142,
+      "epoch": 1.82709,
+      "grad_norm": 0.1257350742816925,
+      "learning_rate": 2.2213597106929608e-07,
+      "loss": 4.124,
+      "step": 1408
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8087,
+      "epoch": 1.82839,
+      "grad_norm": 0.12541215121746063,
+      "learning_rate": 2.1880769683727986e-07,
+      "loss": 3.9297,
+      "step": 1409
+    },
+    {
+      "batch_num_effect_tokens": 7907,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8074,
+      "epoch": 1.82968,
+      "grad_norm": 0.11980535089969635,
+      "learning_rate": 2.1550398718342692e-07,
+      "loss": 3.9902,
+      "step": 1410
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.83098,
+      "grad_norm": 0.13398943841457367,
+      "learning_rate": 2.1222485908137747e-07,
+      "loss": 3.9873,
+      "step": 1411
+    },
+    {
+      "batch_num_effect_tokens": 8078,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.83228,
+      "grad_norm": 0.12076178193092346,
+      "learning_rate": 2.0897032937847616e-07,
+      "loss": 4.5732,
+      "step": 1412
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.83358,
+      "grad_norm": 0.12311708927154541,
+      "learning_rate": 2.0574041479568817e-07,
+      "loss": 4.1377,
+      "step": 1413
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8182,
+      "epoch": 1.83487,
+      "grad_norm": 0.13371671736240387,
+      "learning_rate": 2.0253513192751374e-07,
+      "loss": 4.7109,
+      "step": 1414
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.83617,
+      "grad_norm": 0.12499105930328369,
+      "learning_rate": 1.9935449724189705e-07,
+      "loss": 4.2949,
+      "step": 1415
+    },
+    {
+      "batch_num_effect_tokens": 7971,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8157,
+      "epoch": 1.83747,
+      "grad_norm": 0.12765094637870789,
+      "learning_rate": 1.9619852708015142e-07,
+      "loss": 4.6221,
+      "step": 1416
+    },
+    {
+      "batch_num_effect_tokens": 7899,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8081,
+      "epoch": 1.83877,
+      "grad_norm": 0.12610025703907013,
+      "learning_rate": 1.9306723765686598e-07,
+      "loss": 4.502,
+      "step": 1417
+    },
+    {
+      "batch_num_effect_tokens": 7882,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8120,
+      "epoch": 1.84006,
+      "grad_norm": 0.14963027834892273,
+      "learning_rate": 1.8996064505982903e-07,
+      "loss": 4.8271,
+      "step": 1418
+    },
+    {
+      "batch_num_effect_tokens": 7996,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.84136,
+      "grad_norm": 0.12004819512367249,
+      "learning_rate": 1.8687876524993987e-07,
+      "loss": 4.5586,
+      "step": 1419
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8191,
+      "epoch": 1.84266,
+      "grad_norm": 0.1406659036874771,
+      "learning_rate": 1.8382161406113208e-07,
+      "loss": 4.3506,
+      "step": 1420
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.84396,
+      "grad_norm": 0.1230742409825325,
+      "learning_rate": 1.807892072002898e-07,
+      "loss": 4.8135,
+      "step": 1421
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.84526,
+      "grad_norm": 0.1338808238506317,
+      "learning_rate": 1.7778156024716497e-07,
+      "loss": 4.292,
+      "step": 1422
+    },
+    {
+      "batch_num_effect_tokens": 8011,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.84655,
+      "grad_norm": 0.11327774822711945,
+      "learning_rate": 1.7479868865430072e-07,
+      "loss": 4.3232,
+      "step": 1423
+    },
+    {
+      "batch_num_effect_tokens": 8029,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8163,
+      "epoch": 1.84785,
+      "grad_norm": 0.13712045550346375,
+      "learning_rate": 1.7184060774695033e-07,
+      "loss": 4.5557,
+      "step": 1424
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8079,
+      "epoch": 1.84915,
+      "grad_norm": 0.12802965939044952,
+      "learning_rate": 1.689073327229973e-07,
+      "loss": 3.8936,
+      "step": 1425
+    },
+    {
+      "batch_num_effect_tokens": 7866,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8028,
+      "epoch": 1.85045,
+      "grad_norm": 0.14027731120586395,
+      "learning_rate": 1.659988786528821e-07,
+      "loss": 4.3535,
+      "step": 1426
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.85174,
+      "grad_norm": 0.1289585828781128,
+      "learning_rate": 1.6311526047951774e-07,
+      "loss": 4.3008,
+      "step": 1427
+    },
+    {
+      "batch_num_effect_tokens": 8002,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.85304,
+      "grad_norm": 0.12682278454303741,
+      "learning_rate": 1.6025649301821877e-07,
+      "loss": 4.6836,
+      "step": 1428
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8189,
+      "epoch": 1.85434,
+      "grad_norm": 0.1304275095462799,
+      "learning_rate": 1.5742259095662126e-07,
+      "loss": 4.1318,
+      "step": 1429
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.85564,
+      "grad_norm": 0.1194472685456276,
+      "learning_rate": 1.5461356885461077e-07,
+      "loss": 4.4521,
+      "step": 1430
+    },
+    {
+      "batch_num_effect_tokens": 8049,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.85693,
+      "grad_norm": 0.12798550724983215,
+      "learning_rate": 1.5182944114424337e-07,
+      "loss": 4.1016,
+      "step": 1431
+    },
+    {
+      "batch_num_effect_tokens": 8059,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.85823,
+      "grad_norm": 0.12709662318229675,
+      "learning_rate": 1.4907022212967803e-07,
+      "loss": 4.1875,
+      "step": 1432
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.85953,
+      "grad_norm": 0.11923374235630035,
+      "learning_rate": 1.463359259870939e-07,
+      "loss": 3.9727,
+      "step": 1433
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.86083,
+      "grad_norm": 0.11731646209955215,
+      "learning_rate": 1.436265667646275e-07,
+      "loss": 4.1377,
+      "step": 1434
+    },
+    {
+      "batch_num_effect_tokens": 7870,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8076,
+      "epoch": 1.86212,
+      "grad_norm": 0.12564802169799805,
+      "learning_rate": 1.4094215838229176e-07,
+      "loss": 4.3174,
+      "step": 1435
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.86342,
+      "grad_norm": 0.1299310177564621,
+      "learning_rate": 1.38282714631911e-07,
+      "loss": 4.2109,
+      "step": 1436
+    },
+    {
+      "batch_num_effect_tokens": 8068,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.86472,
+      "grad_norm": 0.1304285228252411,
+      "learning_rate": 1.3564824917704556e-07,
+      "loss": 4.2119,
+      "step": 1437
+    },
+    {
+      "batch_num_effect_tokens": 8061,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.86602,
+      "grad_norm": 0.1279984563589096,
+      "learning_rate": 1.3303877555292443e-07,
+      "loss": 4.4756,
+      "step": 1438
+    },
+    {
+      "batch_num_effect_tokens": 7857,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8086,
+      "epoch": 1.86732,
+      "grad_norm": 0.12695564329624176,
+      "learning_rate": 1.3045430716637608e-07,
+      "loss": 4.6621,
+      "step": 1439
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.86861,
+      "grad_norm": 0.13486185669898987,
+      "learning_rate": 1.2789485729575612e-07,
+      "loss": 4.252,
+      "step": 1440
+    },
+    {
+      "batch_num_effect_tokens": 8022,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.86991,
+      "grad_norm": 0.11354608088731766,
+      "learning_rate": 1.253604390908819e-07,
+      "loss": 3.9609,
+      "step": 1441
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.87121,
+      "grad_norm": 0.13337519764900208,
+      "learning_rate": 1.2285106557296479e-07,
+      "loss": 3.9141,
+      "step": 1442
+    },
+    {
+      "batch_num_effect_tokens": 8017,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.87251,
+      "grad_norm": 0.13385063409805298,
+      "learning_rate": 1.2036674963454232e-07,
+      "loss": 4.4512,
+      "step": 1443
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8738,
+      "grad_norm": 0.12899687886238098,
+      "learning_rate": 1.1790750403941231e-07,
+      "loss": 4.5166,
+      "step": 1444
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8751,
+      "grad_norm": 0.1389113813638687,
+      "learning_rate": 1.1547334142256895e-07,
+      "loss": 4.4268,
+      "step": 1445
+    },
+    {
+      "batch_num_effect_tokens": 8048,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8764,
+      "grad_norm": 0.13730958104133606,
+      "learning_rate": 1.1306427429013222e-07,
+      "loss": 4.4258,
+      "step": 1446
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.8777,
+      "grad_norm": 0.1304236501455307,
+      "learning_rate": 1.1068031501929366e-07,
+      "loss": 4.334,
+      "step": 1447
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.87899,
+      "grad_norm": 0.12509453296661377,
+      "learning_rate": 1.0832147585824182e-07,
+      "loss": 3.9365,
+      "step": 1448
+    },
+    {
+      "batch_num_effect_tokens": 7965,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8140,
+      "epoch": 1.88029,
+      "grad_norm": 0.1231050118803978,
+      "learning_rate": 1.0598776892610685e-07,
+      "loss": 4.208,
+      "step": 1449
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.88159,
+      "grad_norm": 0.12538541853427887,
+      "learning_rate": 1.0367920621289496e-07,
+      "loss": 4.3555,
+      "step": 1450
+    },
+    {
+      "batch_num_effect_tokens": 7939,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8087,
+      "epoch": 1.88289,
+      "grad_norm": 0.1338019073009491,
+      "learning_rate": 1.0139579957942736e-07,
+      "loss": 3.7793,
+      "step": 1451
+    },
+    {
+      "batch_num_effect_tokens": 7940,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8128,
+      "epoch": 1.88418,
+      "grad_norm": 0.13684529066085815,
+      "learning_rate": 9.913756075728088e-08,
+      "loss": 4.415,
+      "step": 1452
+    },
+    {
+      "batch_num_effect_tokens": 7970,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8120,
+      "epoch": 1.88548,
+      "grad_norm": 0.1501617431640625,
+      "learning_rate": 9.69045013487252e-08,
+      "loss": 4.5879,
+      "step": 1453
+    },
+    {
+      "batch_num_effect_tokens": 7784,
+      "batch_num_samples": 28,
+      "batch_num_tokens": 8032,
+      "epoch": 1.88678,
+      "grad_norm": 0.1272551417350769,
+      "learning_rate": 9.469663282666519e-08,
+      "loss": 4.4873,
+      "step": 1454
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.88808,
+      "grad_norm": 0.13146163523197174,
+      "learning_rate": 9.251396653457978e-08,
+      "loss": 4.5342,
+      "step": 1455
+    },
+    {
+      "batch_num_effect_tokens": 8006,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.88938,
+      "grad_norm": 0.12587164342403412,
+      "learning_rate": 9.035651368646647e-08,
+      "loss": 4.1934,
+      "step": 1456
+    },
+    {
+      "batch_num_effect_tokens": 8075,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.89067,
+      "grad_norm": 0.12512744963169098,
+      "learning_rate": 8.822428536678251e-08,
+      "loss": 4.3193,
+      "step": 1457
+    },
+    {
+      "batch_num_effect_tokens": 7942,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8088,
+      "epoch": 1.89197,
+      "grad_norm": 0.13346132636070251,
+      "learning_rate": 8.611729253038658e-08,
+      "loss": 4.7422,
+      "step": 1458
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.89327,
+      "grad_norm": 0.13034431636333466,
+      "learning_rate": 8.403554600248498e-08,
+      "loss": 4.293,
+      "step": 1459
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.89457,
+      "grad_norm": 0.12772700190544128,
+      "learning_rate": 8.197905647857385e-08,
+      "loss": 4.1699,
+      "step": 1460
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.89586,
+      "grad_norm": 0.12720619142055511,
+      "learning_rate": 7.994783452438592e-08,
+      "loss": 4.1992,
+      "step": 1461
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8152,
+      "epoch": 1.89716,
+      "grad_norm": 0.12452313303947449,
+      "learning_rate": 7.794189057583335e-08,
+      "loss": 4.1182,
+      "step": 1462
+    },
+    {
+      "batch_num_effect_tokens": 7979,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8178,
+      "epoch": 1.89846,
+      "grad_norm": 0.1336333006620407,
+      "learning_rate": 7.59612349389599e-08,
+      "loss": 4.4717,
+      "step": 1463
+    },
+    {
+      "batch_num_effect_tokens": 7935,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8074,
+      "epoch": 1.89976,
+      "grad_norm": 0.134145587682724,
+      "learning_rate": 7.400587778988055e-08,
+      "loss": 4.3926,
+      "step": 1464
+    },
+    {
+      "batch_num_effect_tokens": 7802,
+      "batch_num_samples": 27,
+      "batch_num_tokens": 8039,
+      "epoch": 1.90105,
+      "grad_norm": 0.12880048155784607,
+      "learning_rate": 7.207582917473532e-08,
+      "loss": 4.9883,
+      "step": 1465
+    },
+    {
+      "batch_num_effect_tokens": 7901,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8076,
+      "epoch": 1.90235,
+      "grad_norm": 0.13317479193210602,
+      "learning_rate": 7.017109900963437e-08,
+      "loss": 3.8604,
+      "step": 1466
+    },
+    {
+      "batch_num_effect_tokens": 7849,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8023,
+      "epoch": 1.90365,
+      "grad_norm": 0.13144908845424652,
+      "learning_rate": 6.829169708060745e-08,
+      "loss": 4.3184,
+      "step": 1467
+    },
+    {
+      "batch_num_effect_tokens": 7972,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8146,
+      "epoch": 1.90495,
+      "grad_norm": 0.1278463453054428,
+      "learning_rate": 6.643763304355566e-08,
+      "loss": 4.335,
+      "step": 1468
+    },
+    {
+      "batch_num_effect_tokens": 8047,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.90624,
+      "grad_norm": 0.12541644275188446,
+      "learning_rate": 6.460891642419865e-08,
+      "loss": 4.5146,
+      "step": 1469
+    },
+    {
+      "batch_num_effect_tokens": 7816,
+      "batch_num_samples": 26,
+      "batch_num_tokens": 8032,
+      "epoch": 1.90754,
+      "grad_norm": 0.13894987106323242,
+      "learning_rate": 6.280555661802857e-08,
+      "loss": 4.9238,
+      "step": 1470
+    },
+    {
+      "batch_num_effect_tokens": 8050,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.90884,
+      "grad_norm": 0.13302992284297943,
+      "learning_rate": 6.102756289025957e-08,
+      "loss": 3.9033,
+      "step": 1471
+    },
+    {
+      "batch_num_effect_tokens": 7917,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8110,
+      "epoch": 1.91014,
+      "grad_norm": 0.12157834321260452,
+      "learning_rate": 5.92749443757823e-08,
+      "loss": 4.4756,
+      "step": 1472
+    },
+    {
+      "batch_num_effect_tokens": 8001,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.91144,
+      "grad_norm": 0.12770642340183258,
+      "learning_rate": 5.754771007911441e-08,
+      "loss": 4.5713,
+      "step": 1473
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.91273,
+      "grad_norm": 0.12481579184532166,
+      "learning_rate": 5.584586887435739e-08,
+      "loss": 4.3623,
+      "step": 1474
+    },
+    {
+      "batch_num_effect_tokens": 7933,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8086,
+      "epoch": 1.91403,
+      "grad_norm": 0.12227432429790497,
+      "learning_rate": 5.4169429505148144e-08,
+      "loss": 4.3359,
+      "step": 1475
+    },
+    {
+      "batch_num_effect_tokens": 8028,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.91533,
+      "grad_norm": 0.12159877270460129,
+      "learning_rate": 5.251840058461577e-08,
+      "loss": 4.0273,
+      "step": 1476
+    },
+    {
+      "batch_num_effect_tokens": 7994,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 1.91663,
+      "grad_norm": 0.1284315288066864,
+      "learning_rate": 5.089279059533658e-08,
+      "loss": 4.3184,
+      "step": 1477
+    },
+    {
+      "batch_num_effect_tokens": 8019,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.91792,
+      "grad_norm": 0.135638028383255,
+      "learning_rate": 4.92926078892908e-08,
+      "loss": 4.3555,
+      "step": 1478
+    },
+    {
+      "batch_num_effect_tokens": 8018,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 1.91922,
+      "grad_norm": 0.14031581580638885,
+      "learning_rate": 4.7717860687819254e-08,
+      "loss": 4.165,
+      "step": 1479
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.92052,
+      "grad_norm": 0.13316749036312103,
+      "learning_rate": 4.6168557081582854e-08,
+      "loss": 4.5801,
+      "step": 1480
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.92182,
+      "grad_norm": 0.12714573740959167,
+      "learning_rate": 4.464470503051765e-08,
+      "loss": 3.9043,
+      "step": 1481
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.92311,
+      "grad_norm": 0.13386176526546478,
+      "learning_rate": 4.314631236379707e-08,
+      "loss": 4.0039,
+      "step": 1482
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.92441,
+      "grad_norm": 0.13148388266563416,
+      "learning_rate": 4.167338677979027e-08,
+      "loss": 4.3115,
+      "step": 1483
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.92571,
+      "grad_norm": 0.13195869326591492,
+      "learning_rate": 4.02259358460233e-08,
+      "loss": 4.5586,
+      "step": 1484
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8156,
+      "epoch": 1.92701,
+      "grad_norm": 0.14345306158065796,
+      "learning_rate": 3.8803966999139686e-08,
+      "loss": 4.5898,
+      "step": 1485
+    },
+    {
+      "batch_num_effect_tokens": 8005,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.9283,
+      "grad_norm": 0.13300980627536774,
+      "learning_rate": 3.7407487544861565e-08,
+      "loss": 4.5,
+      "step": 1486
+    },
+    {
+      "batch_num_effect_tokens": 7975,
+      "batch_num_samples": 19,
+      "batch_num_tokens": 8131,
+      "epoch": 1.9296,
+      "grad_norm": 0.1455434411764145,
+      "learning_rate": 3.603650465795305e-08,
+      "loss": 4.6602,
+      "step": 1487
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.9309,
+      "grad_norm": 0.12875740230083466,
+      "learning_rate": 3.4691025382184165e-08,
+      "loss": 4.6846,
+      "step": 1488
+    },
+    {
+      "batch_num_effect_tokens": 8058,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8177,
+      "epoch": 1.9322,
+      "grad_norm": 0.12339173257350922,
+      "learning_rate": 3.337105663029361e-08,
+      "loss": 4.2891,
+      "step": 1489
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8152,
+      "epoch": 1.9335,
+      "grad_norm": 0.12480738013982773,
+      "learning_rate": 3.2076605183951614e-08,
+      "loss": 4.4785,
+      "step": 1490
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.93479,
+      "grad_norm": 0.13804928958415985,
+      "learning_rate": 3.080767769372939e-08,
+      "loss": 4.7588,
+      "step": 1491
+    },
+    {
+      "batch_num_effect_tokens": 8016,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.93609,
+      "grad_norm": 0.14307373762130737,
+      "learning_rate": 2.9564280679060255e-08,
+      "loss": 4.8125,
+      "step": 1492
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.93739,
+      "grad_norm": 0.12223173677921295,
+      "learning_rate": 2.834642052820913e-08,
+      "loss": 4.1455,
+      "step": 1493
+    },
+    {
+      "batch_num_effect_tokens": 8027,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8168,
+      "epoch": 1.93869,
+      "grad_norm": 0.13116061687469482,
+      "learning_rate": 2.715410349823977e-08,
+      "loss": 3.9893,
+      "step": 1494
+    },
+    {
+      "batch_num_effect_tokens": 8014,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8191,
+      "epoch": 1.93998,
+      "grad_norm": 0.1298002302646637,
+      "learning_rate": 2.59873357149798e-08,
+      "loss": 4.1182,
+      "step": 1495
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.94128,
+      "grad_norm": 0.12619996070861816,
+      "learning_rate": 2.4846123172992953e-08,
+      "loss": 4.2354,
+      "step": 1496
+    },
+    {
+      "batch_num_effect_tokens": 7887,
+      "batch_num_samples": 23,
+      "batch_num_tokens": 8105,
+      "epoch": 1.94258,
+      "grad_norm": 0.12969645857810974,
+      "learning_rate": 2.3730471735545213e-08,
+      "loss": 4.1484,
+      "step": 1497
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.94388,
+      "grad_norm": 0.12418043613433838,
+      "learning_rate": 2.264038713457706e-08,
+      "loss": 4.3389,
+      "step": 1498
+    },
+    {
+      "batch_num_effect_tokens": 7918,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8125,
+      "epoch": 1.94517,
+      "grad_norm": 0.122939832508564,
+      "learning_rate": 2.157587497067182e-08,
+      "loss": 3.9756,
+      "step": 1499
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.94647,
+      "grad_norm": 0.12949828803539276,
+      "learning_rate": 2.0536940713028475e-08,
+      "loss": 4.4385,
+      "step": 1500
+    },
+    {
+      "batch_num_effect_tokens": 8024,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.94777,
+      "grad_norm": 0.11821475625038147,
+      "learning_rate": 1.9523589699433355e-08,
+      "loss": 4.0107,
+      "step": 1501
+    },
+    {
+      "batch_num_effect_tokens": 8072,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.94907,
+      "grad_norm": 0.1258937120437622,
+      "learning_rate": 1.8535827136232365e-08,
+      "loss": 4.6211,
+      "step": 1502
+    },
+    {
+      "batch_num_effect_tokens": 7988,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8166,
+      "epoch": 1.95036,
+      "grad_norm": 0.1313227117061615,
+      "learning_rate": 1.7573658098304357e-08,
+      "loss": 4.418,
+      "step": 1503
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8189,
+      "epoch": 1.95166,
+      "grad_norm": 0.12842047214508057,
+      "learning_rate": 1.6637087529033925e-08,
+      "loss": 4.1191,
+      "step": 1504
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.95296,
+      "grad_norm": 0.1315157413482666,
+      "learning_rate": 1.5726120240288632e-08,
+      "loss": 4.5,
+      "step": 1505
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.95426,
+      "grad_norm": 0.1246904730796814,
+      "learning_rate": 1.4840760912391283e-08,
+      "loss": 4.2578,
+      "step": 1506
+    },
+    {
+      "batch_num_effect_tokens": 8031,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.95556,
+      "grad_norm": 0.125263512134552,
+      "learning_rate": 1.3981014094099354e-08,
+      "loss": 4.165,
+      "step": 1507
+    },
+    {
+      "batch_num_effect_tokens": 7982,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.95685,
+      "grad_norm": 0.12067491561174393,
+      "learning_rate": 1.314688420257726e-08,
+      "loss": 4.4043,
+      "step": 1508
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.95815,
+      "grad_norm": 0.12801378965377808,
+      "learning_rate": 1.2338375523378022e-08,
+      "loss": 4.1875,
+      "step": 1509
+    },
+    {
+      "batch_num_effect_tokens": 8025,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.95945,
+      "grad_norm": 0.14885924756526947,
+      "learning_rate": 1.1555492210418295e-08,
+      "loss": 4.7236,
+      "step": 1510
+    },
+    {
+      "batch_num_effect_tokens": 8043,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96075,
+      "grad_norm": 0.14124317467212677,
+      "learning_rate": 1.0798238285957274e-08,
+      "loss": 4.7051,
+      "step": 1511
+    },
+    {
+      "batch_num_effect_tokens": 8041,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96204,
+      "grad_norm": 0.11698544025421143,
+      "learning_rate": 1.006661764057837e-08,
+      "loss": 4.5605,
+      "step": 1512
+    },
+    {
+      "batch_num_effect_tokens": 7831,
+      "batch_num_samples": 31,
+      "batch_num_tokens": 8080,
+      "epoch": 1.96334,
+      "grad_norm": 0.1408989280462265,
+      "learning_rate": 9.36063403316534e-09,
+      "loss": 4.9922,
+      "step": 1513
+    },
+    {
+      "batch_num_effect_tokens": 7983,
+      "batch_num_samples": 20,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96464,
+      "grad_norm": 0.12344139814376831,
+      "learning_rate": 8.680291090888416e-09,
+      "loss": 4.3076,
+      "step": 1514
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96594,
+      "grad_norm": 0.1171666607260704,
+      "learning_rate": 8.02559230917932e-09,
+      "loss": 4.1758,
+      "step": 1515
+    },
+    {
+      "batch_num_effect_tokens": 7977,
+      "batch_num_samples": 22,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96723,
+      "grad_norm": 0.12780164182186127,
+      "learning_rate": 7.3965410517179426e-09,
+      "loss": 4.2393,
+      "step": 1516
+    },
+    {
+      "batch_num_effect_tokens": 7998,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96853,
+      "grad_norm": 0.12426239252090454,
+      "learning_rate": 6.793140550414024e-09,
+      "loss": 3.9375,
+      "step": 1517
+    },
+    {
+      "batch_num_effect_tokens": 8071,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.96983,
+      "grad_norm": 0.13120593130588531,
+      "learning_rate": 6.215393905388278e-09,
+      "loss": 4.624,
+      "step": 1518
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8167,
+      "epoch": 1.97113,
+      "grad_norm": 0.12499607354402542,
+      "learning_rate": 5.6633040849601865e-09,
+      "loss": 4.4346,
+      "step": 1519
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.97242,
+      "grad_norm": 0.12693850696086884,
+      "learning_rate": 5.1368739256296704e-09,
+      "loss": 4.0693,
+      "step": 1520
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.97372,
+      "grad_norm": 0.13233652710914612,
+      "learning_rate": 4.636106132064888e-09,
+      "loss": 4.4072,
+      "step": 1521
+    },
+    {
+      "batch_num_effect_tokens": 8037,
+      "batch_num_samples": 15,
+      "batch_num_tokens": 8192,
+      "epoch": 1.97502,
+      "grad_norm": 0.11534731835126877,
+      "learning_rate": 4.161003277085574e-09,
+      "loss": 3.8291,
+      "step": 1522
+    },
+    {
+      "batch_num_effect_tokens": 8010,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8192,
+      "epoch": 1.97632,
+      "grad_norm": 0.13354112207889557,
+      "learning_rate": 3.711567801652494e-09,
+      "loss": 4.1992,
+      "step": 1523
+    },
+    {
+      "batch_num_effect_tokens": 7938,
+      "batch_num_samples": 21,
+      "batch_num_tokens": 8142,
+      "epoch": 1.97762,
+      "grad_norm": 0.13712389767169952,
+      "learning_rate": 3.2878020148530143e-09,
+      "loss": 4.1602,
+      "step": 1524
+    },
+    {
+      "batch_num_effect_tokens": 7910,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8090,
+      "epoch": 1.97891,
+      "grad_norm": 0.12684784829616547,
+      "learning_rate": 2.8897080938916634e-09,
+      "loss": 4.5029,
+      "step": 1525
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.98021,
+      "grad_norm": 0.13627074658870697,
+      "learning_rate": 2.5172880840745873e-09,
+      "loss": 4.7266,
+      "step": 1526
+    },
+    {
+      "batch_num_effect_tokens": 7935,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8122,
+      "epoch": 1.98151,
+      "grad_norm": 0.13185909390449524,
+      "learning_rate": 2.1705438988040005e-09,
+      "loss": 4.0479,
+      "step": 1527
+    },
+    {
+      "batch_num_effect_tokens": 8065,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.98281,
+      "grad_norm": 0.12739528715610504,
+      "learning_rate": 1.849477319564863e-09,
+      "loss": 4.4717,
+      "step": 1528
+    },
+    {
+      "batch_num_effect_tokens": 8036,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8191,
+      "epoch": 1.9841,
+      "grad_norm": 0.12966281175613403,
+      "learning_rate": 1.5540899959187727e-09,
+      "loss": 4.1494,
+      "step": 1529
+    },
+    {
+      "batch_num_effect_tokens": 7937,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8139,
+      "epoch": 1.9854,
+      "grad_norm": 0.1280137449502945,
+      "learning_rate": 1.2843834454911997e-09,
+      "loss": 4.377,
+      "step": 1530
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 24,
+      "batch_num_tokens": 8192,
+      "epoch": 1.9867,
+      "grad_norm": 0.13096709549427032,
+      "learning_rate": 1.040359053967599e-09,
+      "loss": 4.127,
+      "step": 1531
+    },
+    {
+      "batch_num_effect_tokens": 7984,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8128,
+      "epoch": 1.988,
+      "grad_norm": 0.12772879004478455,
+      "learning_rate": 8.220180750850848e-10,
+      "loss": 4.3184,
+      "step": 1532
+    },
+    {
+      "batch_num_effect_tokens": 8040,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.98929,
+      "grad_norm": 0.1290796995162964,
+      "learning_rate": 6.293616306246586e-10,
+      "loss": 4.8291,
+      "step": 1533
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8146,
+      "epoch": 1.99059,
+      "grad_norm": 0.12888257205486298,
+      "learning_rate": 4.623907104084335e-10,
+      "loss": 4.251,
+      "step": 1534
+    },
+    {
+      "batch_num_effect_tokens": 8063,
+      "batch_num_samples": 14,
+      "batch_num_tokens": 8192,
+      "epoch": 1.99189,
+      "grad_norm": 0.13058820366859436,
+      "learning_rate": 3.211061722901976e-10,
+      "loss": 4.3525,
+      "step": 1535
+    },
+    {
+      "batch_num_effect_tokens": 7978,
+      "batch_num_samples": 17,
+      "batch_num_tokens": 8139,
+      "epoch": 1.99319,
+      "grad_norm": 0.12405020743608475,
+      "learning_rate": 2.0550874215541362e-10,
+      "loss": 4.4014,
+      "step": 1536
+    },
+    {
+      "batch_num_effect_tokens": 8015,
+      "batch_num_samples": 13,
+      "batch_num_tokens": 8192,
+      "epoch": 1.99448,
+      "grad_norm": 0.1255870759487152,
+      "learning_rate": 1.1559901391511308e-10,
+      "loss": 4.2578,
+      "step": 1537
+    },
+    {
+      "batch_num_effect_tokens": 8054,
+      "batch_num_samples": 16,
+      "batch_num_tokens": 8192,
+      "epoch": 1.99578,
+      "grad_norm": 0.12560389935970306,
+      "learning_rate": 5.137744950312051e-11,
+      "loss": 4.3672,
+      "step": 1538
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8184,
+      "epoch": 1.99708,
+      "grad_norm": 0.1317049115896225,
+      "learning_rate": 1.2844378873833053e-11,
+      "loss": 4.2676,
+      "step": 1539
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.99838,
+      "grad_norm": 0.12680941820144653,
+      "learning_rate": 0.0,
+      "loss": 4.6318,
+      "step": 1540
+    },
+    {
+      "batch_num_effect_tokens": 8033,
+      "batch_num_samples": 18,
+      "batch_num_tokens": 8192,
+      "epoch": 1.99838,
+      "eval_eval_loss": 0.5588093996047974,
+      "eval_eval_runtime": 115.4397,
+      "eval_eval_samples_per_second": 43.313,
+      "eval_eval_steps_per_second": 2.711,
+      "step": 1540
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1540,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}