diff --git "a/CFunModel/trainer_state.json" "b/CFunModel/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/CFunModel/trainer_state.json"
@@ -0,0 +1,22033 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 220,
+  "global_step": 2200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "batch_num_effect_tokens": 7156,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50569,
+      "epoch": 0.00091,
+      "grad_norm": 30.947839736938477,
+      "learning_rate": 9.090909090909091e-08,
+      "loss": 2.9805,
+      "step": 1
+    },
+    {
+      "batch_num_effect_tokens": 6304,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.00182,
+      "grad_norm": 38.34111785888672,
+      "learning_rate": 1.8181818181818183e-07,
+      "loss": 3.1035,
+      "step": 2
+    },
+    {
+      "batch_num_effect_tokens": 8816,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 0.00273,
+      "grad_norm": 28.859642028808594,
+      "learning_rate": 2.7272727272727274e-07,
+      "loss": 3.1328,
+      "step": 3
+    },
+    {
+      "batch_num_effect_tokens": 8810,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 0.00364,
+      "grad_norm": 28.776859283447266,
+      "learning_rate": 3.6363636363636366e-07,
+      "loss": 2.8384,
+      "step": 4
+    },
+    {
+      "batch_num_effect_tokens": 7038,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.00455,
+      "grad_norm": 34.464454650878906,
+      "learning_rate": 4.5454545454545457e-07,
+      "loss": 2.9175,
+      "step": 5
+    },
+    {
+      "batch_num_effect_tokens": 11920,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 0.00545,
+      "grad_norm": 19.11911392211914,
+      "learning_rate": 5.454545454545455e-07,
+      "loss": 3.0703,
+      "step": 6
+    },
+    {
+      "batch_num_effect_tokens": 6632,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.00636,
+      "grad_norm": 37.14106750488281,
+      "learning_rate": 6.363636363636364e-07,
+      "loss": 3.0396,
+      "step": 7
+    },
+    {
+      "batch_num_effect_tokens": 7207,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 0.00727,
+      "grad_norm": 32.404598236083984,
+      "learning_rate": 7.272727272727273e-07,
+      "loss": 3.0059,
+      "step": 8
+    },
+    {
+      "batch_num_effect_tokens": 8803,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 0.00818,
+      "grad_norm": 26.34598731994629,
+      "learning_rate": 8.181818181818182e-07,
+      "loss": 2.9204,
+      "step": 9
+    },
+    {
+      "batch_num_effect_tokens": 6747,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.00909,
+      "grad_norm": 33.00784683227539,
+      "learning_rate": 9.090909090909091e-07,
+      "loss": 2.7783,
+      "step": 10
+    },
+    {
+      "batch_num_effect_tokens": 7202,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.01,
+      "grad_norm": 27.24934196472168,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 2.7075,
+      "step": 11
+    },
+    {
+      "batch_num_effect_tokens": 7435,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.01091,
+      "grad_norm": 21.876995086669922,
+      "learning_rate": 1.090909090909091e-06,
+      "loss": 2.7607,
+      "step": 12
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.01182,
+      "grad_norm": 16.76250648498535,
+      "learning_rate": 1.181818181818182e-06,
+      "loss": 2.6719,
+      "step": 13
+    },
+    {
+      "batch_num_effect_tokens": 5491,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.01273,
+      "grad_norm": 20.941076278686523,
+      "learning_rate": 1.2727272727272728e-06,
+      "loss": 2.6194,
+      "step": 14
+    },
+    {
+      "batch_num_effect_tokens": 8461,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.01364,
+      "grad_norm": 10.907334327697754,
+      "learning_rate": 1.3636363636363636e-06,
+      "loss": 2.5329,
+      "step": 15
+    },
+    {
+      "batch_num_effect_tokens": 7714,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.01455,
+      "grad_norm": 7.7442803382873535,
+      "learning_rate": 1.4545454545454546e-06,
+      "loss": 2.4167,
+      "step": 16
+    },
+    {
+      "batch_num_effect_tokens": 6003,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.01545,
+      "grad_norm": 7.480747222900391,
+      "learning_rate": 1.5454545454545454e-06,
+      "loss": 2.2767,
+      "step": 17
+    },
+    {
+      "batch_num_effect_tokens": 8222,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.01636,
+      "grad_norm": 6.072074890136719,
+      "learning_rate": 1.6363636363636365e-06,
+      "loss": 2.2823,
+      "step": 18
+    },
+    {
+      "batch_num_effect_tokens": 4948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.01727,
+      "grad_norm": 5.665102481842041,
+      "learning_rate": 1.7272727272727275e-06,
+      "loss": 1.5879,
+      "step": 19
+    },
+    {
+      "batch_num_effect_tokens": 4090,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.01818,
+      "grad_norm": 5.8563666343688965,
+      "learning_rate": 1.8181818181818183e-06,
+      "loss": 1.5819,
+      "step": 20
+    },
+    {
+      "batch_num_effect_tokens": 6086,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 0.01909,
+      "grad_norm": 4.837255001068115,
+      "learning_rate": 1.9090909090909095e-06,
+      "loss": 1.9778,
+      "step": 21
+    },
+    {
+      "batch_num_effect_tokens": 5614,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.02,
+      "grad_norm": 8.160093307495117,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 2.0801,
+      "step": 22
+    },
+    {
+      "batch_num_effect_tokens": 6381,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 0.02091,
+      "grad_norm": 4.796694755554199,
+      "learning_rate": 2.090909090909091e-06,
+      "loss": 2.1119,
+      "step": 23
+    },
+    {
+      "batch_num_effect_tokens": 5944,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.02182,
+      "grad_norm": 4.278438091278076,
+      "learning_rate": 2.181818181818182e-06,
+      "loss": 1.9312,
+      "step": 24
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.02273,
+      "grad_norm": 4.323818683624268,
+      "learning_rate": 2.2727272727272728e-06,
+      "loss": 2.2292,
+      "step": 25
+    },
+    {
+      "batch_num_effect_tokens": 8597,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 0.02364,
+      "grad_norm": 4.498684406280518,
+      "learning_rate": 2.363636363636364e-06,
+      "loss": 2.3325,
+      "step": 26
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.02455,
+      "grad_norm": 4.384886741638184,
+      "learning_rate": 2.454545454545455e-06,
+      "loss": 2.2605,
+      "step": 27
+    },
+    {
+      "batch_num_effect_tokens": 7764,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.02545,
+      "grad_norm": 4.394514560699463,
+      "learning_rate": 2.5454545454545456e-06,
+      "loss": 2.219,
+      "step": 28
+    },
+    {
+      "batch_num_effect_tokens": 6261,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.02636,
+      "grad_norm": 4.165273189544678,
+      "learning_rate": 2.6363636363636364e-06,
+      "loss": 2.028,
+      "step": 29
+    },
+    {
+      "batch_num_effect_tokens": 7873,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.02727,
+      "grad_norm": 4.326238632202148,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 2.3767,
+      "step": 30
+    },
+    {
+      "batch_num_effect_tokens": 6274,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.02818,
+      "grad_norm": 4.087923526763916,
+      "learning_rate": 2.818181818181818e-06,
+      "loss": 2.1954,
+      "step": 31
+    },
+    {
+      "batch_num_effect_tokens": 5754,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.02909,
+      "grad_norm": 4.483077526092529,
+      "learning_rate": 2.9090909090909093e-06,
+      "loss": 2.1962,
+      "step": 32
+    },
+    {
+      "batch_num_effect_tokens": 3874,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.03,
+      "grad_norm": 4.695860862731934,
+      "learning_rate": 3e-06,
+      "loss": 1.4154,
+      "step": 33
+    },
+    {
+      "batch_num_effect_tokens": 7026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.03091,
+      "grad_norm": 4.0346293449401855,
+      "learning_rate": 3.090909090909091e-06,
+      "loss": 2.1736,
+      "step": 34
+    },
+    {
+      "batch_num_effect_tokens": 9795,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 0.03182,
+      "grad_norm": 3.8173470497131348,
+      "learning_rate": 3.181818181818182e-06,
+      "loss": 2.5917,
+      "step": 35
+    },
+    {
+      "batch_num_effect_tokens": 7027,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.03273,
+      "grad_norm": 3.9705400466918945,
+      "learning_rate": 3.272727272727273e-06,
+      "loss": 2.1748,
+      "step": 36
+    },
+    {
+      "batch_num_effect_tokens": 6395,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52129,
+      "epoch": 0.03364,
+      "grad_norm": 4.2313923835754395,
+      "learning_rate": 3.3636363636363637e-06,
+      "loss": 2.0255,
+      "step": 37
+    },
+    {
+      "batch_num_effect_tokens": 5370,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.03455,
+      "grad_norm": 4.113375186920166,
+      "learning_rate": 3.454545454545455e-06,
+      "loss": 1.8015,
+      "step": 38
+    },
+    {
+      "batch_num_effect_tokens": 5401,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.03545,
+      "grad_norm": 5.210320949554443,
+      "learning_rate": 3.5454545454545458e-06,
+      "loss": 1.9014,
+      "step": 39
+    },
+    {
+      "batch_num_effect_tokens": 4069,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.03636,
+      "grad_norm": 4.81237268447876,
+      "learning_rate": 3.6363636363636366e-06,
+      "loss": 1.6282,
+      "step": 40
+    },
+    {
+      "batch_num_effect_tokens": 7244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.03727,
+      "grad_norm": 3.647012233734131,
+      "learning_rate": 3.727272727272728e-06,
+      "loss": 1.9749,
+      "step": 41
+    },
+    {
+      "batch_num_effect_tokens": 6864,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52218,
+      "epoch": 0.03818,
+      "grad_norm": 3.990999460220337,
+      "learning_rate": 3.818181818181819e-06,
+      "loss": 2.0056,
+      "step": 42
+    },
+    {
+      "batch_num_effect_tokens": 6332,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.03909,
+      "grad_norm": 4.233860969543457,
+      "learning_rate": 3.90909090909091e-06,
+      "loss": 2.155,
+      "step": 43
+    },
+    {
+      "batch_num_effect_tokens": 7136,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.04,
+      "grad_norm": 3.865319013595581,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 1.9899,
+      "step": 44
+    },
+    {
+      "batch_num_effect_tokens": 5198,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.04091,
+      "grad_norm": 3.954425096511841,
+      "learning_rate": 4.0909090909090915e-06,
+      "loss": 1.6524,
+      "step": 45
+    },
+    {
+      "batch_num_effect_tokens": 5478,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 0.04182,
+      "grad_norm": 4.367476940155029,
+      "learning_rate": 4.181818181818182e-06,
+      "loss": 2.0508,
+      "step": 46
+    },
+    {
+      "batch_num_effect_tokens": 4745,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.04273,
+      "grad_norm": 4.386691570281982,
+      "learning_rate": 4.272727272727273e-06,
+      "loss": 1.6864,
+      "step": 47
+    },
+    {
+      "batch_num_effect_tokens": 4896,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.04364,
+      "grad_norm": 4.337430477142334,
+      "learning_rate": 4.363636363636364e-06,
+      "loss": 1.5469,
+      "step": 48
+    },
+    {
+      "batch_num_effect_tokens": 5867,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.04455,
+      "grad_norm": 4.468596458435059,
+      "learning_rate": 4.454545454545455e-06,
+      "loss": 2.1321,
+      "step": 49
+    },
+    {
+      "batch_num_effect_tokens": 3693,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.04545,
+      "grad_norm": 4.92578125,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.4774,
+      "step": 50
+    },
+    {
+      "batch_num_effect_tokens": 8159,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.04636,
+      "grad_norm": 3.620227098464966,
+      "learning_rate": 4.636363636363636e-06,
+      "loss": 2.0459,
+      "step": 51
+    },
+    {
+      "batch_num_effect_tokens": 7699,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 0.04727,
+      "grad_norm": 3.7664144039154053,
+      "learning_rate": 4.727272727272728e-06,
+      "loss": 2.2405,
+      "step": 52
+    },
+    {
+      "batch_num_effect_tokens": 6907,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.04818,
+      "grad_norm": 3.695967435836792,
+      "learning_rate": 4.818181818181819e-06,
+      "loss": 2.0831,
+      "step": 53
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.04909,
+      "grad_norm": 3.6256861686706543,
+      "learning_rate": 4.90909090909091e-06,
+      "loss": 1.9589,
+      "step": 54
+    },
+    {
+      "batch_num_effect_tokens": 5507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.05,
+      "grad_norm": 4.499513626098633,
+      "learning_rate": 5e-06,
+      "loss": 2.1505,
+      "step": 55
+    },
+    {
+      "batch_num_effect_tokens": 7857,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.05091,
+      "grad_norm": 3.6978249549865723,
+      "learning_rate": 5.090909090909091e-06,
+      "loss": 2.1525,
+      "step": 56
+    },
+    {
+      "batch_num_effect_tokens": 9794,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.05182,
+      "grad_norm": 3.654414653778076,
+      "learning_rate": 5.181818181818182e-06,
+      "loss": 2.3803,
+      "step": 57
+    },
+    {
+      "batch_num_effect_tokens": 7112,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.05273,
+      "grad_norm": 3.9470489025115967,
+      "learning_rate": 5.272727272727273e-06,
+      "loss": 2.1569,
+      "step": 58
+    },
+    {
+      "batch_num_effect_tokens": 5113,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.05364,
+      "grad_norm": 4.192826747894287,
+      "learning_rate": 5.3636363636363645e-06,
+      "loss": 1.9103,
+      "step": 59
+    },
+    {
+      "batch_num_effect_tokens": 7647,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.05455,
+      "grad_norm": 4.059316635131836,
+      "learning_rate": 5.4545454545454545e-06,
+      "loss": 2.1822,
+      "step": 60
+    },
+    {
+      "batch_num_effect_tokens": 4677,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50524,
+      "epoch": 0.05545,
+      "grad_norm": 4.552770137786865,
+      "learning_rate": 5.545454545454546e-06,
+      "loss": 1.7139,
+      "step": 61
+    },
+    {
+      "batch_num_effect_tokens": 9222,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 0.05636,
+      "grad_norm": 3.726174831390381,
+      "learning_rate": 5.636363636363636e-06,
+      "loss": 2.2145,
+      "step": 62
+    },
+    {
+      "batch_num_effect_tokens": 4083,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.05727,
+      "grad_norm": 4.518796443939209,
+      "learning_rate": 5.727272727272728e-06,
+      "loss": 1.4964,
+      "step": 63
+    },
+    {
+      "batch_num_effect_tokens": 6641,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52152,
+      "epoch": 0.05818,
+      "grad_norm": 4.039824485778809,
+      "learning_rate": 5.8181818181818185e-06,
+      "loss": 1.9297,
+      "step": 64
+    },
+    {
+      "batch_num_effect_tokens": 6507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.05909,
+      "grad_norm": 4.078365802764893,
+      "learning_rate": 5.90909090909091e-06,
+      "loss": 2.0474,
+      "step": 65
+    },
+    {
+      "batch_num_effect_tokens": 6856,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52111,
+      "epoch": 0.06,
+      "grad_norm": 4.096185207366943,
+      "learning_rate": 6e-06,
+      "loss": 2.1375,
+      "step": 66
+    },
+    {
+      "batch_num_effect_tokens": 6408,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.06091,
+      "grad_norm": 4.003268718719482,
+      "learning_rate": 6.090909090909092e-06,
+      "loss": 2.0536,
+      "step": 67
+    },
+    {
+      "batch_num_effect_tokens": 6312,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.06182,
+      "grad_norm": 4.170324325561523,
+      "learning_rate": 6.181818181818182e-06,
+      "loss": 2.3304,
+      "step": 68
+    },
+    {
+      "batch_num_effect_tokens": 8592,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.06273,
+      "grad_norm": 3.4838006496429443,
+      "learning_rate": 6.2727272727272734e-06,
+      "loss": 2.1861,
+      "step": 69
+    },
+    {
+      "batch_num_effect_tokens": 6987,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.06364,
+      "grad_norm": 3.732853651046753,
+      "learning_rate": 6.363636363636364e-06,
+      "loss": 2.2194,
+      "step": 70
+    },
+    {
+      "batch_num_effect_tokens": 5225,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.06455,
+      "grad_norm": 4.206742286682129,
+      "learning_rate": 6.454545454545456e-06,
+      "loss": 1.926,
+      "step": 71
+    },
+    {
+      "batch_num_effect_tokens": 9847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.06545,
+      "grad_norm": 3.4091250896453857,
+      "learning_rate": 6.545454545454546e-06,
+      "loss": 2.2813,
+      "step": 72
+    },
+    {
+      "batch_num_effect_tokens": 5197,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.06636,
+      "grad_norm": 4.02803897857666,
+      "learning_rate": 6.6363636363636375e-06,
+      "loss": 1.8807,
+      "step": 73
+    },
+    {
+      "batch_num_effect_tokens": 9185,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.06727,
+      "grad_norm": 3.7550344467163086,
+      "learning_rate": 6.7272727272727275e-06,
+      "loss": 2.2754,
+      "step": 74
+    },
+    {
+      "batch_num_effect_tokens": 7074,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.06818,
+      "grad_norm": 3.7421798706054688,
+      "learning_rate": 6.818181818181818e-06,
+      "loss": 1.9489,
+      "step": 75
+    },
+    {
+      "batch_num_effect_tokens": 5189,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.06909,
+      "grad_norm": 4.224911689758301,
+      "learning_rate": 6.90909090909091e-06,
+      "loss": 1.9238,
+      "step": 76
+    },
+    {
+      "batch_num_effect_tokens": 5192,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.07,
+      "grad_norm": 3.922016143798828,
+      "learning_rate": 7e-06,
+      "loss": 1.7985,
+      "step": 77
+    },
+    {
+      "batch_num_effect_tokens": 5656,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.07091,
+      "grad_norm": 4.252484321594238,
+      "learning_rate": 7.0909090909090916e-06,
+      "loss": 1.9175,
+      "step": 78
+    },
+    {
+      "batch_num_effect_tokens": 4409,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.07182,
+      "grad_norm": 4.439416885375977,
+      "learning_rate": 7.181818181818182e-06,
+      "loss": 1.6123,
+      "step": 79
+    },
+    {
+      "batch_num_effect_tokens": 5713,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.07273,
+      "grad_norm": 4.0134124755859375,
+      "learning_rate": 7.272727272727273e-06,
+      "loss": 1.9377,
+      "step": 80
+    },
+    {
+      "batch_num_effect_tokens": 6445,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.07364,
+      "grad_norm": 4.2473907470703125,
+      "learning_rate": 7.363636363636364e-06,
+      "loss": 1.9624,
+      "step": 81
+    },
+    {
+      "batch_num_effect_tokens": 7883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 0.07455,
+      "grad_norm": 3.6550698280334473,
+      "learning_rate": 7.454545454545456e-06,
+      "loss": 2.0343,
+      "step": 82
+    },
+    {
+      "batch_num_effect_tokens": 5319,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.07545,
+      "grad_norm": 4.416125774383545,
+      "learning_rate": 7.545454545454546e-06,
+      "loss": 1.9891,
+      "step": 83
+    },
+    {
+      "batch_num_effect_tokens": 6165,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52206,
+      "epoch": 0.07636,
+      "grad_norm": 4.053107738494873,
+      "learning_rate": 7.636363636363638e-06,
+      "loss": 1.9417,
+      "step": 84
+    },
+    {
+      "batch_num_effect_tokens": 6178,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.07727,
+      "grad_norm": 4.197380065917969,
+      "learning_rate": 7.727272727272727e-06,
+      "loss": 2.1951,
+      "step": 85
+    },
+    {
+      "batch_num_effect_tokens": 6818,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.07818,
+      "grad_norm": 3.6595616340637207,
+      "learning_rate": 7.81818181818182e-06,
+      "loss": 2.0797,
+      "step": 86
+    },
+    {
+      "batch_num_effect_tokens": 6648,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52218,
+      "epoch": 0.07909,
+      "grad_norm": 4.197087287902832,
+      "learning_rate": 7.909090909090909e-06,
+      "loss": 2.2547,
+      "step": 87
+    },
+    {
+      "batch_num_effect_tokens": 7709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52136,
+      "epoch": 0.08,
+      "grad_norm": 4.223806381225586,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 2.1499,
+      "step": 88
+    },
+    {
+      "batch_num_effect_tokens": 6877,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.08091,
+      "grad_norm": 3.888137102127075,
+      "learning_rate": 8.090909090909092e-06,
+      "loss": 2.1985,
+      "step": 89
+    },
+    {
+      "batch_num_effect_tokens": 8419,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.08182,
+      "grad_norm": 3.3746907711029053,
+      "learning_rate": 8.181818181818183e-06,
+      "loss": 2.1038,
+      "step": 90
+    },
+    {
+      "batch_num_effect_tokens": 5839,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.08273,
+      "grad_norm": 4.06436824798584,
+      "learning_rate": 8.272727272727274e-06,
+      "loss": 1.9237,
+      "step": 91
+    },
+    {
+      "batch_num_effect_tokens": 3673,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 0.08364,
+      "grad_norm": 4.105583667755127,
+      "learning_rate": 8.363636363636365e-06,
+      "loss": 1.5624,
+      "step": 92
+    },
+    {
+      "batch_num_effect_tokens": 4391,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.08455,
+      "grad_norm": 4.130006313323975,
+      "learning_rate": 8.454545454545455e-06,
+      "loss": 1.6963,
+      "step": 93
+    },
+    {
+      "batch_num_effect_tokens": 5418,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.08545,
+      "grad_norm": 5.248917102813721,
+      "learning_rate": 8.545454545454546e-06,
+      "loss": 1.7968,
+      "step": 94
+    },
+    {
+      "batch_num_effect_tokens": 6581,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.08636,
+      "grad_norm": 4.27993106842041,
+      "learning_rate": 8.636363636363637e-06,
+      "loss": 2.0516,
+      "step": 95
+    },
+    {
+      "batch_num_effect_tokens": 6002,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.08727,
+      "grad_norm": 4.21064567565918,
+      "learning_rate": 8.727272727272728e-06,
+      "loss": 2.1405,
+      "step": 96
+    },
+    {
+      "batch_num_effect_tokens": 4853,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52202,
+      "epoch": 0.08818,
+      "grad_norm": 4.228909015655518,
+      "learning_rate": 8.818181818181819e-06,
+      "loss": 1.5968,
+      "step": 97
+    },
+    {
+      "batch_num_effect_tokens": 4766,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.08909,
+      "grad_norm": 4.670202255249023,
+      "learning_rate": 8.90909090909091e-06,
+      "loss": 1.8443,
+      "step": 98
+    },
+    {
+      "batch_num_effect_tokens": 7877,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.09,
+      "grad_norm": 3.8756136894226074,
+      "learning_rate": 9e-06,
+      "loss": 2.3089,
+      "step": 99
+    },
+    {
+      "batch_num_effect_tokens": 5806,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 0.09091,
+      "grad_norm": 3.7578859329223633,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.9039,
+      "step": 100
+    },
+    {
+      "batch_num_effect_tokens": 5456,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 0.09182,
+      "grad_norm": 4.004074573516846,
+      "learning_rate": 9.181818181818184e-06,
+      "loss": 1.8958,
+      "step": 101
+    },
+    {
+      "batch_num_effect_tokens": 7545,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.09273,
+      "grad_norm": 4.003213405609131,
+      "learning_rate": 9.272727272727273e-06,
+      "loss": 2.1584,
+      "step": 102
+    },
+    {
+      "batch_num_effect_tokens": 6497,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 0.09364,
+      "grad_norm": 3.9445877075195312,
+      "learning_rate": 9.363636363636365e-06,
+      "loss": 2.1446,
+      "step": 103
+    },
+    {
+      "batch_num_effect_tokens": 11118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 0.09455,
+      "grad_norm": 3.3731114864349365,
+      "learning_rate": 9.454545454545456e-06,
+      "loss": 2.582,
+      "step": 104
+    },
+    {
+      "batch_num_effect_tokens": 8679,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.09545,
+      "grad_norm": 3.720583915710449,
+      "learning_rate": 9.545454545454547e-06,
+      "loss": 2.2406,
+      "step": 105
+    },
+    {
+      "batch_num_effect_tokens": 7387,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.09636,
+      "grad_norm": 4.026169776916504,
+      "learning_rate": 9.636363636363638e-06,
+      "loss": 2.1927,
+      "step": 106
+    },
+    {
+      "batch_num_effect_tokens": 6047,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.09727,
+      "grad_norm": 3.9075863361358643,
+      "learning_rate": 9.727272727272728e-06,
+      "loss": 1.8892,
+      "step": 107
+    },
+    {
+      "batch_num_effect_tokens": 8357,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52084,
+      "epoch": 0.09818,
+      "grad_norm": 3.6489765644073486,
+      "learning_rate": 9.81818181818182e-06,
+      "loss": 2.1783,
+      "step": 108
+    },
+    {
+      "batch_num_effect_tokens": 8611,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.09909,
+      "grad_norm": 4.339054107666016,
+      "learning_rate": 9.90909090909091e-06,
+      "loss": 2.2626,
+      "step": 109
+    },
+    {
+      "batch_num_effect_tokens": 7198,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.1,
+      "grad_norm": 3.8931431770324707,
+      "learning_rate": 1e-05,
+      "loss": 2.1807,
+      "step": 110
+    },
+    {
+      "batch_num_effect_tokens": 4704,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.10091,
+      "grad_norm": 3.9868645668029785,
+      "learning_rate": 1.0090909090909092e-05,
+      "loss": 1.5651,
+      "step": 111
+    },
+    {
+      "batch_num_effect_tokens": 7161,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52106,
+      "epoch": 0.10182,
+      "grad_norm": 3.8002917766571045,
+      "learning_rate": 1.0181818181818182e-05,
+      "loss": 2.0946,
+      "step": 112
+    },
+    {
+      "batch_num_effect_tokens": 7873,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.10273,
+      "grad_norm": 4.119603157043457,
+      "learning_rate": 1.0272727272727275e-05,
+      "loss": 2.3052,
+      "step": 113
+    },
+    {
+      "batch_num_effect_tokens": 5651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.10364,
+      "grad_norm": 4.133640766143799,
+      "learning_rate": 1.0363636363636364e-05,
+      "loss": 1.9881,
+      "step": 114
+    },
+    {
+      "batch_num_effect_tokens": 5795,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.10455,
+      "grad_norm": 4.669517517089844,
+      "learning_rate": 1.0454545454545455e-05,
+      "loss": 2.0219,
+      "step": 115
+    },
+    {
+      "batch_num_effect_tokens": 8210,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.10545,
+      "grad_norm": 3.619904041290283,
+      "learning_rate": 1.0545454545454546e-05,
+      "loss": 2.2578,
+      "step": 116
+    },
+    {
+      "batch_num_effect_tokens": 6330,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50569,
+      "epoch": 0.10636,
+      "grad_norm": 4.191747665405273,
+      "learning_rate": 1.0636363636363638e-05,
+      "loss": 2.0283,
+      "step": 117
+    },
+    {
+      "batch_num_effect_tokens": 5419,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.10727,
+      "grad_norm": 3.955883264541626,
+      "learning_rate": 1.0727272727272729e-05,
+      "loss": 1.7797,
+      "step": 118
+    },
+    {
+      "batch_num_effect_tokens": 6625,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 0.10818,
+      "grad_norm": 3.954253673553467,
+      "learning_rate": 1.0818181818181818e-05,
+      "loss": 2.0083,
+      "step": 119
+    },
+    {
+      "batch_num_effect_tokens": 7511,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.10909,
+      "grad_norm": 4.029529571533203,
+      "learning_rate": 1.0909090909090909e-05,
+      "loss": 2.2898,
+      "step": 120
+    },
+    {
+      "batch_num_effect_tokens": 7908,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52213,
+      "epoch": 0.11,
+      "grad_norm": 4.126637935638428,
+      "learning_rate": 1.1000000000000001e-05,
+      "loss": 2.3242,
+      "step": 121
+    },
+    {
+      "batch_num_effect_tokens": 6498,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.11091,
+      "grad_norm": 3.942624092102051,
+      "learning_rate": 1.1090909090909092e-05,
+      "loss": 2.0677,
+      "step": 122
+    },
+    {
+      "batch_num_effect_tokens": 7055,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.11182,
+      "grad_norm": 4.413516521453857,
+      "learning_rate": 1.1181818181818183e-05,
+      "loss": 2.178,
+      "step": 123
+    },
+    {
+      "batch_num_effect_tokens": 11530,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.11273,
+      "grad_norm": 3.2890639305114746,
+      "learning_rate": 1.1272727272727272e-05,
+      "loss": 2.4971,
+      "step": 124
+    },
+    {
+      "batch_num_effect_tokens": 5723,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 0.11364,
+      "grad_norm": 4.125791072845459,
+      "learning_rate": 1.1363636363636366e-05,
+      "loss": 2.0655,
+      "step": 125
+    },
+    {
+      "batch_num_effect_tokens": 5965,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.11455,
+      "grad_norm": 3.919264316558838,
+      "learning_rate": 1.1454545454545455e-05,
+      "loss": 2.1151,
+      "step": 126
+    },
+    {
+      "batch_num_effect_tokens": 9544,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.11545,
+      "grad_norm": 3.3462822437286377,
+      "learning_rate": 1.1545454545454546e-05,
+      "loss": 2.2,
+      "step": 127
+    },
+    {
+      "batch_num_effect_tokens": 8131,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.11636,
+      "grad_norm": 3.6498684883117676,
+      "learning_rate": 1.1636363636363637e-05,
+      "loss": 2.1605,
+      "step": 128
+    },
+    {
+      "batch_num_effect_tokens": 6962,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.11727,
+      "grad_norm": 4.1046929359436035,
+      "learning_rate": 1.1727272727272728e-05,
+      "loss": 2.3661,
+      "step": 129
+    },
+    {
+      "batch_num_effect_tokens": 5453,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52131,
+      "epoch": 0.11818,
+      "grad_norm": 3.9962284564971924,
+      "learning_rate": 1.181818181818182e-05,
+      "loss": 1.6459,
+      "step": 130
+    },
+    {
+      "batch_num_effect_tokens": 6656,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52143,
+      "epoch": 0.11909,
+      "grad_norm": 4.055619239807129,
+      "learning_rate": 1.190909090909091e-05,
+      "loss": 2.0234,
+      "step": 131
+    },
+    {
+      "batch_num_effect_tokens": 8693,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52179,
+      "epoch": 0.12,
+      "grad_norm": 4.601987838745117,
+      "learning_rate": 1.2e-05,
+      "loss": 2.1816,
+      "step": 132
+    },
+    {
+      "batch_num_effect_tokens": 8201,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.12091,
+      "grad_norm": 3.506317615509033,
+      "learning_rate": 1.2090909090909091e-05,
+      "loss": 2.0799,
+      "step": 133
+    },
+    {
+      "batch_num_effect_tokens": 5992,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.12182,
+      "grad_norm": 5.071348190307617,
+      "learning_rate": 1.2181818181818184e-05,
+      "loss": 2.0837,
+      "step": 134
+    },
+    {
+      "batch_num_effect_tokens": 6012,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.12273,
+      "grad_norm": 3.723125457763672,
+      "learning_rate": 1.2272727272727274e-05,
+      "loss": 1.8454,
+      "step": 135
+    },
+    {
+      "batch_num_effect_tokens": 8301,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52218,
+      "epoch": 0.12364,
+      "grad_norm": 3.9254860877990723,
+      "learning_rate": 1.2363636363636364e-05,
+      "loss": 2.3271,
+      "step": 136
+    },
+    {
+      "batch_num_effect_tokens": 4103,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.12455,
+      "grad_norm": 4.1657233238220215,
+      "learning_rate": 1.2454545454545454e-05,
+      "loss": 1.4053,
+      "step": 137
+    },
+    {
+      "batch_num_effect_tokens": 6719,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52128,
+      "epoch": 0.12545,
+      "grad_norm": 4.125800132751465,
+      "learning_rate": 1.2545454545454547e-05,
+      "loss": 2.1318,
+      "step": 138
+    },
+    {
+      "batch_num_effect_tokens": 5288,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.12636,
+      "grad_norm": 4.3959174156188965,
+      "learning_rate": 1.2636363636363638e-05,
+      "loss": 1.8073,
+      "step": 139
+    },
+    {
+      "batch_num_effect_tokens": 4890,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.12727,
+      "grad_norm": 3.882976770401001,
+      "learning_rate": 1.2727272727272728e-05,
+      "loss": 1.4711,
+      "step": 140
+    },
+    {
+      "batch_num_effect_tokens": 8337,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.12818,
+      "grad_norm": 4.436567306518555,
+      "learning_rate": 1.281818181818182e-05,
+      "loss": 2.4239,
+      "step": 141
+    },
+    {
+      "batch_num_effect_tokens": 8868,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 0.12909,
+      "grad_norm": 3.8208858966827393,
+      "learning_rate": 1.2909090909090912e-05,
+      "loss": 2.2625,
+      "step": 142
+    },
+    {
+      "batch_num_effect_tokens": 6646,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.13,
+      "grad_norm": 3.840920925140381,
+      "learning_rate": 1.3000000000000001e-05,
+      "loss": 2.0837,
+      "step": 143
+    },
+    {
+      "batch_num_effect_tokens": 6819,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.13091,
+      "grad_norm": 4.054839134216309,
+      "learning_rate": 1.3090909090909092e-05,
+      "loss": 1.9195,
+      "step": 144
+    },
+    {
+      "batch_num_effect_tokens": 4915,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.13182,
+      "grad_norm": 3.954787492752075,
+      "learning_rate": 1.3181818181818183e-05,
+      "loss": 1.5082,
+      "step": 145
+    },
+    {
+      "batch_num_effect_tokens": 8163,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52154,
+      "epoch": 0.13273,
+      "grad_norm": 3.4955637454986572,
+      "learning_rate": 1.3272727272727275e-05,
+      "loss": 2.3483,
+      "step": 146
+    },
+    {
+      "batch_num_effect_tokens": 7112,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 0.13364,
+      "grad_norm": 3.9054946899414062,
+      "learning_rate": 1.3363636363636366e-05,
+      "loss": 2.0471,
+      "step": 147
+    },
+    {
+      "batch_num_effect_tokens": 6689,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.13455,
+      "grad_norm": 3.8406553268432617,
+      "learning_rate": 1.3454545454545455e-05,
+      "loss": 1.997,
+      "step": 148
+    },
+    {
+      "batch_num_effect_tokens": 6119,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.13545,
+      "grad_norm": 4.178918838500977,
+      "learning_rate": 1.3545454545454546e-05,
+      "loss": 1.9578,
+      "step": 149
+    },
+    {
+      "batch_num_effect_tokens": 6297,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.13636,
+      "grad_norm": 4.147446632385254,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 2.0032,
+      "step": 150
+    },
+    {
+      "batch_num_effect_tokens": 6621,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.13727,
+      "grad_norm": 4.597929954528809,
+      "learning_rate": 1.3727272727272729e-05,
+      "loss": 2.2709,
+      "step": 151
+    },
+    {
+      "batch_num_effect_tokens": 7233,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.13818,
+      "grad_norm": 3.8162758350372314,
+      "learning_rate": 1.381818181818182e-05,
+      "loss": 2.1774,
+      "step": 152
+    },
+    {
+      "batch_num_effect_tokens": 7396,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52102,
+      "epoch": 0.13909,
+      "grad_norm": 3.5979058742523193,
+      "learning_rate": 1.390909090909091e-05,
+      "loss": 2.1845,
+      "step": 153
+    },
+    {
+      "batch_num_effect_tokens": 7449,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.14,
+      "grad_norm": 3.8547518253326416,
+      "learning_rate": 1.4e-05,
+      "loss": 2.201,
+      "step": 154
+    },
+    {
+      "batch_num_effect_tokens": 6947,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 0.14091,
+      "grad_norm": 3.8475093841552734,
+      "learning_rate": 1.4090909090909092e-05,
+      "loss": 2.0156,
+      "step": 155
+    },
+    {
+      "batch_num_effect_tokens": 9360,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.14182,
+      "grad_norm": 3.3440723419189453,
+      "learning_rate": 1.4181818181818183e-05,
+      "loss": 2.4424,
+      "step": 156
+    },
+    {
+      "batch_num_effect_tokens": 7812,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.14273,
+      "grad_norm": 4.060225963592529,
+      "learning_rate": 1.4272727272727274e-05,
+      "loss": 2.1743,
+      "step": 157
+    },
+    {
+      "batch_num_effect_tokens": 5670,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 0.14364,
+      "grad_norm": 4.626434326171875,
+      "learning_rate": 1.4363636363636365e-05,
+      "loss": 1.8219,
+      "step": 158
+    },
+    {
+      "batch_num_effect_tokens": 4969,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.14455,
+      "grad_norm": 3.8437905311584473,
+      "learning_rate": 1.4454545454545457e-05,
+      "loss": 1.7275,
+      "step": 159
+    },
+    {
+      "batch_num_effect_tokens": 6736,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 0.14545,
+      "grad_norm": 4.022931098937988,
+      "learning_rate": 1.4545454545454546e-05,
+      "loss": 2.219,
+      "step": 160
+    },
+    {
+      "batch_num_effect_tokens": 6256,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 0.14636,
+      "grad_norm": 3.9149224758148193,
+      "learning_rate": 1.4636363636363637e-05,
+      "loss": 2.0597,
+      "step": 161
+    },
+    {
+      "batch_num_effect_tokens": 5138,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.14727,
+      "grad_norm": 4.706681728363037,
+      "learning_rate": 1.4727272727272728e-05,
+      "loss": 1.9799,
+      "step": 162
+    },
+    {
+      "batch_num_effect_tokens": 5858,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.14818,
+      "grad_norm": 4.0887370109558105,
+      "learning_rate": 1.481818181818182e-05,
+      "loss": 1.9767,
+      "step": 163
+    },
+    {
+      "batch_num_effect_tokens": 7680,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.14909,
+      "grad_norm": 3.7380943298339844,
+      "learning_rate": 1.4909090909090911e-05,
+      "loss": 2.2115,
+      "step": 164
+    },
+    {
+      "batch_num_effect_tokens": 6705,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.15,
+      "grad_norm": 4.241217136383057,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 2.0222,
+      "step": 165
+    },
+    {
+      "batch_num_effect_tokens": 5555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.15091,
+      "grad_norm": 3.943678379058838,
+      "learning_rate": 1.5090909090909091e-05,
+      "loss": 1.9941,
+      "step": 166
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.15182,
+      "grad_norm": 3.7555346488952637,
+      "learning_rate": 1.5181818181818182e-05,
+      "loss": 2.2806,
+      "step": 167
+    },
+    {
+      "batch_num_effect_tokens": 5555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.15273,
+      "grad_norm": 3.931405782699585,
+      "learning_rate": 1.5272727272727276e-05,
+      "loss": 1.8538,
+      "step": 168
+    },
+    {
+      "batch_num_effect_tokens": 6675,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.15364,
+      "grad_norm": 3.9357919692993164,
+      "learning_rate": 1.5363636363636365e-05,
+      "loss": 2.0735,
+      "step": 169
+    },
+    {
+      "batch_num_effect_tokens": 6744,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.15455,
+      "grad_norm": 3.840468645095825,
+      "learning_rate": 1.5454545454545454e-05,
+      "loss": 1.9243,
+      "step": 170
+    },
+    {
+      "batch_num_effect_tokens": 4856,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50540,
+      "epoch": 0.15545,
+      "grad_norm": 3.940772294998169,
+      "learning_rate": 1.5545454545454547e-05,
+      "loss": 1.7354,
+      "step": 171
+    },
+    {
+      "batch_num_effect_tokens": 5268,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.15636,
+      "grad_norm": 4.208749294281006,
+      "learning_rate": 1.563636363636364e-05,
+      "loss": 1.7741,
+      "step": 172
+    },
+    {
+      "batch_num_effect_tokens": 7239,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 0.15727,
+      "grad_norm": 3.6911253929138184,
+      "learning_rate": 1.572727272727273e-05,
+      "loss": 2.1765,
+      "step": 173
+    },
+    {
+      "batch_num_effect_tokens": 7250,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.15818,
+      "grad_norm": 3.9492316246032715,
+      "learning_rate": 1.5818181818181818e-05,
+      "loss": 2.145,
+      "step": 174
+    },
+    {
+      "batch_num_effect_tokens": 9635,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.15909,
+      "grad_norm": 3.3741977214813232,
+      "learning_rate": 1.590909090909091e-05,
+      "loss": 2.2064,
+      "step": 175
+    },
+    {
+      "batch_num_effect_tokens": 7578,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52141,
+      "epoch": 0.16,
+      "grad_norm": 3.4716920852661133,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.9224,
+      "step": 176
+    },
+    {
+      "batch_num_effect_tokens": 8088,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.16091,
+      "grad_norm": 3.4077889919281006,
+      "learning_rate": 1.6090909090909092e-05,
+      "loss": 1.9219,
+      "step": 177
+    },
+    {
+      "batch_num_effect_tokens": 7364,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.16182,
+      "grad_norm": 3.86611270904541,
+      "learning_rate": 1.6181818181818184e-05,
+      "loss": 2.1574,
+      "step": 178
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.16273,
+      "grad_norm": 3.459207773208618,
+      "learning_rate": 1.6272727272727273e-05,
+      "loss": 2.0184,
+      "step": 179
+    },
+    {
+      "batch_num_effect_tokens": 4996,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.16364,
+      "grad_norm": 4.295303821563721,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.8571,
+      "step": 180
+    },
+    {
+      "batch_num_effect_tokens": 7540,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.16455,
+      "grad_norm": 3.390723943710327,
+      "learning_rate": 1.6454545454545455e-05,
+      "loss": 1.8768,
+      "step": 181
+    },
+    {
+      "batch_num_effect_tokens": 6088,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.16545,
+      "grad_norm": 4.339141368865967,
+      "learning_rate": 1.6545454545454548e-05,
+      "loss": 2.2682,
+      "step": 182
+    },
+    {
+      "batch_num_effect_tokens": 5206,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.16636,
+      "grad_norm": 3.778287649154663,
+      "learning_rate": 1.6636363636363637e-05,
+      "loss": 1.5497,
+      "step": 183
+    },
+    {
+      "batch_num_effect_tokens": 7747,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.16727,
+      "grad_norm": 3.729515790939331,
+      "learning_rate": 1.672727272727273e-05,
+      "loss": 2.0643,
+      "step": 184
+    },
+    {
+      "batch_num_effect_tokens": 6412,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.16818,
+      "grad_norm": 3.697777271270752,
+      "learning_rate": 1.681818181818182e-05,
+      "loss": 1.869,
+      "step": 185
+    },
+    {
+      "batch_num_effect_tokens": 5186,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.16909,
+      "grad_norm": 4.121335506439209,
+      "learning_rate": 1.690909090909091e-05,
+      "loss": 1.7163,
+      "step": 186
+    },
+    {
+      "batch_num_effect_tokens": 6355,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.17,
+      "grad_norm": 4.144448280334473,
+      "learning_rate": 1.7e-05,
+      "loss": 2.0994,
+      "step": 187
+    },
+    {
+      "batch_num_effect_tokens": 9178,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.17091,
+      "grad_norm": 3.9074015617370605,
+      "learning_rate": 1.7090909090909092e-05,
+      "loss": 2.3383,
+      "step": 188
+    },
+    {
+      "batch_num_effect_tokens": 4468,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52136,
+      "epoch": 0.17182,
+      "grad_norm": 4.231569290161133,
+      "learning_rate": 1.7181818181818185e-05,
+      "loss": 1.7775,
+      "step": 189
+    },
+    {
+      "batch_num_effect_tokens": 6013,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.17273,
+      "grad_norm": 4.446117877960205,
+      "learning_rate": 1.7272727272727274e-05,
+      "loss": 1.9935,
+      "step": 190
+    },
+    {
+      "batch_num_effect_tokens": 6840,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.17364,
+      "grad_norm": 3.933229684829712,
+      "learning_rate": 1.7363636363636363e-05,
+      "loss": 2.1809,
+      "step": 191
+    },
+    {
+      "batch_num_effect_tokens": 6780,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.17455,
+      "grad_norm": 4.084712028503418,
+      "learning_rate": 1.7454545454545456e-05,
+      "loss": 2.1108,
+      "step": 192
+    },
+    {
+      "batch_num_effect_tokens": 6101,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.17545,
+      "grad_norm": 3.8516998291015625,
+      "learning_rate": 1.7545454545454548e-05,
+      "loss": 2.0655,
+      "step": 193
+    },
+    {
+      "batch_num_effect_tokens": 5591,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.17636,
+      "grad_norm": 3.7539749145507812,
+      "learning_rate": 1.7636363636363637e-05,
+      "loss": 1.7109,
+      "step": 194
+    },
+    {
+      "batch_num_effect_tokens": 6305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50593,
+      "epoch": 0.17727,
+      "grad_norm": 4.456710338592529,
+      "learning_rate": 1.772727272727273e-05,
+      "loss": 2.2209,
+      "step": 195
+    },
+    {
+      "batch_num_effect_tokens": 6776,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.17818,
+      "grad_norm": 3.7552881240844727,
+      "learning_rate": 1.781818181818182e-05,
+      "loss": 2.1274,
+      "step": 196
+    },
+    {
+      "batch_num_effect_tokens": 7808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.17909,
+      "grad_norm": 3.59682559967041,
+      "learning_rate": 1.790909090909091e-05,
+      "loss": 2.186,
+      "step": 197
+    },
+    {
+      "batch_num_effect_tokens": 10696,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52174,
+      "epoch": 0.18,
+      "grad_norm": 3.125765323638916,
+      "learning_rate": 1.8e-05,
+      "loss": 2.4307,
+      "step": 198
+    },
+    {
+      "batch_num_effect_tokens": 9770,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.18091,
+      "grad_norm": 3.97883939743042,
+      "learning_rate": 1.8090909090909093e-05,
+      "loss": 2.5137,
+      "step": 199
+    },
+    {
+      "batch_num_effect_tokens": 6270,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.18182,
+      "grad_norm": 4.143767356872559,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 2.023,
+      "step": 200
+    },
+    {
+      "batch_num_effect_tokens": 7577,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52153,
+      "epoch": 0.18273,
+      "grad_norm": 3.6797730922698975,
+      "learning_rate": 1.8272727272727275e-05,
+      "loss": 2.0665,
+      "step": 201
+    },
+    {
+      "batch_num_effect_tokens": 5559,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.18364,
+      "grad_norm": 4.038333415985107,
+      "learning_rate": 1.8363636363636367e-05,
+      "loss": 2.075,
+      "step": 202
+    },
+    {
+      "batch_num_effect_tokens": 8606,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 0.18455,
+      "grad_norm": 4.0963239669799805,
+      "learning_rate": 1.8454545454545456e-05,
+      "loss": 2.4193,
+      "step": 203
+    },
+    {
+      "batch_num_effect_tokens": 6120,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50540,
+      "epoch": 0.18545,
+      "grad_norm": 3.896514415740967,
+      "learning_rate": 1.8545454545454545e-05,
+      "loss": 2.1565,
+      "step": 204
+    },
+    {
+      "batch_num_effect_tokens": 7134,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.18636,
+      "grad_norm": 3.8699703216552734,
+      "learning_rate": 1.8636363636363638e-05,
+      "loss": 2.3128,
+      "step": 205
+    },
+    {
+      "batch_num_effect_tokens": 8291,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 0.18727,
+      "grad_norm": 3.4299514293670654,
+      "learning_rate": 1.872727272727273e-05,
+      "loss": 2.2254,
+      "step": 206
+    },
+    {
+      "batch_num_effect_tokens": 6028,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.18818,
+      "grad_norm": 4.2485198974609375,
+      "learning_rate": 1.881818181818182e-05,
+      "loss": 1.8865,
+      "step": 207
+    },
+    {
+      "batch_num_effect_tokens": 6741,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.18909,
+      "grad_norm": 3.9595305919647217,
+      "learning_rate": 1.8909090909090912e-05,
+      "loss": 2.0176,
+      "step": 208
+    },
+    {
+      "batch_num_effect_tokens": 4830,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.19,
+      "grad_norm": 4.34491491317749,
+      "learning_rate": 1.9e-05,
+      "loss": 1.7146,
+      "step": 209
+    },
+    {
+      "batch_num_effect_tokens": 4728,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.19091,
+      "grad_norm": 5.689029216766357,
+      "learning_rate": 1.9090909090909094e-05,
+      "loss": 1.7834,
+      "step": 210
+    },
+    {
+      "batch_num_effect_tokens": 5023,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.19182,
+      "grad_norm": 4.221342086791992,
+      "learning_rate": 1.9181818181818183e-05,
+      "loss": 1.9373,
+      "step": 211
+    },
+    {
+      "batch_num_effect_tokens": 7665,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 0.19273,
+      "grad_norm": 3.7729616165161133,
+      "learning_rate": 1.9272727272727275e-05,
+      "loss": 2.299,
+      "step": 212
+    },
+    {
+      "batch_num_effect_tokens": 7062,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.19364,
+      "grad_norm": 3.5272579193115234,
+      "learning_rate": 1.9363636363636364e-05,
+      "loss": 2.2194,
+      "step": 213
+    },
+    {
+      "batch_num_effect_tokens": 6712,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.19455,
+      "grad_norm": 3.839637279510498,
+      "learning_rate": 1.9454545454545457e-05,
+      "loss": 2.2719,
+      "step": 214
+    },
+    {
+      "batch_num_effect_tokens": 4438,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.19545,
+      "grad_norm": 4.071404933929443,
+      "learning_rate": 1.9545454545454546e-05,
+      "loss": 1.7334,
+      "step": 215
+    },
+    {
+      "batch_num_effect_tokens": 5133,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.19636,
+      "grad_norm": 3.8490676879882812,
+      "learning_rate": 1.963636363636364e-05,
+      "loss": 1.8869,
+      "step": 216
+    },
+    {
+      "batch_num_effect_tokens": 6167,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.19727,
+      "grad_norm": 4.4581732749938965,
+      "learning_rate": 1.9727272727272728e-05,
+      "loss": 1.9831,
+      "step": 217
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52220,
+      "epoch": 0.19818,
+      "grad_norm": 3.9144175052642822,
+      "learning_rate": 1.981818181818182e-05,
+      "loss": 2.1646,
+      "step": 218
+    },
+    {
+      "batch_num_effect_tokens": 8765,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.19909,
+      "grad_norm": 3.417576789855957,
+      "learning_rate": 1.9909090909090913e-05,
+      "loss": 2.2986,
+      "step": 219
+    },
+    {
+      "batch_num_effect_tokens": 5925,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.2,
+      "grad_norm": 3.8771045207977295,
+      "learning_rate": 2e-05,
+      "loss": 2.001,
+      "step": 220
+    },
+    {
+      "batch_num_effect_tokens": 7624,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52178,
+      "epoch": 0.20091,
+      "grad_norm": 3.4873709678649902,
+      "learning_rate": 1.9999987412505956e-05,
+      "loss": 2.0297,
+      "step": 221
+    },
+    {
+      "batch_num_effect_tokens": 5772,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.20182,
+      "grad_norm": 4.14601993560791,
+      "learning_rate": 1.9999949650055512e-05,
+      "loss": 2.0554,
+      "step": 222
+    },
+    {
+      "batch_num_effect_tokens": 5791,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.20273,
+      "grad_norm": 3.8745217323303223,
+      "learning_rate": 1.9999886712743734e-05,
+      "loss": 1.9404,
+      "step": 223
+    },
+    {
+      "batch_num_effect_tokens": 6915,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.20364,
+      "grad_norm": 4.043534278869629,
+      "learning_rate": 1.9999798600729067e-05,
+      "loss": 2.2196,
+      "step": 224
+    },
+    {
+      "batch_num_effect_tokens": 5540,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52206,
+      "epoch": 0.20455,
+      "grad_norm": 3.968337059020996,
+      "learning_rate": 1.9999685314233333e-05,
+      "loss": 1.8076,
+      "step": 225
+    },
+    {
+      "batch_num_effect_tokens": 6473,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.20545,
+      "grad_norm": 4.364017486572266,
+      "learning_rate": 1.9999546853541728e-05,
+      "loss": 2.166,
+      "step": 226
+    },
+    {
+      "batch_num_effect_tokens": 8991,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52202,
+      "epoch": 0.20636,
+      "grad_norm": 3.8495795726776123,
+      "learning_rate": 1.9999383219002836e-05,
+      "loss": 2.3994,
+      "step": 227
+    },
+    {
+      "batch_num_effect_tokens": 7268,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 0.20727,
+      "grad_norm": 3.666080951690674,
+      "learning_rate": 1.9999194411028596e-05,
+      "loss": 2.1414,
+      "step": 228
+    },
+    {
+      "batch_num_effect_tokens": 7291,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52193,
+      "epoch": 0.20818,
+      "grad_norm": 3.8996286392211914,
+      "learning_rate": 1.9998980430094333e-05,
+      "loss": 2.2034,
+      "step": 229
+    },
+    {
+      "batch_num_effect_tokens": 5587,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.20909,
+      "grad_norm": 3.8308238983154297,
+      "learning_rate": 1.9998741276738753e-05,
+      "loss": 1.9367,
+      "step": 230
+    },
+    {
+      "batch_num_effect_tokens": 8107,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 0.21,
+      "grad_norm": 3.7341339588165283,
+      "learning_rate": 1.9998476951563914e-05,
+      "loss": 2.2329,
+      "step": 231
+    },
+    {
+      "batch_num_effect_tokens": 5474,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.21091,
+      "grad_norm": 3.69163179397583,
+      "learning_rate": 1.999818745523526e-05,
+      "loss": 1.5736,
+      "step": 232
+    },
+    {
+      "batch_num_effect_tokens": 5578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.21182,
+      "grad_norm": 4.45286226272583,
+      "learning_rate": 1.9997872788481595e-05,
+      "loss": 1.9841,
+      "step": 233
+    },
+    {
+      "batch_num_effect_tokens": 5678,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.21273,
+      "grad_norm": 3.889274835586548,
+      "learning_rate": 1.9997532952095093e-05,
+      "loss": 1.7809,
+      "step": 234
+    },
+    {
+      "batch_num_effect_tokens": 5605,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.21364,
+      "grad_norm": 3.6350696086883545,
+      "learning_rate": 1.9997167946931293e-05,
+      "loss": 1.9915,
+      "step": 235
+    },
+    {
+      "batch_num_effect_tokens": 7913,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 0.21455,
+      "grad_norm": 4.61751127243042,
+      "learning_rate": 1.9996777773909093e-05,
+      "loss": 2.2168,
+      "step": 236
+    },
+    {
+      "batch_num_effect_tokens": 6433,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 0.21545,
+      "grad_norm": 3.994758129119873,
+      "learning_rate": 1.9996362434010754e-05,
+      "loss": 1.9697,
+      "step": 237
+    },
+    {
+      "batch_num_effect_tokens": 5781,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.21636,
+      "grad_norm": 3.7514944076538086,
+      "learning_rate": 1.9995921928281893e-05,
+      "loss": 2.1329,
+      "step": 238
+    },
+    {
+      "batch_num_effect_tokens": 5847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52112,
+      "epoch": 0.21727,
+      "grad_norm": 3.858617067337036,
+      "learning_rate": 1.9995456257831484e-05,
+      "loss": 2.0299,
+      "step": 239
+    },
+    {
+      "batch_num_effect_tokens": 7325,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50555,
+      "epoch": 0.21818,
+      "grad_norm": 3.4423437118530273,
+      "learning_rate": 1.9994965423831853e-05,
+      "loss": 2.0997,
+      "step": 240
+    },
+    {
+      "batch_num_effect_tokens": 5637,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.21909,
+      "grad_norm": 4.060293197631836,
+      "learning_rate": 1.999444942751867e-05,
+      "loss": 2.0875,
+      "step": 241
+    },
+    {
+      "batch_num_effect_tokens": 7749,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.22,
+      "grad_norm": 3.5604634284973145,
+      "learning_rate": 1.999390827019096e-05,
+      "loss": 2.2539,
+      "step": 242
+    },
+    {
+      "batch_num_effect_tokens": 6767,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.22091,
+      "grad_norm": 3.51718807220459,
+      "learning_rate": 1.999334195321108e-05,
+      "loss": 2.155,
+      "step": 243
+    },
+    {
+      "batch_num_effect_tokens": 7804,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.22182,
+      "grad_norm": 3.5282416343688965,
+      "learning_rate": 1.999275047800474e-05,
+      "loss": 2.2269,
+      "step": 244
+    },
+    {
+      "batch_num_effect_tokens": 6813,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.22273,
+      "grad_norm": 4.269754886627197,
+      "learning_rate": 1.999213384606097e-05,
+      "loss": 2.083,
+      "step": 245
+    },
+    {
+      "batch_num_effect_tokens": 7350,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.22364,
+      "grad_norm": 4.033580303192139,
+      "learning_rate": 1.9991492058932143e-05,
+      "loss": 2.21,
+      "step": 246
+    },
+    {
+      "batch_num_effect_tokens": 5880,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52143,
+      "epoch": 0.22455,
+      "grad_norm": 4.044896125793457,
+      "learning_rate": 1.9990825118233958e-05,
+      "loss": 2.257,
+      "step": 247
+    },
+    {
+      "batch_num_effect_tokens": 7649,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52217,
+      "epoch": 0.22545,
+      "grad_norm": 3.287102222442627,
+      "learning_rate": 1.999013302564544e-05,
+      "loss": 1.96,
+      "step": 248
+    },
+    {
+      "batch_num_effect_tokens": 6009,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.22636,
+      "grad_norm": 3.5989625453948975,
+      "learning_rate": 1.998941578290893e-05,
+      "loss": 1.705,
+      "step": 249
+    },
+    {
+      "batch_num_effect_tokens": 6845,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.22727,
+      "grad_norm": 4.358585357666016,
+      "learning_rate": 1.9988673391830082e-05,
+      "loss": 2.0284,
+      "step": 250
+    },
+    {
+      "batch_num_effect_tokens": 4556,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 0.22818,
+      "grad_norm": 3.9450693130493164,
+      "learning_rate": 1.9987905854277867e-05,
+      "loss": 1.6541,
+      "step": 251
+    },
+    {
+      "batch_num_effect_tokens": 4540,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.22909,
+      "grad_norm": 3.910506010055542,
+      "learning_rate": 1.9987113172184562e-05,
+      "loss": 1.5919,
+      "step": 252
+    },
+    {
+      "batch_num_effect_tokens": 6801,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50546,
+      "epoch": 0.23,
+      "grad_norm": 3.481827974319458,
+      "learning_rate": 1.9986295347545738e-05,
+      "loss": 1.9684,
+      "step": 253
+    },
+    {
+      "batch_num_effect_tokens": 5553,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.23091,
+      "grad_norm": 4.6898674964904785,
+      "learning_rate": 1.9985452382420277e-05,
+      "loss": 1.9056,
+      "step": 254
+    },
+    {
+      "batch_num_effect_tokens": 7498,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.23182,
+      "grad_norm": 3.7411327362060547,
+      "learning_rate": 1.9984584278930333e-05,
+      "loss": 1.7946,
+      "step": 255
+    },
+    {
+      "batch_num_effect_tokens": 5376,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.23273,
+      "grad_norm": 4.128430366516113,
+      "learning_rate": 1.9983691039261358e-05,
+      "loss": 1.9899,
+      "step": 256
+    },
+    {
+      "batch_num_effect_tokens": 4172,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52121,
+      "epoch": 0.23364,
+      "grad_norm": 3.863016128540039,
+      "learning_rate": 1.9982772665662083e-05,
+      "loss": 1.4849,
+      "step": 257
+    },
+    {
+      "batch_num_effect_tokens": 5703,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.23455,
+      "grad_norm": 4.168217658996582,
+      "learning_rate": 1.9981829160444515e-05,
+      "loss": 2.035,
+      "step": 258
+    },
+    {
+      "batch_num_effect_tokens": 6098,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 0.23545,
+      "grad_norm": 4.579566478729248,
+      "learning_rate": 1.9980860525983924e-05,
+      "loss": 2.1822,
+      "step": 259
+    },
+    {
+      "batch_num_effect_tokens": 4666,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.23636,
+      "grad_norm": 3.742579698562622,
+      "learning_rate": 1.9979866764718846e-05,
+      "loss": 1.4695,
+      "step": 260
+    },
+    {
+      "batch_num_effect_tokens": 6150,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.23727,
+      "grad_norm": 4.352850437164307,
+      "learning_rate": 1.9978847879151076e-05,
+      "loss": 2.0964,
+      "step": 261
+    },
+    {
+      "batch_num_effect_tokens": 8410,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.23818,
+      "grad_norm": 3.5962162017822266,
+      "learning_rate": 1.997780387184565e-05,
+      "loss": 2.375,
+      "step": 262
+    },
+    {
+      "batch_num_effect_tokens": 6367,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.23909,
+      "grad_norm": 3.5124571323394775,
+      "learning_rate": 1.997673474543087e-05,
+      "loss": 1.8841,
+      "step": 263
+    },
+    {
+      "batch_num_effect_tokens": 6349,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.24,
+      "grad_norm": 3.808424234390259,
+      "learning_rate": 1.9975640502598243e-05,
+      "loss": 2.02,
+      "step": 264
+    },
+    {
+      "batch_num_effect_tokens": 7481,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.24091,
+      "grad_norm": 3.9207777976989746,
+      "learning_rate": 1.9974521146102535e-05,
+      "loss": 2.2218,
+      "step": 265
+    },
+    {
+      "batch_num_effect_tokens": 8286,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52178,
+      "epoch": 0.24182,
+      "grad_norm": 3.6245782375335693,
+      "learning_rate": 1.9973376678761726e-05,
+      "loss": 2.146,
+      "step": 266
+    },
+    {
+      "batch_num_effect_tokens": 8987,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.24273,
+      "grad_norm": 3.7090020179748535,
+      "learning_rate": 1.9972207103457e-05,
+      "loss": 2.3555,
+      "step": 267
+    },
+    {
+      "batch_num_effect_tokens": 7652,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 0.24364,
+      "grad_norm": 3.7007925510406494,
+      "learning_rate": 1.9971012423132776e-05,
+      "loss": 2.201,
+      "step": 268
+    },
+    {
+      "batch_num_effect_tokens": 5345,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.24455,
+      "grad_norm": 3.480968952178955,
+      "learning_rate": 1.996979264079665e-05,
+      "loss": 1.8025,
+      "step": 269
+    },
+    {
+      "batch_num_effect_tokens": 6528,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.24545,
+      "grad_norm": 4.5325398445129395,
+      "learning_rate": 1.9968547759519426e-05,
+      "loss": 2.0981,
+      "step": 270
+    },
+    {
+      "batch_num_effect_tokens": 5371,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52212,
+      "epoch": 0.24636,
+      "grad_norm": 4.283621788024902,
+      "learning_rate": 1.996727778243509e-05,
+      "loss": 2.0344,
+      "step": 271
+    },
+    {
+      "batch_num_effect_tokens": 3862,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.24727,
+      "grad_norm": 4.2085280418396,
+      "learning_rate": 1.996598271274081e-05,
+      "loss": 1.2084,
+      "step": 272
+    },
+    {
+      "batch_num_effect_tokens": 5831,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.24818,
+      "grad_norm": 3.9223201274871826,
+      "learning_rate": 1.9964662553696915e-05,
+      "loss": 1.9943,
+      "step": 273
+    },
+    {
+      "batch_num_effect_tokens": 7808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.24909,
+      "grad_norm": 3.444143295288086,
+      "learning_rate": 1.9963317308626916e-05,
+      "loss": 2.0967,
+      "step": 274
+    },
+    {
+      "batch_num_effect_tokens": 5857,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.25,
+      "grad_norm": 3.47464919090271,
+      "learning_rate": 1.9961946980917457e-05,
+      "loss": 1.95,
+      "step": 275
+    },
+    {
+      "batch_num_effect_tokens": 6623,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52193,
+      "epoch": 0.25091,
+      "grad_norm": 3.51035475730896,
+      "learning_rate": 1.996055157401834e-05,
+      "loss": 2.0641,
+      "step": 276
+    },
+    {
+      "batch_num_effect_tokens": 5545,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.25182,
+      "grad_norm": 3.6571717262268066,
+      "learning_rate": 1.9959131091442497e-05,
+      "loss": 1.8225,
+      "step": 277
+    },
+    {
+      "batch_num_effect_tokens": 5112,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.25273,
+      "grad_norm": 3.652961015701294,
+      "learning_rate": 1.9957685536765998e-05,
+      "loss": 1.7117,
+      "step": 278
+    },
+    {
+      "batch_num_effect_tokens": 4869,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.25364,
+      "grad_norm": 4.006814479827881,
+      "learning_rate": 1.9956214913628015e-05,
+      "loss": 1.7309,
+      "step": 279
+    },
+    {
+      "batch_num_effect_tokens": 7287,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.25455,
+      "grad_norm": 3.6142709255218506,
+      "learning_rate": 1.9954719225730847e-05,
+      "loss": 1.9263,
+      "step": 280
+    },
+    {
+      "batch_num_effect_tokens": 6941,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.25545,
+      "grad_norm": 3.489576816558838,
+      "learning_rate": 1.9953198476839886e-05,
+      "loss": 2.0953,
+      "step": 281
+    },
+    {
+      "batch_num_effect_tokens": 6004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50545,
+      "epoch": 0.25636,
+      "grad_norm": 4.156979560852051,
+      "learning_rate": 1.9951652670783615e-05,
+      "loss": 2.1339,
+      "step": 282
+    },
+    {
+      "batch_num_effect_tokens": 7416,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.25727,
+      "grad_norm": 3.9066414833068848,
+      "learning_rate": 1.9950081811453598e-05,
+      "loss": 2.3405,
+      "step": 283
+    },
+    {
+      "batch_num_effect_tokens": 4681,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.25818,
+      "grad_norm": 4.091082572937012,
+      "learning_rate": 1.9948485902804472e-05,
+      "loss": 1.6512,
+      "step": 284
+    },
+    {
+      "batch_num_effect_tokens": 8297,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.25909,
+      "grad_norm": 3.9799063205718994,
+      "learning_rate": 1.9946864948853936e-05,
+      "loss": 2.4767,
+      "step": 285
+    },
+    {
+      "batch_num_effect_tokens": 7533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.26,
+      "grad_norm": 3.5255239009857178,
+      "learning_rate": 1.9945218953682736e-05,
+      "loss": 2.1628,
+      "step": 286
+    },
+    {
+      "batch_num_effect_tokens": 8860,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52218,
+      "epoch": 0.26091,
+      "grad_norm": 3.5023748874664307,
+      "learning_rate": 1.9943547921434666e-05,
+      "loss": 2.2402,
+      "step": 287
+    },
+    {
+      "batch_num_effect_tokens": 7771,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.26182,
+      "grad_norm": 3.9955313205718994,
+      "learning_rate": 1.994185185631655e-05,
+      "loss": 2.1782,
+      "step": 288
+    },
+    {
+      "batch_num_effect_tokens": 5847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.26273,
+      "grad_norm": 3.5915133953094482,
+      "learning_rate": 1.9940130762598224e-05,
+      "loss": 1.8022,
+      "step": 289
+    },
+    {
+      "batch_num_effect_tokens": 6404,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.26364,
+      "grad_norm": 3.4613535404205322,
+      "learning_rate": 1.9938384644612542e-05,
+      "loss": 2.0824,
+      "step": 290
+    },
+    {
+      "batch_num_effect_tokens": 7125,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52148,
+      "epoch": 0.26455,
+      "grad_norm": 3.2900590896606445,
+      "learning_rate": 1.9936613506755357e-05,
+      "loss": 2.134,
+      "step": 291
+    },
+    {
+      "batch_num_effect_tokens": 6211,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.26545,
+      "grad_norm": 4.248907089233398,
+      "learning_rate": 1.99348173534855e-05,
+      "loss": 2.0173,
+      "step": 292
+    },
+    {
+      "batch_num_effect_tokens": 5748,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.26636,
+      "grad_norm": 3.5205302238464355,
+      "learning_rate": 1.9932996189324796e-05,
+      "loss": 1.8658,
+      "step": 293
+    },
+    {
+      "batch_num_effect_tokens": 4489,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.26727,
+      "grad_norm": 3.9939587116241455,
+      "learning_rate": 1.9931150018858013e-05,
+      "loss": 1.9508,
+      "step": 294
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 0.26818,
+      "grad_norm": 3.3021066188812256,
+      "learning_rate": 1.9929278846732883e-05,
+      "loss": 2.3226,
+      "step": 295
+    },
+    {
+      "batch_num_effect_tokens": 8879,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52155,
+      "epoch": 0.26909,
+      "grad_norm": 3.1655590534210205,
+      "learning_rate": 1.992738267766009e-05,
+      "loss": 2.3486,
+      "step": 296
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52201,
+      "epoch": 0.27,
+      "grad_norm": 3.3706653118133545,
+      "learning_rate": 1.9925461516413224e-05,
+      "loss": 2.2756,
+      "step": 297
+    },
+    {
+      "batch_num_effect_tokens": 6110,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.27091,
+      "grad_norm": 3.7976138591766357,
+      "learning_rate": 1.9923515367828812e-05,
+      "loss": 2.2094,
+      "step": 298
+    },
+    {
+      "batch_num_effect_tokens": 8280,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.27182,
+      "grad_norm": 3.4359230995178223,
+      "learning_rate": 1.9921544236806284e-05,
+      "loss": 2.3017,
+      "step": 299
+    },
+    {
+      "batch_num_effect_tokens": 5775,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.27273,
+      "grad_norm": 4.127292633056641,
+      "learning_rate": 1.9919548128307954e-05,
+      "loss": 1.8205,
+      "step": 300
+    },
+    {
+      "batch_num_effect_tokens": 5309,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.27364,
+      "grad_norm": 3.933262348175049,
+      "learning_rate": 1.991752704735903e-05,
+      "loss": 1.8377,
+      "step": 301
+    },
+    {
+      "batch_num_effect_tokens": 9964,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.27455,
+      "grad_norm": 4.070802688598633,
+      "learning_rate": 1.9915480999047573e-05,
+      "loss": 2.2202,
+      "step": 302
+    },
+    {
+      "batch_num_effect_tokens": 6187,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.27545,
+      "grad_norm": 3.5866167545318604,
+      "learning_rate": 1.991340998852451e-05,
+      "loss": 1.9792,
+      "step": 303
+    },
+    {
+      "batch_num_effect_tokens": 5621,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.27636,
+      "grad_norm": 3.8665053844451904,
+      "learning_rate": 1.9911314021003614e-05,
+      "loss": 1.8149,
+      "step": 304
+    },
+    {
+      "batch_num_effect_tokens": 5774,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.27727,
+      "grad_norm": 3.8318331241607666,
+      "learning_rate": 1.990919310176147e-05,
+      "loss": 1.9496,
+      "step": 305
+    },
+    {
+      "batch_num_effect_tokens": 4571,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.27818,
+      "grad_norm": 4.002640724182129,
+      "learning_rate": 1.99070472361375e-05,
+      "loss": 1.8213,
+      "step": 306
+    },
+    {
+      "batch_num_effect_tokens": 5408,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.27909,
+      "grad_norm": 3.516658306121826,
+      "learning_rate": 1.9904876429533912e-05,
+      "loss": 1.8694,
+      "step": 307
+    },
+    {
+      "batch_num_effect_tokens": 5948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.28,
+      "grad_norm": 3.7011466026306152,
+      "learning_rate": 1.9902680687415704e-05,
+      "loss": 2.0552,
+      "step": 308
+    },
+    {
+      "batch_num_effect_tokens": 7477,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52212,
+      "epoch": 0.28091,
+      "grad_norm": 3.722318649291992,
+      "learning_rate": 1.9900460015310667e-05,
+      "loss": 2.2286,
+      "step": 309
+    },
+    {
+      "batch_num_effect_tokens": 6647,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 0.28182,
+      "grad_norm": 3.540323257446289,
+      "learning_rate": 1.989821441880933e-05,
+      "loss": 1.975,
+      "step": 310
+    },
+    {
+      "batch_num_effect_tokens": 7885,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 0.28273,
+      "grad_norm": 3.5634026527404785,
+      "learning_rate": 1.989594390356498e-05,
+      "loss": 2.2363,
+      "step": 311
+    },
+    {
+      "batch_num_effect_tokens": 7326,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52190,
+      "epoch": 0.28364,
+      "grad_norm": 3.539261817932129,
+      "learning_rate": 1.9893648475293646e-05,
+      "loss": 1.939,
+      "step": 312
+    },
+    {
+      "batch_num_effect_tokens": 3660,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 0.28455,
+      "grad_norm": 3.6285557746887207,
+      "learning_rate": 1.9891328139774057e-05,
+      "loss": 1.2964,
+      "step": 313
+    },
+    {
+      "batch_num_effect_tokens": 9164,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.28545,
+      "grad_norm": 5.367908000946045,
+      "learning_rate": 1.9888982902847658e-05,
+      "loss": 2.4264,
+      "step": 314
+    },
+    {
+      "batch_num_effect_tokens": 7064,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52107,
+      "epoch": 0.28636,
+      "grad_norm": 3.4953806400299072,
+      "learning_rate": 1.988661277041858e-05,
+      "loss": 1.9632,
+      "step": 315
+    },
+    {
+      "batch_num_effect_tokens": 10838,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.28727,
+      "grad_norm": 3.466254472732544,
+      "learning_rate": 1.9884217748453625e-05,
+      "loss": 2.5673,
+      "step": 316
+    },
+    {
+      "batch_num_effect_tokens": 8718,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.28818,
+      "grad_norm": 3.52352237701416,
+      "learning_rate": 1.9881797842982265e-05,
+      "loss": 2.222,
+      "step": 317
+    },
+    {
+      "batch_num_effect_tokens": 5664,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.28909,
+      "grad_norm": 4.089743137359619,
+      "learning_rate": 1.98793530600966e-05,
+      "loss": 1.9174,
+      "step": 318
+    },
+    {
+      "batch_num_effect_tokens": 5837,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.29,
+      "grad_norm": 4.193911552429199,
+      "learning_rate": 1.9876883405951378e-05,
+      "loss": 2.004,
+      "step": 319
+    },
+    {
+      "batch_num_effect_tokens": 5214,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.29091,
+      "grad_norm": 4.251591205596924,
+      "learning_rate": 1.9874388886763944e-05,
+      "loss": 1.9869,
+      "step": 320
+    },
+    {
+      "batch_num_effect_tokens": 5105,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.29182,
+      "grad_norm": 3.726947546005249,
+      "learning_rate": 1.987186950881425e-05,
+      "loss": 1.4984,
+      "step": 321
+    },
+    {
+      "batch_num_effect_tokens": 7011,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 0.29273,
+      "grad_norm": 3.730116128921509,
+      "learning_rate": 1.9869325278444824e-05,
+      "loss": 2.1577,
+      "step": 322
+    },
+    {
+      "batch_num_effect_tokens": 4520,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 0.29364,
+      "grad_norm": 3.86989426612854,
+      "learning_rate": 1.9866756202060764e-05,
+      "loss": 1.7366,
+      "step": 323
+    },
+    {
+      "batch_num_effect_tokens": 5279,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 0.29455,
+      "grad_norm": 4.149124622344971,
+      "learning_rate": 1.986416228612972e-05,
+      "loss": 1.8388,
+      "step": 324
+    },
+    {
+      "batch_num_effect_tokens": 8876,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.29545,
+      "grad_norm": 5.287720203399658,
+      "learning_rate": 1.986154353718187e-05,
+      "loss": 2.1111,
+      "step": 325
+    },
+    {
+      "batch_num_effect_tokens": 7286,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52219,
+      "epoch": 0.29636,
+      "grad_norm": 3.4820773601531982,
+      "learning_rate": 1.9858899961809904e-05,
+      "loss": 1.877,
+      "step": 326
+    },
+    {
+      "batch_num_effect_tokens": 4332,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 0.29727,
+      "grad_norm": 3.8096587657928467,
+      "learning_rate": 1.9856231566669036e-05,
+      "loss": 1.6738,
+      "step": 327
+    },
+    {
+      "batch_num_effect_tokens": 8761,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 0.29818,
+      "grad_norm": 4.4504852294921875,
+      "learning_rate": 1.9853538358476933e-05,
+      "loss": 2.0797,
+      "step": 328
+    },
+    {
+      "batch_num_effect_tokens": 5662,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.29909,
+      "grad_norm": 4.1594696044921875,
+      "learning_rate": 1.985082034401375e-05,
+      "loss": 1.9774,
+      "step": 329
+    },
+    {
+      "batch_num_effect_tokens": 6919,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.3,
+      "grad_norm": 3.5411789417266846,
+      "learning_rate": 1.9848077530122083e-05,
+      "loss": 2.0552,
+      "step": 330
+    },
+    {
+      "batch_num_effect_tokens": 7886,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.30091,
+      "grad_norm": 5.033746719360352,
+      "learning_rate": 1.9845309923706965e-05,
+      "loss": 1.6951,
+      "step": 331
+    },
+    {
+      "batch_num_effect_tokens": 8567,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52221,
+      "epoch": 0.30182,
+      "grad_norm": 4.57145881652832,
+      "learning_rate": 1.9842517531735837e-05,
+      "loss": 2.2676,
+      "step": 332
+    },
+    {
+      "batch_num_effect_tokens": 9606,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.30273,
+      "grad_norm": 3.5114569664001465,
+      "learning_rate": 1.9839700361238548e-05,
+      "loss": 2.5096,
+      "step": 333
+    },
+    {
+      "batch_num_effect_tokens": 9075,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.30364,
+      "grad_norm": 3.247704029083252,
+      "learning_rate": 1.9836858419307325e-05,
+      "loss": 2.3547,
+      "step": 334
+    },
+    {
+      "batch_num_effect_tokens": 6285,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.30455,
+      "grad_norm": 3.7255682945251465,
+      "learning_rate": 1.9833991713096742e-05,
+      "loss": 2.1543,
+      "step": 335
+    },
+    {
+      "batch_num_effect_tokens": 8129,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.30545,
+      "grad_norm": 3.3006324768066406,
+      "learning_rate": 1.9831100249823732e-05,
+      "loss": 2.0478,
+      "step": 336
+    },
+    {
+      "batch_num_effect_tokens": 5358,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.30636,
+      "grad_norm": 3.799280881881714,
+      "learning_rate": 1.9828184036767556e-05,
+      "loss": 1.6325,
+      "step": 337
+    },
+    {
+      "batch_num_effect_tokens": 4368,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.30727,
+      "grad_norm": 4.022277355194092,
+      "learning_rate": 1.9825243081269778e-05,
+      "loss": 1.5553,
+      "step": 338
+    },
+    {
+      "batch_num_effect_tokens": 7283,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.30818,
+      "grad_norm": 4.02409553527832,
+      "learning_rate": 1.982227739073424e-05,
+      "loss": 2.3756,
+      "step": 339
+    },
+    {
+      "batch_num_effect_tokens": 6278,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52179,
+      "epoch": 0.30909,
+      "grad_norm": 3.805237293243408,
+      "learning_rate": 1.9819286972627066e-05,
+      "loss": 1.9413,
+      "step": 340
+    },
+    {
+      "batch_num_effect_tokens": 4420,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52195,
+      "epoch": 0.31,
+      "grad_norm": 3.8129725456237793,
+      "learning_rate": 1.9816271834476642e-05,
+      "loss": 1.432,
+      "step": 341
+    },
+    {
+      "batch_num_effect_tokens": 4993,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.31091,
+      "grad_norm": 3.3998563289642334,
+      "learning_rate": 1.9813231983873563e-05,
+      "loss": 1.5523,
+      "step": 342
+    },
+    {
+      "batch_num_effect_tokens": 6330,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52192,
+      "epoch": 0.31182,
+      "grad_norm": 4.177271842956543,
+      "learning_rate": 1.9810167428470653e-05,
+      "loss": 2.2859,
+      "step": 343
+    },
+    {
+      "batch_num_effect_tokens": 7214,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 0.31273,
+      "grad_norm": 3.3524763584136963,
+      "learning_rate": 1.9807078175982925e-05,
+      "loss": 2.167,
+      "step": 344
+    },
+    {
+      "batch_num_effect_tokens": 7129,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.31364,
+      "grad_norm": 3.9093234539031982,
+      "learning_rate": 1.980396423418757e-05,
+      "loss": 2.3973,
+      "step": 345
+    },
+    {
+      "batch_num_effect_tokens": 7229,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 0.31455,
+      "grad_norm": 3.5451037883758545,
+      "learning_rate": 1.9800825610923937e-05,
+      "loss": 2.0647,
+      "step": 346
+    },
+    {
+      "batch_num_effect_tokens": 6494,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.31545,
+      "grad_norm": 3.35791277885437,
+      "learning_rate": 1.9797662314093496e-05,
+      "loss": 1.9389,
+      "step": 347
+    },
+    {
+      "batch_num_effect_tokens": 7132,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.31636,
+      "grad_norm": 3.3549084663391113,
+      "learning_rate": 1.9794474351659854e-05,
+      "loss": 2.1595,
+      "step": 348
+    },
+    {
+      "batch_num_effect_tokens": 6921,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52212,
+      "epoch": 0.31727,
+      "grad_norm": 3.5743157863616943,
+      "learning_rate": 1.9791261731648694e-05,
+      "loss": 2.1689,
+      "step": 349
+    },
+    {
+      "batch_num_effect_tokens": 5448,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.31818,
+      "grad_norm": 3.6076207160949707,
+      "learning_rate": 1.978802446214779e-05,
+      "loss": 1.9005,
+      "step": 350
+    },
+    {
+      "batch_num_effect_tokens": 4677,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50568,
+      "epoch": 0.31909,
+      "grad_norm": 3.772836208343506,
+      "learning_rate": 1.978476255130696e-05,
+      "loss": 1.8154,
+      "step": 351
+    },
+    {
+      "batch_num_effect_tokens": 6046,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.32,
+      "grad_norm": 3.8440117835998535,
+      "learning_rate": 1.9781476007338058e-05,
+      "loss": 2.0698,
+      "step": 352
+    },
+    {
+      "batch_num_effect_tokens": 3892,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.32091,
+      "grad_norm": 4.189990043640137,
+      "learning_rate": 1.977816483851496e-05,
+      "loss": 1.6277,
+      "step": 353
+    },
+    {
+      "batch_num_effect_tokens": 7533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.32182,
+      "grad_norm": 3.25177001953125,
+      "learning_rate": 1.977482905317353e-05,
+      "loss": 2.2104,
+      "step": 354
+    },
+    {
+      "batch_num_effect_tokens": 6350,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.32273,
+      "grad_norm": 3.4130544662475586,
+      "learning_rate": 1.9771468659711595e-05,
+      "loss": 1.9284,
+      "step": 355
+    },
+    {
+      "batch_num_effect_tokens": 5356,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.32364,
+      "grad_norm": 3.7726941108703613,
+      "learning_rate": 1.9768083666588954e-05,
+      "loss": 1.9012,
+      "step": 356
+    },
+    {
+      "batch_num_effect_tokens": 5176,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 0.32455,
+      "grad_norm": 3.8374075889587402,
+      "learning_rate": 1.9764674082327313e-05,
+      "loss": 1.7814,
+      "step": 357
+    },
+    {
+      "batch_num_effect_tokens": 4955,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52154,
+      "epoch": 0.32545,
+      "grad_norm": 4.390801429748535,
+      "learning_rate": 1.9761239915510302e-05,
+      "loss": 1.652,
+      "step": 358
+    },
+    {
+      "batch_num_effect_tokens": 4207,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.32636,
+      "grad_norm": 3.5614068508148193,
+      "learning_rate": 1.975778117478343e-05,
+      "loss": 1.4238,
+      "step": 359
+    },
+    {
+      "batch_num_effect_tokens": 6837,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 0.32727,
+      "grad_norm": 3.47572660446167,
+      "learning_rate": 1.9754297868854075e-05,
+      "loss": 2.0447,
+      "step": 360
+    },
+    {
+      "batch_num_effect_tokens": 9300,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 0.32818,
+      "grad_norm": 4.148090362548828,
+      "learning_rate": 1.9750790006491447e-05,
+      "loss": 2.416,
+      "step": 361
+    },
+    {
+      "batch_num_effect_tokens": 6922,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.32909,
+      "grad_norm": 3.374837636947632,
+      "learning_rate": 1.9747257596526594e-05,
+      "loss": 1.7274,
+      "step": 362
+    },
+    {
+      "batch_num_effect_tokens": 6355,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.33,
+      "grad_norm": 3.4720685482025146,
+      "learning_rate": 1.9743700647852356e-05,
+      "loss": 2.0928,
+      "step": 363
+    },
+    {
+      "batch_num_effect_tokens": 5826,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.33091,
+      "grad_norm": 3.680713176727295,
+      "learning_rate": 1.9740119169423337e-05,
+      "loss": 1.75,
+      "step": 364
+    },
+    {
+      "batch_num_effect_tokens": 7576,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.33182,
+      "grad_norm": 3.1444408893585205,
+      "learning_rate": 1.973651317025591e-05,
+      "loss": 2.1317,
+      "step": 365
+    },
+    {
+      "batch_num_effect_tokens": 8806,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 0.33273,
+      "grad_norm": 3.1442887783050537,
+      "learning_rate": 1.973288265942818e-05,
+      "loss": 2.3923,
+      "step": 366
+    },
+    {
+      "batch_num_effect_tokens": 7476,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.33364,
+      "grad_norm": 3.6415657997131348,
+      "learning_rate": 1.9729227646079942e-05,
+      "loss": 2.3736,
+      "step": 367
+    },
+    {
+      "batch_num_effect_tokens": 3792,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.33455,
+      "grad_norm": 3.8284006118774414,
+      "learning_rate": 1.9725548139412693e-05,
+      "loss": 1.3342,
+      "step": 368
+    },
+    {
+      "batch_num_effect_tokens": 6077,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.33545,
+      "grad_norm": 3.79321551322937,
+      "learning_rate": 1.972184414868959e-05,
+      "loss": 2.0298,
+      "step": 369
+    },
+    {
+      "batch_num_effect_tokens": 6096,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.33636,
+      "grad_norm": 4.172370433807373,
+      "learning_rate": 1.9718115683235418e-05,
+      "loss": 1.912,
+      "step": 370
+    },
+    {
+      "batch_num_effect_tokens": 5990,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.33727,
+      "grad_norm": 3.5405514240264893,
+      "learning_rate": 1.971436275243659e-05,
+      "loss": 1.9807,
+      "step": 371
+    },
+    {
+      "batch_num_effect_tokens": 6166,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.33818,
+      "grad_norm": 3.740248680114746,
+      "learning_rate": 1.9710585365741105e-05,
+      "loss": 2.1228,
+      "step": 372
+    },
+    {
+      "batch_num_effect_tokens": 8286,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.33909,
+      "grad_norm": 3.0704259872436523,
+      "learning_rate": 1.9706783532658528e-05,
+      "loss": 2.2266,
+      "step": 373
+    },
+    {
+      "batch_num_effect_tokens": 8179,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.34,
+      "grad_norm": 3.2725648880004883,
+      "learning_rate": 1.9702957262759964e-05,
+      "loss": 2.2335,
+      "step": 374
+    },
+    {
+      "batch_num_effect_tokens": 6548,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.34091,
+      "grad_norm": 3.4414355754852295,
+      "learning_rate": 1.969910656567805e-05,
+      "loss": 2.115,
+      "step": 375
+    },
+    {
+      "batch_num_effect_tokens": 10572,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.34182,
+      "grad_norm": 3.2218399047851562,
+      "learning_rate": 1.9695231451106914e-05,
+      "loss": 2.415,
+      "step": 376
+    },
+    {
+      "batch_num_effect_tokens": 7220,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.34273,
+      "grad_norm": 3.478093147277832,
+      "learning_rate": 1.9691331928802144e-05,
+      "loss": 2.2834,
+      "step": 377
+    },
+    {
+      "batch_num_effect_tokens": 6895,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 0.34364,
+      "grad_norm": 3.4649593830108643,
+      "learning_rate": 1.9687408008580785e-05,
+      "loss": 1.9373,
+      "step": 378
+    },
+    {
+      "batch_num_effect_tokens": 4249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.34455,
+      "grad_norm": 3.788116455078125,
+      "learning_rate": 1.9683459700321305e-05,
+      "loss": 1.7062,
+      "step": 379
+    },
+    {
+      "batch_num_effect_tokens": 7538,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.34545,
+      "grad_norm": 3.7470011711120605,
+      "learning_rate": 1.9679487013963566e-05,
+      "loss": 2.2533,
+      "step": 380
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.34636,
+      "grad_norm": 3.405688524246216,
+      "learning_rate": 1.9675489959508794e-05,
+      "loss": 2.088,
+      "step": 381
+    },
+    {
+      "batch_num_effect_tokens": 4731,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.34727,
+      "grad_norm": 3.5091192722320557,
+      "learning_rate": 1.9671468547019575e-05,
+      "loss": 1.4163,
+      "step": 382
+    },
+    {
+      "batch_num_effect_tokens": 6690,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.34818,
+      "grad_norm": 3.5613765716552734,
+      "learning_rate": 1.9667422786619804e-05,
+      "loss": 1.9267,
+      "step": 383
+    },
+    {
+      "batch_num_effect_tokens": 5752,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.34909,
+      "grad_norm": 3.2376224994659424,
+      "learning_rate": 1.9663352688494686e-05,
+      "loss": 1.5661,
+      "step": 384
+    },
+    {
+      "batch_num_effect_tokens": 5535,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.35,
+      "grad_norm": 3.3313965797424316,
+      "learning_rate": 1.9659258262890683e-05,
+      "loss": 1.4459,
+      "step": 385
+    },
+    {
+      "batch_num_effect_tokens": 6964,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.35091,
+      "grad_norm": 4.082736968994141,
+      "learning_rate": 1.965513952011551e-05,
+      "loss": 2.1208,
+      "step": 386
+    },
+    {
+      "batch_num_effect_tokens": 4936,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.35182,
+      "grad_norm": 3.491699695587158,
+      "learning_rate": 1.9650996470538093e-05,
+      "loss": 1.5283,
+      "step": 387
+    },
+    {
+      "batch_num_effect_tokens": 6667,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.35273,
+      "grad_norm": 6.086740016937256,
+      "learning_rate": 1.964682912458856e-05,
+      "loss": 1.8542,
+      "step": 388
+    },
+    {
+      "batch_num_effect_tokens": 7105,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.35364,
+      "grad_norm": 3.77236008644104,
+      "learning_rate": 1.9642637492758193e-05,
+      "loss": 2.1419,
+      "step": 389
+    },
+    {
+      "batch_num_effect_tokens": 6647,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52127,
+      "epoch": 0.35455,
+      "grad_norm": 3.3697710037231445,
+      "learning_rate": 1.9638421585599422e-05,
+      "loss": 2.0109,
+      "step": 390
+    },
+    {
+      "batch_num_effect_tokens": 8727,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 0.35545,
+      "grad_norm": 3.1939914226531982,
+      "learning_rate": 1.963418141372579e-05,
+      "loss": 2.136,
+      "step": 391
+    },
+    {
+      "batch_num_effect_tokens": 4262,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.35636,
+      "grad_norm": 3.370107889175415,
+      "learning_rate": 1.9629916987811924e-05,
+      "loss": 1.3332,
+      "step": 392
+    },
+    {
+      "batch_num_effect_tokens": 5487,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.35727,
+      "grad_norm": 4.085422039031982,
+      "learning_rate": 1.9625628318593514e-05,
+      "loss": 2.1505,
+      "step": 393
+    },
+    {
+      "batch_num_effect_tokens": 6407,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.35818,
+      "grad_norm": 3.593017578125,
+      "learning_rate": 1.9621315416867274e-05,
+      "loss": 2.0573,
+      "step": 394
+    },
+    {
+      "batch_num_effect_tokens": 7014,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52186,
+      "epoch": 0.35909,
+      "grad_norm": 3.438126564025879,
+      "learning_rate": 1.961697829349093e-05,
+      "loss": 2.1716,
+      "step": 395
+    },
+    {
+      "batch_num_effect_tokens": 5753,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 0.36,
+      "grad_norm": 3.483640432357788,
+      "learning_rate": 1.961261695938319e-05,
+      "loss": 1.7637,
+      "step": 396
+    },
+    {
+      "batch_num_effect_tokens": 3818,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52125,
+      "epoch": 0.36091,
+      "grad_norm": 3.637179136276245,
+      "learning_rate": 1.9608231425523702e-05,
+      "loss": 1.425,
+      "step": 397
+    },
+    {
+      "batch_num_effect_tokens": 6166,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.36182,
+      "grad_norm": 4.281872272491455,
+      "learning_rate": 1.9603821702953047e-05,
+      "loss": 1.8861,
+      "step": 398
+    },
+    {
+      "batch_num_effect_tokens": 6623,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.36273,
+      "grad_norm": 3.598214864730835,
+      "learning_rate": 1.9599387802772693e-05,
+      "loss": 2.0335,
+      "step": 399
+    },
+    {
+      "batch_num_effect_tokens": 8465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.36364,
+      "grad_norm": 6.010089874267578,
+      "learning_rate": 1.9594929736144978e-05,
+      "loss": 1.5298,
+      "step": 400
+    },
+    {
+      "batch_num_effect_tokens": 7077,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52194,
+      "epoch": 0.36455,
+      "grad_norm": 3.569758415222168,
+      "learning_rate": 1.959044751429308e-05,
+      "loss": 1.9929,
+      "step": 401
+    },
+    {
+      "batch_num_effect_tokens": 11051,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.36545,
+      "grad_norm": 3.3714101314544678,
+      "learning_rate": 1.9585941148500987e-05,
+      "loss": 2.5957,
+      "step": 402
+    },
+    {
+      "batch_num_effect_tokens": 6003,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50555,
+      "epoch": 0.36636,
+      "grad_norm": 3.890573501586914,
+      "learning_rate": 1.958141065011347e-05,
+      "loss": 2.0343,
+      "step": 403
+    },
+    {
+      "batch_num_effect_tokens": 6037,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.36727,
+      "grad_norm": 5.133095741271973,
+      "learning_rate": 1.9576856030536055e-05,
+      "loss": 1.7706,
+      "step": 404
+    },
+    {
+      "batch_num_effect_tokens": 5590,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.36818,
+      "grad_norm": 3.661139726638794,
+      "learning_rate": 1.957227730123499e-05,
+      "loss": 1.9668,
+      "step": 405
+    },
+    {
+      "batch_num_effect_tokens": 5160,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.36909,
+      "grad_norm": 3.9254584312438965,
+      "learning_rate": 1.956767447373722e-05,
+      "loss": 1.7646,
+      "step": 406
+    },
+    {
+      "batch_num_effect_tokens": 7085,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.37,
+      "grad_norm": 9.332328796386719,
+      "learning_rate": 1.9563047559630356e-05,
+      "loss": 1.9354,
+      "step": 407
+    },
+    {
+      "batch_num_effect_tokens": 7484,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.37091,
+      "grad_norm": 3.1720235347747803,
+      "learning_rate": 1.955839657056265e-05,
+      "loss": 2.0569,
+      "step": 408
+    },
+    {
+      "batch_num_effect_tokens": 4746,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52195,
+      "epoch": 0.37182,
+      "grad_norm": 3.7923851013183594,
+      "learning_rate": 1.955372151824297e-05,
+      "loss": 1.7236,
+      "step": 409
+    },
+    {
+      "batch_num_effect_tokens": 4527,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.37273,
+      "grad_norm": 4.349642276763916,
+      "learning_rate": 1.9549022414440738e-05,
+      "loss": 1.5834,
+      "step": 410
+    },
+    {
+      "batch_num_effect_tokens": 4718,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50544,
+      "epoch": 0.37364,
+      "grad_norm": 3.7531752586364746,
+      "learning_rate": 1.9544299270985958e-05,
+      "loss": 1.6154,
+      "step": 411
+    },
+    {
+      "batch_num_effect_tokens": 8186,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50591,
+      "epoch": 0.37455,
+      "grad_norm": 15.484469413757324,
+      "learning_rate": 1.9539552099769128e-05,
+      "loss": 2.3389,
+      "step": 412
+    },
+    {
+      "batch_num_effect_tokens": 5516,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.37545,
+      "grad_norm": 3.6566741466522217,
+      "learning_rate": 1.953478091274125e-05,
+      "loss": 1.6849,
+      "step": 413
+    },
+    {
+      "batch_num_effect_tokens": 8184,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.37636,
+      "grad_norm": 3.60722279548645,
+      "learning_rate": 1.952998572191378e-05,
+      "loss": 2.1157,
+      "step": 414
+    },
+    {
+      "batch_num_effect_tokens": 6107,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.37727,
+      "grad_norm": 3.568403720855713,
+      "learning_rate": 1.9525166539358608e-05,
+      "loss": 1.8607,
+      "step": 415
+    },
+    {
+      "batch_num_effect_tokens": 5373,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.37818,
+      "grad_norm": 3.696733236312866,
+      "learning_rate": 1.9520323377208017e-05,
+      "loss": 1.6664,
+      "step": 416
+    },
+    {
+      "batch_num_effect_tokens": 5750,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.37909,
+      "grad_norm": 4.781795978546143,
+      "learning_rate": 1.951545624765466e-05,
+      "loss": 1.9601,
+      "step": 417
+    },
+    {
+      "batch_num_effect_tokens": 6467,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.38,
+      "grad_norm": 3.721050262451172,
+      "learning_rate": 1.9510565162951538e-05,
+      "loss": 1.7897,
+      "step": 418
+    },
+    {
+      "batch_num_effect_tokens": 5158,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.38091,
+      "grad_norm": 3.6470894813537598,
+      "learning_rate": 1.950565013541194e-05,
+      "loss": 1.8109,
+      "step": 419
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.38182,
+      "grad_norm": 3.234846591949463,
+      "learning_rate": 1.9500711177409456e-05,
+      "loss": 2.2473,
+      "step": 420
+    },
+    {
+      "batch_num_effect_tokens": 5828,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.38273,
+      "grad_norm": 3.299785852432251,
+      "learning_rate": 1.9495748301377895e-05,
+      "loss": 1.7766,
+      "step": 421
+    },
+    {
+      "batch_num_effect_tokens": 9265,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.38364,
+      "grad_norm": 3.1475846767425537,
+      "learning_rate": 1.9490761519811295e-05,
+      "loss": 2.3207,
+      "step": 422
+    },
+    {
+      "batch_num_effect_tokens": 6778,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.38455,
+      "grad_norm": 3.426121234893799,
+      "learning_rate": 1.9485750845263874e-05,
+      "loss": 2.1797,
+      "step": 423
+    },
+    {
+      "batch_num_effect_tokens": 7228,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.38545,
+      "grad_norm": 3.2111194133758545,
+      "learning_rate": 1.9480716290349998e-05,
+      "loss": 1.9292,
+      "step": 424
+    },
+    {
+      "batch_num_effect_tokens": 4729,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.38636,
+      "grad_norm": 4.0305047035217285,
+      "learning_rate": 1.9475657867744153e-05,
+      "loss": 1.6335,
+      "step": 425
+    },
+    {
+      "batch_num_effect_tokens": 6369,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 0.38727,
+      "grad_norm": 3.4517645835876465,
+      "learning_rate": 1.947057559018091e-05,
+      "loss": 1.9695,
+      "step": 426
+    },
+    {
+      "batch_num_effect_tokens": 3094,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.38818,
+      "grad_norm": 3.5539233684539795,
+      "learning_rate": 1.94654694704549e-05,
+      "loss": 0.765,
+      "step": 427
+    },
+    {
+      "batch_num_effect_tokens": 5971,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.38909,
+      "grad_norm": 3.3859667778015137,
+      "learning_rate": 1.946033952142077e-05,
+      "loss": 1.6888,
+      "step": 428
+    },
+    {
+      "batch_num_effect_tokens": 7126,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.39,
+      "grad_norm": 3.397372007369995,
+      "learning_rate": 1.945518575599317e-05,
+      "loss": 2.0718,
+      "step": 429
+    },
+    {
+      "batch_num_effect_tokens": 8198,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52143,
+      "epoch": 0.39091,
+      "grad_norm": 3.590057373046875,
+      "learning_rate": 1.9450008187146685e-05,
+      "loss": 2.2892,
+      "step": 430
+    },
+    {
+      "batch_num_effect_tokens": 5553,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.39182,
+      "grad_norm": 3.60089373588562,
+      "learning_rate": 1.9444806827915848e-05,
+      "loss": 1.8969,
+      "step": 431
+    },
+    {
+      "batch_num_effect_tokens": 6068,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52166,
+      "epoch": 0.39273,
+      "grad_norm": 3.4060354232788086,
+      "learning_rate": 1.943958169139507e-05,
+      "loss": 1.858,
+      "step": 432
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.39364,
+      "grad_norm": 3.4720919132232666,
+      "learning_rate": 1.9434332790738625e-05,
+      "loss": 2.2429,
+      "step": 433
+    },
+    {
+      "batch_num_effect_tokens": 4799,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.39455,
+      "grad_norm": 3.771951913833618,
+      "learning_rate": 1.942906013916062e-05,
+      "loss": 1.7619,
+      "step": 434
+    },
+    {
+      "batch_num_effect_tokens": 5977,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 0.39545,
+      "grad_norm": 3.652972936630249,
+      "learning_rate": 1.9423763749934942e-05,
+      "loss": 1.7407,
+      "step": 435
+    },
+    {
+      "batch_num_effect_tokens": 5247,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 0.39636,
+      "grad_norm": 3.8835346698760986,
+      "learning_rate": 1.941844363639525e-05,
+      "loss": 1.9667,
+      "step": 436
+    },
+    {
+      "batch_num_effect_tokens": 7674,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.39727,
+      "grad_norm": 3.0361108779907227,
+      "learning_rate": 1.941309981193492e-05,
+      "loss": 1.94,
+      "step": 437
+    },
+    {
+      "batch_num_effect_tokens": 5285,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.39818,
+      "grad_norm": 3.8913521766662598,
+      "learning_rate": 1.9407732290007023e-05,
+      "loss": 2.0395,
+      "step": 438
+    },
+    {
+      "batch_num_effect_tokens": 5503,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.39909,
+      "grad_norm": 3.5886588096618652,
+      "learning_rate": 1.9402341084124298e-05,
+      "loss": 1.7529,
+      "step": 439
+    },
+    {
+      "batch_num_effect_tokens": 7691,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52122,
+      "epoch": 0.4,
+      "grad_norm": 3.757056474685669,
+      "learning_rate": 1.9396926207859085e-05,
+      "loss": 2.373,
+      "step": 440
+    },
+    {
+      "batch_num_effect_tokens": 4646,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.40091,
+      "grad_norm": 3.726503610610962,
+      "learning_rate": 1.939148767484334e-05,
+      "loss": 1.4204,
+      "step": 441
+    },
+    {
+      "batch_num_effect_tokens": 5042,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.40182,
+      "grad_norm": 3.9943902492523193,
+      "learning_rate": 1.938602549876856e-05,
+      "loss": 2.0746,
+      "step": 442
+    },
+    {
+      "batch_num_effect_tokens": 5524,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.40273,
+      "grad_norm": 4.357745170593262,
+      "learning_rate": 1.9380539693385763e-05,
+      "loss": 1.9158,
+      "step": 443
+    },
+    {
+      "batch_num_effect_tokens": 9174,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50580,
+      "epoch": 0.40364,
+      "grad_norm": 3.923656940460205,
+      "learning_rate": 1.9375030272505463e-05,
+      "loss": 2.5712,
+      "step": 444
+    },
+    {
+      "batch_num_effect_tokens": 6709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.40455,
+      "grad_norm": 3.5520644187927246,
+      "learning_rate": 1.936949724999762e-05,
+      "loss": 2.0598,
+      "step": 445
+    },
+    {
+      "batch_num_effect_tokens": 6041,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52176,
+      "epoch": 0.40545,
+      "grad_norm": 3.5794007778167725,
+      "learning_rate": 1.9363940639791607e-05,
+      "loss": 1.9235,
+      "step": 446
+    },
+    {
+      "batch_num_effect_tokens": 6017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52121,
+      "epoch": 0.40636,
+      "grad_norm": 3.34661602973938,
+      "learning_rate": 1.935836045587619e-05,
+      "loss": 1.9621,
+      "step": 447
+    },
+    {
+      "batch_num_effect_tokens": 6171,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.40727,
+      "grad_norm": 3.2858693599700928,
+      "learning_rate": 1.9352756712299467e-05,
+      "loss": 1.6668,
+      "step": 448
+    },
+    {
+      "batch_num_effect_tokens": 4035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.40818,
+      "grad_norm": 4.985914707183838,
+      "learning_rate": 1.934712942316886e-05,
+      "loss": 1.3112,
+      "step": 449
+    },
+    {
+      "batch_num_effect_tokens": 6588,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.40909,
+      "grad_norm": 3.5650634765625,
+      "learning_rate": 1.9341478602651068e-05,
+      "loss": 1.6175,
+      "step": 450
+    },
+    {
+      "batch_num_effect_tokens": 7081,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.41,
+      "grad_norm": 6.88336181640625,
+      "learning_rate": 1.9335804264972018e-05,
+      "loss": 1.4374,
+      "step": 451
+    },
+    {
+      "batch_num_effect_tokens": 6896,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.41091,
+      "grad_norm": 3.438095808029175,
+      "learning_rate": 1.9330106424416852e-05,
+      "loss": 2.1431,
+      "step": 452
+    },
+    {
+      "batch_num_effect_tokens": 7606,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.41182,
+      "grad_norm": 3.3056182861328125,
+      "learning_rate": 1.9324385095329875e-05,
+      "loss": 2.2578,
+      "step": 453
+    },
+    {
+      "batch_num_effect_tokens": 5966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.41273,
+      "grad_norm": 3.2087910175323486,
+      "learning_rate": 1.9318640292114526e-05,
+      "loss": 1.6353,
+      "step": 454
+    },
+    {
+      "batch_num_effect_tokens": 4157,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.41364,
+      "grad_norm": 3.875227689743042,
+      "learning_rate": 1.931287202923334e-05,
+      "loss": 1.3412,
+      "step": 455
+    },
+    {
+      "batch_num_effect_tokens": 5966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52129,
+      "epoch": 0.41455,
+      "grad_norm": 3.8848047256469727,
+      "learning_rate": 1.9307080321207913e-05,
+      "loss": 2.1777,
+      "step": 456
+    },
+    {
+      "batch_num_effect_tokens": 7158,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.41545,
+      "grad_norm": 3.2563788890838623,
+      "learning_rate": 1.9301265182618862e-05,
+      "loss": 2.1152,
+      "step": 457
+    },
+    {
+      "batch_num_effect_tokens": 8421,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.41636,
+      "grad_norm": 3.4559903144836426,
+      "learning_rate": 1.9295426628105792e-05,
+      "loss": 1.502,
+      "step": 458
+    },
+    {
+      "batch_num_effect_tokens": 6188,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52186,
+      "epoch": 0.41727,
+      "grad_norm": 3.458390235900879,
+      "learning_rate": 1.928956467236726e-05,
+      "loss": 1.7265,
+      "step": 459
+    },
+    {
+      "batch_num_effect_tokens": 6964,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.41818,
+      "grad_norm": 4.283545017242432,
+      "learning_rate": 1.9283679330160726e-05,
+      "loss": 2.3138,
+      "step": 460
+    },
+    {
+      "batch_num_effect_tokens": 5525,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52214,
+      "epoch": 0.41909,
+      "grad_norm": 3.834029197692871,
+      "learning_rate": 1.927777061630254e-05,
+      "loss": 1.7211,
+      "step": 461
+    },
+    {
+      "batch_num_effect_tokens": 6867,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.42,
+      "grad_norm": 3.8594608306884766,
+      "learning_rate": 1.9271838545667876e-05,
+      "loss": 2.2086,
+      "step": 462
+    },
+    {
+      "batch_num_effect_tokens": 5732,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.42091,
+      "grad_norm": 3.368337869644165,
+      "learning_rate": 1.9265883133190715e-05,
+      "loss": 1.6516,
+      "step": 463
+    },
+    {
+      "batch_num_effect_tokens": 8993,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.42182,
+      "grad_norm": 3.0719375610351562,
+      "learning_rate": 1.9259904393863804e-05,
+      "loss": 2.1589,
+      "step": 464
+    },
+    {
+      "batch_num_effect_tokens": 4547,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52156,
+      "epoch": 0.42273,
+      "grad_norm": 3.582049608230591,
+      "learning_rate": 1.9253902342738612e-05,
+      "loss": 1.6844,
+      "step": 465
+    },
+    {
+      "batch_num_effect_tokens": 5729,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.42364,
+      "grad_norm": 3.6378562450408936,
+      "learning_rate": 1.9247876994925293e-05,
+      "loss": 1.8612,
+      "step": 466
+    },
+    {
+      "batch_num_effect_tokens": 7245,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.42455,
+      "grad_norm": 3.700286626815796,
+      "learning_rate": 1.9241828365592653e-05,
+      "loss": 2.0882,
+      "step": 467
+    },
+    {
+      "batch_num_effect_tokens": 5252,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.42545,
+      "grad_norm": 3.388467311859131,
+      "learning_rate": 1.9235756469968112e-05,
+      "loss": 1.5082,
+      "step": 468
+    },
+    {
+      "batch_num_effect_tokens": 5824,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.42636,
+      "grad_norm": 3.6123640537261963,
+      "learning_rate": 1.922966132333766e-05,
+      "loss": 1.8822,
+      "step": 469
+    },
+    {
+      "batch_num_effect_tokens": 5069,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52156,
+      "epoch": 0.42727,
+      "grad_norm": 3.3630306720733643,
+      "learning_rate": 1.9223542941045817e-05,
+      "loss": 1.2953,
+      "step": 470
+    },
+    {
+      "batch_num_effect_tokens": 7137,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.42818,
+      "grad_norm": 3.2389023303985596,
+      "learning_rate": 1.9217401338495605e-05,
+      "loss": 2.0332,
+      "step": 471
+    },
+    {
+      "batch_num_effect_tokens": 9415,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.42909,
+      "grad_norm": 3.571338176727295,
+      "learning_rate": 1.92112365311485e-05,
+      "loss": 2.4274,
+      "step": 472
+    },
+    {
+      "batch_num_effect_tokens": 6512,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 0.43,
+      "grad_norm": 3.4565165042877197,
+      "learning_rate": 1.9205048534524405e-05,
+      "loss": 2.134,
+      "step": 473
+    },
+    {
+      "batch_num_effect_tokens": 6771,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52081,
+      "epoch": 0.43091,
+      "grad_norm": 3.388167381286621,
+      "learning_rate": 1.9198837364201587e-05,
+      "loss": 2.0009,
+      "step": 474
+    },
+    {
+      "batch_num_effect_tokens": 6701,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.43182,
+      "grad_norm": 3.1186001300811768,
+      "learning_rate": 1.9192603035816657e-05,
+      "loss": 2.0379,
+      "step": 475
+    },
+    {
+      "batch_num_effect_tokens": 6036,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.43273,
+      "grad_norm": 3.3869993686676025,
+      "learning_rate": 1.918634556506454e-05,
+      "loss": 1.9541,
+      "step": 476
+    },
+    {
+      "batch_num_effect_tokens": 4699,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.43364,
+      "grad_norm": 3.9716951847076416,
+      "learning_rate": 1.91800649676984e-05,
+      "loss": 1.6997,
+      "step": 477
+    },
+    {
+      "batch_num_effect_tokens": 5120,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52167,
+      "epoch": 0.43455,
+      "grad_norm": 3.8032917976379395,
+      "learning_rate": 1.9173761259529634e-05,
+      "loss": 1.7517,
+      "step": 478
+    },
+    {
+      "batch_num_effect_tokens": 5568,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 0.43545,
+      "grad_norm": 3.4945125579833984,
+      "learning_rate": 1.916743445642783e-05,
+      "loss": 1.5496,
+      "step": 479
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.43636,
+      "grad_norm": 3.0792834758758545,
+      "learning_rate": 1.9161084574320696e-05,
+      "loss": 2.0386,
+      "step": 480
+    },
+    {
+      "batch_num_effect_tokens": 7660,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52153,
+      "epoch": 0.43727,
+      "grad_norm": 3.443307399749756,
+      "learning_rate": 1.9154711629194062e-05,
+      "loss": 2.0699,
+      "step": 481
+    },
+    {
+      "batch_num_effect_tokens": 5759,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.43818,
+      "grad_norm": 3.4880309104919434,
+      "learning_rate": 1.9148315637091805e-05,
+      "loss": 1.7058,
+      "step": 482
+    },
+    {
+      "batch_num_effect_tokens": 6151,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.43909,
+      "grad_norm": 3.6025350093841553,
+      "learning_rate": 1.9141896614115824e-05,
+      "loss": 2.1311,
+      "step": 483
+    },
+    {
+      "batch_num_effect_tokens": 8317,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.44,
+      "grad_norm": 3.1102218627929688,
+      "learning_rate": 1.913545457642601e-05,
+      "loss": 2.2208,
+      "step": 484
+    },
+    {
+      "batch_num_effect_tokens": 5579,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.44091,
+      "grad_norm": 3.600101947784424,
+      "learning_rate": 1.9128989540240178e-05,
+      "loss": 2.0342,
+      "step": 485
+    },
+    {
+      "batch_num_effect_tokens": 6947,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.44182,
+      "grad_norm": 3.5309343338012695,
+      "learning_rate": 1.9122501521834052e-05,
+      "loss": 2.2342,
+      "step": 486
+    },
+    {
+      "batch_num_effect_tokens": 6241,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.44273,
+      "grad_norm": 3.5790889263153076,
+      "learning_rate": 1.9115990537541217e-05,
+      "loss": 2.1857,
+      "step": 487
+    },
+    {
+      "batch_num_effect_tokens": 7397,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52146,
+      "epoch": 0.44364,
+      "grad_norm": 3.4434750080108643,
+      "learning_rate": 1.910945660375305e-05,
+      "loss": 2.326,
+      "step": 488
+    },
+    {
+      "batch_num_effect_tokens": 9990,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.44455,
+      "grad_norm": 2.4198713302612305,
+      "learning_rate": 1.9102899736918742e-05,
+      "loss": 1.5947,
+      "step": 489
+    },
+    {
+      "batch_num_effect_tokens": 4837,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 0.44545,
+      "grad_norm": 3.3059439659118652,
+      "learning_rate": 1.9096319953545186e-05,
+      "loss": 1.5192,
+      "step": 490
+    },
+    {
+      "batch_num_effect_tokens": 7704,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.44636,
+      "grad_norm": 3.98941969871521,
+      "learning_rate": 1.9089717270196982e-05,
+      "loss": 1.4529,
+      "step": 491
+    },
+    {
+      "batch_num_effect_tokens": 4778,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.44727,
+      "grad_norm": 3.2443060874938965,
+      "learning_rate": 1.9083091703496373e-05,
+      "loss": 1.6286,
+      "step": 492
+    },
+    {
+      "batch_num_effect_tokens": 9290,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.44818,
+      "grad_norm": 2.968362808227539,
+      "learning_rate": 1.9076443270123222e-05,
+      "loss": 2.2458,
+      "step": 493
+    },
+    {
+      "batch_num_effect_tokens": 7917,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.44909,
+      "grad_norm": 2.978633403778076,
+      "learning_rate": 1.9069771986814948e-05,
+      "loss": 2.103,
+      "step": 494
+    },
+    {
+      "batch_num_effect_tokens": 4098,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.45,
+      "grad_norm": 4.185185432434082,
+      "learning_rate": 1.9063077870366504e-05,
+      "loss": 1.7879,
+      "step": 495
+    },
+    {
+      "batch_num_effect_tokens": 7130,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52144,
+      "epoch": 0.45091,
+      "grad_norm": 3.301975727081299,
+      "learning_rate": 1.905636093763031e-05,
+      "loss": 2.2401,
+      "step": 496
+    },
+    {
+      "batch_num_effect_tokens": 7763,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.45182,
+      "grad_norm": 9.562934875488281,
+      "learning_rate": 1.9049621205516243e-05,
+      "loss": 2.104,
+      "step": 497
+    },
+    {
+      "batch_num_effect_tokens": 3216,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.45273,
+      "grad_norm": 3.6380558013916016,
+      "learning_rate": 1.9042858690991574e-05,
+      "loss": 1.3197,
+      "step": 498
+    },
+    {
+      "batch_num_effect_tokens": 5344,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 0.45364,
+      "grad_norm": 3.306352376937866,
+      "learning_rate": 1.9036073411080917e-05,
+      "loss": 1.7201,
+      "step": 499
+    },
+    {
+      "batch_num_effect_tokens": 5946,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.45455,
+      "grad_norm": 3.5917532444000244,
+      "learning_rate": 1.9029265382866216e-05,
+      "loss": 2.0292,
+      "step": 500
+    },
+    {
+      "batch_num_effect_tokens": 3840,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.45545,
+      "grad_norm": 3.4000165462493896,
+      "learning_rate": 1.902243462348666e-05,
+      "loss": 1.3569,
+      "step": 501
+    },
+    {
+      "batch_num_effect_tokens": 6128,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.45636,
+      "grad_norm": 3.308820962905884,
+      "learning_rate": 1.9015581150138693e-05,
+      "loss": 1.7542,
+      "step": 502
+    },
+    {
+      "batch_num_effect_tokens": 6220,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.45727,
+      "grad_norm": 3.2647831439971924,
+      "learning_rate": 1.9008704980075915e-05,
+      "loss": 2.0074,
+      "step": 503
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 0.45818,
+      "grad_norm": 3.451694965362549,
+      "learning_rate": 1.900180613060908e-05,
+      "loss": 2.301,
+      "step": 504
+    },
+    {
+      "batch_num_effect_tokens": 6552,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.45909,
+      "grad_norm": 3.290741443634033,
+      "learning_rate": 1.8994884619106034e-05,
+      "loss": 2.1138,
+      "step": 505
+    },
+    {
+      "batch_num_effect_tokens": 4136,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.46,
+      "grad_norm": 3.646514654159546,
+      "learning_rate": 1.8987940462991673e-05,
+      "loss": 1.6685,
+      "step": 506
+    },
+    {
+      "batch_num_effect_tokens": 7924,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.46091,
+      "grad_norm": 3.5173795223236084,
+      "learning_rate": 1.8980973679747897e-05,
+      "loss": 2.1933,
+      "step": 507
+    },
+    {
+      "batch_num_effect_tokens": 4122,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.46182,
+      "grad_norm": 3.5924158096313477,
+      "learning_rate": 1.8973984286913584e-05,
+      "loss": 1.2037,
+      "step": 508
+    },
+    {
+      "batch_num_effect_tokens": 6331,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.46273,
+      "grad_norm": 3.658207654953003,
+      "learning_rate": 1.8966972302084516e-05,
+      "loss": 2.0125,
+      "step": 509
+    },
+    {
+      "batch_num_effect_tokens": 7272,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.46364,
+      "grad_norm": 3.5813214778900146,
+      "learning_rate": 1.895993774291336e-05,
+      "loss": 2.1851,
+      "step": 510
+    },
+    {
+      "batch_num_effect_tokens": 6230,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.46455,
+      "grad_norm": 3.268073558807373,
+      "learning_rate": 1.8952880627109606e-05,
+      "loss": 1.9113,
+      "step": 511
+    },
+    {
+      "batch_num_effect_tokens": 4682,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 0.46545,
+      "grad_norm": 3.3010008335113525,
+      "learning_rate": 1.894580097243954e-05,
+      "loss": 1.2218,
+      "step": 512
+    },
+    {
+      "batch_num_effect_tokens": 7353,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.46636,
+      "grad_norm": 3.572483539581299,
+      "learning_rate": 1.8938698796726177e-05,
+      "loss": 2.0842,
+      "step": 513
+    },
+    {
+      "batch_num_effect_tokens": 9044,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 0.46727,
+      "grad_norm": 3.1595988273620605,
+      "learning_rate": 1.893157411784924e-05,
+      "loss": 2.2458,
+      "step": 514
+    },
+    {
+      "batch_num_effect_tokens": 10117,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.46818,
+      "grad_norm": 3.0882046222686768,
+      "learning_rate": 1.89244269537451e-05,
+      "loss": 2.1724,
+      "step": 515
+    },
+    {
+      "batch_num_effect_tokens": 4353,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.46909,
+      "grad_norm": 3.614635467529297,
+      "learning_rate": 1.8917257322406735e-05,
+      "loss": 1.6285,
+      "step": 516
+    },
+    {
+      "batch_num_effect_tokens": 5500,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.47,
+      "grad_norm": 3.201978921890259,
+      "learning_rate": 1.891006524188368e-05,
+      "loss": 1.6012,
+      "step": 517
+    },
+    {
+      "batch_num_effect_tokens": 5168,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.47091,
+      "grad_norm": 3.692725419998169,
+      "learning_rate": 1.8902850730281993e-05,
+      "loss": 1.9266,
+      "step": 518
+    },
+    {
+      "batch_num_effect_tokens": 4942,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52193,
+      "epoch": 0.47182,
+      "grad_norm": 4.545666694641113,
+      "learning_rate": 1.8895613805764196e-05,
+      "loss": 2.0146,
+      "step": 519
+    },
+    {
+      "batch_num_effect_tokens": 7190,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.47273,
+      "grad_norm": 3.550752878189087,
+      "learning_rate": 1.8888354486549238e-05,
+      "loss": 2.1113,
+      "step": 520
+    },
+    {
+      "batch_num_effect_tokens": 8762,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.47364,
+      "grad_norm": 3.4238336086273193,
+      "learning_rate": 1.8881072790912445e-05,
+      "loss": 2.2257,
+      "step": 521
+    },
+    {
+      "batch_num_effect_tokens": 6978,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52189,
+      "epoch": 0.47455,
+      "grad_norm": 3.5296685695648193,
+      "learning_rate": 1.887376873718548e-05,
+      "loss": 2.2325,
+      "step": 522
+    },
+    {
+      "batch_num_effect_tokens": 4304,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.47545,
+      "grad_norm": 3.596109628677368,
+      "learning_rate": 1.8866442343756288e-05,
+      "loss": 1.5695,
+      "step": 523
+    },
+    {
+      "batch_num_effect_tokens": 6116,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 0.47636,
+      "grad_norm": 3.8315112590789795,
+      "learning_rate": 1.8859093629069057e-05,
+      "loss": 1.7386,
+      "step": 524
+    },
+    {
+      "batch_num_effect_tokens": 5970,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52112,
+      "epoch": 0.47727,
+      "grad_norm": 3.498262405395508,
+      "learning_rate": 1.8851722611624166e-05,
+      "loss": 1.9725,
+      "step": 525
+    },
+    {
+      "batch_num_effect_tokens": 5960,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52154,
+      "epoch": 0.47818,
+      "grad_norm": 3.7059292793273926,
+      "learning_rate": 1.8844329309978146e-05,
+      "loss": 2.0527,
+      "step": 526
+    },
+    {
+      "batch_num_effect_tokens": 5305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.47909,
+      "grad_norm": 3.4222211837768555,
+      "learning_rate": 1.883691374274362e-05,
+      "loss": 1.8587,
+      "step": 527
+    },
+    {
+      "batch_num_effect_tokens": 7227,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 0.48,
+      "grad_norm": 3.3856403827667236,
+      "learning_rate": 1.8829475928589272e-05,
+      "loss": 2.215,
+      "step": 528
+    },
+    {
+      "batch_num_effect_tokens": 4177,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.48091,
+      "grad_norm": 3.9069483280181885,
+      "learning_rate": 1.882201588623979e-05,
+      "loss": 1.5857,
+      "step": 529
+    },
+    {
+      "batch_num_effect_tokens": 5385,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.48182,
+      "grad_norm": 3.723468542098999,
+      "learning_rate": 1.881453363447582e-05,
+      "loss": 2.0356,
+      "step": 530
+    },
+    {
+      "batch_num_effect_tokens": 5821,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.48273,
+      "grad_norm": 3.635272741317749,
+      "learning_rate": 1.8807029192133927e-05,
+      "loss": 1.7682,
+      "step": 531
+    },
+    {
+      "batch_num_effect_tokens": 6328,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.48364,
+      "grad_norm": 3.9412262439727783,
+      "learning_rate": 1.8799502578106533e-05,
+      "loss": 2.1657,
+      "step": 532
+    },
+    {
+      "batch_num_effect_tokens": 6253,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.48455,
+      "grad_norm": 3.4781086444854736,
+      "learning_rate": 1.879195381134188e-05,
+      "loss": 1.8876,
+      "step": 533
+    },
+    {
+      "batch_num_effect_tokens": 7108,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.48545,
+      "grad_norm": 3.2070069313049316,
+      "learning_rate": 1.8784382910843978e-05,
+      "loss": 2.125,
+      "step": 534
+    },
+    {
+      "batch_num_effect_tokens": 6323,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 0.48636,
+      "grad_norm": 4.461274147033691,
+      "learning_rate": 1.8776789895672557e-05,
+      "loss": 1.7811,
+      "step": 535
+    },
+    {
+      "batch_num_effect_tokens": 7237,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.48727,
+      "grad_norm": 3.477790355682373,
+      "learning_rate": 1.8769174784943032e-05,
+      "loss": 1.9597,
+      "step": 536
+    },
+    {
+      "batch_num_effect_tokens": 4006,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.48818,
+      "grad_norm": 3.06264066696167,
+      "learning_rate": 1.8761537597826426e-05,
+      "loss": 1.2381,
+      "step": 537
+    },
+    {
+      "batch_num_effect_tokens": 7709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.48909,
+      "grad_norm": 3.4257607460021973,
+      "learning_rate": 1.8753878353549357e-05,
+      "loss": 2.1019,
+      "step": 538
+    },
+    {
+      "batch_num_effect_tokens": 7508,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52099,
+      "epoch": 0.49,
+      "grad_norm": 2.888709545135498,
+      "learning_rate": 1.874619707139396e-05,
+      "loss": 1.5886,
+      "step": 539
+    },
+    {
+      "batch_num_effect_tokens": 7412,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 0.49091,
+      "grad_norm": 3.1860759258270264,
+      "learning_rate": 1.873849377069785e-05,
+      "loss": 2.0846,
+      "step": 540
+    },
+    {
+      "batch_num_effect_tokens": 4228,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.49182,
+      "grad_norm": 3.918816328048706,
+      "learning_rate": 1.8730768470854085e-05,
+      "loss": 1.763,
+      "step": 541
+    },
+    {
+      "batch_num_effect_tokens": 4622,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.49273,
+      "grad_norm": 3.312856912612915,
+      "learning_rate": 1.872302119131109e-05,
+      "loss": 1.5452,
+      "step": 542
+    },
+    {
+      "batch_num_effect_tokens": 4885,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 0.49364,
+      "grad_norm": 3.5208563804626465,
+      "learning_rate": 1.8715251951572635e-05,
+      "loss": 1.7166,
+      "step": 543
+    },
+    {
+      "batch_num_effect_tokens": 5722,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.49455,
+      "grad_norm": 4.089620590209961,
+      "learning_rate": 1.8707460771197773e-05,
+      "loss": 2.1785,
+      "step": 544
+    },
+    {
+      "batch_num_effect_tokens": 6006,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.49545,
+      "grad_norm": 3.4604692459106445,
+      "learning_rate": 1.869964766980079e-05,
+      "loss": 1.9571,
+      "step": 545
+    },
+    {
+      "batch_num_effect_tokens": 7263,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 0.49636,
+      "grad_norm": 3.3346478939056396,
+      "learning_rate": 1.8691812667051164e-05,
+      "loss": 2.19,
+      "step": 546
+    },
+    {
+      "batch_num_effect_tokens": 7015,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 0.49727,
+      "grad_norm": 3.139596700668335,
+      "learning_rate": 1.8683955782673496e-05,
+      "loss": 2.203,
+      "step": 547
+    },
+    {
+      "batch_num_effect_tokens": 5609,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.49818,
+      "grad_norm": 3.289130687713623,
+      "learning_rate": 1.867607703644749e-05,
+      "loss": 1.8295,
+      "step": 548
+    },
+    {
+      "batch_num_effect_tokens": 5855,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 0.49909,
+      "grad_norm": 3.2381234169006348,
+      "learning_rate": 1.8668176448207883e-05,
+      "loss": 1.8196,
+      "step": 549
+    },
+    {
+      "batch_num_effect_tokens": 6374,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52167,
+      "epoch": 0.5,
+      "grad_norm": 3.6862313747406006,
+      "learning_rate": 1.866025403784439e-05,
+      "loss": 2.1483,
+      "step": 550
+    },
+    {
+      "batch_num_effect_tokens": 7273,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.50091,
+      "grad_norm": 3.053278923034668,
+      "learning_rate": 1.865230982530167e-05,
+      "loss": 2.1838,
+      "step": 551
+    },
+    {
+      "batch_num_effect_tokens": 4038,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52108,
+      "epoch": 0.50182,
+      "grad_norm": 3.9057250022888184,
+      "learning_rate": 1.864434383057927e-05,
+      "loss": 1.7801,
+      "step": 552
+    },
+    {
+      "batch_num_effect_tokens": 7194,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52168,
+      "epoch": 0.50273,
+      "grad_norm": 2.9885618686676025,
+      "learning_rate": 1.863635607373157e-05,
+      "loss": 1.9528,
+      "step": 553
+    },
+    {
+      "batch_num_effect_tokens": 3872,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.50364,
+      "grad_norm": 3.4793949127197266,
+      "learning_rate": 1.8628346574867748e-05,
+      "loss": 1.2668,
+      "step": 554
+    },
+    {
+      "batch_num_effect_tokens": 5104,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.50455,
+      "grad_norm": 3.6043288707733154,
+      "learning_rate": 1.8620315354151695e-05,
+      "loss": 1.8763,
+      "step": 555
+    },
+    {
+      "batch_num_effect_tokens": 9724,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.50545,
+      "grad_norm": 7.592609882354736,
+      "learning_rate": 1.861226243180201e-05,
+      "loss": 1.3798,
+      "step": 556
+    },
+    {
+      "batch_num_effect_tokens": 6299,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.50636,
+      "grad_norm": 3.6786558628082275,
+      "learning_rate": 1.8604187828091906e-05,
+      "loss": 2.0197,
+      "step": 557
+    },
+    {
+      "batch_num_effect_tokens": 7648,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.50727,
+      "grad_norm": 3.2028064727783203,
+      "learning_rate": 1.859609156334919e-05,
+      "loss": 1.9882,
+      "step": 558
+    },
+    {
+      "batch_num_effect_tokens": 6900,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52156,
+      "epoch": 0.50818,
+      "grad_norm": 3.237802267074585,
+      "learning_rate": 1.858797365795621e-05,
+      "loss": 1.9579,
+      "step": 559
+    },
+    {
+      "batch_num_effect_tokens": 6792,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50565,
+      "epoch": 0.50909,
+      "grad_norm": 3.682898759841919,
+      "learning_rate": 1.8579834132349773e-05,
+      "loss": 2.3555,
+      "step": 560
+    },
+    {
+      "batch_num_effect_tokens": 6533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.51,
+      "grad_norm": 3.4176299571990967,
+      "learning_rate": 1.8571673007021124e-05,
+      "loss": 1.9165,
+      "step": 561
+    },
+    {
+      "batch_num_effect_tokens": 7501,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.51091,
+      "grad_norm": 3.5271034240722656,
+      "learning_rate": 1.856349030251589e-05,
+      "loss": 2.236,
+      "step": 562
+    },
+    {
+      "batch_num_effect_tokens": 5026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.51182,
+      "grad_norm": 3.6298139095306396,
+      "learning_rate": 1.8555286039434022e-05,
+      "loss": 1.8674,
+      "step": 563
+    },
+    {
+      "batch_num_effect_tokens": 6409,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 0.51273,
+      "grad_norm": 3.4671781063079834,
+      "learning_rate": 1.8547060238429737e-05,
+      "loss": 1.9354,
+      "step": 564
+    },
+    {
+      "batch_num_effect_tokens": 7138,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.51364,
+      "grad_norm": 3.528872013092041,
+      "learning_rate": 1.8538812920211484e-05,
+      "loss": 2.2646,
+      "step": 565
+    },
+    {
+      "batch_num_effect_tokens": 7711,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 0.51455,
+      "grad_norm": 3.2639312744140625,
+      "learning_rate": 1.8530544105541872e-05,
+      "loss": 2.2585,
+      "step": 566
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.51545,
+      "grad_norm": 3.3265206813812256,
+      "learning_rate": 1.8522253815237636e-05,
+      "loss": 2.1542,
+      "step": 567
+    },
+    {
+      "batch_num_effect_tokens": 7369,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.51636,
+      "grad_norm": 2.9828522205352783,
+      "learning_rate": 1.8513942070169572e-05,
+      "loss": 2.2732,
+      "step": 568
+    },
+    {
+      "batch_num_effect_tokens": 7247,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.51727,
+      "grad_norm": 3.3099958896636963,
+      "learning_rate": 1.8505608891262487e-05,
+      "loss": 2.2839,
+      "step": 569
+    },
+    {
+      "batch_num_effect_tokens": 4585,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.51818,
+      "grad_norm": 3.316478729248047,
+      "learning_rate": 1.8497254299495147e-05,
+      "loss": 1.5033,
+      "step": 570
+    },
+    {
+      "batch_num_effect_tokens": 6077,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.51909,
+      "grad_norm": 3.613713502883911,
+      "learning_rate": 1.8488878315900228e-05,
+      "loss": 2.1368,
+      "step": 571
+    },
+    {
+      "batch_num_effect_tokens": 9597,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.52,
+      "grad_norm": 3.1866140365600586,
+      "learning_rate": 1.848048096156426e-05,
+      "loss": 2.093,
+      "step": 572
+    },
+    {
+      "batch_num_effect_tokens": 4443,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.52091,
+      "grad_norm": 3.794059991836548,
+      "learning_rate": 1.8472062257627573e-05,
+      "loss": 1.549,
+      "step": 573
+    },
+    {
+      "batch_num_effect_tokens": 3305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.52182,
+      "grad_norm": 3.145803451538086,
+      "learning_rate": 1.8463622225284242e-05,
+      "loss": 0.9547,
+      "step": 574
+    },
+    {
+      "batch_num_effect_tokens": 3851,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.52273,
+      "grad_norm": 3.587178945541382,
+      "learning_rate": 1.8455160885782045e-05,
+      "loss": 1.4285,
+      "step": 575
+    },
+    {
+      "batch_num_effect_tokens": 8686,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.52364,
+      "grad_norm": 3.6582915782928467,
+      "learning_rate": 1.8446678260422388e-05,
+      "loss": 1.9506,
+      "step": 576
+    },
+    {
+      "batch_num_effect_tokens": 4839,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.52455,
+      "grad_norm": 3.1688389778137207,
+      "learning_rate": 1.8438174370560263e-05,
+      "loss": 1.3682,
+      "step": 577
+    },
+    {
+      "batch_num_effect_tokens": 6489,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52153,
+      "epoch": 0.52545,
+      "grad_norm": 3.809812307357788,
+      "learning_rate": 1.8429649237604215e-05,
+      "loss": 1.9886,
+      "step": 578
+    },
+    {
+      "batch_num_effect_tokens": 5519,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.52636,
+      "grad_norm": 3.732606887817383,
+      "learning_rate": 1.8421102883016253e-05,
+      "loss": 1.6313,
+      "step": 579
+    },
+    {
+      "batch_num_effect_tokens": 10408,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 0.52727,
+      "grad_norm": 3.174161911010742,
+      "learning_rate": 1.8412535328311813e-05,
+      "loss": 2.4339,
+      "step": 580
+    },
+    {
+      "batch_num_effect_tokens": 4047,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50510,
+      "epoch": 0.52818,
+      "grad_norm": 3.608646869659424,
+      "learning_rate": 1.8403946595059705e-05,
+      "loss": 1.3289,
+      "step": 581
+    },
+    {
+      "batch_num_effect_tokens": 6078,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.52909,
+      "grad_norm": 3.461907148361206,
+      "learning_rate": 1.839533670488205e-05,
+      "loss": 1.9802,
+      "step": 582
+    },
+    {
+      "batch_num_effect_tokens": 4242,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 0.53,
+      "grad_norm": 3.994670867919922,
+      "learning_rate": 1.8386705679454243e-05,
+      "loss": 1.6785,
+      "step": 583
+    },
+    {
+      "batch_num_effect_tokens": 5623,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.53091,
+      "grad_norm": 3.693882465362549,
+      "learning_rate": 1.8378053540504874e-05,
+      "loss": 1.8519,
+      "step": 584
+    },
+    {
+      "batch_num_effect_tokens": 4473,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.53182,
+      "grad_norm": 3.797433614730835,
+      "learning_rate": 1.83693803098157e-05,
+      "loss": 1.2907,
+      "step": 585
+    },
+    {
+      "batch_num_effect_tokens": 7440,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52148,
+      "epoch": 0.53273,
+      "grad_norm": 3.0073556900024414,
+      "learning_rate": 1.836068600922156e-05,
+      "loss": 1.8691,
+      "step": 586
+    },
+    {
+      "batch_num_effect_tokens": 9577,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 0.53364,
+      "grad_norm": 3.577150583267212,
+      "learning_rate": 1.835197066061035e-05,
+      "loss": 2.1412,
+      "step": 587
+    },
+    {
+      "batch_num_effect_tokens": 7733,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.53455,
+      "grad_norm": 3.0998103618621826,
+      "learning_rate": 1.8343234285922955e-05,
+      "loss": 1.8911,
+      "step": 588
+    },
+    {
+      "batch_num_effect_tokens": 6919,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.53545,
+      "grad_norm": 3.259941816329956,
+      "learning_rate": 1.8334476907153177e-05,
+      "loss": 2.0594,
+      "step": 589
+    },
+    {
+      "batch_num_effect_tokens": 6546,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.53636,
+      "grad_norm": 3.149838924407959,
+      "learning_rate": 1.8325698546347714e-05,
+      "loss": 1.9796,
+      "step": 590
+    },
+    {
+      "batch_num_effect_tokens": 10309,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 0.53727,
+      "grad_norm": 2.3748481273651123,
+      "learning_rate": 1.8316899225606078e-05,
+      "loss": 1.6333,
+      "step": 591
+    },
+    {
+      "batch_num_effect_tokens": 5608,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.53818,
+      "grad_norm": 3.143259048461914,
+      "learning_rate": 1.8308078967080547e-05,
+      "loss": 1.6868,
+      "step": 592
+    },
+    {
+      "batch_num_effect_tokens": 5158,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.53909,
+      "grad_norm": 3.805025339126587,
+      "learning_rate": 1.829923779297611e-05,
+      "loss": 1.8101,
+      "step": 593
+    },
+    {
+      "batch_num_effect_tokens": 6560,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.54,
+      "grad_norm": 3.417865753173828,
+      "learning_rate": 1.8290375725550417e-05,
+      "loss": 2.1457,
+      "step": 594
+    },
+    {
+      "batch_num_effect_tokens": 6663,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.54091,
+      "grad_norm": 3.3823914527893066,
+      "learning_rate": 1.8281492787113707e-05,
+      "loss": 1.9639,
+      "step": 595
+    },
+    {
+      "batch_num_effect_tokens": 5353,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.54182,
+      "grad_norm": 3.1551902294158936,
+      "learning_rate": 1.8272589000028774e-05,
+      "loss": 1.8591,
+      "step": 596
+    },
+    {
+      "batch_num_effect_tokens": 8126,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.54273,
+      "grad_norm": 2.9723100662231445,
+      "learning_rate": 1.826366438671088e-05,
+      "loss": 2.0776,
+      "step": 597
+    },
+    {
+      "batch_num_effect_tokens": 5902,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52102,
+      "epoch": 0.54364,
+      "grad_norm": 3.1751818656921387,
+      "learning_rate": 1.825471896962774e-05,
+      "loss": 1.9042,
+      "step": 598
+    },
+    {
+      "batch_num_effect_tokens": 7567,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.54455,
+      "grad_norm": 2.933289051055908,
+      "learning_rate": 1.8245752771299426e-05,
+      "loss": 2.1582,
+      "step": 599
+    },
+    {
+      "batch_num_effect_tokens": 6325,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.54545,
+      "grad_norm": 3.5300397872924805,
+      "learning_rate": 1.8236765814298328e-05,
+      "loss": 2.0646,
+      "step": 600
+    },
+    {
+      "batch_num_effect_tokens": 4882,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52089,
+      "epoch": 0.54636,
+      "grad_norm": 3.5921027660369873,
+      "learning_rate": 1.8227758121249108e-05,
+      "loss": 1.7715,
+      "step": 601
+    },
+    {
+      "batch_num_effect_tokens": 6232,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.54727,
+      "grad_norm": 3.275217294692993,
+      "learning_rate": 1.8218729714828612e-05,
+      "loss": 2.0114,
+      "step": 602
+    },
+    {
+      "batch_num_effect_tokens": 6150,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.54818,
+      "grad_norm": 3.6507728099823,
+      "learning_rate": 1.820968061776585e-05,
+      "loss": 2.1635,
+      "step": 603
+    },
+    {
+      "batch_num_effect_tokens": 5137,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.54909,
+      "grad_norm": 3.0628061294555664,
+      "learning_rate": 1.8200610852841913e-05,
+      "loss": 1.5982,
+      "step": 604
+    },
+    {
+      "batch_num_effect_tokens": 5457,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.55,
+      "grad_norm": 3.211014986038208,
+      "learning_rate": 1.819152044288992e-05,
+      "loss": 1.7999,
+      "step": 605
+    },
+    {
+      "batch_num_effect_tokens": 6750,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.55091,
+      "grad_norm": 3.275047540664673,
+      "learning_rate": 1.818240941079497e-05,
+      "loss": 2.1458,
+      "step": 606
+    },
+    {
+      "batch_num_effect_tokens": 5085,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 0.55182,
+      "grad_norm": 3.6827852725982666,
+      "learning_rate": 1.817327777949407e-05,
+      "loss": 1.8882,
+      "step": 607
+    },
+    {
+      "batch_num_effect_tokens": 6080,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.55273,
+      "grad_norm": 3.345921277999878,
+      "learning_rate": 1.81641255719761e-05,
+      "loss": 1.9049,
+      "step": 608
+    },
+    {
+      "batch_num_effect_tokens": 5989,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 0.55364,
+      "grad_norm": 3.1699001789093018,
+      "learning_rate": 1.8154952811281723e-05,
+      "loss": 1.7104,
+      "step": 609
+    },
+    {
+      "batch_num_effect_tokens": 6347,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 0.55455,
+      "grad_norm": 3.049734592437744,
+      "learning_rate": 1.814575952050336e-05,
+      "loss": 1.758,
+      "step": 610
+    },
+    {
+      "batch_num_effect_tokens": 6280,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50569,
+      "epoch": 0.55545,
+      "grad_norm": 3.2737529277801514,
+      "learning_rate": 1.81365457227851e-05,
+      "loss": 1.4851,
+      "step": 611
+    },
+    {
+      "batch_num_effect_tokens": 5818,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.55636,
+      "grad_norm": 3.4388372898101807,
+      "learning_rate": 1.812731144132268e-05,
+      "loss": 1.8755,
+      "step": 612
+    },
+    {
+      "batch_num_effect_tokens": 5249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.55727,
+      "grad_norm": 3.5681707859039307,
+      "learning_rate": 1.8118056699363386e-05,
+      "loss": 1.8418,
+      "step": 613
+    },
+    {
+      "batch_num_effect_tokens": 6035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.55818,
+      "grad_norm": 4.498365879058838,
+      "learning_rate": 1.810878152020602e-05,
+      "loss": 1.8722,
+      "step": 614
+    },
+    {
+      "batch_num_effect_tokens": 4748,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.55909,
+      "grad_norm": 3.7093660831451416,
+      "learning_rate": 1.809948592720084e-05,
+      "loss": 1.5109,
+      "step": 615
+    },
+    {
+      "batch_num_effect_tokens": 5118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.56,
+      "grad_norm": 3.8467259407043457,
+      "learning_rate": 1.8090169943749477e-05,
+      "loss": 1.65,
+      "step": 616
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52213,
+      "epoch": 0.56091,
+      "grad_norm": 3.0363943576812744,
+      "learning_rate": 1.8080833593304917e-05,
+      "loss": 2.1111,
+      "step": 617
+    },
+    {
+      "batch_num_effect_tokens": 5406,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 0.56182,
+      "grad_norm": 3.4476122856140137,
+      "learning_rate": 1.8071476899371414e-05,
+      "loss": 1.5297,
+      "step": 618
+    },
+    {
+      "batch_num_effect_tokens": 7348,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52167,
+      "epoch": 0.56273,
+      "grad_norm": 3.2488205432891846,
+      "learning_rate": 1.806209988550443e-05,
+      "loss": 2.1373,
+      "step": 619
+    },
+    {
+      "batch_num_effect_tokens": 5017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.56364,
+      "grad_norm": 3.3816072940826416,
+      "learning_rate": 1.8052702575310588e-05,
+      "loss": 1.8264,
+      "step": 620
+    },
+    {
+      "batch_num_effect_tokens": 6569,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 0.56455,
+      "grad_norm": 3.2509372234344482,
+      "learning_rate": 1.8043284992447603e-05,
+      "loss": 2.0343,
+      "step": 621
+    },
+    {
+      "batch_num_effect_tokens": 5804,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.56545,
+      "grad_norm": 3.390310287475586,
+      "learning_rate": 1.803384716062423e-05,
+      "loss": 2.2642,
+      "step": 622
+    },
+    {
+      "batch_num_effect_tokens": 5884,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.56636,
+      "grad_norm": 3.26361346244812,
+      "learning_rate": 1.8024389103600196e-05,
+      "loss": 1.8734,
+      "step": 623
+    },
+    {
+      "batch_num_effect_tokens": 5589,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.56727,
+      "grad_norm": 3.1460154056549072,
+      "learning_rate": 1.8014910845186154e-05,
+      "loss": 1.8174,
+      "step": 624
+    },
+    {
+      "batch_num_effect_tokens": 8632,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 0.56818,
+      "grad_norm": 2.8307669162750244,
+      "learning_rate": 1.8005412409243604e-05,
+      "loss": 2.0944,
+      "step": 625
+    },
+    {
+      "batch_num_effect_tokens": 5966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.56909,
+      "grad_norm": 3.4551782608032227,
+      "learning_rate": 1.799589381968485e-05,
+      "loss": 2.0861,
+      "step": 626
+    },
+    {
+      "batch_num_effect_tokens": 7451,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.57,
+      "grad_norm": 2.963189125061035,
+      "learning_rate": 1.798635510047293e-05,
+      "loss": 1.8454,
+      "step": 627
+    },
+    {
+      "batch_num_effect_tokens": 6263,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.57091,
+      "grad_norm": 3.049030065536499,
+      "learning_rate": 1.7976796275621556e-05,
+      "loss": 1.8021,
+      "step": 628
+    },
+    {
+      "batch_num_effect_tokens": 5990,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.57182,
+      "grad_norm": 3.6765613555908203,
+      "learning_rate": 1.7967217369195058e-05,
+      "loss": 1.9847,
+      "step": 629
+    },
+    {
+      "batch_num_effect_tokens": 9942,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.57273,
+      "grad_norm": 2.79931902885437,
+      "learning_rate": 1.7957618405308323e-05,
+      "loss": 2.1257,
+      "step": 630
+    },
+    {
+      "batch_num_effect_tokens": 7262,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.57364,
+      "grad_norm": 3.0973598957061768,
+      "learning_rate": 1.794799940812673e-05,
+      "loss": 1.9928,
+      "step": 631
+    },
+    {
+      "batch_num_effect_tokens": 5831,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.57455,
+      "grad_norm": 3.324289321899414,
+      "learning_rate": 1.7938360401866096e-05,
+      "loss": 1.8391,
+      "step": 632
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.57545,
+      "grad_norm": 3.042884111404419,
+      "learning_rate": 1.79287014107926e-05,
+      "loss": 2.0706,
+      "step": 633
+    },
+    {
+      "batch_num_effect_tokens": 5643,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 0.57636,
+      "grad_norm": 3.443073272705078,
+      "learning_rate": 1.7919022459222754e-05,
+      "loss": 1.966,
+      "step": 634
+    },
+    {
+      "batch_num_effect_tokens": 4436,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.57727,
+      "grad_norm": 3.825007438659668,
+      "learning_rate": 1.7909323571523295e-05,
+      "loss": 2.0158,
+      "step": 635
+    },
+    {
+      "batch_num_effect_tokens": 4307,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.57818,
+      "grad_norm": 3.416240692138672,
+      "learning_rate": 1.7899604772111163e-05,
+      "loss": 1.417,
+      "step": 636
+    },
+    {
+      "batch_num_effect_tokens": 6263,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.57909,
+      "grad_norm": 3.2189557552337646,
+      "learning_rate": 1.788986608545343e-05,
+      "loss": 1.8994,
+      "step": 637
+    },
+    {
+      "batch_num_effect_tokens": 6508,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52217,
+      "epoch": 0.58,
+      "grad_norm": 3.308938503265381,
+      "learning_rate": 1.788010753606722e-05,
+      "loss": 2.2697,
+      "step": 638
+    },
+    {
+      "batch_num_effect_tokens": 8259,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.58091,
+      "grad_norm": 3.0805373191833496,
+      "learning_rate": 1.7870329148519675e-05,
+      "loss": 2.0879,
+      "step": 639
+    },
+    {
+      "batch_num_effect_tokens": 8419,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.58182,
+      "grad_norm": 3.1611268520355225,
+      "learning_rate": 1.7860530947427878e-05,
+      "loss": 1.9171,
+      "step": 640
+    },
+    {
+      "batch_num_effect_tokens": 5999,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52216,
+      "epoch": 0.58273,
+      "grad_norm": 3.2548370361328125,
+      "learning_rate": 1.7850712957458777e-05,
+      "loss": 1.7961,
+      "step": 641
+    },
+    {
+      "batch_num_effect_tokens": 5583,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 0.58364,
+      "grad_norm": 3.7277276515960693,
+      "learning_rate": 1.784087520332916e-05,
+      "loss": 1.6307,
+      "step": 642
+    },
+    {
+      "batch_num_effect_tokens": 3962,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.58455,
+      "grad_norm": 3.814300060272217,
+      "learning_rate": 1.7831017709805555e-05,
+      "loss": 1.6648,
+      "step": 643
+    },
+    {
+      "batch_num_effect_tokens": 9244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.58545,
+      "grad_norm": 2.9811148643493652,
+      "learning_rate": 1.7821140501704195e-05,
+      "loss": 1.951,
+      "step": 644
+    },
+    {
+      "batch_num_effect_tokens": 6123,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.58636,
+      "grad_norm": 3.252951145172119,
+      "learning_rate": 1.7811243603890934e-05,
+      "loss": 1.8387,
+      "step": 645
+    },
+    {
+      "batch_num_effect_tokens": 6517,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 0.58727,
+      "grad_norm": 3.345839023590088,
+      "learning_rate": 1.780132704128121e-05,
+      "loss": 1.9952,
+      "step": 646
+    },
+    {
+      "batch_num_effect_tokens": 7450,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52175,
+      "epoch": 0.58818,
+      "grad_norm": 3.410034656524658,
+      "learning_rate": 1.7791390838839946e-05,
+      "loss": 2.075,
+      "step": 647
+    },
+    {
+      "batch_num_effect_tokens": 7236,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 0.58909,
+      "grad_norm": 3.4517953395843506,
+      "learning_rate": 1.7781435021581527e-05,
+      "loss": 2.0206,
+      "step": 648
+    },
+    {
+      "batch_num_effect_tokens": 8934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.59,
+      "grad_norm": 3.131535768508911,
+      "learning_rate": 1.777145961456971e-05,
+      "loss": 2.4076,
+      "step": 649
+    },
+    {
+      "batch_num_effect_tokens": 11262,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.59091,
+      "grad_norm": 3.0515923500061035,
+      "learning_rate": 1.776146464291757e-05,
+      "loss": 2.2349,
+      "step": 650
+    },
+    {
+      "batch_num_effect_tokens": 8278,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.59182,
+      "grad_norm": 3.1197054386138916,
+      "learning_rate": 1.7751450131787435e-05,
+      "loss": 2.2192,
+      "step": 651
+    },
+    {
+      "batch_num_effect_tokens": 6649,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.59273,
+      "grad_norm": 3.2355058193206787,
+      "learning_rate": 1.7741416106390828e-05,
+      "loss": 2.0891,
+      "step": 652
+    },
+    {
+      "batch_num_effect_tokens": 6802,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.59364,
+      "grad_norm": 3.1941349506378174,
+      "learning_rate": 1.773136259198839e-05,
+      "loss": 2.1189,
+      "step": 653
+    },
+    {
+      "batch_num_effect_tokens": 6640,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.59455,
+      "grad_norm": 3.1458933353424072,
+      "learning_rate": 1.7721289613889835e-05,
+      "loss": 2.1445,
+      "step": 654
+    },
+    {
+      "batch_num_effect_tokens": 7602,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.59545,
+      "grad_norm": 2.9972329139709473,
+      "learning_rate": 1.771119719745388e-05,
+      "loss": 2.2091,
+      "step": 655
+    },
+    {
+      "batch_num_effect_tokens": 5104,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.59636,
+      "grad_norm": 3.2239010334014893,
+      "learning_rate": 1.7701085368088157e-05,
+      "loss": 1.6984,
+      "step": 656
+    },
+    {
+      "batch_num_effect_tokens": 7291,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.59727,
+      "grad_norm": 3.082944631576538,
+      "learning_rate": 1.7690954151249196e-05,
+      "loss": 2.0532,
+      "step": 657
+    },
+    {
+      "batch_num_effect_tokens": 6642,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50519,
+      "epoch": 0.59818,
+      "grad_norm": 3.380568504333496,
+      "learning_rate": 1.768080357244232e-05,
+      "loss": 1.9457,
+      "step": 658
+    },
+    {
+      "batch_num_effect_tokens": 6324,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.59909,
+      "grad_norm": 3.1841609477996826,
+      "learning_rate": 1.7670633657221602e-05,
+      "loss": 2.0081,
+      "step": 659
+    },
+    {
+      "batch_num_effect_tokens": 8588,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 0.6,
+      "grad_norm": 3.0000061988830566,
+      "learning_rate": 1.766044443118978e-05,
+      "loss": 2.34,
+      "step": 660
+    },
+    {
+      "batch_num_effect_tokens": 5045,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.60091,
+      "grad_norm": 3.462127685546875,
+      "learning_rate": 1.7650235919998234e-05,
+      "loss": 1.8486,
+      "step": 661
+    },
+    {
+      "batch_num_effect_tokens": 7103,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52114,
+      "epoch": 0.60182,
+      "grad_norm": 3.326214075088501,
+      "learning_rate": 1.7640008149346866e-05,
+      "loss": 2.0843,
+      "step": 662
+    },
+    {
+      "batch_num_effect_tokens": 5104,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.60273,
+      "grad_norm": 3.820380449295044,
+      "learning_rate": 1.7629761144984087e-05,
+      "loss": 1.4826,
+      "step": 663
+    },
+    {
+      "batch_num_effect_tokens": 8910,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.60364,
+      "grad_norm": 2.995626211166382,
+      "learning_rate": 1.761949493270671e-05,
+      "loss": 2.3316,
+      "step": 664
+    },
+    {
+      "batch_num_effect_tokens": 8323,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.60455,
+      "grad_norm": 3.0097763538360596,
+      "learning_rate": 1.7609209538359917e-05,
+      "loss": 2.2883,
+      "step": 665
+    },
+    {
+      "batch_num_effect_tokens": 7378,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.60545,
+      "grad_norm": 3.102829933166504,
+      "learning_rate": 1.759890498783717e-05,
+      "loss": 1.9407,
+      "step": 666
+    },
+    {
+      "batch_num_effect_tokens": 4611,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.60636,
+      "grad_norm": 3.913276433944702,
+      "learning_rate": 1.758858130708017e-05,
+      "loss": 1.9572,
+      "step": 667
+    },
+    {
+      "batch_num_effect_tokens": 7009,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.60727,
+      "grad_norm": 3.0823023319244385,
+      "learning_rate": 1.757823852207877e-05,
+      "loss": 1.5677,
+      "step": 668
+    },
+    {
+      "batch_num_effect_tokens": 7465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 0.60818,
+      "grad_norm": 3.0849194526672363,
+      "learning_rate": 1.7567876658870917e-05,
+      "loss": 2.2721,
+      "step": 669
+    },
+    {
+      "batch_num_effect_tokens": 8093,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52217,
+      "epoch": 0.60909,
+      "grad_norm": 3.021416664123535,
+      "learning_rate": 1.7557495743542586e-05,
+      "loss": 2.2716,
+      "step": 670
+    },
+    {
+      "batch_num_effect_tokens": 9927,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.61,
+      "grad_norm": 2.8188624382019043,
+      "learning_rate": 1.7547095802227723e-05,
+      "loss": 2.3289,
+      "step": 671
+    },
+    {
+      "batch_num_effect_tokens": 6864,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52165,
+      "epoch": 0.61091,
+      "grad_norm": 3.308123826980591,
+      "learning_rate": 1.7536676861108167e-05,
+      "loss": 2.0859,
+      "step": 672
+    },
+    {
+      "batch_num_effect_tokens": 3919,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.61182,
+      "grad_norm": 3.851428270339966,
+      "learning_rate": 1.752623894641359e-05,
+      "loss": 1.6075,
+      "step": 673
+    },
+    {
+      "batch_num_effect_tokens": 5389,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.61273,
+      "grad_norm": 3.3875057697296143,
+      "learning_rate": 1.7515782084421426e-05,
+      "loss": 1.3186,
+      "step": 674
+    },
+    {
+      "batch_num_effect_tokens": 4684,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.61364,
+      "grad_norm": 3.598844289779663,
+      "learning_rate": 1.7505306301456823e-05,
+      "loss": 1.5316,
+      "step": 675
+    },
+    {
+      "batch_num_effect_tokens": 7285,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.61455,
+      "grad_norm": 3.2655041217803955,
+      "learning_rate": 1.7494811623892543e-05,
+      "loss": 1.7023,
+      "step": 676
+    },
+    {
+      "batch_num_effect_tokens": 5171,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.61545,
+      "grad_norm": 3.2970337867736816,
+      "learning_rate": 1.7484298078148926e-05,
+      "loss": 1.5864,
+      "step": 677
+    },
+    {
+      "batch_num_effect_tokens": 8958,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.61636,
+      "grad_norm": 3.159494638442993,
+      "learning_rate": 1.7473765690693812e-05,
+      "loss": 2.0336,
+      "step": 678
+    },
+    {
+      "batch_num_effect_tokens": 7201,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52144,
+      "epoch": 0.61727,
+      "grad_norm": 3.675417184829712,
+      "learning_rate": 1.7463214488042472e-05,
+      "loss": 2.2643,
+      "step": 679
+    },
+    {
+      "batch_num_effect_tokens": 5768,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.61818,
+      "grad_norm": 3.556011199951172,
+      "learning_rate": 1.745264449675755e-05,
+      "loss": 1.975,
+      "step": 680
+    },
+    {
+      "batch_num_effect_tokens": 6721,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52177,
+      "epoch": 0.61909,
+      "grad_norm": 3.3262288570404053,
+      "learning_rate": 1.744205574344898e-05,
+      "loss": 2.1309,
+      "step": 681
+    },
+    {
+      "batch_num_effect_tokens": 5329,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 0.62,
+      "grad_norm": 3.306814193725586,
+      "learning_rate": 1.7431448254773943e-05,
+      "loss": 1.461,
+      "step": 682
+    },
+    {
+      "batch_num_effect_tokens": 6465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52212,
+      "epoch": 0.62091,
+      "grad_norm": 2.9740865230560303,
+      "learning_rate": 1.7420822057436777e-05,
+      "loss": 1.4722,
+      "step": 683
+    },
+    {
+      "batch_num_effect_tokens": 4547,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.62182,
+      "grad_norm": 3.394707679748535,
+      "learning_rate": 1.7410177178188917e-05,
+      "loss": 1.2162,
+      "step": 684
+    },
+    {
+      "batch_num_effect_tokens": 5792,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.62273,
+      "grad_norm": 3.283432722091675,
+      "learning_rate": 1.739951364382884e-05,
+      "loss": 1.5428,
+      "step": 685
+    },
+    {
+      "batch_num_effect_tokens": 6506,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 0.62364,
+      "grad_norm": 3.075127601623535,
+      "learning_rate": 1.738883148120198e-05,
+      "loss": 2.0264,
+      "step": 686
+    },
+    {
+      "batch_num_effect_tokens": 7452,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.62455,
+      "grad_norm": 3.2405524253845215,
+      "learning_rate": 1.737813071720066e-05,
+      "loss": 1.8857,
+      "step": 687
+    },
+    {
+      "batch_num_effect_tokens": 6921,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.62545,
+      "grad_norm": 3.0870354175567627,
+      "learning_rate": 1.736741137876405e-05,
+      "loss": 1.9788,
+      "step": 688
+    },
+    {
+      "batch_num_effect_tokens": 6641,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.62636,
+      "grad_norm": 3.2581539154052734,
+      "learning_rate": 1.7356673492878073e-05,
+      "loss": 1.8177,
+      "step": 689
+    },
+    {
+      "batch_num_effect_tokens": 7005,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.62727,
+      "grad_norm": 3.6070356369018555,
+      "learning_rate": 1.734591708657533e-05,
+      "loss": 2.2334,
+      "step": 690
+    },
+    {
+      "batch_num_effect_tokens": 5194,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 0.62818,
+      "grad_norm": 3.5002195835113525,
+      "learning_rate": 1.7335142186935083e-05,
+      "loss": 1.6752,
+      "step": 691
+    },
+    {
+      "batch_num_effect_tokens": 4651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.62909,
+      "grad_norm": 3.570697784423828,
+      "learning_rate": 1.732434882108311e-05,
+      "loss": 1.7339,
+      "step": 692
+    },
+    {
+      "batch_num_effect_tokens": 6218,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.63,
+      "grad_norm": 3.5183212757110596,
+      "learning_rate": 1.7313537016191706e-05,
+      "loss": 1.883,
+      "step": 693
+    },
+    {
+      "batch_num_effect_tokens": 8700,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.63091,
+      "grad_norm": 3.099947690963745,
+      "learning_rate": 1.7302706799479575e-05,
+      "loss": 2.0278,
+      "step": 694
+    },
+    {
+      "batch_num_effect_tokens": 5236,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.63182,
+      "grad_norm": 3.500469207763672,
+      "learning_rate": 1.7291858198211772e-05,
+      "loss": 1.52,
+      "step": 695
+    },
+    {
+      "batch_num_effect_tokens": 6232,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52131,
+      "epoch": 0.63273,
+      "grad_norm": 3.528430938720703,
+      "learning_rate": 1.7280991239699643e-05,
+      "loss": 2.1318,
+      "step": 696
+    },
+    {
+      "batch_num_effect_tokens": 5888,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.63364,
+      "grad_norm": 3.3069233894348145,
+      "learning_rate": 1.727010595130074e-05,
+      "loss": 1.8845,
+      "step": 697
+    },
+    {
+      "batch_num_effect_tokens": 8851,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 0.63455,
+      "grad_norm": 2.9502975940704346,
+      "learning_rate": 1.7259202360418765e-05,
+      "loss": 2.3618,
+      "step": 698
+    },
+    {
+      "batch_num_effect_tokens": 6625,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.63545,
+      "grad_norm": 3.252659320831299,
+      "learning_rate": 1.724828049450349e-05,
+      "loss": 2.0632,
+      "step": 699
+    },
+    {
+      "batch_num_effect_tokens": 3409,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 0.63636,
+      "grad_norm": 3.150601863861084,
+      "learning_rate": 1.72373403810507e-05,
+      "loss": 1.1395,
+      "step": 700
+    },
+    {
+      "batch_num_effect_tokens": 4073,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.63727,
+      "grad_norm": 3.0718331336975098,
+      "learning_rate": 1.722638204760213e-05,
+      "loss": 1.2115,
+      "step": 701
+    },
+    {
+      "batch_num_effect_tokens": 7952,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.63818,
+      "grad_norm": 3.1261394023895264,
+      "learning_rate": 1.7215405521745358e-05,
+      "loss": 1.9513,
+      "step": 702
+    },
+    {
+      "batch_num_effect_tokens": 10026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.63909,
+      "grad_norm": 2.8539605140686035,
+      "learning_rate": 1.7204410831113778e-05,
+      "loss": 2.3353,
+      "step": 703
+    },
+    {
+      "batch_num_effect_tokens": 8168,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.64,
+      "grad_norm": 3.016489267349243,
+      "learning_rate": 1.7193398003386514e-05,
+      "loss": 2.254,
+      "step": 704
+    },
+    {
+      "batch_num_effect_tokens": 6234,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.64091,
+      "grad_norm": 3.294809103012085,
+      "learning_rate": 1.7182367066288344e-05,
+      "loss": 2.1232,
+      "step": 705
+    },
+    {
+      "batch_num_effect_tokens": 9518,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.64182,
+      "grad_norm": 2.9158883094787598,
+      "learning_rate": 1.7171318047589637e-05,
+      "loss": 2.1875,
+      "step": 706
+    },
+    {
+      "batch_num_effect_tokens": 5380,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.64273,
+      "grad_norm": 3.651305913925171,
+      "learning_rate": 1.7160250975106286e-05,
+      "loss": 1.8801,
+      "step": 707
+    },
+    {
+      "batch_num_effect_tokens": 6389,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 0.64364,
+      "grad_norm": 3.2613284587860107,
+      "learning_rate": 1.7149165876699635e-05,
+      "loss": 1.9556,
+      "step": 708
+    },
+    {
+      "batch_num_effect_tokens": 5791,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.64455,
+      "grad_norm": 3.0628511905670166,
+      "learning_rate": 1.7138062780276404e-05,
+      "loss": 1.6504,
+      "step": 709
+    },
+    {
+      "batch_num_effect_tokens": 5752,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.64545,
+      "grad_norm": 3.2969539165496826,
+      "learning_rate": 1.7126941713788633e-05,
+      "loss": 2.0699,
+      "step": 710
+    },
+    {
+      "batch_num_effect_tokens": 5496,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.64636,
+      "grad_norm": 3.300597667694092,
+      "learning_rate": 1.7115802705233576e-05,
+      "loss": 1.7136,
+      "step": 711
+    },
+    {
+      "batch_num_effect_tokens": 6962,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 0.64727,
+      "grad_norm": 3.275751829147339,
+      "learning_rate": 1.710464578265369e-05,
+      "loss": 2.0422,
+      "step": 712
+    },
+    {
+      "batch_num_effect_tokens": 5475,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.64818,
+      "grad_norm": 3.213657855987549,
+      "learning_rate": 1.7093470974136505e-05,
+      "loss": 1.8713,
+      "step": 713
+    },
+    {
+      "batch_num_effect_tokens": 7187,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.64909,
+      "grad_norm": 3.5997188091278076,
+      "learning_rate": 1.7082278307814593e-05,
+      "loss": 1.8454,
+      "step": 714
+    },
+    {
+      "batch_num_effect_tokens": 6372,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.65,
+      "grad_norm": 3.3948583602905273,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 2.0798,
+      "step": 715
+    },
+    {
+      "batch_num_effect_tokens": 5744,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.65091,
+      "grad_norm": 3.3523178100585938,
+      "learning_rate": 1.7059839514511565e-05,
+      "loss": 1.8491,
+      "step": 716
+    },
+    {
+      "batch_num_effect_tokens": 6495,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.65182,
+      "grad_norm": 3.0404746532440186,
+      "learning_rate": 1.7048593444020084e-05,
+      "loss": 1.8901,
+      "step": 717
+    },
+    {
+      "batch_num_effect_tokens": 4383,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.65273,
+      "grad_norm": 3.1420235633850098,
+      "learning_rate": 1.7037329628703005e-05,
+      "loss": 1.4262,
+      "step": 718
+    },
+    {
+      "batch_num_effect_tokens": 8547,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 0.65364,
+      "grad_norm": 2.8673055171966553,
+      "learning_rate": 1.702604809691697e-05,
+      "loss": 2.0797,
+      "step": 719
+    },
+    {
+      "batch_num_effect_tokens": 7211,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.65455,
+      "grad_norm": 3.111631155014038,
+      "learning_rate": 1.7014748877063212e-05,
+      "loss": 1.7795,
+      "step": 720
+    },
+    {
+      "batch_num_effect_tokens": 5651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.65545,
+      "grad_norm": 3.360675573348999,
+      "learning_rate": 1.7003431997587516e-05,
+      "loss": 1.6566,
+      "step": 721
+    },
+    {
+      "batch_num_effect_tokens": 5821,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52125,
+      "epoch": 0.65636,
+      "grad_norm": 3.098942995071411,
+      "learning_rate": 1.6992097486980107e-05,
+      "loss": 1.5144,
+      "step": 722
+    },
+    {
+      "batch_num_effect_tokens": 7546,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.65727,
+      "grad_norm": 3.1112258434295654,
+      "learning_rate": 1.6980745373775604e-05,
+      "loss": 2.0011,
+      "step": 723
+    },
+    {
+      "batch_num_effect_tokens": 5119,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 0.65818,
+      "grad_norm": 3.3119330406188965,
+      "learning_rate": 1.696937568655294e-05,
+      "loss": 1.4487,
+      "step": 724
+    },
+    {
+      "batch_num_effect_tokens": 6011,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.65909,
+      "grad_norm": 3.974515199661255,
+      "learning_rate": 1.6957988453935276e-05,
+      "loss": 2.169,
+      "step": 725
+    },
+    {
+      "batch_num_effect_tokens": 5712,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52216,
+      "epoch": 0.66,
+      "grad_norm": 3.2805824279785156,
+      "learning_rate": 1.6946583704589973e-05,
+      "loss": 1.5914,
+      "step": 726
+    },
+    {
+      "batch_num_effect_tokens": 5982,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.66091,
+      "grad_norm": 3.1364593505859375,
+      "learning_rate": 1.6935161467228466e-05,
+      "loss": 1.4481,
+      "step": 727
+    },
+    {
+      "batch_num_effect_tokens": 4891,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.66182,
+      "grad_norm": 3.0658605098724365,
+      "learning_rate": 1.692372177060623e-05,
+      "loss": 1.5435,
+      "step": 728
+    },
+    {
+      "batch_num_effect_tokens": 6283,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52121,
+      "epoch": 0.66273,
+      "grad_norm": 4.253247261047363,
+      "learning_rate": 1.691226464352268e-05,
+      "loss": 1.808,
+      "step": 729
+    },
+    {
+      "batch_num_effect_tokens": 7333,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.66364,
+      "grad_norm": 3.105001211166382,
+      "learning_rate": 1.6900790114821122e-05,
+      "loss": 2.1523,
+      "step": 730
+    },
+    {
+      "batch_num_effect_tokens": 6461,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.66455,
+      "grad_norm": 3.2539286613464355,
+      "learning_rate": 1.688929821338867e-05,
+      "loss": 1.9806,
+      "step": 731
+    },
+    {
+      "batch_num_effect_tokens": 7060,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.66545,
+      "grad_norm": 3.383930206298828,
+      "learning_rate": 1.6877788968156172e-05,
+      "loss": 1.7445,
+      "step": 732
+    },
+    {
+      "batch_num_effect_tokens": 6503,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52110,
+      "epoch": 0.66636,
+      "grad_norm": 3.3413214683532715,
+      "learning_rate": 1.6866262408098134e-05,
+      "loss": 1.8225,
+      "step": 733
+    },
+    {
+      "batch_num_effect_tokens": 8637,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.66727,
+      "grad_norm": 2.968226671218872,
+      "learning_rate": 1.685471856223267e-05,
+      "loss": 2.0853,
+      "step": 734
+    },
+    {
+      "batch_num_effect_tokens": 11260,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.66818,
+      "grad_norm": 2.7590177059173584,
+      "learning_rate": 1.6843157459621386e-05,
+      "loss": 2.2001,
+      "step": 735
+    },
+    {
+      "batch_num_effect_tokens": 7229,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 0.66909,
+      "grad_norm": 3.267742872238159,
+      "learning_rate": 1.6831579129369347e-05,
+      "loss": 1.9719,
+      "step": 736
+    },
+    {
+      "batch_num_effect_tokens": 7764,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.67,
+      "grad_norm": 3.2558839321136475,
+      "learning_rate": 1.6819983600624986e-05,
+      "loss": 1.9983,
+      "step": 737
+    },
+    {
+      "batch_num_effect_tokens": 7576,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.67091,
+      "grad_norm": 3.4332239627838135,
+      "learning_rate": 1.6808370902580034e-05,
+      "loss": 2.0056,
+      "step": 738
+    },
+    {
+      "batch_num_effect_tokens": 4863,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.67182,
+      "grad_norm": 3.064371347427368,
+      "learning_rate": 1.6796741064469446e-05,
+      "loss": 1.2753,
+      "step": 739
+    },
+    {
+      "batch_num_effect_tokens": 7655,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.67273,
+      "grad_norm": 3.035062074661255,
+      "learning_rate": 1.6785094115571323e-05,
+      "loss": 1.9743,
+      "step": 740
+    },
+    {
+      "batch_num_effect_tokens": 6245,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.67364,
+      "grad_norm": 3.305992841720581,
+      "learning_rate": 1.677343008520685e-05,
+      "loss": 1.8482,
+      "step": 741
+    },
+    {
+      "batch_num_effect_tokens": 5362,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.67455,
+      "grad_norm": 8.169853210449219,
+      "learning_rate": 1.6761749002740195e-05,
+      "loss": 2.1479,
+      "step": 742
+    },
+    {
+      "batch_num_effect_tokens": 7209,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 0.67545,
+      "grad_norm": 3.5025248527526855,
+      "learning_rate": 1.6750050897578484e-05,
+      "loss": 2.3072,
+      "step": 743
+    },
+    {
+      "batch_num_effect_tokens": 4681,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.67636,
+      "grad_norm": 3.109935760498047,
+      "learning_rate": 1.673833579917168e-05,
+      "loss": 1.4393,
+      "step": 744
+    },
+    {
+      "batch_num_effect_tokens": 5669,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52122,
+      "epoch": 0.67727,
+      "grad_norm": 3.120631694793701,
+      "learning_rate": 1.6726603737012527e-05,
+      "loss": 1.6593,
+      "step": 745
+    },
+    {
+      "batch_num_effect_tokens": 6840,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.67818,
+      "grad_norm": 3.159054756164551,
+      "learning_rate": 1.6714854740636477e-05,
+      "loss": 2.0482,
+      "step": 746
+    },
+    {
+      "batch_num_effect_tokens": 9810,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.67909,
+      "grad_norm": 3.1278321743011475,
+      "learning_rate": 1.6703088839621616e-05,
+      "loss": 2.3994,
+      "step": 747
+    },
+    {
+      "batch_num_effect_tokens": 7824,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.68,
+      "grad_norm": 3.290961980819702,
+      "learning_rate": 1.6691306063588583e-05,
+      "loss": 2.0672,
+      "step": 748
+    },
+    {
+      "batch_num_effect_tokens": 5326,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.68091,
+      "grad_norm": 3.196922540664673,
+      "learning_rate": 1.6679506442200508e-05,
+      "loss": 2.0027,
+      "step": 749
+    },
+    {
+      "batch_num_effect_tokens": 4276,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.68182,
+      "grad_norm": 3.541163444519043,
+      "learning_rate": 1.666769000516292e-05,
+      "loss": 1.5725,
+      "step": 750
+    },
+    {
+      "batch_num_effect_tokens": 8160,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.68273,
+      "grad_norm": 2.104133129119873,
+      "learning_rate": 1.6655856782223682e-05,
+      "loss": 1.1344,
+      "step": 751
+    },
+    {
+      "batch_num_effect_tokens": 8092,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.68364,
+      "grad_norm": 3.19911789894104,
+      "learning_rate": 1.6644006803172926e-05,
+      "loss": 2.2778,
+      "step": 752
+    },
+    {
+      "batch_num_effect_tokens": 8542,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52192,
+      "epoch": 0.68455,
+      "grad_norm": 3.007779121398926,
+      "learning_rate": 1.6632140097842953e-05,
+      "loss": 2.2804,
+      "step": 753
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52112,
+      "epoch": 0.68545,
+      "grad_norm": 3.221501111984253,
+      "learning_rate": 1.6620256696108187e-05,
+      "loss": 2.1006,
+      "step": 754
+    },
+    {
+      "batch_num_effect_tokens": 6048,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.68636,
+      "grad_norm": 3.1645705699920654,
+      "learning_rate": 1.660835662788507e-05,
+      "loss": 2.0649,
+      "step": 755
+    },
+    {
+      "batch_num_effect_tokens": 6160,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.68727,
+      "grad_norm": 3.147974729537964,
+      "learning_rate": 1.6596439923132016e-05,
+      "loss": 1.666,
+      "step": 756
+    },
+    {
+      "batch_num_effect_tokens": 5991,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.68818,
+      "grad_norm": 3.1017847061157227,
+      "learning_rate": 1.6584506611849313e-05,
+      "loss": 1.7526,
+      "step": 757
+    },
+    {
+      "batch_num_effect_tokens": 7163,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.68909,
+      "grad_norm": 3.19863224029541,
+      "learning_rate": 1.6572556724079055e-05,
+      "loss": 1.8921,
+      "step": 758
+    },
+    {
+      "batch_num_effect_tokens": 7250,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.69,
+      "grad_norm": 3.080247640609741,
+      "learning_rate": 1.6560590289905074e-05,
+      "loss": 1.9408,
+      "step": 759
+    },
+    {
+      "batch_num_effect_tokens": 9067,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.69091,
+      "grad_norm": 3.105419874191284,
+      "learning_rate": 1.6548607339452853e-05,
+      "loss": 2.2802,
+      "step": 760
+    },
+    {
+      "batch_num_effect_tokens": 6394,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52155,
+      "epoch": 0.69182,
+      "grad_norm": 3.3251821994781494,
+      "learning_rate": 1.6536607902889453e-05,
+      "loss": 2.0261,
+      "step": 761
+    },
+    {
+      "batch_num_effect_tokens": 5741,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.69273,
+      "grad_norm": 3.2268760204315186,
+      "learning_rate": 1.6524592010423444e-05,
+      "loss": 1.6041,
+      "step": 762
+    },
+    {
+      "batch_num_effect_tokens": 6934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52218,
+      "epoch": 0.69364,
+      "grad_norm": 3.1550705432891846,
+      "learning_rate": 1.651255969230482e-05,
+      "loss": 1.6185,
+      "step": 763
+    },
+    {
+      "batch_num_effect_tokens": 9521,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.69455,
+      "grad_norm": 2.332301616668701,
+      "learning_rate": 1.6500510978824928e-05,
+      "loss": 1.3868,
+      "step": 764
+    },
+    {
+      "batch_num_effect_tokens": 5507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.69545,
+      "grad_norm": 3.3931100368499756,
+      "learning_rate": 1.6488445900316388e-05,
+      "loss": 1.7854,
+      "step": 765
+    },
+    {
+      "batch_num_effect_tokens": 5535,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52136,
+      "epoch": 0.69636,
+      "grad_norm": 3.425886392593384,
+      "learning_rate": 1.6476364487153024e-05,
+      "loss": 1.8349,
+      "step": 766
+    },
+    {
+      "batch_num_effect_tokens": 7539,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52216,
+      "epoch": 0.69727,
+      "grad_norm": 3.0860488414764404,
+      "learning_rate": 1.6464266769749774e-05,
+      "loss": 2.1777,
+      "step": 767
+    },
+    {
+      "batch_num_effect_tokens": 4574,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 0.69818,
+      "grad_norm": 3.5921261310577393,
+      "learning_rate": 1.6452152778562633e-05,
+      "loss": 1.6029,
+      "step": 768
+    },
+    {
+      "batch_num_effect_tokens": 7124,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.69909,
+      "grad_norm": 2.9657366275787354,
+      "learning_rate": 1.6440022544088553e-05,
+      "loss": 1.6801,
+      "step": 769
+    },
+    {
+      "batch_num_effect_tokens": 7162,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.7,
+      "grad_norm": 3.191507577896118,
+      "learning_rate": 1.6427876096865394e-05,
+      "loss": 2.0602,
+      "step": 770
+    },
+    {
+      "batch_num_effect_tokens": 7288,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.70091,
+      "grad_norm": 3.031268358230591,
+      "learning_rate": 1.6415713467471817e-05,
+      "loss": 2.1469,
+      "step": 771
+    },
+    {
+      "batch_num_effect_tokens": 4286,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.70182,
+      "grad_norm": 3.0480797290802,
+      "learning_rate": 1.6403534686527223e-05,
+      "loss": 1.1808,
+      "step": 772
+    },
+    {
+      "batch_num_effect_tokens": 10561,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.70273,
+      "grad_norm": 2.8739101886749268,
+      "learning_rate": 1.6391339784691685e-05,
+      "loss": 2.2059,
+      "step": 773
+    },
+    {
+      "batch_num_effect_tokens": 6132,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.70364,
+      "grad_norm": 3.1238291263580322,
+      "learning_rate": 1.6379128792665853e-05,
+      "loss": 1.7822,
+      "step": 774
+    },
+    {
+      "batch_num_effect_tokens": 5755,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.70455,
+      "grad_norm": 3.3250069618225098,
+      "learning_rate": 1.6366901741190885e-05,
+      "loss": 1.8115,
+      "step": 775
+    },
+    {
+      "batch_num_effect_tokens": 5908,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.70545,
+      "grad_norm": 4.2498393058776855,
+      "learning_rate": 1.6354658661048364e-05,
+      "loss": 1.6392,
+      "step": 776
+    },
+    {
+      "batch_num_effect_tokens": 6284,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52205,
+      "epoch": 0.70636,
+      "grad_norm": 3.3605003356933594,
+      "learning_rate": 1.6342399583060234e-05,
+      "loss": 2.0054,
+      "step": 777
+    },
+    {
+      "batch_num_effect_tokens": 7090,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.70727,
+      "grad_norm": 3.663482666015625,
+      "learning_rate": 1.6330124538088705e-05,
+      "loss": 1.6892,
+      "step": 778
+    },
+    {
+      "batch_num_effect_tokens": 5537,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.70818,
+      "grad_norm": 3.340763568878174,
+      "learning_rate": 1.6317833557036193e-05,
+      "loss": 1.5238,
+      "step": 779
+    },
+    {
+      "batch_num_effect_tokens": 7954,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.70909,
+      "grad_norm": 2.9226419925689697,
+      "learning_rate": 1.6305526670845225e-05,
+      "loss": 1.8049,
+      "step": 780
+    },
+    {
+      "batch_num_effect_tokens": 5450,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.71,
+      "grad_norm": 3.6275925636291504,
+      "learning_rate": 1.6293203910498375e-05,
+      "loss": 1.8456,
+      "step": 781
+    },
+    {
+      "batch_num_effect_tokens": 7314,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.71091,
+      "grad_norm": 3.454686403274536,
+      "learning_rate": 1.6280865307018177e-05,
+      "loss": 2.1397,
+      "step": 782
+    },
+    {
+      "batch_num_effect_tokens": 7268,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.71182,
+      "grad_norm": 3.575932741165161,
+      "learning_rate": 1.6268510891467048e-05,
+      "loss": 2.07,
+      "step": 783
+    },
+    {
+      "batch_num_effect_tokens": 7145,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.71273,
+      "grad_norm": 3.2153611183166504,
+      "learning_rate": 1.6256140694947217e-05,
+      "loss": 1.788,
+      "step": 784
+    },
+    {
+      "batch_num_effect_tokens": 9017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.71364,
+      "grad_norm": 2.9733734130859375,
+      "learning_rate": 1.6243754748600637e-05,
+      "loss": 2.1755,
+      "step": 785
+    },
+    {
+      "batch_num_effect_tokens": 6388,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.71455,
+      "grad_norm": 3.064711809158325,
+      "learning_rate": 1.623135308360891e-05,
+      "loss": 1.6803,
+      "step": 786
+    },
+    {
+      "batch_num_effect_tokens": 6036,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.71545,
+      "grad_norm": 3.391117811203003,
+      "learning_rate": 1.6218935731193223e-05,
+      "loss": 2.102,
+      "step": 787
+    },
+    {
+      "batch_num_effect_tokens": 9903,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.71636,
+      "grad_norm": 2.822171449661255,
+      "learning_rate": 1.620650272261424e-05,
+      "loss": 2.1866,
+      "step": 788
+    },
+    {
+      "batch_num_effect_tokens": 6496,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.71727,
+      "grad_norm": 3.41025972366333,
+      "learning_rate": 1.6194054089172043e-05,
+      "loss": 2.017,
+      "step": 789
+    },
+    {
+      "batch_num_effect_tokens": 6931,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 0.71818,
+      "grad_norm": 3.487778425216675,
+      "learning_rate": 1.6181589862206053e-05,
+      "loss": 1.9893,
+      "step": 790
+    },
+    {
+      "batch_num_effect_tokens": 4613,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.71909,
+      "grad_norm": 2.997297763824463,
+      "learning_rate": 1.616911007309495e-05,
+      "loss": 1.387,
+      "step": 791
+    },
+    {
+      "batch_num_effect_tokens": 6661,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.72,
+      "grad_norm": 3.055971384048462,
+      "learning_rate": 1.6156614753256583e-05,
+      "loss": 1.896,
+      "step": 792
+    },
+    {
+      "batch_num_effect_tokens": 5808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.72091,
+      "grad_norm": 3.1818058490753174,
+      "learning_rate": 1.614410393414791e-05,
+      "loss": 1.488,
+      "step": 793
+    },
+    {
+      "batch_num_effect_tokens": 7035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.72182,
+      "grad_norm": 3.7383460998535156,
+      "learning_rate": 1.6131577647264903e-05,
+      "loss": 2.0533,
+      "step": 794
+    },
+    {
+      "batch_num_effect_tokens": 5100,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.72273,
+      "grad_norm": 4.117101192474365,
+      "learning_rate": 1.6119035924142468e-05,
+      "loss": 1.4298,
+      "step": 795
+    },
+    {
+      "batch_num_effect_tokens": 6566,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.72364,
+      "grad_norm": 3.8849780559539795,
+      "learning_rate": 1.6106478796354382e-05,
+      "loss": 1.943,
+      "step": 796
+    },
+    {
+      "batch_num_effect_tokens": 6001,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.72455,
+      "grad_norm": 3.643822193145752,
+      "learning_rate": 1.6093906295513202e-05,
+      "loss": 1.5991,
+      "step": 797
+    },
+    {
+      "batch_num_effect_tokens": 5333,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.72545,
+      "grad_norm": 3.197996139526367,
+      "learning_rate": 1.608131845327018e-05,
+      "loss": 1.6382,
+      "step": 798
+    },
+    {
+      "batch_num_effect_tokens": 2904,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.72636,
+      "grad_norm": 3.439863443374634,
+      "learning_rate": 1.6068715301315195e-05,
+      "loss": 0.8895,
+      "step": 799
+    },
+    {
+      "batch_num_effect_tokens": 10600,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.72727,
+      "grad_norm": 2.662959337234497,
+      "learning_rate": 1.6056096871376667e-05,
+      "loss": 2.3137,
+      "step": 800
+    },
+    {
+      "batch_num_effect_tokens": 5538,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.72818,
+      "grad_norm": 3.3200247287750244,
+      "learning_rate": 1.604346319522148e-05,
+      "loss": 1.7092,
+      "step": 801
+    },
+    {
+      "batch_num_effect_tokens": 6844,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.72909,
+      "grad_norm": 3.4481406211853027,
+      "learning_rate": 1.6030814304654895e-05,
+      "loss": 1.9916,
+      "step": 802
+    },
+    {
+      "batch_num_effect_tokens": 4004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.73,
+      "grad_norm": 4.091022491455078,
+      "learning_rate": 1.6018150231520486e-05,
+      "loss": 1.253,
+      "step": 803
+    },
+    {
+      "batch_num_effect_tokens": 5369,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 0.73091,
+      "grad_norm": 3.431154489517212,
+      "learning_rate": 1.6005471007700033e-05,
+      "loss": 1.5734,
+      "step": 804
+    },
+    {
+      "batch_num_effect_tokens": 5870,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.73182,
+      "grad_norm": 3.191513776779175,
+      "learning_rate": 1.599277666511347e-05,
+      "loss": 1.6448,
+      "step": 805
+    },
+    {
+      "batch_num_effect_tokens": 6880,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.73273,
+      "grad_norm": 19.013717651367188,
+      "learning_rate": 1.5980067235718793e-05,
+      "loss": 2.1849,
+      "step": 806
+    },
+    {
+      "batch_num_effect_tokens": 7321,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 0.73364,
+      "grad_norm": 3.0356898307800293,
+      "learning_rate": 1.596734275151197e-05,
+      "loss": 1.6153,
+      "step": 807
+    },
+    {
+      "batch_num_effect_tokens": 4577,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.73455,
+      "grad_norm": 3.775296449661255,
+      "learning_rate": 1.595460324452688e-05,
+      "loss": 1.5571,
+      "step": 808
+    },
+    {
+      "batch_num_effect_tokens": 5624,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.73545,
+      "grad_norm": 4.002346038818359,
+      "learning_rate": 1.5941848746835216e-05,
+      "loss": 1.9669,
+      "step": 809
+    },
+    {
+      "batch_num_effect_tokens": 5883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.73636,
+      "grad_norm": 3.727356195449829,
+      "learning_rate": 1.5929079290546408e-05,
+      "loss": 1.7648,
+      "step": 810
+    },
+    {
+      "batch_num_effect_tokens": 7143,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.73727,
+      "grad_norm": 3.607943534851074,
+      "learning_rate": 1.5916294907807547e-05,
+      "loss": 2.041,
+      "step": 811
+    },
+    {
+      "batch_num_effect_tokens": 5295,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.73818,
+      "grad_norm": 3.056272268295288,
+      "learning_rate": 1.5903495630803302e-05,
+      "loss": 1.3723,
+      "step": 812
+    },
+    {
+      "batch_num_effect_tokens": 5226,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.73909,
+      "grad_norm": 3.8680360317230225,
+      "learning_rate": 1.5890681491755838e-05,
+      "loss": 1.972,
+      "step": 813
+    },
+    {
+      "batch_num_effect_tokens": 4905,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.74,
+      "grad_norm": 3.7667770385742188,
+      "learning_rate": 1.5877852522924733e-05,
+      "loss": 1.6848,
+      "step": 814
+    },
+    {
+      "batch_num_effect_tokens": 6354,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.74091,
+      "grad_norm": 3.2568600177764893,
+      "learning_rate": 1.5865008756606905e-05,
+      "loss": 2.0856,
+      "step": 815
+    },
+    {
+      "batch_num_effect_tokens": 2948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 0.74182,
+      "grad_norm": 3.1932148933410645,
+      "learning_rate": 1.585215022513652e-05,
+      "loss": 0.774,
+      "step": 816
+    },
+    {
+      "batch_num_effect_tokens": 5706,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52111,
+      "epoch": 0.74273,
+      "grad_norm": 3.7973036766052246,
+      "learning_rate": 1.5839276960884906e-05,
+      "loss": 1.8912,
+      "step": 817
+    },
+    {
+      "batch_num_effect_tokens": 7678,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 0.74364,
+      "grad_norm": 3.9283089637756348,
+      "learning_rate": 1.5826388996260503e-05,
+      "loss": 2.115,
+      "step": 818
+    },
+    {
+      "batch_num_effect_tokens": 8872,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.74455,
+      "grad_norm": 3.237718343734741,
+      "learning_rate": 1.581348636370874e-05,
+      "loss": 2.1578,
+      "step": 819
+    },
+    {
+      "batch_num_effect_tokens": 6072,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.74545,
+      "grad_norm": 3.0094594955444336,
+      "learning_rate": 1.5800569095711983e-05,
+      "loss": 1.6896,
+      "step": 820
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.74636,
+      "grad_norm": 2.8689441680908203,
+      "learning_rate": 1.5787637224789434e-05,
+      "loss": 2.2964,
+      "step": 821
+    },
+    {
+      "batch_num_effect_tokens": 4579,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.74727,
+      "grad_norm": 3.3919966220855713,
+      "learning_rate": 1.5774690783497066e-05,
+      "loss": 1.5906,
+      "step": 822
+    },
+    {
+      "batch_num_effect_tokens": 7528,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.74818,
+      "grad_norm": 2.8416075706481934,
+      "learning_rate": 1.576172980442753e-05,
+      "loss": 1.7786,
+      "step": 823
+    },
+    {
+      "batch_num_effect_tokens": 6082,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.74909,
+      "grad_norm": 3.191624641418457,
+      "learning_rate": 1.5748754320210074e-05,
+      "loss": 1.7562,
+      "step": 824
+    },
+    {
+      "batch_num_effect_tokens": 8173,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.75,
+      "grad_norm": 3.029482364654541,
+      "learning_rate": 1.573576436351046e-05,
+      "loss": 2.0347,
+      "step": 825
+    },
+    {
+      "batch_num_effect_tokens": 4003,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.75091,
+      "grad_norm": 3.279954195022583,
+      "learning_rate": 1.5722759967030898e-05,
+      "loss": 1.1664,
+      "step": 826
+    },
+    {
+      "batch_num_effect_tokens": 5465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.75182,
+      "grad_norm": 6.8594183921813965,
+      "learning_rate": 1.5709741163509934e-05,
+      "loss": 1.7524,
+      "step": 827
+    },
+    {
+      "batch_num_effect_tokens": 7520,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 0.75273,
+      "grad_norm": 2.951838731765747,
+      "learning_rate": 1.569670798572239e-05,
+      "loss": 1.999,
+      "step": 828
+    },
+    {
+      "batch_num_effect_tokens": 7578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.75364,
+      "grad_norm": 3.045968770980835,
+      "learning_rate": 1.5683660466479276e-05,
+      "loss": 2.1318,
+      "step": 829
+    },
+    {
+      "batch_num_effect_tokens": 5717,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.75455,
+      "grad_norm": 3.512777090072632,
+      "learning_rate": 1.5670598638627707e-05,
+      "loss": 2.0365,
+      "step": 830
+    },
+    {
+      "batch_num_effect_tokens": 3331,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.75545,
+      "grad_norm": 3.2990052700042725,
+      "learning_rate": 1.565752253505082e-05,
+      "loss": 1.0083,
+      "step": 831
+    },
+    {
+      "batch_num_effect_tokens": 5666,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 0.75636,
+      "grad_norm": 3.1069211959838867,
+      "learning_rate": 1.5644432188667695e-05,
+      "loss": 1.5099,
+      "step": 832
+    },
+    {
+      "batch_num_effect_tokens": 4232,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.75727,
+      "grad_norm": 3.772783041000366,
+      "learning_rate": 1.563132763243325e-05,
+      "loss": 1.4528,
+      "step": 833
+    },
+    {
+      "batch_num_effect_tokens": 5526,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.75818,
+      "grad_norm": 3.1174569129943848,
+      "learning_rate": 1.56182088993382e-05,
+      "loss": 1.7541,
+      "step": 834
+    },
+    {
+      "batch_num_effect_tokens": 6759,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.75909,
+      "grad_norm": 3.053814172744751,
+      "learning_rate": 1.560507602240894e-05,
+      "loss": 1.8855,
+      "step": 835
+    },
+    {
+      "batch_num_effect_tokens": 4630,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52127,
+      "epoch": 0.76,
+      "grad_norm": 3.168306350708008,
+      "learning_rate": 1.5591929034707468e-05,
+      "loss": 1.5212,
+      "step": 836
+    },
+    {
+      "batch_num_effect_tokens": 5538,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52168,
+      "epoch": 0.76091,
+      "grad_norm": 3.2831947803497314,
+      "learning_rate": 1.5578767969331315e-05,
+      "loss": 1.54,
+      "step": 837
+    },
+    {
+      "batch_num_effect_tokens": 6277,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52206,
+      "epoch": 0.76182,
+      "grad_norm": 3.1704232692718506,
+      "learning_rate": 1.5565592859413442e-05,
+      "loss": 1.3928,
+      "step": 838
+    },
+    {
+      "batch_num_effect_tokens": 5844,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.76273,
+      "grad_norm": 3.5144307613372803,
+      "learning_rate": 1.555240373812217e-05,
+      "loss": 2.1053,
+      "step": 839
+    },
+    {
+      "batch_num_effect_tokens": 6697,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.76364,
+      "grad_norm": 3.301517963409424,
+      "learning_rate": 1.5539200638661106e-05,
+      "loss": 2.1613,
+      "step": 840
+    },
+    {
+      "batch_num_effect_tokens": 8281,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.76455,
+      "grad_norm": 3.2360894680023193,
+      "learning_rate": 1.5525983594269026e-05,
+      "loss": 2.3262,
+      "step": 841
+    },
+    {
+      "batch_num_effect_tokens": 7297,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.76545,
+      "grad_norm": 3.1873526573181152,
+      "learning_rate": 1.5512752638219834e-05,
+      "loss": 2.1013,
+      "step": 842
+    },
+    {
+      "batch_num_effect_tokens": 3934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.76636,
+      "grad_norm": 3.1415605545043945,
+      "learning_rate": 1.549950780382244e-05,
+      "loss": 1.0846,
+      "step": 843
+    },
+    {
+      "batch_num_effect_tokens": 5417,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 0.76727,
+      "grad_norm": 3.3242571353912354,
+      "learning_rate": 1.5486249124420702e-05,
+      "loss": 1.8593,
+      "step": 844
+    },
+    {
+      "batch_num_effect_tokens": 7916,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.76818,
+      "grad_norm": 2.9603023529052734,
+      "learning_rate": 1.5472976633393325e-05,
+      "loss": 1.9193,
+      "step": 845
+    },
+    {
+      "batch_num_effect_tokens": 6049,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.76909,
+      "grad_norm": 3.532238721847534,
+      "learning_rate": 1.5459690364153792e-05,
+      "loss": 2.0857,
+      "step": 846
+    },
+    {
+      "batch_num_effect_tokens": 7548,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.77,
+      "grad_norm": 3.206171989440918,
+      "learning_rate": 1.5446390350150272e-05,
+      "loss": 2.0077,
+      "step": 847
+    },
+    {
+      "batch_num_effect_tokens": 7110,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 0.77091,
+      "grad_norm": 2.141608715057373,
+      "learning_rate": 1.5433076624865533e-05,
+      "loss": 0.8776,
+      "step": 848
+    },
+    {
+      "batch_num_effect_tokens": 6005,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.77182,
+      "grad_norm": 3.2913215160369873,
+      "learning_rate": 1.541974922181686e-05,
+      "loss": 1.8563,
+      "step": 849
+    },
+    {
+      "batch_num_effect_tokens": 5249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.77273,
+      "grad_norm": 4.281654357910156,
+      "learning_rate": 1.5406408174555978e-05,
+      "loss": 1.6274,
+      "step": 850
+    },
+    {
+      "batch_num_effect_tokens": 5526,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.77364,
+      "grad_norm": 3.324355363845825,
+      "learning_rate": 1.5393053516668954e-05,
+      "loss": 1.8066,
+      "step": 851
+    },
+    {
+      "batch_num_effect_tokens": 9426,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 0.77455,
+      "grad_norm": 3.0341014862060547,
+      "learning_rate": 1.5379685281776125e-05,
+      "loss": 2.1172,
+      "step": 852
+    },
+    {
+      "batch_num_effect_tokens": 6460,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 0.77545,
+      "grad_norm": 3.261878490447998,
+      "learning_rate": 1.536630350353201e-05,
+      "loss": 1.4555,
+      "step": 853
+    },
+    {
+      "batch_num_effect_tokens": 6866,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.77636,
+      "grad_norm": 3.649355888366699,
+      "learning_rate": 1.5352908215625215e-05,
+      "loss": 2.0898,
+      "step": 854
+    },
+    {
+      "batch_num_effect_tokens": 5492,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.77727,
+      "grad_norm": 3.3275415897369385,
+      "learning_rate": 1.5339499451778363e-05,
+      "loss": 1.2897,
+      "step": 855
+    },
+    {
+      "batch_num_effect_tokens": 7780,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 0.77818,
+      "grad_norm": 3.252866744995117,
+      "learning_rate": 1.5326077245747998e-05,
+      "loss": 2.2499,
+      "step": 856
+    },
+    {
+      "batch_num_effect_tokens": 4070,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.77909,
+      "grad_norm": 3.7300028800964355,
+      "learning_rate": 1.5312641631324513e-05,
+      "loss": 1.4009,
+      "step": 857
+    },
+    {
+      "batch_num_effect_tokens": 5965,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 0.78,
+      "grad_norm": 3.201202630996704,
+      "learning_rate": 1.529919264233205e-05,
+      "loss": 1.9491,
+      "step": 858
+    },
+    {
+      "batch_num_effect_tokens": 7213,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 0.78091,
+      "grad_norm": 3.3597118854522705,
+      "learning_rate": 1.528573031262842e-05,
+      "loss": 1.6224,
+      "step": 859
+    },
+    {
+      "batch_num_effect_tokens": 5073,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.78182,
+      "grad_norm": 3.368971824645996,
+      "learning_rate": 1.5272254676105026e-05,
+      "loss": 1.4534,
+      "step": 860
+    },
+    {
+      "batch_num_effect_tokens": 6545,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.78273,
+      "grad_norm": 3.2033519744873047,
+      "learning_rate": 1.5258765766686762e-05,
+      "loss": 2.1782,
+      "step": 861
+    },
+    {
+      "batch_num_effect_tokens": 6338,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.78364,
+      "grad_norm": 2.733912944793701,
+      "learning_rate": 1.5245263618331944e-05,
+      "loss": 1.6437,
+      "step": 862
+    },
+    {
+      "batch_num_effect_tokens": 4746,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50509,
+      "epoch": 0.78455,
+      "grad_norm": 3.5325381755828857,
+      "learning_rate": 1.5231748265032216e-05,
+      "loss": 1.7903,
+      "step": 863
+    },
+    {
+      "batch_num_effect_tokens": 5785,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 0.78545,
+      "grad_norm": 3.381405830383301,
+      "learning_rate": 1.521821974081246e-05,
+      "loss": 2.1012,
+      "step": 864
+    },
+    {
+      "batch_num_effect_tokens": 8847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.78636,
+      "grad_norm": 2.947990655899048,
+      "learning_rate": 1.5204678079730724e-05,
+      "loss": 2.1991,
+      "step": 865
+    },
+    {
+      "batch_num_effect_tokens": 6221,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 0.78727,
+      "grad_norm": 3.320669174194336,
+      "learning_rate": 1.5191123315878123e-05,
+      "loss": 1.6411,
+      "step": 866
+    },
+    {
+      "batch_num_effect_tokens": 4073,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.78818,
+      "grad_norm": 3.4623115062713623,
+      "learning_rate": 1.5177555483378752e-05,
+      "loss": 1.276,
+      "step": 867
+    },
+    {
+      "batch_num_effect_tokens": 7142,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52214,
+      "epoch": 0.78909,
+      "grad_norm": 3.2191693782806396,
+      "learning_rate": 1.5163974616389621e-05,
+      "loss": 1.9639,
+      "step": 868
+    },
+    {
+      "batch_num_effect_tokens": 9174,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 0.79,
+      "grad_norm": 2.8348443508148193,
+      "learning_rate": 1.5150380749100545e-05,
+      "loss": 2.0828,
+      "step": 869
+    },
+    {
+      "batch_num_effect_tokens": 6934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 0.79091,
+      "grad_norm": 3.363175868988037,
+      "learning_rate": 1.5136773915734067e-05,
+      "loss": 2.0916,
+      "step": 870
+    },
+    {
+      "batch_num_effect_tokens": 5090,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.79182,
+      "grad_norm": 4.208559989929199,
+      "learning_rate": 1.5123154150545372e-05,
+      "loss": 1.7012,
+      "step": 871
+    },
+    {
+      "batch_num_effect_tokens": 4662,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.79273,
+      "grad_norm": 3.3394784927368164,
+      "learning_rate": 1.5109521487822208e-05,
+      "loss": 1.7266,
+      "step": 872
+    },
+    {
+      "batch_num_effect_tokens": 9093,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.79364,
+      "grad_norm": 2.944627285003662,
+      "learning_rate": 1.5095875961884781e-05,
+      "loss": 2.1492,
+      "step": 873
+    },
+    {
+      "batch_num_effect_tokens": 4703,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.79455,
+      "grad_norm": 3.579747200012207,
+      "learning_rate": 1.5082217607085692e-05,
+      "loss": 1.4926,
+      "step": 874
+    },
+    {
+      "batch_num_effect_tokens": 7135,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.79545,
+      "grad_norm": 3.227884531021118,
+      "learning_rate": 1.5068546457809831e-05,
+      "loss": 1.7614,
+      "step": 875
+    },
+    {
+      "batch_num_effect_tokens": 6036,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.79636,
+      "grad_norm": 3.272951126098633,
+      "learning_rate": 1.5054862548474298e-05,
+      "loss": 1.8083,
+      "step": 876
+    },
+    {
+      "batch_num_effect_tokens": 3696,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52090,
+      "epoch": 0.79727,
+      "grad_norm": 3.2300150394439697,
+      "learning_rate": 1.504116591352832e-05,
+      "loss": 1.3745,
+      "step": 877
+    },
+    {
+      "batch_num_effect_tokens": 7175,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52129,
+      "epoch": 0.79818,
+      "grad_norm": 3.1032445430755615,
+      "learning_rate": 1.5027456587453159e-05,
+      "loss": 1.9573,
+      "step": 878
+    },
+    {
+      "batch_num_effect_tokens": 7828,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52177,
+      "epoch": 0.79909,
+      "grad_norm": 3.465160369873047,
+      "learning_rate": 1.5013734604762032e-05,
+      "loss": 1.7241,
+      "step": 879
+    },
+    {
+      "batch_num_effect_tokens": 7234,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52188,
+      "epoch": 0.8,
+      "grad_norm": 3.1303765773773193,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 1.7644,
+      "step": 880
+    },
+    {
+      "batch_num_effect_tokens": 5925,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.80091,
+      "grad_norm": 3.201023578643799,
+      "learning_rate": 1.4986252807743928e-05,
+      "loss": 1.829,
+      "step": 881
+    },
+    {
+      "batch_num_effect_tokens": 7170,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.80182,
+      "grad_norm": 2.98020601272583,
+      "learning_rate": 1.4972493062602355e-05,
+      "loss": 1.9255,
+      "step": 882
+    },
+    {
+      "batch_num_effect_tokens": 4321,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.80273,
+      "grad_norm": 3.3040101528167725,
+      "learning_rate": 1.4958720799215414e-05,
+      "loss": 1.4459,
+      "step": 883
+    },
+    {
+      "batch_num_effect_tokens": 5098,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.80364,
+      "grad_norm": 3.1221706867218018,
+      "learning_rate": 1.494493605225477e-05,
+      "loss": 1.8362,
+      "step": 884
+    },
+    {
+      "batch_num_effect_tokens": 5578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.80455,
+      "grad_norm": 3.223900556564331,
+      "learning_rate": 1.4931138856423504e-05,
+      "loss": 1.7424,
+      "step": 885
+    },
+    {
+      "batch_num_effect_tokens": 8348,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.80545,
+      "grad_norm": 2.6039373874664307,
+      "learning_rate": 1.491732924645604e-05,
+      "loss": 1.5469,
+      "step": 886
+    },
+    {
+      "batch_num_effect_tokens": 5352,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.80636,
+      "grad_norm": 3.405869245529175,
+      "learning_rate": 1.4903507257118054e-05,
+      "loss": 1.7076,
+      "step": 887
+    },
+    {
+      "batch_num_effect_tokens": 5396,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.80727,
+      "grad_norm": 3.1065635681152344,
+      "learning_rate": 1.488967292320639e-05,
+      "loss": 1.482,
+      "step": 888
+    },
+    {
+      "batch_num_effect_tokens": 7004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 0.80818,
+      "grad_norm": 4.136509895324707,
+      "learning_rate": 1.4875826279548964e-05,
+      "loss": 1.824,
+      "step": 889
+    },
+    {
+      "batch_num_effect_tokens": 7966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.80909,
+      "grad_norm": 3.3540987968444824,
+      "learning_rate": 1.4861967361004687e-05,
+      "loss": 2.1597,
+      "step": 890
+    },
+    {
+      "batch_num_effect_tokens": 5298,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 0.81,
+      "grad_norm": 3.02241849899292,
+      "learning_rate": 1.4848096202463373e-05,
+      "loss": 1.6436,
+      "step": 891
+    },
+    {
+      "batch_num_effect_tokens": 6420,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52097,
+      "epoch": 0.81091,
+      "grad_norm": 3.1122889518737793,
+      "learning_rate": 1.4834212838845639e-05,
+      "loss": 1.4681,
+      "step": 892
+    },
+    {
+      "batch_num_effect_tokens": 6113,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 0.81182,
+      "grad_norm": 3.4665653705596924,
+      "learning_rate": 1.4820317305102842e-05,
+      "loss": 2.1555,
+      "step": 893
+    },
+    {
+      "batch_num_effect_tokens": 7208,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.81273,
+      "grad_norm": 2.9199295043945312,
+      "learning_rate": 1.4806409636216974e-05,
+      "loss": 1.7773,
+      "step": 894
+    },
+    {
+      "batch_num_effect_tokens": 5194,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50593,
+      "epoch": 0.81364,
+      "grad_norm": 3.5358550548553467,
+      "learning_rate": 1.479248986720057e-05,
+      "loss": 1.7775,
+      "step": 895
+    },
+    {
+      "batch_num_effect_tokens": 4817,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 0.81455,
+      "grad_norm": 3.5894317626953125,
+      "learning_rate": 1.4778558033096633e-05,
+      "loss": 1.9001,
+      "step": 896
+    },
+    {
+      "batch_num_effect_tokens": 6619,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 0.81545,
+      "grad_norm": 3.2133636474609375,
+      "learning_rate": 1.4764614168978539e-05,
+      "loss": 1.821,
+      "step": 897
+    },
+    {
+      "batch_num_effect_tokens": 7815,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 0.81636,
+      "grad_norm": 3.0909602642059326,
+      "learning_rate": 1.4750658309949953e-05,
+      "loss": 2.0219,
+      "step": 898
+    },
+    {
+      "batch_num_effect_tokens": 9105,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.81727,
+      "grad_norm": 3.031285285949707,
+      "learning_rate": 1.4736690491144724e-05,
+      "loss": 2.0928,
+      "step": 899
+    },
+    {
+      "batch_num_effect_tokens": 4729,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.81818,
+      "grad_norm": 3.4227333068847656,
+      "learning_rate": 1.472271074772683e-05,
+      "loss": 1.3522,
+      "step": 900
+    },
+    {
+      "batch_num_effect_tokens": 5764,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.81909,
+      "grad_norm": 3.352691888809204,
+      "learning_rate": 1.470871911489025e-05,
+      "loss": 2.0215,
+      "step": 901
+    },
+    {
+      "batch_num_effect_tokens": 6642,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 0.82,
+      "grad_norm": 2.8963751792907715,
+      "learning_rate": 1.469471562785891e-05,
+      "loss": 1.5203,
+      "step": 902
+    },
+    {
+      "batch_num_effect_tokens": 7384,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52128,
+      "epoch": 0.82091,
+      "grad_norm": 3.655452013015747,
+      "learning_rate": 1.4680700321886567e-05,
+      "loss": 2.0525,
+      "step": 903
+    },
+    {
+      "batch_num_effect_tokens": 8367,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 0.82182,
+      "grad_norm": 3.0699498653411865,
+      "learning_rate": 1.4666673232256738e-05,
+      "loss": 1.9485,
+      "step": 904
+    },
+    {
+      "batch_num_effect_tokens": 5188,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.82273,
+      "grad_norm": 4.488002777099609,
+      "learning_rate": 1.4652634394282608e-05,
+      "loss": 1.0071,
+      "step": 905
+    },
+    {
+      "batch_num_effect_tokens": 8174,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.82364,
+      "grad_norm": 3.199490785598755,
+      "learning_rate": 1.4638583843306928e-05,
+      "loss": 2.0605,
+      "step": 906
+    },
+    {
+      "batch_num_effect_tokens": 8521,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.82455,
+      "grad_norm": 2.913740873336792,
+      "learning_rate": 1.462452161470195e-05,
+      "loss": 1.9692,
+      "step": 907
+    },
+    {
+      "batch_num_effect_tokens": 5555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.82545,
+      "grad_norm": 3.375941038131714,
+      "learning_rate": 1.4610447743869313e-05,
+      "loss": 1.7169,
+      "step": 908
+    },
+    {
+      "batch_num_effect_tokens": 5281,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 0.82636,
+      "grad_norm": 3.434762716293335,
+      "learning_rate": 1.4596362266239974e-05,
+      "loss": 1.7983,
+      "step": 909
+    },
+    {
+      "batch_num_effect_tokens": 10814,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.82727,
+      "grad_norm": 2.8641369342803955,
+      "learning_rate": 1.4582265217274105e-05,
+      "loss": 2.3574,
+      "step": 910
+    },
+    {
+      "batch_num_effect_tokens": 5793,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 0.82818,
+      "grad_norm": 3.170866012573242,
+      "learning_rate": 1.4568156632461008e-05,
+      "loss": 1.7385,
+      "step": 911
+    },
+    {
+      "batch_num_effect_tokens": 8426,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.82909,
+      "grad_norm": 2.9237499237060547,
+      "learning_rate": 1.4554036547319033e-05,
+      "loss": 1.7105,
+      "step": 912
+    },
+    {
+      "batch_num_effect_tokens": 5176,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.83,
+      "grad_norm": 3.3719794750213623,
+      "learning_rate": 1.4539904997395468e-05,
+      "loss": 1.7381,
+      "step": 913
+    },
+    {
+      "batch_num_effect_tokens": 6254,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.83091,
+      "grad_norm": 3.7223596572875977,
+      "learning_rate": 1.4525762018266484e-05,
+      "loss": 2.2058,
+      "step": 914
+    },
+    {
+      "batch_num_effect_tokens": 6421,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.83182,
+      "grad_norm": 2.9281041622161865,
+      "learning_rate": 1.4511607645537009e-05,
+      "loss": 1.6528,
+      "step": 915
+    },
+    {
+      "batch_num_effect_tokens": 5651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 0.83273,
+      "grad_norm": 3.437027931213379,
+      "learning_rate": 1.449744191484066e-05,
+      "loss": 1.869,
+      "step": 916
+    },
+    {
+      "batch_num_effect_tokens": 6709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.83364,
+      "grad_norm": 2.9521644115448,
+      "learning_rate": 1.4483264861839646e-05,
+      "loss": 1.5646,
+      "step": 917
+    },
+    {
+      "batch_num_effect_tokens": 4699,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.83455,
+      "grad_norm": 3.5215671062469482,
+      "learning_rate": 1.4469076522224683e-05,
+      "loss": 1.294,
+      "step": 918
+    },
+    {
+      "batch_num_effect_tokens": 6396,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.83545,
+      "grad_norm": 4.2995686531066895,
+      "learning_rate": 1.4454876931714896e-05,
+      "loss": 1.6281,
+      "step": 919
+    },
+    {
+      "batch_num_effect_tokens": 5342,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52128,
+      "epoch": 0.83636,
+      "grad_norm": 3.4456379413604736,
+      "learning_rate": 1.4440666126057743e-05,
+      "loss": 1.9214,
+      "step": 920
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.83727,
+      "grad_norm": 3.3013765811920166,
+      "learning_rate": 1.4426444141028905e-05,
+      "loss": 1.9053,
+      "step": 921
+    },
+    {
+      "batch_num_effect_tokens": 6480,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.83818,
+      "grad_norm": 3.5072872638702393,
+      "learning_rate": 1.4412211012432213e-05,
+      "loss": 1.818,
+      "step": 922
+    },
+    {
+      "batch_num_effect_tokens": 3625,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.83909,
+      "grad_norm": 3.228583812713623,
+      "learning_rate": 1.4397966776099558e-05,
+      "loss": 1.2451,
+      "step": 923
+    },
+    {
+      "batch_num_effect_tokens": 6205,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 0.84,
+      "grad_norm": 3.156794548034668,
+      "learning_rate": 1.4383711467890776e-05,
+      "loss": 1.7222,
+      "step": 924
+    },
+    {
+      "batch_num_effect_tokens": 6118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 0.84091,
+      "grad_norm": 3.497466564178467,
+      "learning_rate": 1.4369445123693595e-05,
+      "loss": 2.0007,
+      "step": 925
+    },
+    {
+      "batch_num_effect_tokens": 5880,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.84182,
+      "grad_norm": 3.0640792846679688,
+      "learning_rate": 1.4355167779423525e-05,
+      "loss": 1.6348,
+      "step": 926
+    },
+    {
+      "batch_num_effect_tokens": 7842,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.84273,
+      "grad_norm": 3.0178539752960205,
+      "learning_rate": 1.4340879471023752e-05,
+      "loss": 2.1852,
+      "step": 927
+    },
+    {
+      "batch_num_effect_tokens": 5592,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.84364,
+      "grad_norm": 3.0909364223480225,
+      "learning_rate": 1.4326580234465084e-05,
+      "loss": 1.6169,
+      "step": 928
+    },
+    {
+      "batch_num_effect_tokens": 6014,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.84455,
+      "grad_norm": 3.195918560028076,
+      "learning_rate": 1.4312270105745829e-05,
+      "loss": 1.9621,
+      "step": 929
+    },
+    {
+      "batch_num_effect_tokens": 9469,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.84545,
+      "grad_norm": 3.2743117809295654,
+      "learning_rate": 1.4297949120891718e-05,
+      "loss": 2.3096,
+      "step": 930
+    },
+    {
+      "batch_num_effect_tokens": 5212,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.84636,
+      "grad_norm": 2.9876720905303955,
+      "learning_rate": 1.4283617315955815e-05,
+      "loss": 1.5374,
+      "step": 931
+    },
+    {
+      "batch_num_effect_tokens": 4148,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.84727,
+      "grad_norm": 3.05871844291687,
+      "learning_rate": 1.4269274727018419e-05,
+      "loss": 1.1089,
+      "step": 932
+    },
+    {
+      "batch_num_effect_tokens": 5868,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.84818,
+      "grad_norm": 4.548148155212402,
+      "learning_rate": 1.4254921390186986e-05,
+      "loss": 1.7198,
+      "step": 933
+    },
+    {
+      "batch_num_effect_tokens": 4948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.84909,
+      "grad_norm": 3.468212127685547,
+      "learning_rate": 1.424055734159602e-05,
+      "loss": 1.3738,
+      "step": 934
+    },
+    {
+      "batch_num_effect_tokens": 3911,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 0.85,
+      "grad_norm": 3.750844717025757,
+      "learning_rate": 1.4226182617406996e-05,
+      "loss": 1.9055,
+      "step": 935
+    },
+    {
+      "batch_num_effect_tokens": 8673,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 0.85091,
+      "grad_norm": 2.934328556060791,
+      "learning_rate": 1.4211797253808268e-05,
+      "loss": 2.0859,
+      "step": 936
+    },
+    {
+      "batch_num_effect_tokens": 6360,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 0.85182,
+      "grad_norm": 3.3923490047454834,
+      "learning_rate": 1.419740128701497e-05,
+      "loss": 1.7395,
+      "step": 937
+    },
+    {
+      "batch_num_effect_tokens": 6507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.85273,
+      "grad_norm": 3.219904661178589,
+      "learning_rate": 1.4182994753268929e-05,
+      "loss": 1.8854,
+      "step": 938
+    },
+    {
+      "batch_num_effect_tokens": 4322,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.85364,
+      "grad_norm": 3.351370096206665,
+      "learning_rate": 1.4168577688838581e-05,
+      "loss": 1.5269,
+      "step": 939
+    },
+    {
+      "batch_num_effect_tokens": 6869,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52125,
+      "epoch": 0.85455,
+      "grad_norm": 3.1984567642211914,
+      "learning_rate": 1.4154150130018867e-05,
+      "loss": 2.1609,
+      "step": 940
+    },
+    {
+      "batch_num_effect_tokens": 6322,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.85545,
+      "grad_norm": 3.1590569019317627,
+      "learning_rate": 1.4139712113131146e-05,
+      "loss": 1.786,
+      "step": 941
+    },
+    {
+      "batch_num_effect_tokens": 6568,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.85636,
+      "grad_norm": 3.2718281745910645,
+      "learning_rate": 1.4125263674523113e-05,
+      "loss": 1.7956,
+      "step": 942
+    },
+    {
+      "batch_num_effect_tokens": 6453,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.85727,
+      "grad_norm": 3.1080870628356934,
+      "learning_rate": 1.4110804850568691e-05,
+      "loss": 2.0176,
+      "step": 943
+    },
+    {
+      "batch_num_effect_tokens": 7243,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.85818,
+      "grad_norm": 2.9790542125701904,
+      "learning_rate": 1.4096335677667954e-05,
+      "loss": 1.6818,
+      "step": 944
+    },
+    {
+      "batch_num_effect_tokens": 6578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.85909,
+      "grad_norm": 2.952096462249756,
+      "learning_rate": 1.4081856192247032e-05,
+      "loss": 1.8174,
+      "step": 945
+    },
+    {
+      "batch_num_effect_tokens": 6645,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.86,
+      "grad_norm": 3.055183172225952,
+      "learning_rate": 1.4067366430758004e-05,
+      "loss": 1.8087,
+      "step": 946
+    },
+    {
+      "batch_num_effect_tokens": 6191,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.86091,
+      "grad_norm": 3.5459861755371094,
+      "learning_rate": 1.4052866429678832e-05,
+      "loss": 1.5099,
+      "step": 947
+    },
+    {
+      "batch_num_effect_tokens": 8808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 0.86182,
+      "grad_norm": 3.076371192932129,
+      "learning_rate": 1.403835622551325e-05,
+      "loss": 2.0291,
+      "step": 948
+    },
+    {
+      "batch_num_effect_tokens": 8282,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.86273,
+      "grad_norm": 3.062523126602173,
+      "learning_rate": 1.4023835854790682e-05,
+      "loss": 2.123,
+      "step": 949
+    },
+    {
+      "batch_num_effect_tokens": 6325,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.86364,
+      "grad_norm": 3.318748950958252,
+      "learning_rate": 1.4009305354066138e-05,
+      "loss": 1.9367,
+      "step": 950
+    },
+    {
+      "batch_num_effect_tokens": 5377,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 0.86455,
+      "grad_norm": 3.2072744369506836,
+      "learning_rate": 1.3994764759920144e-05,
+      "loss": 1.7947,
+      "step": 951
+    },
+    {
+      "batch_num_effect_tokens": 5308,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 0.86545,
+      "grad_norm": 4.14223051071167,
+      "learning_rate": 1.3980214108958626e-05,
+      "loss": 1.2963,
+      "step": 952
+    },
+    {
+      "batch_num_effect_tokens": 6727,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 0.86636,
+      "grad_norm": 3.146315574645996,
+      "learning_rate": 1.3965653437812825e-05,
+      "loss": 1.6541,
+      "step": 953
+    },
+    {
+      "batch_num_effect_tokens": 7145,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52131,
+      "epoch": 0.86727,
+      "grad_norm": 2.9531638622283936,
+      "learning_rate": 1.3951082783139221e-05,
+      "loss": 1.9211,
+      "step": 954
+    },
+    {
+      "batch_num_effect_tokens": 7363,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.86818,
+      "grad_norm": 3.0392003059387207,
+      "learning_rate": 1.3936502181619415e-05,
+      "loss": 1.7422,
+      "step": 955
+    },
+    {
+      "batch_num_effect_tokens": 5878,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 0.86909,
+      "grad_norm": 3.071474552154541,
+      "learning_rate": 1.3921911669960055e-05,
+      "loss": 1.8491,
+      "step": 956
+    },
+    {
+      "batch_num_effect_tokens": 9591,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.87,
+      "grad_norm": 2.797166347503662,
+      "learning_rate": 1.3907311284892737e-05,
+      "loss": 2.0928,
+      "step": 957
+    },
+    {
+      "batch_num_effect_tokens": 9939,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.87091,
+      "grad_norm": 2.7077417373657227,
+      "learning_rate": 1.3892701063173917e-05,
+      "loss": 2.0326,
+      "step": 958
+    },
+    {
+      "batch_num_effect_tokens": 4945,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.87182,
+      "grad_norm": 3.3089983463287354,
+      "learning_rate": 1.3878081041584803e-05,
+      "loss": 1.51,
+      "step": 959
+    },
+    {
+      "batch_num_effect_tokens": 7892,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.87273,
+      "grad_norm": 2.2191925048828125,
+      "learning_rate": 1.3863451256931286e-05,
+      "loss": 1.0674,
+      "step": 960
+    },
+    {
+      "batch_num_effect_tokens": 7345,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.87364,
+      "grad_norm": 3.0597195625305176,
+      "learning_rate": 1.3848811746043835e-05,
+      "loss": 1.7672,
+      "step": 961
+    },
+    {
+      "batch_num_effect_tokens": 4985,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.87455,
+      "grad_norm": 3.9039957523345947,
+      "learning_rate": 1.3834162545777394e-05,
+      "loss": 1.938,
+      "step": 962
+    },
+    {
+      "batch_num_effect_tokens": 4052,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.87545,
+      "grad_norm": 3.0499753952026367,
+      "learning_rate": 1.3819503693011314e-05,
+      "loss": 1.1316,
+      "step": 963
+    },
+    {
+      "batch_num_effect_tokens": 5414,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 0.87636,
+      "grad_norm": 3.102602481842041,
+      "learning_rate": 1.380483522464923e-05,
+      "loss": 1.4438,
+      "step": 964
+    },
+    {
+      "batch_num_effect_tokens": 5948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.87727,
+      "grad_norm": 3.1843950748443604,
+      "learning_rate": 1.3790157177619005e-05,
+      "loss": 1.9162,
+      "step": 965
+    },
+    {
+      "batch_num_effect_tokens": 4852,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 0.87818,
+      "grad_norm": 3.322779655456543,
+      "learning_rate": 1.3775469588872601e-05,
+      "loss": 1.5374,
+      "step": 966
+    },
+    {
+      "batch_num_effect_tokens": 5462,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50560,
+      "epoch": 0.87909,
+      "grad_norm": 3.453280210494995,
+      "learning_rate": 1.3760772495385998e-05,
+      "loss": 1.9769,
+      "step": 967
+    },
+    {
+      "batch_num_effect_tokens": 5586,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 0.88,
+      "grad_norm": 3.1580238342285156,
+      "learning_rate": 1.3746065934159123e-05,
+      "loss": 1.7682,
+      "step": 968
+    },
+    {
+      "batch_num_effect_tokens": 5068,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.88091,
+      "grad_norm": 3.2250802516937256,
+      "learning_rate": 1.3731349942215718e-05,
+      "loss": 1.6973,
+      "step": 969
+    },
+    {
+      "batch_num_effect_tokens": 4459,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 0.88182,
+      "grad_norm": 3.6121764183044434,
+      "learning_rate": 1.3716624556603275e-05,
+      "loss": 1.7362,
+      "step": 970
+    },
+    {
+      "batch_num_effect_tokens": 6493,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 0.88273,
+      "grad_norm": 3.3951101303100586,
+      "learning_rate": 1.3701889814392944e-05,
+      "loss": 2.0725,
+      "step": 971
+    },
+    {
+      "batch_num_effect_tokens": 9339,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.88364,
+      "grad_norm": 2.8378098011016846,
+      "learning_rate": 1.3687145752679409e-05,
+      "loss": 1.9609,
+      "step": 972
+    },
+    {
+      "batch_num_effect_tokens": 6144,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.88455,
+      "grad_norm": 3.2717912197113037,
+      "learning_rate": 1.3672392408580834e-05,
+      "loss": 1.9867,
+      "step": 973
+    },
+    {
+      "batch_num_effect_tokens": 7002,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 0.88545,
+      "grad_norm": 3.2133514881134033,
+      "learning_rate": 1.3657629819238747e-05,
+      "loss": 1.7634,
+      "step": 974
+    },
+    {
+      "batch_num_effect_tokens": 7274,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52214,
+      "epoch": 0.88636,
+      "grad_norm": 3.2251951694488525,
+      "learning_rate": 1.3642858021817944e-05,
+      "loss": 2.203,
+      "step": 975
+    },
+    {
+      "batch_num_effect_tokens": 5400,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.88727,
+      "grad_norm": 3.3960633277893066,
+      "learning_rate": 1.362807705350641e-05,
+      "loss": 1.2251,
+      "step": 976
+    },
+    {
+      "batch_num_effect_tokens": 7883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.88818,
+      "grad_norm": 2.9552037715911865,
+      "learning_rate": 1.3613286951515216e-05,
+      "loss": 1.9736,
+      "step": 977
+    },
+    {
+      "batch_num_effect_tokens": 7173,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.88909,
+      "grad_norm": 3.043170690536499,
+      "learning_rate": 1.3598487753078426e-05,
+      "loss": 1.7401,
+      "step": 978
+    },
+    {
+      "batch_num_effect_tokens": 7428,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 0.89,
+      "grad_norm": 3.0731730461120605,
+      "learning_rate": 1.3583679495453e-05,
+      "loss": 1.9956,
+      "step": 979
+    },
+    {
+      "batch_num_effect_tokens": 10259,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 0.89091,
+      "grad_norm": 2.731771469116211,
+      "learning_rate": 1.356886221591872e-05,
+      "loss": 2.0283,
+      "step": 980
+    },
+    {
+      "batch_num_effect_tokens": 5746,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 0.89182,
+      "grad_norm": 3.238330602645874,
+      "learning_rate": 1.355403595177806e-05,
+      "loss": 1.5323,
+      "step": 981
+    },
+    {
+      "batch_num_effect_tokens": 7136,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 0.89273,
+      "grad_norm": 3.06024432182312,
+      "learning_rate": 1.353920074035612e-05,
+      "loss": 1.8443,
+      "step": 982
+    },
+    {
+      "batch_num_effect_tokens": 4411,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.89364,
+      "grad_norm": 3.3686585426330566,
+      "learning_rate": 1.3524356619000534e-05,
+      "loss": 1.4908,
+      "step": 983
+    },
+    {
+      "batch_num_effect_tokens": 8474,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.89455,
+      "grad_norm": 3.075028657913208,
+      "learning_rate": 1.350950362508136e-05,
+      "loss": 2.2764,
+      "step": 984
+    },
+    {
+      "batch_num_effect_tokens": 6610,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.89545,
+      "grad_norm": 3.16196608543396,
+      "learning_rate": 1.3494641795990986e-05,
+      "loss": 2.0977,
+      "step": 985
+    },
+    {
+      "batch_num_effect_tokens": 6030,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 0.89636,
+      "grad_norm": 2.9432215690612793,
+      "learning_rate": 1.3479771169144052e-05,
+      "loss": 1.778,
+      "step": 986
+    },
+    {
+      "batch_num_effect_tokens": 6252,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.89727,
+      "grad_norm": 3.090939521789551,
+      "learning_rate": 1.346489178197735e-05,
+      "loss": 1.9385,
+      "step": 987
+    },
+    {
+      "batch_num_effect_tokens": 5933,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.89818,
+      "grad_norm": 3.3969521522521973,
+      "learning_rate": 1.3450003671949707e-05,
+      "loss": 1.5074,
+      "step": 988
+    },
+    {
+      "batch_num_effect_tokens": 6049,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.89909,
+      "grad_norm": 2.901487112045288,
+      "learning_rate": 1.3435106876541933e-05,
+      "loss": 1.6664,
+      "step": 989
+    },
+    {
+      "batch_num_effect_tokens": 7799,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.9,
+      "grad_norm": 2.844240188598633,
+      "learning_rate": 1.342020143325669e-05,
+      "loss": 1.8445,
+      "step": 990
+    },
+    {
+      "batch_num_effect_tokens": 5378,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.90091,
+      "grad_norm": 3.242816686630249,
+      "learning_rate": 1.340528737961841e-05,
+      "loss": 1.3773,
+      "step": 991
+    },
+    {
+      "batch_num_effect_tokens": 7026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 0.90182,
+      "grad_norm": 2.9425547122955322,
+      "learning_rate": 1.3390364753173206e-05,
+      "loss": 1.8544,
+      "step": 992
+    },
+    {
+      "batch_num_effect_tokens": 6883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.90273,
+      "grad_norm": 3.4907162189483643,
+      "learning_rate": 1.337543359148878e-05,
+      "loss": 1.9146,
+      "step": 993
+    },
+    {
+      "batch_num_effect_tokens": 6305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.90364,
+      "grad_norm": 3.1133174896240234,
+      "learning_rate": 1.3360493932154301e-05,
+      "loss": 1.875,
+      "step": 994
+    },
+    {
+      "batch_num_effect_tokens": 6244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.90455,
+      "grad_norm": 3.476804733276367,
+      "learning_rate": 1.3345545812780354e-05,
+      "loss": 1.7176,
+      "step": 995
+    },
+    {
+      "batch_num_effect_tokens": 4621,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 0.90545,
+      "grad_norm": 3.518723964691162,
+      "learning_rate": 1.3330589270998809e-05,
+      "loss": 1.8252,
+      "step": 996
+    },
+    {
+      "batch_num_effect_tokens": 7208,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.90636,
+      "grad_norm": 3.3271543979644775,
+      "learning_rate": 1.331562434446274e-05,
+      "loss": 2.199,
+      "step": 997
+    },
+    {
+      "batch_num_effect_tokens": 5059,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.90727,
+      "grad_norm": 3.5937557220458984,
+      "learning_rate": 1.3300651070846333e-05,
+      "loss": 1.6976,
+      "step": 998
+    },
+    {
+      "batch_num_effect_tokens": 6163,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 0.90818,
+      "grad_norm": 3.187220335006714,
+      "learning_rate": 1.3285669487844786e-05,
+      "loss": 1.6587,
+      "step": 999
+    },
+    {
+      "batch_num_effect_tokens": 6955,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.90909,
+      "grad_norm": 3.0915720462799072,
+      "learning_rate": 1.3270679633174219e-05,
+      "loss": 2.0351,
+      "step": 1000
+    },
+    {
+      "batch_num_effect_tokens": 5494,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.91,
+      "grad_norm": 2.7694406509399414,
+      "learning_rate": 1.3255681544571568e-05,
+      "loss": 1.0195,
+      "step": 1001
+    },
+    {
+      "batch_num_effect_tokens": 7607,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.91091,
+      "grad_norm": 2.9730849266052246,
+      "learning_rate": 1.3240675259794507e-05,
+      "loss": 1.9462,
+      "step": 1002
+    },
+    {
+      "batch_num_effect_tokens": 5629,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.91182,
+      "grad_norm": 3.193814277648926,
+      "learning_rate": 1.3225660816621342e-05,
+      "loss": 1.6498,
+      "step": 1003
+    },
+    {
+      "batch_num_effect_tokens": 10167,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52122,
+      "epoch": 0.91273,
+      "grad_norm": 2.7844138145446777,
+      "learning_rate": 1.321063825285091e-05,
+      "loss": 2.0605,
+      "step": 1004
+    },
+    {
+      "batch_num_effect_tokens": 5533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.91364,
+      "grad_norm": 3.48059344291687,
+      "learning_rate": 1.3195607606302501e-05,
+      "loss": 1.7594,
+      "step": 1005
+    },
+    {
+      "batch_num_effect_tokens": 10088,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 0.91455,
+      "grad_norm": 2.8770751953125,
+      "learning_rate": 1.3180568914815752e-05,
+      "loss": 2.077,
+      "step": 1006
+    },
+    {
+      "batch_num_effect_tokens": 7385,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 0.91545,
+      "grad_norm": 3.197993278503418,
+      "learning_rate": 1.3165522216250544e-05,
+      "loss": 1.8163,
+      "step": 1007
+    },
+    {
+      "batch_num_effect_tokens": 7704,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.91636,
+      "grad_norm": 2.8752472400665283,
+      "learning_rate": 1.3150467548486929e-05,
+      "loss": 1.9441,
+      "step": 1008
+    },
+    {
+      "batch_num_effect_tokens": 6017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.91727,
+      "grad_norm": 3.4210171699523926,
+      "learning_rate": 1.3135404949425015e-05,
+      "loss": 1.9371,
+      "step": 1009
+    },
+    {
+      "batch_num_effect_tokens": 6395,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.91818,
+      "grad_norm": 3.0630974769592285,
+      "learning_rate": 1.3120334456984871e-05,
+      "loss": 1.7543,
+      "step": 1010
+    },
+    {
+      "batch_num_effect_tokens": 4265,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.91909,
+      "grad_norm": 3.465033531188965,
+      "learning_rate": 1.310525610910645e-05,
+      "loss": 1.1579,
+      "step": 1011
+    },
+    {
+      "batch_num_effect_tokens": 4477,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.92,
+      "grad_norm": 3.1414945125579834,
+      "learning_rate": 1.3090169943749475e-05,
+      "loss": 1.2547,
+      "step": 1012
+    },
+    {
+      "batch_num_effect_tokens": 8085,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 0.92091,
+      "grad_norm": 3.4485549926757812,
+      "learning_rate": 1.3075075998893345e-05,
+      "loss": 2.2038,
+      "step": 1013
+    },
+    {
+      "batch_num_effect_tokens": 4847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 0.92182,
+      "grad_norm": 3.664796829223633,
+      "learning_rate": 1.3059974312537054e-05,
+      "loss": 1.5868,
+      "step": 1014
+    },
+    {
+      "batch_num_effect_tokens": 6282,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 0.92273,
+      "grad_norm": 2.9644715785980225,
+      "learning_rate": 1.3044864922699072e-05,
+      "loss": 1.5455,
+      "step": 1015
+    },
+    {
+      "batch_num_effect_tokens": 5664,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.92364,
+      "grad_norm": 3.3737034797668457,
+      "learning_rate": 1.3029747867417275e-05,
+      "loss": 1.6858,
+      "step": 1016
+    },
+    {
+      "batch_num_effect_tokens": 7258,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52178,
+      "epoch": 0.92455,
+      "grad_norm": 2.9502992630004883,
+      "learning_rate": 1.301462318474883e-05,
+      "loss": 1.8345,
+      "step": 1017
+    },
+    {
+      "batch_num_effect_tokens": 7294,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 0.92545,
+      "grad_norm": 3.1529641151428223,
+      "learning_rate": 1.2999490912770108e-05,
+      "loss": 2.0247,
+      "step": 1018
+    },
+    {
+      "batch_num_effect_tokens": 6216,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.92636,
+      "grad_norm": 3.3714842796325684,
+      "learning_rate": 1.2984351089576585e-05,
+      "loss": 1.9965,
+      "step": 1019
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.92727,
+      "grad_norm": 2.8136098384857178,
+      "learning_rate": 1.296920375328275e-05,
+      "loss": 1.9386,
+      "step": 1020
+    },
+    {
+      "batch_num_effect_tokens": 6546,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52107,
+      "epoch": 0.92818,
+      "grad_norm": 3.0203120708465576,
+      "learning_rate": 1.2954048942022002e-05,
+      "loss": 1.6337,
+      "step": 1021
+    },
+    {
+      "batch_num_effect_tokens": 6707,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 0.92909,
+      "grad_norm": 3.1548757553100586,
+      "learning_rate": 1.2938886693946563e-05,
+      "loss": 1.8008,
+      "step": 1022
+    },
+    {
+      "batch_num_effect_tokens": 6462,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.93,
+      "grad_norm": 3.236276149749756,
+      "learning_rate": 1.2923717047227368e-05,
+      "loss": 2.0444,
+      "step": 1023
+    },
+    {
+      "batch_num_effect_tokens": 4891,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.93091,
+      "grad_norm": 3.0936660766601562,
+      "learning_rate": 1.2908540040053992e-05,
+      "loss": 1.4072,
+      "step": 1024
+    },
+    {
+      "batch_num_effect_tokens": 9397,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 0.93182,
+      "grad_norm": 2.655895948410034,
+      "learning_rate": 1.289335571063453e-05,
+      "loss": 1.9849,
+      "step": 1025
+    },
+    {
+      "batch_num_effect_tokens": 7461,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.93273,
+      "grad_norm": 3.800144672393799,
+      "learning_rate": 1.287816409719551e-05,
+      "loss": 1.9769,
+      "step": 1026
+    },
+    {
+      "batch_num_effect_tokens": 6287,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.93364,
+      "grad_norm": 3.192833662033081,
+      "learning_rate": 1.2862965237981804e-05,
+      "loss": 1.4238,
+      "step": 1027
+    },
+    {
+      "batch_num_effect_tokens": 6381,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 0.93455,
+      "grad_norm": 3.40364670753479,
+      "learning_rate": 1.2847759171256523e-05,
+      "loss": 1.4348,
+      "step": 1028
+    },
+    {
+      "batch_num_effect_tokens": 6186,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 0.93545,
+      "grad_norm": 3.084475517272949,
+      "learning_rate": 1.283254593530092e-05,
+      "loss": 1.5965,
+      "step": 1029
+    },
+    {
+      "batch_num_effect_tokens": 6089,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 0.93636,
+      "grad_norm": 3.103367805480957,
+      "learning_rate": 1.2817325568414299e-05,
+      "loss": 1.4762,
+      "step": 1030
+    },
+    {
+      "batch_num_effect_tokens": 7823,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.93727,
+      "grad_norm": 3.1137585639953613,
+      "learning_rate": 1.2802098108913914e-05,
+      "loss": 2.0879,
+      "step": 1031
+    },
+    {
+      "batch_num_effect_tokens": 7435,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 0.93818,
+      "grad_norm": 3.122626543045044,
+      "learning_rate": 1.278686359513488e-05,
+      "loss": 1.7587,
+      "step": 1032
+    },
+    {
+      "batch_num_effect_tokens": 4802,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 0.93909,
+      "grad_norm": 4.107133865356445,
+      "learning_rate": 1.2771622065430061e-05,
+      "loss": 1.6359,
+      "step": 1033
+    },
+    {
+      "batch_num_effect_tokens": 7525,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.94,
+      "grad_norm": 3.2098639011383057,
+      "learning_rate": 1.2756373558169992e-05,
+      "loss": 2.0099,
+      "step": 1034
+    },
+    {
+      "batch_num_effect_tokens": 5538,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.94091,
+      "grad_norm": 3.2654569149017334,
+      "learning_rate": 1.2741118111742778e-05,
+      "loss": 1.3742,
+      "step": 1035
+    },
+    {
+      "batch_num_effect_tokens": 6099,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 0.94182,
+      "grad_norm": 3.20632004737854,
+      "learning_rate": 1.2725855764553981e-05,
+      "loss": 1.6452,
+      "step": 1036
+    },
+    {
+      "batch_num_effect_tokens": 5712,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 0.94273,
+      "grad_norm": 3.3683362007141113,
+      "learning_rate": 1.2710586555026541e-05,
+      "loss": 1.6897,
+      "step": 1037
+    },
+    {
+      "batch_num_effect_tokens": 5909,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.94364,
+      "grad_norm": 3.2638537883758545,
+      "learning_rate": 1.269531052160068e-05,
+      "loss": 1.8701,
+      "step": 1038
+    },
+    {
+      "batch_num_effect_tokens": 5331,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 0.94455,
+      "grad_norm": 3.3751261234283447,
+      "learning_rate": 1.2680027702733791e-05,
+      "loss": 1.4539,
+      "step": 1039
+    },
+    {
+      "batch_num_effect_tokens": 6539,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 0.94545,
+      "grad_norm": 3.4274160861968994,
+      "learning_rate": 1.266473813690035e-05,
+      "loss": 2.0514,
+      "step": 1040
+    },
+    {
+      "batch_num_effect_tokens": 5271,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50543,
+      "epoch": 0.94636,
+      "grad_norm": 3.4731414318084717,
+      "learning_rate": 1.2649441862591826e-05,
+      "loss": 1.9529,
+      "step": 1041
+    },
+    {
+      "batch_num_effect_tokens": 5074,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.94727,
+      "grad_norm": 3.665320634841919,
+      "learning_rate": 1.2634138918316567e-05,
+      "loss": 1.7549,
+      "step": 1042
+    },
+    {
+      "batch_num_effect_tokens": 4107,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 0.94818,
+      "grad_norm": 3.3488118648529053,
+      "learning_rate": 1.2618829342599719e-05,
+      "loss": 1.4641,
+      "step": 1043
+    },
+    {
+      "batch_num_effect_tokens": 8762,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.94909,
+      "grad_norm": 2.7790908813476562,
+      "learning_rate": 1.2603513173983121e-05,
+      "loss": 1.4908,
+      "step": 1044
+    },
+    {
+      "batch_num_effect_tokens": 9360,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52200,
+      "epoch": 0.95,
+      "grad_norm": 2.7828152179718018,
+      "learning_rate": 1.2588190451025209e-05,
+      "loss": 2.2785,
+      "step": 1045
+    },
+    {
+      "batch_num_effect_tokens": 5616,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 0.95091,
+      "grad_norm": 3.2362778186798096,
+      "learning_rate": 1.2572861212300917e-05,
+      "loss": 1.5082,
+      "step": 1046
+    },
+    {
+      "batch_num_effect_tokens": 6733,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 0.95182,
+      "grad_norm": 2.959310531616211,
+      "learning_rate": 1.255752549640159e-05,
+      "loss": 1.6621,
+      "step": 1047
+    },
+    {
+      "batch_num_effect_tokens": 4214,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.95273,
+      "grad_norm": 3.2532103061676025,
+      "learning_rate": 1.2542183341934873e-05,
+      "loss": 1.0932,
+      "step": 1048
+    },
+    {
+      "batch_num_effect_tokens": 5282,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.95364,
+      "grad_norm": 3.0876121520996094,
+      "learning_rate": 1.2526834787524615e-05,
+      "loss": 0.952,
+      "step": 1049
+    },
+    {
+      "batch_num_effect_tokens": 7447,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 0.95455,
+      "grad_norm": 3.1305782794952393,
+      "learning_rate": 1.2511479871810792e-05,
+      "loss": 1.6978,
+      "step": 1050
+    },
+    {
+      "batch_num_effect_tokens": 10256,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 0.95545,
+      "grad_norm": 2.8973023891448975,
+      "learning_rate": 1.2496118633449386e-05,
+      "loss": 1.9767,
+      "step": 1051
+    },
+    {
+      "batch_num_effect_tokens": 7292,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 0.95636,
+      "grad_norm": 2.9504804611206055,
+      "learning_rate": 1.248075111111229e-05,
+      "loss": 1.5928,
+      "step": 1052
+    },
+    {
+      "batch_num_effect_tokens": 7355,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.95727,
+      "grad_norm": 3.3816585540771484,
+      "learning_rate": 1.2465377343487227e-05,
+      "loss": 1.6466,
+      "step": 1053
+    },
+    {
+      "batch_num_effect_tokens": 7898,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52219,
+      "epoch": 0.95818,
+      "grad_norm": 3.2212235927581787,
+      "learning_rate": 1.244999736927764e-05,
+      "loss": 1.5401,
+      "step": 1054
+    },
+    {
+      "batch_num_effect_tokens": 7335,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52177,
+      "epoch": 0.95909,
+      "grad_norm": 3.730105400085449,
+      "learning_rate": 1.2434611227202591e-05,
+      "loss": 1.6948,
+      "step": 1055
+    },
+    {
+      "batch_num_effect_tokens": 6588,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 0.96,
+      "grad_norm": 3.086858034133911,
+      "learning_rate": 1.2419218955996677e-05,
+      "loss": 1.5954,
+      "step": 1056
+    },
+    {
+      "batch_num_effect_tokens": 5296,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 0.96091,
+      "grad_norm": 3.705068588256836,
+      "learning_rate": 1.2403820594409926e-05,
+      "loss": 1.875,
+      "step": 1057
+    },
+    {
+      "batch_num_effect_tokens": 10256,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52203,
+      "epoch": 0.96182,
+      "grad_norm": 2.9357504844665527,
+      "learning_rate": 1.238841618120769e-05,
+      "loss": 2.0334,
+      "step": 1058
+    },
+    {
+      "batch_num_effect_tokens": 6726,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 0.96273,
+      "grad_norm": 3.0499720573425293,
+      "learning_rate": 1.2373005755170563e-05,
+      "loss": 1.666,
+      "step": 1059
+    },
+    {
+      "batch_num_effect_tokens": 7094,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.96364,
+      "grad_norm": 3.0460546016693115,
+      "learning_rate": 1.2357589355094275e-05,
+      "loss": 1.9798,
+      "step": 1060
+    },
+    {
+      "batch_num_effect_tokens": 7578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.96455,
+      "grad_norm": 2.8582043647766113,
+      "learning_rate": 1.234216701978959e-05,
+      "loss": 1.9126,
+      "step": 1061
+    },
+    {
+      "batch_num_effect_tokens": 4946,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 0.96545,
+      "grad_norm": 2.9517149925231934,
+      "learning_rate": 1.2326738788082225e-05,
+      "loss": 1.2517,
+      "step": 1062
+    },
+    {
+      "batch_num_effect_tokens": 11793,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.96636,
+      "grad_norm": 2.5095744132995605,
+      "learning_rate": 1.2311304698812732e-05,
+      "loss": 2.041,
+      "step": 1063
+    },
+    {
+      "batch_num_effect_tokens": 5526,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 0.96727,
+      "grad_norm": 3.3237781524658203,
+      "learning_rate": 1.2295864790836411e-05,
+      "loss": 1.6071,
+      "step": 1064
+    },
+    {
+      "batch_num_effect_tokens": 5858,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 0.96818,
+      "grad_norm": 3.2829267978668213,
+      "learning_rate": 1.2280419103023219e-05,
+      "loss": 1.843,
+      "step": 1065
+    },
+    {
+      "batch_num_effect_tokens": 5569,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.96909,
+      "grad_norm": 3.221628189086914,
+      "learning_rate": 1.2264967674257647e-05,
+      "loss": 1.5967,
+      "step": 1066
+    },
+    {
+      "batch_num_effect_tokens": 6244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.97,
+      "grad_norm": 3.0191125869750977,
+      "learning_rate": 1.2249510543438652e-05,
+      "loss": 1.6961,
+      "step": 1067
+    },
+    {
+      "batch_num_effect_tokens": 6027,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.97091,
+      "grad_norm": 3.282532215118408,
+      "learning_rate": 1.2234047749479543e-05,
+      "loss": 1.8343,
+      "step": 1068
+    },
+    {
+      "batch_num_effect_tokens": 7141,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 0.97182,
+      "grad_norm": 2.938680648803711,
+      "learning_rate": 1.2218579331307889e-05,
+      "loss": 1.863,
+      "step": 1069
+    },
+    {
+      "batch_num_effect_tokens": 6113,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52094,
+      "epoch": 0.97273,
+      "grad_norm": 3.5186402797698975,
+      "learning_rate": 1.2203105327865407e-05,
+      "loss": 1.8756,
+      "step": 1070
+    },
+    {
+      "batch_num_effect_tokens": 6241,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52186,
+      "epoch": 0.97364,
+      "grad_norm": 3.062868118286133,
+      "learning_rate": 1.218762577810789e-05,
+      "loss": 1.526,
+      "step": 1071
+    },
+    {
+      "batch_num_effect_tokens": 5190,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.97455,
+      "grad_norm": 2.7961323261260986,
+      "learning_rate": 1.217214072100508e-05,
+      "loss": 1.3068,
+      "step": 1072
+    },
+    {
+      "batch_num_effect_tokens": 7362,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 0.97545,
+      "grad_norm": 2.909553289413452,
+      "learning_rate": 1.2156650195540592e-05,
+      "loss": 1.9114,
+      "step": 1073
+    },
+    {
+      "batch_num_effect_tokens": 7249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 0.97636,
+      "grad_norm": 2.9898719787597656,
+      "learning_rate": 1.2141154240711806e-05,
+      "loss": 2.0127,
+      "step": 1074
+    },
+    {
+      "batch_num_effect_tokens": 7332,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.97727,
+      "grad_norm": 2.9858486652374268,
+      "learning_rate": 1.2125652895529766e-05,
+      "loss": 2.0471,
+      "step": 1075
+    },
+    {
+      "batch_num_effect_tokens": 7555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.97818,
+      "grad_norm": 3.1468863487243652,
+      "learning_rate": 1.2110146199019099e-05,
+      "loss": 2.057,
+      "step": 1076
+    },
+    {
+      "batch_num_effect_tokens": 6093,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 0.97909,
+      "grad_norm": 3.0881295204162598,
+      "learning_rate": 1.2094634190217886e-05,
+      "loss": 1.6342,
+      "step": 1077
+    },
+    {
+      "batch_num_effect_tokens": 10563,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 0.98,
+      "grad_norm": 2.679718017578125,
+      "learning_rate": 1.2079116908177592e-05,
+      "loss": 2.127,
+      "step": 1078
+    },
+    {
+      "batch_num_effect_tokens": 5258,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 0.98091,
+      "grad_norm": 2.9763917922973633,
+      "learning_rate": 1.2063594391962963e-05,
+      "loss": 1.5558,
+      "step": 1079
+    },
+    {
+      "batch_num_effect_tokens": 4843,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 0.98182,
+      "grad_norm": 3.296865940093994,
+      "learning_rate": 1.2048066680651908e-05,
+      "loss": 1.571,
+      "step": 1080
+    },
+    {
+      "batch_num_effect_tokens": 5420,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 0.98273,
+      "grad_norm": 3.038447856903076,
+      "learning_rate": 1.2032533813335423e-05,
+      "loss": 1.5036,
+      "step": 1081
+    },
+    {
+      "batch_num_effect_tokens": 5103,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.98364,
+      "grad_norm": 3.19463849067688,
+      "learning_rate": 1.2016995829117489e-05,
+      "loss": 1.9069,
+      "step": 1082
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 0.98455,
+      "grad_norm": 3.001164197921753,
+      "learning_rate": 1.2001452767114952e-05,
+      "loss": 2.1403,
+      "step": 1083
+    },
+    {
+      "batch_num_effect_tokens": 6417,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 0.98545,
+      "grad_norm": 2.9088382720947266,
+      "learning_rate": 1.1985904666457455e-05,
+      "loss": 1.4724,
+      "step": 1084
+    },
+    {
+      "batch_num_effect_tokens": 5631,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 0.98636,
+      "grad_norm": 3.3566412925720215,
+      "learning_rate": 1.1970351566287332e-05,
+      "loss": 1.7628,
+      "step": 1085
+    },
+    {
+      "batch_num_effect_tokens": 6616,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 0.98727,
+      "grad_norm": 3.4458236694335938,
+      "learning_rate": 1.1954793505759484e-05,
+      "loss": 1.8574,
+      "step": 1086
+    },
+    {
+      "batch_num_effect_tokens": 5497,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 0.98818,
+      "grad_norm": 3.271362781524658,
+      "learning_rate": 1.1939230524041315e-05,
+      "loss": 1.7046,
+      "step": 1087
+    },
+    {
+      "batch_num_effect_tokens": 7787,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 0.98909,
+      "grad_norm": 3.0926859378814697,
+      "learning_rate": 1.1923662660312611e-05,
+      "loss": 1.8148,
+      "step": 1088
+    },
+    {
+      "batch_num_effect_tokens": 7477,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 0.99,
+      "grad_norm": 2.8267569541931152,
+      "learning_rate": 1.190808995376545e-05,
+      "loss": 1.9415,
+      "step": 1089
+    },
+    {
+      "batch_num_effect_tokens": 6118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 0.99091,
+      "grad_norm": 3.279690980911255,
+      "learning_rate": 1.1892512443604103e-05,
+      "loss": 1.4473,
+      "step": 1090
+    },
+    {
+      "batch_num_effect_tokens": 5825,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.99182,
+      "grad_norm": 2.950528383255005,
+      "learning_rate": 1.1876930169044935e-05,
+      "loss": 1.6913,
+      "step": 1091
+    },
+    {
+      "batch_num_effect_tokens": 5726,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 0.99273,
+      "grad_norm": 3.1925201416015625,
+      "learning_rate": 1.1861343169316301e-05,
+      "loss": 1.6325,
+      "step": 1092
+    },
+    {
+      "batch_num_effect_tokens": 5895,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 0.99364,
+      "grad_norm": 2.982030153274536,
+      "learning_rate": 1.1845751483658454e-05,
+      "loss": 1.5498,
+      "step": 1093
+    },
+    {
+      "batch_num_effect_tokens": 3728,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 0.99455,
+      "grad_norm": 3.6174166202545166,
+      "learning_rate": 1.1830155151323447e-05,
+      "loss": 1.2632,
+      "step": 1094
+    },
+    {
+      "batch_num_effect_tokens": 4942,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52127,
+      "epoch": 0.99545,
+      "grad_norm": 3.5724058151245117,
+      "learning_rate": 1.1814554211575026e-05,
+      "loss": 1.7337,
+      "step": 1095
+    },
+    {
+      "batch_num_effect_tokens": 8458,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52194,
+      "epoch": 0.99636,
+      "grad_norm": 2.827831268310547,
+      "learning_rate": 1.179894870368854e-05,
+      "loss": 2.0079,
+      "step": 1096
+    },
+    {
+      "batch_num_effect_tokens": 3842,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 0.99727,
+      "grad_norm": 3.108935832977295,
+      "learning_rate": 1.1783338666950832e-05,
+      "loss": 1.1461,
+      "step": 1097
+    },
+    {
+      "batch_num_effect_tokens": 5537,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 0.99818,
+      "grad_norm": 3.582533359527588,
+      "learning_rate": 1.1767724140660158e-05,
+      "loss": 1.7423,
+      "step": 1098
+    },
+    {
+      "batch_num_effect_tokens": 8780,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 0.99909,
+      "grad_norm": 3.0004284381866455,
+      "learning_rate": 1.1752105164126062e-05,
+      "loss": 1.9102,
+      "step": 1099
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.0,
+      "grad_norm": 3.2643775939941406,
+      "learning_rate": 1.1736481776669307e-05,
+      "loss": 1.7432,
+      "step": 1100
+    },
+    {
+      "batch_num_effect_tokens": 6503,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52110,
+      "epoch": 1.00091,
+      "grad_norm": 3.069766044616699,
+      "learning_rate": 1.1720854017621744e-05,
+      "loss": 0.9619,
+      "step": 1101
+    },
+    {
+      "batch_num_effect_tokens": 4830,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.00182,
+      "grad_norm": 3.834742307662964,
+      "learning_rate": 1.170522192632624e-05,
+      "loss": 1.0151,
+      "step": 1102
+    },
+    {
+      "batch_num_effect_tokens": 7808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.00273,
+      "grad_norm": 3.0445826053619385,
+      "learning_rate": 1.1689585542136568e-05,
+      "loss": 1.398,
+      "step": 1103
+    },
+    {
+      "batch_num_effect_tokens": 8222,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.00364,
+      "grad_norm": 3.089691162109375,
+      "learning_rate": 1.1673944904417309e-05,
+      "loss": 1.8757,
+      "step": 1104
+    },
+    {
+      "batch_num_effect_tokens": 5186,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.00455,
+      "grad_norm": 3.261866569519043,
+      "learning_rate": 1.1658300052543742e-05,
+      "loss": 1.0662,
+      "step": 1105
+    },
+    {
+      "batch_num_effect_tokens": 5068,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.00545,
+      "grad_norm": 3.276542901992798,
+      "learning_rate": 1.1642651025901772e-05,
+      "loss": 0.9319,
+      "step": 1106
+    },
+    {
+      "batch_num_effect_tokens": 4540,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.00636,
+      "grad_norm": 3.2273130416870117,
+      "learning_rate": 1.1626997863887801e-05,
+      "loss": 0.7239,
+      "step": 1107
+    },
+    {
+      "batch_num_effect_tokens": 4436,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.00727,
+      "grad_norm": 3.6009256839752197,
+      "learning_rate": 1.1611340605908643e-05,
+      "loss": 0.9413,
+      "step": 1108
+    },
+    {
+      "batch_num_effect_tokens": 7451,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.00818,
+      "grad_norm": 3.0169732570648193,
+      "learning_rate": 1.159567929138143e-05,
+      "loss": 1.0808,
+      "step": 1109
+    },
+    {
+      "batch_num_effect_tokens": 8093,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52217,
+      "epoch": 1.00909,
+      "grad_norm": 2.988909959793091,
+      "learning_rate": 1.15800139597335e-05,
+      "loss": 1.3387,
+      "step": 1110
+    },
+    {
+      "batch_num_effect_tokens": 7321,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 1.01,
+      "grad_norm": 2.798431396484375,
+      "learning_rate": 1.156434465040231e-05,
+      "loss": 0.8343,
+      "step": 1111
+    },
+    {
+      "batch_num_effect_tokens": 7842,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.01091,
+      "grad_norm": 3.4338691234588623,
+      "learning_rate": 1.1548671402835325e-05,
+      "loss": 1.2682,
+      "step": 1112
+    },
+    {
+      "batch_num_effect_tokens": 5664,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.01182,
+      "grad_norm": 3.4470508098602295,
+      "learning_rate": 1.1532994256489926e-05,
+      "loss": 0.8029,
+      "step": 1113
+    },
+    {
+      "batch_num_effect_tokens": 5989,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 1.01273,
+      "grad_norm": 3.639939069747925,
+      "learning_rate": 1.1517313250833318e-05,
+      "loss": 0.8502,
+      "step": 1114
+    },
+    {
+      "batch_num_effect_tokens": 5629,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.01364,
+      "grad_norm": 3.206958532333374,
+      "learning_rate": 1.1501628425342404e-05,
+      "loss": 0.8156,
+      "step": 1115
+    },
+    {
+      "batch_num_effect_tokens": 6388,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.01455,
+      "grad_norm": 3.441685676574707,
+      "learning_rate": 1.1485939819503717e-05,
+      "loss": 0.8009,
+      "step": 1116
+    },
+    {
+      "batch_num_effect_tokens": 5590,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.01545,
+      "grad_norm": 4.197703838348389,
+      "learning_rate": 1.147024747281331e-05,
+      "loss": 0.887,
+      "step": 1117
+    },
+    {
+      "batch_num_effect_tokens": 7665,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 1.01636,
+      "grad_norm": 3.792309284210205,
+      "learning_rate": 1.1454551424776636e-05,
+      "loss": 1.5214,
+      "step": 1118
+    },
+    {
+      "batch_num_effect_tokens": 5329,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 1.01727,
+      "grad_norm": 3.148326873779297,
+      "learning_rate": 1.1438851714908483e-05,
+      "loss": 0.5887,
+      "step": 1119
+    },
+    {
+      "batch_num_effect_tokens": 4729,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.01818,
+      "grad_norm": 3.497055768966675,
+      "learning_rate": 1.1423148382732854e-05,
+      "loss": 0.7,
+      "step": 1120
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52112,
+      "epoch": 1.01909,
+      "grad_norm": 3.3380677700042725,
+      "learning_rate": 1.1407441467782865e-05,
+      "loss": 1.3483,
+      "step": 1121
+    },
+    {
+      "batch_num_effect_tokens": 8053,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.02,
+      "grad_norm": 3.140716075897217,
+      "learning_rate": 1.1391731009600655e-05,
+      "loss": 1.4393,
+      "step": 1122
+    },
+    {
+      "batch_num_effect_tokens": 3696,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52090,
+      "epoch": 1.02091,
+      "grad_norm": 2.911566972732544,
+      "learning_rate": 1.1376017047737292e-05,
+      "loss": 0.5999,
+      "step": 1123
+    },
+    {
+      "batch_num_effect_tokens": 8184,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.02182,
+      "grad_norm": 2.7683959007263184,
+      "learning_rate": 1.1360299621752644e-05,
+      "loss": 1.0861,
+      "step": 1124
+    },
+    {
+      "batch_num_effect_tokens": 8803,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 1.02273,
+      "grad_norm": 3.8152406215667725,
+      "learning_rate": 1.1344578771215319e-05,
+      "loss": 2.0837,
+      "step": 1125
+    },
+    {
+      "batch_num_effect_tokens": 4662,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.02364,
+      "grad_norm": 3.0464205741882324,
+      "learning_rate": 1.1328854535702542e-05,
+      "loss": 0.986,
+      "step": 1126
+    },
+    {
+      "batch_num_effect_tokens": 8958,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.02455,
+      "grad_norm": 2.634942054748535,
+      "learning_rate": 1.1313126954800053e-05,
+      "loss": 1.2716,
+      "step": 1127
+    },
+    {
+      "batch_num_effect_tokens": 6304,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.02545,
+      "grad_norm": 3.5925192832946777,
+      "learning_rate": 1.1297396068102019e-05,
+      "loss": 1.7713,
+      "step": 1128
+    },
+    {
+      "batch_num_effect_tokens": 6005,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.02636,
+      "grad_norm": 2.9206326007843018,
+      "learning_rate": 1.1281661915210931e-05,
+      "loss": 0.9858,
+      "step": 1129
+    },
+    {
+      "batch_num_effect_tokens": 5744,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.02727,
+      "grad_norm": 3.2044382095336914,
+      "learning_rate": 1.1265924535737494e-05,
+      "loss": 1.0135,
+      "step": 1130
+    },
+    {
+      "batch_num_effect_tokens": 4729,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.02818,
+      "grad_norm": 3.457561492919922,
+      "learning_rate": 1.1250183969300547e-05,
+      "loss": 0.6748,
+      "step": 1131
+    },
+    {
+      "batch_num_effect_tokens": 6517,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 1.02909,
+      "grad_norm": 3.122835874557495,
+      "learning_rate": 1.1234440255526948e-05,
+      "loss": 1.1057,
+      "step": 1132
+    },
+    {
+      "batch_num_effect_tokens": 8301,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52218,
+      "epoch": 1.03,
+      "grad_norm": 3.4122023582458496,
+      "learning_rate": 1.1218693434051475e-05,
+      "loss": 1.5586,
+      "step": 1133
+    },
+    {
+      "batch_num_effect_tokens": 5158,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.03091,
+      "grad_norm": 3.544649600982666,
+      "learning_rate": 1.1202943544516736e-05,
+      "loss": 0.9804,
+      "step": 1134
+    },
+    {
+      "batch_num_effect_tokens": 5538,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.03182,
+      "grad_norm": 2.7127685546875,
+      "learning_rate": 1.1187190626573052e-05,
+      "loss": 0.6383,
+      "step": 1135
+    },
+    {
+      "batch_num_effect_tokens": 6253,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.03273,
+      "grad_norm": 3.0972988605499268,
+      "learning_rate": 1.1171434719878385e-05,
+      "loss": 1.0666,
+      "step": 1136
+    },
+    {
+      "batch_num_effect_tokens": 5190,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.03364,
+      "grad_norm": 2.642263650894165,
+      "learning_rate": 1.11556758640982e-05,
+      "loss": 0.5828,
+      "step": 1137
+    },
+    {
+      "batch_num_effect_tokens": 7533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.03455,
+      "grad_norm": 3.2424395084381104,
+      "learning_rate": 1.1139914098905406e-05,
+      "loss": 1.219,
+      "step": 1138
+    },
+    {
+      "batch_num_effect_tokens": 7229,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 1.03545,
+      "grad_norm": 3.8125650882720947,
+      "learning_rate": 1.112414946398023e-05,
+      "loss": 1.1979,
+      "step": 1139
+    },
+    {
+      "batch_num_effect_tokens": 6539,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.03636,
+      "grad_norm": 3.3987605571746826,
+      "learning_rate": 1.1108381999010111e-05,
+      "loss": 1.1991,
+      "step": 1140
+    },
+    {
+      "batch_num_effect_tokens": 5621,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.03727,
+      "grad_norm": 3.7911298274993896,
+      "learning_rate": 1.1092611743689632e-05,
+      "loss": 0.9732,
+      "step": 1141
+    },
+    {
+      "batch_num_effect_tokens": 5192,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.03818,
+      "grad_norm": 4.324377536773682,
+      "learning_rate": 1.1076838737720392e-05,
+      "loss": 1.5059,
+      "step": 1142
+    },
+    {
+      "batch_num_effect_tokens": 6355,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.03909,
+      "grad_norm": 3.5138485431671143,
+      "learning_rate": 1.1061063020810909e-05,
+      "loss": 1.1693,
+      "step": 1143
+    },
+    {
+      "batch_num_effect_tokens": 5902,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52102,
+      "epoch": 1.04,
+      "grad_norm": 2.956238031387329,
+      "learning_rate": 1.1045284632676535e-05,
+      "loss": 1.0026,
+      "step": 1144
+    },
+    {
+      "batch_num_effect_tokens": 7857,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.04091,
+      "grad_norm": 3.6880264282226562,
+      "learning_rate": 1.1029503613039347e-05,
+      "loss": 1.8397,
+      "step": 1145
+    },
+    {
+      "batch_num_effect_tokens": 5948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.04182,
+      "grad_norm": 2.982151508331299,
+      "learning_rate": 1.1013720001628034e-05,
+      "loss": 1.1197,
+      "step": 1146
+    },
+    {
+      "batch_num_effect_tokens": 3216,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.04273,
+      "grad_norm": 3.204832077026367,
+      "learning_rate": 1.0997933838177828e-05,
+      "loss": 0.6265,
+      "step": 1147
+    },
+    {
+      "batch_num_effect_tokens": 8085,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 1.04364,
+      "grad_norm": 3.233100414276123,
+      "learning_rate": 1.0982145162430373e-05,
+      "loss": 1.432,
+      "step": 1148
+    },
+    {
+      "batch_num_effect_tokens": 8282,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.04455,
+      "grad_norm": 3.0157642364501953,
+      "learning_rate": 1.096635401413364e-05,
+      "loss": 1.2412,
+      "step": 1149
+    },
+    {
+      "batch_num_effect_tokens": 5305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.04545,
+      "grad_norm": 3.3882460594177246,
+      "learning_rate": 1.0950560433041825e-05,
+      "loss": 0.9461,
+      "step": 1150
+    },
+    {
+      "batch_num_effect_tokens": 4103,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.04636,
+      "grad_norm": 3.6615240573883057,
+      "learning_rate": 1.0934764458915258e-05,
+      "loss": 0.8385,
+      "step": 1151
+    },
+    {
+      "batch_num_effect_tokens": 6726,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 1.04727,
+      "grad_norm": 2.953944206237793,
+      "learning_rate": 1.0918966131520276e-05,
+      "loss": 0.9336,
+      "step": 1152
+    },
+    {
+      "batch_num_effect_tokens": 6332,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.04818,
+      "grad_norm": 4.152585506439209,
+      "learning_rate": 1.0903165490629152e-05,
+      "loss": 1.6518,
+      "step": 1153
+    },
+    {
+      "batch_num_effect_tokens": 6578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.04909,
+      "grad_norm": 3.0139248371124268,
+      "learning_rate": 1.0887362576019981e-05,
+      "loss": 0.9943,
+      "step": 1154
+    },
+    {
+      "batch_num_effect_tokens": 6072,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.05,
+      "grad_norm": 3.005803108215332,
+      "learning_rate": 1.0871557427476585e-05,
+      "loss": 0.9334,
+      "step": 1155
+    },
+    {
+      "batch_num_effect_tokens": 5623,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.05091,
+      "grad_norm": 3.361359119415283,
+      "learning_rate": 1.08557500847884e-05,
+      "loss": 0.9286,
+      "step": 1156
+    },
+    {
+      "batch_num_effect_tokens": 6921,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52212,
+      "epoch": 1.05182,
+      "grad_norm": 3.7183468341827393,
+      "learning_rate": 1.0839940587750394e-05,
+      "loss": 1.3418,
+      "step": 1157
+    },
+    {
+      "batch_num_effect_tokens": 7384,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52128,
+      "epoch": 1.05273,
+      "grad_norm": 8.443079948425293,
+      "learning_rate": 1.0824128976162964e-05,
+      "loss": 1.4034,
+      "step": 1158
+    },
+    {
+      "batch_num_effect_tokens": 7243,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.05364,
+      "grad_norm": 2.8543872833251953,
+      "learning_rate": 1.0808315289831814e-05,
+      "loss": 0.925,
+      "step": 1159
+    },
+    {
+      "batch_num_effect_tokens": 8632,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 1.05455,
+      "grad_norm": 2.774327516555786,
+      "learning_rate": 1.0792499568567885e-05,
+      "loss": 1.2081,
+      "step": 1160
+    },
+    {
+      "batch_num_effect_tokens": 7412,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 1.05545,
+      "grad_norm": 3.0567374229431152,
+      "learning_rate": 1.0776681852187239e-05,
+      "loss": 1.1328,
+      "step": 1161
+    },
+    {
+      "batch_num_effect_tokens": 4646,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.05636,
+      "grad_norm": 2.918250560760498,
+      "learning_rate": 1.076086218051095e-05,
+      "loss": 0.5601,
+      "step": 1162
+    },
+    {
+      "batch_num_effect_tokens": 5703,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.05727,
+      "grad_norm": 3.635793924331665,
+      "learning_rate": 1.0745040593365032e-05,
+      "loss": 1.2372,
+      "step": 1163
+    },
+    {
+      "batch_num_effect_tokens": 6002,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.05818,
+      "grad_norm": 3.530992269515991,
+      "learning_rate": 1.0729217130580309e-05,
+      "loss": 1.1045,
+      "step": 1164
+    },
+    {
+      "batch_num_effect_tokens": 6150,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.05909,
+      "grad_norm": 3.4126088619232178,
+      "learning_rate": 1.0713391831992324e-05,
+      "loss": 1.3539,
+      "step": 1165
+    },
+    {
+      "batch_num_effect_tokens": 7142,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52214,
+      "epoch": 1.06,
+      "grad_norm": 2.9573259353637695,
+      "learning_rate": 1.0697564737441254e-05,
+      "loss": 1.144,
+      "step": 1166
+    },
+    {
+      "batch_num_effect_tokens": 6675,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.06091,
+      "grad_norm": 3.1278979778289795,
+      "learning_rate": 1.068173588677179e-05,
+      "loss": 1.0402,
+      "step": 1167
+    },
+    {
+      "batch_num_effect_tokens": 6747,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.06182,
+      "grad_norm": 4.912282943725586,
+      "learning_rate": 1.066590531983304e-05,
+      "loss": 1.7302,
+      "step": 1168
+    },
+    {
+      "batch_num_effect_tokens": 6328,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.06273,
+      "grad_norm": 3.3606529235839844,
+      "learning_rate": 1.0650073076478442e-05,
+      "loss": 1.3774,
+      "step": 1169
+    },
+    {
+      "batch_num_effect_tokens": 4477,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.06364,
+      "grad_norm": 2.675183057785034,
+      "learning_rate": 1.0634239196565646e-05,
+      "loss": 0.5866,
+      "step": 1170
+    },
+    {
+      "batch_num_effect_tokens": 8367,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 1.06455,
+      "grad_norm": 2.8494794368743896,
+      "learning_rate": 1.0618403719956431e-05,
+      "loss": 1.1782,
+      "step": 1171
+    },
+    {
+      "batch_num_effect_tokens": 6840,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.06545,
+      "grad_norm": 3.3997509479522705,
+      "learning_rate": 1.0602566686516586e-05,
+      "loss": 1.3806,
+      "step": 1172
+    },
+    {
+      "batch_num_effect_tokens": 6867,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.06636,
+      "grad_norm": 2.9302570819854736,
+      "learning_rate": 1.0586728136115824e-05,
+      "loss": 1.0178,
+      "step": 1173
+    },
+    {
+      "batch_num_effect_tokens": 5868,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.06727,
+      "grad_norm": 2.8624753952026367,
+      "learning_rate": 1.0570888108627682e-05,
+      "loss": 0.7767,
+      "step": 1174
+    },
+    {
+      "batch_num_effect_tokens": 5831,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.06818,
+      "grad_norm": 3.2073261737823486,
+      "learning_rate": 1.0555046643929402e-05,
+      "loss": 1.0023,
+      "step": 1175
+    },
+    {
+      "batch_num_effect_tokens": 6077,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.06909,
+      "grad_norm": 3.3000328540802,
+      "learning_rate": 1.053920378190186e-05,
+      "loss": 1.2256,
+      "step": 1176
+    },
+    {
+      "batch_num_effect_tokens": 7508,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52099,
+      "epoch": 1.07,
+      "grad_norm": 2.593930959701538,
+      "learning_rate": 1.0523359562429441e-05,
+      "loss": 0.7808,
+      "step": 1177
+    },
+    {
+      "batch_num_effect_tokens": 7353,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.07091,
+      "grad_norm": 3.004465341567993,
+      "learning_rate": 1.0507514025399944e-05,
+      "loss": 1.1786,
+      "step": 1178
+    },
+    {
+      "batch_num_effect_tokens": 7435,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.07182,
+      "grad_norm": 2.8977913856506348,
+      "learning_rate": 1.0491667210704492e-05,
+      "loss": 1.0343,
+      "step": 1179
+    },
+    {
+      "batch_num_effect_tokens": 5870,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.07273,
+      "grad_norm": 3.019353151321411,
+      "learning_rate": 1.0475819158237426e-05,
+      "loss": 0.8489,
+      "step": 1180
+    },
+    {
+      "batch_num_effect_tokens": 5977,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 1.07364,
+      "grad_norm": 3.124189615249634,
+      "learning_rate": 1.0459969907896193e-05,
+      "loss": 0.7553,
+      "step": 1181
+    },
+    {
+      "batch_num_effect_tokens": 7198,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.07455,
+      "grad_norm": 4.0742387771606445,
+      "learning_rate": 1.0444119499581263e-05,
+      "loss": 1.6221,
+      "step": 1182
+    },
+    {
+      "batch_num_effect_tokens": 7704,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.07545,
+      "grad_norm": 2.7200570106506348,
+      "learning_rate": 1.0428267973196027e-05,
+      "loss": 0.794,
+      "step": 1183
+    },
+    {
+      "batch_num_effect_tokens": 8035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.07636,
+      "grad_norm": 3.0718557834625244,
+      "learning_rate": 1.0412415368646674e-05,
+      "loss": 1.0329,
+      "step": 1184
+    },
+    {
+      "batch_num_effect_tokens": 5666,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 1.07727,
+      "grad_norm": 3.2390568256378174,
+      "learning_rate": 1.0396561725842124e-05,
+      "loss": 0.6177,
+      "step": 1185
+    },
+    {
+      "batch_num_effect_tokens": 6701,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.07818,
+      "grad_norm": 3.020876884460449,
+      "learning_rate": 1.0380707084693902e-05,
+      "loss": 1.1314,
+      "step": 1186
+    },
+    {
+      "batch_num_effect_tokens": 7035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.07909,
+      "grad_norm": 4.152306079864502,
+      "learning_rate": 1.0364851485116047e-05,
+      "loss": 1.1334,
+      "step": 1187
+    },
+    {
+      "batch_num_effect_tokens": 8179,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.08,
+      "grad_norm": 3.2629177570343018,
+      "learning_rate": 1.0348994967025012e-05,
+      "loss": 1.4448,
+      "step": 1188
+    },
+    {
+      "batch_num_effect_tokens": 9174,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.08091,
+      "grad_norm": 3.2266151905059814,
+      "learning_rate": 1.0333137570339563e-05,
+      "loss": 1.2867,
+      "step": 1189
+    },
+    {
+      "batch_num_effect_tokens": 6245,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.08182,
+      "grad_norm": 3.2580463886260986,
+      "learning_rate": 1.031727933498068e-05,
+      "loss": 0.9441,
+      "step": 1190
+    },
+    {
+      "batch_num_effect_tokens": 7607,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.08273,
+      "grad_norm": 2.951713800430298,
+      "learning_rate": 1.0301420300871445e-05,
+      "loss": 1.109,
+      "step": 1191
+    },
+    {
+      "batch_num_effect_tokens": 7477,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.08364,
+      "grad_norm": 2.900982618331909,
+      "learning_rate": 1.0285560507936962e-05,
+      "loss": 1.1732,
+      "step": 1192
+    },
+    {
+      "batch_num_effect_tokens": 5992,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.08455,
+      "grad_norm": 4.045762538909912,
+      "learning_rate": 1.0269699996104246e-05,
+      "loss": 1.3233,
+      "step": 1193
+    },
+    {
+      "batch_num_effect_tokens": 5358,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.08545,
+      "grad_norm": 3.2340285778045654,
+      "learning_rate": 1.0253838805302106e-05,
+      "loss": 0.5123,
+      "step": 1194
+    },
+    {
+      "batch_num_effect_tokens": 8761,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 1.08636,
+      "grad_norm": 3.4777562618255615,
+      "learning_rate": 1.0237976975461074e-05,
+      "loss": 1.467,
+      "step": 1195
+    },
+    {
+      "batch_num_effect_tokens": 8088,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.08727,
+      "grad_norm": 3.107084274291992,
+      "learning_rate": 1.0222114546513296e-05,
+      "loss": 1.2546,
+      "step": 1196
+    },
+    {
+      "batch_num_effect_tokens": 7250,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.08818,
+      "grad_norm": 3.5884499549865723,
+      "learning_rate": 1.0206251558392408e-05,
+      "loss": 1.443,
+      "step": 1197
+    },
+    {
+      "batch_num_effect_tokens": 9426,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 1.08909,
+      "grad_norm": 2.9463024139404297,
+      "learning_rate": 1.0190388051033466e-05,
+      "loss": 1.3789,
+      "step": 1198
+    },
+    {
+      "batch_num_effect_tokens": 6048,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.09,
+      "grad_norm": 3.3203890323638916,
+      "learning_rate": 1.0174524064372837e-05,
+      "loss": 1.1597,
+      "step": 1199
+    },
+    {
+      "batch_num_effect_tokens": 4853,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52202,
+      "epoch": 1.09091,
+      "grad_norm": 3.7079813480377197,
+      "learning_rate": 1.015865963834808e-05,
+      "loss": 1.1309,
+      "step": 1200
+    },
+    {
+      "batch_num_effect_tokens": 5824,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.09182,
+      "grad_norm": 3.3707385063171387,
+      "learning_rate": 1.0142794812897874e-05,
+      "loss": 1.0394,
+      "step": 1201
+    },
+    {
+      "batch_num_effect_tokens": 10838,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.09273,
+      "grad_norm": 3.0608296394348145,
+      "learning_rate": 1.0126929627961896e-05,
+      "loss": 1.5519,
+      "step": 1202
+    },
+    {
+      "batch_num_effect_tokens": 5268,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.09364,
+      "grad_norm": 3.28275728225708,
+      "learning_rate": 1.0111064123480734e-05,
+      "loss": 0.8913,
+      "step": 1203
+    },
+    {
+      "batch_num_effect_tokens": 8278,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.09455,
+      "grad_norm": 3.032092332839966,
+      "learning_rate": 1.0095198339395769e-05,
+      "loss": 1.3646,
+      "step": 1204
+    },
+    {
+      "batch_num_effect_tokens": 5723,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 1.09545,
+      "grad_norm": 3.0130598545074463,
+      "learning_rate": 1.0079332315649097e-05,
+      "loss": 1.1537,
+      "step": 1205
+    },
+    {
+      "batch_num_effect_tokens": 5837,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.09636,
+      "grad_norm": 3.227560520172119,
+      "learning_rate": 1.006346609218342e-05,
+      "loss": 1.2023,
+      "step": 1206
+    },
+    {
+      "batch_num_effect_tokens": 7533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.09727,
+      "grad_norm": 3.0318572521209717,
+      "learning_rate": 1.0047599708941926e-05,
+      "loss": 1.3757,
+      "step": 1207
+    },
+    {
+      "batch_num_effect_tokens": 6667,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.09818,
+      "grad_norm": 3.773564338684082,
+      "learning_rate": 1.0031733205868223e-05,
+      "loss": 0.7025,
+      "step": 1208
+    },
+    {
+      "batch_num_effect_tokens": 9942,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.09909,
+      "grad_norm": 2.6167032718658447,
+      "learning_rate": 1.0015866622906216e-05,
+      "loss": 1.2315,
+      "step": 1209
+    },
+    {
+      "batch_num_effect_tokens": 5369,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 1.1,
+      "grad_norm": 3.179255485534668,
+      "learning_rate": 1e-05,
+      "loss": 0.689,
+      "step": 1210
+    },
+    {
+      "batch_num_effect_tokens": 7136,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.10091,
+      "grad_norm": 2.66573429107666,
+      "learning_rate": 9.98413337709379e-06,
+      "loss": 1.0452,
+      "step": 1211
+    },
+    {
+      "batch_num_effect_tokens": 6263,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.10182,
+      "grad_norm": 3.0495479106903076,
+      "learning_rate": 9.968266794131778e-06,
+      "loss": 1.0476,
+      "step": 1212
+    },
+    {
+      "batch_num_effect_tokens": 6082,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.10273,
+      "grad_norm": 3.077975273132324,
+      "learning_rate": 9.952400291058078e-06,
+      "loss": 1.0274,
+      "step": 1213
+    },
+    {
+      "batch_num_effect_tokens": 4148,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.10364,
+      "grad_norm": 2.852982521057129,
+      "learning_rate": 9.936533907816583e-06,
+      "loss": 0.5047,
+      "step": 1214
+    },
+    {
+      "batch_num_effect_tokens": 7908,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52213,
+      "epoch": 1.10455,
+      "grad_norm": 3.789891481399536,
+      "learning_rate": 9.920667684350906e-06,
+      "loss": 1.5248,
+      "step": 1215
+    },
+    {
+      "batch_num_effect_tokens": 6645,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.10545,
+      "grad_norm": 3.0531599521636963,
+      "learning_rate": 9.904801660604234e-06,
+      "loss": 0.9681,
+      "step": 1216
+    },
+    {
+      "batch_num_effect_tokens": 7027,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.10636,
+      "grad_norm": 4.8088788986206055,
+      "learning_rate": 9.888935876519272e-06,
+      "loss": 1.9039,
+      "step": 1217
+    },
+    {
+      "batch_num_effect_tokens": 6049,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.10727,
+      "grad_norm": 3.2031102180480957,
+      "learning_rate": 9.873070372038106e-06,
+      "loss": 0.8509,
+      "step": 1218
+    },
+    {
+      "batch_num_effect_tokens": 6047,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.10818,
+      "grad_norm": 3.8017449378967285,
+      "learning_rate": 9.85720518710213e-06,
+      "loss": 1.3184,
+      "step": 1219
+    },
+    {
+      "batch_num_effect_tokens": 7350,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.10909,
+      "grad_norm": 3.482895612716675,
+      "learning_rate": 9.841340361651921e-06,
+      "loss": 1.4786,
+      "step": 1220
+    },
+    {
+      "batch_num_effect_tokens": 3934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.11,
+      "grad_norm": 3.3409523963928223,
+      "learning_rate": 9.825475935627165e-06,
+      "loss": 0.5001,
+      "step": 1221
+    },
+    {
+      "batch_num_effect_tokens": 7873,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.11091,
+      "grad_norm": 3.9254238605499268,
+      "learning_rate": 9.809611948966534e-06,
+      "loss": 2.0723,
+      "step": 1222
+    },
+    {
+      "batch_num_effect_tokens": 6719,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52128,
+      "epoch": 1.11182,
+      "grad_norm": 3.5105459690093994,
+      "learning_rate": 9.793748441607595e-06,
+      "loss": 1.5862,
+      "step": 1223
+    },
+    {
+      "batch_num_effect_tokens": 5100,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.11273,
+      "grad_norm": 2.8462579250335693,
+      "learning_rate": 9.777885453486706e-06,
+      "loss": 0.6737,
+      "step": 1224
+    },
+    {
+      "batch_num_effect_tokens": 8851,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 1.11364,
+      "grad_norm": 3.0171377658843994,
+      "learning_rate": 9.762023024538928e-06,
+      "loss": 1.5563,
+      "step": 1225
+    },
+    {
+      "batch_num_effect_tokens": 8092,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.11455,
+      "grad_norm": 2.975921154022217,
+      "learning_rate": 9.746161194697895e-06,
+      "loss": 1.3743,
+      "step": 1226
+    },
+    {
+      "batch_num_effect_tokens": 9265,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.11545,
+      "grad_norm": 2.8814196586608887,
+      "learning_rate": 9.73030000389576e-06,
+      "loss": 1.4079,
+      "step": 1227
+    },
+    {
+      "batch_num_effect_tokens": 5494,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.11636,
+      "grad_norm": 2.375429391860962,
+      "learning_rate": 9.71443949206304e-06,
+      "loss": 0.4457,
+      "step": 1228
+    },
+    {
+      "batch_num_effect_tokens": 7578,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52141,
+      "epoch": 1.11727,
+      "grad_norm": 2.9076132774353027,
+      "learning_rate": 9.698579699128557e-06,
+      "loss": 1.1299,
+      "step": 1229
+    },
+    {
+      "batch_num_effect_tokens": 5535,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52136,
+      "epoch": 1.11818,
+      "grad_norm": 3.1913745403289795,
+      "learning_rate": 9.682720665019325e-06,
+      "loss": 0.9203,
+      "step": 1230
+    },
+    {
+      "batch_num_effect_tokens": 8991,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52202,
+      "epoch": 1.11909,
+      "grad_norm": 3.0180065631866455,
+      "learning_rate": 9.66686242966044e-06,
+      "loss": 1.6461,
+      "step": 1231
+    },
+    {
+      "batch_num_effect_tokens": 10561,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.12,
+      "grad_norm": 2.6574461460113525,
+      "learning_rate": 9.651005032974994e-06,
+      "loss": 1.3578,
+      "step": 1232
+    },
+    {
+      "batch_num_effect_tokens": 6736,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 1.12091,
+      "grad_norm": 3.393181324005127,
+      "learning_rate": 9.635148514883956e-06,
+      "loss": 1.4798,
+      "step": 1233
+    },
+    {
+      "batch_num_effect_tokens": 7173,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.12182,
+      "grad_norm": 2.7249555587768555,
+      "learning_rate": 9.619292915306103e-06,
+      "loss": 0.9275,
+      "step": 1234
+    },
+    {
+      "batch_num_effect_tokens": 7145,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.12273,
+      "grad_norm": 2.728337526321411,
+      "learning_rate": 9.603438274157878e-06,
+      "loss": 0.8287,
+      "step": 1235
+    },
+    {
+      "batch_num_effect_tokens": 5026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.12364,
+      "grad_norm": 3.712592363357544,
+      "learning_rate": 9.58758463135333e-06,
+      "loss": 0.8681,
+      "step": 1236
+    },
+    {
+      "batch_num_effect_tokens": 6123,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.12455,
+      "grad_norm": 3.236494541168213,
+      "learning_rate": 9.571732026803978e-06,
+      "loss": 0.9514,
+      "step": 1237
+    },
+    {
+      "batch_num_effect_tokens": 5858,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.12545,
+      "grad_norm": 3.327521562576294,
+      "learning_rate": 9.555880500418739e-06,
+      "loss": 1.0204,
+      "step": 1238
+    },
+    {
+      "batch_num_effect_tokens": 5589,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.12636,
+      "grad_norm": 3.551295518875122,
+      "learning_rate": 9.540030092103809e-06,
+      "loss": 0.9061,
+      "step": 1239
+    },
+    {
+      "batch_num_effect_tokens": 3728,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.12727,
+      "grad_norm": 3.2720189094543457,
+      "learning_rate": 9.524180841762577e-06,
+      "loss": 0.4908,
+      "step": 1240
+    },
+    {
+      "batch_num_effect_tokens": 6744,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.12818,
+      "grad_norm": 3.4976143836975098,
+      "learning_rate": 9.50833278929551e-06,
+      "loss": 1.0591,
+      "step": 1241
+    },
+    {
+      "batch_num_effect_tokens": 7771,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.12909,
+      "grad_norm": 4.257693290710449,
+      "learning_rate": 9.49248597460006e-06,
+      "loss": 1.575,
+      "step": 1242
+    },
+    {
+      "batch_num_effect_tokens": 4107,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 1.13,
+      "grad_norm": 3.3101322650909424,
+      "learning_rate": 9.476640437570562e-06,
+      "loss": 0.6062,
+      "step": 1243
+    },
+    {
+      "batch_num_effect_tokens": 7747,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.13091,
+      "grad_norm": 3.23077654838562,
+      "learning_rate": 9.460796218098143e-06,
+      "loss": 1.1486,
+      "step": 1244
+    },
+    {
+      "batch_num_effect_tokens": 9415,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.13182,
+      "grad_norm": 3.416395902633667,
+      "learning_rate": 9.444953356070601e-06,
+      "loss": 1.6655,
+      "step": 1245
+    },
+    {
+      "batch_num_effect_tokens": 5839,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.13273,
+      "grad_norm": 5.638548374176025,
+      "learning_rate": 9.42911189137232e-06,
+      "loss": 1.4627,
+      "step": 1246
+    },
+    {
+      "batch_num_effect_tokens": 5462,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50560,
+      "epoch": 1.13364,
+      "grad_norm": 3.1694421768188477,
+      "learning_rate": 9.413271863884177e-06,
+      "loss": 1.0307,
+      "step": 1247
+    },
+    {
+      "batch_num_effect_tokens": 5970,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52112,
+      "epoch": 1.13455,
+      "grad_norm": 3.264805555343628,
+      "learning_rate": 9.397433313483417e-06,
+      "loss": 1.1321,
+      "step": 1248
+    },
+    {
+      "batch_num_effect_tokens": 4547,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.13545,
+      "grad_norm": 3.039553642272949,
+      "learning_rate": 9.381596280043574e-06,
+      "loss": 0.5796,
+      "step": 1249
+    },
+    {
+      "batch_num_effect_tokens": 4459,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 1.13636,
+      "grad_norm": 3.0265426635742188,
+      "learning_rate": 9.365760803434356e-06,
+      "loss": 0.6882,
+      "step": 1250
+    },
+    {
+      "batch_num_effect_tokens": 6473,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.13727,
+      "grad_norm": 3.441258430480957,
+      "learning_rate": 9.349926923521563e-06,
+      "loss": 1.3783,
+      "step": 1251
+    },
+    {
+      "batch_num_effect_tokens": 4856,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50540,
+      "epoch": 1.13818,
+      "grad_norm": 3.1589627265930176,
+      "learning_rate": 9.334094680166962e-06,
+      "loss": 0.6254,
+      "step": 1252
+    },
+    {
+      "batch_num_effect_tokens": 5519,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.13909,
+      "grad_norm": 2.8274736404418945,
+      "learning_rate": 9.318264113228215e-06,
+      "loss": 0.5885,
+      "step": 1253
+    },
+    {
+      "batch_num_effect_tokens": 9360,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.14,
+      "grad_norm": 2.875990867614746,
+      "learning_rate": 9.302435262558748e-06,
+      "loss": 1.666,
+      "step": 1254
+    },
+    {
+      "batch_num_effect_tokens": 9990,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.14091,
+      "grad_norm": 2.228811264038086,
+      "learning_rate": 9.286608168007678e-06,
+      "loss": 1.0264,
+      "step": 1255
+    },
+    {
+      "batch_num_effect_tokens": 5826,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.14182,
+      "grad_norm": 3.2085955142974854,
+      "learning_rate": 9.270782869419694e-06,
+      "loss": 0.768,
+      "step": 1256
+    },
+    {
+      "batch_num_effect_tokens": 6325,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.14273,
+      "grad_norm": 3.118927001953125,
+      "learning_rate": 9.25495940663497e-06,
+      "loss": 1.1966,
+      "step": 1257
+    },
+    {
+      "batch_num_effect_tokens": 7294,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.14364,
+      "grad_norm": 2.9526238441467285,
+      "learning_rate": 9.239137819489047e-06,
+      "loss": 1.0929,
+      "step": 1258
+    },
+    {
+      "batch_num_effect_tokens": 4746,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50509,
+      "epoch": 1.14455,
+      "grad_norm": 3.432528257369995,
+      "learning_rate": 9.223318147812765e-06,
+      "loss": 0.8558,
+      "step": 1259
+    },
+    {
+      "batch_num_effect_tokens": 6433,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 1.14545,
+      "grad_norm": 3.367201328277588,
+      "learning_rate": 9.207500431432115e-06,
+      "loss": 1.213,
+      "step": 1260
+    },
+    {
+      "batch_num_effect_tokens": 8317,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.14636,
+      "grad_norm": 3.025263786315918,
+      "learning_rate": 9.191684710168188e-06,
+      "loss": 1.375,
+      "step": 1261
+    },
+    {
+      "batch_num_effect_tokens": 8129,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.14727,
+      "grad_norm": 3.154592514038086,
+      "learning_rate": 9.175871023837042e-06,
+      "loss": 1.2919,
+      "step": 1262
+    },
+    {
+      "batch_num_effect_tokens": 7062,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.14818,
+      "grad_norm": 3.2619194984436035,
+      "learning_rate": 9.160059412249607e-06,
+      "loss": 1.1456,
+      "step": 1263
+    },
+    {
+      "batch_num_effect_tokens": 6844,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.14909,
+      "grad_norm": 3.1822962760925293,
+      "learning_rate": 9.144249915211605e-06,
+      "loss": 1.1395,
+      "step": 1264
+    },
+    {
+      "batch_num_effect_tokens": 6011,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.15,
+      "grad_norm": 3.5489284992218018,
+      "learning_rate": 9.128442572523418e-06,
+      "loss": 1.1341,
+      "step": 1265
+    },
+    {
+      "batch_num_effect_tokens": 5420,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.15091,
+      "grad_norm": 2.9631083011627197,
+      "learning_rate": 9.11263742398002e-06,
+      "loss": 0.7856,
+      "step": 1266
+    },
+    {
+      "batch_num_effect_tokens": 7156,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50569,
+      "epoch": 1.15182,
+      "grad_norm": 4.286734104156494,
+      "learning_rate": 9.09683450937085e-06,
+      "loss": 2.0762,
+      "step": 1267
+    },
+    {
+      "batch_num_effect_tokens": 9164,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.15273,
+      "grad_norm": 2.735157012939453,
+      "learning_rate": 9.081033868479727e-06,
+      "loss": 1.132,
+      "step": 1268
+    },
+    {
+      "batch_num_effect_tokens": 7674,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.15364,
+      "grad_norm": 2.8595211505889893,
+      "learning_rate": 9.065235541084745e-06,
+      "loss": 1.0823,
+      "step": 1269
+    },
+    {
+      "batch_num_effect_tokens": 5831,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.15455,
+      "grad_norm": 2.9400060176849365,
+      "learning_rate": 9.049439566958176e-06,
+      "loss": 0.7926,
+      "step": 1270
+    },
+    {
+      "batch_num_effect_tokens": 7135,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.15545,
+      "grad_norm": 2.6830992698669434,
+      "learning_rate": 9.033645985866361e-06,
+      "loss": 0.9602,
+      "step": 1271
+    },
+    {
+      "batch_num_effect_tokens": 8934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.15636,
+      "grad_norm": 3.0802054405212402,
+      "learning_rate": 9.017854837569629e-06,
+      "loss": 1.4231,
+      "step": 1272
+    },
+    {
+      "batch_num_effect_tokens": 7525,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.15727,
+      "grad_norm": 2.9055581092834473,
+      "learning_rate": 9.002066161822174e-06,
+      "loss": 1.1685,
+      "step": 1273
+    },
+    {
+      "batch_num_effect_tokens": 9017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.15818,
+      "grad_norm": 2.9452826976776123,
+      "learning_rate": 8.986279998371968e-06,
+      "loss": 1.3317,
+      "step": 1274
+    },
+    {
+      "batch_num_effect_tokens": 5795,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.15909,
+      "grad_norm": 4.065710067749023,
+      "learning_rate": 8.970496386960657e-06,
+      "loss": 1.2286,
+      "step": 1275
+    },
+    {
+      "batch_num_effect_tokens": 6780,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.16,
+      "grad_norm": 3.2890782356262207,
+      "learning_rate": 8.954715367323468e-06,
+      "loss": 1.2892,
+      "step": 1276
+    },
+    {
+      "batch_num_effect_tokens": 6270,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.16091,
+      "grad_norm": 3.506971597671509,
+      "learning_rate": 8.938936979189091e-06,
+      "loss": 1.1679,
+      "step": 1277
+    },
+    {
+      "batch_num_effect_tokens": 6216,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.16182,
+      "grad_norm": 3.278193950653076,
+      "learning_rate": 8.923161262279611e-06,
+      "loss": 1.1085,
+      "step": 1278
+    },
+    {
+      "batch_num_effect_tokens": 7952,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.16273,
+      "grad_norm": 3.214967966079712,
+      "learning_rate": 8.907388256310373e-06,
+      "loss": 1.2083,
+      "step": 1279
+    },
+    {
+      "batch_num_effect_tokens": 8910,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.16364,
+      "grad_norm": 2.634761095046997,
+      "learning_rate": 8.89161800098989e-06,
+      "loss": 1.1466,
+      "step": 1280
+    },
+    {
+      "batch_num_effect_tokens": 4847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.16455,
+      "grad_norm": 3.5305864810943604,
+      "learning_rate": 8.875850536019775e-06,
+      "loss": 0.782,
+      "step": 1281
+    },
+    {
+      "batch_num_effect_tokens": 7283,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.16545,
+      "grad_norm": 3.4275307655334473,
+      "learning_rate": 8.860085901094595e-06,
+      "loss": 1.2666,
+      "step": 1282
+    },
+    {
+      "batch_num_effect_tokens": 7931,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.16636,
+      "grad_norm": 3.1967883110046387,
+      "learning_rate": 8.844324135901803e-06,
+      "loss": 1.2349,
+      "step": 1283
+    },
+    {
+      "batch_num_effect_tokens": 5999,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52216,
+      "epoch": 1.16727,
+      "grad_norm": 3.1412158012390137,
+      "learning_rate": 8.828565280121619e-06,
+      "loss": 0.8491,
+      "step": 1284
+    },
+    {
+      "batch_num_effect_tokens": 4611,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.16818,
+      "grad_norm": 3.7525036334991455,
+      "learning_rate": 8.812809373426951e-06,
+      "loss": 0.9368,
+      "step": 1285
+    },
+    {
+      "batch_num_effect_tokens": 4677,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50524,
+      "epoch": 1.16909,
+      "grad_norm": 4.773053169250488,
+      "learning_rate": 8.797056455483267e-06,
+      "loss": 1.2988,
+      "step": 1286
+    },
+    {
+      "batch_num_effect_tokens": 6205,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 1.17,
+      "grad_norm": 3.1944470405578613,
+      "learning_rate": 8.781306565948528e-06,
+      "loss": 0.8771,
+      "step": 1287
+    },
+    {
+      "batch_num_effect_tokens": 5908,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.17091,
+      "grad_norm": 4.351945877075195,
+      "learning_rate": 8.765559744473054e-06,
+      "loss": 0.9883,
+      "step": 1288
+    },
+    {
+      "batch_num_effect_tokens": 7314,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.17182,
+      "grad_norm": 3.1656649112701416,
+      "learning_rate": 8.749816030699456e-06,
+      "loss": 1.2371,
+      "step": 1289
+    },
+    {
+      "batch_num_effect_tokens": 5754,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.17273,
+      "grad_norm": 4.705539226531982,
+      "learning_rate": 8.734075464262507e-06,
+      "loss": 1.6817,
+      "step": 1290
+    },
+    {
+      "batch_num_effect_tokens": 3862,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.17364,
+      "grad_norm": 3.1189210414886475,
+      "learning_rate": 8.718338084789074e-06,
+      "loss": 0.3547,
+      "step": 1291
+    },
+    {
+      "batch_num_effect_tokens": 6167,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.17455,
+      "grad_norm": 3.551257848739624,
+      "learning_rate": 8.702603931897983e-06,
+      "loss": 1.1843,
+      "step": 1292
+    },
+    {
+      "batch_num_effect_tokens": 7435,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.17545,
+      "grad_norm": 4.381896495819092,
+      "learning_rate": 8.68687304519995e-06,
+      "loss": 2.0781,
+      "step": 1293
+    },
+    {
+      "batch_num_effect_tokens": 8727,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 1.17636,
+      "grad_norm": 2.9816768169403076,
+      "learning_rate": 8.67114546429746e-06,
+      "loss": 1.2507,
+      "step": 1294
+    },
+    {
+      "batch_num_effect_tokens": 6274,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.17727,
+      "grad_norm": 3.839073657989502,
+      "learning_rate": 8.655421228784683e-06,
+      "loss": 1.6419,
+      "step": 1295
+    },
+    {
+      "batch_num_effect_tokens": 4891,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.17818,
+      "grad_norm": 2.7432196140289307,
+      "learning_rate": 8.639700378247362e-06,
+      "loss": 0.6166,
+      "step": 1296
+    },
+    {
+      "batch_num_effect_tokens": 6322,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.17909,
+      "grad_norm": 2.8955793380737305,
+      "learning_rate": 8.623982952262713e-06,
+      "loss": 0.9081,
+      "step": 1297
+    },
+    {
+      "batch_num_effect_tokens": 6776,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.18,
+      "grad_norm": 2.928248167037964,
+      "learning_rate": 8.60826899039935e-06,
+      "loss": 1.2373,
+      "step": 1298
+    },
+    {
+      "batch_num_effect_tokens": 4579,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.18091,
+      "grad_norm": 3.005248546600342,
+      "learning_rate": 8.592558532217138e-06,
+      "loss": 0.8357,
+      "step": 1299
+    },
+    {
+      "batch_num_effect_tokens": 7649,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52217,
+      "epoch": 1.18182,
+      "grad_norm": 2.8046600818634033,
+      "learning_rate": 8.576851617267151e-06,
+      "loss": 1.1489,
+      "step": 1300
+    },
+    {
+      "batch_num_effect_tokens": 4630,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52127,
+      "epoch": 1.18273,
+      "grad_norm": 3.1205475330352783,
+      "learning_rate": 8.56114828509152e-06,
+      "loss": 0.6783,
+      "step": 1301
+    },
+    {
+      "batch_num_effect_tokens": 5282,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.18364,
+      "grad_norm": 2.6284232139587402,
+      "learning_rate": 8.545448575223369e-06,
+      "loss": 0.3939,
+      "step": 1302
+    },
+    {
+      "batch_num_effect_tokens": 5344,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 1.18455,
+      "grad_norm": 2.9303133487701416,
+      "learning_rate": 8.529752527186694e-06,
+      "loss": 0.7759,
+      "step": 1303
+    },
+    {
+      "batch_num_effect_tokens": 5568,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 1.18545,
+      "grad_norm": 2.889888286590576,
+      "learning_rate": 8.514060180496285e-06,
+      "loss": 0.6437,
+      "step": 1304
+    },
+    {
+      "batch_num_effect_tokens": 5664,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.18636,
+      "grad_norm": 3.5130727291107178,
+      "learning_rate": 8.498371574657596e-06,
+      "loss": 1.0154,
+      "step": 1305
+    },
+    {
+      "batch_num_effect_tokens": 11118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 1.18727,
+      "grad_norm": 2.980041980743408,
+      "learning_rate": 8.482686749166685e-06,
+      "loss": 1.9141,
+      "step": 1306
+    },
+    {
+      "batch_num_effect_tokens": 5524,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.18818,
+      "grad_norm": 3.3898441791534424,
+      "learning_rate": 8.467005743510072e-06,
+      "loss": 0.8405,
+      "step": 1307
+    },
+    {
+      "batch_num_effect_tokens": 4948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.18909,
+      "grad_norm": 3.7010104656219482,
+      "learning_rate": 8.451328597164679e-06,
+      "loss": 1.1394,
+      "step": 1308
+    },
+    {
+      "batch_num_effect_tokens": 7511,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.19,
+      "grad_norm": 3.6661295890808105,
+      "learning_rate": 8.43565534959769e-06,
+      "loss": 1.4902,
+      "step": 1309
+    },
+    {
+      "batch_num_effect_tokens": 5624,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.19091,
+      "grad_norm": 3.152494192123413,
+      "learning_rate": 8.419986040266502e-06,
+      "loss": 1.1014,
+      "step": 1310
+    },
+    {
+      "batch_num_effect_tokens": 5909,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.19182,
+      "grad_norm": 3.238676071166992,
+      "learning_rate": 8.404320708618572e-06,
+      "loss": 0.9795,
+      "step": 1311
+    },
+    {
+      "batch_num_effect_tokens": 10814,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.19273,
+      "grad_norm": 2.8736958503723145,
+      "learning_rate": 8.388659394091362e-06,
+      "loss": 1.4199,
+      "step": 1312
+    },
+    {
+      "batch_num_effect_tokens": 7138,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.19364,
+      "grad_norm": 3.616826057434082,
+      "learning_rate": 8.373002136112204e-06,
+      "loss": 1.4153,
+      "step": 1313
+    },
+    {
+      "batch_num_effect_tokens": 5662,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.19455,
+      "grad_norm": 3.198676586151123,
+      "learning_rate": 8.357348974098232e-06,
+      "loss": 0.9395,
+      "step": 1314
+    },
+    {
+      "batch_num_effect_tokens": 5553,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.19545,
+      "grad_norm": 3.6962339878082275,
+      "learning_rate": 8.341699947456261e-06,
+      "loss": 0.896,
+      "step": 1315
+    },
+    {
+      "batch_num_effect_tokens": 6349,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.19636,
+      "grad_norm": 3.261848211288452,
+      "learning_rate": 8.326055095582694e-06,
+      "loss": 1.0207,
+      "step": 1316
+    },
+    {
+      "batch_num_effect_tokens": 7606,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.19727,
+      "grad_norm": 3.192269802093506,
+      "learning_rate": 8.310414457863437e-06,
+      "loss": 1.1751,
+      "step": 1317
+    },
+    {
+      "batch_num_effect_tokens": 8860,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52218,
+      "epoch": 1.19818,
+      "grad_norm": 3.0157923698425293,
+      "learning_rate": 8.294778073673762e-06,
+      "loss": 1.5703,
+      "step": 1318
+    },
+    {
+      "batch_num_effect_tokens": 5214,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.19909,
+      "grad_norm": 3.5552825927734375,
+      "learning_rate": 8.279145982378261e-06,
+      "loss": 1.1386,
+      "step": 1319
+    },
+    {
+      "batch_num_effect_tokens": 10259,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 1.2,
+      "grad_norm": 2.6071653366088867,
+      "learning_rate": 8.263518223330698e-06,
+      "loss": 1.3003,
+      "step": 1320
+    },
+    {
+      "batch_num_effect_tokens": 6907,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.20091,
+      "grad_norm": 3.8927736282348633,
+      "learning_rate": 8.24789483587394e-06,
+      "loss": 1.8552,
+      "step": 1321
+    },
+    {
+      "batch_num_effect_tokens": 5271,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50543,
+      "epoch": 1.20182,
+      "grad_norm": 3.377779245376587,
+      "learning_rate": 8.232275859339842e-06,
+      "loss": 1.0852,
+      "step": 1322
+    },
+    {
+      "batch_num_effect_tokens": 4004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.20273,
+      "grad_norm": 3.201169013977051,
+      "learning_rate": 8.216661333049171e-06,
+      "loss": 0.5308,
+      "step": 1323
+    },
+    {
+      "batch_num_effect_tokens": 6163,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.20364,
+      "grad_norm": 3.0209457874298096,
+      "learning_rate": 8.201051296311462e-06,
+      "loss": 0.9274,
+      "step": 1324
+    },
+    {
+      "batch_num_effect_tokens": 5098,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.20455,
+      "grad_norm": 3.0369114875793457,
+      "learning_rate": 8.185445788424975e-06,
+      "loss": 1.0532,
+      "step": 1325
+    },
+    {
+      "batch_num_effect_tokens": 9244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.20545,
+      "grad_norm": 2.9742166996002197,
+      "learning_rate": 8.169844848676553e-06,
+      "loss": 1.063,
+      "step": 1326
+    },
+    {
+      "batch_num_effect_tokens": 4571,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.20636,
+      "grad_norm": 3.4904568195343018,
+      "learning_rate": 8.154248516341547e-06,
+      "loss": 0.8149,
+      "step": 1327
+    },
+    {
+      "batch_num_effect_tokens": 8765,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.20727,
+      "grad_norm": 3.0287582874298096,
+      "learning_rate": 8.1386568306837e-06,
+      "loss": 1.2435,
+      "step": 1328
+    },
+    {
+      "batch_num_effect_tokens": 4073,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.20818,
+      "grad_norm": 3.4525487422943115,
+      "learning_rate": 8.123069830955066e-06,
+      "loss": 0.5571,
+      "step": 1329
+    },
+    {
+      "batch_num_effect_tokens": 5591,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.20909,
+      "grad_norm": 3.343996047973633,
+      "learning_rate": 8.107487556395902e-06,
+      "loss": 0.962,
+      "step": 1330
+    },
+    {
+      "batch_num_effect_tokens": 4778,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.21,
+      "grad_norm": 2.7948801517486572,
+      "learning_rate": 8.091910046234552e-06,
+      "loss": 0.6016,
+      "step": 1331
+    },
+    {
+      "batch_num_effect_tokens": 7247,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.21091,
+      "grad_norm": 3.2723145484924316,
+      "learning_rate": 8.076337339687395e-06,
+      "loss": 1.203,
+      "step": 1332
+    },
+    {
+      "batch_num_effect_tokens": 7885,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 1.21182,
+      "grad_norm": 3.3220107555389404,
+      "learning_rate": 8.06076947595869e-06,
+      "loss": 1.2771,
+      "step": 1333
+    },
+    {
+      "batch_num_effect_tokens": 6931,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 1.21273,
+      "grad_norm": 3.1205132007598877,
+      "learning_rate": 8.04520649424052e-06,
+      "loss": 0.9818,
+      "step": 1334
+    },
+    {
+      "batch_num_effect_tokens": 4684,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.21364,
+      "grad_norm": 3.3503336906433105,
+      "learning_rate": 8.029648433712671e-06,
+      "loss": 0.5986,
+      "step": 1335
+    },
+    {
+      "batch_num_effect_tokens": 4863,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.21455,
+      "grad_norm": 2.5516700744628906,
+      "learning_rate": 8.014095333542548e-06,
+      "loss": 0.5332,
+      "step": 1336
+    },
+    {
+      "batch_num_effect_tokens": 9903,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.21545,
+      "grad_norm": 2.74527907371521,
+      "learning_rate": 7.998547232885053e-06,
+      "loss": 1.4125,
+      "step": 1337
+    },
+    {
+      "batch_num_effect_tokens": 7362,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.21636,
+      "grad_norm": 2.8555214405059814,
+      "learning_rate": 7.983004170882518e-06,
+      "loss": 1.1776,
+      "step": 1338
+    },
+    {
+      "batch_num_effect_tokens": 8013,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 1.21727,
+      "grad_norm": 2.9209163188934326,
+      "learning_rate": 7.967466186664579e-06,
+      "loss": 1.3352,
+      "step": 1339
+    },
+    {
+      "batch_num_effect_tokens": 6412,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.21818,
+      "grad_norm": 2.683448076248169,
+      "learning_rate": 7.951933319348095e-06,
+      "loss": 0.8411,
+      "step": 1340
+    },
+    {
+      "batch_num_effect_tokens": 4307,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.21909,
+      "grad_norm": 2.8081939220428467,
+      "learning_rate": 7.936405608037037e-06,
+      "loss": 0.5098,
+      "step": 1341
+    },
+    {
+      "batch_num_effect_tokens": 4409,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.22,
+      "grad_norm": 3.882086992263794,
+      "learning_rate": 7.92088309182241e-06,
+      "loss": 0.8913,
+      "step": 1342
+    },
+    {
+      "batch_num_effect_tokens": 9927,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.22091,
+      "grad_norm": 2.887500762939453,
+      "learning_rate": 7.905365809782115e-06,
+      "loss": 1.3527,
+      "step": 1343
+    },
+    {
+      "batch_num_effect_tokens": 4882,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52089,
+      "epoch": 1.22182,
+      "grad_norm": 3.064622163772583,
+      "learning_rate": 7.889853800980905e-06,
+      "loss": 0.6535,
+      "step": 1344
+    },
+    {
+      "batch_num_effect_tokens": 6178,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.22273,
+      "grad_norm": 3.8330609798431396,
+      "learning_rate": 7.874347104470234e-06,
+      "loss": 1.4913,
+      "step": 1345
+    },
+    {
+      "batch_num_effect_tokens": 6663,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.22364,
+      "grad_norm": 3.099398612976074,
+      "learning_rate": 7.858845759288198e-06,
+      "loss": 1.1149,
+      "step": 1346
+    },
+    {
+      "batch_num_effect_tokens": 6186,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.22455,
+      "grad_norm": 3.0903403759002686,
+      "learning_rate": 7.843349804459412e-06,
+      "loss": 0.8618,
+      "step": 1347
+    },
+    {
+      "batch_num_effect_tokens": 6350,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.22545,
+      "grad_norm": 3.0803728103637695,
+      "learning_rate": 7.827859278994924e-06,
+      "loss": 0.9878,
+      "step": 1348
+    },
+    {
+      "batch_num_effect_tokens": 6041,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52176,
+      "epoch": 1.22636,
+      "grad_norm": 3.533050775527954,
+      "learning_rate": 7.812374221892116e-06,
+      "loss": 0.9804,
+      "step": 1349
+    },
+    {
+      "batch_num_effect_tokens": 6119,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.22727,
+      "grad_norm": 3.218393325805664,
+      "learning_rate": 7.796894672134594e-06,
+      "loss": 1.0535,
+      "step": 1350
+    },
+    {
+      "batch_num_effect_tokens": 5933,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.22818,
+      "grad_norm": 2.996877908706665,
+      "learning_rate": 7.781420668692116e-06,
+      "loss": 0.8144,
+      "step": 1351
+    },
+    {
+      "batch_num_effect_tokens": 9178,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.22909,
+      "grad_norm": 3.118384838104248,
+      "learning_rate": 7.765952250520459e-06,
+      "loss": 1.2562,
+      "step": 1352
+    },
+    {
+      "batch_num_effect_tokens": 7624,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52178,
+      "epoch": 1.23,
+      "grad_norm": 3.031911611557007,
+      "learning_rate": 7.750489456561351e-06,
+      "loss": 0.9531,
+      "step": 1353
+    },
+    {
+      "batch_num_effect_tokens": 5752,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.23091,
+      "grad_norm": 2.5690088272094727,
+      "learning_rate": 7.735032325742355e-06,
+      "loss": 0.5922,
+      "step": 1354
+    },
+    {
+      "batch_num_effect_tokens": 5533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.23182,
+      "grad_norm": 4.04586124420166,
+      "learning_rate": 7.719580896976788e-06,
+      "loss": 0.9968,
+      "step": 1355
+    },
+    {
+      "batch_num_effect_tokens": 5669,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52122,
+      "epoch": 1.23273,
+      "grad_norm": 2.989699602127075,
+      "learning_rate": 7.704135209163589e-06,
+      "loss": 0.7587,
+      "step": 1356
+    },
+    {
+      "batch_num_effect_tokens": 6241,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52186,
+      "epoch": 1.23364,
+      "grad_norm": 2.8156251907348633,
+      "learning_rate": 7.68869530118727e-06,
+      "loss": 0.839,
+      "step": 1357
+    },
+    {
+      "batch_num_effect_tokens": 5059,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.23455,
+      "grad_norm": 3.2609455585479736,
+      "learning_rate": 7.673261211917777e-06,
+      "loss": 0.9088,
+      "step": 1358
+    },
+    {
+      "batch_num_effect_tokens": 5309,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.23545,
+      "grad_norm": 3.0399086475372314,
+      "learning_rate": 7.657832980210412e-06,
+      "loss": 0.8117,
+      "step": 1359
+    },
+    {
+      "batch_num_effect_tokens": 4817,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 1.23636,
+      "grad_norm": 3.6285390853881836,
+      "learning_rate": 7.642410644905726e-06,
+      "loss": 0.9068,
+      "step": 1360
+    },
+    {
+      "batch_num_effect_tokens": 4073,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.23727,
+      "grad_norm": 2.894031524658203,
+      "learning_rate": 7.626994244829441e-06,
+      "loss": 0.3597,
+      "step": 1361
+    },
+    {
+      "batch_num_effect_tokens": 6409,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 1.23818,
+      "grad_norm": 3.356076240539551,
+      "learning_rate": 7.611583818792311e-06,
+      "loss": 1.0443,
+      "step": 1362
+    },
+    {
+      "batch_num_effect_tokens": 6840,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.23909,
+      "grad_norm": 3.375852584838867,
+      "learning_rate": 7.596179405590076e-06,
+      "loss": 1.093,
+      "step": 1363
+    },
+    {
+      "batch_num_effect_tokens": 8879,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52155,
+      "epoch": 1.24,
+      "grad_norm": 3.4897732734680176,
+      "learning_rate": 7.580781044003324e-06,
+      "loss": 1.4193,
+      "step": 1364
+    },
+    {
+      "batch_num_effect_tokens": 5389,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.24091,
+      "grad_norm": 3.4354724884033203,
+      "learning_rate": 7.565388772797412e-06,
+      "loss": 0.5507,
+      "step": 1365
+    },
+    {
+      "batch_num_effect_tokens": 7913,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 1.24182,
+      "grad_norm": 3.8646090030670166,
+      "learning_rate": 7.550002630722366e-06,
+      "loss": 1.4203,
+      "step": 1366
+    },
+    {
+      "batch_num_effect_tokens": 3872,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.24273,
+      "grad_norm": 3.718017101287842,
+      "learning_rate": 7.534622656512777e-06,
+      "loss": 0.5086,
+      "step": 1367
+    },
+    {
+      "batch_num_effect_tokens": 8348,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.24364,
+      "grad_norm": 2.4211277961730957,
+      "learning_rate": 7.519248888887715e-06,
+      "loss": 0.8911,
+      "step": 1368
+    },
+    {
+      "batch_num_effect_tokens": 6355,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.24455,
+      "grad_norm": 3.647061824798584,
+      "learning_rate": 7.503881366550617e-06,
+      "loss": 1.2617,
+      "step": 1369
+    },
+    {
+      "batch_num_effect_tokens": 5586,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.24545,
+      "grad_norm": 3.2869815826416016,
+      "learning_rate": 7.488520128189209e-06,
+      "loss": 0.9019,
+      "step": 1370
+    },
+    {
+      "batch_num_effect_tokens": 5895,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.24636,
+      "grad_norm": 2.7943344116210938,
+      "learning_rate": 7.4731652124753865e-06,
+      "loss": 0.8344,
+      "step": 1371
+    },
+    {
+      "batch_num_effect_tokens": 6089,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.24727,
+      "grad_norm": 2.809882640838623,
+      "learning_rate": 7.4578166580651335e-06,
+      "loss": 0.8133,
+      "step": 1372
+    },
+    {
+      "batch_num_effect_tokens": 4249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.24818,
+      "grad_norm": 3.436528205871582,
+      "learning_rate": 7.442474503598412e-06,
+      "loss": 0.8629,
+      "step": 1373
+    },
+    {
+      "batch_num_effect_tokens": 5331,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 1.24909,
+      "grad_norm": 3.108729362487793,
+      "learning_rate": 7.4271387876990866e-06,
+      "loss": 0.6077,
+      "step": 1374
+    },
+    {
+      "batch_num_effect_tokens": 6733,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.25,
+      "grad_norm": 2.72878098487854,
+      "learning_rate": 7.411809548974792e-06,
+      "loss": 0.8801,
+      "step": 1375
+    },
+    {
+      "batch_num_effect_tokens": 5971,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.25091,
+      "grad_norm": 2.6645631790161133,
+      "learning_rate": 7.39648682601688e-06,
+      "loss": 0.5949,
+      "step": 1376
+    },
+    {
+      "batch_num_effect_tokens": 6856,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52111,
+      "epoch": 1.25182,
+      "grad_norm": 3.8237671852111816,
+      "learning_rate": 7.381170657400281e-06,
+      "loss": 1.6248,
+      "step": 1377
+    },
+    {
+      "batch_num_effect_tokens": 8547,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 1.25273,
+      "grad_norm": 2.754514694213867,
+      "learning_rate": 7.365861081683434e-06,
+      "loss": 1.1067,
+      "step": 1378
+    },
+    {
+      "batch_num_effect_tokens": 6625,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 1.25364,
+      "grad_norm": 3.414747714996338,
+      "learning_rate": 7.350558137408174e-06,
+      "loss": 1.3872,
+      "step": 1379
+    },
+    {
+      "batch_num_effect_tokens": 8201,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.25455,
+      "grad_norm": 2.9175808429718018,
+      "learning_rate": 7.335261863099652e-06,
+      "loss": 1.0167,
+      "step": 1380
+    },
+    {
+      "batch_num_effect_tokens": 5804,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.25545,
+      "grad_norm": 3.4364821910858154,
+      "learning_rate": 7.319972297266215e-06,
+      "loss": 1.2104,
+      "step": 1381
+    },
+    {
+      "batch_num_effect_tokens": 5729,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.25636,
+      "grad_norm": 3.21177077293396,
+      "learning_rate": 7.3046894783993225e-06,
+      "loss": 0.8951,
+      "step": 1382
+    },
+    {
+      "batch_num_effect_tokens": 7137,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.25727,
+      "grad_norm": 3.1714975833892822,
+      "learning_rate": 7.289413444973461e-06,
+      "loss": 1.2,
+      "step": 1383
+    },
+    {
+      "batch_num_effect_tokens": 6818,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.25818,
+      "grad_norm": 3.7215492725372314,
+      "learning_rate": 7.274144235446024e-06,
+      "loss": 1.5959,
+      "step": 1384
+    },
+    {
+      "batch_num_effect_tokens": 5616,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.25909,
+      "grad_norm": 2.9003236293792725,
+      "learning_rate": 7.2588818882572266e-06,
+      "loss": 0.6636,
+      "step": 1385
+    },
+    {
+      "batch_num_effect_tokens": 6325,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.26,
+      "grad_norm": 3.1018054485321045,
+      "learning_rate": 7.243626441830009e-06,
+      "loss": 0.9194,
+      "step": 1386
+    },
+    {
+      "batch_num_effect_tokens": 5608,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.26091,
+      "grad_norm": 2.823843479156494,
+      "learning_rate": 7.2283779345699455e-06,
+      "loss": 0.6002,
+      "step": 1387
+    },
+    {
+      "batch_num_effect_tokens": 6813,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.26182,
+      "grad_norm": 3.2531120777130127,
+      "learning_rate": 7.213136404865124e-06,
+      "loss": 0.8652,
+      "step": 1388
+    },
+    {
+      "batch_num_effect_tokens": 7764,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.26273,
+      "grad_norm": 3.0588696002960205,
+      "learning_rate": 7.19790189108609e-06,
+      "loss": 0.9424,
+      "step": 1389
+    },
+    {
+      "batch_num_effect_tokens": 3094,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.26364,
+      "grad_norm": 1.7678723335266113,
+      "learning_rate": 7.182674431585703e-06,
+      "loss": 0.1115,
+      "step": 1390
+    },
+    {
+      "batch_num_effect_tokens": 6241,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.26455,
+      "grad_norm": 4.503663539886475,
+      "learning_rate": 7.167454064699083e-06,
+      "loss": 1.3099,
+      "step": 1391
+    },
+    {
+      "batch_num_effect_tokens": 6661,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.26545,
+      "grad_norm": 2.982997417449951,
+      "learning_rate": 7.1522408287434774e-06,
+      "loss": 0.9009,
+      "step": 1392
+    },
+    {
+      "batch_num_effect_tokens": 6690,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.26636,
+      "grad_norm": 2.942383050918579,
+      "learning_rate": 7.137034762018198e-06,
+      "loss": 0.9115,
+      "step": 1393
+    },
+    {
+      "batch_num_effect_tokens": 5537,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.26727,
+      "grad_norm": 2.937274932861328,
+      "learning_rate": 7.12183590280449e-06,
+      "loss": 0.7368,
+      "step": 1394
+    },
+    {
+      "batch_num_effect_tokens": 9544,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.26818,
+      "grad_norm": 3.7725727558135986,
+      "learning_rate": 7.106644289365474e-06,
+      "loss": 1.7717,
+      "step": 1395
+    },
+    {
+      "batch_num_effect_tokens": 7207,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 1.26909,
+      "grad_norm": 4.409503936767578,
+      "learning_rate": 7.0914599599460095e-06,
+      "loss": 2.2532,
+      "step": 1396
+    },
+    {
+      "batch_num_effect_tokens": 6919,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.27,
+      "grad_norm": 3.1001265048980713,
+      "learning_rate": 7.076282952772634e-06,
+      "loss": 1.1373,
+      "step": 1397
+    },
+    {
+      "batch_num_effect_tokens": 5948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.27091,
+      "grad_norm": 3.3615310192108154,
+      "learning_rate": 7.061113306053443e-06,
+      "loss": 1.1018,
+      "step": 1398
+    },
+    {
+      "batch_num_effect_tokens": 4353,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.27182,
+      "grad_norm": 3.1632819175720215,
+      "learning_rate": 7.045951057978001e-06,
+      "loss": 0.735,
+      "step": 1399
+    },
+    {
+      "batch_num_effect_tokens": 6004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50545,
+      "epoch": 1.27273,
+      "grad_norm": 3.4299721717834473,
+      "learning_rate": 7.0307962467172555e-06,
+      "loss": 1.1528,
+      "step": 1400
+    },
+    {
+      "batch_num_effect_tokens": 7520,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 1.27364,
+      "grad_norm": 2.9699318408966064,
+      "learning_rate": 7.015648910423416e-06,
+      "loss": 1.0898,
+      "step": 1401
+    },
+    {
+      "batch_num_effect_tokens": 6107,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.27455,
+      "grad_norm": 3.086298704147339,
+      "learning_rate": 7.0005090872298955e-06,
+      "loss": 1.0031,
+      "step": 1402
+    },
+    {
+      "batch_num_effect_tokens": 6330,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50569,
+      "epoch": 1.27545,
+      "grad_norm": 3.487344741821289,
+      "learning_rate": 6.985376815251173e-06,
+      "loss": 1.3832,
+      "step": 1403
+    },
+    {
+      "batch_num_effect_tokens": 6372,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.27636,
+      "grad_norm": 3.3115811347961426,
+      "learning_rate": 6.970252132582729e-06,
+      "loss": 1.1073,
+      "step": 1404
+    },
+    {
+      "batch_num_effect_tokens": 8173,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.27727,
+      "grad_norm": 2.6352920532226562,
+      "learning_rate": 6.955135077300932e-06,
+      "loss": 1.1538,
+      "step": 1405
+    },
+    {
+      "batch_num_effect_tokens": 6707,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.27818,
+      "grad_norm": 2.982313632965088,
+      "learning_rate": 6.940025687462952e-06,
+      "loss": 1.0654,
+      "step": 1406
+    },
+    {
+      "batch_num_effect_tokens": 6610,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.27909,
+      "grad_norm": 3.0123355388641357,
+      "learning_rate": 6.924924001106655e-06,
+      "loss": 1.1569,
+      "step": 1407
+    },
+    {
+      "batch_num_effect_tokens": 7134,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.28,
+      "grad_norm": 3.0404586791992188,
+      "learning_rate": 6.909830056250527e-06,
+      "loss": 1.1935,
+      "step": 1408
+    },
+    {
+      "batch_num_effect_tokens": 5418,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.28091,
+      "grad_norm": 4.27562141418457,
+      "learning_rate": 6.8947438908935495e-06,
+      "loss": 1.0136,
+      "step": 1409
+    },
+    {
+      "batch_num_effect_tokens": 5188,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.28182,
+      "grad_norm": 4.322559356689453,
+      "learning_rate": 6.87966554301513e-06,
+      "loss": 0.489,
+      "step": 1410
+    },
+    {
+      "batch_num_effect_tokens": 6036,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.28273,
+      "grad_norm": 3.010681390762329,
+      "learning_rate": 6.86459505057499e-06,
+      "loss": 0.954,
+      "step": 1411
+    },
+    {
+      "batch_num_effect_tokens": 8780,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 1.28364,
+      "grad_norm": 2.756701946258545,
+      "learning_rate": 6.8495324515130744e-06,
+      "loss": 1.0386,
+      "step": 1412
+    },
+    {
+      "batch_num_effect_tokens": 6354,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.28455,
+      "grad_norm": 3.2075488567352295,
+      "learning_rate": 6.8344777837494555e-06,
+      "loss": 1.1925,
+      "step": 1413
+    },
+    {
+      "batch_num_effect_tokens": 9044,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 1.28545,
+      "grad_norm": 3.3874874114990234,
+      "learning_rate": 6.819431085184251e-06,
+      "loss": 1.2673,
+      "step": 1414
+    },
+    {
+      "batch_num_effect_tokens": 6507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.28636,
+      "grad_norm": 3.164731979370117,
+      "learning_rate": 6.804392393697502e-06,
+      "loss": 1.0044,
+      "step": 1415
+    },
+    {
+      "batch_num_effect_tokens": 6407,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.28727,
+      "grad_norm": 3.7156081199645996,
+      "learning_rate": 6.789361747149092e-06,
+      "loss": 1.0976,
+      "step": 1416
+    },
+    {
+      "batch_num_effect_tokens": 6394,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52155,
+      "epoch": 1.28818,
+      "grad_norm": 3.471351385116577,
+      "learning_rate": 6.774339183378663e-06,
+      "loss": 1.1154,
+      "step": 1417
+    },
+    {
+      "batch_num_effect_tokens": 5017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.28909,
+      "grad_norm": 3.259028196334839,
+      "learning_rate": 6.7593247402054955e-06,
+      "loss": 0.8695,
+      "step": 1418
+    },
+    {
+      "batch_num_effect_tokens": 5651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.29,
+      "grad_norm": 3.9814767837524414,
+      "learning_rate": 6.744318455428436e-06,
+      "loss": 1.2782,
+      "step": 1419
+    },
+    {
+      "batch_num_effect_tokens": 11260,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.29091,
+      "grad_norm": 2.5980544090270996,
+      "learning_rate": 6.729320366825785e-06,
+      "loss": 1.1878,
+      "step": 1420
+    },
+    {
+      "batch_num_effect_tokens": 9360,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52200,
+      "epoch": 1.29182,
+      "grad_norm": 2.950162649154663,
+      "learning_rate": 6.714330512155216e-06,
+      "loss": 1.4895,
+      "step": 1421
+    },
+    {
+      "batch_num_effect_tokens": 5925,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.29273,
+      "grad_norm": 3.376674175262451,
+      "learning_rate": 6.699348929153668e-06,
+      "loss": 1.0346,
+      "step": 1422
+    },
+    {
+      "batch_num_effect_tokens": 4621,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 1.29364,
+      "grad_norm": 3.315309524536133,
+      "learning_rate": 6.684375655537263e-06,
+      "loss": 0.9401,
+      "step": 1423
+    },
+    {
+      "batch_num_effect_tokens": 5105,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.29455,
+      "grad_norm": 2.5598363876342773,
+      "learning_rate": 6.669410729001193e-06,
+      "loss": 0.5127,
+      "step": 1424
+    },
+    {
+      "batch_num_effect_tokens": 6244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.29545,
+      "grad_norm": 3.0585110187530518,
+      "learning_rate": 6.654454187219649e-06,
+      "loss": 1.0119,
+      "step": 1425
+    },
+    {
+      "batch_num_effect_tokens": 5791,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.29636,
+      "grad_norm": 3.3678364753723145,
+      "learning_rate": 6.639506067845698e-06,
+      "loss": 0.9276,
+      "step": 1426
+    },
+    {
+      "batch_num_effect_tokens": 3842,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.29727,
+      "grad_norm": 2.618781805038452,
+      "learning_rate": 6.6245664085112235e-06,
+      "loss": 0.4342,
+      "step": 1427
+    },
+    {
+      "batch_num_effect_tokens": 9795,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 1.29818,
+      "grad_norm": 3.688122034072876,
+      "learning_rate": 6.6096352468267935e-06,
+      "loss": 1.9993,
+      "step": 1428
+    },
+    {
+      "batch_num_effect_tokens": 6919,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.29909,
+      "grad_norm": 2.862316608428955,
+      "learning_rate": 6.594712620381594e-06,
+      "loss": 0.8784,
+      "step": 1429
+    },
+    {
+      "batch_num_effect_tokens": 7136,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.3,
+      "grad_norm": 3.542351484298706,
+      "learning_rate": 6.579798566743314e-06,
+      "loss": 1.274,
+      "step": 1430
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 1.30091,
+      "grad_norm": 3.241546869277954,
+      "learning_rate": 6.56489312345807e-06,
+      "loss": 1.5165,
+      "step": 1431
+    },
+    {
+      "batch_num_effect_tokens": 7921,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.30182,
+      "grad_norm": 2.9332659244537354,
+      "learning_rate": 6.549996328050296e-06,
+      "loss": 1.1232,
+      "step": 1432
+    },
+    {
+      "batch_num_effect_tokens": 7440,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52148,
+      "epoch": 1.30273,
+      "grad_norm": 2.8302924633026123,
+      "learning_rate": 6.535108218022654e-06,
+      "loss": 0.9424,
+      "step": 1433
+    },
+    {
+      "batch_num_effect_tokens": 6560,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.30364,
+      "grad_norm": 3.1806371212005615,
+      "learning_rate": 6.52022883085595e-06,
+      "loss": 1.1798,
+      "step": 1434
+    },
+    {
+      "batch_num_effect_tokens": 7709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.30455,
+      "grad_norm": 3.062058448791504,
+      "learning_rate": 6.505358204009018e-06,
+      "loss": 1.1253,
+      "step": 1435
+    },
+    {
+      "batch_num_effect_tokens": 7883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.30545,
+      "grad_norm": 2.9780805110931396,
+      "learning_rate": 6.490496374918647e-06,
+      "loss": 1.1104,
+      "step": 1436
+    },
+    {
+      "batch_num_effect_tokens": 7285,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.30636,
+      "grad_norm": 2.6328577995300293,
+      "learning_rate": 6.475643380999469e-06,
+      "loss": 0.6742,
+      "step": 1437
+    },
+    {
+      "batch_num_effect_tokens": 6278,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52179,
+      "epoch": 1.30727,
+      "grad_norm": 3.3319318294525146,
+      "learning_rate": 6.460799259643884e-06,
+      "loss": 1.0524,
+      "step": 1438
+    },
+    {
+      "batch_num_effect_tokens": 4438,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.30818,
+      "grad_norm": 3.1229591369628906,
+      "learning_rate": 6.4459640482219445e-06,
+      "loss": 0.5901,
+      "step": 1439
+    },
+    {
+      "batch_num_effect_tokens": 7578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.30909,
+      "grad_norm": 3.132690906524658,
+      "learning_rate": 6.431137784081283e-06,
+      "loss": 1.2466,
+      "step": 1440
+    },
+    {
+      "batch_num_effect_tokens": 7262,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.31,
+      "grad_norm": 2.7710444927215576,
+      "learning_rate": 6.4163205045469975e-06,
+      "loss": 0.8145,
+      "step": 1441
+    },
+    {
+      "batch_num_effect_tokens": 5168,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.31091,
+      "grad_norm": 3.572665214538574,
+      "learning_rate": 6.401512246921576e-06,
+      "loss": 1.0269,
+      "step": 1442
+    },
+    {
+      "batch_num_effect_tokens": 6323,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 1.31182,
+      "grad_norm": 2.9520695209503174,
+      "learning_rate": 6.386713048484785e-06,
+      "loss": 0.6127,
+      "step": 1443
+    },
+    {
+      "batch_num_effect_tokens": 5373,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.31273,
+      "grad_norm": 3.336017608642578,
+      "learning_rate": 6.3719229464935915e-06,
+      "loss": 0.7666,
+      "step": 1444
+    },
+    {
+      "batch_num_effect_tokens": 5281,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 1.31364,
+      "grad_norm": 3.226888418197632,
+      "learning_rate": 6.357141978182056e-06,
+      "loss": 1.0242,
+      "step": 1445
+    },
+    {
+      "batch_num_effect_tokens": 5474,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.31455,
+      "grad_norm": 3.49110746383667,
+      "learning_rate": 6.342370180761256e-06,
+      "loss": 0.931,
+      "step": 1446
+    },
+    {
+      "batch_num_effect_tokens": 7325,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50555,
+      "epoch": 1.31545,
+      "grad_norm": 3.3710925579071045,
+      "learning_rate": 6.327607591419167e-06,
+      "loss": 1.3554,
+      "step": 1447
+    },
+    {
+      "batch_num_effect_tokens": 5500,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.31636,
+      "grad_norm": 2.879106283187866,
+      "learning_rate": 6.312854247320594e-06,
+      "loss": 0.6699,
+      "step": 1448
+    },
+    {
+      "batch_num_effect_tokens": 6277,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52206,
+      "epoch": 1.31727,
+      "grad_norm": 2.4635508060455322,
+      "learning_rate": 6.2981101856070625e-06,
+      "loss": 0.5001,
+      "step": 1449
+    },
+    {
+      "batch_num_effect_tokens": 5308,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.31818,
+      "grad_norm": 5.102665424346924,
+      "learning_rate": 6.283375443396726e-06,
+      "loss": 0.7164,
+      "step": 1450
+    },
+    {
+      "batch_num_effect_tokens": 7108,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.31909,
+      "grad_norm": 3.0555477142333984,
+      "learning_rate": 6.2686500577842875e-06,
+      "loss": 1.0813,
+      "step": 1451
+    },
+    {
+      "batch_num_effect_tokens": 4383,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.32,
+      "grad_norm": 3.1150074005126953,
+      "learning_rate": 6.25393406584088e-06,
+      "loss": 0.5792,
+      "step": 1452
+    },
+    {
+      "batch_num_effect_tokens": 5781,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.32091,
+      "grad_norm": 3.3713457584381104,
+      "learning_rate": 6.239227504614004e-06,
+      "loss": 1.0913,
+      "step": 1453
+    },
+    {
+      "batch_num_effect_tokens": 6017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.32182,
+      "grad_norm": 3.0999722480773926,
+      "learning_rate": 6.224530411127403e-06,
+      "loss": 1.0289,
+      "step": 1454
+    },
+    {
+      "batch_num_effect_tokens": 5342,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52128,
+      "epoch": 1.32273,
+      "grad_norm": 3.1640305519104004,
+      "learning_rate": 6.209842822380998e-06,
+      "loss": 0.8833,
+      "step": 1455
+    },
+    {
+      "batch_num_effect_tokens": 6408,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.32364,
+      "grad_norm": 3.7910890579223633,
+      "learning_rate": 6.19516477535077e-06,
+      "loss": 1.4728,
+      "step": 1456
+    },
+    {
+      "batch_num_effect_tokens": 3911,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 1.32455,
+      "grad_norm": 3.7931058406829834,
+      "learning_rate": 6.180496306988693e-06,
+      "loss": 0.8558,
+      "step": 1457
+    },
+    {
+      "batch_num_effect_tokens": 5491,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.32545,
+      "grad_norm": 5.476532936096191,
+      "learning_rate": 6.165837454222607e-06,
+      "loss": 2.1147,
+      "step": 1458
+    },
+    {
+      "batch_num_effect_tokens": 8286,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.32636,
+      "grad_norm": 3.141921281814575,
+      "learning_rate": 6.151188253956168e-06,
+      "loss": 1.4141,
+      "step": 1459
+    },
+    {
+      "batch_num_effect_tokens": 6548,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.32727,
+      "grad_norm": 3.4099161624908447,
+      "learning_rate": 6.136548743068713e-06,
+      "loss": 1.1871,
+      "step": 1460
+    },
+    {
+      "batch_num_effect_tokens": 5579,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.32818,
+      "grad_norm": 3.310654640197754,
+      "learning_rate": 6.1219189584152e-06,
+      "loss": 0.9689,
+      "step": 1461
+    },
+    {
+      "batch_num_effect_tokens": 6619,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 1.32909,
+      "grad_norm": 3.180025100708008,
+      "learning_rate": 6.107298936826086e-06,
+      "loss": 1.0157,
+      "step": 1462
+    },
+    {
+      "batch_num_effect_tokens": 6347,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 1.33,
+      "grad_norm": 2.9227607250213623,
+      "learning_rate": 6.092688715107265e-06,
+      "loss": 0.7552,
+      "step": 1463
+    },
+    {
+      "batch_num_effect_tokens": 5925,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.33091,
+      "grad_norm": 2.8292019367218018,
+      "learning_rate": 6.078088330039945e-06,
+      "loss": 0.9346,
+      "step": 1464
+    },
+    {
+      "batch_num_effect_tokens": 4699,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.33182,
+      "grad_norm": 2.105827569961548,
+      "learning_rate": 6.063497818380587e-06,
+      "loss": 0.2637,
+      "step": 1465
+    },
+    {
+      "batch_num_effect_tokens": 6003,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.33273,
+      "grad_norm": 4.578786373138428,
+      "learning_rate": 6.0489172168607816e-06,
+      "loss": 2.0915,
+      "step": 1466
+    },
+    {
+      "batch_num_effect_tokens": 6588,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.33364,
+      "grad_norm": 2.7085275650024414,
+      "learning_rate": 6.0343465621871774e-06,
+      "loss": 0.8409,
+      "step": 1467
+    },
+    {
+      "batch_num_effect_tokens": 6068,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52166,
+      "epoch": 1.33455,
+      "grad_norm": 3.308121919631958,
+      "learning_rate": 6.019785891041381e-06,
+      "loss": 0.9872,
+      "step": 1468
+    },
+    {
+      "batch_num_effect_tokens": 8186,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50591,
+      "epoch": 1.33545,
+      "grad_norm": 3.2569260597229004,
+      "learning_rate": 6.00523524007986e-06,
+      "loss": 1.6875,
+      "step": 1469
+    },
+    {
+      "batch_num_effect_tokens": 6507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.33636,
+      "grad_norm": 3.664898633956909,
+      "learning_rate": 5.990694645933866e-06,
+      "loss": 1.6415,
+      "step": 1470
+    },
+    {
+      "batch_num_effect_tokens": 3660,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 1.33727,
+      "grad_norm": 2.9594593048095703,
+      "learning_rate": 5.9761641452093225e-06,
+      "loss": 0.5522,
+      "step": 1471
+    },
+    {
+      "batch_num_effect_tokens": 6642,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50519,
+      "epoch": 1.33818,
+      "grad_norm": 3.2117419242858887,
+      "learning_rate": 5.961643774486754e-06,
+      "loss": 1.0508,
+      "step": 1472
+    },
+    {
+      "batch_num_effect_tokens": 6623,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.33909,
+      "grad_norm": 3.1382222175598145,
+      "learning_rate": 5.947133570321171e-06,
+      "loss": 1.0885,
+      "step": 1473
+    },
+    {
+      "batch_num_effect_tokens": 3305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.34,
+      "grad_norm": 1.8875463008880615,
+      "learning_rate": 5.932633569242e-06,
+      "loss": 0.1166,
+      "step": 1474
+    },
+    {
+      "batch_num_effect_tokens": 4473,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.34091,
+      "grad_norm": 2.190162420272827,
+      "learning_rate": 5.918143807752972e-06,
+      "loss": 0.2517,
+      "step": 1475
+    },
+    {
+      "batch_num_effect_tokens": 7009,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.34182,
+      "grad_norm": 2.6532437801361084,
+      "learning_rate": 5.903664322332048e-06,
+      "loss": 0.7956,
+      "step": 1476
+    },
+    {
+      "batch_num_effect_tokens": 5419,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.34273,
+      "grad_norm": 3.5957748889923096,
+      "learning_rate": 5.8891951494313096e-06,
+      "loss": 1.27,
+      "step": 1477
+    },
+    {
+      "batch_num_effect_tokens": 7214,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 1.34364,
+      "grad_norm": 2.9003989696502686,
+      "learning_rate": 5.87473632547689e-06,
+      "loss": 1.1812,
+      "step": 1478
+    },
+    {
+      "batch_num_effect_tokens": 3851,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.34455,
+      "grad_norm": 2.9135546684265137,
+      "learning_rate": 5.860287886868855e-06,
+      "loss": 0.4439,
+      "step": 1479
+    },
+    {
+      "batch_num_effect_tokens": 7369,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.34545,
+      "grad_norm": 3.2737839221954346,
+      "learning_rate": 5.845849869981137e-06,
+      "loss": 1.3717,
+      "step": 1480
+    },
+    {
+      "batch_num_effect_tokens": 4703,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.34636,
+      "grad_norm": 3.12182879447937,
+      "learning_rate": 5.831422311161421e-06,
+      "loss": 0.639,
+      "step": 1481
+    },
+    {
+      "batch_num_effect_tokens": 10256,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.34727,
+      "grad_norm": 2.677966594696045,
+      "learning_rate": 5.8170052467310734e-06,
+      "loss": 1.0875,
+      "step": 1482
+    },
+    {
+      "batch_num_effect_tokens": 7250,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.34818,
+      "grad_norm": 3.431149482727051,
+      "learning_rate": 5.802598712985032e-06,
+      "loss": 0.986,
+      "step": 1483
+    },
+    {
+      "batch_num_effect_tokens": 7450,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52175,
+      "epoch": 1.34909,
+      "grad_norm": 3.0749874114990234,
+      "learning_rate": 5.788202746191735e-06,
+      "loss": 1.2524,
+      "step": 1484
+    },
+    {
+      "batch_num_effect_tokens": 5855,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 1.35,
+      "grad_norm": 2.78263258934021,
+      "learning_rate": 5.773817382593008e-06,
+      "loss": 0.5778,
+      "step": 1485
+    },
+    {
+      "batch_num_effect_tokens": 8007,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.35091,
+      "grad_norm": 3.691978931427002,
+      "learning_rate": 5.759442658403985e-06,
+      "loss": 1.6977,
+      "step": 1486
+    },
+    {
+      "batch_num_effect_tokens": 6896,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.35182,
+      "grad_norm": 3.4524037837982178,
+      "learning_rate": 5.7450786098130196e-06,
+      "loss": 1.0297,
+      "step": 1487
+    },
+    {
+      "batch_num_effect_tokens": 6287,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.35273,
+      "grad_norm": 2.8186326026916504,
+      "learning_rate": 5.7307252729815835e-06,
+      "loss": 0.765,
+      "step": 1488
+    },
+    {
+      "batch_num_effect_tokens": 8597,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52153,
+      "epoch": 1.35364,
+      "grad_norm": 3.6953465938568115,
+      "learning_rate": 5.716382684044191e-06,
+      "loss": 1.7102,
+      "step": 1489
+    },
+    {
+      "batch_num_effect_tokens": 7220,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.35455,
+      "grad_norm": 2.9871528148651123,
+      "learning_rate": 5.702050879108284e-06,
+      "loss": 0.9676,
+      "step": 1490
+    },
+    {
+      "batch_num_effect_tokens": 3919,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.35545,
+      "grad_norm": 4.3469109535217285,
+      "learning_rate": 5.687729894254175e-06,
+      "loss": 0.7915,
+      "step": 1491
+    },
+    {
+      "batch_num_effect_tokens": 6381,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.35636,
+      "grad_norm": 2.662632703781128,
+      "learning_rate": 5.673419765534915e-06,
+      "loss": 0.6357,
+      "step": 1492
+    },
+    {
+      "batch_num_effect_tokens": 5042,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.35727,
+      "grad_norm": 3.612797737121582,
+      "learning_rate": 5.659120528976252e-06,
+      "loss": 1.028,
+      "step": 1493
+    },
+    {
+      "batch_num_effect_tokens": 5236,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.35818,
+      "grad_norm": 3.269341230392456,
+      "learning_rate": 5.64483222057648e-06,
+      "loss": 0.6053,
+      "step": 1494
+    },
+    {
+      "batch_num_effect_tokens": 6792,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50565,
+      "epoch": 1.35909,
+      "grad_norm": 3.3669612407684326,
+      "learning_rate": 5.630554876306407e-06,
+      "loss": 1.4502,
+      "step": 1495
+    },
+    {
+      "batch_num_effect_tokens": 5406,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 1.36,
+      "grad_norm": 2.8210713863372803,
+      "learning_rate": 5.616288532109225e-06,
+      "loss": 0.6667,
+      "step": 1496
+    },
+    {
+      "batch_num_effect_tokens": 4728,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.36091,
+      "grad_norm": 4.39499568939209,
+      "learning_rate": 5.6020332239004475e-06,
+      "loss": 0.9532,
+      "step": 1497
+    },
+    {
+      "batch_num_effect_tokens": 6533,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.36182,
+      "grad_norm": 3.180879592895508,
+      "learning_rate": 5.587788987567785e-06,
+      "loss": 0.996,
+      "step": 1498
+    },
+    {
+      "batch_num_effect_tokens": 5884,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.36273,
+      "grad_norm": 3.238677740097046,
+      "learning_rate": 5.5735558589711005e-06,
+      "loss": 0.9302,
+      "step": 1499
+    },
+    {
+      "batch_num_effect_tokens": 4839,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.36364,
+      "grad_norm": 2.734421730041504,
+      "learning_rate": 5.559333873942259e-06,
+      "loss": 0.5516,
+      "step": 1500
+    },
+    {
+      "batch_num_effect_tokens": 6528,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.36455,
+      "grad_norm": 3.6113054752349854,
+      "learning_rate": 5.545123068285105e-06,
+      "loss": 1.3667,
+      "step": 1501
+    },
+    {
+      "batch_num_effect_tokens": 6232,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52131,
+      "epoch": 1.36545,
+      "grad_norm": 3.091254472732544,
+      "learning_rate": 5.5309234777753225e-06,
+      "loss": 0.9556,
+      "step": 1502
+    },
+    {
+      "batch_num_effect_tokens": 6035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.36636,
+      "grad_norm": 3.764340877532959,
+      "learning_rate": 5.516735138160356e-06,
+      "loss": 1.0304,
+      "step": 1503
+    },
+    {
+      "batch_num_effect_tokens": 7763,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.36727,
+      "grad_norm": 3.538569927215576,
+      "learning_rate": 5.502558085159344e-06,
+      "loss": 1.4674,
+      "step": 1504
+    },
+    {
+      "batch_num_effect_tokens": 6006,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.36818,
+      "grad_norm": 3.272339105606079,
+      "learning_rate": 5.488392354462996e-06,
+      "loss": 1.0269,
+      "step": 1505
+    },
+    {
+      "batch_num_effect_tokens": 7161,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52106,
+      "epoch": 1.36909,
+      "grad_norm": 3.459012985229492,
+      "learning_rate": 5.474237981733521e-06,
+      "loss": 1.4757,
+      "step": 1506
+    },
+    {
+      "batch_num_effect_tokens": 6900,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52156,
+      "epoch": 1.37,
+      "grad_norm": 2.8426942825317383,
+      "learning_rate": 5.460095002604533e-06,
+      "loss": 0.981,
+      "step": 1507
+    },
+    {
+      "batch_num_effect_tokens": 3331,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.37091,
+      "grad_norm": 3.165037155151367,
+      "learning_rate": 5.445963452680974e-06,
+      "loss": 0.3777,
+      "step": 1508
+    },
+    {
+      "batch_num_effect_tokens": 6489,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52153,
+      "epoch": 1.37182,
+      "grad_norm": 2.9159345626831055,
+      "learning_rate": 5.431843367538992e-06,
+      "loss": 0.821,
+      "step": 1509
+    },
+    {
+      "batch_num_effect_tokens": 6978,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52189,
+      "epoch": 1.37273,
+      "grad_norm": 2.7146685123443604,
+      "learning_rate": 5.417734782725896e-06,
+      "loss": 0.8035,
+      "step": 1510
+    },
+    {
+      "batch_num_effect_tokens": 7808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.37364,
+      "grad_norm": 2.930331230163574,
+      "learning_rate": 5.403637733760025e-06,
+      "loss": 1.374,
+      "step": 1511
+    },
+    {
+      "batch_num_effect_tokens": 10696,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52174,
+      "epoch": 1.37455,
+      "grad_norm": 2.759847640991211,
+      "learning_rate": 5.38955225613069e-06,
+      "loss": 1.541,
+      "step": 1512
+    },
+    {
+      "batch_num_effect_tokens": 5352,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.37545,
+      "grad_norm": 3.5427732467651367,
+      "learning_rate": 5.375478385298052e-06,
+      "loss": 0.7671,
+      "step": 1513
+    },
+    {
+      "batch_num_effect_tokens": 5525,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52214,
+      "epoch": 1.37636,
+      "grad_norm": 2.800795078277588,
+      "learning_rate": 5.361416156693075e-06,
+      "loss": 0.7603,
+      "step": 1514
+    },
+    {
+      "batch_num_effect_tokens": 4214,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.37727,
+      "grad_norm": 2.761247158050537,
+      "learning_rate": 5.347365605717394e-06,
+      "loss": 0.4067,
+      "step": 1515
+    },
+    {
+      "batch_num_effect_tokens": 6234,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.37818,
+      "grad_norm": 3.1770267486572266,
+      "learning_rate": 5.333326767743263e-06,
+      "loss": 0.7996,
+      "step": 1516
+    },
+    {
+      "batch_num_effect_tokens": 7461,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.37909,
+      "grad_norm": 3.584190607070923,
+      "learning_rate": 5.319299678113432e-06,
+      "loss": 1.1818,
+      "step": 1517
+    },
+    {
+      "batch_num_effect_tokens": 5990,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.38,
+      "grad_norm": 3.2287638187408447,
+      "learning_rate": 5.305284372141095e-06,
+      "loss": 1.0169,
+      "step": 1518
+    },
+    {
+      "batch_num_effect_tokens": 5295,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.38091,
+      "grad_norm": 2.710073471069336,
+      "learning_rate": 5.291280885109756e-06,
+      "loss": 0.6068,
+      "step": 1519
+    },
+    {
+      "batch_num_effect_tokens": 5249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.38182,
+      "grad_norm": 3.27878999710083,
+      "learning_rate": 5.277289252273175e-06,
+      "loss": 0.8559,
+      "step": 1520
+    },
+    {
+      "batch_num_effect_tokens": 6568,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.38273,
+      "grad_norm": 2.8304049968719482,
+      "learning_rate": 5.26330950885528e-06,
+      "loss": 0.9576,
+      "step": 1521
+    },
+    {
+      "batch_num_effect_tokens": 6099,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.38364,
+      "grad_norm": 3.1691155433654785,
+      "learning_rate": 5.249341690050051e-06,
+      "loss": 0.8513,
+      "step": 1522
+    },
+    {
+      "batch_num_effect_tokens": 7202,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.38455,
+      "grad_norm": 4.808316230773926,
+      "learning_rate": 5.235385831021464e-06,
+      "loss": 1.9307,
+      "step": 1523
+    },
+    {
+      "batch_num_effect_tokens": 6017,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52121,
+      "epoch": 1.38545,
+      "grad_norm": 3.3912603855133057,
+      "learning_rate": 5.221441966903371e-06,
+      "loss": 1.0597,
+      "step": 1524
+    },
+    {
+      "batch_num_effect_tokens": 4748,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.38636,
+      "grad_norm": 2.911712884902954,
+      "learning_rate": 5.207510132799436e-06,
+      "loss": 0.5862,
+      "step": 1525
+    },
+    {
+      "batch_num_effect_tokens": 5880,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52143,
+      "epoch": 1.38727,
+      "grad_norm": 3.6869122982025146,
+      "learning_rate": 5.193590363783027e-06,
+      "loss": 1.2402,
+      "step": 1526
+    },
+    {
+      "batch_num_effect_tokens": 7678,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 1.38818,
+      "grad_norm": 3.216240644454956,
+      "learning_rate": 5.179682694897159e-06,
+      "loss": 1.4263,
+      "step": 1527
+    },
+    {
+      "batch_num_effect_tokens": 6767,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.38909,
+      "grad_norm": 3.3282392024993896,
+      "learning_rate": 5.165787161154361e-06,
+      "loss": 1.1479,
+      "step": 1528
+    },
+    {
+      "batch_num_effect_tokens": 4905,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.39,
+      "grad_norm": 3.347588300704956,
+      "learning_rate": 5.151903797536631e-06,
+      "loss": 0.8476,
+      "step": 1529
+    },
+    {
+      "batch_num_effect_tokens": 7924,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.39091,
+      "grad_norm": 2.953521728515625,
+      "learning_rate": 5.138032638995315e-06,
+      "loss": 1.2351,
+      "step": 1530
+    },
+    {
+      "batch_num_effect_tokens": 6819,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.39182,
+      "grad_norm": 3.2468717098236084,
+      "learning_rate": 5.12417372045104e-06,
+      "loss": 1.2991,
+      "step": 1531
+    },
+    {
+      "batch_num_effect_tokens": 4391,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.39273,
+      "grad_norm": 3.6957287788391113,
+      "learning_rate": 5.110327076793613e-06,
+      "loss": 1.2211,
+      "step": 1532
+    },
+    {
+      "batch_num_effect_tokens": 8461,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.39364,
+      "grad_norm": 3.3061976432800293,
+      "learning_rate": 5.096492742881949e-06,
+      "loss": 1.7162,
+      "step": 1533
+    },
+    {
+      "batch_num_effect_tokens": 6552,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.39455,
+      "grad_norm": 3.1427457332611084,
+      "learning_rate": 5.082670753543961e-06,
+      "loss": 1.0838,
+      "step": 1534
+    },
+    {
+      "batch_num_effect_tokens": 5090,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.39545,
+      "grad_norm": 3.309375524520874,
+      "learning_rate": 5.0688611435764975e-06,
+      "loss": 0.7022,
+      "step": 1535
+    },
+    {
+      "batch_num_effect_tokens": 5258,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.39636,
+      "grad_norm": 3.023341655731201,
+      "learning_rate": 5.055063947745234e-06,
+      "loss": 0.8063,
+      "step": 1536
+    },
+    {
+      "batch_num_effect_tokens": 6498,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.39727,
+      "grad_norm": 3.497258424758911,
+      "learning_rate": 5.04127920078459e-06,
+      "loss": 1.424,
+      "step": 1537
+    },
+    {
+      "batch_num_effect_tokens": 6588,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.39818,
+      "grad_norm": 2.1093833446502686,
+      "learning_rate": 5.027506937397653e-06,
+      "loss": 0.5216,
+      "step": 1538
+    },
+    {
+      "batch_num_effect_tokens": 5417,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.39909,
+      "grad_norm": 3.166273593902588,
+      "learning_rate": 5.013747192256073e-06,
+      "loss": 0.9344,
+      "step": 1539
+    },
+    {
+      "batch_num_effect_tokens": 5631,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 1.4,
+      "grad_norm": 3.173602342605591,
+      "learning_rate": 5.000000000000003e-06,
+      "loss": 0.9181,
+      "step": 1540
+    },
+    {
+      "batch_num_effect_tokens": 7966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.40091,
+      "grad_norm": 2.9153196811676025,
+      "learning_rate": 4.986265395237972e-06,
+      "loss": 1.2476,
+      "step": 1541
+    },
+    {
+      "batch_num_effect_tokens": 6299,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.40182,
+      "grad_norm": 3.3794291019439697,
+      "learning_rate": 4.972543412546842e-06,
+      "loss": 1.0079,
+      "step": 1542
+    },
+    {
+      "batch_num_effect_tokens": 5583,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 1.40273,
+      "grad_norm": 3.053807497024536,
+      "learning_rate": 4.958834086471683e-06,
+      "loss": 0.8067,
+      "step": 1543
+    },
+    {
+      "batch_num_effect_tokens": 6947,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 1.40364,
+      "grad_norm": 3.5736801624298096,
+      "learning_rate": 4.945137451525707e-06,
+      "loss": 1.4208,
+      "step": 1544
+    },
+    {
+      "batch_num_effect_tokens": 7828,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52177,
+      "epoch": 1.40455,
+      "grad_norm": 2.5810935497283936,
+      "learning_rate": 4.931453542190172e-06,
+      "loss": 0.9514,
+      "step": 1545
+    },
+    {
+      "batch_num_effect_tokens": 5808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.40545,
+      "grad_norm": 2.9263129234313965,
+      "learning_rate": 4.917782392914311e-06,
+      "loss": 0.67,
+      "step": 1546
+    },
+    {
+      "batch_num_effect_tokens": 5023,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.40636,
+      "grad_norm": 3.0986263751983643,
+      "learning_rate": 4.904124038115219e-06,
+      "loss": 0.593,
+      "step": 1547
+    },
+    {
+      "batch_num_effect_tokens": 7130,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52144,
+      "epoch": 1.40727,
+      "grad_norm": 3.111344575881958,
+      "learning_rate": 4.890478512177796e-06,
+      "loss": 1.1165,
+      "step": 1548
+    },
+    {
+      "batch_num_effect_tokens": 4993,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.40818,
+      "grad_norm": 3.050793409347534,
+      "learning_rate": 4.876845849454631e-06,
+      "loss": 0.5179,
+      "step": 1549
+    },
+    {
+      "batch_num_effect_tokens": 7141,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 1.40909,
+      "grad_norm": 3.0416104793548584,
+      "learning_rate": 4.863226084265939e-06,
+      "loss": 1.0796,
+      "step": 1550
+    },
+    {
+      "batch_num_effect_tokens": 3840,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.41,
+      "grad_norm": 3.3208167552948,
+      "learning_rate": 4.849619250899458e-06,
+      "loss": 0.6583,
+      "step": 1551
+    },
+    {
+      "batch_num_effect_tokens": 5326,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.41091,
+      "grad_norm": 3.120213508605957,
+      "learning_rate": 4.836025383610382e-06,
+      "loss": 1.0188,
+      "step": 1552
+    },
+    {
+      "batch_num_effect_tokens": 6750,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.41182,
+      "grad_norm": 3.2883312702178955,
+      "learning_rate": 4.822444516621252e-06,
+      "loss": 1.1825,
+      "step": 1553
+    },
+    {
+      "batch_num_effect_tokens": 5966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.41273,
+      "grad_norm": 3.5330169200897217,
+      "learning_rate": 4.808876684121882e-06,
+      "loss": 0.7584,
+      "step": 1554
+    },
+    {
+      "batch_num_effect_tokens": 5559,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.41364,
+      "grad_norm": 3.6764943599700928,
+      "learning_rate": 4.795321920269279e-06,
+      "loss": 1.0578,
+      "step": 1555
+    },
+    {
+      "batch_num_effect_tokens": 8163,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52154,
+      "epoch": 1.41455,
+      "grad_norm": 3.582327127456665,
+      "learning_rate": 4.781780259187543e-06,
+      "loss": 1.6447,
+      "step": 1556
+    },
+    {
+      "batch_num_effect_tokens": 7129,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.41545,
+      "grad_norm": 3.57389497756958,
+      "learning_rate": 4.7682517349677895e-06,
+      "loss": 1.5303,
+      "step": 1557
+    },
+    {
+      "batch_num_effect_tokens": 4651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.41636,
+      "grad_norm": 3.384352922439575,
+      "learning_rate": 4.754736381668057e-06,
+      "loss": 0.7794,
+      "step": 1558
+    },
+    {
+      "batch_num_effect_tokens": 5713,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.41727,
+      "grad_norm": 4.208513259887695,
+      "learning_rate": 4.741234233313241e-06,
+      "loss": 1.4527,
+      "step": 1559
+    },
+    {
+      "batch_num_effect_tokens": 6934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52218,
+      "epoch": 1.41818,
+      "grad_norm": 2.540071725845337,
+      "learning_rate": 4.727745323894976e-06,
+      "loss": 0.7746,
+      "step": 1560
+    },
+    {
+      "batch_num_effect_tokens": 5160,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.41909,
+      "grad_norm": 3.0919241905212402,
+      "learning_rate": 4.714269687371581e-06,
+      "loss": 0.8692,
+      "step": 1561
+    },
+    {
+      "batch_num_effect_tokens": 6866,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.42,
+      "grad_norm": 3.1210877895355225,
+      "learning_rate": 4.700807357667953e-06,
+      "loss": 1.0562,
+      "step": 1562
+    },
+    {
+      "batch_num_effect_tokens": 7396,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52102,
+      "epoch": 1.42091,
+      "grad_norm": 3.5360233783721924,
+      "learning_rate": 4.68735836867549e-06,
+      "loss": 1.5056,
+      "step": 1563
+    },
+    {
+      "batch_num_effect_tokens": 8045,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.42182,
+      "grad_norm": 3.2280681133270264,
+      "learning_rate": 4.673922754252001e-06,
+      "loss": 1.0675,
+      "step": 1564
+    },
+    {
+      "batch_num_effect_tokens": 6934,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 1.42273,
+      "grad_norm": 3.0071732997894287,
+      "learning_rate": 4.66050054822164e-06,
+      "loss": 1.0185,
+      "step": 1565
+    },
+    {
+      "batch_num_effect_tokens": 4468,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52136,
+      "epoch": 1.42364,
+      "grad_norm": 2.845442056655884,
+      "learning_rate": 4.647091784374786e-06,
+      "loss": 0.6274,
+      "step": 1566
+    },
+    {
+      "batch_num_effect_tokens": 5764,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.42455,
+      "grad_norm": 4.495404243469238,
+      "learning_rate": 4.633696496467991e-06,
+      "loss": 1.1627,
+      "step": 1567
+    },
+    {
+      "batch_num_effect_tokens": 7567,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.42545,
+      "grad_norm": 2.7251222133636475,
+      "learning_rate": 4.620314718223876e-06,
+      "loss": 1.0659,
+      "step": 1568
+    },
+    {
+      "batch_num_effect_tokens": 7245,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.42636,
+      "grad_norm": 3.165076494216919,
+      "learning_rate": 4.606946483331049e-06,
+      "loss": 1.1725,
+      "step": 1569
+    },
+    {
+      "batch_num_effect_tokens": 5553,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.42727,
+      "grad_norm": 3.2784016132354736,
+      "learning_rate": 4.593591825444028e-06,
+      "loss": 0.9274,
+      "step": 1570
+    },
+    {
+      "batch_num_effect_tokens": 4321,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.42818,
+      "grad_norm": 3.000758647918701,
+      "learning_rate": 4.580250778183143e-06,
+      "loss": 0.5813,
+      "step": 1571
+    },
+    {
+      "batch_num_effect_tokens": 5569,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.42909,
+      "grad_norm": 3.104440689086914,
+      "learning_rate": 4.5669233751344725e-06,
+      "loss": 0.8375,
+      "step": 1572
+    },
+    {
+      "batch_num_effect_tokens": 5609,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.43,
+      "grad_norm": 3.22114634513855,
+      "learning_rate": 4.5536096498497295e-06,
+      "loss": 0.8254,
+      "step": 1573
+    },
+    {
+      "batch_num_effect_tokens": 9521,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.43091,
+      "grad_norm": 2.0068347454071045,
+      "learning_rate": 4.5403096358462095e-06,
+      "loss": 0.557,
+      "step": 1574
+    },
+    {
+      "batch_num_effect_tokens": 7954,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.43182,
+      "grad_norm": 2.7655041217803955,
+      "learning_rate": 4.527023366606678e-06,
+      "loss": 1.0156,
+      "step": 1575
+    },
+    {
+      "batch_num_effect_tokens": 7397,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52146,
+      "epoch": 1.43273,
+      "grad_norm": 3.338231325149536,
+      "learning_rate": 4.513750875579303e-06,
+      "loss": 1.3523,
+      "step": 1576
+    },
+    {
+      "batch_num_effect_tokens": 7074,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.43364,
+      "grad_norm": 3.9786431789398193,
+      "learning_rate": 4.500492196177561e-06,
+      "loss": 1.6917,
+      "step": 1577
+    },
+    {
+      "batch_num_effect_tokens": 4802,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.43455,
+      "grad_norm": 3.6411070823669434,
+      "learning_rate": 4.487247361780169e-06,
+      "loss": 0.9016,
+      "step": 1578
+    },
+    {
+      "batch_num_effect_tokens": 9939,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.43545,
+      "grad_norm": 2.6721506118774414,
+      "learning_rate": 4.474016405730973e-06,
+      "loss": 1.2097,
+      "step": 1579
+    },
+    {
+      "batch_num_effect_tokens": 7528,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.43636,
+      "grad_norm": 2.798081874847412,
+      "learning_rate": 4.460799361338898e-06,
+      "loss": 0.9998,
+      "step": 1580
+    },
+    {
+      "batch_num_effect_tokens": 4006,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.43727,
+      "grad_norm": 2.9585371017456055,
+      "learning_rate": 4.447596261877832e-06,
+      "loss": 0.4894,
+      "step": 1581
+    },
+    {
+      "batch_num_effect_tokens": 4622,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.43818,
+      "grad_norm": 3.357067108154297,
+      "learning_rate": 4.4344071405865656e-06,
+      "loss": 0.709,
+      "step": 1582
+    },
+    {
+      "batch_num_effect_tokens": 5982,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.43909,
+      "grad_norm": 3.168221950531006,
+      "learning_rate": 4.421232030668688e-06,
+      "loss": 0.7801,
+      "step": 1583
+    },
+    {
+      "batch_num_effect_tokens": 5118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.44,
+      "grad_norm": 3.276728630065918,
+      "learning_rate": 4.408070965292534e-06,
+      "loss": 0.679,
+      "step": 1584
+    },
+    {
+      "batch_num_effect_tokens": 4891,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.44091,
+      "grad_norm": 3.0199480056762695,
+      "learning_rate": 4.394923977591059e-06,
+      "loss": 0.6886,
+      "step": 1585
+    },
+    {
+      "batch_num_effect_tokens": 5212,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.44182,
+      "grad_norm": 3.7399933338165283,
+      "learning_rate": 4.381791100661798e-06,
+      "loss": 0.8719,
+      "step": 1586
+    },
+    {
+      "batch_num_effect_tokens": 7335,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52177,
+      "epoch": 1.44273,
+      "grad_norm": 2.840649366378784,
+      "learning_rate": 4.368672367566751e-06,
+      "loss": 0.9722,
+      "step": 1587
+    },
+    {
+      "batch_num_effect_tokens": 7201,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52144,
+      "epoch": 1.44364,
+      "grad_norm": 2.9049928188323975,
+      "learning_rate": 4.355567811332311e-06,
+      "loss": 1.0122,
+      "step": 1588
+    },
+    {
+      "batch_num_effect_tokens": 5408,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.44455,
+      "grad_norm": 3.15315842628479,
+      "learning_rate": 4.342477464949182e-06,
+      "loss": 0.782,
+      "step": 1589
+    },
+    {
+      "batch_num_effect_tokens": 6864,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52218,
+      "epoch": 1.44545,
+      "grad_norm": 4.1640167236328125,
+      "learning_rate": 4.3294013613722944e-06,
+      "loss": 1.3601,
+      "step": 1590
+    },
+    {
+      "batch_num_effect_tokens": 4136,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.44636,
+      "grad_norm": 3.5017006397247314,
+      "learning_rate": 4.316339533520727e-06,
+      "loss": 0.7745,
+      "step": 1591
+    },
+    {
+      "batch_num_effect_tokens": 5319,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.44727,
+      "grad_norm": 3.5453526973724365,
+      "learning_rate": 4.3032920142776125e-06,
+      "loss": 0.9368,
+      "step": 1592
+    },
+    {
+      "batch_num_effect_tokens": 7886,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.44818,
+      "grad_norm": 2.3222293853759766,
+      "learning_rate": 4.29025883649007e-06,
+      "loss": 0.5264,
+      "step": 1593
+    },
+    {
+      "batch_num_effect_tokens": 5225,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.44909,
+      "grad_norm": 4.376035690307617,
+      "learning_rate": 4.2772400329691055e-06,
+      "loss": 1.4816,
+      "step": 1594
+    },
+    {
+      "batch_num_effect_tokens": 6460,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 1.45,
+      "grad_norm": 2.6237964630126953,
+      "learning_rate": 4.264235636489542e-06,
+      "loss": 0.6,
+      "step": 1595
+    },
+    {
+      "batch_num_effect_tokens": 6284,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52205,
+      "epoch": 1.45091,
+      "grad_norm": 3.014374256134033,
+      "learning_rate": 4.251245679789928e-06,
+      "loss": 0.9595,
+      "step": 1596
+    },
+    {
+      "batch_num_effect_tokens": 5966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52129,
+      "epoch": 1.45182,
+      "grad_norm": 3.3844950199127197,
+      "learning_rate": 4.2382701955724724e-06,
+      "loss": 0.9574,
+      "step": 1597
+    },
+    {
+      "batch_num_effect_tokens": 6036,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.45273,
+      "grad_norm": 3.3551244735717773,
+      "learning_rate": 4.225309216502933e-06,
+      "loss": 1.1822,
+      "step": 1598
+    },
+    {
+      "batch_num_effect_tokens": 6962,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 1.45364,
+      "grad_norm": 3.1049938201904297,
+      "learning_rate": 4.212362775210566e-06,
+      "loss": 1.0809,
+      "step": 1599
+    },
+    {
+      "batch_num_effect_tokens": 5748,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.45455,
+      "grad_norm": 3.115337610244751,
+      "learning_rate": 4.19943090428802e-06,
+      "loss": 0.7755,
+      "step": 1600
+    },
+    {
+      "batch_num_effect_tokens": 7236,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 1.45545,
+      "grad_norm": 3.6128201484680176,
+      "learning_rate": 4.186513636291263e-06,
+      "loss": 0.9413,
+      "step": 1601
+    },
+    {
+      "batch_num_effect_tokens": 6160,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.45636,
+      "grad_norm": 3.142624855041504,
+      "learning_rate": 4.173611003739498e-06,
+      "loss": 0.7049,
+      "step": 1602
+    },
+    {
+      "batch_num_effect_tokens": 4262,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.45727,
+      "grad_norm": 2.6013548374176025,
+      "learning_rate": 4.160723039115096e-06,
+      "loss": 0.4307,
+      "step": 1603
+    },
+    {
+      "batch_num_effect_tokens": 6962,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.45818,
+      "grad_norm": 3.305237054824829,
+      "learning_rate": 4.147849774863488e-06,
+      "loss": 1.2904,
+      "step": 1604
+    },
+    {
+      "batch_num_effect_tokens": 6581,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.45909,
+      "grad_norm": 3.872040033340454,
+      "learning_rate": 4.134991243393097e-06,
+      "loss": 1.4661,
+      "step": 1605
+    },
+    {
+      "batch_num_effect_tokens": 9770,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.46,
+      "grad_norm": 3.0166680812835693,
+      "learning_rate": 4.12214747707527e-06,
+      "loss": 1.6123,
+      "step": 1606
+    },
+    {
+      "batch_num_effect_tokens": 5821,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.46091,
+      "grad_norm": 2.6516101360321045,
+      "learning_rate": 4.109318508244168e-06,
+      "loss": 0.6149,
+      "step": 1607
+    },
+    {
+      "batch_num_effect_tokens": 6028,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.46182,
+      "grad_norm": 3.194568395614624,
+      "learning_rate": 4.0965043691967045e-06,
+      "loss": 1.0061,
+      "step": 1608
+    },
+    {
+      "batch_num_effect_tokens": 7292,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.46273,
+      "grad_norm": 2.660567045211792,
+      "learning_rate": 4.083705092192457e-06,
+      "loss": 0.8018,
+      "step": 1609
+    },
+    {
+      "batch_num_effect_tokens": 9606,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.46364,
+      "grad_norm": 3.1425766944885254,
+      "learning_rate": 4.070920709453597e-06,
+      "loss": 1.5588,
+      "step": 1610
+    },
+    {
+      "batch_num_effect_tokens": 7163,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.46455,
+      "grad_norm": 3.0139341354370117,
+      "learning_rate": 4.058151253164786e-06,
+      "loss": 1.0616,
+      "step": 1611
+    },
+    {
+      "batch_num_effect_tokens": 4896,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.46545,
+      "grad_norm": 3.9788472652435303,
+      "learning_rate": 4.045396755473121e-06,
+      "loss": 1.3641,
+      "step": 1612
+    },
+    {
+      "batch_num_effect_tokens": 5555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.46636,
+      "grad_norm": 3.3677539825439453,
+      "learning_rate": 4.032657248488031e-06,
+      "loss": 1.1608,
+      "step": 1613
+    },
+    {
+      "batch_num_effect_tokens": 11051,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.46727,
+      "grad_norm": 2.7463760375976562,
+      "learning_rate": 4.019932764281212e-06,
+      "loss": 1.6416,
+      "step": 1614
+    },
+    {
+      "batch_num_effect_tokens": 4681,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.46818,
+      "grad_norm": 3.089949369430542,
+      "learning_rate": 4.007223334886531e-06,
+      "loss": 0.6782,
+      "step": 1615
+    },
+    {
+      "batch_num_effect_tokens": 6030,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.46909,
+      "grad_norm": 2.998955249786377,
+      "learning_rate": 3.9945289922999705e-06,
+      "loss": 0.9849,
+      "step": 1616
+    },
+    {
+      "batch_num_effect_tokens": 5378,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.47,
+      "grad_norm": 2.4241385459899902,
+      "learning_rate": 3.981849768479516e-06,
+      "loss": 0.3628,
+      "step": 1617
+    },
+    {
+      "batch_num_effect_tokens": 6297,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.47091,
+      "grad_norm": 3.1372172832489014,
+      "learning_rate": 3.9691856953451044e-06,
+      "loss": 1.115,
+      "step": 1618
+    },
+    {
+      "batch_num_effect_tokens": 6312,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.47182,
+      "grad_norm": 3.268031120300293,
+      "learning_rate": 3.956536804778523e-06,
+      "loss": 1.3478,
+      "step": 1619
+    },
+    {
+      "batch_num_effect_tokens": 5785,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 1.47273,
+      "grad_norm": 3.2121145725250244,
+      "learning_rate": 3.943903128623336e-06,
+      "loss": 1.0974,
+      "step": 1620
+    },
+    {
+      "batch_num_effect_tokens": 5960,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52154,
+      "epoch": 1.47364,
+      "grad_norm": 3.1616177558898926,
+      "learning_rate": 3.931284698684809e-06,
+      "loss": 0.8974,
+      "step": 1621
+    },
+    {
+      "batch_num_effect_tokens": 4574,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.47455,
+      "grad_norm": 3.1711459159851074,
+      "learning_rate": 3.918681546729822e-06,
+      "loss": 0.718,
+      "step": 1622
+    },
+    {
+      "batch_num_effect_tokens": 8323,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.47545,
+      "grad_norm": 2.6506152153015137,
+      "learning_rate": 3.906093704486802e-06,
+      "loss": 1.1311,
+      "step": 1623
+    },
+    {
+      "batch_num_effect_tokens": 7577,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52153,
+      "epoch": 1.47636,
+      "grad_norm": 3.0118792057037354,
+      "learning_rate": 3.893521203645618e-06,
+      "loss": 0.8202,
+      "step": 1624
+    },
+    {
+      "batch_num_effect_tokens": 5825,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.47727,
+      "grad_norm": 2.8670742511749268,
+      "learning_rate": 3.880964075857535e-06,
+      "loss": 0.9281,
+      "step": 1625
+    },
+    {
+      "batch_num_effect_tokens": 5137,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.47818,
+      "grad_norm": 3.110680341720581,
+      "learning_rate": 3.8684223527351025e-06,
+      "loss": 0.6674,
+      "step": 1626
+    },
+    {
+      "batch_num_effect_tokens": 6837,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 1.47909,
+      "grad_norm": 3.045212745666504,
+      "learning_rate": 3.855896065852094e-06,
+      "loss": 0.9355,
+      "step": 1627
+    },
+    {
+      "batch_num_effect_tokens": 5605,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.48,
+      "grad_norm": 3.3067891597747803,
+      "learning_rate": 3.8433852467434175e-06,
+      "loss": 1.0483,
+      "step": 1628
+    },
+    {
+      "batch_num_effect_tokens": 5249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.48091,
+      "grad_norm": 3.023493528366089,
+      "learning_rate": 3.830889926905054e-06,
+      "loss": 0.866,
+      "step": 1629
+    },
+    {
+      "batch_num_effect_tokens": 4265,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.48182,
+      "grad_norm": 2.6965324878692627,
+      "learning_rate": 3.818410137793947e-06,
+      "loss": 0.5241,
+      "step": 1630
+    },
+    {
+      "batch_num_effect_tokens": 6096,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.48273,
+      "grad_norm": 3.0876448154449463,
+      "learning_rate": 3.8059459108279596e-06,
+      "loss": 0.7893,
+      "step": 1631
+    },
+    {
+      "batch_num_effect_tokens": 5400,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.48364,
+      "grad_norm": 3.066556692123413,
+      "learning_rate": 3.7934972773857637e-06,
+      "loss": 0.5504,
+      "step": 1632
+    },
+    {
+      "batch_num_effect_tokens": 5944,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.48455,
+      "grad_norm": 3.781280517578125,
+      "learning_rate": 3.78106426880678e-06,
+      "loss": 1.4937,
+      "step": 1633
+    },
+    {
+      "batch_num_effect_tokens": 5206,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.48545,
+      "grad_norm": 3.534966230392456,
+      "learning_rate": 3.768646916391089e-06,
+      "loss": 0.8883,
+      "step": 1634
+    },
+    {
+      "batch_num_effect_tokens": 6922,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.48636,
+      "grad_norm": 2.704559087753296,
+      "learning_rate": 3.7562452513993676e-06,
+      "loss": 0.6444,
+      "step": 1635
+    },
+    {
+      "batch_num_effect_tokens": 9185,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.48727,
+      "grad_norm": 3.4860382080078125,
+      "learning_rate": 3.743859305052785e-06,
+      "loss": 1.5957,
+      "step": 1636
+    },
+    {
+      "batch_num_effect_tokens": 7085,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.48818,
+      "grad_norm": 3.459604024887085,
+      "learning_rate": 3.731489108532954e-06,
+      "loss": 1.4537,
+      "step": 1637
+    },
+    {
+      "batch_num_effect_tokens": 8872,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.48909,
+      "grad_norm": 2.9613797664642334,
+      "learning_rate": 3.719134692981826e-06,
+      "loss": 1.211,
+      "step": 1638
+    },
+    {
+      "batch_num_effect_tokens": 4038,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52108,
+      "epoch": 1.49,
+      "grad_norm": 3.5438663959503174,
+      "learning_rate": 3.7067960895016277e-06,
+      "loss": 0.7451,
+      "step": 1639
+    },
+    {
+      "batch_num_effect_tokens": 8030,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.49091,
+      "grad_norm": 2.684650421142578,
+      "learning_rate": 3.6944733291547784e-06,
+      "loss": 0.907,
+      "step": 1640
+    },
+    {
+      "batch_num_effect_tokens": 7190,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.49182,
+      "grad_norm": 2.9121947288513184,
+      "learning_rate": 3.6821664429638093e-06,
+      "loss": 0.9502,
+      "step": 1641
+    },
+    {
+      "batch_num_effect_tokens": 7233,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.49273,
+      "grad_norm": 3.3889174461364746,
+      "learning_rate": 3.6698754619112974e-06,
+      "loss": 1.2518,
+      "step": 1642
+    },
+    {
+      "batch_num_effect_tokens": 8993,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.49364,
+      "grad_norm": 2.7589786052703857,
+      "learning_rate": 3.6576004169397684e-06,
+      "loss": 1.1827,
+      "step": 1643
+    },
+    {
+      "batch_num_effect_tokens": 7291,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52193,
+      "epoch": 1.49455,
+      "grad_norm": 3.014965772628784,
+      "learning_rate": 3.645341338951639e-06,
+      "loss": 1.1923,
+      "step": 1644
+    },
+    {
+      "batch_num_effect_tokens": 6077,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.49545,
+      "grad_norm": 2.734469413757324,
+      "learning_rate": 3.633098258809119e-06,
+      "loss": 0.747,
+      "step": 1645
+    },
+    {
+      "batch_num_effect_tokens": 7428,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 1.49636,
+      "grad_norm": 2.967613458633423,
+      "learning_rate": 3.62087120733415e-06,
+      "loss": 1.0398,
+      "step": 1646
+    },
+    {
+      "batch_num_effect_tokens": 6955,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.49727,
+      "grad_norm": 3.0190069675445557,
+      "learning_rate": 3.608660215308315e-06,
+      "loss": 1.2252,
+      "step": 1647
+    },
+    {
+      "batch_num_effect_tokens": 4942,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52127,
+      "epoch": 1.49818,
+      "grad_norm": 3.297241449356079,
+      "learning_rate": 3.596465313472778e-06,
+      "loss": 0.8813,
+      "step": 1648
+    },
+    {
+      "batch_num_effect_tokens": 7227,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52142,
+      "epoch": 1.49909,
+      "grad_norm": 2.968118906021118,
+      "learning_rate": 3.584286532528184e-06,
+      "loss": 1.215,
+      "step": 1649
+    },
+    {
+      "batch_num_effect_tokens": 7004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52115,
+      "epoch": 1.5,
+      "grad_norm": 2.9560141563415527,
+      "learning_rate": 3.5721239031346067e-06,
+      "loss": 1.0096,
+      "step": 1650
+    },
+    {
+      "batch_num_effect_tokens": 6649,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.50091,
+      "grad_norm": 2.98976731300354,
+      "learning_rate": 3.5599774559114475e-06,
+      "loss": 0.8184,
+      "step": 1651
+    },
+    {
+      "batch_num_effect_tokens": 7209,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 1.50182,
+      "grad_norm": 3.2497808933258057,
+      "learning_rate": 3.5478472214373716e-06,
+      "loss": 1.3472,
+      "step": 1652
+    },
+    {
+      "batch_num_effect_tokens": 6625,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.50273,
+      "grad_norm": 2.750164031982422,
+      "learning_rate": 3.535733230250228e-06,
+      "loss": 0.8109,
+      "step": 1653
+    },
+    {
+      "batch_num_effect_tokens": 4242,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 1.50364,
+      "grad_norm": 3.154306650161743,
+      "learning_rate": 3.5236355128469814e-06,
+      "loss": 0.4982,
+      "step": 1654
+    },
+    {
+      "batch_num_effect_tokens": 6324,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.50455,
+      "grad_norm": 3.2036898136138916,
+      "learning_rate": 3.5115540996836174e-06,
+      "loss": 1.0942,
+      "step": 1655
+    },
+    {
+      "batch_num_effect_tokens": 6331,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.50545,
+      "grad_norm": 3.789130210876465,
+      "learning_rate": 3.4994890211750754e-06,
+      "loss": 1.0987,
+      "step": 1656
+    },
+    {
+      "batch_num_effect_tokens": 7787,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.50636,
+      "grad_norm": 2.964521884918213,
+      "learning_rate": 3.4874403076951833e-06,
+      "loss": 1.1047,
+      "step": 1657
+    },
+    {
+      "batch_num_effect_tokens": 8055,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.50727,
+      "grad_norm": 4.526617050170898,
+      "learning_rate": 3.4754079895765604e-06,
+      "loss": 2.2207,
+      "step": 1658
+    },
+    {
+      "batch_num_effect_tokens": 8868,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 1.50818,
+      "grad_norm": 3.2682905197143555,
+      "learning_rate": 3.4633920971105515e-06,
+      "loss": 1.3323,
+      "step": 1659
+    },
+    {
+      "batch_num_effect_tokens": 8718,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.50909,
+      "grad_norm": 3.7098581790924072,
+      "learning_rate": 3.4513926605471504e-06,
+      "loss": 1.4918,
+      "step": 1660
+    },
+    {
+      "batch_num_effect_tokens": 5298,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 1.51,
+      "grad_norm": 2.9554543495178223,
+      "learning_rate": 3.4394097100949286e-06,
+      "loss": 0.6862,
+      "step": 1661
+    },
+    {
+      "batch_num_effect_tokens": 4945,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.51091,
+      "grad_norm": 3.075629949569702,
+      "learning_rate": 3.4274432759209454e-06,
+      "loss": 0.7103,
+      "step": 1662
+    },
+    {
+      "batch_num_effect_tokens": 5759,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.51182,
+      "grad_norm": 3.055271625518799,
+      "learning_rate": 3.415493388150689e-06,
+      "loss": 0.6997,
+      "step": 1663
+    },
+    {
+      "batch_num_effect_tokens": 7229,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 1.51273,
+      "grad_norm": 3.065497875213623,
+      "learning_rate": 3.4035600768679855e-06,
+      "loss": 1.0441,
+      "step": 1664
+    },
+    {
+      "batch_num_effect_tokens": 7170,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.51364,
+      "grad_norm": 2.845918655395508,
+      "learning_rate": 3.3916433721149323e-06,
+      "loss": 1.1487,
+      "step": 1665
+    },
+    {
+      "batch_num_effect_tokens": 5252,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.51455,
+      "grad_norm": 3.0218636989593506,
+      "learning_rate": 3.379743303891815e-06,
+      "loss": 0.6869,
+      "step": 1666
+    },
+    {
+      "batch_num_effect_tokens": 6367,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.51545,
+      "grad_norm": 3.1631014347076416,
+      "learning_rate": 3.367859902157048e-06,
+      "loss": 0.9597,
+      "step": 1667
+    },
+    {
+      "batch_num_effect_tokens": 5526,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.51636,
+      "grad_norm": 2.9969050884246826,
+      "learning_rate": 3.355993196827075e-06,
+      "loss": 0.8599,
+      "step": 1668
+    },
+    {
+      "batch_num_effect_tokens": 6648,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52218,
+      "epoch": 1.51727,
+      "grad_norm": 3.920488119125366,
+      "learning_rate": 3.344143217776319e-06,
+      "loss": 1.7936,
+      "step": 1669
+    },
+    {
+      "batch_num_effect_tokens": 7578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.51818,
+      "grad_norm": 2.879525899887085,
+      "learning_rate": 3.3323099948370853e-06,
+      "loss": 1.1958,
+      "step": 1670
+    },
+    {
+      "batch_num_effect_tokens": 4766,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.51909,
+      "grad_norm": 3.3454763889312744,
+      "learning_rate": 3.3204935577994967e-06,
+      "loss": 1.0473,
+      "step": 1671
+    },
+    {
+      "batch_num_effect_tokens": 5247,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 1.52,
+      "grad_norm": 3.61670184135437,
+      "learning_rate": 3.308693936411421e-06,
+      "loss": 1.1154,
+      "step": 1672
+    },
+    {
+      "batch_num_effect_tokens": 10026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.52091,
+      "grad_norm": 2.697474241256714,
+      "learning_rate": 3.296911160378388e-06,
+      "loss": 1.3497,
+      "step": 1673
+    },
+    {
+      "batch_num_effect_tokens": 6802,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.52182,
+      "grad_norm": 2.902420997619629,
+      "learning_rate": 3.2851452593635267e-06,
+      "loss": 0.9251,
+      "step": 1674
+    },
+    {
+      "batch_num_effect_tokens": 5791,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.52273,
+      "grad_norm": 2.8669610023498535,
+      "learning_rate": 3.273396262987475e-06,
+      "loss": 0.5728,
+      "step": 1675
+    },
+    {
+      "batch_num_effect_tokens": 7332,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.52364,
+      "grad_norm": 2.9916625022888184,
+      "learning_rate": 3.2616642008283218e-06,
+      "loss": 1.1941,
+      "step": 1676
+    },
+    {
+      "batch_num_effect_tokens": 5752,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.52455,
+      "grad_norm": 2.614650011062622,
+      "learning_rate": 3.249949102421518e-06,
+      "loss": 0.6,
+      "step": 1677
+    },
+    {
+      "batch_num_effect_tokens": 7273,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.52545,
+      "grad_norm": 2.79362416267395,
+      "learning_rate": 3.2382509972598087e-06,
+      "loss": 0.9564,
+      "step": 1678
+    },
+    {
+      "batch_num_effect_tokens": 7268,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 1.52636,
+      "grad_norm": 3.24861741065979,
+      "learning_rate": 3.2265699147931562e-06,
+      "loss": 1.3387,
+      "step": 1679
+    },
+    {
+      "batch_num_effect_tokens": 5726,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.52727,
+      "grad_norm": 2.863703966140747,
+      "learning_rate": 3.2149058844286796e-06,
+      "loss": 0.7691,
+      "step": 1680
+    },
+    {
+      "batch_num_effect_tokens": 5880,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.52818,
+      "grad_norm": 2.8440418243408203,
+      "learning_rate": 3.2032589355305544e-06,
+      "loss": 0.7248,
+      "step": 1681
+    },
+    {
+      "batch_num_effect_tokens": 6220,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.52909,
+      "grad_norm": 3.2520759105682373,
+      "learning_rate": 3.1916290974199658e-06,
+      "loss": 0.9424,
+      "step": 1682
+    },
+    {
+      "batch_num_effect_tokens": 10256,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52203,
+      "epoch": 1.53,
+      "grad_norm": 2.665754556655884,
+      "learning_rate": 3.1800163993750166e-06,
+      "loss": 0.9723,
+      "step": 1683
+    },
+    {
+      "batch_num_effect_tokens": 9810,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.53091,
+      "grad_norm": 3.15767240524292,
+      "learning_rate": 3.1684208706306572e-06,
+      "loss": 1.5059,
+      "step": 1684
+    },
+    {
+      "batch_num_effect_tokens": 4852,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 1.53182,
+      "grad_norm": 3.166114568710327,
+      "learning_rate": 3.1568425403786175e-06,
+      "loss": 0.6738,
+      "step": 1685
+    },
+    {
+      "batch_num_effect_tokens": 8126,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.53273,
+      "grad_norm": 3.4396119117736816,
+      "learning_rate": 3.1452814377673344e-06,
+      "loss": 1.1103,
+      "step": 1686
+    },
+    {
+      "batch_num_effect_tokens": 4718,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50544,
+      "epoch": 1.53364,
+      "grad_norm": 3.933088779449463,
+      "learning_rate": 3.133737591901864e-06,
+      "loss": 0.5764,
+      "step": 1687
+    },
+    {
+      "batch_num_effect_tokens": 8159,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.53455,
+      "grad_norm": 3.9641008377075195,
+      "learning_rate": 3.12221103184383e-06,
+      "loss": 1.6929,
+      "step": 1688
+    },
+    {
+      "batch_num_effect_tokens": 7258,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52178,
+      "epoch": 1.53545,
+      "grad_norm": 2.8331658840179443,
+      "learning_rate": 3.110701786611333e-06,
+      "loss": 0.778,
+      "step": 1689
+    },
+    {
+      "batch_num_effect_tokens": 8700,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.53636,
+      "grad_norm": 2.990499973297119,
+      "learning_rate": 3.099209885178882e-06,
+      "loss": 1.1492,
+      "step": 1690
+    },
+    {
+      "batch_num_effect_tokens": 5717,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.53727,
+      "grad_norm": 3.5310122966766357,
+      "learning_rate": 3.087735356477326e-06,
+      "loss": 1.0758,
+      "step": 1691
+    },
+    {
+      "batch_num_effect_tokens": 6778,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.53818,
+      "grad_norm": 2.8749349117279053,
+      "learning_rate": 3.076278229393773e-06,
+      "loss": 0.938,
+      "step": 1692
+    },
+    {
+      "batch_num_effect_tokens": 8421,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.53909,
+      "grad_norm": 2.7926547527313232,
+      "learning_rate": 3.0648385327715347e-06,
+      "loss": 0.867,
+      "step": 1693
+    },
+    {
+      "batch_num_effect_tokens": 7804,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.54,
+      "grad_norm": 3.0578343868255615,
+      "learning_rate": 3.0534162954100264e-06,
+      "loss": 1.0182,
+      "step": 1694
+    },
+    {
+      "batch_num_effect_tokens": 4232,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.54091,
+      "grad_norm": 2.940028429031372,
+      "learning_rate": 3.042011546064724e-06,
+      "loss": 0.6103,
+      "step": 1695
+    },
+    {
+      "batch_num_effect_tokens": 6254,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.54182,
+      "grad_norm": 3.4290342330932617,
+      "learning_rate": 3.0306243134470668e-06,
+      "loss": 1.2933,
+      "step": 1696
+    },
+    {
+      "batch_num_effect_tokens": 9518,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.54273,
+      "grad_norm": 2.8330044746398926,
+      "learning_rate": 3.0192546262243993e-06,
+      "loss": 1.2832,
+      "step": 1697
+    },
+    {
+      "batch_num_effect_tokens": 4527,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.54364,
+      "grad_norm": 3.1589434146881104,
+      "learning_rate": 3.0079025130198936e-06,
+      "loss": 0.5135,
+      "step": 1698
+    },
+    {
+      "batch_num_effect_tokens": 9635,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.54455,
+      "grad_norm": 2.468977689743042,
+      "learning_rate": 2.9965680024124856e-06,
+      "loss": 0.9308,
+      "step": 1699
+    },
+    {
+      "batch_num_effect_tokens": 5414,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.54545,
+      "grad_norm": 2.762683153152466,
+      "learning_rate": 2.9852511229367862e-06,
+      "loss": 0.6347,
+      "step": 1700
+    },
+    {
+      "batch_num_effect_tokens": 5226,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.54636,
+      "grad_norm": 3.7109501361846924,
+      "learning_rate": 2.9739519030830333e-06,
+      "loss": 0.9315,
+      "step": 1701
+    },
+    {
+      "batch_num_effect_tokens": 7824,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.54727,
+      "grad_norm": 2.943030595779419,
+      "learning_rate": 2.9626703712969962e-06,
+      "loss": 1.119,
+      "step": 1702
+    },
+    {
+      "batch_num_effect_tokens": 7364,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.54818,
+      "grad_norm": 3.1611106395721436,
+      "learning_rate": 2.9514065559799176e-06,
+      "loss": 1.0515,
+      "step": 1703
+    },
+    {
+      "batch_num_effect_tokens": 6546,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52107,
+      "epoch": 1.54909,
+      "grad_norm": 2.5471692085266113,
+      "learning_rate": 2.940160485488436e-06,
+      "loss": 0.7075,
+      "step": 1704
+    },
+    {
+      "batch_num_effect_tokens": 7447,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 1.55,
+      "grad_norm": 3.0841455459594727,
+      "learning_rate": 2.9289321881345257e-06,
+      "loss": 0.8351,
+      "step": 1705
+    },
+    {
+      "batch_num_effect_tokens": 5888,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.55091,
+      "grad_norm": 3.2254083156585693,
+      "learning_rate": 2.91772169218541e-06,
+      "loss": 0.9222,
+      "step": 1706
+    },
+    {
+      "batch_num_effect_tokens": 7465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.55182,
+      "grad_norm": 3.2685139179229736,
+      "learning_rate": 2.906529025863496e-06,
+      "loss": 1.2465,
+      "step": 1707
+    },
+    {
+      "batch_num_effect_tokens": 5138,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52204,
+      "epoch": 1.55273,
+      "grad_norm": 3.4334542751312256,
+      "learning_rate": 2.8953542173463133e-06,
+      "loss": 0.881,
+      "step": 1708
+    },
+    {
+      "batch_num_effect_tokens": 4731,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.55364,
+      "grad_norm": 3.9102420806884766,
+      "learning_rate": 2.8841972947664255e-06,
+      "loss": 0.603,
+      "step": 1709
+    },
+    {
+      "batch_num_effect_tokens": 5475,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.55455,
+      "grad_norm": 3.3529446125030518,
+      "learning_rate": 2.8730582862113743e-06,
+      "loss": 0.8389,
+      "step": 1710
+    },
+    {
+      "batch_num_effect_tokens": 7815,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 1.55545,
+      "grad_norm": 2.8974621295928955,
+      "learning_rate": 2.861937219723595e-06,
+      "loss": 1.0167,
+      "step": 1711
+    },
+    {
+      "batch_num_effect_tokens": 5746,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.55636,
+      "grad_norm": 2.9369184970855713,
+      "learning_rate": 2.8508341233003656e-06,
+      "loss": 0.7625,
+      "step": 1712
+    },
+    {
+      "batch_num_effect_tokens": 5279,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 1.55727,
+      "grad_norm": 3.400021553039551,
+      "learning_rate": 2.839749024893713e-06,
+      "loss": 1.1009,
+      "step": 1713
+    },
+    {
+      "batch_num_effect_tokens": 6465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52212,
+      "epoch": 1.55818,
+      "grad_norm": 2.686952829360962,
+      "learning_rate": 2.8286819524103657e-06,
+      "loss": 0.6808,
+      "step": 1714
+    },
+    {
+      "batch_num_effect_tokens": 4915,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.55909,
+      "grad_norm": 2.5672800540924072,
+      "learning_rate": 2.8176329337116604e-06,
+      "loss": 0.5838,
+      "step": 1715
+    },
+    {
+      "batch_num_effect_tokens": 5828,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.56,
+      "grad_norm": 3.3081350326538086,
+      "learning_rate": 2.8066019966134907e-06,
+      "loss": 0.9368,
+      "step": 1716
+    },
+    {
+      "batch_num_effect_tokens": 8521,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.56091,
+      "grad_norm": 2.824108362197876,
+      "learning_rate": 2.7955891688862235e-06,
+      "loss": 0.957,
+      "step": 1717
+    },
+    {
+      "batch_num_effect_tokens": 6462,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.56182,
+      "grad_norm": 3.050898551940918,
+      "learning_rate": 2.7845944782546453e-06,
+      "loss": 1.1143,
+      "step": 1718
+    },
+    {
+      "batch_num_effect_tokens": 9397,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.56273,
+      "grad_norm": 2.890437602996826,
+      "learning_rate": 2.773617952397871e-06,
+      "loss": 1.2549,
+      "step": 1719
+    },
+    {
+      "batch_num_effect_tokens": 7103,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52114,
+      "epoch": 1.56364,
+      "grad_norm": 3.1431565284729004,
+      "learning_rate": 2.7626596189492983e-06,
+      "loss": 1.072,
+      "step": 1720
+    },
+    {
+      "batch_num_effect_tokens": 8693,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52179,
+      "epoch": 1.56455,
+      "grad_norm": 2.832227945327759,
+      "learning_rate": 2.751719505496514e-06,
+      "loss": 1.0417,
+      "step": 1721
+    },
+    {
+      "batch_num_effect_tokens": 5353,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.56545,
+      "grad_norm": 3.170254707336426,
+      "learning_rate": 2.7407976395812417e-06,
+      "loss": 0.7899,
+      "step": 1722
+    },
+    {
+      "batch_num_effect_tokens": 5371,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52212,
+      "epoch": 1.56636,
+      "grad_norm": 3.459228754043579,
+      "learning_rate": 2.7298940486992654e-06,
+      "loss": 1.0546,
+      "step": 1723
+    },
+    {
+      "batch_num_effect_tokens": 7647,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.56727,
+      "grad_norm": 3.55985164642334,
+      "learning_rate": 2.719008760300359e-06,
+      "loss": 1.4869,
+      "step": 1724
+    },
+    {
+      "batch_num_effect_tokens": 4177,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.56818,
+      "grad_norm": 3.6153676509857178,
+      "learning_rate": 2.70814180178823e-06,
+      "loss": 0.6789,
+      "step": 1725
+    },
+    {
+      "batch_num_effect_tokens": 8808,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.56909,
+      "grad_norm": 3.0937259197235107,
+      "learning_rate": 2.6972932005204267e-06,
+      "loss": 1.286,
+      "step": 1726
+    },
+    {
+      "batch_num_effect_tokens": 3874,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.57,
+      "grad_norm": 3.9511656761169434,
+      "learning_rate": 2.6864629838082957e-06,
+      "loss": 0.9757,
+      "step": 1727
+    },
+    {
+      "batch_num_effect_tokens": 5497,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.57091,
+      "grad_norm": 3.421701669692993,
+      "learning_rate": 2.6756511789168926e-06,
+      "loss": 0.9162,
+      "step": 1728
+    },
+    {
+      "batch_num_effect_tokens": 5104,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.57182,
+      "grad_norm": 3.42718505859375,
+      "learning_rate": 2.6648578130649215e-06,
+      "loss": 0.7696,
+      "step": 1729
+    },
+    {
+      "batch_num_effect_tokens": 7263,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 1.57273,
+      "grad_norm": 2.9832472801208496,
+      "learning_rate": 2.6540829134246683e-06,
+      "loss": 0.9055,
+      "step": 1730
+    },
+    {
+      "batch_num_effect_tokens": 9577,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 1.57364,
+      "grad_norm": 2.9695019721984863,
+      "learning_rate": 2.643326507121933e-06,
+      "loss": 1.2408,
+      "step": 1731
+    },
+    {
+      "batch_num_effect_tokens": 7249,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.57455,
+      "grad_norm": 3.132408857345581,
+      "learning_rate": 2.6325886212359496e-06,
+      "loss": 1.2289,
+      "step": 1732
+    },
+    {
+      "batch_num_effect_tokens": 7387,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.57545,
+      "grad_norm": 3.4658567905426025,
+      "learning_rate": 2.621869282799342e-06,
+      "loss": 1.4882,
+      "step": 1733
+    },
+    {
+      "batch_num_effect_tokens": 5103,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.57636,
+      "grad_norm": 3.2325146198272705,
+      "learning_rate": 2.611168518798026e-06,
+      "loss": 1.085,
+      "step": 1734
+    },
+    {
+      "batch_num_effect_tokens": 5651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.57727,
+      "grad_norm": 2.9820148944854736,
+      "learning_rate": 2.6004863561711633e-06,
+      "loss": 0.6723,
+      "step": 1735
+    },
+    {
+      "batch_num_effect_tokens": 5592,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.57818,
+      "grad_norm": 2.9091336727142334,
+      "learning_rate": 2.5898228218110834e-06,
+      "loss": 0.7578,
+      "step": 1736
+    },
+    {
+      "batch_num_effect_tokens": 6166,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.57909,
+      "grad_norm": 3.0767745971679688,
+      "learning_rate": 2.5791779425632257e-06,
+      "loss": 0.936,
+      "step": 1737
+    },
+    {
+      "batch_num_effect_tokens": 4677,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50568,
+      "epoch": 1.58,
+      "grad_norm": 2.596431255340576,
+      "learning_rate": 2.5685517452260566e-06,
+      "loss": 0.4832,
+      "step": 1738
+    },
+    {
+      "batch_num_effect_tokens": 4098,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.58091,
+      "grad_norm": 3.6022675037384033,
+      "learning_rate": 2.5579442565510205e-06,
+      "loss": 0.823,
+      "step": 1739
+    },
+    {
+      "batch_num_effect_tokens": 7194,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52168,
+      "epoch": 1.58182,
+      "grad_norm": 2.7989399433135986,
+      "learning_rate": 2.5473555032424534e-06,
+      "loss": 0.9279,
+      "step": 1740
+    },
+    {
+      "batch_num_effect_tokens": 5119,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 1.58273,
+      "grad_norm": 2.3581643104553223,
+      "learning_rate": 2.5367855119575314e-06,
+      "loss": 0.3767,
+      "step": 1741
+    },
+    {
+      "batch_num_effect_tokens": 5965,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 1.58364,
+      "grad_norm": 3.442153215408325,
+      "learning_rate": 2.526234309306194e-06,
+      "loss": 0.9723,
+      "step": 1742
+    },
+    {
+      "batch_num_effect_tokens": 5158,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.58455,
+      "grad_norm": 3.5384604930877686,
+      "learning_rate": 2.515701921851077e-06,
+      "loss": 0.8521,
+      "step": 1743
+    },
+    {
+      "batch_num_effect_tokens": 7348,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52167,
+      "epoch": 1.58545,
+      "grad_norm": 3.1147358417510986,
+      "learning_rate": 2.5051883761074613e-06,
+      "loss": 1.1336,
+      "step": 1744
+    },
+    {
+      "batch_num_effect_tokens": 7892,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.58636,
+      "grad_norm": 2.0374927520751953,
+      "learning_rate": 2.494693698543179e-06,
+      "loss": 0.4602,
+      "step": 1745
+    },
+    {
+      "batch_num_effect_tokens": 8038,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.58727,
+      "grad_norm": 3.6828629970550537,
+      "learning_rate": 2.484217915578574e-06,
+      "loss": 1.5568,
+      "step": 1746
+    },
+    {
+      "batch_num_effect_tokens": 6741,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.58818,
+      "grad_norm": 3.2681405544281006,
+      "learning_rate": 2.4737610535864145e-06,
+      "loss": 0.9953,
+      "step": 1747
+    },
+    {
+      "batch_num_effect_tokens": 5818,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52173,
+      "epoch": 1.58909,
+      "grad_norm": 2.930443048477173,
+      "learning_rate": 2.4633231388918377e-06,
+      "loss": 0.788,
+      "step": 1748
+    },
+    {
+      "batch_num_effect_tokens": 7963,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52201,
+      "epoch": 1.59,
+      "grad_norm": 3.2641735076904297,
+      "learning_rate": 2.45290419777228e-06,
+      "loss": 1.3389,
+      "step": 1749
+    },
+    {
+      "batch_num_effect_tokens": 4948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.59091,
+      "grad_norm": 3.009199857711792,
+      "learning_rate": 2.4425042564574186e-06,
+      "loss": 0.4797,
+      "step": 1750
+    },
+    {
+      "batch_num_effect_tokens": 6445,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.59182,
+      "grad_norm": 3.7565250396728516,
+      "learning_rate": 2.432123341129087e-06,
+      "loss": 1.199,
+      "step": 1751
+    },
+    {
+      "batch_num_effect_tokens": 5496,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.59273,
+      "grad_norm": 3.0002763271331787,
+      "learning_rate": 2.421761477921232e-06,
+      "loss": 0.711,
+      "step": 1752
+    },
+    {
+      "batch_num_effect_tokens": 5755,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.59364,
+      "grad_norm": 3.30511474609375,
+      "learning_rate": 2.411418692919831e-06,
+      "loss": 1.0191,
+      "step": 1753
+    },
+    {
+      "batch_num_effect_tokens": 8357,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52084,
+      "epoch": 1.59455,
+      "grad_norm": 3.0193865299224854,
+      "learning_rate": 2.401095012162832e-06,
+      "loss": 1.3428,
+      "step": 1754
+    },
+    {
+      "batch_num_effect_tokens": 7660,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52153,
+      "epoch": 1.59545,
+      "grad_norm": 2.9472556114196777,
+      "learning_rate": 2.3907904616400855e-06,
+      "loss": 1.1688,
+      "step": 1755
+    },
+    {
+      "batch_num_effect_tokens": 7234,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52188,
+      "epoch": 1.59636,
+      "grad_norm": 2.4208133220672607,
+      "learning_rate": 2.380505067293293e-06,
+      "loss": 0.6147,
+      "step": 1756
+    },
+    {
+      "batch_num_effect_tokens": 8168,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.59727,
+      "grad_norm": 2.9983484745025635,
+      "learning_rate": 2.3702388550159172e-06,
+      "loss": 1.083,
+      "step": 1757
+    },
+    {
+      "batch_num_effect_tokens": 8542,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52192,
+      "epoch": 1.59818,
+      "grad_norm": 2.831907272338867,
+      "learning_rate": 2.3599918506531337e-06,
+      "loss": 1.1711,
+      "step": 1758
+    },
+    {
+      "batch_num_effect_tokens": 7883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 1.59909,
+      "grad_norm": 3.413278818130493,
+      "learning_rate": 2.3497640800017687e-06,
+      "loss": 1.4813,
+      "step": 1759
+    },
+    {
+      "batch_num_effect_tokens": 4681,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.6,
+      "grad_norm": 2.6594529151916504,
+      "learning_rate": 2.339555568810221e-06,
+      "loss": 0.581,
+      "step": 1760
+    },
+    {
+      "batch_num_effect_tokens": 10408,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 1.60091,
+      "grad_norm": 2.7606842517852783,
+      "learning_rate": 2.329366342778404e-06,
+      "loss": 1.3419,
+      "step": 1761
+    },
+    {
+      "batch_num_effect_tokens": 7211,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.60182,
+      "grad_norm": 2.9090163707733154,
+      "learning_rate": 2.3191964275576806e-06,
+      "loss": 0.8893,
+      "step": 1762
+    },
+    {
+      "batch_num_effect_tokens": 6113,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 1.60273,
+      "grad_norm": 3.3374078273773193,
+      "learning_rate": 2.309045848750806e-06,
+      "loss": 1.1442,
+      "step": 1763
+    },
+    {
+      "batch_num_effect_tokens": 8686,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.60364,
+      "grad_norm": 2.685124158859253,
+      "learning_rate": 2.2989146319118428e-06,
+      "loss": 1.0721,
+      "step": 1764
+    },
+    {
+      "batch_num_effect_tokens": 8679,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.60455,
+      "grad_norm": 3.2319931983947754,
+      "learning_rate": 2.288802802546124e-06,
+      "loss": 1.5218,
+      "step": 1765
+    },
+    {
+      "batch_num_effect_tokens": 7823,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.60545,
+      "grad_norm": 2.9048807621002197,
+      "learning_rate": 2.2787103861101656e-06,
+      "loss": 1.1865,
+      "step": 1766
+    },
+    {
+      "batch_num_effect_tokens": 6285,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.60636,
+      "grad_norm": 3.122467279434204,
+      "learning_rate": 2.2686374080116136e-06,
+      "loss": 1.092,
+      "step": 1767
+    },
+    {
+      "batch_num_effect_tokens": 7060,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.60727,
+      "grad_norm": 2.725559711456299,
+      "learning_rate": 2.2585838936091753e-06,
+      "loss": 0.6991,
+      "step": 1768
+    },
+    {
+      "batch_num_effect_tokens": 8588,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52182,
+      "epoch": 1.60818,
+      "grad_norm": 3.078824758529663,
+      "learning_rate": 2.2485498682125674e-06,
+      "loss": 1.3859,
+      "step": 1769
+    },
+    {
+      "batch_num_effect_tokens": 7064,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52107,
+      "epoch": 1.60909,
+      "grad_norm": 2.6935298442840576,
+      "learning_rate": 2.2385353570824308e-06,
+      "loss": 0.8102,
+      "step": 1770
+    },
+    {
+      "batch_num_effect_tokens": 5637,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.61,
+      "grad_norm": 3.3661398887634277,
+      "learning_rate": 2.2285403854302912e-06,
+      "loss": 1.0509,
+      "step": 1771
+    },
+    {
+      "batch_num_effect_tokens": 6232,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.61091,
+      "grad_norm": 3.273484706878662,
+      "learning_rate": 2.218564978418475e-06,
+      "loss": 0.7791,
+      "step": 1772
+    },
+    {
+      "batch_num_effect_tokens": 3892,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.61182,
+      "grad_norm": 4.022351264953613,
+      "learning_rate": 2.208609161160057e-06,
+      "loss": 0.8899,
+      "step": 1773
+    },
+    {
+      "batch_num_effect_tokens": 5333,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.61273,
+      "grad_norm": 2.932511806488037,
+      "learning_rate": 2.198672958718796e-06,
+      "loss": 0.7308,
+      "step": 1774
+    },
+    {
+      "batch_num_effect_tokens": 6280,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50569,
+      "epoch": 1.61364,
+      "grad_norm": 2.7373950481414795,
+      "learning_rate": 2.1887563961090664e-06,
+      "loss": 0.634,
+      "step": 1775
+    },
+    {
+      "batch_num_effect_tokens": 5883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.61455,
+      "grad_norm": 3.1122817993164062,
+      "learning_rate": 2.1788594982958087e-06,
+      "loss": 0.8156,
+      "step": 1776
+    },
+    {
+      "batch_num_effect_tokens": 8606,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 1.61545,
+      "grad_norm": 2.974762201309204,
+      "learning_rate": 2.1689822901944456e-06,
+      "loss": 1.2899,
+      "step": 1777
+    },
+    {
+      "batch_num_effect_tokens": 5448,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.61636,
+      "grad_norm": 3.2081286907196045,
+      "learning_rate": 2.159124796670843e-06,
+      "loss": 0.7461,
+      "step": 1778
+    },
+    {
+      "batch_num_effect_tokens": 7055,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.61727,
+      "grad_norm": 3.310664653778076,
+      "learning_rate": 2.149287042541225e-06,
+      "loss": 1.3933,
+      "step": 1779
+    },
+    {
+      "batch_num_effect_tokens": 7378,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.61818,
+      "grad_norm": 2.7810285091400146,
+      "learning_rate": 2.1394690525721275e-06,
+      "loss": 0.9525,
+      "step": 1780
+    },
+    {
+      "batch_num_effect_tokens": 6656,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52143,
+      "epoch": 1.61909,
+      "grad_norm": 3.0689432621002197,
+      "learning_rate": 2.1296708514803244e-06,
+      "loss": 1.1013,
+      "step": 1781
+    },
+    {
+      "batch_num_effect_tokens": 6009,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.62,
+      "grad_norm": 2.846339702606201,
+      "learning_rate": 2.119892463932781e-06,
+      "loss": 0.6385,
+      "step": 1782
+    },
+    {
+      "batch_num_effect_tokens": 7005,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.62091,
+      "grad_norm": 3.1606693267822266,
+      "learning_rate": 2.1101339145465725e-06,
+      "loss": 1.0057,
+      "step": 1783
+    },
+    {
+      "batch_num_effect_tokens": 6086,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 1.62182,
+      "grad_norm": 3.815751552581787,
+      "learning_rate": 2.1003952278888382e-06,
+      "loss": 1.5689,
+      "step": 1784
+    },
+    {
+      "batch_num_effect_tokens": 5806,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 1.62273,
+      "grad_norm": 3.289590835571289,
+      "learning_rate": 2.090676428476709e-06,
+      "loss": 1.0711,
+      "step": 1785
+    },
+    {
+      "batch_num_effect_tokens": 3409,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 1.62364,
+      "grad_norm": 2.739345073699951,
+      "learning_rate": 2.0809775407772505e-06,
+      "loss": 0.215,
+      "step": 1786
+    },
+    {
+      "batch_num_effect_tokens": 8592,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.62455,
+      "grad_norm": 2.8174004554748535,
+      "learning_rate": 2.071298589207399e-06,
+      "loss": 1.2823,
+      "step": 1787
+    },
+    {
+      "batch_num_effect_tokens": 7297,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.62545,
+      "grad_norm": 2.857300281524658,
+      "learning_rate": 2.0616395981339076e-06,
+      "loss": 1.0295,
+      "step": 1788
+    },
+    {
+      "batch_num_effect_tokens": 6110,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.62636,
+      "grad_norm": 3.4632177352905273,
+      "learning_rate": 2.05200059187327e-06,
+      "loss": 1.2141,
+      "step": 1789
+    },
+    {
+      "batch_num_effect_tokens": 6218,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.62727,
+      "grad_norm": 3.2370378971099854,
+      "learning_rate": 2.0423815946916783e-06,
+      "loss": 0.9353,
+      "step": 1790
+    },
+    {
+      "batch_num_effect_tokens": 4699,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.62818,
+      "grad_norm": 4.399960041046143,
+      "learning_rate": 2.032782630804945e-06,
+      "loss": 0.7076,
+      "step": 1791
+    },
+    {
+      "batch_num_effect_tokens": 5507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.62909,
+      "grad_norm": 3.497896671295166,
+      "learning_rate": 2.0232037243784475e-06,
+      "loss": 1.2132,
+      "step": 1792
+    },
+    {
+      "batch_num_effect_tokens": 6461,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.63,
+      "grad_norm": 2.8389227390289307,
+      "learning_rate": 2.013644899527074e-06,
+      "loss": 1.0196,
+      "step": 1793
+    },
+    {
+      "batch_num_effect_tokens": 6493,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.63091,
+      "grad_norm": 3.1590051651000977,
+      "learning_rate": 2.004106180315151e-06,
+      "loss": 1.1268,
+      "step": 1794
+    },
+    {
+      "batch_num_effect_tokens": 7699,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 1.63182,
+      "grad_norm": 3.41550874710083,
+      "learning_rate": 1.994587590756397e-06,
+      "loss": 1.2996,
+      "step": 1795
+    },
+    {
+      "batch_num_effect_tokens": 7288,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.63273,
+      "grad_norm": 3.010758399963379,
+      "learning_rate": 1.9850891548138463e-06,
+      "loss": 1.2565,
+      "step": 1796
+    },
+    {
+      "batch_num_effect_tokens": 6014,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.63364,
+      "grad_norm": 3.2456741333007812,
+      "learning_rate": 1.9756108963998054e-06,
+      "loss": 1.1311,
+      "step": 1797
+    },
+    {
+      "batch_num_effect_tokens": 4547,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52156,
+      "epoch": 1.63455,
+      "grad_norm": 2.8084323406219482,
+      "learning_rate": 1.9661528393757744e-06,
+      "loss": 0.4857,
+      "step": 1798
+    },
+    {
+      "batch_num_effect_tokens": 6305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.63545,
+      "grad_norm": 3.300767183303833,
+      "learning_rate": 1.956715007552401e-06,
+      "loss": 0.9622,
+      "step": 1799
+    },
+    {
+      "batch_num_effect_tokens": 7916,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.63636,
+      "grad_norm": 2.9808456897735596,
+      "learning_rate": 1.947297424689414e-06,
+      "loss": 0.9933,
+      "step": 1800
+    },
+    {
+      "batch_num_effect_tokens": 6046,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.63727,
+      "grad_norm": 3.626404285430908,
+      "learning_rate": 1.9379001144955713e-06,
+      "loss": 1.194,
+      "step": 1801
+    },
+    {
+      "batch_num_effect_tokens": 6036,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.63818,
+      "grad_norm": 3.19749116897583,
+      "learning_rate": 1.9285231006285855e-06,
+      "loss": 0.9358,
+      "step": 1802
+    },
+    {
+      "batch_num_effect_tokens": 4047,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50510,
+      "epoch": 1.63909,
+      "grad_norm": 2.7461113929748535,
+      "learning_rate": 1.9191664066950834e-06,
+      "loss": 0.4657,
+      "step": 1803
+    },
+    {
+      "batch_num_effect_tokens": 6395,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.64,
+      "grad_norm": 2.6643176078796387,
+      "learning_rate": 1.9098300562505266e-06,
+      "loss": 0.8469,
+      "step": 1804
+    },
+    {
+      "batch_num_effect_tokens": 4585,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.64091,
+      "grad_norm": 2.729966163635254,
+      "learning_rate": 1.9005140727991678e-06,
+      "loss": 0.5909,
+      "step": 1805
+    },
+    {
+      "batch_num_effect_tokens": 6252,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.64182,
+      "grad_norm": 3.1414084434509277,
+      "learning_rate": 1.8912184797939803e-06,
+      "loss": 1.0312,
+      "step": 1806
+    },
+    {
+      "batch_num_effect_tokens": 5722,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.64273,
+      "grad_norm": 3.29822039604187,
+      "learning_rate": 1.881943300636615e-06,
+      "loss": 1.0402,
+      "step": 1807
+    },
+    {
+      "batch_num_effect_tokens": 8174,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.64364,
+      "grad_norm": 3.040043830871582,
+      "learning_rate": 1.8726885586773213e-06,
+      "loss": 1.1538,
+      "step": 1808
+    },
+    {
+      "batch_num_effect_tokens": 6188,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52186,
+      "epoch": 1.64455,
+      "grad_norm": 3.0945098400115967,
+      "learning_rate": 1.8634542772148978e-06,
+      "loss": 0.7199,
+      "step": 1809
+    },
+    {
+      "batch_num_effect_tokens": 5535,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.64545,
+      "grad_norm": 2.3633594512939453,
+      "learning_rate": 1.854240479496643e-06,
+      "loss": 0.3655,
+      "step": 1810
+    },
+    {
+      "batch_num_effect_tokens": 7126,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.64636,
+      "grad_norm": 2.8708362579345703,
+      "learning_rate": 1.8450471887182797e-06,
+      "loss": 0.8611,
+      "step": 1811
+    },
+    {
+      "batch_num_effect_tokens": 5189,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.64727,
+      "grad_norm": 4.187164783477783,
+      "learning_rate": 1.8358744280239048e-06,
+      "loss": 1.5781,
+      "step": 1812
+    },
+    {
+      "batch_num_effect_tokens": 5376,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.64818,
+      "grad_norm": 3.8114173412323,
+      "learning_rate": 1.826722220505931e-06,
+      "loss": 0.947,
+      "step": 1813
+    },
+    {
+      "batch_num_effect_tokens": 7132,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.64909,
+      "grad_norm": 3.5395960807800293,
+      "learning_rate": 1.817590589205035e-06,
+      "loss": 1.1665,
+      "step": 1814
+    },
+    {
+      "batch_num_effect_tokens": 6801,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50546,
+      "epoch": 1.65,
+      "grad_norm": 3.047496795654297,
+      "learning_rate": 1.808479557110081e-06,
+      "loss": 0.897,
+      "step": 1815
+    },
+    {
+      "batch_num_effect_tokens": 6263,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.65091,
+      "grad_norm": 2.9120888710021973,
+      "learning_rate": 1.7993891471580894e-06,
+      "loss": 0.9763,
+      "step": 1816
+    },
+    {
+      "batch_num_effect_tokens": 8410,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.65182,
+      "grad_norm": 2.9664645195007324,
+      "learning_rate": 1.7903193822341513e-06,
+      "loss": 1.3164,
+      "step": 1817
+    },
+    {
+      "batch_num_effect_tokens": 4368,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.65273,
+      "grad_norm": 3.506654977798462,
+      "learning_rate": 1.7812702851713904e-06,
+      "loss": 0.6319,
+      "step": 1818
+    },
+    {
+      "batch_num_effect_tokens": 6964,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.65364,
+      "grad_norm": 3.4171743392944336,
+      "learning_rate": 1.7722418787508956e-06,
+      "loss": 1.1607,
+      "step": 1819
+    },
+    {
+      "batch_num_effect_tokens": 5712,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 1.65455,
+      "grad_norm": 3.163783550262451,
+      "learning_rate": 1.7632341857016733e-06,
+      "loss": 0.8253,
+      "step": 1820
+    },
+    {
+      "batch_num_effect_tokens": 9105,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.65545,
+      "grad_norm": 2.640961170196533,
+      "learning_rate": 1.754247228700575e-06,
+      "loss": 1.1542,
+      "step": 1821
+    },
+    {
+      "batch_num_effect_tokens": 5878,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.65636,
+      "grad_norm": 3.138787269592285,
+      "learning_rate": 1.74528103037226e-06,
+      "loss": 1.106,
+      "step": 1822
+    },
+    {
+      "batch_num_effect_tokens": 8034,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.65727,
+      "grad_norm": 3.100926399230957,
+      "learning_rate": 1.7363356132891196e-06,
+      "loss": 1.0607,
+      "step": 1823
+    },
+    {
+      "batch_num_effect_tokens": 7090,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.65818,
+      "grad_norm": 2.5713181495666504,
+      "learning_rate": 1.7274109999712295e-06,
+      "loss": 0.8876,
+      "step": 1824
+    },
+    {
+      "batch_num_effect_tokens": 8023,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.65909,
+      "grad_norm": 3.4491758346557617,
+      "learning_rate": 1.7185072128862934e-06,
+      "loss": 1.3966,
+      "step": 1825
+    },
+    {
+      "batch_num_effect_tokens": 5171,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.66,
+      "grad_norm": 2.647582530975342,
+      "learning_rate": 1.709624274449584e-06,
+      "loss": 0.6023,
+      "step": 1826
+    },
+    {
+      "batch_num_effect_tokens": 4286,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.66091,
+      "grad_norm": 2.4348816871643066,
+      "learning_rate": 1.7007622070238905e-06,
+      "loss": 0.3872,
+      "step": 1827
+    },
+    {
+      "batch_num_effect_tokens": 7501,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.66182,
+      "grad_norm": 3.173098087310791,
+      "learning_rate": 1.6919210329194535e-06,
+      "loss": 1.2194,
+      "step": 1828
+    },
+    {
+      "batch_num_effect_tokens": 7094,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.66273,
+      "grad_norm": 3.285263776779175,
+      "learning_rate": 1.6831007743939231e-06,
+      "loss": 1.0569,
+      "step": 1829
+    },
+    {
+      "batch_num_effect_tokens": 8286,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52178,
+      "epoch": 1.66364,
+      "grad_norm": 2.7359249591827393,
+      "learning_rate": 1.6743014536522872e-06,
+      "loss": 0.9939,
+      "step": 1830
+    },
+    {
+      "batch_num_effect_tokens": 6013,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.66455,
+      "grad_norm": 3.9989168643951416,
+      "learning_rate": 1.6655230928468257e-06,
+      "loss": 0.8864,
+      "step": 1831
+    },
+    {
+      "batch_num_effect_tokens": 7145,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52131,
+      "epoch": 1.66545,
+      "grad_norm": 3.026437759399414,
+      "learning_rate": 1.6567657140770477e-06,
+      "loss": 1.0664,
+      "step": 1832
+    },
+    {
+      "batch_num_effect_tokens": 6330,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52192,
+      "epoch": 1.66636,
+      "grad_norm": 3.3815693855285645,
+      "learning_rate": 1.6480293393896508e-06,
+      "loss": 1.0099,
+      "step": 1833
+    },
+    {
+      "batch_num_effect_tokens": 4332,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 1.66727,
+      "grad_norm": 1.886484980583191,
+      "learning_rate": 1.6393139907784405e-06,
+      "loss": 0.1685,
+      "step": 1834
+    },
+    {
+      "batch_num_effect_tokens": 6494,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.66818,
+      "grad_norm": 2.715851306915283,
+      "learning_rate": 1.630619690184303e-06,
+      "loss": 0.7091,
+      "step": 1835
+    },
+    {
+      "batch_num_effect_tokens": 7968,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52213,
+      "epoch": 1.66909,
+      "grad_norm": 2.8430440425872803,
+      "learning_rate": 1.6219464594951273e-06,
+      "loss": 1.0437,
+      "step": 1836
+    },
+    {
+      "batch_num_effect_tokens": 10572,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.67,
+      "grad_norm": 2.509157657623291,
+      "learning_rate": 1.6132943205457607e-06,
+      "loss": 1.0703,
+      "step": 1837
+    },
+    {
+      "batch_num_effect_tokens": 8004,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.67091,
+      "grad_norm": 3.3842663764953613,
+      "learning_rate": 1.6046632951179508e-06,
+      "loss": 1.3952,
+      "step": 1838
+    },
+    {
+      "batch_num_effect_tokens": 6566,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.67182,
+      "grad_norm": 3.1450865268707275,
+      "learning_rate": 1.5960534049402987e-06,
+      "loss": 0.9144,
+      "step": 1839
+    },
+    {
+      "batch_num_effect_tokens": 6941,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.67273,
+      "grad_norm": 3.1405293941497803,
+      "learning_rate": 1.587464671688187e-06,
+      "loss": 1.0331,
+      "step": 1840
+    },
+    {
+      "batch_num_effect_tokens": 4996,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.67364,
+      "grad_norm": 3.9549872875213623,
+      "learning_rate": 1.5788971169837474e-06,
+      "loss": 1.1802,
+      "step": 1841
+    },
+    {
+      "batch_num_effect_tokens": 6037,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.67455,
+      "grad_norm": 2.9194626808166504,
+      "learning_rate": 1.5703507623957848e-06,
+      "loss": 0.7473,
+      "step": 1842
+    },
+    {
+      "batch_num_effect_tokens": 9964,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.67545,
+      "grad_norm": 2.755096912384033,
+      "learning_rate": 1.5618256294397383e-06,
+      "loss": 1.011,
+      "step": 1843
+    },
+    {
+      "batch_num_effect_tokens": 7898,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52219,
+      "epoch": 1.67636,
+      "grad_norm": 2.4391679763793945,
+      "learning_rate": 1.553321739577619e-06,
+      "loss": 0.9038,
+      "step": 1844
+    },
+    {
+      "batch_num_effect_tokens": 3673,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52200,
+      "epoch": 1.67727,
+      "grad_norm": 3.347842216491699,
+      "learning_rate": 1.5448391142179575e-06,
+      "loss": 0.77,
+      "step": 1845
+    },
+    {
+      "batch_num_effect_tokens": 5867,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.67818,
+      "grad_norm": 4.084648132324219,
+      "learning_rate": 1.536377774715757e-06,
+      "loss": 1.792,
+      "step": 1846
+    },
+    {
+      "batch_num_effect_tokens": 6421,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.67909,
+      "grad_norm": 2.802750587463379,
+      "learning_rate": 1.5279377423724261e-06,
+      "loss": 0.7052,
+      "step": 1847
+    },
+    {
+      "batch_num_effect_tokens": 7452,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.68,
+      "grad_norm": 2.640302896499634,
+      "learning_rate": 1.5195190384357405e-06,
+      "loss": 0.8296,
+      "step": 1848
+    },
+    {
+      "batch_num_effect_tokens": 9075,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.68091,
+      "grad_norm": 3.084359884262085,
+      "learning_rate": 1.5111216840997745e-06,
+      "loss": 1.3787,
+      "step": 1849
+    },
+    {
+      "batch_num_effect_tokens": 6113,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52094,
+      "epoch": 1.68182,
+      "grad_norm": 3.2951931953430176,
+      "learning_rate": 1.5027457005048573e-06,
+      "loss": 1.1228,
+      "step": 1850
+    },
+    {
+      "batch_num_effect_tokens": 4228,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.68273,
+      "grad_norm": 3.9379076957702637,
+      "learning_rate": 1.4943911087375173e-06,
+      "loss": 0.8264,
+      "step": 1851
+    },
+    {
+      "batch_num_effect_tokens": 4946,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 1.68364,
+      "grad_norm": 2.608274459838867,
+      "learning_rate": 1.4860579298304311e-06,
+      "loss": 0.517,
+      "step": 1852
+    },
+    {
+      "batch_num_effect_tokens": 6964,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.68455,
+      "grad_norm": 3.3249189853668213,
+      "learning_rate": 1.4777461847623653e-06,
+      "loss": 1.1667,
+      "step": 1853
+    },
+    {
+      "batch_num_effect_tokens": 7873,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.68545,
+      "grad_norm": 3.2587575912475586,
+      "learning_rate": 1.4694558944581294e-06,
+      "loss": 1.5438,
+      "step": 1854
+    },
+    {
+      "batch_num_effect_tokens": 7917,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.68636,
+      "grad_norm": 3.054732322692871,
+      "learning_rate": 1.4611870797885196e-06,
+      "loss": 1.1736,
+      "step": 1855
+    },
+    {
+      "batch_num_effect_tokens": 9724,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.68727,
+      "grad_norm": 2.0472991466522217,
+      "learning_rate": 1.4529397615702656e-06,
+      "loss": 0.7589,
+      "step": 1856
+    },
+    {
+      "batch_num_effect_tokens": 6360,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 1.68818,
+      "grad_norm": 3.0648200511932373,
+      "learning_rate": 1.44471396056598e-06,
+      "loss": 0.9101,
+      "step": 1857
+    },
+    {
+      "batch_num_effect_tokens": 5741,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.68909,
+      "grad_norm": 2.9164628982543945,
+      "learning_rate": 1.436509697484111e-06,
+      "loss": 0.695,
+      "step": 1858
+    },
+    {
+      "batch_num_effect_tokens": 6632,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.69,
+      "grad_norm": 4.474465847015381,
+      "learning_rate": 1.4283269929788779e-06,
+      "loss": 1.7564,
+      "step": 1859
+    },
+    {
+      "batch_num_effect_tokens": 5356,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.69091,
+      "grad_norm": 3.284327745437622,
+      "learning_rate": 1.4201658676502294e-06,
+      "loss": 0.8759,
+      "step": 1860
+    },
+    {
+      "batch_num_effect_tokens": 8474,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.69182,
+      "grad_norm": 2.9198808670043945,
+      "learning_rate": 1.4120263420437919e-06,
+      "loss": 1.4958,
+      "step": 1861
+    },
+    {
+      "batch_num_effect_tokens": 8280,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.69273,
+      "grad_norm": 2.9854891300201416,
+      "learning_rate": 1.4039084366508094e-06,
+      "loss": 1.1542,
+      "step": 1862
+    },
+    {
+      "batch_num_effect_tokens": 4069,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.69364,
+      "grad_norm": 4.182518005371094,
+      "learning_rate": 1.3958121719080986e-06,
+      "loss": 1.0759,
+      "step": 1863
+    },
+    {
+      "batch_num_effect_tokens": 4885,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 1.69455,
+      "grad_norm": 3.351048469543457,
+      "learning_rate": 1.3877375681979944e-06,
+      "loss": 0.766,
+      "step": 1864
+    },
+    {
+      "batch_num_effect_tokens": 6128,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.69545,
+      "grad_norm": 3.1366031169891357,
+      "learning_rate": 1.379684645848307e-06,
+      "loss": 0.8039,
+      "step": 1865
+    },
+    {
+      "batch_num_effect_tokens": 6641,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52152,
+      "epoch": 1.69636,
+      "grad_norm": 2.8246731758117676,
+      "learning_rate": 1.3716534251322543e-06,
+      "loss": 0.765,
+      "step": 1866
+    },
+    {
+      "batch_num_effect_tokens": 5457,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.69727,
+      "grad_norm": 2.756589889526367,
+      "learning_rate": 1.3636439262684299e-06,
+      "loss": 0.7063,
+      "step": 1867
+    },
+    {
+      "batch_num_effect_tokens": 6369,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 1.69818,
+      "grad_norm": 3.16241192817688,
+      "learning_rate": 1.3556561694207337e-06,
+      "loss": 0.9265,
+      "step": 1868
+    },
+    {
+      "batch_num_effect_tokens": 7286,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52219,
+      "epoch": 1.69909,
+      "grad_norm": 2.5733494758605957,
+      "learning_rate": 1.347690174698335e-06,
+      "loss": 0.5214,
+      "step": 1869
+    },
+    {
+      "batch_num_effect_tokens": 7002,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.7,
+      "grad_norm": 2.967886447906494,
+      "learning_rate": 1.339745962155613e-06,
+      "loss": 1.0079,
+      "step": 1870
+    },
+    {
+      "batch_num_effect_tokens": 5487,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.70091,
+      "grad_norm": 3.7134454250335693,
+      "learning_rate": 1.3318235517921197e-06,
+      "loss": 1.1692,
+      "step": 1871
+    },
+    {
+      "batch_num_effect_tokens": 5793,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 1.70182,
+      "grad_norm": 2.976924419403076,
+      "learning_rate": 1.3239229635525074e-06,
+      "loss": 0.7546,
+      "step": 1872
+    },
+    {
+      "batch_num_effect_tokens": 7355,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.70273,
+      "grad_norm": 2.6366283893585205,
+      "learning_rate": 1.3160442173265032e-06,
+      "loss": 0.8432,
+      "step": 1873
+    },
+    {
+      "batch_num_effect_tokens": 5844,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.70364,
+      "grad_norm": 3.6266331672668457,
+      "learning_rate": 1.3081873329488393e-06,
+      "loss": 1.1098,
+      "step": 1874
+    },
+    {
+      "batch_num_effect_tokens": 4746,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52195,
+      "epoch": 1.70455,
+      "grad_norm": 3.1041576862335205,
+      "learning_rate": 1.3003523301992105e-06,
+      "loss": 0.564,
+      "step": 1875
+    },
+    {
+      "batch_num_effect_tokens": 9597,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.70545,
+      "grad_norm": 2.8251724243164062,
+      "learning_rate": 1.2925392288022299e-06,
+      "loss": 1.197,
+      "step": 1876
+    },
+    {
+      "batch_num_effect_tokens": 4799,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.70636,
+      "grad_norm": 3.5474135875701904,
+      "learning_rate": 1.2847480484273666e-06,
+      "loss": 0.8343,
+      "step": 1877
+    },
+    {
+      "batch_num_effect_tokens": 5678,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.70727,
+      "grad_norm": 3.4982004165649414,
+      "learning_rate": 1.2769788086889135e-06,
+      "loss": 0.9056,
+      "step": 1878
+    },
+    {
+      "batch_num_effect_tokens": 10563,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.70818,
+      "grad_norm": 2.6265292167663574,
+      "learning_rate": 1.269231529145918e-06,
+      "loss": 1.3058,
+      "step": 1879
+    },
+    {
+      "batch_num_effect_tokens": 10088,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 1.70909,
+      "grad_norm": 2.6288015842437744,
+      "learning_rate": 1.2615062293021508e-06,
+      "loss": 1.2069,
+      "step": 1880
+    },
+    {
+      "batch_num_effect_tokens": 6101,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.71,
+      "grad_norm": 3.580078363418579,
+      "learning_rate": 1.2538029286060428e-06,
+      "loss": 1.1826,
+      "step": 1881
+    },
+    {
+      "batch_num_effect_tokens": 6623,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52193,
+      "epoch": 1.71091,
+      "grad_norm": 3.16204571723938,
+      "learning_rate": 1.2461216464506454e-06,
+      "loss": 1.1779,
+      "step": 1882
+    },
+    {
+      "batch_num_effect_tokens": 4556,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 1.71182,
+      "grad_norm": 3.065345287322998,
+      "learning_rate": 1.2384624021735736e-06,
+      "loss": 0.5967,
+      "step": 1883
+    },
+    {
+      "batch_num_effect_tokens": 6305,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50593,
+      "epoch": 1.71273,
+      "grad_norm": 3.218221426010132,
+      "learning_rate": 1.230825215056971e-06,
+      "loss": 0.8445,
+      "step": 1884
+    },
+    {
+      "batch_num_effect_tokens": 4052,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.71364,
+      "grad_norm": 2.8410584926605225,
+      "learning_rate": 1.2232101043274437e-06,
+      "loss": 0.3147,
+      "step": 1885
+    },
+    {
+      "batch_num_effect_tokens": 10167,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52122,
+      "epoch": 1.71455,
+      "grad_norm": 2.5719006061553955,
+      "learning_rate": 1.215617089156026e-06,
+      "loss": 1.2197,
+      "step": 1886
+    },
+    {
+      "batch_num_effect_tokens": 7237,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.71545,
+      "grad_norm": 2.9863481521606445,
+      "learning_rate": 1.208046188658124e-06,
+      "loss": 1.0261,
+      "step": 1887
+    },
+    {
+      "batch_num_effect_tokens": 4322,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.71636,
+      "grad_norm": 3.265608549118042,
+      "learning_rate": 1.2004974218934695e-06,
+      "loss": 0.7267,
+      "step": 1888
+    },
+    {
+      "batch_num_effect_tokens": 5478,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 1.71727,
+      "grad_norm": 4.387427806854248,
+      "learning_rate": 1.192970807866073e-06,
+      "loss": 1.5839,
+      "step": 1889
+    },
+    {
+      "batch_num_effect_tokens": 5198,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.71818,
+      "grad_norm": 3.4425690174102783,
+      "learning_rate": 1.1854663655241804e-06,
+      "loss": 0.7649,
+      "step": 1890
+    },
+    {
+      "batch_num_effect_tokens": 6151,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.71909,
+      "grad_norm": 3.0315802097320557,
+      "learning_rate": 1.177984113760211e-06,
+      "loss": 0.8902,
+      "step": 1891
+    },
+    {
+      "batch_num_effect_tokens": 8876,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.72,
+      "grad_norm": 2.988717555999756,
+      "learning_rate": 1.1705240714107301e-06,
+      "loss": 1.3511,
+      "step": 1892
+    },
+    {
+      "batch_num_effect_tokens": 5401,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.72091,
+      "grad_norm": 4.3993730545043945,
+      "learning_rate": 1.163086257256385e-06,
+      "loss": 1.4819,
+      "step": 1893
+    },
+    {
+      "batch_num_effect_tokens": 6641,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52152,
+      "epoch": 1.72182,
+      "grad_norm": 3.5445523262023926,
+      "learning_rate": 1.1556706900218572e-06,
+      "loss": 1.4186,
+      "step": 1894
+    },
+    {
+      "batch_num_effect_tokens": 7538,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52137,
+      "epoch": 1.72273,
+      "grad_norm": 3.3498003482818604,
+      "learning_rate": 1.1482773883758357e-06,
+      "loss": 1.31,
+      "step": 1895
+    },
+    {
+      "batch_num_effect_tokens": 5370,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.72364,
+      "grad_norm": 4.466821670532227,
+      "learning_rate": 1.1409063709309442e-06,
+      "loss": 1.7006,
+      "step": 1896
+    },
+    {
+      "batch_num_effect_tokens": 6420,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52097,
+      "epoch": 1.72455,
+      "grad_norm": 2.439729928970337,
+      "learning_rate": 1.1335576562437134e-06,
+      "loss": 0.7083,
+      "step": 1897
+    },
+    {
+      "batch_num_effect_tokens": 7274,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52214,
+      "epoch": 1.72545,
+      "grad_norm": 3.1005828380584717,
+      "learning_rate": 1.126231262814521e-06,
+      "loss": 1.153,
+      "step": 1898
+    },
+    {
+      "batch_num_effect_tokens": 5792,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.72636,
+      "grad_norm": 2.4603047370910645,
+      "learning_rate": 1.1189272090875592e-06,
+      "loss": 0.4733,
+      "step": 1899
+    },
+    {
+      "batch_num_effect_tokens": 6616,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.72727,
+      "grad_norm": 3.304178476333618,
+      "learning_rate": 1.1116455134507665e-06,
+      "loss": 1.0372,
+      "step": 1900
+    },
+    {
+      "batch_num_effect_tokens": 4083,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.72818,
+      "grad_norm": 3.7938997745513916,
+      "learning_rate": 1.1043861942358081e-06,
+      "loss": 1.1319,
+      "step": 1901
+    },
+    {
+      "batch_num_effect_tokens": 6721,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52177,
+      "epoch": 1.72909,
+      "grad_norm": 3.1155800819396973,
+      "learning_rate": 1.0971492697180097e-06,
+      "loss": 1.0504,
+      "step": 1902
+    },
+    {
+      "batch_num_effect_tokens": 7175,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52129,
+      "epoch": 1.73,
+      "grad_norm": 2.911799907684326,
+      "learning_rate": 1.0899347581163222e-06,
+      "loss": 1.0896,
+      "step": 1903
+    },
+    {
+      "batch_num_effect_tokens": 7680,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.73091,
+      "grad_norm": 3.1543545722961426,
+      "learning_rate": 1.0827426775932658e-06,
+      "loss": 1.3,
+      "step": 1904
+    },
+    {
+      "batch_num_effect_tokens": 5453,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52131,
+      "epoch": 1.73182,
+      "grad_norm": 3.1990206241607666,
+      "learning_rate": 1.0755730462549008e-06,
+      "loss": 0.9333,
+      "step": 1905
+    },
+    {
+      "batch_num_effect_tokens": 7691,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52122,
+      "epoch": 1.73273,
+      "grad_norm": 2.3991081714630127,
+      "learning_rate": 1.068425882150762e-06,
+      "loss": 0.7612,
+      "step": 1906
+    },
+    {
+      "batch_num_effect_tokens": 6211,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.73364,
+      "grad_norm": 3.288367509841919,
+      "learning_rate": 1.0613012032738268e-06,
+      "loss": 1.0547,
+      "step": 1907
+    },
+    {
+      "batch_num_effect_tokens": 7345,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.73455,
+      "grad_norm": 2.6501219272613525,
+      "learning_rate": 1.054199027560463e-06,
+      "loss": 0.8752,
+      "step": 1908
+    },
+    {
+      "batch_num_effect_tokens": 10309,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52169,
+      "epoch": 1.73545,
+      "grad_norm": 2.221205711364746,
+      "learning_rate": 1.047119372890395e-06,
+      "loss": 0.9218,
+      "step": 1909
+    },
+    {
+      "batch_num_effect_tokens": 6150,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.73636,
+      "grad_norm": 3.7723615169525146,
+      "learning_rate": 1.0400622570866426e-06,
+      "loss": 1.232,
+      "step": 1910
+    },
+    {
+      "batch_num_effect_tokens": 5991,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.73727,
+      "grad_norm": 2.9985649585723877,
+      "learning_rate": 1.033027697915483e-06,
+      "loss": 0.8793,
+      "step": 1911
+    },
+    {
+      "batch_num_effect_tokens": 6396,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.73818,
+      "grad_norm": 3.3636891841888428,
+      "learning_rate": 1.0260157130864178e-06,
+      "loss": 0.8005,
+      "step": 1912
+    },
+    {
+      "batch_num_effect_tokens": 5555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.73909,
+      "grad_norm": 2.8261046409606934,
+      "learning_rate": 1.0190263202521033e-06,
+      "loss": 0.7,
+      "step": 1913
+    },
+    {
+      "batch_num_effect_tokens": 8611,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52174,
+      "epoch": 1.74,
+      "grad_norm": 3.332305669784546,
+      "learning_rate": 1.012059537008332e-06,
+      "loss": 1.4691,
+      "step": 1914
+    },
+    {
+      "batch_num_effect_tokens": 7709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52136,
+      "epoch": 1.74091,
+      "grad_norm": 3.5245392322540283,
+      "learning_rate": 1.0051153808939683e-06,
+      "loss": 1.6294,
+      "step": 1915
+    },
+    {
+      "batch_num_effect_tokens": 8637,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.74182,
+      "grad_norm": 2.706677198410034,
+      "learning_rate": 9.981938693909221e-07,
+      "loss": 1.1122,
+      "step": 1916
+    },
+    {
+      "batch_num_effect_tokens": 4411,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.74273,
+      "grad_norm": 3.1833739280700684,
+      "learning_rate": 9.912950199240867e-07,
+      "loss": 0.8263,
+      "step": 1917
+    },
+    {
+      "batch_num_effect_tokens": 4942,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52193,
+      "epoch": 1.74364,
+      "grad_norm": 3.8086676597595215,
+      "learning_rate": 9.844188498613117e-07,
+      "loss": 0.9263,
+      "step": 1918
+    },
+    {
+      "batch_num_effect_tokens": 6171,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.74455,
+      "grad_norm": 2.9984874725341797,
+      "learning_rate": 9.775653765133398e-07,
+      "loss": 0.6323,
+      "step": 1919
+    },
+    {
+      "batch_num_effect_tokens": 3792,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.74545,
+      "grad_norm": 3.45582914352417,
+      "learning_rate": 9.707346171337895e-07,
+      "loss": 0.5912,
+      "step": 1920
+    },
+    {
+      "batch_num_effect_tokens": 7333,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.74636,
+      "grad_norm": 2.9762768745422363,
+      "learning_rate": 9.63926588919083e-07,
+      "loss": 1.1486,
+      "step": 1921
+    },
+    {
+      "batch_num_effect_tokens": 5750,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.74727,
+      "grad_norm": 2.3640410900115967,
+      "learning_rate": 9.571413090084281e-07,
+      "loss": 0.5453,
+      "step": 1922
+    },
+    {
+      "batch_num_effect_tokens": 7812,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.74818,
+      "grad_norm": 3.1287717819213867,
+      "learning_rate": 9.503787944837562e-07,
+      "loss": 1.2124,
+      "step": 1923
+    },
+    {
+      "batch_num_effect_tokens": 5194,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50593,
+      "epoch": 1.74909,
+      "grad_norm": 3.3572943210601807,
+      "learning_rate": 9.436390623696911e-07,
+      "loss": 0.8045,
+      "step": 1924
+    },
+    {
+      "batch_num_effect_tokens": 4003,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.75,
+      "grad_norm": 2.8662009239196777,
+      "learning_rate": 9.369221296335007e-07,
+      "loss": 0.3116,
+      "step": 1925
+    },
+    {
+      "batch_num_effect_tokens": 8198,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52143,
+      "epoch": 1.75091,
+      "grad_norm": 3.0082473754882812,
+      "learning_rate": 9.302280131850538e-07,
+      "loss": 1.3142,
+      "step": 1926
+    },
+    {
+      "batch_num_effect_tokens": 5362,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.75182,
+      "grad_norm": 3.834455728530884,
+      "learning_rate": 9.235567298767812e-07,
+      "loss": 1.5696,
+      "step": 1927
+    },
+    {
+      "batch_num_effect_tokens": 5656,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.75273,
+      "grad_norm": 3.420131206512451,
+      "learning_rate": 9.16908296503628e-07,
+      "loss": 1.1827,
+      "step": 1928
+    },
+    {
+      "batch_num_effect_tokens": 2948,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52130,
+      "epoch": 1.75364,
+      "grad_norm": 1.886879801750183,
+      "learning_rate": 9.102827298030226e-07,
+      "loss": 0.0661,
+      "step": 1929
+    },
+    {
+      "batch_num_effect_tokens": 6512,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 1.75455,
+      "grad_norm": 3.2286927700042725,
+      "learning_rate": 9.036800464548157e-07,
+      "loss": 0.9319,
+      "step": 1930
+    },
+    {
+      "batch_num_effect_tokens": 6389,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50559,
+      "epoch": 1.75545,
+      "grad_norm": 2.8748347759246826,
+      "learning_rate": 8.97100263081262e-07,
+      "loss": 0.9958,
+      "step": 1931
+    },
+    {
+      "batch_num_effect_tokens": 11793,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.75636,
+      "grad_norm": 2.4396183490753174,
+      "learning_rate": 8.905433962469489e-07,
+      "loss": 1.2373,
+      "step": 1932
+    },
+    {
+      "batch_num_effect_tokens": 6132,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.75727,
+      "grad_norm": 2.9254977703094482,
+      "learning_rate": 8.840094624587892e-07,
+      "loss": 0.9092,
+      "step": 1933
+    },
+    {
+      "batch_num_effect_tokens": 4157,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.75818,
+      "grad_norm": 3.1090028285980225,
+      "learning_rate": 8.774984781659468e-07,
+      "loss": 0.3909,
+      "step": 1934
+    },
+    {
+      "batch_num_effect_tokens": 7081,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.75909,
+      "grad_norm": 2.3378493785858154,
+      "learning_rate": 8.710104597598224e-07,
+      "loss": 0.7473,
+      "step": 1935
+    },
+    {
+      "batch_num_effect_tokens": 5516,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.76,
+      "grad_norm": 3.141493558883667,
+      "learning_rate": 8.645454235739903e-07,
+      "loss": 0.8269,
+      "step": 1936
+    },
+    {
+      "batch_num_effect_tokens": 4122,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.76091,
+      "grad_norm": 3.506274938583374,
+      "learning_rate": 8.581033858841769e-07,
+      "loss": 0.5283,
+      "step": 1937
+    },
+    {
+      "batch_num_effect_tokens": 6230,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52185,
+      "epoch": 1.76182,
+      "grad_norm": 3.2314116954803467,
+      "learning_rate": 8.516843629081983e-07,
+      "loss": 1.0369,
+      "step": 1938
+    },
+    {
+      "batch_num_effect_tokens": 5545,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.76273,
+      "grad_norm": 3.286485433578491,
+      "learning_rate": 8.4528837080594e-07,
+      "loss": 0.9618,
+      "step": 1939
+    },
+    {
+      "batch_num_effect_tokens": 5104,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.76364,
+      "grad_norm": 2.489656686782837,
+      "learning_rate": 8.389154256793042e-07,
+      "loss": 0.6243,
+      "step": 1940
+    },
+    {
+      "batch_num_effect_tokens": 6727,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52215,
+      "epoch": 1.76455,
+      "grad_norm": 2.5669116973876953,
+      "learning_rate": 8.325655435721735e-07,
+      "loss": 0.8501,
+      "step": 1941
+    },
+    {
+      "batch_num_effect_tokens": 5069,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52156,
+      "epoch": 1.76545,
+      "grad_norm": 2.9081106185913086,
+      "learning_rate": 8.262387404703654e-07,
+      "loss": 0.5421,
+      "step": 1942
+    },
+    {
+      "batch_num_effect_tokens": 7477,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52212,
+      "epoch": 1.76636,
+      "grad_norm": 5.922401428222656,
+      "learning_rate": 8.199350323016042e-07,
+      "loss": 1.4223,
+      "step": 1943
+    },
+    {
+      "batch_num_effect_tokens": 6118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.76727,
+      "grad_norm": 3.127690076828003,
+      "learning_rate": 8.136544349354669e-07,
+      "loss": 0.9317,
+      "step": 1944
+    },
+    {
+      "batch_num_effect_tokens": 4070,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.76818,
+      "grad_norm": 3.2388033866882324,
+      "learning_rate": 8.073969641833446e-07,
+      "loss": 0.5806,
+      "step": 1945
+    },
+    {
+      "batch_num_effect_tokens": 6078,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.76909,
+      "grad_norm": 3.322255849838257,
+      "learning_rate": 8.011626357984182e-07,
+      "loss": 0.9383,
+      "step": 1946
+    },
+    {
+      "batch_num_effect_tokens": 8281,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.77,
+      "grad_norm": 3.274935245513916,
+      "learning_rate": 7.949514654755963e-07,
+      "loss": 1.4529,
+      "step": 1947
+    },
+    {
+      "batch_num_effect_tokens": 5857,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.77091,
+      "grad_norm": 3.174466848373413,
+      "learning_rate": 7.887634688515e-07,
+      "loss": 0.936,
+      "step": 1948
+    },
+    {
+      "batch_num_effect_tokens": 7239,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 1.77182,
+      "grad_norm": 3.1621530055999756,
+      "learning_rate": 7.825986615043967e-07,
+      "loss": 1.1346,
+      "step": 1949
+    },
+    {
+      "batch_num_effect_tokens": 8131,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.77273,
+      "grad_norm": 3.23815655708313,
+      "learning_rate": 7.764570589541876e-07,
+      "loss": 1.3067,
+      "step": 1950
+    },
+    {
+      "batch_num_effect_tokens": 7187,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.77364,
+      "grad_norm": 2.7597808837890625,
+      "learning_rate": 7.703386766623444e-07,
+      "loss": 0.7689,
+      "step": 1951
+    },
+    {
+      "batch_num_effect_tokens": 6545,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.77455,
+      "grad_norm": 3.302802324295044,
+      "learning_rate": 7.642435300318906e-07,
+      "loss": 1.2263,
+      "step": 1952
+    },
+    {
+      "batch_num_effect_tokens": 8419,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.77545,
+      "grad_norm": 2.663577079772949,
+      "learning_rate": 7.581716344073476e-07,
+      "loss": 0.9514,
+      "step": 1953
+    },
+    {
+      "batch_num_effect_tokens": 10600,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.77636,
+      "grad_norm": 2.5999982357025146,
+      "learning_rate": 7.521230050747086e-07,
+      "loss": 1.3394,
+      "step": 1954
+    },
+    {
+      "batch_num_effect_tokens": 7929,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52220,
+      "epoch": 1.77727,
+      "grad_norm": 3.399446487426758,
+      "learning_rate": 7.460976572613888e-07,
+      "loss": 1.2472,
+      "step": 1955
+    },
+    {
+      "batch_num_effect_tokens": 5821,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52125,
+      "epoch": 1.77818,
+      "grad_norm": 2.714089870452881,
+      "learning_rate": 7.400956061361975e-07,
+      "loss": 0.6145,
+      "step": 1956
+    },
+    {
+      "batch_num_effect_tokens": 5670,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 1.77909,
+      "grad_norm": 3.530482530593872,
+      "learning_rate": 7.341168668092857e-07,
+      "loss": 0.8642,
+      "step": 1957
+    },
+    {
+      "batch_num_effect_tokens": 5753,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50567,
+      "epoch": 1.78,
+      "grad_norm": 2.5470070838928223,
+      "learning_rate": 7.281614543321269e-07,
+      "loss": 0.5887,
+      "step": 1958
+    },
+    {
+      "batch_num_effect_tokens": 6883,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.78091,
+      "grad_norm": 2.87015438079834,
+      "learning_rate": 7.222293836974614e-07,
+      "loss": 0.8979,
+      "step": 1959
+    },
+    {
+      "batch_num_effect_tokens": 6880,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52138,
+      "epoch": 1.78182,
+      "grad_norm": 3.753164291381836,
+      "learning_rate": 7.163206698392744e-07,
+      "loss": 1.5885,
+      "step": 1960
+    },
+    {
+      "batch_num_effect_tokens": 6697,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.78273,
+      "grad_norm": 3.28859281539917,
+      "learning_rate": 7.104353276327414e-07,
+      "loss": 1.1016,
+      "step": 1961
+    },
+    {
+      "batch_num_effect_tokens": 6404,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.78364,
+      "grad_norm": 3.4291625022888184,
+      "learning_rate": 7.045733718942094e-07,
+      "loss": 1.1827,
+      "step": 1962
+    },
+    {
+      "batch_num_effect_tokens": 8762,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.78455,
+      "grad_norm": 2.9575815200805664,
+      "learning_rate": 6.987348173811415e-07,
+      "loss": 1.2368,
+      "step": 1963
+    },
+    {
+      "batch_num_effect_tokens": 9222,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 1.78545,
+      "grad_norm": 2.867325782775879,
+      "learning_rate": 6.9291967879209e-07,
+      "loss": 1.1963,
+      "step": 1964
+    },
+    {
+      "batch_num_effect_tokens": 6282,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.78636,
+      "grad_norm": 2.8609049320220947,
+      "learning_rate": 6.871279707666634e-07,
+      "loss": 0.8053,
+      "step": 1965
+    },
+    {
+      "batch_num_effect_tokens": 6003,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50555,
+      "epoch": 1.78727,
+      "grad_norm": 3.0638647079467773,
+      "learning_rate": 6.813597078854772e-07,
+      "loss": 0.8633,
+      "step": 1966
+    },
+    {
+      "batch_num_effect_tokens": 4843,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.78818,
+      "grad_norm": 3.135483741760254,
+      "learning_rate": 6.756149046701277e-07,
+      "loss": 0.7593,
+      "step": 1967
+    },
+    {
+      "batch_num_effect_tokens": 6118,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.78909,
+      "grad_norm": 2.4894979000091553,
+      "learning_rate": 6.698935755831493e-07,
+      "loss": 0.5755,
+      "step": 1968
+    },
+    {
+      "batch_num_effect_tokens": 7576,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.79,
+      "grad_norm": 2.775993585586548,
+      "learning_rate": 6.641957350279838e-07,
+      "loss": 0.8734,
+      "step": 1969
+    },
+    {
+      "batch_num_effect_tokens": 7291,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.79091,
+      "grad_norm": 3.0943503379821777,
+      "learning_rate": 6.585213973489335e-07,
+      "loss": 1.1922,
+      "step": 1970
+    },
+    {
+      "batch_num_effect_tokens": 6771,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52081,
+      "epoch": 1.79182,
+      "grad_norm": 2.703305959701538,
+      "learning_rate": 6.528705768311395e-07,
+      "loss": 0.651,
+      "step": 1971
+    },
+    {
+      "batch_num_effect_tokens": 6261,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.79273,
+      "grad_norm": 3.735302209854126,
+      "learning_rate": 6.472432877005341e-07,
+      "loss": 1.5032,
+      "step": 1972
+    },
+    {
+      "batch_num_effect_tokens": 5538,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.79364,
+      "grad_norm": 2.7473983764648438,
+      "learning_rate": 6.416395441238143e-07,
+      "loss": 0.7273,
+      "step": 1973
+    },
+    {
+      "batch_num_effect_tokens": 7363,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.79455,
+      "grad_norm": 2.6810669898986816,
+      "learning_rate": 6.360593602083942e-07,
+      "loss": 0.942,
+      "step": 1974
+    },
+    {
+      "batch_num_effect_tokens": 8806,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 1.79545,
+      "grad_norm": 3.0994985103607178,
+      "learning_rate": 6.305027500023841e-07,
+      "loss": 1.3185,
+      "step": 1975
+    },
+    {
+      "batch_num_effect_tokens": 6187,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.79636,
+      "grad_norm": 3.1788151264190674,
+      "learning_rate": 6.249697274945377e-07,
+      "loss": 0.8476,
+      "step": 1976
+    },
+    {
+      "batch_num_effect_tokens": 5772,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.79727,
+      "grad_norm": 3.620708703994751,
+      "learning_rate": 6.19460306614238e-07,
+      "loss": 1.2019,
+      "step": 1977
+    },
+    {
+      "batch_num_effect_tokens": 6709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.79818,
+      "grad_norm": 2.2427990436553955,
+      "learning_rate": 6.139745012314424e-07,
+      "loss": 0.5892,
+      "step": 1978
+    },
+    {
+      "batch_num_effect_tokens": 5345,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.79909,
+      "grad_norm": 2.9493868350982666,
+      "learning_rate": 6.085123251566616e-07,
+      "loss": 0.8059,
+      "step": 1979
+    },
+    {
+      "batch_num_effect_tokens": 7749,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.8,
+      "grad_norm": 3.1915297508239746,
+      "learning_rate": 6.030737921409169e-07,
+      "loss": 1.394,
+      "step": 1980
+    },
+    {
+      "batch_num_effect_tokens": 5492,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.80091,
+      "grad_norm": 2.7314164638519287,
+      "learning_rate": 5.976589158757074e-07,
+      "loss": 0.4688,
+      "step": 1981
+    },
+    {
+      "batch_num_effect_tokens": 5450,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.80182,
+      "grad_norm": 3.20293927192688,
+      "learning_rate": 5.922677099929785e-07,
+      "loss": 0.8613,
+      "step": 1982
+    },
+    {
+      "batch_num_effect_tokens": 7602,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.80273,
+      "grad_norm": 3.1139471530914307,
+      "learning_rate": 5.869001880650826e-07,
+      "loss": 1.2699,
+      "step": 1983
+    },
+    {
+      "batch_num_effect_tokens": 4704,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.80364,
+      "grad_norm": 3.194953680038452,
+      "learning_rate": 5.815563636047539e-07,
+      "loss": 0.6933,
+      "step": 1984
+    },
+    {
+      "batch_num_effect_tokens": 7539,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52216,
+      "epoch": 1.80455,
+      "grad_norm": 3.176323413848877,
+      "learning_rate": 5.762362500650598e-07,
+      "loss": 1.1949,
+      "step": 1985
+    },
+    {
+      "batch_num_effect_tokens": 7143,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.80545,
+      "grad_norm": 3.151172637939453,
+      "learning_rate": 5.709398608393835e-07,
+      "loss": 1.1113,
+      "step": 1986
+    },
+    {
+      "batch_num_effect_tokens": 8160,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.80636,
+      "grad_norm": 1.761962652206421,
+      "learning_rate": 5.656672092613757e-07,
+      "loss": 0.3349,
+      "step": 1987
+    },
+    {
+      "batch_num_effect_tokens": 7576,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.80727,
+      "grad_norm": 2.854128837585449,
+      "learning_rate": 5.604183086049342e-07,
+      "loss": 0.9761,
+      "step": 1988
+    },
+    {
+      "batch_num_effect_tokens": 8291,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52158,
+      "epoch": 1.80818,
+      "grad_norm": 3.2257771492004395,
+      "learning_rate": 5.551931720841541e-07,
+      "loss": 1.2465,
+      "step": 1989
+    },
+    {
+      "batch_num_effect_tokens": 5120,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52167,
+      "epoch": 1.80909,
+      "grad_norm": 3.463531732559204,
+      "learning_rate": 5.499918128533155e-07,
+      "loss": 0.8566,
+      "step": 1990
+    },
+    {
+      "batch_num_effect_tokens": 5706,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52111,
+      "epoch": 1.81,
+      "grad_norm": 3.1953766345977783,
+      "learning_rate": 5.448142440068316e-07,
+      "loss": 0.9884,
+      "step": 1991
+    },
+    {
+      "batch_num_effect_tokens": 5990,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.81091,
+      "grad_norm": 3.6692111492156982,
+      "learning_rate": 5.396604785792281e-07,
+      "loss": 0.9714,
+      "step": 1992
+    },
+    {
+      "batch_num_effect_tokens": 7945,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.81182,
+      "grad_norm": 3.071803092956543,
+      "learning_rate": 5.345305295450997e-07,
+      "loss": 1.2217,
+      "step": 1993
+    },
+    {
+      "batch_num_effect_tokens": 5578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.81273,
+      "grad_norm": 3.0185506343841553,
+      "learning_rate": 5.294244098190926e-07,
+      "loss": 0.8658,
+      "step": 1994
+    },
+    {
+      "batch_num_effect_tokens": 3962,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.81364,
+      "grad_norm": 3.220330238342285,
+      "learning_rate": 5.243421322558506e-07,
+      "loss": 0.619,
+      "step": 1995
+    },
+    {
+      "batch_num_effect_tokens": 11920,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52199,
+      "epoch": 1.81455,
+      "grad_norm": 3.5391247272491455,
+      "learning_rate": 5.192837096500058e-07,
+      "loss": 2.2227,
+      "step": 1996
+    },
+    {
+      "batch_num_effect_tokens": 4304,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.81545,
+      "grad_norm": 3.778665065765381,
+      "learning_rate": 5.142491547361294e-07,
+      "loss": 0.7287,
+      "step": 1997
+    },
+    {
+      "batch_num_effect_tokens": 8847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.81636,
+      "grad_norm": 2.801353931427002,
+      "learning_rate": 5.092384801887074e-07,
+      "loss": 1.2618,
+      "step": 1998
+    },
+    {
+      "batch_num_effect_tokens": 5465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.81727,
+      "grad_norm": 4.023580551147461,
+      "learning_rate": 5.04251698622108e-07,
+      "loss": 0.5592,
+      "step": 1999
+    },
+    {
+      "batch_num_effect_tokens": 7014,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52186,
+      "epoch": 1.81818,
+      "grad_norm": 2.3245139122009277,
+      "learning_rate": 4.992888225905467e-07,
+      "loss": 0.5798,
+      "step": 2000
+    },
+    {
+      "batch_num_effect_tokens": 7799,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.81909,
+      "grad_norm": 2.6953628063201904,
+      "learning_rate": 4.943498645880595e-07,
+      "loss": 1.0382,
+      "step": 2001
+    },
+    {
+      "batch_num_effect_tokens": 6712,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.82,
+      "grad_norm": 3.313474416732788,
+      "learning_rate": 4.894348370484648e-07,
+      "loss": 1.1191,
+      "step": 2002
+    },
+    {
+      "batch_num_effect_tokens": 4090,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.82091,
+      "grad_norm": 3.861618995666504,
+      "learning_rate": 4.845437523453411e-07,
+      "loss": 1.0173,
+      "step": 2003
+    },
+    {
+      "batch_num_effect_tokens": 6621,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.82182,
+      "grad_norm": 3.3845365047454834,
+      "learning_rate": 4.796766227919858e-07,
+      "loss": 1.0425,
+      "step": 2004
+    },
+    {
+      "batch_num_effect_tokens": 7026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.82273,
+      "grad_norm": 3.0628647804260254,
+      "learning_rate": 4.7483346064139513e-07,
+      "loss": 1.047,
+      "step": 2005
+    },
+    {
+      "batch_num_effect_tokens": 7244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.82364,
+      "grad_norm": 4.020042419433594,
+      "learning_rate": 4.7001427808622045e-07,
+      "loss": 1.7565,
+      "step": 2006
+    },
+    {
+      "batch_num_effect_tokens": 6116,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 1.82455,
+      "grad_norm": 5.533095836639404,
+      "learning_rate": 4.6521908725875253e-07,
+      "loss": 0.5821,
+      "step": 2007
+    },
+    {
+      "batch_num_effect_tokens": 6080,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.82545,
+      "grad_norm": 3.1684529781341553,
+      "learning_rate": 4.6044790023087373e-07,
+      "loss": 0.8718,
+      "step": 2008
+    },
+    {
+      "batch_num_effect_tokens": 8107,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 1.82636,
+      "grad_norm": 2.843222141265869,
+      "learning_rate": 4.5570072901404474e-07,
+      "loss": 1.1437,
+      "step": 2009
+    },
+    {
+      "batch_num_effect_tokens": 7416,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.82727,
+      "grad_norm": 3.33341383934021,
+      "learning_rate": 4.509775855592613e-07,
+      "loss": 1.1667,
+      "step": 2010
+    },
+    {
+      "batch_num_effect_tokens": 6191,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.82818,
+      "grad_norm": 2.8735108375549316,
+      "learning_rate": 4.4627848175703315e-07,
+      "loss": 0.7308,
+      "step": 2011
+    },
+    {
+      "batch_num_effect_tokens": 6283,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52121,
+      "epoch": 1.82909,
+      "grad_norm": 3.083097457885742,
+      "learning_rate": 4.4160342943734723e-07,
+      "loss": 0.8518,
+      "step": 2012
+    },
+    {
+      "batch_num_effect_tokens": 7764,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.83,
+      "grad_norm": 4.017691135406494,
+      "learning_rate": 4.3695244036964567e-07,
+      "loss": 2.1002,
+      "step": 2013
+    },
+    {
+      "batch_num_effect_tokens": 6987,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.83091,
+      "grad_norm": 3.9710652828216553,
+      "learning_rate": 4.323255262627846e-07,
+      "loss": 1.9213,
+      "step": 2014
+    },
+    {
+      "batch_num_effect_tokens": 8673,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.83182,
+      "grad_norm": 2.6227548122406006,
+      "learning_rate": 4.277226987650129e-07,
+      "loss": 1.0801,
+      "step": 2015
+    },
+    {
+      "batch_num_effect_tokens": 7711,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52190,
+      "epoch": 1.83273,
+      "grad_norm": 3.010826826095581,
+      "learning_rate": 4.2314396946394833e-07,
+      "loss": 1.2035,
+      "step": 2016
+    },
+    {
+      "batch_num_effect_tokens": 5578,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.83364,
+      "grad_norm": 3.5875613689422607,
+      "learning_rate": 4.1858934988653233e-07,
+      "loss": 1.0575,
+      "step": 2017
+    },
+    {
+      "batch_num_effect_tokens": 5538,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52168,
+      "epoch": 1.83455,
+      "grad_norm": 2.5732593536376953,
+      "learning_rate": 4.1405885149901623e-07,
+      "loss": 0.4825,
+      "step": 2018
+    },
+    {
+      "batch_num_effect_tokens": 6915,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.83545,
+      "grad_norm": 3.549899101257324,
+      "learning_rate": 4.095524857069244e-07,
+      "loss": 1.4148,
+      "step": 2019
+    },
+    {
+      "batch_num_effect_tokens": 6640,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.83636,
+      "grad_norm": 3.262498140335083,
+      "learning_rate": 4.0507026385502747e-07,
+      "loss": 1.0891,
+      "step": 2020
+    },
+    {
+      "batch_num_effect_tokens": 8297,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.83727,
+      "grad_norm": 3.292078733444214,
+      "learning_rate": 4.0061219722731136e-07,
+      "loss": 1.6603,
+      "step": 2021
+    },
+    {
+      "batch_num_effect_tokens": 7026,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.83818,
+      "grad_norm": 3.5750980377197266,
+      "learning_rate": 3.9617829704695634e-07,
+      "loss": 1.5395,
+      "step": 2022
+    },
+    {
+      "batch_num_effect_tokens": 7326,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52190,
+      "epoch": 1.83909,
+      "grad_norm": 3.339860200881958,
+      "learning_rate": 3.917685744762989e-07,
+      "loss": 1.1482,
+      "step": 2023
+    },
+    {
+      "batch_num_effect_tokens": 6495,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.84,
+      "grad_norm": 3.0043394565582275,
+      "learning_rate": 3.8738304061681107e-07,
+      "loss": 0.9975,
+      "step": 2024
+    },
+    {
+      "batch_num_effect_tokens": 7548,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.84091,
+      "grad_norm": 2.484179973602295,
+      "learning_rate": 3.8302170650907023e-07,
+      "loss": 0.8843,
+      "step": 2025
+    },
+    {
+      "batch_num_effect_tokens": 4985,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.84182,
+      "grad_norm": 3.724242687225342,
+      "learning_rate": 3.7868458313272906e-07,
+      "loss": 1.0107,
+      "step": 2026
+    },
+    {
+      "batch_num_effect_tokens": 4613,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.84273,
+      "grad_norm": 2.1949408054351807,
+      "learning_rate": 3.7437168140648904e-07,
+      "loss": 0.2,
+      "step": 2027
+    },
+    {
+      "batch_num_effect_tokens": 6506,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 1.84364,
+      "grad_norm": 3.2145578861236572,
+      "learning_rate": 3.7008301218807716e-07,
+      "loss": 0.9436,
+      "step": 2028
+    },
+    {
+      "batch_num_effect_tokens": 7112,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52145,
+      "epoch": 1.84455,
+      "grad_norm": 3.019531488418579,
+      "learning_rate": 3.658185862742103e-07,
+      "loss": 1.2709,
+      "step": 2029
+    },
+    {
+      "batch_num_effect_tokens": 9174,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50580,
+      "epoch": 1.84545,
+      "grad_norm": 3.0364532470703125,
+      "learning_rate": 3.615784144005796e-07,
+      "loss": 1.6047,
+      "step": 2030
+    },
+    {
+      "batch_num_effect_tokens": 7268,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.84636,
+      "grad_norm": 3.2398998737335205,
+      "learning_rate": 3.5736250724180965e-07,
+      "loss": 1.1237,
+      "step": 2031
+    },
+    {
+      "batch_num_effect_tokens": 2904,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52149,
+      "epoch": 1.84727,
+      "grad_norm": 2.7733280658721924,
+      "learning_rate": 3.531708754114438e-07,
+      "loss": 0.1986,
+      "step": 2032
+    },
+    {
+      "batch_num_effect_tokens": 6374,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52167,
+      "epoch": 1.84818,
+      "grad_norm": 3.345583200454712,
+      "learning_rate": 3.490035294619087e-07,
+      "loss": 1.0036,
+      "step": 2033
+    },
+    {
+      "batch_num_effect_tokens": 5774,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.84909,
+      "grad_norm": 3.2348711490631104,
+      "learning_rate": 3.448604798844912e-07,
+      "loss": 0.9395,
+      "step": 2034
+    },
+    {
+      "batch_num_effect_tokens": 9591,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.85,
+      "grad_norm": 2.7883570194244385,
+      "learning_rate": 3.4074173710931804e-07,
+      "loss": 1.3181,
+      "step": 2035
+    },
+    {
+      "batch_num_effect_tokens": 8987,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.85091,
+      "grad_norm": 3.0228030681610107,
+      "learning_rate": 3.3664731150531484e-07,
+      "loss": 1.3226,
+      "step": 2036
+    },
+    {
+      "batch_num_effect_tokens": 5526,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.85182,
+      "grad_norm": 3.1885998249053955,
+      "learning_rate": 3.3257721338019633e-07,
+      "loss": 0.7501,
+      "step": 2037
+    },
+    {
+      "batch_num_effect_tokens": 6166,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52198,
+      "epoch": 1.85273,
+      "grad_norm": 3.2657663822174072,
+      "learning_rate": 3.2853145298042954e-07,
+      "loss": 1.1105,
+      "step": 2038
+    },
+    {
+      "batch_num_effect_tokens": 5526,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.85364,
+      "grad_norm": 2.836792230606079,
+      "learning_rate": 3.2451004049120936e-07,
+      "loss": 0.7755,
+      "step": 2039
+    },
+    {
+      "batch_num_effect_tokens": 6221,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 1.85455,
+      "grad_norm": 3.320397138595581,
+      "learning_rate": 3.2051298603643754e-07,
+      "loss": 0.7057,
+      "step": 2040
+    },
+    {
+      "batch_num_effect_tokens": 5113,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.85545,
+      "grad_norm": 3.700753927230835,
+      "learning_rate": 3.165402996786948e-07,
+      "loss": 1.3216,
+      "step": 2041
+    },
+    {
+      "batch_num_effect_tokens": 8337,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.85636,
+      "grad_norm": 3.2562508583068848,
+      "learning_rate": 3.125919914192144e-07,
+      "loss": 1.5416,
+      "step": 2042
+    },
+    {
+      "batch_num_effect_tokens": 7714,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52124,
+      "epoch": 1.85727,
+      "grad_norm": 3.239128351211548,
+      "learning_rate": 3.086680711978574e-07,
+      "loss": 1.2409,
+      "step": 2043
+    },
+    {
+      "batch_num_effect_tokens": 4489,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52163,
+      "epoch": 1.85818,
+      "grad_norm": 4.076598644256592,
+      "learning_rate": 3.0476854889308737e-07,
+      "loss": 0.9881,
+      "step": 2044
+    },
+    {
+      "batch_num_effect_tokens": 7077,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52194,
+      "epoch": 1.85909,
+      "grad_norm": 2.9148364067077637,
+      "learning_rate": 3.008934343219483e-07,
+      "loss": 0.8965,
+      "step": 2045
+    },
+    {
+      "batch_num_effect_tokens": 6093,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.86,
+      "grad_norm": 2.8413338661193848,
+      "learning_rate": 2.970427372400353e-07,
+      "loss": 0.889,
+      "step": 2046
+    },
+    {
+      "batch_num_effect_tokens": 6947,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.86091,
+      "grad_norm": 3.3784499168395996,
+      "learning_rate": 2.93216467341475e-07,
+      "loss": 1.262,
+      "step": 2047
+    },
+    {
+      "batch_num_effect_tokens": 10117,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.86182,
+      "grad_norm": 2.520775318145752,
+      "learning_rate": 2.894146342588977e-07,
+      "loss": 1.0522,
+      "step": 2048
+    },
+    {
+      "batch_num_effect_tokens": 4035,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52201,
+      "epoch": 1.86273,
+      "grad_norm": 4.631101608276367,
+      "learning_rate": 2.856372475634106e-07,
+      "loss": 0.5698,
+      "step": 2049
+    },
+    {
+      "batch_num_effect_tokens": 5732,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.86364,
+      "grad_norm": 2.630826234817505,
+      "learning_rate": 2.818843167645835e-07,
+      "loss": 0.549,
+      "step": 2050
+    },
+    {
+      "batch_num_effect_tokens": 6049,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.86455,
+      "grad_norm": 3.310131072998047,
+      "learning_rate": 2.781558513104143e-07,
+      "loss": 1.0531,
+      "step": 2051
+    },
+    {
+      "batch_num_effect_tokens": 5768,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.86545,
+      "grad_norm": 3.0105044841766357,
+      "learning_rate": 2.744518605873092e-07,
+      "loss": 0.8083,
+      "step": 2052
+    },
+    {
+      "batch_num_effect_tokens": 6497,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 1.86636,
+      "grad_norm": 3.7525155544281006,
+      "learning_rate": 2.707723539200613e-07,
+      "loss": 1.6784,
+      "step": 2053
+    },
+    {
+      "batch_num_effect_tokens": 6647,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52175,
+      "epoch": 1.86727,
+      "grad_norm": 3.0726425647735596,
+      "learning_rate": 2.6711734057182417e-07,
+      "loss": 1.0663,
+      "step": 2054
+    },
+    {
+      "batch_num_effect_tokens": 6001,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.86818,
+      "grad_norm": 2.7889184951782227,
+      "learning_rate": 2.6348682974408956e-07,
+      "loss": 0.7351,
+      "step": 2055
+    },
+    {
+      "batch_num_effect_tokens": 6088,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.86909,
+      "grad_norm": 3.8500564098358154,
+      "learning_rate": 2.5988083057666534e-07,
+      "loss": 1.4846,
+      "step": 2056
+    },
+    {
+      "batch_num_effect_tokens": 5847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52112,
+      "epoch": 1.87,
+      "grad_norm": 3.5931131839752197,
+      "learning_rate": 2.5629935214764866e-07,
+      "loss": 1.168,
+      "step": 2057
+    },
+    {
+      "batch_num_effect_tokens": 6144,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.87091,
+      "grad_norm": 3.006503105163574,
+      "learning_rate": 2.527424034734072e-07,
+      "loss": 0.9912,
+      "step": 2058
+    },
+    {
+      "batch_num_effect_tokens": 9794,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.87182,
+      "grad_norm": 2.374946355819702,
+      "learning_rate": 2.492099935085546e-07,
+      "loss": 1.0785,
+      "step": 2059
+    },
+    {
+      "batch_num_effect_tokens": 9290,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.87273,
+      "grad_norm": 2.640920400619507,
+      "learning_rate": 2.4570213114592957e-07,
+      "loss": 1.1373,
+      "step": 2060
+    },
+    {
+      "batch_num_effect_tokens": 6709,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.87364,
+      "grad_norm": 3.0625946521759033,
+      "learning_rate": 2.422188252165714e-07,
+      "loss": 0.8988,
+      "step": 2061
+    },
+    {
+      "batch_num_effect_tokens": 4172,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52121,
+      "epoch": 1.87455,
+      "grad_norm": 3.482499837875366,
+      "learning_rate": 2.387600844896998e-07,
+      "loss": 0.6744,
+      "step": 2062
+    },
+    {
+      "batch_num_effect_tokens": 7208,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.87545,
+      "grad_norm": 3.2273452281951904,
+      "learning_rate": 2.3532591767268854e-07,
+      "loss": 1.2344,
+      "step": 2063
+    },
+    {
+      "batch_num_effect_tokens": 3818,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52125,
+      "epoch": 1.87636,
+      "grad_norm": 2.9675824642181396,
+      "learning_rate": 2.3191633341104859e-07,
+      "loss": 0.4268,
+      "step": 2064
+    },
+    {
+      "batch_num_effect_tokens": 8259,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.87727,
+      "grad_norm": 2.812716245651245,
+      "learning_rate": 2.2853134028840594e-07,
+      "loss": 1.0256,
+      "step": 2065
+    },
+    {
+      "batch_num_effect_tokens": 8465,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52168,
+      "epoch": 1.87818,
+      "grad_norm": 2.420287609100342,
+      "learning_rate": 2.25170946826474e-07,
+      "loss": 0.9995,
+      "step": 2066
+    },
+    {
+      "batch_num_effect_tokens": 7208,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.87909,
+      "grad_norm": 3.029155969619751,
+      "learning_rate": 2.2183516148504225e-07,
+      "loss": 0.9697,
+      "step": 2067
+    },
+    {
+      "batch_num_effect_tokens": 5775,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.88,
+      "grad_norm": 3.367401123046875,
+      "learning_rate": 2.1852399266194312e-07,
+      "loss": 1.0406,
+      "step": 2068
+    },
+    {
+      "batch_num_effect_tokens": 7011,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 1.88091,
+      "grad_norm": 3.2168996334075928,
+      "learning_rate": 2.152374486930442e-07,
+      "loss": 1.1897,
+      "step": 2069
+    },
+    {
+      "batch_num_effect_tokens": 6453,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.88182,
+      "grad_norm": 3.0506627559661865,
+      "learning_rate": 2.119755378522137e-07,
+      "loss": 1.0789,
+      "step": 2070
+    },
+    {
+      "batch_num_effect_tokens": 6546,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.88273,
+      "grad_norm": 2.782705783843994,
+      "learning_rate": 2.0873826835130728e-07,
+      "loss": 0.6964,
+      "step": 2071
+    },
+    {
+      "batch_num_effect_tokens": 8762,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.88364,
+      "grad_norm": 2.063823938369751,
+      "learning_rate": 2.0552564834014797e-07,
+      "loss": 0.5552,
+      "step": 2072
+    },
+    {
+      "batch_num_effect_tokens": 4420,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52195,
+      "epoch": 1.88455,
+      "grad_norm": 3.2488834857940674,
+      "learning_rate": 2.0233768590650405e-07,
+      "loss": 0.5407,
+      "step": 2073
+    },
+    {
+      "batch_num_effect_tokens": 6417,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.88545,
+      "grad_norm": 2.4004364013671875,
+      "learning_rate": 1.9917438907606556e-07,
+      "loss": 0.6243,
+      "step": 2074
+    },
+    {
+      "batch_num_effect_tokens": 6864,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52165,
+      "epoch": 1.88636,
+      "grad_norm": 3.1030189990997314,
+      "learning_rate": 1.960357658124301e-07,
+      "loss": 0.938,
+      "step": 2075
+    },
+    {
+      "batch_num_effect_tokens": 5456,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52219,
+      "epoch": 1.88727,
+      "grad_norm": 3.3235042095184326,
+      "learning_rate": 1.9292182401707603e-07,
+      "loss": 0.8727,
+      "step": 2076
+    },
+    {
+      "batch_num_effect_tokens": 7481,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.88818,
+      "grad_norm": 3.2268526554107666,
+      "learning_rate": 1.898325715293503e-07,
+      "loss": 1.1493,
+      "step": 2077
+    },
+    {
+      "batch_num_effect_tokens": 5946,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.88909,
+      "grad_norm": 3.3354926109313965,
+      "learning_rate": 1.8676801612643957e-07,
+      "loss": 1.0522,
+      "step": 2078
+    },
+    {
+      "batch_num_effect_tokens": 5537,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.89,
+      "grad_norm": 3.2507591247558594,
+      "learning_rate": 1.8372816552336025e-07,
+      "loss": 1.0263,
+      "step": 2079
+    },
+    {
+      "batch_num_effect_tokens": 8816,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 1.89091,
+      "grad_norm": 3.3479599952697754,
+      "learning_rate": 1.8071302737293294e-07,
+      "loss": 1.6094,
+      "step": 2080
+    },
+    {
+      "batch_num_effect_tokens": 3693,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52147,
+      "epoch": 1.89182,
+      "grad_norm": 3.6269097328186035,
+      "learning_rate": 1.7772260926576357e-07,
+      "loss": 0.8417,
+      "step": 2081
+    },
+    {
+      "batch_num_effect_tokens": 6569,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 1.89273,
+      "grad_norm": 3.275144338607788,
+      "learning_rate": 1.747569187302267e-07,
+      "loss": 1.0675,
+      "step": 2082
+    },
+    {
+      "batch_num_effect_tokens": 5288,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52196,
+      "epoch": 1.89364,
+      "grad_norm": 3.7092061042785645,
+      "learning_rate": 1.7181596323244453e-07,
+      "loss": 1.0236,
+      "step": 2083
+    },
+    {
+      "batch_num_effect_tokens": 4837,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52123,
+      "epoch": 1.89455,
+      "grad_norm": 3.253692865371704,
+      "learning_rate": 1.6889975017626902e-07,
+      "loss": 0.5135,
+      "step": 2084
+    },
+    {
+      "batch_num_effect_tokens": 4890,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.89545,
+      "grad_norm": 3.156859874725342,
+      "learning_rate": 1.6600828690326087e-07,
+      "loss": 0.7532,
+      "step": 2085
+    },
+    {
+      "batch_num_effect_tokens": 5073,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.89636,
+      "grad_norm": 2.6373441219329834,
+      "learning_rate": 1.631415806926795e-07,
+      "loss": 0.5579,
+      "step": 2086
+    },
+    {
+      "batch_num_effect_tokens": 7704,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52221,
+      "epoch": 1.89727,
+      "grad_norm": 2.8775365352630615,
+      "learning_rate": 1.6029963876145084e-07,
+      "loss": 1.0058,
+      "step": 2087
+    },
+    {
+      "batch_num_effect_tokens": 6338,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.89818,
+      "grad_norm": 2.6224660873413086,
+      "learning_rate": 1.574824682641629e-07,
+      "loss": 0.7892,
+      "step": 2088
+    },
+    {
+      "batch_num_effect_tokens": 7038,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.89909,
+      "grad_norm": 4.682471752166748,
+      "learning_rate": 1.5469007629303812e-07,
+      "loss": 2.0439,
+      "step": 2089
+    },
+    {
+      "batch_num_effect_tokens": 5197,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.9,
+      "grad_norm": 3.5335943698883057,
+      "learning_rate": 1.519224698779198e-07,
+      "loss": 1.2065,
+      "step": 2090
+    },
+    {
+      "batch_num_effect_tokens": 8009,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.90091,
+      "grad_norm": 2.6326913833618164,
+      "learning_rate": 1.4917965598625351e-07,
+      "loss": 0.8632,
+      "step": 2091
+    },
+    {
+      "batch_num_effect_tokens": 6759,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52171,
+      "epoch": 1.90182,
+      "grad_norm": 2.8118245601654053,
+      "learning_rate": 1.464616415230702e-07,
+      "loss": 0.9084,
+      "step": 2092
+    },
+    {
+      "batch_num_effect_tokens": 4443,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.90273,
+      "grad_norm": 2.5267601013183594,
+      "learning_rate": 1.4376843333096746e-07,
+      "loss": 0.3668,
+      "step": 2093
+    },
+    {
+      "batch_num_effect_tokens": 8810,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52155,
+      "epoch": 1.90364,
+      "grad_norm": 3.9348878860473633,
+      "learning_rate": 1.411000381900951e-07,
+      "loss": 1.8771,
+      "step": 2094
+    },
+    {
+      "batch_num_effect_tokens": 7110,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52166,
+      "epoch": 1.90455,
+      "grad_norm": 1.6358133554458618,
+      "learning_rate": 1.3845646281813508e-07,
+      "loss": 0.2877,
+      "step": 2095
+    },
+    {
+      "batch_num_effect_tokens": 6381,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 1.90545,
+      "grad_norm": 3.8244762420654297,
+      "learning_rate": 1.3583771387028267e-07,
+      "loss": 1.5889,
+      "step": 2096
+    },
+    {
+      "batch_num_effect_tokens": 6921,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.90636,
+      "grad_norm": 3.0918750762939453,
+      "learning_rate": 1.3324379793923648e-07,
+      "loss": 1.0494,
+      "step": 2097
+    },
+    {
+      "batch_num_effect_tokens": 5966,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.90727,
+      "grad_norm": 3.315189838409424,
+      "learning_rate": 1.3067472155517736e-07,
+      "loss": 1.1321,
+      "step": 2098
+    },
+    {
+      "batch_num_effect_tokens": 5133,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.90818,
+      "grad_norm": 3.508096218109131,
+      "learning_rate": 1.2813049118575282e-07,
+      "loss": 0.8875,
+      "step": 2099
+    },
+    {
+      "batch_num_effect_tokens": 4969,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.90909,
+      "grad_norm": 2.9247848987579346,
+      "learning_rate": 1.2561111323605714e-07,
+      "loss": 0.7568,
+      "step": 2100
+    },
+    {
+      "batch_num_effect_tokens": 4955,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52154,
+      "epoch": 1.91,
+      "grad_norm": 3.291416883468628,
+      "learning_rate": 1.231165940486234e-07,
+      "loss": 0.783,
+      "step": 2101
+    },
+    {
+      "batch_num_effect_tokens": 6480,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.91091,
+      "grad_norm": 3.497671127319336,
+      "learning_rate": 1.2064693990339936e-07,
+      "loss": 0.8337,
+      "step": 2102
+    },
+    {
+      "batch_num_effect_tokens": 8419,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52208,
+      "epoch": 1.91182,
+      "grad_norm": 3.0520131587982178,
+      "learning_rate": 1.1820215701773829e-07,
+      "loss": 1.4666,
+      "step": 2103
+    },
+    {
+      "batch_num_effect_tokens": 4869,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52210,
+      "epoch": 1.91273,
+      "grad_norm": 3.4749419689178467,
+      "learning_rate": 1.1578225154637579e-07,
+      "loss": 0.8401,
+      "step": 2104
+    },
+    {
+      "batch_num_effect_tokens": 7648,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52222,
+      "epoch": 1.91364,
+      "grad_norm": 2.883669853210449,
+      "learning_rate": 1.1338722958142311e-07,
+      "loss": 0.9917,
+      "step": 2105
+    },
+    {
+      "batch_num_effect_tokens": 6705,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.91455,
+      "grad_norm": 3.6415936946868896,
+      "learning_rate": 1.1101709715234388e-07,
+      "loss": 1.3216,
+      "step": 2106
+    },
+    {
+      "batch_num_effect_tokens": 7498,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.91545,
+      "grad_norm": 3.037210702896118,
+      "learning_rate": 1.08671860225944e-07,
+      "loss": 0.9825,
+      "step": 2107
+    },
+    {
+      "batch_num_effect_tokens": 6012,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.91636,
+      "grad_norm": 3.028671979904175,
+      "learning_rate": 1.0635152470635513e-07,
+      "loss": 0.933,
+      "step": 2108
+    },
+    {
+      "batch_num_effect_tokens": 5176,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52126,
+      "epoch": 1.91727,
+      "grad_norm": 3.4049301147460938,
+      "learning_rate": 1.0405609643501902e-07,
+      "loss": 0.8042,
+      "step": 2109
+    },
+    {
+      "batch_num_effect_tokens": 5858,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52132,
+      "epoch": 1.91818,
+      "grad_norm": 3.471726417541504,
+      "learning_rate": 1.0178558119067316e-07,
+      "loss": 1.2755,
+      "step": 2110
+    },
+    {
+      "batch_num_effect_tokens": 5847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.91909,
+      "grad_norm": 3.8953139781951904,
+      "learning_rate": 9.953998468933635e-08,
+      "loss": 0.9107,
+      "step": 2111
+    },
+    {
+      "batch_num_effect_tokens": 7213,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.92,
+      "grad_norm": 2.7599267959594727,
+      "learning_rate": 9.731931258429638e-08,
+      "loss": 0.6609,
+      "step": 2112
+    },
+    {
+      "batch_num_effect_tokens": 7449,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.92091,
+      "grad_norm": 3.0122787952423096,
+      "learning_rate": 9.512357046609244e-08,
+      "loss": 1.0107,
+      "step": 2113
+    },
+    {
+      "batch_num_effect_tokens": 4666,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52220,
+      "epoch": 1.92182,
+      "grad_norm": 2.793931245803833,
+      "learning_rate": 9.295276386250273e-08,
+      "loss": 0.5671,
+      "step": 2114
+    },
+    {
+      "batch_num_effect_tokens": 5614,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.92273,
+      "grad_norm": 3.8008737564086914,
+      "learning_rate": 9.080689823853017e-08,
+      "loss": 1.4121,
+      "step": 2115
+    },
+    {
+      "batch_num_effect_tokens": 6395,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52129,
+      "epoch": 1.92364,
+      "grad_norm": 3.3346686363220215,
+      "learning_rate": 8.868597899638897e-08,
+      "loss": 1.2089,
+      "step": 2116
+    },
+    {
+      "batch_num_effect_tokens": 8000,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52214,
+      "epoch": 1.92455,
+      "grad_norm": 3.25838303565979,
+      "learning_rate": 8.659001147548918e-08,
+      "loss": 1.2777,
+      "step": 2117
+    },
+    {
+      "batch_num_effect_tokens": 6689,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.92545,
+      "grad_norm": 3.4332966804504395,
+      "learning_rate": 8.451900095242882e-08,
+      "loss": 1.2815,
+      "step": 2118
+    },
+    {
+      "batch_num_effect_tokens": 6496,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.92636,
+      "grad_norm": 3.0824673175811768,
+      "learning_rate": 8.247295264097288e-08,
+      "loss": 0.985,
+      "step": 2119
+    },
+    {
+      "batch_num_effect_tokens": 7652,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52216,
+      "epoch": 1.92727,
+      "grad_norm": 3.0345468521118164,
+      "learning_rate": 8.04518716920466e-08,
+      "loss": 0.8738,
+      "step": 2120
+    },
+    {
+      "batch_num_effect_tokens": 6869,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52125,
+      "epoch": 1.92818,
+      "grad_norm": 3.161374092102051,
+      "learning_rate": 7.845576319371884e-08,
+      "loss": 1.1486,
+      "step": 2121
+    },
+    {
+      "batch_num_effect_tokens": 6895,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52133,
+      "epoch": 1.92909,
+      "grad_norm": 3.010014533996582,
+      "learning_rate": 7.648463217118985e-08,
+      "loss": 1.1652,
+      "step": 2122
+    },
+    {
+      "batch_num_effect_tokens": 7995,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52187,
+      "epoch": 1.93,
+      "grad_norm": 3.0425710678100586,
+      "learning_rate": 7.453848358678018e-08,
+      "loss": 1.0709,
+      "step": 2123
+    },
+    {
+      "batch_num_effect_tokens": 6120,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 50540,
+      "epoch": 1.93091,
+      "grad_norm": 3.1712393760681152,
+      "learning_rate": 7.261732233991514e-08,
+      "loss": 0.9768,
+      "step": 2124
+    },
+    {
+      "batch_num_effect_tokens": 7125,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52148,
+      "epoch": 1.93182,
+      "grad_norm": 3.0530688762664795,
+      "learning_rate": 7.072115326711704e-08,
+      "loss": 1.0305,
+      "step": 2125
+    },
+    {
+      "batch_num_effect_tokens": 6646,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.93273,
+      "grad_norm": 3.473911762237549,
+      "learning_rate": 6.88499811419896e-08,
+      "loss": 1.3033,
+      "step": 2126
+    },
+    {
+      "batch_num_effect_tokens": 6467,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.93364,
+      "grad_norm": 3.20442795753479,
+      "learning_rate": 6.700381067520578e-08,
+      "loss": 0.8315,
+      "step": 2127
+    },
+    {
+      "batch_num_effect_tokens": 9847,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52177,
+      "epoch": 1.93455,
+      "grad_norm": 3.296698570251465,
+      "learning_rate": 6.51826465144978e-08,
+      "loss": 1.7688,
+      "step": 2128
+    },
+    {
+      "batch_num_effect_tokens": 7015,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52172,
+      "epoch": 1.93545,
+      "grad_norm": 3.054431915283203,
+      "learning_rate": 6.338649324464375e-08,
+      "loss": 1.1406,
+      "step": 2129
+    },
+    {
+      "batch_num_effect_tokens": 4682,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52164,
+      "epoch": 1.93636,
+      "grad_norm": 2.780895233154297,
+      "learning_rate": 6.161535538745877e-08,
+      "loss": 0.4555,
+      "step": 2130
+    },
+    {
+      "batch_num_effect_tokens": 5045,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.93727,
+      "grad_norm": 3.6313552856445312,
+      "learning_rate": 5.986923740177841e-08,
+      "loss": 0.8906,
+      "step": 2131
+    },
+    {
+      "batch_num_effect_tokens": 8426,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.93818,
+      "grad_norm": 2.6858532428741455,
+      "learning_rate": 5.814814368345412e-08,
+      "loss": 0.7991,
+      "step": 2132
+    },
+    {
+      "batch_num_effect_tokens": 9339,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.93909,
+      "grad_norm": 2.4184558391571045,
+      "learning_rate": 5.6452078565335524e-08,
+      "loss": 1.0356,
+      "step": 2133
+    },
+    {
+      "batch_num_effect_tokens": 5380,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52118,
+      "epoch": 1.94,
+      "grad_norm": 3.1847732067108154,
+      "learning_rate": 5.4781046317267103e-08,
+      "loss": 0.9598,
+      "step": 2134
+    },
+    {
+      "batch_num_effect_tokens": 9300,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52117,
+      "epoch": 1.94091,
+      "grad_norm": 2.9929068088531494,
+      "learning_rate": 5.3135051146068203e-08,
+      "loss": 1.4023,
+      "step": 2135
+    },
+    {
+      "batch_num_effect_tokens": 7162,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.94182,
+      "grad_norm": 3.0101776123046875,
+      "learning_rate": 5.15140971955308e-08,
+      "loss": 0.9932,
+      "step": 2136
+    },
+    {
+      "batch_num_effect_tokens": 5296,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 1.94273,
+      "grad_norm": 3.4937398433685303,
+      "learning_rate": 4.991818854640396e-08,
+      "loss": 0.9352,
+      "step": 2137
+    },
+    {
+      "batch_num_effect_tokens": 7385,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52211,
+      "epoch": 1.94364,
+      "grad_norm": 2.734804153442383,
+      "learning_rate": 4.8347329216387184e-08,
+      "loss": 0.9585,
+      "step": 2138
+    },
+    {
+      "batch_num_effect_tokens": 7546,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52209,
+      "epoch": 1.94455,
+      "grad_norm": 2.896134614944458,
+      "learning_rate": 4.6801523160114884e-08,
+      "loss": 0.8693,
+      "step": 2139
+    },
+    {
+      "batch_num_effect_tokens": 7476,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.94545,
+      "grad_norm": 2.7789533138275146,
+      "learning_rate": 4.528077426915412e-08,
+      "loss": 0.9293,
+      "step": 2140
+    },
+    {
+      "batch_num_effect_tokens": 4207,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52146,
+      "epoch": 1.94636,
+      "grad_norm": 2.832606554031372,
+      "learning_rate": 4.378508637198686e-08,
+      "loss": 0.4552,
+      "step": 2141
+    },
+    {
+      "batch_num_effect_tokens": 5396,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.94727,
+      "grad_norm": 2.760693073272705,
+      "learning_rate": 4.231446323400557e-08,
+      "loss": 0.6138,
+      "step": 2142
+    },
+    {
+      "batch_num_effect_tokens": 3625,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52140,
+      "epoch": 1.94818,
+      "grad_norm": 2.6959006786346436,
+      "learning_rate": 4.086890855750425e-08,
+      "loss": 0.3044,
+      "step": 2143
+    },
+    {
+      "batch_num_effect_tokens": 7112,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52213,
+      "epoch": 1.94909,
+      "grad_norm": 3.302431344985962,
+      "learning_rate": 3.9448425981661876e-08,
+      "loss": 1.2291,
+      "step": 2144
+    },
+    {
+      "batch_num_effect_tokens": 7545,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52134,
+      "epoch": 1.95,
+      "grad_norm": 3.5944392681121826,
+      "learning_rate": 3.805301908254455e-08,
+      "loss": 1.5729,
+      "step": 2145
+    },
+    {
+      "batch_num_effect_tokens": 5555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.95091,
+      "grad_norm": 3.7243990898132324,
+      "learning_rate": 3.668269137308666e-08,
+      "loss": 1.2057,
+      "step": 2146
+    },
+    {
+      "batch_num_effect_tokens": 5712,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52216,
+      "epoch": 1.95182,
+      "grad_norm": 2.9040653705596924,
+      "learning_rate": 3.533744630308533e-08,
+      "loss": 0.4343,
+      "step": 2147
+    },
+    {
+      "batch_num_effect_tokens": 4745,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.95273,
+      "grad_norm": 3.090961217880249,
+      "learning_rate": 3.401728725919373e-08,
+      "loss": 0.678,
+      "step": 2148
+    },
+    {
+      "batch_num_effect_tokens": 5507,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52167,
+      "epoch": 1.95364,
+      "grad_norm": 2.775160074234009,
+      "learning_rate": 3.2722217564912226e-08,
+      "loss": 0.685,
+      "step": 2149
+    },
+    {
+      "batch_num_effect_tokens": 7124,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.95455,
+      "grad_norm": 2.7031285762786865,
+      "learning_rate": 3.1452240480577265e-08,
+      "loss": 0.8426,
+      "step": 2150
+    },
+    {
+      "batch_num_effect_tokens": 7540,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.95545,
+      "grad_norm": 2.8114101886749268,
+      "learning_rate": 3.020735920335138e-08,
+      "loss": 0.8927,
+      "step": 2151
+    },
+    {
+      "batch_num_effect_tokens": 5540,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52206,
+      "epoch": 1.95636,
+      "grad_norm": 2.670271873474121,
+      "learning_rate": 2.898757686722542e-08,
+      "loss": 0.5648,
+      "step": 2152
+    },
+    {
+      "batch_num_effect_tokens": 5503,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.95727,
+      "grad_norm": 3.1284098625183105,
+      "learning_rate": 2.779289654299855e-08,
+      "loss": 0.7566,
+      "step": 2153
+    },
+    {
+      "batch_num_effect_tokens": 5074,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.95818,
+      "grad_norm": 3.766758441925049,
+      "learning_rate": 2.6623321238277157e-08,
+      "loss": 0.9247,
+      "step": 2154
+    },
+    {
+      "batch_num_effect_tokens": 9093,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52139,
+      "epoch": 1.95909,
+      "grad_norm": 2.73887300491333,
+      "learning_rate": 2.547885389746485e-08,
+      "loss": 1.1392,
+      "step": 2155
+    },
+    {
+      "batch_num_effect_tokens": 5651,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 1.96,
+      "grad_norm": 3.31974196434021,
+      "learning_rate": 2.4359497401758026e-08,
+      "loss": 0.7835,
+      "step": 2156
+    },
+    {
+      "batch_num_effect_tokens": 6508,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52217,
+      "epoch": 1.96091,
+      "grad_norm": 3.1143388748168945,
+      "learning_rate": 2.3265254569133645e-08,
+      "loss": 0.9426,
+      "step": 2157
+    },
+    {
+      "batch_num_effect_tokens": 7877,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52157,
+      "epoch": 1.96182,
+      "grad_norm": 3.482353448867798,
+      "learning_rate": 2.219612815434924e-08,
+      "loss": 1.6844,
+      "step": 2158
+    },
+    {
+      "batch_num_effect_tokens": 6098,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52178,
+      "epoch": 1.96273,
+      "grad_norm": 3.5550529956817627,
+      "learning_rate": 2.115212084892737e-08,
+      "loss": 0.9044,
+      "step": 2159
+    },
+    {
+      "batch_num_effect_tokens": 5643,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52082,
+      "epoch": 1.96364,
+      "grad_norm": 3.3726587295532227,
+      "learning_rate": 2.013323528115674e-08,
+      "loss": 0.9737,
+      "step": 2160
+    },
+    {
+      "batch_num_effect_tokens": 5377,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52165,
+      "epoch": 1.96455,
+      "grad_norm": 3.1672065258026123,
+      "learning_rate": 1.913947401607774e-08,
+      "loss": 0.8833,
+      "step": 2161
+    },
+    {
+      "batch_num_effect_tokens": 7228,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52187,
+      "epoch": 1.96545,
+      "grad_norm": 2.9322712421417236,
+      "learning_rate": 1.817083955548693e-08,
+      "loss": 0.9785,
+      "step": 2162
+    },
+    {
+      "batch_num_effect_tokens": 8458,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52194,
+      "epoch": 1.96636,
+      "grad_norm": 2.756446599960327,
+      "learning_rate": 1.722733433791701e-08,
+      "loss": 1.103,
+      "step": 2163
+    },
+    {
+      "batch_num_effect_tokens": 11262,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52184,
+      "epoch": 1.96727,
+      "grad_norm": 2.5466971397399902,
+      "learning_rate": 1.630896073864352e-08,
+      "loss": 1.3516,
+      "step": 2164
+    },
+    {
+      "batch_num_effect_tokens": 6845,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52179,
+      "epoch": 1.96818,
+      "grad_norm": 3.5355217456817627,
+      "learning_rate": 1.5415721069669265e-08,
+      "loss": 1.2466,
+      "step": 2165
+    },
+    {
+      "batch_num_effect_tokens": 5385,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52142,
+      "epoch": 1.96909,
+      "grad_norm": 3.531609296798706,
+      "learning_rate": 1.4547617579725449e-08,
+      "loss": 0.8818,
+      "step": 2166
+    },
+    {
+      "batch_num_effect_tokens": 4276,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52223,
+      "epoch": 1.97,
+      "grad_norm": 2.2698025703430176,
+      "learning_rate": 1.370465245426167e-08,
+      "loss": 0.3254,
+      "step": 2167
+    },
+    {
+      "batch_num_effect_tokens": 6165,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52206,
+      "epoch": 1.97091,
+      "grad_norm": 3.9124033451080322,
+      "learning_rate": 1.2886827815440373e-08,
+      "loss": 1.5506,
+      "step": 2168
+    },
+    {
+      "batch_num_effect_tokens": 4577,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52197,
+      "epoch": 1.97182,
+      "grad_norm": 3.2252120971679688,
+      "learning_rate": 1.2094145722134631e-08,
+      "loss": 0.7644,
+      "step": 2169
+    },
+    {
+      "batch_num_effect_tokens": 7780,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52116,
+      "epoch": 1.97273,
+      "grad_norm": 3.0887022018432617,
+      "learning_rate": 1.1326608169920373e-08,
+      "loss": 1.1574,
+      "step": 2170
+    },
+    {
+      "batch_num_effect_tokens": 7272,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.97364,
+      "grad_norm": 3.182729959487915,
+      "learning_rate": 1.0584217091073046e-08,
+      "loss": 1.0788,
+      "step": 2171
+    },
+    {
+      "batch_num_effect_tokens": 11530,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52181,
+      "epoch": 1.97455,
+      "grad_norm": 2.8558449745178223,
+      "learning_rate": 9.866974354560966e-09,
+      "loss": 1.6694,
+      "step": 2172
+    },
+    {
+      "batch_num_effect_tokens": 7912,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52135,
+      "epoch": 1.97545,
+      "grad_norm": 3.0910027027130127,
+      "learning_rate": 9.174881766043086e-09,
+      "loss": 1.1643,
+      "step": 2173
+    },
+    {
+      "batch_num_effect_tokens": 8210,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52170,
+      "epoch": 1.97636,
+      "grad_norm": 3.4341235160827637,
+      "learning_rate": 8.507941067859016e-09,
+      "loss": 1.5604,
+      "step": 2174
+    },
+    {
+      "batch_num_effect_tokens": 5085,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52144,
+      "epoch": 1.97727,
+      "grad_norm": 4.295083999633789,
+      "learning_rate": 7.866153939033449e-09,
+      "loss": 0.8828,
+      "step": 2175
+    },
+    {
+      "batch_num_effect_tokens": 4520,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 1.97818,
+      "grad_norm": 3.1078341007232666,
+      "learning_rate": 7.2495219952639636e-09,
+      "loss": 0.6108,
+      "step": 2176
+    },
+    {
+      "batch_num_effect_tokens": 5965,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.97909,
+      "grad_norm": 3.652076005935669,
+      "learning_rate": 6.658046788921013e-09,
+      "loss": 1.323,
+      "step": 2177
+    },
+    {
+      "batch_num_effect_tokens": 9469,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.98,
+      "grad_norm": 3.063791513442993,
+      "learning_rate": 6.091729809042379e-09,
+      "loss": 1.5039,
+      "step": 2178
+    },
+    {
+      "batch_num_effect_tokens": 6244,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52188,
+      "epoch": 1.98091,
+      "grad_norm": 3.100691556930542,
+      "learning_rate": 5.550572481330951e-09,
+      "loss": 0.8641,
+      "step": 2179
+    },
+    {
+      "batch_num_effect_tokens": 7484,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52207,
+      "epoch": 1.98182,
+      "grad_norm": 2.8077163696289062,
+      "learning_rate": 5.034576168149175e-09,
+      "loss": 0.8706,
+      "step": 2180
+    },
+    {
+      "batch_num_effect_tokens": 6642,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52160,
+      "epoch": 1.98273,
+      "grad_norm": 2.6694931983947754,
+      "learning_rate": 4.543742168516829e-09,
+      "loss": 0.746,
+      "step": 2181
+    },
+    {
+      "batch_num_effect_tokens": 5587,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52151,
+      "epoch": 1.98364,
+      "grad_norm": 3.4628288745880127,
+      "learning_rate": 4.0780717181077015e-09,
+      "loss": 0.9929,
+      "step": 2182
+    },
+    {
+      "batch_num_effect_tokens": 6256,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52176,
+      "epoch": 1.98455,
+      "grad_norm": 3.096724510192871,
+      "learning_rate": 3.6375659892473604e-09,
+      "loss": 0.9109,
+      "step": 2183
+    },
+    {
+      "batch_num_effect_tokens": 7733,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52162,
+      "epoch": 1.98545,
+      "grad_norm": 2.4371373653411865,
+      "learning_rate": 3.22222609090872e-09,
+      "loss": 0.7449,
+      "step": 2184
+    },
+    {
+      "batch_num_effect_tokens": 7158,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.98636,
+      "grad_norm": 2.9890170097351074,
+      "learning_rate": 2.832053068709817e-09,
+      "loss": 0.9248,
+      "step": 2185
+    },
+    {
+      "batch_num_effect_tokens": 6877,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52143,
+      "epoch": 1.98727,
+      "grad_norm": 3.2912912368774414,
+      "learning_rate": 2.4670479049082596e-09,
+      "loss": 1.4214,
+      "step": 2186
+    },
+    {
+      "batch_num_effect_tokens": 5112,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52183,
+      "epoch": 1.98818,
+      "grad_norm": 3.2346763610839844,
+      "learning_rate": 2.1272115184067797e-09,
+      "loss": 0.7472,
+      "step": 2187
+    },
+    {
+      "batch_num_effect_tokens": 7287,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52161,
+      "epoch": 1.98909,
+      "grad_norm": 2.7889344692230225,
+      "learning_rate": 1.8125447647421302e-09,
+      "loss": 0.8528,
+      "step": 2188
+    },
+    {
+      "batch_num_effect_tokens": 9067,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52215,
+      "epoch": 1.99,
+      "grad_norm": 2.8845736980438232,
+      "learning_rate": 1.5230484360873043e-09,
+      "loss": 1.4642,
+      "step": 2189
+    },
+    {
+      "batch_num_effect_tokens": 6027,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52191,
+      "epoch": 1.99091,
+      "grad_norm": 3.2461230754852295,
+      "learning_rate": 1.2587232612493172e-09,
+      "loss": 0.8588,
+      "step": 2190
+    },
+    {
+      "batch_num_effect_tokens": 8567,
+      "batch_num_samples": 150,
+      "batch_num_tokens": 52221,
+      "epoch": 1.99182,
+      "grad_norm": 3.1907031536102295,
+      "learning_rate": 1.019569905666984e-09,
+      "loss": 1.421,
+      "step": 2191
+    },
+    {
+      "batch_num_effect_tokens": 4936,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52141,
+      "epoch": 1.99273,
+      "grad_norm": 2.44538950920105,
+      "learning_rate": 8.05588971406479e-10,
+      "loss": 0.2971,
+      "step": 2192
+    },
+    {
+      "batch_num_effect_tokens": 5104,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52180,
+      "epoch": 1.99364,
+      "grad_norm": 3.5328586101531982,
+      "learning_rate": 6.167809971668881e-10,
+      "loss": 0.8963,
+      "step": 2193
+    },
+    {
+      "batch_num_effect_tokens": 5176,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.99455,
+      "grad_norm": 3.223731517791748,
+      "learning_rate": 4.531464582713252e-10,
+      "loss": 0.7121,
+      "step": 2194
+    },
+    {
+      "batch_num_effect_tokens": 5194,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52150,
+      "epoch": 1.99545,
+      "grad_norm": 2.8537750244140625,
+      "learning_rate": 3.1468576666915383e-10,
+      "loss": 0.4552,
+      "step": 2195
+    },
+    {
+      "batch_num_effect_tokens": 7555,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52224,
+      "epoch": 1.99636,
+      "grad_norm": 2.976595401763916,
+      "learning_rate": 2.0139927093487666e-10,
+      "loss": 1.3754,
+      "step": 2196
+    },
+    {
+      "batch_num_effect_tokens": 5285,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52189,
+      "epoch": 1.99727,
+      "grad_norm": 3.4908828735351562,
+      "learning_rate": 1.1328725626813531e-10,
+      "loss": 1.0096,
+      "step": 2197
+    },
+    {
+      "batch_num_effect_tokens": 6647,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52127,
+      "epoch": 1.99818,
+      "grad_norm": 3.540771007537842,
+      "learning_rate": 5.034994448926967e-11,
+      "loss": 1.0589,
+      "step": 2198
+    },
+    {
+      "batch_num_effect_tokens": 7105,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 1.99909,
+      "grad_norm": 2.4064557552337646,
+      "learning_rate": 1.2587494044868919e-11,
+      "loss": 0.5547,
+      "step": 2199
+    },
+    {
+      "batch_num_effect_tokens": 7655,
+      "batch_num_samples": 149,
+      "batch_num_tokens": 52159,
+      "epoch": 2.0,
+      "grad_norm": 2.689605474472046,
+      "learning_rate": 0.0,
+      "loss": 0.991,
+      "step": 2200
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 2200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}