diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,22146 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 157,
+  "global_step": 1570,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 2.523393154144287,
+      "eval_ppl": 12.47084,
+      "eval_runtime": 43.864,
+      "eval_samples_per_second": 61.577,
+      "eval_steps_per_second": 3.853,
+      "memory/device_reserved (GiB)": 60.88,
+      "memory/max_active (GiB)": 50.21,
+      "memory/max_allocated (GiB)": 50.21,
+      "step": 0
+    },
+    {
+      "epoch": 0.0031847133757961785,
+      "grad_norm": 26.125,
+      "learning_rate": 0.0,
+      "loss": 2.513824939727783,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 60.25,
+      "memory/max_allocated (GiB)": 60.25,
+      "ppl": 12.35209,
+      "step": 1,
+      "tokens/total": 131072,
+      "tokens/train_per_sec_per_gpu": 2648.66,
+      "tokens/trainable": 14388
+    },
+    {
+      "epoch": 0.006369426751592357,
+      "grad_norm": 26.5,
+      "learning_rate": 3.1847133757961787e-07,
+      "loss": 2.5059545040130615,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 12.25525,
+      "step": 2,
+      "tokens/total": 262144,
+      "tokens/train_per_sec_per_gpu": 3269.04,
+      "tokens/trainable": 27845
+    },
+    {
+      "epoch": 0.009554140127388535,
+      "grad_norm": 25.625,
+      "learning_rate": 6.369426751592357e-07,
+      "loss": 2.4954071044921875,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 12.12667,
+      "step": 3,
+      "tokens/total": 393216,
+      "tokens/train_per_sec_per_gpu": 3166.68,
+      "tokens/trainable": 40998
+    },
+    {
+      "epoch": 0.012738853503184714,
+      "grad_norm": 26.25,
+      "learning_rate": 9.554140127388535e-07,
+      "loss": 2.526397943496704,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 12.50837,
+      "step": 4,
+      "tokens/total": 524288,
+      "tokens/train_per_sec_per_gpu": 3343.09,
+      "tokens/trainable": 54878
+    },
+    {
+      "epoch": 0.01592356687898089,
+      "grad_norm": 26.0,
+      "learning_rate": 1.2738853503184715e-06,
+      "loss": 2.480510711669922,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 11.94736,
+      "step": 5,
+      "tokens/total": 655360,
+      "tokens/train_per_sec_per_gpu": 3076.8,
+      "tokens/trainable": 67675
+    },
+    {
+      "epoch": 0.01910828025477707,
+      "grad_norm": 26.25,
+      "learning_rate": 1.5923566878980892e-06,
+      "loss": 2.5267443656921387,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 12.5127,
+      "step": 6,
+      "tokens/total": 786432,
+      "tokens/train_per_sec_per_gpu": 3657.85,
+      "tokens/trainable": 82725
+    },
+    {
+      "epoch": 0.022292993630573247,
+      "grad_norm": 25.625,
+      "learning_rate": 1.910828025477707e-06,
+      "loss": 2.505220651626587,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 12.24626,
+      "step": 7,
+      "tokens/total": 917504,
+      "tokens/train_per_sec_per_gpu": 3615.3,
+      "tokens/trainable": 97609
+    },
+    {
+      "epoch": 0.025477707006369428,
+      "grad_norm": 26.25,
+      "learning_rate": 2.229299363057325e-06,
+      "loss": 2.47495174407959,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 11.88113,
+      "step": 8,
+      "tokens/total": 1048576,
+      "tokens/train_per_sec_per_gpu": 3341.37,
+      "tokens/trainable": 111360
+    },
+    {
+      "epoch": 0.028662420382165606,
+      "grad_norm": 25.375,
+      "learning_rate": 2.547770700636943e-06,
+      "loss": 2.464661121368408,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 11.7595,
+      "step": 9,
+      "tokens/total": 1179648,
+      "tokens/train_per_sec_per_gpu": 3391.98,
+      "tokens/trainable": 125377
+    },
+    {
+      "epoch": 0.03184713375796178,
+      "grad_norm": 25.125,
+      "learning_rate": 2.8662420382165605e-06,
+      "loss": 2.4051315784454346,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 11.07989,
+      "step": 10,
+      "tokens/total": 1310720,
+      "tokens/train_per_sec_per_gpu": 3538.93,
+      "tokens/trainable": 139941
+    },
+    {
+      "epoch": 0.03503184713375796,
+      "grad_norm": 24.25,
+      "learning_rate": 3.1847133757961785e-06,
+      "loss": 2.3649113178253174,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 10.64309,
+      "step": 11,
+      "tokens/total": 1441792,
+      "tokens/train_per_sec_per_gpu": 3190.84,
+      "tokens/trainable": 153243
+    },
+    {
+      "epoch": 0.03821656050955414,
+      "grad_norm": 23.75,
+      "learning_rate": 3.5031847133757964e-06,
+      "loss": 2.2840771675109863,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 9.81662,
+      "step": 12,
+      "tokens/total": 1572864,
+      "tokens/train_per_sec_per_gpu": 3122.46,
+      "tokens/trainable": 166236
+    },
+    {
+      "epoch": 0.041401273885350316,
+      "grad_norm": 23.5,
+      "learning_rate": 3.821656050955414e-06,
+      "loss": 2.2835350036621094,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 9.8113,
+      "step": 13,
+      "tokens/total": 1703936,
+      "tokens/train_per_sec_per_gpu": 3508.01,
+      "tokens/trainable": 180765
+    },
+    {
+      "epoch": 0.044585987261146494,
+      "grad_norm": 22.625,
+      "learning_rate": 4.140127388535032e-06,
+      "loss": 2.178839921951294,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 8.83605,
+      "step": 14,
+      "tokens/total": 1835008,
+      "tokens/train_per_sec_per_gpu": 3391.4,
+      "tokens/trainable": 194814
+    },
+    {
+      "epoch": 0.04777070063694268,
+      "grad_norm": 20.625,
+      "learning_rate": 4.45859872611465e-06,
+      "loss": 2.029291868209839,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 7.6087,
+      "step": 15,
+      "tokens/total": 1966080,
+      "tokens/train_per_sec_per_gpu": 2894.11,
+      "tokens/trainable": 206939
+    },
+    {
+      "epoch": 0.050955414012738856,
+      "grad_norm": 19.125,
+      "learning_rate": 4.777070063694268e-06,
+      "loss": 1.9433990716934204,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 6.98244,
+      "step": 16,
+      "tokens/total": 2097152,
+      "tokens/train_per_sec_per_gpu": 3260.95,
+      "tokens/trainable": 220459
+    },
+    {
+      "epoch": 0.054140127388535034,
+      "grad_norm": 17.0,
+      "learning_rate": 5.095541401273886e-06,
+      "loss": 1.825382113456726,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 6.20517,
+      "step": 17,
+      "tokens/total": 2228224,
+      "tokens/train_per_sec_per_gpu": 3108.44,
+      "tokens/trainable": 233450
+    },
+    {
+      "epoch": 0.05732484076433121,
+      "grad_norm": 15.8125,
+      "learning_rate": 5.414012738853504e-06,
+      "loss": 1.7230491638183594,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 5.60158,
+      "step": 18,
+      "tokens/total": 2359296,
+      "tokens/train_per_sec_per_gpu": 3341.04,
+      "tokens/trainable": 247328
+    },
+    {
+      "epoch": 0.06050955414012739,
+      "grad_norm": 14.8125,
+      "learning_rate": 5.732484076433121e-06,
+      "loss": 1.6547000408172607,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 5.23151,
+      "step": 19,
+      "tokens/total": 2490368,
+      "tokens/train_per_sec_per_gpu": 3383.25,
+      "tokens/trainable": 261435
+    },
+    {
+      "epoch": 0.06369426751592357,
+      "grad_norm": 13.5625,
+      "learning_rate": 6.050955414012739e-06,
+      "loss": 1.544914960861206,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 4.68757,
+      "step": 20,
+      "tokens/total": 2621440,
+      "tokens/train_per_sec_per_gpu": 3349.84,
+      "tokens/trainable": 275370
+    },
+    {
+      "epoch": 0.06687898089171974,
+      "grad_norm": 12.6875,
+      "learning_rate": 6.369426751592357e-06,
+      "loss": 1.4839664697647095,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 4.4104,
+      "step": 21,
+      "tokens/total": 2752512,
+      "tokens/train_per_sec_per_gpu": 3158.43,
+      "tokens/trainable": 288580
+    },
+    {
+      "epoch": 0.07006369426751592,
+      "grad_norm": 12.0625,
+      "learning_rate": 6.687898089171975e-06,
+      "loss": 1.3859291076660156,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 3.99854,
+      "step": 22,
+      "tokens/total": 2883584,
+      "tokens/train_per_sec_per_gpu": 3623.28,
+      "tokens/trainable": 303623
+    },
+    {
+      "epoch": 0.0732484076433121,
+      "grad_norm": 11.1875,
+      "learning_rate": 7.006369426751593e-06,
+      "loss": 1.2559714317321777,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 3.51125,
+      "step": 23,
+      "tokens/total": 3014656,
+      "tokens/train_per_sec_per_gpu": 3333.96,
+      "tokens/trainable": 317478
+    },
+    {
+      "epoch": 0.07643312101910828,
+      "grad_norm": 10.1875,
+      "learning_rate": 7.32484076433121e-06,
+      "loss": 1.1163444519042969,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 3.05367,
+      "step": 24,
+      "tokens/total": 3145728,
+      "tokens/train_per_sec_per_gpu": 3273.07,
+      "tokens/trainable": 331087
+    },
+    {
+      "epoch": 0.07961783439490445,
+      "grad_norm": 9.625,
+      "learning_rate": 7.643312101910828e-06,
+      "loss": 0.9755889177322388,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 2.65273,
+      "step": 25,
+      "tokens/total": 3276800,
+      "tokens/train_per_sec_per_gpu": 3686.54,
+      "tokens/trainable": 346421
+    },
+    {
+      "epoch": 0.08280254777070063,
+      "grad_norm": 8.5625,
+      "learning_rate": 7.961783439490445e-06,
+      "loss": 0.8369104266166687,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 2.30922,
+      "step": 26,
+      "tokens/total": 3407872,
+      "tokens/train_per_sec_per_gpu": 3225.45,
+      "tokens/trainable": 359891
+    },
+    {
+      "epoch": 0.08598726114649681,
+      "grad_norm": 7.65625,
+      "learning_rate": 8.280254777070064e-06,
+      "loss": 0.7086498737335205,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 2.03125,
+      "step": 27,
+      "tokens/total": 3538944,
+      "tokens/train_per_sec_per_gpu": 3049.77,
+      "tokens/trainable": 372710
+    },
+    {
+      "epoch": 0.08917197452229299,
+      "grad_norm": 7.03125,
+      "learning_rate": 8.598726114649681e-06,
+      "loss": 0.6029537320137024,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.82751,
+      "step": 28,
+      "tokens/total": 3670016,
+      "tokens/train_per_sec_per_gpu": 3413.19,
+      "tokens/trainable": 386972
+    },
+    {
+      "epoch": 0.09235668789808917,
+      "grad_norm": 6.59375,
+      "learning_rate": 8.9171974522293e-06,
+      "loss": 0.5023248195648193,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.65256,
+      "step": 29,
+      "tokens/total": 3801088,
+      "tokens/train_per_sec_per_gpu": 2978.06,
+      "tokens/trainable": 399448
+    },
+    {
+      "epoch": 0.09554140127388536,
+      "grad_norm": 5.96875,
+      "learning_rate": 9.235668789808917e-06,
+      "loss": 0.4153555631637573,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.51491,
+      "step": 30,
+      "tokens/total": 3932160,
+      "tokens/train_per_sec_per_gpu": 3448.36,
+      "tokens/trainable": 413796
+    },
+    {
+      "epoch": 0.09872611464968153,
+      "grad_norm": 5.3125,
+      "learning_rate": 9.554140127388536e-06,
+      "loss": 0.329733669757843,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.3906,
+      "step": 31,
+      "tokens/total": 4063232,
+      "tokens/train_per_sec_per_gpu": 3050.66,
+      "tokens/trainable": 426585
+    },
+    {
+      "epoch": 0.10191082802547771,
+      "grad_norm": 4.65625,
+      "learning_rate": 9.872611464968155e-06,
+      "loss": 0.2749524414539337,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.31647,
+      "step": 32,
+      "tokens/total": 4194304,
+      "tokens/train_per_sec_per_gpu": 3412.69,
+      "tokens/trainable": 440864
+    },
+    {
+      "epoch": 0.10509554140127389,
+      "grad_norm": 3.8125,
+      "learning_rate": 1.0191082802547772e-05,
+      "loss": 0.2164468914270401,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.24166,
+      "step": 33,
+      "tokens/total": 4325376,
+      "tokens/train_per_sec_per_gpu": 3101.83,
+      "tokens/trainable": 453864
+    },
+    {
+      "epoch": 0.10828025477707007,
+      "grad_norm": 3.125,
+      "learning_rate": 1.0509554140127389e-05,
+      "loss": 0.16533951461315155,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.17979,
+      "step": 34,
+      "tokens/total": 4456448,
+      "tokens/train_per_sec_per_gpu": 2919.92,
+      "tokens/trainable": 466189
+    },
+    {
+      "epoch": 0.11146496815286625,
+      "grad_norm": 2.3125,
+      "learning_rate": 1.0828025477707008e-05,
+      "loss": 0.13319599628448486,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.14247,
+      "step": 35,
+      "tokens/total": 4587520,
+      "tokens/train_per_sec_per_gpu": 3395.27,
+      "tokens/trainable": 480345
+    },
+    {
+      "epoch": 0.11464968152866242,
+      "grad_norm": 1.734375,
+      "learning_rate": 1.1146496815286625e-05,
+      "loss": 0.11769881844520569,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.12491,
+      "step": 36,
+      "tokens/total": 4718592,
+      "tokens/train_per_sec_per_gpu": 3283.23,
+      "tokens/trainable": 494113
+    },
+    {
+      "epoch": 0.1178343949044586,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.1464968152866242e-05,
+      "loss": 0.09715006500482559,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.10203,
+      "step": 37,
+      "tokens/total": 4849664,
+      "tokens/train_per_sec_per_gpu": 3440.9,
+      "tokens/trainable": 508490
+    },
+    {
+      "epoch": 0.12101910828025478,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.178343949044586e-05,
+      "loss": 0.08853279799222946,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.09257,
+      "step": 38,
+      "tokens/total": 4980736,
+      "tokens/train_per_sec_per_gpu": 3324.91,
+      "tokens/trainable": 522428
+    },
+    {
+      "epoch": 0.12420382165605096,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.2101910828025478e-05,
+      "loss": 0.07282212376594543,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.07554,
+      "step": 39,
+      "tokens/total": 5111808,
+      "tokens/train_per_sec_per_gpu": 3291.66,
+      "tokens/trainable": 536220
+    },
+    {
+      "epoch": 0.12738853503184713,
+      "grad_norm": 0.921875,
+      "learning_rate": 1.2420382165605097e-05,
+      "loss": 0.07131636142730713,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.07392,
+      "step": 40,
+      "tokens/total": 5242880,
+      "tokens/train_per_sec_per_gpu": 3067.47,
+      "tokens/trainable": 549148
+    },
+    {
+      "epoch": 0.1305732484076433,
+      "grad_norm": 0.91015625,
+      "learning_rate": 1.2738853503184714e-05,
+      "loss": 0.07583475857973099,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.07878,
+      "step": 41,
+      "tokens/total": 5373952,
+      "tokens/train_per_sec_per_gpu": 3078.11,
+      "tokens/trainable": 562021
+    },
+    {
+      "epoch": 0.1337579617834395,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.3057324840764331e-05,
+      "loss": 0.05423282831907272,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.05573,
+      "step": 42,
+      "tokens/total": 5505024,
+      "tokens/train_per_sec_per_gpu": 3152.3,
+      "tokens/trainable": 575214
+    },
+    {
+      "epoch": 0.13694267515923567,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.337579617834395e-05,
+      "loss": 0.05849003419280052,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.06023,
+      "step": 43,
+      "tokens/total": 5636096,
+      "tokens/train_per_sec_per_gpu": 3026.82,
+      "tokens/trainable": 587989
+    },
+    {
+      "epoch": 0.14012738853503184,
+      "grad_norm": 0.671875,
+      "learning_rate": 1.3694267515923567e-05,
+      "loss": 0.047232724726200104,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04837,
+      "step": 44,
+      "tokens/total": 5767168,
+      "tokens/train_per_sec_per_gpu": 3186.14,
+      "tokens/trainable": 601337
+    },
+    {
+      "epoch": 0.14331210191082802,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.4012738853503186e-05,
+      "loss": 0.0633855015039444,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.06544,
+      "step": 45,
+      "tokens/total": 5898240,
+      "tokens/train_per_sec_per_gpu": 3243.91,
+      "tokens/trainable": 614903
+    },
+    {
+      "epoch": 0.1464968152866242,
+      "grad_norm": 0.7734375,
+      "learning_rate": 1.4331210191082803e-05,
+      "loss": 0.057890165597200394,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0596,
+      "step": 46,
+      "tokens/total": 6029312,
+      "tokens/train_per_sec_per_gpu": 3235.78,
+      "tokens/trainable": 628512
+    },
+    {
+      "epoch": 0.14968152866242038,
+      "grad_norm": 0.62890625,
+      "learning_rate": 1.464968152866242e-05,
+      "loss": 0.057463180273771286,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.05915,
+      "step": 47,
+      "tokens/total": 6160384,
+      "tokens/train_per_sec_per_gpu": 3663.91,
+      "tokens/trainable": 643746
+    },
+    {
+      "epoch": 0.15286624203821655,
+      "grad_norm": 0.55859375,
+      "learning_rate": 1.4968152866242039e-05,
+      "loss": 0.047860756516456604,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04902,
+      "step": 48,
+      "tokens/total": 6291456,
+      "tokens/train_per_sec_per_gpu": 3663.1,
+      "tokens/trainable": 659004
+    },
+    {
+      "epoch": 0.15605095541401273,
+      "grad_norm": 0.69140625,
+      "learning_rate": 1.5286624203821656e-05,
+      "loss": 0.04775935783982277,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04892,
+      "step": 49,
+      "tokens/total": 6422528,
+      "tokens/train_per_sec_per_gpu": 3484.38,
+      "tokens/trainable": 673537
+    },
+    {
+      "epoch": 0.1592356687898089,
+      "grad_norm": 0.65234375,
+      "learning_rate": 1.5605095541401275e-05,
+      "loss": 0.041205767542123795,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04207,
+      "step": 50,
+      "tokens/total": 6553600,
+      "tokens/train_per_sec_per_gpu": 3230.47,
+      "tokens/trainable": 687060
+    },
+    {
+      "epoch": 0.1624203821656051,
+      "grad_norm": 0.5625,
+      "learning_rate": 1.592356687898089e-05,
+      "loss": 0.04386754706501961,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04484,
+      "step": 51,
+      "tokens/total": 6684672,
+      "tokens/train_per_sec_per_gpu": 3268.41,
+      "tokens/trainable": 700730
+    },
+    {
+      "epoch": 0.16560509554140126,
+      "grad_norm": 0.44140625,
+      "learning_rate": 1.624203821656051e-05,
+      "loss": 0.041807860136032104,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04269,
+      "step": 52,
+      "tokens/total": 6815744,
+      "tokens/train_per_sec_per_gpu": 3368.11,
+      "tokens/trainable": 714773
+    },
+    {
+      "epoch": 0.16878980891719744,
+      "grad_norm": 0.54296875,
+      "learning_rate": 1.6560509554140128e-05,
+      "loss": 0.04267745837569237,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0436,
+      "step": 53,
+      "tokens/total": 6946816,
+      "tokens/train_per_sec_per_gpu": 3215.88,
+      "tokens/trainable": 728248
+    },
+    {
+      "epoch": 0.17197452229299362,
+      "grad_norm": 0.54296875,
+      "learning_rate": 1.6878980891719747e-05,
+      "loss": 0.04988788813352585,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.05115,
+      "step": 54,
+      "tokens/total": 7077888,
+      "tokens/train_per_sec_per_gpu": 3378.45,
+      "tokens/trainable": 742393
+    },
+    {
+      "epoch": 0.1751592356687898,
+      "grad_norm": 0.60546875,
+      "learning_rate": 1.7197452229299362e-05,
+      "loss": 0.03681975603103638,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03751,
+      "step": 55,
+      "tokens/total": 7208960,
+      "tokens/train_per_sec_per_gpu": 3317.61,
+      "tokens/trainable": 756289
+    },
+    {
+      "epoch": 0.17834394904458598,
+      "grad_norm": 0.54296875,
+      "learning_rate": 1.751592356687898e-05,
+      "loss": 0.03921874612569809,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04,
+      "step": 56,
+      "tokens/total": 7340032,
+      "tokens/train_per_sec_per_gpu": 3135.92,
+      "tokens/trainable": 769413
+    },
+    {
+      "epoch": 0.18152866242038215,
+      "grad_norm": 0.498046875,
+      "learning_rate": 1.78343949044586e-05,
+      "loss": 0.03980698809027672,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04061,
+      "step": 57,
+      "tokens/total": 7471104,
+      "tokens/train_per_sec_per_gpu": 3113.74,
+      "tokens/trainable": 782484
+    },
+    {
+      "epoch": 0.18471337579617833,
+      "grad_norm": 0.62109375,
+      "learning_rate": 1.8152866242038215e-05,
+      "loss": 0.03426855802536011,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03486,
+      "step": 58,
+      "tokens/total": 7602176,
+      "tokens/train_per_sec_per_gpu": 3252.75,
+      "tokens/trainable": 796083
+    },
+    {
+      "epoch": 0.18789808917197454,
+      "grad_norm": 0.51953125,
+      "learning_rate": 1.8471337579617834e-05,
+      "loss": 0.03522620350122452,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03585,
+      "step": 59,
+      "tokens/total": 7733248,
+      "tokens/train_per_sec_per_gpu": 3557.53,
+      "tokens/trainable": 810976
+    },
+    {
+      "epoch": 0.1910828025477707,
+      "grad_norm": 0.609375,
+      "learning_rate": 1.8789808917197453e-05,
+      "loss": 0.03881306201219559,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03958,
+      "step": 60,
+      "tokens/total": 7864320,
+      "tokens/train_per_sec_per_gpu": 3437.92,
+      "tokens/trainable": 825388
+    },
+    {
+      "epoch": 0.1942675159235669,
+      "grad_norm": 0.7890625,
+      "learning_rate": 1.910828025477707e-05,
+      "loss": 0.04205251485109329,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04295,
+      "step": 61,
+      "tokens/total": 7995392,
+      "tokens/train_per_sec_per_gpu": 2932.75,
+      "tokens/trainable": 837817
+    },
+    {
+      "epoch": 0.19745222929936307,
+      "grad_norm": 0.58203125,
+      "learning_rate": 1.942675159235669e-05,
+      "loss": 0.03300648555159569,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03356,
+      "step": 62,
+      "tokens/total": 8126464,
+      "tokens/train_per_sec_per_gpu": 3125.85,
+      "tokens/trainable": 850991
+    },
+    {
+      "epoch": 0.20063694267515925,
+      "grad_norm": 0.87109375,
+      "learning_rate": 1.974522292993631e-05,
+      "loss": 0.03468535467982292,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03529,
+      "step": 63,
+      "tokens/total": 8257536,
+      "tokens/train_per_sec_per_gpu": 3543.61,
+      "tokens/trainable": 865759
+    },
+    {
+      "epoch": 0.20382165605095542,
+      "grad_norm": 0.6171875,
+      "learning_rate": 2.0063694267515925e-05,
+      "loss": 0.035250235348939896,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03588,
+      "step": 64,
+      "tokens/total": 8388608,
+      "tokens/train_per_sec_per_gpu": 3393.88,
+      "tokens/trainable": 879904
+    },
+    {
+      "epoch": 0.2070063694267516,
+      "grad_norm": 0.63671875,
+      "learning_rate": 2.0382165605095544e-05,
+      "loss": 0.03242558240890503,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03296,
+      "step": 65,
+      "tokens/total": 8519680,
+      "tokens/train_per_sec_per_gpu": 2965.68,
+      "tokens/trainable": 892375
+    },
+    {
+      "epoch": 0.21019108280254778,
+      "grad_norm": 0.765625,
+      "learning_rate": 2.0700636942675162e-05,
+      "loss": 0.04080452769994736,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.04165,
+      "step": 66,
+      "tokens/total": 8650752,
+      "tokens/train_per_sec_per_gpu": 3513.42,
+      "tokens/trainable": 907090
+    },
+    {
+      "epoch": 0.21337579617834396,
+      "grad_norm": 0.40625,
+      "learning_rate": 2.1019108280254778e-05,
+      "loss": 0.02815978415310383,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02856,
+      "step": 67,
+      "tokens/total": 8781824,
+      "tokens/train_per_sec_per_gpu": 3257.68,
+      "tokens/trainable": 920761
+    },
+    {
+      "epoch": 0.21656050955414013,
+      "grad_norm": 0.53125,
+      "learning_rate": 2.1337579617834397e-05,
+      "loss": 0.034378018230199814,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03498,
+      "step": 68,
+      "tokens/total": 8912896,
+      "tokens/train_per_sec_per_gpu": 3612.11,
+      "tokens/trainable": 935785
+    },
+    {
+      "epoch": 0.2197452229299363,
+      "grad_norm": 0.65234375,
+      "learning_rate": 2.1656050955414015e-05,
+      "loss": 0.03373882547020912,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03431,
+      "step": 69,
+      "tokens/total": 9043968,
+      "tokens/train_per_sec_per_gpu": 3727.74,
+      "tokens/trainable": 951259
+    },
+    {
+      "epoch": 0.2229299363057325,
+      "grad_norm": 0.458984375,
+      "learning_rate": 2.197452229299363e-05,
+      "loss": 0.03272494301199913,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03327,
+      "step": 70,
+      "tokens/total": 9175040,
+      "tokens/train_per_sec_per_gpu": 3482.14,
+      "tokens/trainable": 965829
+    },
+    {
+      "epoch": 0.22611464968152867,
+      "grad_norm": 0.55078125,
+      "learning_rate": 2.229299363057325e-05,
+      "loss": 0.02994038723409176,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03039,
+      "step": 71,
+      "tokens/total": 9306112,
+      "tokens/train_per_sec_per_gpu": 3238.41,
+      "tokens/trainable": 979395
+    },
+    {
+      "epoch": 0.22929936305732485,
+      "grad_norm": 0.75390625,
+      "learning_rate": 2.261146496815287e-05,
+      "loss": 0.033101145178079605,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03366,
+      "step": 72,
+      "tokens/total": 9437184,
+      "tokens/train_per_sec_per_gpu": 3700.58,
+      "tokens/trainable": 994803
+    },
+    {
+      "epoch": 0.23248407643312102,
+      "grad_norm": 0.396484375,
+      "learning_rate": 2.2929936305732484e-05,
+      "loss": 0.03042842261493206,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0309,
+      "step": 73,
+      "tokens/total": 9568256,
+      "tokens/train_per_sec_per_gpu": 3386.78,
+      "tokens/trainable": 1008996
+    },
+    {
+      "epoch": 0.2356687898089172,
+      "grad_norm": 0.53515625,
+      "learning_rate": 2.3248407643312103e-05,
+      "loss": 0.02688576839864254,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02725,
+      "step": 74,
+      "tokens/total": 9699328,
+      "tokens/train_per_sec_per_gpu": 3353.01,
+      "tokens/trainable": 1023021
+    },
+    {
+      "epoch": 0.23885350318471338,
+      "grad_norm": 0.51953125,
+      "learning_rate": 2.356687898089172e-05,
+      "loss": 0.028813578188419342,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02923,
+      "step": 75,
+      "tokens/total": 9830400,
+      "tokens/train_per_sec_per_gpu": 3035.97,
+      "tokens/trainable": 1035757
+    },
+    {
+      "epoch": 0.24203821656050956,
+      "grad_norm": 0.546875,
+      "learning_rate": 2.388535031847134e-05,
+      "loss": 0.035763900727033615,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03641,
+      "step": 76,
+      "tokens/total": 9961472,
+      "tokens/train_per_sec_per_gpu": 2971.17,
+      "tokens/trainable": 1048202
+    },
+    {
+      "epoch": 0.24522292993630573,
+      "grad_norm": 0.61328125,
+      "learning_rate": 2.4203821656050956e-05,
+      "loss": 0.026223331689834595,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02657,
+      "step": 77,
+      "tokens/total": 10092544,
+      "tokens/train_per_sec_per_gpu": 3195.37,
+      "tokens/trainable": 1061576
+    },
+    {
+      "epoch": 0.2484076433121019,
+      "grad_norm": 0.451171875,
+      "learning_rate": 2.4522292993630575e-05,
+      "loss": 0.037136998027563095,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.03784,
+      "step": 78,
+      "tokens/total": 10223616,
+      "tokens/train_per_sec_per_gpu": 3185.64,
+      "tokens/trainable": 1074924
+    },
+    {
+      "epoch": 0.2515923566878981,
+      "grad_norm": 0.44140625,
+      "learning_rate": 2.4840764331210193e-05,
+      "loss": 0.02757476083934307,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02796,
+      "step": 79,
+      "tokens/total": 10354688,
+      "tokens/train_per_sec_per_gpu": 3141.94,
+      "tokens/trainable": 1088089
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "grad_norm": 0.60546875,
+      "learning_rate": 2.515923566878981e-05,
+      "loss": 0.026085954159498215,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02643,
+      "step": 80,
+      "tokens/total": 10485760,
+      "tokens/train_per_sec_per_gpu": 3340.19,
+      "tokens/trainable": 1102070
+    },
+    {
+      "epoch": 0.25796178343949044,
+      "grad_norm": 0.41015625,
+      "learning_rate": 2.5477707006369428e-05,
+      "loss": 0.027341356500983238,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02772,
+      "step": 81,
+      "tokens/total": 10616832,
+      "tokens/train_per_sec_per_gpu": 3294.79,
+      "tokens/trainable": 1115858
+    },
+    {
+      "epoch": 0.2611464968152866,
+      "grad_norm": 0.431640625,
+      "learning_rate": 2.5796178343949047e-05,
+      "loss": 0.028896335512399673,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02932,
+      "step": 82,
+      "tokens/total": 10747904,
+      "tokens/train_per_sec_per_gpu": 3433.89,
+      "tokens/trainable": 1130226
+    },
+    {
+      "epoch": 0.2643312101910828,
+      "grad_norm": 0.466796875,
+      "learning_rate": 2.6114649681528662e-05,
+      "loss": 0.026260778307914734,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02661,
+      "step": 83,
+      "tokens/total": 10878976,
+      "tokens/train_per_sec_per_gpu": 3711.62,
+      "tokens/trainable": 1145755
+    },
+    {
+      "epoch": 0.267515923566879,
+      "grad_norm": 0.53125,
+      "learning_rate": 2.643312101910828e-05,
+      "loss": 0.027284812182188034,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02766,
+      "step": 84,
+      "tokens/total": 11010048,
+      "tokens/train_per_sec_per_gpu": 3309.83,
+      "tokens/trainable": 1159641
+    },
+    {
+      "epoch": 0.27070063694267515,
+      "grad_norm": 0.376953125,
+      "learning_rate": 2.67515923566879e-05,
+      "loss": 0.02594919502735138,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02629,
+      "step": 85,
+      "tokens/total": 11141120,
+      "tokens/train_per_sec_per_gpu": 3424.34,
+      "tokens/trainable": 1173967
+    },
+    {
+      "epoch": 0.27388535031847133,
+      "grad_norm": 0.50390625,
+      "learning_rate": 2.707006369426752e-05,
+      "loss": 0.025507405400276184,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02584,
+      "step": 86,
+      "tokens/total": 11272192,
+      "tokens/train_per_sec_per_gpu": 2757.24,
+      "tokens/trainable": 1185544
+    },
+    {
+      "epoch": 0.2770700636942675,
+      "grad_norm": 0.4765625,
+      "learning_rate": 2.7388535031847134e-05,
+      "loss": 0.024133453145623207,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02443,
+      "step": 87,
+      "tokens/total": 11403264,
+      "tokens/train_per_sec_per_gpu": 3215.45,
+      "tokens/trainable": 1199051
+    },
+    {
+      "epoch": 0.2802547770700637,
+      "grad_norm": 0.45703125,
+      "learning_rate": 2.7707006369426753e-05,
+      "loss": 0.026854459196329117,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02722,
+      "step": 88,
+      "tokens/total": 11534336,
+      "tokens/train_per_sec_per_gpu": 3550.27,
+      "tokens/trainable": 1213857
+    },
+    {
+      "epoch": 0.28343949044585987,
+      "grad_norm": 0.45703125,
+      "learning_rate": 2.802547770700637e-05,
+      "loss": 0.02602829411625862,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02637,
+      "step": 89,
+      "tokens/total": 11665408,
+      "tokens/train_per_sec_per_gpu": 3183.98,
+      "tokens/trainable": 1227192
+    },
+    {
+      "epoch": 0.28662420382165604,
+      "grad_norm": 0.337890625,
+      "learning_rate": 2.8343949044585987e-05,
+      "loss": 0.020508471876382828,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02072,
+      "step": 90,
+      "tokens/total": 11796480,
+      "tokens/train_per_sec_per_gpu": 3402.64,
+      "tokens/trainable": 1241432
+    },
+    {
+      "epoch": 0.2898089171974522,
+      "grad_norm": 0.408203125,
+      "learning_rate": 2.8662420382165606e-05,
+      "loss": 0.017694037407636642,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01785,
+      "step": 91,
+      "tokens/total": 11927552,
+      "tokens/train_per_sec_per_gpu": 3333.79,
+      "tokens/trainable": 1255396
+    },
+    {
+      "epoch": 0.2929936305732484,
+      "grad_norm": 0.4140625,
+      "learning_rate": 2.8980891719745225e-05,
+      "loss": 0.027573810890316963,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02796,
+      "step": 92,
+      "tokens/total": 12058624,
+      "tokens/train_per_sec_per_gpu": 2994.34,
+      "tokens/trainable": 1268041
+    },
+    {
+      "epoch": 0.2961783439490446,
+      "grad_norm": 0.486328125,
+      "learning_rate": 2.929936305732484e-05,
+      "loss": 0.028143662959337234,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02854,
+      "step": 93,
+      "tokens/total": 12189696,
+      "tokens/train_per_sec_per_gpu": 3516.43,
+      "tokens/trainable": 1282765
+    },
+    {
+      "epoch": 0.29936305732484075,
+      "grad_norm": 0.4765625,
+      "learning_rate": 2.961783439490446e-05,
+      "loss": 0.026264818385243416,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02661,
+      "step": 94,
+      "tokens/total": 12320768,
+      "tokens/train_per_sec_per_gpu": 3304.3,
+      "tokens/trainable": 1296613
+    },
+    {
+      "epoch": 0.30254777070063693,
+      "grad_norm": 0.462890625,
+      "learning_rate": 2.9936305732484078e-05,
+      "loss": 0.026661768555641174,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02702,
+      "step": 95,
+      "tokens/total": 12451840,
+      "tokens/train_per_sec_per_gpu": 3563.3,
+      "tokens/trainable": 1311465
+    },
+    {
+      "epoch": 0.3057324840764331,
+      "grad_norm": 0.306640625,
+      "learning_rate": 3.0254777070063693e-05,
+      "loss": 0.017260678112506866,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01741,
+      "step": 96,
+      "tokens/total": 12582912,
+      "tokens/train_per_sec_per_gpu": 3428.76,
+      "tokens/trainable": 1325753
+    },
+    {
+      "epoch": 0.3089171974522293,
+      "grad_norm": 0.5703125,
+      "learning_rate": 3.057324840764331e-05,
+      "loss": 0.022419072687625885,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02267,
+      "step": 97,
+      "tokens/total": 12713984,
+      "tokens/train_per_sec_per_gpu": 3443.07,
+      "tokens/trainable": 1340109
+    },
+    {
+      "epoch": 0.31210191082802546,
+      "grad_norm": 0.50390625,
+      "learning_rate": 3.089171974522293e-05,
+      "loss": 0.023397397249937057,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02367,
+      "step": 98,
+      "tokens/total": 12845056,
+      "tokens/train_per_sec_per_gpu": 3420.73,
+      "tokens/trainable": 1354398
+    },
+    {
+      "epoch": 0.31528662420382164,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.121019108280255e-05,
+      "loss": 0.024743150919675827,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02505,
+      "step": 99,
+      "tokens/total": 12976128,
+      "tokens/train_per_sec_per_gpu": 3420.87,
+      "tokens/trainable": 1368740
+    },
+    {
+      "epoch": 0.3184713375796178,
+      "grad_norm": 0.3984375,
+      "learning_rate": 3.1528662420382165e-05,
+      "loss": 0.023541904985904694,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02382,
+      "step": 100,
+      "tokens/total": 13107200,
+      "tokens/train_per_sec_per_gpu": 3192.28,
+      "tokens/trainable": 1382180
+    },
+    {
+      "epoch": 0.321656050955414,
+      "grad_norm": 0.66015625,
+      "learning_rate": 3.184713375796178e-05,
+      "loss": 0.023172177374362946,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02344,
+      "step": 101,
+      "tokens/total": 13238272,
+      "tokens/train_per_sec_per_gpu": 3177.98,
+      "tokens/trainable": 1395593
+    },
+    {
+      "epoch": 0.3248407643312102,
+      "grad_norm": 0.48828125,
+      "learning_rate": 3.21656050955414e-05,
+      "loss": 0.025406980887055397,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02573,
+      "step": 102,
+      "tokens/total": 13369344,
+      "tokens/train_per_sec_per_gpu": 3638.95,
+      "tokens/trainable": 1410783
+    },
+    {
+      "epoch": 0.32802547770700635,
+      "grad_norm": 0.69921875,
+      "learning_rate": 3.248407643312102e-05,
+      "loss": 0.02435356006026268,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02465,
+      "step": 103,
+      "tokens/total": 13500416,
+      "tokens/train_per_sec_per_gpu": 3263.52,
+      "tokens/trainable": 1424464
+    },
+    {
+      "epoch": 0.33121019108280253,
+      "grad_norm": 0.404296875,
+      "learning_rate": 3.2802547770700634e-05,
+      "loss": 0.02753208577632904,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02791,
+      "step": 104,
+      "tokens/total": 13631488,
+      "tokens/train_per_sec_per_gpu": 3426.67,
+      "tokens/trainable": 1438808
+    },
+    {
+      "epoch": 0.3343949044585987,
+      "grad_norm": 0.404296875,
+      "learning_rate": 3.3121019108280256e-05,
+      "loss": 0.0209305789321661,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02115,
+      "step": 105,
+      "tokens/total": 13762560,
+      "tokens/train_per_sec_per_gpu": 3761.41,
+      "tokens/trainable": 1454488
+    },
+    {
+      "epoch": 0.3375796178343949,
+      "grad_norm": 0.5859375,
+      "learning_rate": 3.343949044585987e-05,
+      "loss": 0.023175280541181564,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02345,
+      "step": 106,
+      "tokens/total": 13893632,
+      "tokens/train_per_sec_per_gpu": 3065.06,
+      "tokens/trainable": 1467330
+    },
+    {
+      "epoch": 0.34076433121019106,
+      "grad_norm": 0.443359375,
+      "learning_rate": 3.375796178343949e-05,
+      "loss": 0.022064058110117912,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02231,
+      "step": 107,
+      "tokens/total": 14024704,
+      "tokens/train_per_sec_per_gpu": 3306.24,
+      "tokens/trainable": 1481142
+    },
+    {
+      "epoch": 0.34394904458598724,
+      "grad_norm": 0.490234375,
+      "learning_rate": 3.407643312101911e-05,
+      "loss": 0.0202829297631979,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02049,
+      "step": 108,
+      "tokens/total": 14155776,
+      "tokens/train_per_sec_per_gpu": 3532.79,
+      "tokens/trainable": 1495947
+    },
+    {
+      "epoch": 0.3471337579617834,
+      "grad_norm": 0.4453125,
+      "learning_rate": 3.4394904458598724e-05,
+      "loss": 0.01804858073592186,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01821,
+      "step": 109,
+      "tokens/total": 14286848,
+      "tokens/train_per_sec_per_gpu": 3518.95,
+      "tokens/trainable": 1510694
+    },
+    {
+      "epoch": 0.3503184713375796,
+      "grad_norm": 0.42578125,
+      "learning_rate": 3.4713375796178346e-05,
+      "loss": 0.0210330281406641,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02126,
+      "step": 110,
+      "tokens/total": 14417920,
+      "tokens/train_per_sec_per_gpu": 2978.43,
+      "tokens/trainable": 1523251
+    },
+    {
+      "epoch": 0.3535031847133758,
+      "grad_norm": 0.427734375,
+      "learning_rate": 3.503184713375796e-05,
+      "loss": 0.026296302676200867,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02665,
+      "step": 111,
+      "tokens/total": 14548992,
+      "tokens/train_per_sec_per_gpu": 3167.03,
+      "tokens/trainable": 1536545
+    },
+    {
+      "epoch": 0.35668789808917195,
+      "grad_norm": 0.5234375,
+      "learning_rate": 3.535031847133758e-05,
+      "loss": 0.020682599395513535,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0209,
+      "step": 112,
+      "tokens/total": 14680064,
+      "tokens/train_per_sec_per_gpu": 3285.72,
+      "tokens/trainable": 1550307
+    },
+    {
+      "epoch": 0.35987261146496813,
+      "grad_norm": 0.53125,
+      "learning_rate": 3.56687898089172e-05,
+      "loss": 0.018929051235318184,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01911,
+      "step": 113,
+      "tokens/total": 14811136,
+      "tokens/train_per_sec_per_gpu": 3527.93,
+      "tokens/trainable": 1565056
+    },
+    {
+      "epoch": 0.3630573248407643,
+      "grad_norm": 0.453125,
+      "learning_rate": 3.5987261146496815e-05,
+      "loss": 0.02578428015112877,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02612,
+      "step": 114,
+      "tokens/total": 14942208,
+      "tokens/train_per_sec_per_gpu": 3195.27,
+      "tokens/trainable": 1578471
+    },
+    {
+      "epoch": 0.3662420382165605,
+      "grad_norm": 0.54296875,
+      "learning_rate": 3.630573248407643e-05,
+      "loss": 0.02062690444290638,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02084,
+      "step": 115,
+      "tokens/total": 15073280,
+      "tokens/train_per_sec_per_gpu": 3476.31,
+      "tokens/trainable": 1593028
+    },
+    {
+      "epoch": 0.36942675159235666,
+      "grad_norm": 0.5546875,
+      "learning_rate": 3.662420382165605e-05,
+      "loss": 0.018274614587426186,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01844,
+      "step": 116,
+      "tokens/total": 15204352,
+      "tokens/train_per_sec_per_gpu": 3437.34,
+      "tokens/trainable": 1607412
+    },
+    {
+      "epoch": 0.37261146496815284,
+      "grad_norm": 0.3359375,
+      "learning_rate": 3.694267515923567e-05,
+      "loss": 0.02159012109041214,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02182,
+      "step": 117,
+      "tokens/total": 15335424,
+      "tokens/train_per_sec_per_gpu": 3467.82,
+      "tokens/trainable": 1621934
+    },
+    {
+      "epoch": 0.37579617834394907,
+      "grad_norm": 0.4609375,
+      "learning_rate": 3.7261146496815283e-05,
+      "loss": 0.0239134319126606,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0242,
+      "step": 118,
+      "tokens/total": 15466496,
+      "tokens/train_per_sec_per_gpu": 3526.82,
+      "tokens/trainable": 1636693
+    },
+    {
+      "epoch": 0.37898089171974525,
+      "grad_norm": 0.546875,
+      "learning_rate": 3.7579617834394906e-05,
+      "loss": 0.021818162873387337,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02206,
+      "step": 119,
+      "tokens/total": 15597568,
+      "tokens/train_per_sec_per_gpu": 3233.2,
+      "tokens/trainable": 1650256
+    },
+    {
+      "epoch": 0.3821656050955414,
+      "grad_norm": 0.3671875,
+      "learning_rate": 3.789808917197453e-05,
+      "loss": 0.023171117529273033,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02344,
+      "step": 120,
+      "tokens/total": 15728640,
+      "tokens/train_per_sec_per_gpu": 3502.77,
+      "tokens/trainable": 1664915
+    },
+    {
+      "epoch": 0.3853503184713376,
+      "grad_norm": 0.408203125,
+      "learning_rate": 3.821656050955414e-05,
+      "loss": 0.019905205816030502,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0201,
+      "step": 121,
+      "tokens/total": 15859712,
+      "tokens/train_per_sec_per_gpu": 3495.55,
+      "tokens/trainable": 1679527
+    },
+    {
+      "epoch": 0.3885350318471338,
+      "grad_norm": 0.4765625,
+      "learning_rate": 3.8535031847133766e-05,
+      "loss": 0.01511327363550663,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01523,
+      "step": 122,
+      "tokens/total": 15990784,
+      "tokens/train_per_sec_per_gpu": 3507.46,
+      "tokens/trainable": 1694159
+    },
+    {
+      "epoch": 0.39171974522292996,
+      "grad_norm": 0.44921875,
+      "learning_rate": 3.885350318471338e-05,
+      "loss": 0.02048143371939659,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02069,
+      "step": 123,
+      "tokens/total": 16121856,
+      "tokens/train_per_sec_per_gpu": 3490.03,
+      "tokens/trainable": 1708712
+    },
+    {
+      "epoch": 0.39490445859872614,
+      "grad_norm": 0.392578125,
+      "learning_rate": 3.9171974522292996e-05,
+      "loss": 0.02280033566057682,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02306,
+      "step": 124,
+      "tokens/total": 16252928,
+      "tokens/train_per_sec_per_gpu": 3443.73,
+      "tokens/trainable": 1723059
+    },
+    {
+      "epoch": 0.3980891719745223,
+      "grad_norm": 0.322265625,
+      "learning_rate": 3.949044585987262e-05,
+      "loss": 0.01703651435673237,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01718,
+      "step": 125,
+      "tokens/total": 16384000,
+      "tokens/train_per_sec_per_gpu": 3396.46,
+      "tokens/trainable": 1737268
+    },
+    {
+      "epoch": 0.4012738853503185,
+      "grad_norm": 0.37109375,
+      "learning_rate": 3.9808917197452234e-05,
+      "loss": 0.019548913463950157,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01974,
+      "step": 126,
+      "tokens/total": 16515072,
+      "tokens/train_per_sec_per_gpu": 3220.09,
+      "tokens/trainable": 1750779
+    },
+    {
+      "epoch": 0.40445859872611467,
+      "grad_norm": 0.4609375,
+      "learning_rate": 4.012738853503185e-05,
+      "loss": 0.021433480083942413,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02166,
+      "step": 127,
+      "tokens/total": 16646144,
+      "tokens/train_per_sec_per_gpu": 3135.99,
+      "tokens/trainable": 1763916
+    },
+    {
+      "epoch": 0.40764331210191085,
+      "grad_norm": 0.36328125,
+      "learning_rate": 4.044585987261147e-05,
+      "loss": 0.01608860120177269,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01622,
+      "step": 128,
+      "tokens/total": 16777216,
+      "tokens/train_per_sec_per_gpu": 3294.85,
+      "tokens/trainable": 1777688
+    },
+    {
+      "epoch": 0.410828025477707,
+      "grad_norm": 0.384765625,
+      "learning_rate": 4.076433121019109e-05,
+      "loss": 0.02616111747920513,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02651,
+      "step": 129,
+      "tokens/total": 16908288,
+      "tokens/train_per_sec_per_gpu": 3542.44,
+      "tokens/trainable": 1792526
+    },
+    {
+      "epoch": 0.4140127388535032,
+      "grad_norm": 0.359375,
+      "learning_rate": 4.10828025477707e-05,
+      "loss": 0.023339644074440002,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02361,
+      "step": 130,
+      "tokens/total": 17039360,
+      "tokens/train_per_sec_per_gpu": 3579.44,
+      "tokens/trainable": 1807456
+    },
+    {
+      "epoch": 0.4171974522292994,
+      "grad_norm": 0.396484375,
+      "learning_rate": 4.1401273885350325e-05,
+      "loss": 0.01703963428735733,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01719,
+      "step": 131,
+      "tokens/total": 17170432,
+      "tokens/train_per_sec_per_gpu": 3374.03,
+      "tokens/trainable": 1821617
+    },
+    {
+      "epoch": 0.42038216560509556,
+      "grad_norm": 0.322265625,
+      "learning_rate": 4.171974522292994e-05,
+      "loss": 0.018855011090636253,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01903,
+      "step": 132,
+      "tokens/total": 17301504,
+      "tokens/train_per_sec_per_gpu": 3358.78,
+      "tokens/trainable": 1835657
+    },
+    {
+      "epoch": 0.42356687898089174,
+      "grad_norm": 0.32421875,
+      "learning_rate": 4.2038216560509556e-05,
+      "loss": 0.018383294343948364,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01855,
+      "step": 133,
+      "tokens/total": 17432576,
+      "tokens/train_per_sec_per_gpu": 3288.93,
+      "tokens/trainable": 1849363
+    },
+    {
+      "epoch": 0.4267515923566879,
+      "grad_norm": 0.341796875,
+      "learning_rate": 4.235668789808918e-05,
+      "loss": 0.018167613074183464,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01833,
+      "step": 134,
+      "tokens/total": 17563648,
+      "tokens/train_per_sec_per_gpu": 3327.75,
+      "tokens/trainable": 1863304
+    },
+    {
+      "epoch": 0.4299363057324841,
+      "grad_norm": 0.263671875,
+      "learning_rate": 4.267515923566879e-05,
+      "loss": 0.016551347449421883,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01669,
+      "step": 135,
+      "tokens/total": 17694720,
+      "tokens/train_per_sec_per_gpu": 3278.66,
+      "tokens/trainable": 1877019
+    },
+    {
+      "epoch": 0.43312101910828027,
+      "grad_norm": 0.3359375,
+      "learning_rate": 4.299363057324841e-05,
+      "loss": 0.02233925275504589,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02259,
+      "step": 136,
+      "tokens/total": 17825792,
+      "tokens/train_per_sec_per_gpu": 3065.32,
+      "tokens/trainable": 1889991
+    },
+    {
+      "epoch": 0.43630573248407645,
+      "grad_norm": 0.35546875,
+      "learning_rate": 4.331210191082803e-05,
+      "loss": 0.01874961145222187,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01893,
+      "step": 137,
+      "tokens/total": 17956864,
+      "tokens/train_per_sec_per_gpu": 3421.98,
+      "tokens/trainable": 1904258
+    },
+    {
+      "epoch": 0.4394904458598726,
+      "grad_norm": 0.35546875,
+      "learning_rate": 4.3630573248407646e-05,
+      "loss": 0.016853082925081253,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.017,
+      "step": 138,
+      "tokens/total": 18087936,
+      "tokens/train_per_sec_per_gpu": 3173.54,
+      "tokens/trainable": 1917589
+    },
+    {
+      "epoch": 0.4426751592356688,
+      "grad_norm": 0.373046875,
+      "learning_rate": 4.394904458598726e-05,
+      "loss": 0.015192901715636253,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01531,
+      "step": 139,
+      "tokens/total": 18219008,
+      "tokens/train_per_sec_per_gpu": 2954.41,
+      "tokens/trainable": 1930014
+    },
+    {
+      "epoch": 0.445859872611465,
+      "grad_norm": 0.302734375,
+      "learning_rate": 4.4267515923566884e-05,
+      "loss": 0.01463925652205944,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01475,
+      "step": 140,
+      "tokens/total": 18350080,
+      "tokens/train_per_sec_per_gpu": 3666.98,
+      "tokens/trainable": 1945307
+    },
+    {
+      "epoch": 0.44904458598726116,
+      "grad_norm": 0.390625,
+      "learning_rate": 4.45859872611465e-05,
+      "loss": 0.020933344960212708,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.02115,
+      "step": 141,
+      "tokens/total": 18481152,
+      "tokens/train_per_sec_per_gpu": 3580.73,
+      "tokens/trainable": 1960244
+    },
+    {
+      "epoch": 0.45222929936305734,
+      "grad_norm": 0.345703125,
+      "learning_rate": 4.4904458598726115e-05,
+      "loss": 0.016706032678484917,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01685,
+      "step": 142,
+      "tokens/total": 18612224,
+      "tokens/train_per_sec_per_gpu": 3692.46,
+      "tokens/trainable": 1975680
+    },
+    {
+      "epoch": 0.4554140127388535,
+      "grad_norm": 0.271484375,
+      "learning_rate": 4.522292993630574e-05,
+      "loss": 0.0143811646848917,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01449,
+      "step": 143,
+      "tokens/total": 18743296,
+      "tokens/train_per_sec_per_gpu": 3610.19,
+      "tokens/trainable": 1990745
+    },
+    {
+      "epoch": 0.4585987261146497,
+      "grad_norm": 0.333984375,
+      "learning_rate": 4.554140127388535e-05,
+      "loss": 0.015790347009897232,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01592,
+      "step": 144,
+      "tokens/total": 18874368,
+      "tokens/train_per_sec_per_gpu": 3290.56,
+      "tokens/trainable": 2004531
+    },
+    {
+      "epoch": 0.46178343949044587,
+      "grad_norm": 0.251953125,
+      "learning_rate": 4.585987261146497e-05,
+      "loss": 0.013354619033634663,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01344,
+      "step": 145,
+      "tokens/total": 19005440,
+      "tokens/train_per_sec_per_gpu": 3241.55,
+      "tokens/trainable": 2018101
+    },
+    {
+      "epoch": 0.46496815286624205,
+      "grad_norm": 0.376953125,
+      "learning_rate": 4.617834394904459e-05,
+      "loss": 0.01745392382144928,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01761,
+      "step": 146,
+      "tokens/total": 19136512,
+      "tokens/train_per_sec_per_gpu": 3409.89,
+      "tokens/trainable": 2032310
+    },
+    {
+      "epoch": 0.4681528662420382,
+      "grad_norm": 0.38671875,
+      "learning_rate": 4.6496815286624206e-05,
+      "loss": 0.015100197866559029,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01521,
+      "step": 147,
+      "tokens/total": 19267584,
+      "tokens/train_per_sec_per_gpu": 3269.82,
+      "tokens/trainable": 2045999
+    },
+    {
+      "epoch": 0.4713375796178344,
+      "grad_norm": 0.310546875,
+      "learning_rate": 4.681528662420383e-05,
+      "loss": 0.01744706742465496,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0176,
+      "step": 148,
+      "tokens/total": 19398656,
+      "tokens/train_per_sec_per_gpu": 3709.08,
+      "tokens/trainable": 2061453
+    },
+    {
+      "epoch": 0.4745222929936306,
+      "grad_norm": 0.283203125,
+      "learning_rate": 4.713375796178344e-05,
+      "loss": 0.013093837536871433,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01318,
+      "step": 149,
+      "tokens/total": 19529728,
+      "tokens/train_per_sec_per_gpu": 3292.43,
+      "tokens/trainable": 2075180
+    },
+    {
+      "epoch": 0.47770700636942676,
+      "grad_norm": 0.275390625,
+      "learning_rate": 4.745222929936306e-05,
+      "loss": 0.01639549434185028,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01653,
+      "step": 150,
+      "tokens/total": 19660800,
+      "tokens/train_per_sec_per_gpu": 3175.73,
+      "tokens/trainable": 2088491
+    },
+    {
+      "epoch": 0.48089171974522293,
+      "grad_norm": 0.31640625,
+      "learning_rate": 4.777070063694268e-05,
+      "loss": 0.015184286050498486,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0153,
+      "step": 151,
+      "tokens/total": 19791872,
+      "tokens/train_per_sec_per_gpu": 3611.48,
+      "tokens/trainable": 2103581
+    },
+    {
+      "epoch": 0.4840764331210191,
+      "grad_norm": 0.318359375,
+      "learning_rate": 4.8089171974522296e-05,
+      "loss": 0.015232382342219353,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01535,
+      "step": 152,
+      "tokens/total": 19922944,
+      "tokens/train_per_sec_per_gpu": 3138.84,
+      "tokens/trainable": 2116743
+    },
+    {
+      "epoch": 0.4872611464968153,
+      "grad_norm": 0.4140625,
+      "learning_rate": 4.840764331210191e-05,
+      "loss": 0.018071118742227554,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01824,
+      "step": 153,
+      "tokens/total": 20054016,
+      "tokens/train_per_sec_per_gpu": 2935.94,
+      "tokens/trainable": 2129049
+    },
+    {
+      "epoch": 0.49044585987261147,
+      "grad_norm": 0.26953125,
+      "learning_rate": 4.8726114649681534e-05,
+      "loss": 0.015034169889986515,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01515,
+      "step": 154,
+      "tokens/total": 20185088,
+      "tokens/train_per_sec_per_gpu": 3956.78,
+      "tokens/trainable": 2145499
+    },
+    {
+      "epoch": 0.49363057324840764,
+      "grad_norm": 0.2734375,
+      "learning_rate": 4.904458598726115e-05,
+      "loss": 0.013894051313400269,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01399,
+      "step": 155,
+      "tokens/total": 20316160,
+      "tokens/train_per_sec_per_gpu": 3559.16,
+      "tokens/trainable": 2160294
+    },
+    {
+      "epoch": 0.4968152866242038,
+      "grad_norm": 0.29296875,
+      "learning_rate": 4.9363057324840765e-05,
+      "loss": 0.01629924215376377,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01643,
+      "step": 156,
+      "tokens/total": 20447232,
+      "tokens/train_per_sec_per_gpu": 3108.59,
+      "tokens/trainable": 2173313
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.34375,
+      "learning_rate": 4.968152866242039e-05,
+      "loss": 0.014140879735350609,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01424,
+      "step": 157,
+      "tokens/total": 20578304,
+      "tokens/train_per_sec_per_gpu": 3311.55,
+      "tokens/trainable": 2187121
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 0.0162150077521801,
+      "eval_ppl": 1.01635,
+      "eval_runtime": 42.1529,
+      "eval_samples_per_second": 64.076,
+      "eval_steps_per_second": 4.009,
+      "memory/device_reserved (GiB)": 68.88,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 157
+    },
+    {
+      "epoch": 0.5031847133757962,
+      "grad_norm": 0.255859375,
+      "learning_rate": 5e-05,
+      "loss": 0.012421849183738232,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0125,
+      "step": 158,
+      "tokens/total": 20709376,
+      "tokens/train_per_sec_per_gpu": 3796.22,
+      "tokens/trainable": 2202882
+    },
+    {
+      "epoch": 0.5063694267515924,
+      "grad_norm": 0.298828125,
+      "learning_rate": 4.999993820899543e-05,
+      "loss": 0.014737301506102085,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01485,
+      "step": 159,
+      "tokens/total": 20840448,
+      "tokens/train_per_sec_per_gpu": 2912.87,
+      "tokens/trainable": 2215142
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "grad_norm": 0.3828125,
+      "learning_rate": 4.999975283628719e-05,
+      "loss": 0.017280632629990578,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01743,
+      "step": 160,
+      "tokens/total": 20971520,
+      "tokens/train_per_sec_per_gpu": 2864.73,
+      "tokens/trainable": 2227241
+    },
+    {
+      "epoch": 0.5127388535031847,
+      "grad_norm": 0.30078125,
+      "learning_rate": 4.999944388279162e-05,
+      "loss": 0.014671262353658676,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01478,
+      "step": 161,
+      "tokens/total": 21102592,
+      "tokens/train_per_sec_per_gpu": 3598.96,
+      "tokens/trainable": 2242266
+    },
+    {
+      "epoch": 0.5159235668789809,
+      "grad_norm": 0.357421875,
+      "learning_rate": 4.999901135003596e-05,
+      "loss": 0.01328805461525917,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01338,
+      "step": 162,
+      "tokens/total": 21233664,
+      "tokens/train_per_sec_per_gpu": 3491.44,
+      "tokens/trainable": 2256820
+    },
+    {
+      "epoch": 0.5191082802547771,
+      "grad_norm": 0.294921875,
+      "learning_rate": 4.9998455240158346e-05,
+      "loss": 0.015039588324725628,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01515,
+      "step": 163,
+      "tokens/total": 21364736,
+      "tokens/train_per_sec_per_gpu": 2929.21,
+      "tokens/trainable": 2269119
+    },
+    {
+      "epoch": 0.5222929936305732,
+      "grad_norm": 0.3203125,
+      "learning_rate": 4.999777555590779e-05,
+      "loss": 0.014336930587887764,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01444,
+      "step": 164,
+      "tokens/total": 21495808,
+      "tokens/train_per_sec_per_gpu": 3728.34,
+      "tokens/trainable": 2284700
+    },
+    {
+      "epoch": 0.5254777070063694,
+      "grad_norm": 0.279296875,
+      "learning_rate": 4.999697230064414e-05,
+      "loss": 0.01668444462120533,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01682,
+      "step": 165,
+      "tokens/total": 21626880,
+      "tokens/train_per_sec_per_gpu": 3542.38,
+      "tokens/trainable": 2299523
+    },
+    {
+      "epoch": 0.5286624203821656,
+      "grad_norm": 0.275390625,
+      "learning_rate": 4.999604547833814e-05,
+      "loss": 0.01559534203261137,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01572,
+      "step": 166,
+      "tokens/total": 21757952,
+      "tokens/train_per_sec_per_gpu": 3348.3,
+      "tokens/trainable": 2313539
+    },
+    {
+      "epoch": 0.5318471337579618,
+      "grad_norm": 0.251953125,
+      "learning_rate": 4.9994995093571314e-05,
+      "loss": 0.01181457843631506,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01188,
+      "step": 167,
+      "tokens/total": 21889024,
+      "tokens/train_per_sec_per_gpu": 3193.91,
+      "tokens/trainable": 2326972
+    },
+    {
+      "epoch": 0.535031847133758,
+      "grad_norm": 0.326171875,
+      "learning_rate": 4.9993821151536024e-05,
+      "loss": 0.014408236369490623,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01451,
+      "step": 168,
+      "tokens/total": 22020096,
+      "tokens/train_per_sec_per_gpu": 3172.43,
+      "tokens/trainable": 2340305
+    },
+    {
+      "epoch": 0.5382165605095541,
+      "grad_norm": 0.259765625,
+      "learning_rate": 4.9992523658035376e-05,
+      "loss": 0.010526357218623161,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01058,
+      "step": 169,
+      "tokens/total": 22151168,
+      "tokens/train_per_sec_per_gpu": 3477.3,
+      "tokens/trainable": 2354865
+    },
+    {
+      "epoch": 0.5414012738853503,
+      "grad_norm": 0.2734375,
+      "learning_rate": 4.9991102619483254e-05,
+      "loss": 0.015866123139858246,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01599,
+      "step": 170,
+      "tokens/total": 22282240,
+      "tokens/train_per_sec_per_gpu": 3352.2,
+      "tokens/trainable": 2368942
+    },
+    {
+      "epoch": 0.5445859872611465,
+      "grad_norm": 0.34765625,
+      "learning_rate": 4.998955804290425e-05,
+      "loss": 0.015990689396858215,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01612,
+      "step": 171,
+      "tokens/total": 22413312,
+      "tokens/train_per_sec_per_gpu": 3329.47,
+      "tokens/trainable": 2382903
+    },
+    {
+      "epoch": 0.5477707006369427,
+      "grad_norm": 0.294921875,
+      "learning_rate": 4.998788993593364e-05,
+      "loss": 0.012892219237983227,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01298,
+      "step": 172,
+      "tokens/total": 22544384,
+      "tokens/train_per_sec_per_gpu": 3491.15,
+      "tokens/trainable": 2397472
+    },
+    {
+      "epoch": 0.5509554140127388,
+      "grad_norm": 0.326171875,
+      "learning_rate": 4.998609830681734e-05,
+      "loss": 0.016418559476733208,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01655,
+      "step": 173,
+      "tokens/total": 22675456,
+      "tokens/train_per_sec_per_gpu": 3177.44,
+      "tokens/trainable": 2410837
+    },
+    {
+      "epoch": 0.554140127388535,
+      "grad_norm": 0.275390625,
+      "learning_rate": 4.998418316441188e-05,
+      "loss": 0.0159194003790617,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01605,
+      "step": 174,
+      "tokens/total": 22806528,
+      "tokens/train_per_sec_per_gpu": 3252.23,
+      "tokens/trainable": 2424499
+    },
+    {
+      "epoch": 0.5573248407643312,
+      "grad_norm": 0.255859375,
+      "learning_rate": 4.998214451818434e-05,
+      "loss": 0.017272397875785828,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01742,
+      "step": 175,
+      "tokens/total": 22937600,
+      "tokens/train_per_sec_per_gpu": 3335.6,
+      "tokens/trainable": 2438525
+    },
+    {
+      "epoch": 0.5605095541401274,
+      "grad_norm": 0.3671875,
+      "learning_rate": 4.997998237821233e-05,
+      "loss": 0.018668157979846,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01884,
+      "step": 176,
+      "tokens/total": 23068672,
+      "tokens/train_per_sec_per_gpu": 3037.8,
+      "tokens/trainable": 2451344
+    },
+    {
+      "epoch": 0.5636942675159236,
+      "grad_norm": 0.275390625,
+      "learning_rate": 4.99776967551839e-05,
+      "loss": 0.013892064802348614,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01399,
+      "step": 177,
+      "tokens/total": 23199744,
+      "tokens/train_per_sec_per_gpu": 3608.18,
+      "tokens/trainable": 2466462
+    },
+    {
+      "epoch": 0.5668789808917197,
+      "grad_norm": 0.318359375,
+      "learning_rate": 4.997528766039754e-05,
+      "loss": 0.018128130584955215,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01829,
+      "step": 178,
+      "tokens/total": 23330816,
+      "tokens/train_per_sec_per_gpu": 3418.17,
+      "tokens/trainable": 2480794
+    },
+    {
+      "epoch": 0.5700636942675159,
+      "grad_norm": 0.279296875,
+      "learning_rate": 4.997275510576207e-05,
+      "loss": 0.015599234029650688,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01572,
+      "step": 179,
+      "tokens/total": 23461888,
+      "tokens/train_per_sec_per_gpu": 3348.28,
+      "tokens/trainable": 2494826
+    },
+    {
+      "epoch": 0.5732484076433121,
+      "grad_norm": 0.263671875,
+      "learning_rate": 4.9970099103796625e-05,
+      "loss": 0.01772911660373211,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01789,
+      "step": 180,
+      "tokens/total": 23592960,
+      "tokens/train_per_sec_per_gpu": 3350.41,
+      "tokens/trainable": 2508825
+    },
+    {
+      "epoch": 0.5764331210191083,
+      "grad_norm": 0.3046875,
+      "learning_rate": 4.9967319667630567e-05,
+      "loss": 0.017531519755721092,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01769,
+      "step": 181,
+      "tokens/total": 23724032,
+      "tokens/train_per_sec_per_gpu": 3416.59,
+      "tokens/trainable": 2523152
+    },
+    {
+      "epoch": 0.5796178343949044,
+      "grad_norm": 0.25390625,
+      "learning_rate": 4.9964416811003414e-05,
+      "loss": 0.01645725592970848,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01659,
+      "step": 182,
+      "tokens/total": 23855104,
+      "tokens/train_per_sec_per_gpu": 3286.33,
+      "tokens/trainable": 2536956
+    },
+    {
+      "epoch": 0.5828025477707006,
+      "grad_norm": 0.298828125,
+      "learning_rate": 4.996139054826482e-05,
+      "loss": 0.017507638782262802,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01766,
+      "step": 183,
+      "tokens/total": 23986176,
+      "tokens/train_per_sec_per_gpu": 3802.12,
+      "tokens/trainable": 2552813
+    },
+    {
+      "epoch": 0.5859872611464968,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 4.9958240894374433e-05,
+      "loss": 0.015289016999304295,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01541,
+      "step": 184,
+      "tokens/total": 24117248,
+      "tokens/train_per_sec_per_gpu": 3166.13,
+      "tokens/trainable": 2566093
+    },
+    {
+      "epoch": 0.589171974522293,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 4.995496786490189e-05,
+      "loss": 0.01385944988578558,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01396,
+      "step": 185,
+      "tokens/total": 24248320,
+      "tokens/train_per_sec_per_gpu": 3395.23,
+      "tokens/trainable": 2580324
+    },
+    {
+      "epoch": 0.5923566878980892,
+      "grad_norm": 0.28515625,
+      "learning_rate": 4.995157147602669e-05,
+      "loss": 0.01804269105195999,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01821,
+      "step": 186,
+      "tokens/total": 24379392,
+      "tokens/train_per_sec_per_gpu": 3278.99,
+      "tokens/trainable": 2594113
+    },
+    {
+      "epoch": 0.5955414012738853,
+      "grad_norm": 0.3359375,
+      "learning_rate": 4.994805174453813e-05,
+      "loss": 0.01675378903746605,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01689,
+      "step": 187,
+      "tokens/total": 24510464,
+      "tokens/train_per_sec_per_gpu": 3247.25,
+      "tokens/trainable": 2607778
+    },
+    {
+      "epoch": 0.5987261146496815,
+      "grad_norm": 0.2578125,
+      "learning_rate": 4.994440868783522e-05,
+      "loss": 0.014898994006216526,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01501,
+      "step": 188,
+      "tokens/total": 24641536,
+      "tokens/train_per_sec_per_gpu": 3439.86,
+      "tokens/trainable": 2622165
+    },
+    {
+      "epoch": 0.6019108280254777,
+      "grad_norm": 0.236328125,
+      "learning_rate": 4.994064232392664e-05,
+      "loss": 0.012711770832538605,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01279,
+      "step": 189,
+      "tokens/total": 24772608,
+      "tokens/train_per_sec_per_gpu": 3250.18,
+      "tokens/trainable": 2635842
+    },
+    {
+      "epoch": 0.6050955414012739,
+      "grad_norm": 0.201171875,
+      "learning_rate": 4.993675267143056e-05,
+      "loss": 0.0118938647210598,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01196,
+      "step": 190,
+      "tokens/total": 24903680,
+      "tokens/train_per_sec_per_gpu": 3684.73,
+      "tokens/trainable": 2651259
+    },
+    {
+      "epoch": 0.60828025477707,
+      "grad_norm": 0.265625,
+      "learning_rate": 4.993273974957463e-05,
+      "loss": 0.011486702598631382,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01155,
+      "step": 191,
+      "tokens/total": 25034752,
+      "tokens/train_per_sec_per_gpu": 3177.1,
+      "tokens/trainable": 2664566
+    },
+    {
+      "epoch": 0.6114649681528662,
+      "grad_norm": 0.23046875,
+      "learning_rate": 4.992860357819584e-05,
+      "loss": 0.012811151333153248,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01289,
+      "step": 192,
+      "tokens/total": 25165824,
+      "tokens/train_per_sec_per_gpu": 3414.45,
+      "tokens/trainable": 2678871
+    },
+    {
+      "epoch": 0.6146496815286624,
+      "grad_norm": 0.30078125,
+      "learning_rate": 4.992434417774045e-05,
+      "loss": 0.011826693080365658,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0119,
+      "step": 193,
+      "tokens/total": 25296896,
+      "tokens/train_per_sec_per_gpu": 3298.7,
+      "tokens/trainable": 2692737
+    },
+    {
+      "epoch": 0.6178343949044586,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 4.991996156926387e-05,
+      "loss": 0.01326029933989048,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01335,
+      "step": 194,
+      "tokens/total": 25427968,
+      "tokens/train_per_sec_per_gpu": 3122.96,
+      "tokens/trainable": 2705928
+    },
+    {
+      "epoch": 0.6210191082802548,
+      "grad_norm": 0.2890625,
+      "learning_rate": 4.991545577443057e-05,
+      "loss": 0.012153583578765392,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01223,
+      "step": 195,
+      "tokens/total": 25559040,
+      "tokens/train_per_sec_per_gpu": 3089.07,
+      "tokens/trainable": 2718915
+    },
+    {
+      "epoch": 0.6242038216560509,
+      "grad_norm": 0.296875,
+      "learning_rate": 4.991082681551396e-05,
+      "loss": 0.014371933415532112,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01448,
+      "step": 196,
+      "tokens/total": 25690112,
+      "tokens/train_per_sec_per_gpu": 3197.51,
+      "tokens/trainable": 2732376
+    },
+    {
+      "epoch": 0.6273885350318471,
+      "grad_norm": 0.26953125,
+      "learning_rate": 4.990607471539626e-05,
+      "loss": 0.012046409770846367,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01212,
+      "step": 197,
+      "tokens/total": 25821184,
+      "tokens/train_per_sec_per_gpu": 3374.92,
+      "tokens/trainable": 2746546
+    },
+    {
+      "epoch": 0.6305732484076433,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 4.990119949756845e-05,
+      "loss": 0.009664296172559261,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00971,
+      "step": 198,
+      "tokens/total": 25952256,
+      "tokens/train_per_sec_per_gpu": 3569.8,
+      "tokens/trainable": 2761477
+    },
+    {
+      "epoch": 0.6337579617834395,
+      "grad_norm": 0.279296875,
+      "learning_rate": 4.989620118613009e-05,
+      "loss": 0.00950827170163393,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00955,
+      "step": 199,
+      "tokens/total": 26083328,
+      "tokens/train_per_sec_per_gpu": 3265.27,
+      "tokens/trainable": 2775167
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "grad_norm": 0.310546875,
+      "learning_rate": 4.989107980578924e-05,
+      "loss": 0.01698843576014042,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01713,
+      "step": 200,
+      "tokens/total": 26214400,
+      "tokens/train_per_sec_per_gpu": 3262.25,
+      "tokens/trainable": 2788865
+    },
+    {
+      "epoch": 0.6401273885350318,
+      "grad_norm": 0.248046875,
+      "learning_rate": 4.9885835381862326e-05,
+      "loss": 0.009720825590193272,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00977,
+      "step": 201,
+      "tokens/total": 26345472,
+      "tokens/train_per_sec_per_gpu": 3459.38,
+      "tokens/trainable": 2803380
+    },
+    {
+      "epoch": 0.643312101910828,
+      "grad_norm": 0.30859375,
+      "learning_rate": 4.988046794027399e-05,
+      "loss": 0.01347583532333374,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01357,
+      "step": 202,
+      "tokens/total": 26476544,
+      "tokens/train_per_sec_per_gpu": 3450.44,
+      "tokens/trainable": 2817829
+    },
+    {
+      "epoch": 0.6464968152866242,
+      "grad_norm": 0.2890625,
+      "learning_rate": 4.987497750755702e-05,
+      "loss": 0.014860209077596664,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01497,
+      "step": 203,
+      "tokens/total": 26607616,
+      "tokens/train_per_sec_per_gpu": 3450.98,
+      "tokens/trainable": 2832277
+    },
+    {
+      "epoch": 0.6496815286624203,
+      "grad_norm": 0.31640625,
+      "learning_rate": 4.986936411085214e-05,
+      "loss": 0.016120830550789833,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01625,
+      "step": 204,
+      "tokens/total": 26738688,
+      "tokens/train_per_sec_per_gpu": 3184.35,
+      "tokens/trainable": 2845614
+    },
+    {
+      "epoch": 0.6528662420382165,
+      "grad_norm": 0.2578125,
+      "learning_rate": 4.986362777790796e-05,
+      "loss": 0.01890011504292488,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01908,
+      "step": 205,
+      "tokens/total": 26869760,
+      "tokens/train_per_sec_per_gpu": 3386.54,
+      "tokens/trainable": 2859717
+    },
+    {
+      "epoch": 0.6560509554140127,
+      "grad_norm": 0.333984375,
+      "learning_rate": 4.9857768537080784e-05,
+      "loss": 0.014317265711724758,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01442,
+      "step": 206,
+      "tokens/total": 27000832,
+      "tokens/train_per_sec_per_gpu": 3426.69,
+      "tokens/trainable": 2874068
+    },
+    {
+      "epoch": 0.6592356687898089,
+      "grad_norm": 0.31640625,
+      "learning_rate": 4.9851786417334466e-05,
+      "loss": 0.013661851175129414,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01376,
+      "step": 207,
+      "tokens/total": 27131904,
+      "tokens/train_per_sec_per_gpu": 3324.85,
+      "tokens/trainable": 2887963
+    },
+    {
+      "epoch": 0.6624203821656051,
+      "grad_norm": 0.251953125,
+      "learning_rate": 4.984568144824032e-05,
+      "loss": 0.01245003379881382,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01253,
+      "step": 208,
+      "tokens/total": 27262976,
+      "tokens/train_per_sec_per_gpu": 3335.64,
+      "tokens/trainable": 2901885
+    },
+    {
+      "epoch": 0.6656050955414012,
+      "grad_norm": 0.265625,
+      "learning_rate": 4.983945365997691e-05,
+      "loss": 0.010308452881872654,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01036,
+      "step": 209,
+      "tokens/total": 27394048,
+      "tokens/train_per_sec_per_gpu": 2771.97,
+      "tokens/trainable": 2913512
+    },
+    {
+      "epoch": 0.6687898089171974,
+      "grad_norm": 0.234375,
+      "learning_rate": 4.9833103083329947e-05,
+      "loss": 0.013119550421833992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01321,
+      "step": 210,
+      "tokens/total": 27525120,
+      "tokens/train_per_sec_per_gpu": 3729.48,
+      "tokens/trainable": 2929046
+    },
+    {
+      "epoch": 0.6719745222929936,
+      "grad_norm": 0.259765625,
+      "learning_rate": 4.98266297496921e-05,
+      "loss": 0.01352207362651825,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01361,
+      "step": 211,
+      "tokens/total": 27656192,
+      "tokens/train_per_sec_per_gpu": 3277.56,
+      "tokens/trainable": 2942780
+    },
+    {
+      "epoch": 0.6751592356687898,
+      "grad_norm": 0.34765625,
+      "learning_rate": 4.982003369106287e-05,
+      "loss": 0.017431171610951424,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01758,
+      "step": 212,
+      "tokens/total": 27787264,
+      "tokens/train_per_sec_per_gpu": 3344.98,
+      "tokens/trainable": 2956783
+    },
+    {
+      "epoch": 0.678343949044586,
+      "grad_norm": 0.255859375,
+      "learning_rate": 4.981331494004845e-05,
+      "loss": 0.01397764589637518,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01408,
+      "step": 213,
+      "tokens/total": 27918336,
+      "tokens/train_per_sec_per_gpu": 3185.6,
+      "tokens/trainable": 2970117
+    },
+    {
+      "epoch": 0.6815286624203821,
+      "grad_norm": 0.30859375,
+      "learning_rate": 4.980647352986148e-05,
+      "loss": 0.014616122469305992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01472,
+      "step": 214,
+      "tokens/total": 28049408,
+      "tokens/train_per_sec_per_gpu": 3594.29,
+      "tokens/trainable": 2985083
+    },
+    {
+      "epoch": 0.6847133757961783,
+      "grad_norm": 0.34375,
+      "learning_rate": 4.979950949432098e-05,
+      "loss": 0.012630216777324677,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01271,
+      "step": 215,
+      "tokens/total": 28180480,
+      "tokens/train_per_sec_per_gpu": 3114.53,
+      "tokens/trainable": 2998164
+    },
+    {
+      "epoch": 0.6878980891719745,
+      "grad_norm": 0.369140625,
+      "learning_rate": 4.979242286785214e-05,
+      "loss": 0.01619878038764,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01633,
+      "step": 216,
+      "tokens/total": 28311552,
+      "tokens/train_per_sec_per_gpu": 3343.4,
+      "tokens/trainable": 3012168
+    },
+    {
+      "epoch": 0.6910828025477707,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 4.978521368548612e-05,
+      "loss": 0.00897720456123352,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00902,
+      "step": 217,
+      "tokens/total": 28442624,
+      "tokens/train_per_sec_per_gpu": 3292.3,
+      "tokens/trainable": 3025888
+    },
+    {
+      "epoch": 0.6942675159235668,
+      "grad_norm": 0.232421875,
+      "learning_rate": 4.977788198285995e-05,
+      "loss": 0.010021158494055271,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01007,
+      "step": 218,
+      "tokens/total": 28573696,
+      "tokens/train_per_sec_per_gpu": 3319.6,
+      "tokens/trainable": 3039763
+    },
+    {
+      "epoch": 0.697452229299363,
+      "grad_norm": 0.23828125,
+      "learning_rate": 4.9770427796216284e-05,
+      "loss": 0.01425202563405037,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01435,
+      "step": 219,
+      "tokens/total": 28704768,
+      "tokens/train_per_sec_per_gpu": 2847.77,
+      "tokens/trainable": 3051731
+    },
+    {
+      "epoch": 0.7006369426751592,
+      "grad_norm": 0.322265625,
+      "learning_rate": 4.976285116240326e-05,
+      "loss": 0.014778842218220234,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01489,
+      "step": 220,
+      "tokens/total": 28835840,
+      "tokens/train_per_sec_per_gpu": 3280.14,
+      "tokens/trainable": 3065475
+    },
+    {
+      "epoch": 0.7038216560509554,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 4.9755152118874294e-05,
+      "loss": 0.011257003992795944,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01132,
+      "step": 221,
+      "tokens/total": 28966912,
+      "tokens/train_per_sec_per_gpu": 3367.48,
+      "tokens/trainable": 3079510
+    },
+    {
+      "epoch": 0.7070063694267515,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 4.9747330703687914e-05,
+      "loss": 0.013675577938556671,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01377,
+      "step": 222,
+      "tokens/total": 29097984,
+      "tokens/train_per_sec_per_gpu": 3844.8,
+      "tokens/trainable": 3095524
+    },
+    {
+      "epoch": 0.7101910828025477,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 4.9739386955507587e-05,
+      "loss": 0.01433156430721283,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01443,
+      "step": 223,
+      "tokens/total": 29229056,
+      "tokens/train_per_sec_per_gpu": 3346.69,
+      "tokens/trainable": 3109543
+    },
+    {
+      "epoch": 0.7133757961783439,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 4.9731320913601474e-05,
+      "loss": 0.010345865972340107,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0104,
+      "step": 224,
+      "tokens/total": 29360128,
+      "tokens/train_per_sec_per_gpu": 3025.76,
+      "tokens/trainable": 3122229
+    },
+    {
+      "epoch": 0.7165605095541401,
+      "grad_norm": 0.2109375,
+      "learning_rate": 4.9723132617842284e-05,
+      "loss": 0.014529074542224407,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01464,
+      "step": 225,
+      "tokens/total": 29491200,
+      "tokens/train_per_sec_per_gpu": 3346.66,
+      "tokens/trainable": 3136235
+    },
+    {
+      "epoch": 0.7197452229299363,
+      "grad_norm": 0.263671875,
+      "learning_rate": 4.971482210870706e-05,
+      "loss": 0.017442386597394943,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0176,
+      "step": 226,
+      "tokens/total": 29622272,
+      "tokens/train_per_sec_per_gpu": 3192.22,
+      "tokens/trainable": 3149606
+    },
+    {
+      "epoch": 0.7229299363057324,
+      "grad_norm": 0.1875,
+      "learning_rate": 4.970638942727698e-05,
+      "loss": 0.00844226311892271,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00848,
+      "step": 227,
+      "tokens/total": 29753344,
+      "tokens/train_per_sec_per_gpu": 3247.88,
+      "tokens/trainable": 3163147
+    },
+    {
+      "epoch": 0.7261146496815286,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.969783461523714e-05,
+      "loss": 0.010366439819335938,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01042,
+      "step": 228,
+      "tokens/total": 29884416,
+      "tokens/train_per_sec_per_gpu": 3545.1,
+      "tokens/trainable": 3177891
+    },
+    {
+      "epoch": 0.7292993630573248,
+      "grad_norm": 0.259765625,
+      "learning_rate": 4.968915771487639e-05,
+      "loss": 0.011432585306465626,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0115,
+      "step": 229,
+      "tokens/total": 30015488,
+      "tokens/train_per_sec_per_gpu": 3336.0,
+      "tokens/trainable": 3191819
+    },
+    {
+      "epoch": 0.732484076433121,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 4.9680358769087076e-05,
+      "loss": 0.012058578431606293,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01213,
+      "step": 230,
+      "tokens/total": 30146560,
+      "tokens/train_per_sec_per_gpu": 3245.98,
+      "tokens/trainable": 3205431
+    },
+    {
+      "epoch": 0.7356687898089171,
+      "grad_norm": 0.216796875,
+      "learning_rate": 4.9671437821364855e-05,
+      "loss": 0.013203555718064308,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01329,
+      "step": 231,
+      "tokens/total": 30277632,
+      "tokens/train_per_sec_per_gpu": 2895.23,
+      "tokens/trainable": 3217538
+    },
+    {
+      "epoch": 0.7388535031847133,
+      "grad_norm": 0.2109375,
+      "learning_rate": 4.966239491580847e-05,
+      "loss": 0.011110116727650166,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01117,
+      "step": 232,
+      "tokens/total": 30408704,
+      "tokens/train_per_sec_per_gpu": 3255.67,
+      "tokens/trainable": 3231099
+    },
+    {
+      "epoch": 0.7420382165605095,
+      "grad_norm": 0.19921875,
+      "learning_rate": 4.965323009711954e-05,
+      "loss": 0.01235074270516634,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01243,
+      "step": 233,
+      "tokens/total": 30539776,
+      "tokens/train_per_sec_per_gpu": 3738.25,
+      "tokens/trainable": 3246613
+    },
+    {
+      "epoch": 0.7452229299363057,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 4.964394341060233e-05,
+      "loss": 0.014128293842077255,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01423,
+      "step": 234,
+      "tokens/total": 30670848,
+      "tokens/train_per_sec_per_gpu": 3075.78,
+      "tokens/trainable": 3259483
+    },
+    {
+      "epoch": 0.7484076433121019,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 4.9634534902163544e-05,
+      "loss": 0.011594554409384727,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01166,
+      "step": 235,
+      "tokens/total": 30801920,
+      "tokens/train_per_sec_per_gpu": 3397.95,
+      "tokens/trainable": 3273641
+    },
+    {
+      "epoch": 0.7515923566878981,
+      "grad_norm": 0.34375,
+      "learning_rate": 4.962500461831207e-05,
+      "loss": 0.015983082354068756,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01611,
+      "step": 236,
+      "tokens/total": 30932992,
+      "tokens/train_per_sec_per_gpu": 3322.87,
+      "tokens/trainable": 3287575
+    },
+    {
+      "epoch": 0.7547770700636943,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 4.961535260615876e-05,
+      "loss": 0.01292226929217577,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01301,
+      "step": 237,
+      "tokens/total": 31064064,
+      "tokens/train_per_sec_per_gpu": 3320.22,
+      "tokens/trainable": 3301421
+    },
+    {
+      "epoch": 0.7579617834394905,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 4.9605578913416245e-05,
+      "loss": 0.014275891706347466,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01438,
+      "step": 238,
+      "tokens/total": 31195136,
+      "tokens/train_per_sec_per_gpu": 3614.8,
+      "tokens/trainable": 3316404
+    },
+    {
+      "epoch": 0.7611464968152867,
+      "grad_norm": 0.267578125,
+      "learning_rate": 4.959568358839861e-05,
+      "loss": 0.01322453934699297,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01331,
+      "step": 239,
+      "tokens/total": 31326208,
+      "tokens/train_per_sec_per_gpu": 3704.99,
+      "tokens/trainable": 3331869
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 0.240234375,
+      "learning_rate": 4.958566668002123e-05,
+      "loss": 0.01428250689059496,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01438,
+      "step": 240,
+      "tokens/total": 31457280,
+      "tokens/train_per_sec_per_gpu": 3217.37,
+      "tokens/trainable": 3345254
+    },
+    {
+      "epoch": 0.767515923566879,
+      "grad_norm": 0.248046875,
+      "learning_rate": 4.957552823780047e-05,
+      "loss": 0.011499980464577675,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01157,
+      "step": 241,
+      "tokens/total": 31588352,
+      "tokens/train_per_sec_per_gpu": 3332.37,
+      "tokens/trainable": 3359111
+    },
+    {
+      "epoch": 0.7707006369426752,
+      "grad_norm": 0.25,
+      "learning_rate": 4.956526831185353e-05,
+      "loss": 0.014339377172291279,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01444,
+      "step": 242,
+      "tokens/total": 31719424,
+      "tokens/train_per_sec_per_gpu": 3461.36,
+      "tokens/trainable": 3373551
+    },
+    {
+      "epoch": 0.7738853503184714,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.955488695289806e-05,
+      "loss": 0.009887355379760265,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00994,
+      "step": 243,
+      "tokens/total": 31850496,
+      "tokens/train_per_sec_per_gpu": 3502.72,
+      "tokens/trainable": 3388151
+    },
+    {
+      "epoch": 0.7770700636942676,
+      "grad_norm": 0.236328125,
+      "learning_rate": 4.954438421225206e-05,
+      "loss": 0.013017972931265831,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0131,
+      "step": 244,
+      "tokens/total": 31981568,
+      "tokens/train_per_sec_per_gpu": 3313.42,
+      "tokens/trainable": 3401935
+    },
+    {
+      "epoch": 0.7802547770700637,
+      "grad_norm": 0.22265625,
+      "learning_rate": 4.9533760141833506e-05,
+      "loss": 0.012434033676981926,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01251,
+      "step": 245,
+      "tokens/total": 32112640,
+      "tokens/train_per_sec_per_gpu": 3363.79,
+      "tokens/trainable": 3415979
+    },
+    {
+      "epoch": 0.7834394904458599,
+      "grad_norm": 0.19140625,
+      "learning_rate": 4.952301479416015e-05,
+      "loss": 0.011714441701769829,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01178,
+      "step": 246,
+      "tokens/total": 32243712,
+      "tokens/train_per_sec_per_gpu": 3236.03,
+      "tokens/trainable": 3429486
+    },
+    {
+      "epoch": 0.7866242038216561,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 4.9512148222349274e-05,
+      "loss": 0.01364858727902174,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01374,
+      "step": 247,
+      "tokens/total": 32374784,
+      "tokens/train_per_sec_per_gpu": 3117.68,
+      "tokens/trainable": 3442584
+    },
+    {
+      "epoch": 0.7898089171974523,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.950116048011739e-05,
+      "loss": 0.00907064788043499,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00911,
+      "step": 248,
+      "tokens/total": 32505856,
+      "tokens/train_per_sec_per_gpu": 3310.3,
+      "tokens/trainable": 3456412
+    },
+    {
+      "epoch": 0.7929936305732485,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.949005162177997e-05,
+      "loss": 0.011760072782635689,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01183,
+      "step": 249,
+      "tokens/total": 32636928,
+      "tokens/train_per_sec_per_gpu": 3404.86,
+      "tokens/trainable": 3470647
+    },
+    {
+      "epoch": 0.7961783439490446,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 4.9478821702251234e-05,
+      "loss": 0.014284678734838963,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01439,
+      "step": 250,
+      "tokens/total": 32768000,
+      "tokens/train_per_sec_per_gpu": 3377.89,
+      "tokens/trainable": 3484748
+    },
+    {
+      "epoch": 0.7993630573248408,
+      "grad_norm": 0.18359375,
+      "learning_rate": 4.9467470777043806e-05,
+      "loss": 0.011529207229614258,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0116,
+      "step": 251,
+      "tokens/total": 32899072,
+      "tokens/train_per_sec_per_gpu": 3574.68,
+      "tokens/trainable": 3499669
+    },
+    {
+      "epoch": 0.802547770700637,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 4.9455998902268504e-05,
+      "loss": 0.01309981569647789,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01319,
+      "step": 252,
+      "tokens/total": 33030144,
+      "tokens/train_per_sec_per_gpu": 3255.28,
+      "tokens/trainable": 3513312
+    },
+    {
+      "epoch": 0.8057324840764332,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.944440613463402e-05,
+      "loss": 0.007244420703500509,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00727,
+      "step": 253,
+      "tokens/total": 33161216,
+      "tokens/train_per_sec_per_gpu": 3061.67,
+      "tokens/trainable": 3526131
+    },
+    {
+      "epoch": 0.8089171974522293,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 4.943269253144664e-05,
+      "loss": 0.012152907438576221,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01223,
+      "step": 254,
+      "tokens/total": 33292288,
+      "tokens/train_per_sec_per_gpu": 3129.1,
+      "tokens/trainable": 3539258
+    },
+    {
+      "epoch": 0.8121019108280255,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 4.9420858150610025e-05,
+      "loss": 0.009945802390575409,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01,
+      "step": 255,
+      "tokens/total": 33423360,
+      "tokens/train_per_sec_per_gpu": 3101.37,
+      "tokens/trainable": 3552212
+    },
+    {
+      "epoch": 0.8152866242038217,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 4.9408903050624796e-05,
+      "loss": 0.00950522068887949,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00955,
+      "step": 256,
+      "tokens/total": 33554432,
+      "tokens/train_per_sec_per_gpu": 3437.25,
+      "tokens/trainable": 3566622
+    },
+    {
+      "epoch": 0.8184713375796179,
+      "grad_norm": 0.265625,
+      "learning_rate": 4.939682729058839e-05,
+      "loss": 0.012676852755248547,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01276,
+      "step": 257,
+      "tokens/total": 33685504,
+      "tokens/train_per_sec_per_gpu": 3405.34,
+      "tokens/trainable": 3580857
+    },
+    {
+      "epoch": 0.821656050955414,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 4.938463093019466e-05,
+      "loss": 0.012163055129349232,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01224,
+      "step": 258,
+      "tokens/total": 33816576,
+      "tokens/train_per_sec_per_gpu": 3175.85,
+      "tokens/trainable": 3594180
+    },
+    {
+      "epoch": 0.8248407643312102,
+      "grad_norm": 0.220703125,
+      "learning_rate": 4.937231402973365e-05,
+      "loss": 0.011768801137804985,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01184,
+      "step": 259,
+      "tokens/total": 33947648,
+      "tokens/train_per_sec_per_gpu": 3036.11,
+      "tokens/trainable": 3606954
+    },
+    {
+      "epoch": 0.8280254777070064,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 4.935987665009123e-05,
+      "loss": 0.01067468523979187,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01073,
+      "step": 260,
+      "tokens/total": 34078720,
+      "tokens/train_per_sec_per_gpu": 3332.71,
+      "tokens/trainable": 3620834
+    },
+    {
+      "epoch": 0.8312101910828026,
+      "grad_norm": 0.208984375,
+      "learning_rate": 4.934731885274887e-05,
+      "loss": 0.008789247833192348,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00883,
+      "step": 261,
+      "tokens/total": 34209792,
+      "tokens/train_per_sec_per_gpu": 3139.67,
+      "tokens/trainable": 3633998
+    },
+    {
+      "epoch": 0.8343949044585988,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 4.9334640699783286e-05,
+      "loss": 0.011909011751413345,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01198,
+      "step": 262,
+      "tokens/total": 34340864,
+      "tokens/train_per_sec_per_gpu": 3340.74,
+      "tokens/trainable": 3647974
+    },
+    {
+      "epoch": 0.8375796178343949,
+      "grad_norm": 0.265625,
+      "learning_rate": 4.9321842253866136e-05,
+      "loss": 0.013996127992868423,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01409,
+      "step": 263,
+      "tokens/total": 34471936,
+      "tokens/train_per_sec_per_gpu": 3762.99,
+      "tokens/trainable": 3663593
+    },
+    {
+      "epoch": 0.8407643312101911,
+      "grad_norm": 0.228515625,
+      "learning_rate": 4.930892357826373e-05,
+      "loss": 0.014773533679544926,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01488,
+      "step": 264,
+      "tokens/total": 34603008,
+      "tokens/train_per_sec_per_gpu": 3474.74,
+      "tokens/trainable": 3678065
+    },
+    {
+      "epoch": 0.8439490445859873,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 4.92958847368367e-05,
+      "loss": 0.01498363260179758,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0151,
+      "step": 265,
+      "tokens/total": 34734080,
+      "tokens/train_per_sec_per_gpu": 3050.93,
+      "tokens/trainable": 3690846
+    },
+    {
+      "epoch": 0.8471337579617835,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 4.928272579403969e-05,
+      "loss": 0.009248088113963604,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00929,
+      "step": 266,
+      "tokens/total": 34865152,
+      "tokens/train_per_sec_per_gpu": 3185.95,
+      "tokens/trainable": 3704117
+    },
+    {
+      "epoch": 0.8503184713375797,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 4.926944681492106e-05,
+      "loss": 0.012684832327067852,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01277,
+      "step": 267,
+      "tokens/total": 34996224,
+      "tokens/train_per_sec_per_gpu": 3411.13,
+      "tokens/trainable": 3718339
+    },
+    {
+      "epoch": 0.8535031847133758,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 4.925604786512251e-05,
+      "loss": 0.0118259247392416,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0119,
+      "step": 268,
+      "tokens/total": 35127296,
+      "tokens/train_per_sec_per_gpu": 3032.33,
+      "tokens/trainable": 3731032
+    },
+    {
+      "epoch": 0.856687898089172,
+      "grad_norm": 0.1953125,
+      "learning_rate": 4.924252901087881e-05,
+      "loss": 0.009350091218948364,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00939,
+      "step": 269,
+      "tokens/total": 35258368,
+      "tokens/train_per_sec_per_gpu": 3595.2,
+      "tokens/trainable": 3746006
+    },
+    {
+      "epoch": 0.8598726114649682,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 4.922889031901745e-05,
+      "loss": 0.01463128998875618,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01474,
+      "step": 270,
+      "tokens/total": 35389440,
+      "tokens/train_per_sec_per_gpu": 3514.64,
+      "tokens/trainable": 3760731
+    },
+    {
+      "epoch": 0.8630573248407644,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.921513185695831e-05,
+      "loss": 0.009343666024506092,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00939,
+      "step": 271,
+      "tokens/total": 35520512,
+      "tokens/train_per_sec_per_gpu": 3137.39,
+      "tokens/trainable": 3773865
+    },
+    {
+      "epoch": 0.8662420382165605,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 4.920125369271332e-05,
+      "loss": 0.011359314434230328,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01142,
+      "step": 272,
+      "tokens/total": 35651584,
+      "tokens/train_per_sec_per_gpu": 3710.71,
+      "tokens/trainable": 3789305
+    },
+    {
+      "epoch": 0.8694267515923567,
+      "grad_norm": 0.173828125,
+      "learning_rate": 4.9187255894886134e-05,
+      "loss": 0.011224365793168545,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01129,
+      "step": 273,
+      "tokens/total": 35782656,
+      "tokens/train_per_sec_per_gpu": 3673.45,
+      "tokens/trainable": 3804528
+    },
+    {
+      "epoch": 0.8726114649681529,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 4.9173138532671796e-05,
+      "loss": 0.012716785073280334,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0128,
+      "step": 274,
+      "tokens/total": 35913728,
+      "tokens/train_per_sec_per_gpu": 3495.34,
+      "tokens/trainable": 3819131
+    },
+    {
+      "epoch": 0.8757961783439491,
+      "grad_norm": 0.193359375,
+      "learning_rate": 4.9158901675856395e-05,
+      "loss": 0.008782695978879929,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00882,
+      "step": 275,
+      "tokens/total": 36044800,
+      "tokens/train_per_sec_per_gpu": 3305.01,
+      "tokens/trainable": 3832973
+    },
+    {
+      "epoch": 0.8789808917197452,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.9144545394816687e-05,
+      "loss": 0.008706534281373024,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00874,
+      "step": 276,
+      "tokens/total": 36175872,
+      "tokens/train_per_sec_per_gpu": 3043.21,
+      "tokens/trainable": 3845728
+    },
+    {
+      "epoch": 0.8821656050955414,
+      "grad_norm": 0.27734375,
+      "learning_rate": 4.91300697605198e-05,
+      "loss": 0.01517584826797247,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01529,
+      "step": 277,
+      "tokens/total": 36306944,
+      "tokens/train_per_sec_per_gpu": 3664.41,
+      "tokens/trainable": 3860973
+    },
+    {
+      "epoch": 0.8853503184713376,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 4.911547484452286e-05,
+      "loss": 0.009684903547167778,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00973,
+      "step": 278,
+      "tokens/total": 36438016,
+      "tokens/train_per_sec_per_gpu": 3416.95,
+      "tokens/trainable": 3875221
+    },
+    {
+      "epoch": 0.8885350318471338,
+      "grad_norm": 0.201171875,
+      "learning_rate": 4.9100760718972624e-05,
+      "loss": 0.011975611560046673,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01205,
+      "step": 279,
+      "tokens/total": 36569088,
+      "tokens/train_per_sec_per_gpu": 3231.7,
+      "tokens/trainable": 3888737
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 0.171875,
+      "learning_rate": 4.908592745660514e-05,
+      "loss": 0.009973946958780289,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01002,
+      "step": 280,
+      "tokens/total": 36700160,
+      "tokens/train_per_sec_per_gpu": 3510.18,
+      "tokens/trainable": 3903383
+    },
+    {
+      "epoch": 0.8949044585987261,
+      "grad_norm": 0.189453125,
+      "learning_rate": 4.9070975130745387e-05,
+      "loss": 0.009210948832333088,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00925,
+      "step": 281,
+      "tokens/total": 36831232,
+      "tokens/train_per_sec_per_gpu": 3276.53,
+      "tokens/trainable": 3917095
+    },
+    {
+      "epoch": 0.8980891719745223,
+      "grad_norm": 0.216796875,
+      "learning_rate": 4.905590381530689e-05,
+      "loss": 0.010272481478750706,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01033,
+      "step": 282,
+      "tokens/total": 36962304,
+      "tokens/train_per_sec_per_gpu": 3515.84,
+      "tokens/trainable": 3931741
+    },
+    {
+      "epoch": 0.9012738853503185,
+      "grad_norm": 0.203125,
+      "learning_rate": 4.9040713584791406e-05,
+      "loss": 0.009833472780883312,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00988,
+      "step": 283,
+      "tokens/total": 37093376,
+      "tokens/train_per_sec_per_gpu": 2930.03,
+      "tokens/trainable": 3944068
+    },
+    {
+      "epoch": 0.9044585987261147,
+      "grad_norm": 0.173828125,
+      "learning_rate": 4.902540451428849e-05,
+      "loss": 0.008189358748495579,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00822,
+      "step": 284,
+      "tokens/total": 37224448,
+      "tokens/train_per_sec_per_gpu": 3765.41,
+      "tokens/trainable": 3959725
+    },
+    {
+      "epoch": 0.9076433121019108,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 4.900997667947518e-05,
+      "loss": 0.013849266804754734,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01395,
+      "step": 285,
+      "tokens/total": 37355520,
+      "tokens/train_per_sec_per_gpu": 3186.67,
+      "tokens/trainable": 3973038
+    },
+    {
+      "epoch": 0.910828025477707,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 4.899443015661557e-05,
+      "loss": 0.008526762947440147,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00856,
+      "step": 286,
+      "tokens/total": 37486592,
+      "tokens/train_per_sec_per_gpu": 3056.98,
+      "tokens/trainable": 3985851
+    },
+    {
+      "epoch": 0.9140127388535032,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 4.89787650225605e-05,
+      "loss": 0.008836560882627964,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00888,
+      "step": 287,
+      "tokens/total": 37617664,
+      "tokens/train_per_sec_per_gpu": 3316.33,
+      "tokens/trainable": 3999725
+    },
+    {
+      "epoch": 0.9171974522292994,
+      "grad_norm": 0.263671875,
+      "learning_rate": 4.896298135474711e-05,
+      "loss": 0.01038228627294302,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01044,
+      "step": 288,
+      "tokens/total": 37748736,
+      "tokens/train_per_sec_per_gpu": 3125.36,
+      "tokens/trainable": 4012867
+    },
+    {
+      "epoch": 0.9203821656050956,
+      "grad_norm": 0.21875,
+      "learning_rate": 4.8947079231198504e-05,
+      "loss": 0.012707007117569447,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01279,
+      "step": 289,
+      "tokens/total": 37879808,
+      "tokens/train_per_sec_per_gpu": 3307.2,
+      "tokens/trainable": 4026670
+    },
+    {
+      "epoch": 0.9235668789808917,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 4.893105873052333e-05,
+      "loss": 0.010869958437979221,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01093,
+      "step": 290,
+      "tokens/total": 38010880,
+      "tokens/train_per_sec_per_gpu": 3449.15,
+      "tokens/trainable": 4041053
+    },
+    {
+      "epoch": 0.9267515923566879,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 4.8914919931915407e-05,
+      "loss": 0.010028751567006111,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01008,
+      "step": 291,
+      "tokens/total": 38141952,
+      "tokens/train_per_sec_per_gpu": 3442.24,
+      "tokens/trainable": 4055450
+    },
+    {
+      "epoch": 0.9299363057324841,
+      "grad_norm": 0.220703125,
+      "learning_rate": 4.889866291515336e-05,
+      "loss": 0.012203947640955448,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01228,
+      "step": 292,
+      "tokens/total": 38273024,
+      "tokens/train_per_sec_per_gpu": 2829.0,
+      "tokens/trainable": 4067366
+    },
+    {
+      "epoch": 0.9331210191082803,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 4.888228776060016e-05,
+      "loss": 0.010833281092345715,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01089,
+      "step": 293,
+      "tokens/total": 38404096,
+      "tokens/train_per_sec_per_gpu": 3495.99,
+      "tokens/trainable": 4081929
+    },
+    {
+      "epoch": 0.9363057324840764,
+      "grad_norm": 0.181640625,
+      "learning_rate": 4.886579454920281e-05,
+      "loss": 0.012121611274778843,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0122,
+      "step": 294,
+      "tokens/total": 38535168,
+      "tokens/train_per_sec_per_gpu": 3777.6,
+      "tokens/trainable": 4097707
+    },
+    {
+      "epoch": 0.9394904458598726,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 4.884918336249186e-05,
+      "loss": 0.009699760004878044,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00975,
+      "step": 295,
+      "tokens/total": 38666240,
+      "tokens/train_per_sec_per_gpu": 3588.34,
+      "tokens/trainable": 4112623
+    },
+    {
+      "epoch": 0.9426751592356688,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 4.883245428258107e-05,
+      "loss": 0.011465213261544704,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01153,
+      "step": 296,
+      "tokens/total": 38797312,
+      "tokens/train_per_sec_per_gpu": 3411.03,
+      "tokens/trainable": 4126849
+    },
+    {
+      "epoch": 0.945859872611465,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 4.881560739216697e-05,
+      "loss": 0.009318836033344269,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00936,
+      "step": 297,
+      "tokens/total": 38928384,
+      "tokens/train_per_sec_per_gpu": 3338.53,
+      "tokens/trainable": 4140757
+    },
+    {
+      "epoch": 0.9490445859872612,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 4.879864277452847e-05,
+      "loss": 0.012642276473343372,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01272,
+      "step": 298,
+      "tokens/total": 39059456,
+      "tokens/train_per_sec_per_gpu": 3555.91,
+      "tokens/trainable": 4155522
+    },
+    {
+      "epoch": 0.9522292993630573,
+      "grad_norm": 0.20703125,
+      "learning_rate": 4.8781560513526414e-05,
+      "loss": 0.013654773123562336,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01375,
+      "step": 299,
+      "tokens/total": 39190528,
+      "tokens/train_per_sec_per_gpu": 3459.38,
+      "tokens/trainable": 4169921
+    },
+    {
+      "epoch": 0.9554140127388535,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.876436069360323e-05,
+      "loss": 0.006959032732993364,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00698,
+      "step": 300,
+      "tokens/total": 39321600,
+      "tokens/train_per_sec_per_gpu": 3298.43,
+      "tokens/trainable": 4183671
+    },
+    {
+      "epoch": 0.9585987261146497,
+      "grad_norm": 0.2109375,
+      "learning_rate": 4.8747043399782424e-05,
+      "loss": 0.01015427801758051,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01021,
+      "step": 301,
+      "tokens/total": 39452672,
+      "tokens/train_per_sec_per_gpu": 3056.79,
+      "tokens/trainable": 4196527
+    },
+    {
+      "epoch": 0.9617834394904459,
+      "grad_norm": 0.189453125,
+      "learning_rate": 4.8729608717668265e-05,
+      "loss": 0.015600456856191158,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01572,
+      "step": 302,
+      "tokens/total": 39583744,
+      "tokens/train_per_sec_per_gpu": 3500.83,
+      "tokens/trainable": 4211124
+    },
+    {
+      "epoch": 0.964968152866242,
+      "grad_norm": 0.275390625,
+      "learning_rate": 4.871205673344525e-05,
+      "loss": 0.014728494919836521,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01484,
+      "step": 303,
+      "tokens/total": 39714816,
+      "tokens/train_per_sec_per_gpu": 3241.93,
+      "tokens/trainable": 4224632
+    },
+    {
+      "epoch": 0.9681528662420382,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.869438753387777e-05,
+      "loss": 0.008857826702296734,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0089,
+      "step": 304,
+      "tokens/total": 39845888,
+      "tokens/train_per_sec_per_gpu": 3447.73,
+      "tokens/trainable": 4239052
+    },
+    {
+      "epoch": 0.9713375796178344,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 4.867660120630962e-05,
+      "loss": 0.006837591528892517,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00686,
+      "step": 305,
+      "tokens/total": 39976960,
+      "tokens/train_per_sec_per_gpu": 3652.81,
+      "tokens/trainable": 4254227
+    },
+    {
+      "epoch": 0.9745222929936306,
+      "grad_norm": 0.21484375,
+      "learning_rate": 4.8658697838663625e-05,
+      "loss": 0.01278127171099186,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01286,
+      "step": 306,
+      "tokens/total": 40108032,
+      "tokens/train_per_sec_per_gpu": 3363.52,
+      "tokens/trainable": 4268312
+    },
+    {
+      "epoch": 0.9777070063694268,
+      "grad_norm": 0.19140625,
+      "learning_rate": 4.864067751944113e-05,
+      "loss": 0.010625463910400867,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01068,
+      "step": 307,
+      "tokens/total": 40239104,
+      "tokens/train_per_sec_per_gpu": 3301.6,
+      "tokens/trainable": 4282394
+    },
+    {
+      "epoch": 0.9808917197452229,
+      "grad_norm": 0.19140625,
+      "learning_rate": 4.862254033772164e-05,
+      "loss": 0.010408475063741207,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01046,
+      "step": 308,
+      "tokens/total": 40370176,
+      "tokens/train_per_sec_per_gpu": 3139.89,
+      "tokens/trainable": 4295549
+    },
+    {
+      "epoch": 0.9840764331210191,
+      "grad_norm": 0.15625,
+      "learning_rate": 4.8604286383162326e-05,
+      "loss": 0.00865277647972107,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00869,
+      "step": 309,
+      "tokens/total": 40501248,
+      "tokens/train_per_sec_per_gpu": 3451.88,
+      "tokens/trainable": 4309931
+    },
+    {
+      "epoch": 0.9872611464968153,
+      "grad_norm": 0.173828125,
+      "learning_rate": 4.858591574599759e-05,
+      "loss": 0.010455441661179066,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01051,
+      "step": 310,
+      "tokens/total": 40632320,
+      "tokens/train_per_sec_per_gpu": 3652.99,
+      "tokens/trainable": 4325145
+    },
+    {
+      "epoch": 0.9904458598726115,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 4.856742851703866e-05,
+      "loss": 0.009725190699100494,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00977,
+      "step": 311,
+      "tokens/total": 40763392,
+      "tokens/train_per_sec_per_gpu": 3095.9,
+      "tokens/trainable": 4338115
+    },
+    {
+      "epoch": 0.9936305732484076,
+      "grad_norm": 0.189453125,
+      "learning_rate": 4.854882478767308e-05,
+      "loss": 0.0067247929982841015,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00675,
+      "step": 312,
+      "tokens/total": 40894464,
+      "tokens/train_per_sec_per_gpu": 3608.14,
+      "tokens/trainable": 4353094
+    },
+    {
+      "epoch": 0.9968152866242038,
+      "grad_norm": 0.177734375,
+      "learning_rate": 4.8530104649864306e-05,
+      "loss": 0.008235358633100986,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00827,
+      "step": 313,
+      "tokens/total": 41025536,
+      "tokens/train_per_sec_per_gpu": 3438.93,
+      "tokens/trainable": 4367439
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.31640625,
+      "learning_rate": 4.8511268196151224e-05,
+      "loss": 0.013931503519415855,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 39.25,
+      "memory/max_allocated (GiB)": 39.25,
+      "ppl": 1.01403,
+      "step": 314,
+      "tokens/total": 41099264,
+      "tokens/train_per_sec_per_gpu": 2079.74,
+      "tokens/trainable": 4374676
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.010794572532176971,
+      "eval_ppl": 1.01085,
+      "eval_runtime": 42.176,
+      "eval_samples_per_second": 64.041,
+      "eval_steps_per_second": 4.007,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 314
+    },
+    {
+      "epoch": 1.0031847133757963,
+      "grad_norm": 0.19921875,
+      "learning_rate": 4.849231551964771e-05,
+      "loss": 0.01005562860518694,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01011,
+      "step": 315,
+      "tokens/total": 41230336,
+      "tokens/train_per_sec_per_gpu": 3300.4,
+      "tokens/trainable": 4388312
+    },
+    {
+      "epoch": 1.0063694267515924,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 4.8473246714042155e-05,
+      "loss": 0.009829830378293991,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00988,
+      "step": 316,
+      "tokens/total": 41361408,
+      "tokens/train_per_sec_per_gpu": 2786.13,
+      "tokens/trainable": 4400052
+    },
+    {
+      "epoch": 1.0095541401273886,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 4.845406187359701e-05,
+      "loss": 0.009766732342541218,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00981,
+      "step": 317,
+      "tokens/total": 41492480,
+      "tokens/train_per_sec_per_gpu": 3444.53,
+      "tokens/trainable": 4414268
+    },
+    {
+      "epoch": 1.0127388535031847,
+      "grad_norm": 0.17578125,
+      "learning_rate": 4.843476109314833e-05,
+      "loss": 0.009223168715834618,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00927,
+      "step": 318,
+      "tokens/total": 41623552,
+      "tokens/train_per_sec_per_gpu": 3515.7,
+      "tokens/trainable": 4428804
+    },
+    {
+      "epoch": 1.015923566878981,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 4.841534446810527e-05,
+      "loss": 0.008030703291296959,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00806,
+      "step": 319,
+      "tokens/total": 41754624,
+      "tokens/train_per_sec_per_gpu": 3297.15,
+      "tokens/trainable": 4442458
+    },
+    {
+      "epoch": 1.019108280254777,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.839581209444966e-05,
+      "loss": 0.008971852250397205,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00901,
+      "step": 320,
+      "tokens/total": 41885696,
+      "tokens/train_per_sec_per_gpu": 3348.3,
+      "tokens/trainable": 4456319
+    },
+    {
+      "epoch": 1.0222929936305734,
+      "grad_norm": 0.189453125,
+      "learning_rate": 4.8376164068735485e-05,
+      "loss": 0.011034002527594566,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0111,
+      "step": 321,
+      "tokens/total": 42016768,
+      "tokens/train_per_sec_per_gpu": 3467.0,
+      "tokens/trainable": 4470692
+    },
+    {
+      "epoch": 1.0254777070063694,
+      "grad_norm": 0.21484375,
+      "learning_rate": 4.835640048808847e-05,
+      "loss": 0.008709516376256943,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00875,
+      "step": 322,
+      "tokens/total": 42147840,
+      "tokens/train_per_sec_per_gpu": 3335.02,
+      "tokens/trainable": 4484563
+    },
+    {
+      "epoch": 1.0286624203821657,
+      "grad_norm": 0.166015625,
+      "learning_rate": 4.833652145020551e-05,
+      "loss": 0.006180301308631897,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0062,
+      "step": 323,
+      "tokens/total": 42278912,
+      "tokens/train_per_sec_per_gpu": 3293.93,
+      "tokens/trainable": 4498340
+    },
+    {
+      "epoch": 1.0318471337579618,
+      "grad_norm": 0.15234375,
+      "learning_rate": 4.831652705335428e-05,
+      "loss": 0.007071372587233782,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0071,
+      "step": 324,
+      "tokens/total": 42409984,
+      "tokens/train_per_sec_per_gpu": 3496.34,
+      "tokens/trainable": 4512959
+    },
+    {
+      "epoch": 1.035031847133758,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 4.829641739637269e-05,
+      "loss": 0.010390223003923893,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01044,
+      "step": 325,
+      "tokens/total": 42541056,
+      "tokens/train_per_sec_per_gpu": 3109.54,
+      "tokens/trainable": 4525947
+    },
+    {
+      "epoch": 1.0382165605095541,
+      "grad_norm": 0.19140625,
+      "learning_rate": 4.827619257866839e-05,
+      "loss": 0.010280653834342957,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01033,
+      "step": 326,
+      "tokens/total": 42672128,
+      "tokens/train_per_sec_per_gpu": 3494.82,
+      "tokens/trainable": 4540559
+    },
+    {
+      "epoch": 1.0414012738853504,
+      "grad_norm": 0.291015625,
+      "learning_rate": 4.825585270021835e-05,
+      "loss": 0.009634558111429214,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00968,
+      "step": 327,
+      "tokens/total": 42803200,
+      "tokens/train_per_sec_per_gpu": 3081.6,
+      "tokens/trainable": 4553474
+    },
+    {
+      "epoch": 1.0445859872611465,
+      "grad_norm": 0.21875,
+      "learning_rate": 4.823539786156828e-05,
+      "loss": 0.012012935243546963,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01209,
+      "step": 328,
+      "tokens/total": 42934272,
+      "tokens/train_per_sec_per_gpu": 3405.54,
+      "tokens/trainable": 4567721
+    },
+    {
+      "epoch": 1.0477707006369428,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 4.821482816383218e-05,
+      "loss": 0.005780364852398634,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0058,
+      "step": 329,
+      "tokens/total": 43065344,
+      "tokens/train_per_sec_per_gpu": 3703.56,
+      "tokens/trainable": 4583144
+    },
+    {
+      "epoch": 1.0509554140127388,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.8194143708691844e-05,
+      "loss": 0.010735648684203625,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01079,
+      "step": 330,
+      "tokens/total": 43196416,
+      "tokens/train_per_sec_per_gpu": 3454.77,
+      "tokens/trainable": 4597528
+    },
+    {
+      "epoch": 1.0541401273885351,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 4.817334459839633e-05,
+      "loss": 0.009996584616601467,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01005,
+      "step": 331,
+      "tokens/total": 43327488,
+      "tokens/train_per_sec_per_gpu": 3088.63,
+      "tokens/trainable": 4610506
+    },
+    {
+      "epoch": 1.0573248407643312,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 4.8152430935761456e-05,
+      "loss": 0.007421544287353754,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00745,
+      "step": 332,
+      "tokens/total": 43458560,
+      "tokens/train_per_sec_per_gpu": 3395.75,
+      "tokens/trainable": 4624715
+    },
+    {
+      "epoch": 1.0605095541401275,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 4.8131402824169336e-05,
+      "loss": 0.004339924082159996,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00435,
+      "step": 333,
+      "tokens/total": 43589632,
+      "tokens/train_per_sec_per_gpu": 2923.1,
+      "tokens/trainable": 4636991
+    },
+    {
+      "epoch": 1.0636942675159236,
+      "grad_norm": 0.2109375,
+      "learning_rate": 4.8110260367567816e-05,
+      "loss": 0.007030356675386429,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00706,
+      "step": 334,
+      "tokens/total": 43720704,
+      "tokens/train_per_sec_per_gpu": 3278.5,
+      "tokens/trainable": 4650745
+    },
+    {
+      "epoch": 1.0668789808917198,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 4.808900367046999e-05,
+      "loss": 0.00917564332485199,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00922,
+      "step": 335,
+      "tokens/total": 43851776,
+      "tokens/train_per_sec_per_gpu": 3402.45,
+      "tokens/trainable": 4664997
+    },
+    {
+      "epoch": 1.070063694267516,
+      "grad_norm": 0.158203125,
+      "learning_rate": 4.806763283795366e-05,
+      "loss": 0.0065734670497477055,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0066,
+      "step": 336,
+      "tokens/total": 43982848,
+      "tokens/train_per_sec_per_gpu": 2932.03,
+      "tokens/trainable": 4677280
+    },
+    {
+      "epoch": 1.0732484076433122,
+      "grad_norm": 0.154296875,
+      "learning_rate": 4.804614797566086e-05,
+      "loss": 0.00853950995951891,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00858,
+      "step": 337,
+      "tokens/total": 44113920,
+      "tokens/train_per_sec_per_gpu": 3499.45,
+      "tokens/trainable": 4691898
+    },
+    {
+      "epoch": 1.0764331210191083,
+      "grad_norm": 0.271484375,
+      "learning_rate": 4.8024549189797276e-05,
+      "loss": 0.012293344363570213,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01237,
+      "step": 338,
+      "tokens/total": 44244992,
+      "tokens/train_per_sec_per_gpu": 3312.19,
+      "tokens/trainable": 4705870
+    },
+    {
+      "epoch": 1.0796178343949046,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 4.800283658713177e-05,
+      "loss": 0.010073346085846424,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01012,
+      "step": 339,
+      "tokens/total": 44376064,
+      "tokens/train_per_sec_per_gpu": 3473.54,
+      "tokens/trainable": 4720409
+    },
+    {
+      "epoch": 1.0828025477707006,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 4.798101027499581e-05,
+      "loss": 0.010279987938702106,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01033,
+      "step": 340,
+      "tokens/total": 44507136,
+      "tokens/train_per_sec_per_gpu": 3370.76,
+      "tokens/trainable": 4734524
+    },
+    {
+      "epoch": 1.085987261146497,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 4.795907036128299e-05,
+      "loss": 0.009196259081363678,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00924,
+      "step": 341,
+      "tokens/total": 44638208,
+      "tokens/train_per_sec_per_gpu": 3347.17,
+      "tokens/trainable": 4748535
+    },
+    {
+      "epoch": 1.089171974522293,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 4.793701695444846e-05,
+      "loss": 0.009703228250145912,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00975,
+      "step": 342,
+      "tokens/total": 44769280,
+      "tokens/train_per_sec_per_gpu": 3220.71,
+      "tokens/trainable": 4762018
+    },
+    {
+      "epoch": 1.0923566878980893,
+      "grad_norm": 0.18359375,
+      "learning_rate": 4.791485016350837e-05,
+      "loss": 0.010180710814893246,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01023,
+      "step": 343,
+      "tokens/total": 44900352,
+      "tokens/train_per_sec_per_gpu": 3726.69,
+      "tokens/trainable": 4777568
+    },
+    {
+      "epoch": 1.0955414012738853,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 4.78925700980394e-05,
+      "loss": 0.007739739958196878,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00777,
+      "step": 344,
+      "tokens/total": 45031424,
+      "tokens/train_per_sec_per_gpu": 3151.58,
+      "tokens/trainable": 4790766
+    },
+    {
+      "epoch": 1.0987261146496816,
+      "grad_norm": 0.265625,
+      "learning_rate": 4.787017686817816e-05,
+      "loss": 0.013002859428524971,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01309,
+      "step": 345,
+      "tokens/total": 45162496,
+      "tokens/train_per_sec_per_gpu": 3615.54,
+      "tokens/trainable": 4805850
+    },
+    {
+      "epoch": 1.1019108280254777,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.7847670584620653e-05,
+      "loss": 0.008513463661074638,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00855,
+      "step": 346,
+      "tokens/total": 45293568,
+      "tokens/train_per_sec_per_gpu": 3554.41,
+      "tokens/trainable": 4820707
+    },
+    {
+      "epoch": 1.105095541401274,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 4.782505135862176e-05,
+      "loss": 0.012663084082305431,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01274,
+      "step": 347,
+      "tokens/total": 45424640,
+      "tokens/train_per_sec_per_gpu": 3406.8,
+      "tokens/trainable": 4834965
+    },
+    {
+      "epoch": 1.10828025477707,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 4.780231930199465e-05,
+      "loss": 0.006982079707086086,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00701,
+      "step": 348,
+      "tokens/total": 45555712,
+      "tokens/train_per_sec_per_gpu": 3420.63,
+      "tokens/trainable": 4849306
+    },
+    {
+      "epoch": 1.1114649681528663,
+      "grad_norm": 0.150390625,
+      "learning_rate": 4.777947452711026e-05,
+      "loss": 0.007746942341327667,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00778,
+      "step": 349,
+      "tokens/total": 45686784,
+      "tokens/train_per_sec_per_gpu": 3182.73,
+      "tokens/trainable": 4862654
+    },
+    {
+      "epoch": 1.1146496815286624,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 4.77565171468967e-05,
+      "loss": 0.008427651599049568,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00846,
+      "step": 350,
+      "tokens/total": 45817856,
+      "tokens/train_per_sec_per_gpu": 3011.28,
+      "tokens/trainable": 4875396
+    },
+    {
+      "epoch": 1.1178343949044587,
+      "grad_norm": 0.150390625,
+      "learning_rate": 4.773344727483876e-05,
+      "loss": 0.007029036991298199,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00705,
+      "step": 351,
+      "tokens/total": 45948928,
+      "tokens/train_per_sec_per_gpu": 2910.12,
+      "tokens/trainable": 4887648
+    },
+    {
+      "epoch": 1.1210191082802548,
+      "grad_norm": 0.203125,
+      "learning_rate": 4.771026502497726e-05,
+      "loss": 0.009960726834833622,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01001,
+      "step": 352,
+      "tokens/total": 46080000,
+      "tokens/train_per_sec_per_gpu": 3171.62,
+      "tokens/trainable": 4900946
+    },
+    {
+      "epoch": 1.124203821656051,
+      "grad_norm": 0.2109375,
+      "learning_rate": 4.7686970511908594e-05,
+      "loss": 0.010911881923675537,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01097,
+      "step": 353,
+      "tokens/total": 46211072,
+      "tokens/train_per_sec_per_gpu": 3471.86,
+      "tokens/trainable": 4915383
+    },
+    {
+      "epoch": 1.127388535031847,
+      "grad_norm": 0.19921875,
+      "learning_rate": 4.766356385078403e-05,
+      "loss": 0.01082072127610445,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01088,
+      "step": 354,
+      "tokens/total": 46342144,
+      "tokens/train_per_sec_per_gpu": 3528.47,
+      "tokens/trainable": 4930118
+    },
+    {
+      "epoch": 1.1305732484076434,
+      "grad_norm": 0.189453125,
+      "learning_rate": 4.7640045157309286e-05,
+      "loss": 0.00796705111861229,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.008,
+      "step": 355,
+      "tokens/total": 46473216,
+      "tokens/train_per_sec_per_gpu": 3675.06,
+      "tokens/trainable": 4945407
+    },
+    {
+      "epoch": 1.1337579617834395,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 4.761641454774386e-05,
+      "loss": 0.009853512980043888,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0099,
+      "step": 356,
+      "tokens/total": 46604288,
+      "tokens/train_per_sec_per_gpu": 3426.1,
+      "tokens/trainable": 4959713
+    },
+    {
+      "epoch": 1.1369426751592357,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 4.759267213890046e-05,
+      "loss": 0.008251532912254333,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00829,
+      "step": 357,
+      "tokens/total": 46735360,
+      "tokens/train_per_sec_per_gpu": 3370.12,
+      "tokens/trainable": 4973803
+    },
+    {
+      "epoch": 1.1401273885350318,
+      "grad_norm": 0.171875,
+      "learning_rate": 4.756881804814448e-05,
+      "loss": 0.007583227939903736,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00761,
+      "step": 358,
+      "tokens/total": 46866432,
+      "tokens/train_per_sec_per_gpu": 3085.13,
+      "tokens/trainable": 4986783
+    },
+    {
+      "epoch": 1.143312101910828,
+      "grad_norm": 0.1171875,
+      "learning_rate": 4.7544852393393375e-05,
+      "loss": 0.005565401166677475,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00558,
+      "step": 359,
+      "tokens/total": 46997504,
+      "tokens/train_per_sec_per_gpu": 3283.64,
+      "tokens/trainable": 5000464
+    },
+    {
+      "epoch": 1.1464968152866242,
+      "grad_norm": 0.158203125,
+      "learning_rate": 4.7520775293116096e-05,
+      "loss": 0.007274336647242308,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0073,
+      "step": 360,
+      "tokens/total": 47128576,
+      "tokens/train_per_sec_per_gpu": 3219.09,
+      "tokens/trainable": 5013941
+    },
+    {
+      "epoch": 1.1496815286624205,
+      "grad_norm": 0.173828125,
+      "learning_rate": 4.749658686633251e-05,
+      "loss": 0.007295841351151466,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00732,
+      "step": 361,
+      "tokens/total": 47259648,
+      "tokens/train_per_sec_per_gpu": 3222.39,
+      "tokens/trainable": 5027460
+    },
+    {
+      "epoch": 1.1528662420382165,
+      "grad_norm": 0.126953125,
+      "learning_rate": 4.747228723261278e-05,
+      "loss": 0.004342417698353529,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00435,
+      "step": 362,
+      "tokens/total": 47390720,
+      "tokens/train_per_sec_per_gpu": 3121.81,
+      "tokens/trainable": 5040541
+    },
+    {
+      "epoch": 1.1560509554140128,
+      "grad_norm": 0.197265625,
+      "learning_rate": 4.7447876512076815e-05,
+      "loss": 0.00851562898606062,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00855,
+      "step": 363,
+      "tokens/total": 47521792,
+      "tokens/train_per_sec_per_gpu": 3480.8,
+      "tokens/trainable": 5055042
+    },
+    {
+      "epoch": 1.1592356687898089,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 4.7423354825393646e-05,
+      "loss": 0.011735991574823856,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01181,
+      "step": 364,
+      "tokens/total": 47652864,
+      "tokens/train_per_sec_per_gpu": 3454.35,
+      "tokens/trainable": 5069432
+    },
+    {
+      "epoch": 1.1624203821656052,
+      "grad_norm": 0.203125,
+      "learning_rate": 4.739872229378085e-05,
+      "loss": 0.009628934785723686,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00968,
+      "step": 365,
+      "tokens/total": 47783936,
+      "tokens/train_per_sec_per_gpu": 3056.5,
+      "tokens/trainable": 5082238
+    },
+    {
+      "epoch": 1.1656050955414012,
+      "grad_norm": 0.181640625,
+      "learning_rate": 4.737397903900393e-05,
+      "loss": 0.008178248070180416,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00821,
+      "step": 366,
+      "tokens/total": 47915008,
+      "tokens/train_per_sec_per_gpu": 3187.45,
+      "tokens/trainable": 5095582
+    },
+    {
+      "epoch": 1.1687898089171975,
+      "grad_norm": 0.2109375,
+      "learning_rate": 4.734912518337574e-05,
+      "loss": 0.010145166888833046,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0102,
+      "step": 367,
+      "tokens/total": 48046080,
+      "tokens/train_per_sec_per_gpu": 3535.81,
+      "tokens/trainable": 5110321
+    },
+    {
+      "epoch": 1.1719745222929936,
+      "grad_norm": 0.158203125,
+      "learning_rate": 4.732416084975585e-05,
+      "loss": 0.008553897961974144,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00859,
+      "step": 368,
+      "tokens/total": 48177152,
+      "tokens/train_per_sec_per_gpu": 3223.93,
+      "tokens/trainable": 5123813
+    },
+    {
+      "epoch": 1.1751592356687899,
+      "grad_norm": 0.146484375,
+      "learning_rate": 4.729908616154996e-05,
+      "loss": 0.007267483975738287,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00729,
+      "step": 369,
+      "tokens/total": 48308224,
+      "tokens/train_per_sec_per_gpu": 3596.57,
+      "tokens/trainable": 5138875
+    },
+    {
+      "epoch": 1.178343949044586,
+      "grad_norm": 0.20703125,
+      "learning_rate": 4.727390124270929e-05,
+      "loss": 0.010045611299574375,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0101,
+      "step": 370,
+      "tokens/total": 48439296,
+      "tokens/train_per_sec_per_gpu": 3361.05,
+      "tokens/trainable": 5152957
+    },
+    {
+      "epoch": 1.1815286624203822,
+      "grad_norm": 0.166015625,
+      "learning_rate": 4.724860621772995e-05,
+      "loss": 0.006381361745297909,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0064,
+      "step": 371,
+      "tokens/total": 48570368,
+      "tokens/train_per_sec_per_gpu": 3270.56,
+      "tokens/trainable": 5166655
+    },
+    {
+      "epoch": 1.1847133757961783,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 4.7223201211652346e-05,
+      "loss": 0.0061474088579416275,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00617,
+      "step": 372,
+      "tokens/total": 48701440,
+      "tokens/train_per_sec_per_gpu": 3413.31,
+      "tokens/trainable": 5180889
+    },
+    {
+      "epoch": 1.1878980891719746,
+      "grad_norm": 0.205078125,
+      "learning_rate": 4.7197686350060535e-05,
+      "loss": 0.013294153846800327,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01338,
+      "step": 373,
+      "tokens/total": 48832512,
+      "tokens/train_per_sec_per_gpu": 3307.53,
+      "tokens/trainable": 5194736
+    },
+    {
+      "epoch": 1.1910828025477707,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.717206175908164e-05,
+      "loss": 0.009227165952324867,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00927,
+      "step": 374,
+      "tokens/total": 48963584,
+      "tokens/train_per_sec_per_gpu": 3407.27,
+      "tokens/trainable": 5208974
+    },
+    {
+      "epoch": 1.194267515923567,
+      "grad_norm": 0.2421875,
+      "learning_rate": 4.7146327565385195e-05,
+      "loss": 0.009992158971726894,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01004,
+      "step": 375,
+      "tokens/total": 49094656,
+      "tokens/train_per_sec_per_gpu": 3078.36,
+      "tokens/trainable": 5221898
+    },
+    {
+      "epoch": 1.197452229299363,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 4.712048389618254e-05,
+      "loss": 0.0076246620155870914,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00765,
+      "step": 376,
+      "tokens/total": 49225728,
+      "tokens/train_per_sec_per_gpu": 3454.62,
+      "tokens/trainable": 5236300
+    },
+    {
+      "epoch": 1.2006369426751593,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 4.7094530879226166e-05,
+      "loss": 0.010849738493561745,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01091,
+      "step": 377,
+      "tokens/total": 49356800,
+      "tokens/train_per_sec_per_gpu": 3211.73,
+      "tokens/trainable": 5249796
+    },
+    {
+      "epoch": 1.2038216560509554,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.706846864280913e-05,
+      "loss": 0.00665281992405653,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00667,
+      "step": 378,
+      "tokens/total": 49487872,
+      "tokens/train_per_sec_per_gpu": 3615.73,
+      "tokens/trainable": 5264940
+    },
+    {
+      "epoch": 1.2070063694267517,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.704229731576435e-05,
+      "loss": 0.009321301244199276,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00936,
+      "step": 379,
+      "tokens/total": 49618944,
+      "tokens/train_per_sec_per_gpu": 3521.87,
+      "tokens/trainable": 5279679
+    },
+    {
+      "epoch": 1.2101910828025477,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 4.701601702746405e-05,
+      "loss": 0.009726524353027344,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00977,
+      "step": 380,
+      "tokens/total": 49750016,
+      "tokens/train_per_sec_per_gpu": 3758.86,
+      "tokens/trainable": 5295322
+    },
+    {
+      "epoch": 1.213375796178344,
+      "grad_norm": 0.138671875,
+      "learning_rate": 4.698962790781906e-05,
+      "loss": 0.00720211723819375,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00723,
+      "step": 381,
+      "tokens/total": 49881088,
+      "tokens/train_per_sec_per_gpu": 3392.46,
+      "tokens/trainable": 5309524
+    },
+    {
+      "epoch": 1.21656050955414,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 4.696313008727819e-05,
+      "loss": 0.009434825740754604,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00948,
+      "step": 382,
+      "tokens/total": 50012160,
+      "tokens/train_per_sec_per_gpu": 3237.9,
+      "tokens/trainable": 5323073
+    },
+    {
+      "epoch": 1.2197452229299364,
+      "grad_norm": 0.203125,
+      "learning_rate": 4.6936523696827615e-05,
+      "loss": 0.013360480777919292,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01345,
+      "step": 383,
+      "tokens/total": 50143232,
+      "tokens/train_per_sec_per_gpu": 3386.03,
+      "tokens/trainable": 5337238
+    },
+    {
+      "epoch": 1.2229299363057324,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.690980886799016e-05,
+      "loss": 0.009163031354546547,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00921,
+      "step": 384,
+      "tokens/total": 50274304,
+      "tokens/train_per_sec_per_gpu": 3800.38,
+      "tokens/trainable": 5353034
+    },
+    {
+      "epoch": 1.2261146496815287,
+      "grad_norm": 0.142578125,
+      "learning_rate": 4.688298573282473e-05,
+      "loss": 0.006065514404326677,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00608,
+      "step": 385,
+      "tokens/total": 50405376,
+      "tokens/train_per_sec_per_gpu": 3325.28,
+      "tokens/trainable": 5366994
+    },
+    {
+      "epoch": 1.2292993630573248,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 4.685605442392559e-05,
+      "loss": 0.007522703614085913,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00755,
+      "step": 386,
+      "tokens/total": 50536448,
+      "tokens/train_per_sec_per_gpu": 3244.1,
+      "tokens/trainable": 5380585
+    },
+    {
+      "epoch": 1.232484076433121,
+      "grad_norm": 0.158203125,
+      "learning_rate": 4.6829015074421754e-05,
+      "loss": 0.008297629654407501,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00833,
+      "step": 387,
+      "tokens/total": 50667520,
+      "tokens/train_per_sec_per_gpu": 3675.24,
+      "tokens/trainable": 5395883
+    },
+    {
+      "epoch": 1.2356687898089171,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 4.680186781797632e-05,
+      "loss": 0.008283684030175209,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00832,
+      "step": 388,
+      "tokens/total": 50798592,
+      "tokens/train_per_sec_per_gpu": 3323.41,
+      "tokens/trainable": 5409819
+    },
+    {
+      "epoch": 1.2388535031847134,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.677461278878577e-05,
+      "loss": 0.009029434062540531,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00907,
+      "step": 389,
+      "tokens/total": 50929664,
+      "tokens/train_per_sec_per_gpu": 2967.79,
+      "tokens/trainable": 5422282
+    },
+    {
+      "epoch": 1.2420382165605095,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 4.674725012157936e-05,
+      "loss": 0.0059669832699000835,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00598,
+      "step": 390,
+      "tokens/total": 51060736,
+      "tokens/train_per_sec_per_gpu": 3230.78,
+      "tokens/trainable": 5435820
+    },
+    {
+      "epoch": 1.2452229299363058,
+      "grad_norm": 0.14453125,
+      "learning_rate": 4.671977995161843e-05,
+      "loss": 0.005600204225629568,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00562,
+      "step": 391,
+      "tokens/total": 51191808,
+      "tokens/train_per_sec_per_gpu": 3398.04,
+      "tokens/trainable": 5450055
+    },
+    {
+      "epoch": 1.2484076433121019,
+      "grad_norm": 0.166015625,
+      "learning_rate": 4.669220241469573e-05,
+      "loss": 0.007735088467597961,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00777,
+      "step": 392,
+      "tokens/total": 51322880,
+      "tokens/train_per_sec_per_gpu": 3315.69,
+      "tokens/trainable": 5463943
+    },
+    {
+      "epoch": 1.2515923566878981,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 4.666451764713475e-05,
+      "loss": 0.010222709737718105,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01028,
+      "step": 393,
+      "tokens/total": 51453952,
+      "tokens/train_per_sec_per_gpu": 3438.45,
+      "tokens/trainable": 5478266
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "grad_norm": 0.154296875,
+      "learning_rate": 4.663672578578908e-05,
+      "loss": 0.007789981085807085,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00782,
+      "step": 394,
+      "tokens/total": 51585024,
+      "tokens/train_per_sec_per_gpu": 3144.21,
+      "tokens/trainable": 5491440
+    },
+    {
+      "epoch": 1.2579617834394905,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 4.660882696804165e-05,
+      "loss": 0.01257528830319643,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01265,
+      "step": 395,
+      "tokens/total": 51716096,
+      "tokens/train_per_sec_per_gpu": 3704.11,
+      "tokens/trainable": 5506947
+    },
+    {
+      "epoch": 1.2611464968152866,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.658082133180416e-05,
+      "loss": 0.007808534894138575,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00784,
+      "step": 396,
+      "tokens/total": 51847168,
+      "tokens/train_per_sec_per_gpu": 3138.89,
+      "tokens/trainable": 5520102
+    },
+    {
+      "epoch": 1.2643312101910829,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.655270901551632e-05,
+      "loss": 0.008749695494771004,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00879,
+      "step": 397,
+      "tokens/total": 51978240,
+      "tokens/train_per_sec_per_gpu": 3068.52,
+      "tokens/trainable": 5532992
+    },
+    {
+      "epoch": 1.267515923566879,
+      "grad_norm": 0.193359375,
+      "learning_rate": 4.652449015814518e-05,
+      "loss": 0.010582723654806614,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01064,
+      "step": 398,
+      "tokens/total": 52109312,
+      "tokens/train_per_sec_per_gpu": 3477.89,
+      "tokens/trainable": 5547568
+    },
+    {
+      "epoch": 1.2707006369426752,
+      "grad_norm": 0.177734375,
+      "learning_rate": 4.649616489918448e-05,
+      "loss": 0.007580795791000128,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00761,
+      "step": 399,
+      "tokens/total": 52240384,
+      "tokens/train_per_sec_per_gpu": 3136.71,
+      "tokens/trainable": 5560738
+    },
+    {
+      "epoch": 1.2738853503184713,
+      "grad_norm": 0.177734375,
+      "learning_rate": 4.646773337865391e-05,
+      "loss": 0.00638965331017971,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00641,
+      "step": 400,
+      "tokens/total": 52371456,
+      "tokens/train_per_sec_per_gpu": 3189.44,
+      "tokens/trainable": 5574146
+    },
+    {
+      "epoch": 1.2770700636942676,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.643919573709843e-05,
+      "loss": 0.007701355963945389,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00773,
+      "step": 401,
+      "tokens/total": 52502528,
+      "tokens/train_per_sec_per_gpu": 3217.16,
+      "tokens/trainable": 5587632
+    },
+    {
+      "epoch": 1.2802547770700636,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 4.641055211558762e-05,
+      "loss": 0.009735530242323875,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00978,
+      "step": 402,
+      "tokens/total": 52633600,
+      "tokens/train_per_sec_per_gpu": 3104.94,
+      "tokens/trainable": 5600617
+    },
+    {
+      "epoch": 1.28343949044586,
+      "grad_norm": 0.193359375,
+      "learning_rate": 4.6381802655714946e-05,
+      "loss": 0.009511996060609818,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00956,
+      "step": 403,
+      "tokens/total": 52764672,
+      "tokens/train_per_sec_per_gpu": 3181.89,
+      "tokens/trainable": 5613940
+    },
+    {
+      "epoch": 1.286624203821656,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.6352947499597024e-05,
+      "loss": 0.008532877080142498,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00857,
+      "step": 404,
+      "tokens/total": 52895744,
+      "tokens/train_per_sec_per_gpu": 3220.05,
+      "tokens/trainable": 5627419
+    },
+    {
+      "epoch": 1.2898089171974523,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.632398678987298e-05,
+      "loss": 0.007435362320393324,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00746,
+      "step": 405,
+      "tokens/total": 53026816,
+      "tokens/train_per_sec_per_gpu": 3293.31,
+      "tokens/trainable": 5641255
+    },
+    {
+      "epoch": 1.2929936305732483,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.629492066970373e-05,
+      "loss": 0.009640632197260857,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00969,
+      "step": 406,
+      "tokens/total": 53157888,
+      "tokens/train_per_sec_per_gpu": 3502.03,
+      "tokens/trainable": 5655889
+    },
+    {
+      "epoch": 1.2961783439490446,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 4.626574928277127e-05,
+      "loss": 0.00989444274455309,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00994,
+      "step": 407,
+      "tokens/total": 53288960,
+      "tokens/train_per_sec_per_gpu": 3544.59,
+      "tokens/trainable": 5670642
+    },
+    {
+      "epoch": 1.2993630573248407,
+      "grad_norm": 0.23828125,
+      "learning_rate": 4.623647277327792e-05,
+      "loss": 0.009198141284286976,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00924,
+      "step": 408,
+      "tokens/total": 53420032,
+      "tokens/train_per_sec_per_gpu": 3306.22,
+      "tokens/trainable": 5684524
+    },
+    {
+      "epoch": 1.302547770700637,
+      "grad_norm": 0.216796875,
+      "learning_rate": 4.6207091285945694e-05,
+      "loss": 0.010384837165474892,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01044,
+      "step": 409,
+      "tokens/total": 53551104,
+      "tokens/train_per_sec_per_gpu": 3444.91,
+      "tokens/trainable": 5698889
+    },
+    {
+      "epoch": 1.305732484076433,
+      "grad_norm": 0.1640625,
+      "learning_rate": 4.61776049660155e-05,
+      "loss": 0.0068597206845879555,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00688,
+      "step": 410,
+      "tokens/total": 53682176,
+      "tokens/train_per_sec_per_gpu": 3083.21,
+      "tokens/trainable": 5711820
+    },
+    {
+      "epoch": 1.3089171974522293,
+      "grad_norm": 0.125,
+      "learning_rate": 4.614801395924649e-05,
+      "loss": 0.005090971477329731,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0051,
+      "step": 411,
+      "tokens/total": 53813248,
+      "tokens/train_per_sec_per_gpu": 3042.48,
+      "tokens/trainable": 5724613
+    },
+    {
+      "epoch": 1.3121019108280254,
+      "grad_norm": 0.142578125,
+      "learning_rate": 4.611831841191533e-05,
+      "loss": 0.005095964763313532,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00511,
+      "step": 412,
+      "tokens/total": 53944320,
+      "tokens/train_per_sec_per_gpu": 3189.46,
+      "tokens/trainable": 5737985
+    },
+    {
+      "epoch": 1.3152866242038217,
+      "grad_norm": 0.177734375,
+      "learning_rate": 4.608851847081542e-05,
+      "loss": 0.009599323384463787,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00965,
+      "step": 413,
+      "tokens/total": 54075392,
+      "tokens/train_per_sec_per_gpu": 3425.1,
+      "tokens/trainable": 5752257
+    },
+    {
+      "epoch": 1.3184713375796178,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 4.6058614283256205e-05,
+      "loss": 0.007107466459274292,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00713,
+      "step": 414,
+      "tokens/total": 54206464,
+      "tokens/train_per_sec_per_gpu": 3284.05,
+      "tokens/trainable": 5766000
+    },
+    {
+      "epoch": 1.321656050955414,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 4.60286059970625e-05,
+      "loss": 0.009428326040506363,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00947,
+      "step": 415,
+      "tokens/total": 54337536,
+      "tokens/train_per_sec_per_gpu": 3375.64,
+      "tokens/trainable": 5780141
+    },
+    {
+      "epoch": 1.3248407643312101,
+      "grad_norm": 0.1484375,
+      "learning_rate": 4.599849376057366e-05,
+      "loss": 0.006207283120602369,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00623,
+      "step": 416,
+      "tokens/total": 54468608,
+      "tokens/train_per_sec_per_gpu": 3150.96,
+      "tokens/trainable": 5793324
+    },
+    {
+      "epoch": 1.3280254777070064,
+      "grad_norm": 0.193359375,
+      "learning_rate": 4.5968277722642915e-05,
+      "loss": 0.011342452839016914,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01141,
+      "step": 417,
+      "tokens/total": 54599680,
+      "tokens/train_per_sec_per_gpu": 3068.96,
+      "tokens/trainable": 5806288
+    },
+    {
+      "epoch": 1.3312101910828025,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 4.593795803263661e-05,
+      "loss": 0.0096285380423069,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00968,
+      "step": 418,
+      "tokens/total": 54730752,
+      "tokens/train_per_sec_per_gpu": 3414.96,
+      "tokens/trainable": 5820535
+    },
+    {
+      "epoch": 1.3343949044585988,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.590753484043348e-05,
+      "loss": 0.008351242169737816,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00839,
+      "step": 419,
+      "tokens/total": 54861824,
+      "tokens/train_per_sec_per_gpu": 3383.61,
+      "tokens/trainable": 5834705
+    },
+    {
+      "epoch": 1.3375796178343948,
+      "grad_norm": 0.20703125,
+      "learning_rate": 4.5877008296423886e-05,
+      "loss": 0.010140678845345974,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01019,
+      "step": 420,
+      "tokens/total": 54992896,
+      "tokens/train_per_sec_per_gpu": 3557.31,
+      "tokens/trainable": 5849593
+    },
+    {
+      "epoch": 1.3407643312101911,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 4.5846378551509097e-05,
+      "loss": 0.003956064116209745,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00396,
+      "step": 421,
+      "tokens/total": 55123968,
+      "tokens/train_per_sec_per_gpu": 3127.02,
+      "tokens/trainable": 5862715
+    },
+    {
+      "epoch": 1.3439490445859872,
+      "grad_norm": 0.19921875,
+      "learning_rate": 4.581564575710053e-05,
+      "loss": 0.011450878344476223,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01152,
+      "step": 422,
+      "tokens/total": 55255040,
+      "tokens/train_per_sec_per_gpu": 3045.36,
+      "tokens/trainable": 5875602
+    },
+    {
+      "epoch": 1.3471337579617835,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.5784810065119e-05,
+      "loss": 0.008104214444756508,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00814,
+      "step": 423,
+      "tokens/total": 55386112,
+      "tokens/train_per_sec_per_gpu": 3284.72,
+      "tokens/trainable": 5889428
+    },
+    {
+      "epoch": 1.3503184713375795,
+      "grad_norm": 0.14453125,
+      "learning_rate": 4.575387162799399e-05,
+      "loss": 0.006891798693686724,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00692,
+      "step": 424,
+      "tokens/total": 55517184,
+      "tokens/train_per_sec_per_gpu": 3722.92,
+      "tokens/trainable": 5904973
+    },
+    {
+      "epoch": 1.3535031847133758,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.5722830598662854e-05,
+      "loss": 0.009776144288480282,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00982,
+      "step": 425,
+      "tokens/total": 55648256,
+      "tokens/train_per_sec_per_gpu": 3412.77,
+      "tokens/trainable": 5919245
+    },
+    {
+      "epoch": 1.356687898089172,
+      "grad_norm": 0.166015625,
+      "learning_rate": 4.56916871305701e-05,
+      "loss": 0.007931388914585114,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00796,
+      "step": 426,
+      "tokens/total": 55779328,
+      "tokens/train_per_sec_per_gpu": 3407.46,
+      "tokens/trainable": 5933535
+    },
+    {
+      "epoch": 1.3598726114649682,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 4.5660441377666654e-05,
+      "loss": 0.008083492517471313,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00812,
+      "step": 427,
+      "tokens/total": 55910400,
+      "tokens/train_per_sec_per_gpu": 3599.03,
+      "tokens/trainable": 5948481
+    },
+    {
+      "epoch": 1.3630573248407643,
+      "grad_norm": 0.1484375,
+      "learning_rate": 4.562909349440899e-05,
+      "loss": 0.006925994995981455,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00695,
+      "step": 428,
+      "tokens/total": 56041472,
+      "tokens/train_per_sec_per_gpu": 3517.79,
+      "tokens/trainable": 5963175
+    },
+    {
+      "epoch": 1.3662420382165605,
+      "grad_norm": 0.1484375,
+      "learning_rate": 4.559764363575851e-05,
+      "loss": 0.008385020308196545,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00842,
+      "step": 429,
+      "tokens/total": 56172544,
+      "tokens/train_per_sec_per_gpu": 3404.08,
+      "tokens/trainable": 5977423
+    },
+    {
+      "epoch": 1.3694267515923566,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.556609195718068e-05,
+      "loss": 0.005221434403210878,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00524,
+      "step": 430,
+      "tokens/total": 56303616,
+      "tokens/train_per_sec_per_gpu": 3212.05,
+      "tokens/trainable": 5990835
+    },
+    {
+      "epoch": 1.372611464968153,
+      "grad_norm": 0.193359375,
+      "learning_rate": 4.5534438614644294e-05,
+      "loss": 0.009253652766346931,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0093,
+      "step": 431,
+      "tokens/total": 56434688,
+      "tokens/train_per_sec_per_gpu": 3584.45,
+      "tokens/trainable": 6005749
+    },
+    {
+      "epoch": 1.3757961783439492,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 4.550268376462068e-05,
+      "loss": 0.009988540783524513,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01004,
+      "step": 432,
+      "tokens/total": 56565760,
+      "tokens/train_per_sec_per_gpu": 3148.92,
+      "tokens/trainable": 6018952
+    },
+    {
+      "epoch": 1.3789808917197452,
+      "grad_norm": 0.166015625,
+      "learning_rate": 4.547082756408299e-05,
+      "loss": 0.007521233521401882,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00755,
+      "step": 433,
+      "tokens/total": 56696832,
+      "tokens/train_per_sec_per_gpu": 3309.63,
+      "tokens/trainable": 6032837
+    },
+    {
+      "epoch": 1.3821656050955413,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 4.543887017050534e-05,
+      "loss": 0.005825295113027096,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00584,
+      "step": 434,
+      "tokens/total": 56827904,
+      "tokens/train_per_sec_per_gpu": 3375.36,
+      "tokens/trainable": 6046929
+    },
+    {
+      "epoch": 1.3853503184713376,
+      "grad_norm": 0.2265625,
+      "learning_rate": 4.540681174186209e-05,
+      "loss": 0.011601070873439312,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01167,
+      "step": 435,
+      "tokens/total": 56958976,
+      "tokens/train_per_sec_per_gpu": 3215.41,
+      "tokens/trainable": 6060425
+    },
+    {
+      "epoch": 1.388535031847134,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 4.537465243662704e-05,
+      "loss": 0.008219108916819096,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00825,
+      "step": 436,
+      "tokens/total": 57090048,
+      "tokens/train_per_sec_per_gpu": 3133.32,
+      "tokens/trainable": 6073533
+    },
+    {
+      "epoch": 1.39171974522293,
+      "grad_norm": 0.140625,
+      "learning_rate": 4.534239241377266e-05,
+      "loss": 0.007054620422422886,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00708,
+      "step": 437,
+      "tokens/total": 57221120,
+      "tokens/train_per_sec_per_gpu": 3767.46,
+      "tokens/trainable": 6089174
+    },
+    {
+      "epoch": 1.394904458598726,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 4.5310031832769275e-05,
+      "loss": 0.007198185659945011,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00722,
+      "step": 438,
+      "tokens/total": 57352192,
+      "tokens/train_per_sec_per_gpu": 3417.69,
+      "tokens/trainable": 6103402
+    },
+    {
+      "epoch": 1.3980891719745223,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 4.527757085358431e-05,
+      "loss": 0.007888494990766048,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00792,
+      "step": 439,
+      "tokens/total": 57483264,
+      "tokens/train_per_sec_per_gpu": 3744.21,
+      "tokens/trainable": 6119012
+    },
+    {
+      "epoch": 1.4012738853503186,
+      "grad_norm": 0.19140625,
+      "learning_rate": 4.52450096366815e-05,
+      "loss": 0.010496556758880615,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01055,
+      "step": 440,
+      "tokens/total": 57614336,
+      "tokens/train_per_sec_per_gpu": 3474.71,
+      "tokens/trainable": 6133429
+    },
+    {
+      "epoch": 1.4044585987261147,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 4.521234834302006e-05,
+      "loss": 0.008718312717974186,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00876,
+      "step": 441,
+      "tokens/total": 57745408,
+      "tokens/train_per_sec_per_gpu": 3481.7,
+      "tokens/trainable": 6147945
+    },
+    {
+      "epoch": 1.4076433121019107,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 4.5179587134053916e-05,
+      "loss": 0.01150327455252409,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01157,
+      "step": 442,
+      "tokens/total": 57876480,
+      "tokens/train_per_sec_per_gpu": 3229.57,
+      "tokens/trainable": 6161469
+    },
+    {
+      "epoch": 1.410828025477707,
+      "grad_norm": 0.216796875,
+      "learning_rate": 4.514672617173091e-05,
+      "loss": 0.011761811561882496,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01183,
+      "step": 443,
+      "tokens/total": 58007552,
+      "tokens/train_per_sec_per_gpu": 3416.37,
+      "tokens/trainable": 6175738
+    },
+    {
+      "epoch": 1.4140127388535033,
+      "grad_norm": 0.177734375,
+      "learning_rate": 4.511376561849201e-05,
+      "loss": 0.008984040468931198,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00902,
+      "step": 444,
+      "tokens/total": 58138624,
+      "tokens/train_per_sec_per_gpu": 3352.83,
+      "tokens/trainable": 6189737
+    },
+    {
+      "epoch": 1.4171974522292994,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.5080705637270446e-05,
+      "loss": 0.006133932154625654,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00615,
+      "step": 445,
+      "tokens/total": 58269696,
+      "tokens/train_per_sec_per_gpu": 3183.46,
+      "tokens/trainable": 6203050
+    },
+    {
+      "epoch": 1.4203821656050954,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.5047546391491e-05,
+      "loss": 0.008717117831110954,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00876,
+      "step": 446,
+      "tokens/total": 58400768,
+      "tokens/train_per_sec_per_gpu": 3819.2,
+      "tokens/trainable": 6218900
+    },
+    {
+      "epoch": 1.4235668789808917,
+      "grad_norm": 0.14453125,
+      "learning_rate": 4.50142880450691e-05,
+      "loss": 0.006517563946545124,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00654,
+      "step": 447,
+      "tokens/total": 58531840,
+      "tokens/train_per_sec_per_gpu": 3083.03,
+      "tokens/trainable": 6231819
+    },
+    {
+      "epoch": 1.426751592356688,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 4.4980930762410084e-05,
+      "loss": 0.010371977463364601,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01043,
+      "step": 448,
+      "tokens/total": 58662912,
+      "tokens/train_per_sec_per_gpu": 3608.97,
+      "tokens/trainable": 6246842
+    },
+    {
+      "epoch": 1.429936305732484,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.4947474708408353e-05,
+      "loss": 0.00814439170062542,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00818,
+      "step": 449,
+      "tokens/total": 58793984,
+      "tokens/train_per_sec_per_gpu": 3576.32,
+      "tokens/trainable": 6261750
+    },
+    {
+      "epoch": 1.4331210191082802,
+      "grad_norm": 0.181640625,
+      "learning_rate": 4.491392004844656e-05,
+      "loss": 0.00930082332342863,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00934,
+      "step": 450,
+      "tokens/total": 58925056,
+      "tokens/train_per_sec_per_gpu": 3149.76,
+      "tokens/trainable": 6274962
+    },
+    {
+      "epoch": 1.4363057324840764,
+      "grad_norm": 0.1875,
+      "learning_rate": 4.48802669483948e-05,
+      "loss": 0.01012382097542286,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01018,
+      "step": 451,
+      "tokens/total": 59056128,
+      "tokens/train_per_sec_per_gpu": 3375.6,
+      "tokens/trainable": 6289094
+    },
+    {
+      "epoch": 1.4394904458598727,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.484651557460978e-05,
+      "loss": 0.007823411375284195,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00785,
+      "step": 452,
+      "tokens/total": 59187200,
+      "tokens/train_per_sec_per_gpu": 3541.02,
+      "tokens/trainable": 6303818
+    },
+    {
+      "epoch": 1.4426751592356688,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 4.4812666093934e-05,
+      "loss": 0.010683316737413406,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01074,
+      "step": 453,
+      "tokens/total": 59318272,
+      "tokens/train_per_sec_per_gpu": 3656.59,
+      "tokens/trainable": 6319060
+    },
+    {
+      "epoch": 1.4458598726114649,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 4.477871867369494e-05,
+      "loss": 0.01043397095054388,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01049,
+      "step": 454,
+      "tokens/total": 59449344,
+      "tokens/train_per_sec_per_gpu": 3638.87,
+      "tokens/trainable": 6334323
+    },
+    {
+      "epoch": 1.4490445859872612,
+      "grad_norm": 0.16015625,
+      "learning_rate": 4.474467348170421e-05,
+      "loss": 0.008449015207588673,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00848,
+      "step": 455,
+      "tokens/total": 59580416,
+      "tokens/train_per_sec_per_gpu": 3481.97,
+      "tokens/trainable": 6348832
+    },
+    {
+      "epoch": 1.4522292993630574,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 4.471053068625674e-05,
+      "loss": 0.008155008777976036,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00819,
+      "step": 456,
+      "tokens/total": 59711488,
+      "tokens/train_per_sec_per_gpu": 3541.25,
+      "tokens/trainable": 6363586
+    },
+    {
+      "epoch": 1.4554140127388535,
+      "grad_norm": 0.15625,
+      "learning_rate": 4.467629045612994e-05,
+      "loss": 0.008736428804695606,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00877,
+      "step": 457,
+      "tokens/total": 59842560,
+      "tokens/train_per_sec_per_gpu": 3481.13,
+      "tokens/trainable": 6378173
+    },
+    {
+      "epoch": 1.4585987261146496,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.4641952960582877e-05,
+      "loss": 0.013414832763373852,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01351,
+      "step": 458,
+      "tokens/total": 59973632,
+      "tokens/train_per_sec_per_gpu": 3571.41,
+      "tokens/trainable": 6393061
+    },
+    {
+      "epoch": 1.4617834394904459,
+      "grad_norm": 0.208984375,
+      "learning_rate": 4.4607518369355403e-05,
+      "loss": 0.008803540840744972,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00884,
+      "step": 459,
+      "tokens/total": 60104704,
+      "tokens/train_per_sec_per_gpu": 3270.87,
+      "tokens/trainable": 6406746
+    },
+    {
+      "epoch": 1.4649681528662422,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 4.457298685266737e-05,
+      "loss": 0.008787565864622593,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00883,
+      "step": 460,
+      "tokens/total": 60235776,
+      "tokens/train_per_sec_per_gpu": 3181.98,
+      "tokens/trainable": 6420083
+    },
+    {
+      "epoch": 1.4681528662420382,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 4.453835858121773e-05,
+      "loss": 0.008562528528273106,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0086,
+      "step": 461,
+      "tokens/total": 60366848,
+      "tokens/train_per_sec_per_gpu": 3258.95,
+      "tokens/trainable": 6433715
+    },
+    {
+      "epoch": 1.4713375796178343,
+      "grad_norm": 0.162109375,
+      "learning_rate": 4.450363372618376e-05,
+      "loss": 0.0074198306538164616,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00745,
+      "step": 462,
+      "tokens/total": 60497920,
+      "tokens/train_per_sec_per_gpu": 3544.34,
+      "tokens/trainable": 6448466
+    },
+    {
+      "epoch": 1.4745222929936306,
+      "grad_norm": 0.1484375,
+      "learning_rate": 4.4468812459220135e-05,
+      "loss": 0.006448620930314064,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00647,
+      "step": 463,
+      "tokens/total": 60628992,
+      "tokens/train_per_sec_per_gpu": 3102.02,
+      "tokens/trainable": 6461464
+    },
+    {
+      "epoch": 1.4777070063694269,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 4.4433894952458156e-05,
+      "loss": 0.008648392744362354,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00869,
+      "step": 464,
+      "tokens/total": 60760064,
+      "tokens/train_per_sec_per_gpu": 3168.95,
+      "tokens/trainable": 6475263
+    },
+    {
+      "epoch": 1.480891719745223,
+      "grad_norm": 0.15234375,
+      "learning_rate": 4.439888137850483e-05,
+      "loss": 0.008528076112270355,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00856,
+      "step": 465,
+      "tokens/total": 60891136,
+      "tokens/train_per_sec_per_gpu": 3278.68,
+      "tokens/trainable": 6488927
+    },
+    {
+      "epoch": 1.484076433121019,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 4.436377191044208e-05,
+      "loss": 0.009064987301826477,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00911,
+      "step": 466,
+      "tokens/total": 61022208,
+      "tokens/train_per_sec_per_gpu": 3401.94,
+      "tokens/trainable": 6503171
+    },
+    {
+      "epoch": 1.4872611464968153,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 4.4328566721825846e-05,
+      "loss": 0.009180644527077675,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00922,
+      "step": 467,
+      "tokens/total": 61153280,
+      "tokens/train_per_sec_per_gpu": 3399.29,
+      "tokens/trainable": 6517402
+    },
+    {
+      "epoch": 1.4904458598726116,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 4.4293265986685264e-05,
+      "loss": 0.00970767717808485,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00975,
+      "step": 468,
+      "tokens/total": 61284352,
+      "tokens/train_per_sec_per_gpu": 2969.25,
+      "tokens/trainable": 6529847
+    },
+    {
+      "epoch": 1.4936305732484076,
+      "grad_norm": 0.154296875,
+      "learning_rate": 4.425786987952174e-05,
+      "loss": 0.009157263673841953,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0092,
+      "step": 469,
+      "tokens/total": 61415424,
+      "tokens/train_per_sec_per_gpu": 3645.56,
+      "tokens/trainable": 6545001
+    },
+    {
+      "epoch": 1.4968152866242037,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 4.4222378575308164e-05,
+      "loss": 0.0058856685645878315,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0059,
+      "step": 470,
+      "tokens/total": 61546496,
+      "tokens/train_per_sec_per_gpu": 3065.44,
+      "tokens/trainable": 6557875
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 4.4186792249488005e-05,
+      "loss": 0.006844916380941868,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00687,
+      "step": 471,
+      "tokens/total": 61677568,
+      "tokens/train_per_sec_per_gpu": 3406.5,
+      "tokens/trainable": 6572069
+    },
+    {
+      "epoch": 1.5,
+      "eval_loss": 0.009513070806860924,
+      "eval_ppl": 1.00956,
+      "eval_runtime": 41.9975,
+      "eval_samples_per_second": 64.313,
+      "eval_steps_per_second": 4.024,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 471
+    },
+    {
+      "epoch": 1.5031847133757963,
+      "grad_norm": 0.18359375,
+      "learning_rate": 4.415111107797445e-05,
+      "loss": 0.007119299378246069,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00714,
+      "step": 472,
+      "tokens/total": 61808640,
+      "tokens/train_per_sec_per_gpu": 3291.47,
+      "tokens/trainable": 6585775
+    },
+    {
+      "epoch": 1.5063694267515924,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.411533523714954e-05,
+      "loss": 0.007842868566513062,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00787,
+      "step": 473,
+      "tokens/total": 61939712,
+      "tokens/train_per_sec_per_gpu": 3180.93,
+      "tokens/trainable": 6599115
+    },
+    {
+      "epoch": 1.5095541401273884,
+      "grad_norm": 0.181640625,
+      "learning_rate": 4.4079464903863266e-05,
+      "loss": 0.008342721499502659,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00838,
+      "step": 474,
+      "tokens/total": 62070784,
+      "tokens/train_per_sec_per_gpu": 3367.39,
+      "tokens/trainable": 6613147
+    },
+    {
+      "epoch": 1.5127388535031847,
+      "grad_norm": 0.171875,
+      "learning_rate": 4.404350025543276e-05,
+      "loss": 0.010307609103620052,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01036,
+      "step": 475,
+      "tokens/total": 62201856,
+      "tokens/train_per_sec_per_gpu": 3430.72,
+      "tokens/trainable": 6627509
+    },
+    {
+      "epoch": 1.515923566878981,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.400744146964136e-05,
+      "loss": 0.008362861350178719,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0084,
+      "step": 476,
+      "tokens/total": 62332928,
+      "tokens/train_per_sec_per_gpu": 3051.31,
+      "tokens/trainable": 6640317
+    },
+    {
+      "epoch": 1.519108280254777,
+      "grad_norm": 0.232421875,
+      "learning_rate": 4.3971288724737745e-05,
+      "loss": 0.009740196168422699,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00979,
+      "step": 477,
+      "tokens/total": 62464000,
+      "tokens/train_per_sec_per_gpu": 2966.55,
+      "tokens/trainable": 6652748
+    },
+    {
+      "epoch": 1.5222929936305731,
+      "grad_norm": 0.1328125,
+      "learning_rate": 4.393504219943509e-05,
+      "loss": 0.004925255198031664,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00494,
+      "step": 478,
+      "tokens/total": 62595072,
+      "tokens/train_per_sec_per_gpu": 3160.26,
+      "tokens/trainable": 6666019
+    },
+    {
+      "epoch": 1.5254777070063694,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.3898702072910095e-05,
+      "loss": 0.008841407485306263,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00888,
+      "step": 479,
+      "tokens/total": 62726144,
+      "tokens/train_per_sec_per_gpu": 3085.05,
+      "tokens/trainable": 6679004
+    },
+    {
+      "epoch": 1.5286624203821657,
+      "grad_norm": 0.15625,
+      "learning_rate": 4.386226852480223e-05,
+      "loss": 0.007529627997428179,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00756,
+      "step": 480,
+      "tokens/total": 62857216,
+      "tokens/train_per_sec_per_gpu": 3263.46,
+      "tokens/trainable": 6692672
+    },
+    {
+      "epoch": 1.5318471337579618,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 4.382574173521272e-05,
+      "loss": 0.006781514268368483,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0068,
+      "step": 481,
+      "tokens/total": 62988288,
+      "tokens/train_per_sec_per_gpu": 3583.46,
+      "tokens/trainable": 6707600
+    },
+    {
+      "epoch": 1.5350318471337578,
+      "grad_norm": 0.16015625,
+      "learning_rate": 4.378912188470373e-05,
+      "loss": 0.0076340967789292336,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00766,
+      "step": 482,
+      "tokens/total": 63119360,
+      "tokens/train_per_sec_per_gpu": 3108.97,
+      "tokens/trainable": 6720612
+    },
+    {
+      "epoch": 1.5382165605095541,
+      "grad_norm": 0.212890625,
+      "learning_rate": 4.375240915429745e-05,
+      "loss": 0.009363564662635326,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00941,
+      "step": 483,
+      "tokens/total": 63250432,
+      "tokens/train_per_sec_per_gpu": 3151.23,
+      "tokens/trainable": 6733897
+    },
+    {
+      "epoch": 1.5414012738853504,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 4.3715603725475195e-05,
+      "loss": 0.008497594855725765,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00853,
+      "step": 484,
+      "tokens/total": 63381504,
+      "tokens/train_per_sec_per_gpu": 3630.74,
+      "tokens/trainable": 6749020
+    },
+    {
+      "epoch": 1.5445859872611465,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 4.367870578017653e-05,
+      "loss": 0.004754690453410149,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00477,
+      "step": 485,
+      "tokens/total": 63512576,
+      "tokens/train_per_sec_per_gpu": 3401.99,
+      "tokens/trainable": 6763237
+    },
+    {
+      "epoch": 1.5477707006369426,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.364171550079833e-05,
+      "loss": 0.010673021897673607,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01073,
+      "step": 486,
+      "tokens/total": 63643648,
+      "tokens/train_per_sec_per_gpu": 3289.49,
+      "tokens/trainable": 6777048
+    },
+    {
+      "epoch": 1.5509554140127388,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.3604633070193915e-05,
+      "loss": 0.009158292785286903,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0092,
+      "step": 487,
+      "tokens/total": 63774720,
+      "tokens/train_per_sec_per_gpu": 3323.4,
+      "tokens/trainable": 6790934
+    },
+    {
+      "epoch": 1.5541401273885351,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 4.3567458671672154e-05,
+      "loss": 0.007650249172002077,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00768,
+      "step": 488,
+      "tokens/total": 63905792,
+      "tokens/train_per_sec_per_gpu": 3747.82,
+      "tokens/trainable": 6806546
+    },
+    {
+      "epoch": 1.5573248407643312,
+      "grad_norm": 0.142578125,
+      "learning_rate": 4.35301924889965e-05,
+      "loss": 0.006640854757279158,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00666,
+      "step": 489,
+      "tokens/total": 64036864,
+      "tokens/train_per_sec_per_gpu": 3390.74,
+      "tokens/trainable": 6820768
+    },
+    {
+      "epoch": 1.5605095541401273,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 4.3492834706384154e-05,
+      "loss": 0.008299214765429497,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00833,
+      "step": 490,
+      "tokens/total": 64167936,
+      "tokens/train_per_sec_per_gpu": 3305.77,
+      "tokens/trainable": 6834601
+    },
+    {
+      "epoch": 1.5636942675159236,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 4.345538550850512e-05,
+      "loss": 0.0071832421235740185,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00721,
+      "step": 491,
+      "tokens/total": 64299008,
+      "tokens/train_per_sec_per_gpu": 3301.85,
+      "tokens/trainable": 6848433
+    },
+    {
+      "epoch": 1.5668789808917198,
+      "grad_norm": 0.16796875,
+      "learning_rate": 4.3417845080481255e-05,
+      "loss": 0.008073330856859684,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00811,
+      "step": 492,
+      "tokens/total": 64430080,
+      "tokens/train_per_sec_per_gpu": 3378.84,
+      "tokens/trainable": 6862587
+    },
+    {
+      "epoch": 1.570063694267516,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 4.3380213607885443e-05,
+      "loss": 0.009880865924060345,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00993,
+      "step": 493,
+      "tokens/total": 64561152,
+      "tokens/train_per_sec_per_gpu": 3303.22,
+      "tokens/trainable": 6876421
+    },
+    {
+      "epoch": 1.573248407643312,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 4.3342491276740595e-05,
+      "loss": 0.008753279224038124,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00879,
+      "step": 494,
+      "tokens/total": 64692224,
+      "tokens/train_per_sec_per_gpu": 3323.89,
+      "tokens/trainable": 6890346
+    },
+    {
+      "epoch": 1.5764331210191083,
+      "grad_norm": 0.15234375,
+      "learning_rate": 4.3304678273518776e-05,
+      "loss": 0.009203528985381126,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00925,
+      "step": 495,
+      "tokens/total": 64823296,
+      "tokens/train_per_sec_per_gpu": 3358.21,
+      "tokens/trainable": 6904412
+    },
+    {
+      "epoch": 1.5796178343949046,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.326677478514024e-05,
+      "loss": 0.00659502949565649,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00662,
+      "step": 496,
+      "tokens/total": 64954368,
+      "tokens/train_per_sec_per_gpu": 3221.86,
+      "tokens/trainable": 6917910
+    },
+    {
+      "epoch": 1.5828025477707006,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 4.322878099897259e-05,
+      "loss": 0.009297506883740425,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00934,
+      "step": 497,
+      "tokens/total": 65085440,
+      "tokens/train_per_sec_per_gpu": 3425.35,
+      "tokens/trainable": 6932231
+    },
+    {
+      "epoch": 1.5859872611464967,
+      "grad_norm": 0.134765625,
+      "learning_rate": 4.319069710282974e-05,
+      "loss": 0.006143941078335047,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00616,
+      "step": 498,
+      "tokens/total": 65216512,
+      "tokens/train_per_sec_per_gpu": 3594.69,
+      "tokens/trainable": 6947330
+    },
+    {
+      "epoch": 1.589171974522293,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.315252328497107e-05,
+      "loss": 0.006281242705881596,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0063,
+      "step": 499,
+      "tokens/total": 65347584,
+      "tokens/train_per_sec_per_gpu": 3393.37,
+      "tokens/trainable": 6961586
+    },
+    {
+      "epoch": 1.5923566878980893,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 4.311425973410047e-05,
+      "loss": 0.007922859862446785,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00795,
+      "step": 500,
+      "tokens/total": 65478656,
+      "tokens/train_per_sec_per_gpu": 3263.1,
+      "tokens/trainable": 6975331
+    },
+    {
+      "epoch": 1.5955414012738853,
+      "grad_norm": 0.23046875,
+      "learning_rate": 4.307590663936541e-05,
+      "loss": 0.009491047821938992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00954,
+      "step": 501,
+      "tokens/total": 65609728,
+      "tokens/train_per_sec_per_gpu": 3050.63,
+      "tokens/trainable": 6988184
+    },
+    {
+      "epoch": 1.5987261146496814,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 4.3037464190355955e-05,
+      "loss": 0.007340395823121071,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00737,
+      "step": 502,
+      "tokens/total": 65740800,
+      "tokens/train_per_sec_per_gpu": 3185.7,
+      "tokens/trainable": 7001560
+    },
+    {
+      "epoch": 1.6019108280254777,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.299893257710394e-05,
+      "loss": 0.006943684071302414,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00697,
+      "step": 503,
+      "tokens/total": 65871872,
+      "tokens/train_per_sec_per_gpu": 3219.92,
+      "tokens/trainable": 7015042
+    },
+    {
+      "epoch": 1.605095541401274,
+      "grad_norm": 0.185546875,
+      "learning_rate": 4.2960311990081924e-05,
+      "loss": 0.009585048072040081,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00963,
+      "step": 504,
+      "tokens/total": 66002944,
+      "tokens/train_per_sec_per_gpu": 3349.25,
+      "tokens/trainable": 7029069
+    },
+    {
+      "epoch": 1.60828025477707,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.292160262020229e-05,
+      "loss": 0.007607592269778252,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00764,
+      "step": 505,
+      "tokens/total": 66134016,
+      "tokens/train_per_sec_per_gpu": 3369.3,
+      "tokens/trainable": 7043148
+    },
+    {
+      "epoch": 1.611464968152866,
+      "grad_norm": 0.16015625,
+      "learning_rate": 4.288280465881632e-05,
+      "loss": 0.009396728128194809,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00944,
+      "step": 506,
+      "tokens/total": 66265088,
+      "tokens/train_per_sec_per_gpu": 3676.06,
+      "tokens/trainable": 7058458
+    },
+    {
+      "epoch": 1.6146496815286624,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 4.2843918297713196e-05,
+      "loss": 0.007050440181046724,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00708,
+      "step": 507,
+      "tokens/total": 66396160,
+      "tokens/train_per_sec_per_gpu": 3145.55,
+      "tokens/trainable": 7071711
+    },
+    {
+      "epoch": 1.6178343949044587,
+      "grad_norm": 0.126953125,
+      "learning_rate": 4.2804943729119115e-05,
+      "loss": 0.007194128353148699,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00722,
+      "step": 508,
+      "tokens/total": 66527232,
+      "tokens/train_per_sec_per_gpu": 3462.21,
+      "tokens/trainable": 7086201
+    },
+    {
+      "epoch": 1.6210191082802548,
+      "grad_norm": 0.17578125,
+      "learning_rate": 4.2765881145696306e-05,
+      "loss": 0.00787313375622034,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0079,
+      "step": 509,
+      "tokens/total": 66658304,
+      "tokens/train_per_sec_per_gpu": 3054.79,
+      "tokens/trainable": 7099037
+    },
+    {
+      "epoch": 1.6242038216560508,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 4.272673074054205e-05,
+      "loss": 0.006892327219247818,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00692,
+      "step": 510,
+      "tokens/total": 66789376,
+      "tokens/train_per_sec_per_gpu": 3439.83,
+      "tokens/trainable": 7113420
+    },
+    {
+      "epoch": 1.627388535031847,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.268749270718778e-05,
+      "loss": 0.006877953186631203,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0069,
+      "step": 511,
+      "tokens/total": 66920448,
+      "tokens/train_per_sec_per_gpu": 3209.06,
+      "tokens/trainable": 7126871
+    },
+    {
+      "epoch": 1.6305732484076434,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 4.2648167239598115e-05,
+      "loss": 0.00894979014992714,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00899,
+      "step": 512,
+      "tokens/total": 67051520,
+      "tokens/train_per_sec_per_gpu": 3488.79,
+      "tokens/trainable": 7141439
+    },
+    {
+      "epoch": 1.6337579617834395,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 4.260875453216985e-05,
+      "loss": 0.011133270338177681,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0112,
+      "step": 513,
+      "tokens/total": 67182592,
+      "tokens/train_per_sec_per_gpu": 3127.41,
+      "tokens/trainable": 7154645
+    },
+    {
+      "epoch": 1.6369426751592355,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 4.256925477973105e-05,
+      "loss": 0.00897931307554245,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00902,
+      "step": 514,
+      "tokens/total": 67313664,
+      "tokens/train_per_sec_per_gpu": 3531.85,
+      "tokens/trainable": 7169429
+    },
+    {
+      "epoch": 1.6401273885350318,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.2529668177540064e-05,
+      "loss": 0.007193025201559067,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00722,
+      "step": 515,
+      "tokens/total": 67444736,
+      "tokens/train_per_sec_per_gpu": 3300.04,
+      "tokens/trainable": 7183294
+    },
+    {
+      "epoch": 1.643312101910828,
+      "grad_norm": 0.1953125,
+      "learning_rate": 4.248999492128456e-05,
+      "loss": 0.008410904556512833,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00845,
+      "step": 516,
+      "tokens/total": 67575808,
+      "tokens/train_per_sec_per_gpu": 3053.28,
+      "tokens/trainable": 7196109
+    },
+    {
+      "epoch": 1.6464968152866242,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.2450235207080594e-05,
+      "loss": 0.007929853163659573,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00796,
+      "step": 517,
+      "tokens/total": 67706880,
+      "tokens/train_per_sec_per_gpu": 3368.48,
+      "tokens/trainable": 7210198
+    },
+    {
+      "epoch": 1.6496815286624202,
+      "grad_norm": 0.166015625,
+      "learning_rate": 4.241038923147154e-05,
+      "loss": 0.011742248199880123,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01181,
+      "step": 518,
+      "tokens/total": 67837952,
+      "tokens/train_per_sec_per_gpu": 3747.84,
+      "tokens/trainable": 7225870
+    },
+    {
+      "epoch": 1.6528662420382165,
+      "grad_norm": 0.150390625,
+      "learning_rate": 4.237045719142726e-05,
+      "loss": 0.007296052295714617,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00732,
+      "step": 519,
+      "tokens/total": 67969024,
+      "tokens/train_per_sec_per_gpu": 3095.92,
+      "tokens/trainable": 7238841
+    },
+    {
+      "epoch": 1.6560509554140128,
+      "grad_norm": 0.15234375,
+      "learning_rate": 4.2330439284343015e-05,
+      "loss": 0.006907866336405277,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00693,
+      "step": 520,
+      "tokens/total": 68100096,
+      "tokens/train_per_sec_per_gpu": 3589.32,
+      "tokens/trainable": 7253801
+    },
+    {
+      "epoch": 1.6592356687898089,
+      "grad_norm": 0.15625,
+      "learning_rate": 4.229033570803853e-05,
+      "loss": 0.0074706668965518475,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0075,
+      "step": 521,
+      "tokens/total": 68231168,
+      "tokens/train_per_sec_per_gpu": 3802.59,
+      "tokens/trainable": 7269629
+    },
+    {
+      "epoch": 1.662420382165605,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 4.2250146660757036e-05,
+      "loss": 0.009104968048632145,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00915,
+      "step": 522,
+      "tokens/total": 68362240,
+      "tokens/train_per_sec_per_gpu": 3755.79,
+      "tokens/trainable": 7285363
+    },
+    {
+      "epoch": 1.6656050955414012,
+      "grad_norm": 0.1484375,
+      "learning_rate": 4.220987234116426e-05,
+      "loss": 0.005891850218176842,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00591,
+      "step": 523,
+      "tokens/total": 68493312,
+      "tokens/train_per_sec_per_gpu": 3446.53,
+      "tokens/trainable": 7299790
+    },
+    {
+      "epoch": 1.6687898089171975,
+      "grad_norm": 0.162109375,
+      "learning_rate": 4.216951294834744e-05,
+      "loss": 0.006473960820585489,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00649,
+      "step": 524,
+      "tokens/total": 68624384,
+      "tokens/train_per_sec_per_gpu": 3751.7,
+      "tokens/trainable": 7315516
+    },
+    {
+      "epoch": 1.6719745222929936,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 4.2129068681814396e-05,
+      "loss": 0.0052047837525606155,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00522,
+      "step": 525,
+      "tokens/total": 68755456,
+      "tokens/train_per_sec_per_gpu": 3241.92,
+      "tokens/trainable": 7329146
+    },
+    {
+      "epoch": 1.6751592356687897,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 4.208853974149246e-05,
+      "loss": 0.01116788387298584,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01123,
+      "step": 526,
+      "tokens/total": 68886528,
+      "tokens/train_per_sec_per_gpu": 3005.85,
+      "tokens/trainable": 7341894
+    },
+    {
+      "epoch": 1.678343949044586,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 4.204792632772754e-05,
+      "loss": 0.01081200409680605,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01087,
+      "step": 527,
+      "tokens/total": 69017600,
+      "tokens/train_per_sec_per_gpu": 3072.53,
+      "tokens/trainable": 7354819
+    },
+    {
+      "epoch": 1.6815286624203822,
+      "grad_norm": 0.181640625,
+      "learning_rate": 4.200722864128315e-05,
+      "loss": 0.007884484715759754,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00792,
+      "step": 528,
+      "tokens/total": 69148672,
+      "tokens/train_per_sec_per_gpu": 3481.52,
+      "tokens/trainable": 7369372
+    },
+    {
+      "epoch": 1.6847133757961783,
+      "grad_norm": 0.146484375,
+      "learning_rate": 4.196644688333935e-05,
+      "loss": 0.006211051717400551,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00623,
+      "step": 529,
+      "tokens/total": 69279744,
+      "tokens/train_per_sec_per_gpu": 3436.61,
+      "tokens/trainable": 7383760
+    },
+    {
+      "epoch": 1.6878980891719744,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 4.19255812554918e-05,
+      "loss": 0.007918811403214931,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00795,
+      "step": 530,
+      "tokens/total": 69410816,
+      "tokens/train_per_sec_per_gpu": 3498.93,
+      "tokens/trainable": 7398384
+    },
+    {
+      "epoch": 1.6910828025477707,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 4.1884631959750766e-05,
+      "loss": 0.007444203365594149,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00747,
+      "step": 531,
+      "tokens/total": 69541888,
+      "tokens/train_per_sec_per_gpu": 3052.82,
+      "tokens/trainable": 7411204
+    },
+    {
+      "epoch": 1.694267515923567,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 4.1843599198540095e-05,
+      "loss": 0.006427375599741936,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00645,
+      "step": 532,
+      "tokens/total": 69672960,
+      "tokens/train_per_sec_per_gpu": 2976.57,
+      "tokens/trainable": 7423690
+    },
+    {
+      "epoch": 1.697452229299363,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.1802483174696214e-05,
+      "loss": 0.007701891474425793,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00773,
+      "step": 533,
+      "tokens/total": 69804032,
+      "tokens/train_per_sec_per_gpu": 2933.51,
+      "tokens/trainable": 7436112
+    },
+    {
+      "epoch": 1.700636942675159,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.176128409146718e-05,
+      "loss": 0.006673748139292002,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0067,
+      "step": 534,
+      "tokens/total": 69935104,
+      "tokens/train_per_sec_per_gpu": 3182.67,
+      "tokens/trainable": 7449477
+    },
+    {
+      "epoch": 1.7038216560509554,
+      "grad_norm": 0.1328125,
+      "learning_rate": 4.172000215251161e-05,
+      "loss": 0.008220399729907513,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00825,
+      "step": 535,
+      "tokens/total": 70066176,
+      "tokens/train_per_sec_per_gpu": 3196.8,
+      "tokens/trainable": 7462890
+    },
+    {
+      "epoch": 1.7070063694267517,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 4.167863756189767e-05,
+      "loss": 0.008523629046976566,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00856,
+      "step": 536,
+      "tokens/total": 70197248,
+      "tokens/train_per_sec_per_gpu": 3266.89,
+      "tokens/trainable": 7476579
+    },
+    {
+      "epoch": 1.7101910828025477,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 4.163719052410217e-05,
+      "loss": 0.008510093204677105,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00855,
+      "step": 537,
+      "tokens/total": 70328320,
+      "tokens/train_per_sec_per_gpu": 3648.67,
+      "tokens/trainable": 7491858
+    },
+    {
+      "epoch": 1.7133757961783438,
+      "grad_norm": 0.16796875,
+      "learning_rate": 4.159566124400942e-05,
+      "loss": 0.00962991826236248,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00968,
+      "step": 538,
+      "tokens/total": 70459392,
+      "tokens/train_per_sec_per_gpu": 3612.85,
+      "tokens/trainable": 7507000
+    },
+    {
+      "epoch": 1.71656050955414,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 4.1554049926910285e-05,
+      "loss": 0.006633860524743795,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00666,
+      "step": 539,
+      "tokens/total": 70590464,
+      "tokens/train_per_sec_per_gpu": 3414.03,
+      "tokens/trainable": 7521257
+    },
+    {
+      "epoch": 1.7197452229299364,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 4.151235677850119e-05,
+      "loss": 0.007898521609604359,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00793,
+      "step": 540,
+      "tokens/total": 70721536,
+      "tokens/train_per_sec_per_gpu": 3441.64,
+      "tokens/trainable": 7535621
+    },
+    {
+      "epoch": 1.7229299363057324,
+      "grad_norm": 0.154296875,
+      "learning_rate": 4.147058200488305e-05,
+      "loss": 0.009673213586211205,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00972,
+      "step": 541,
+      "tokens/total": 70852608,
+      "tokens/train_per_sec_per_gpu": 3247.07,
+      "tokens/trainable": 7549162
+    },
+    {
+      "epoch": 1.7261146496815285,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 4.142872581256028e-05,
+      "loss": 0.007840042002499104,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00787,
+      "step": 542,
+      "tokens/total": 70983680,
+      "tokens/train_per_sec_per_gpu": 3313.54,
+      "tokens/trainable": 7563047
+    },
+    {
+      "epoch": 1.7292993630573248,
+      "grad_norm": 0.158203125,
+      "learning_rate": 4.1386788408439784e-05,
+      "loss": 0.005681775975972414,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0057,
+      "step": 543,
+      "tokens/total": 71114752,
+      "tokens/train_per_sec_per_gpu": 3239.22,
+      "tokens/trainable": 7576603
+    },
+    {
+      "epoch": 1.732484076433121,
+      "grad_norm": 0.140625,
+      "learning_rate": 4.134476999982989e-05,
+      "loss": 0.005047548562288284,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00506,
+      "step": 544,
+      "tokens/total": 71245824,
+      "tokens/train_per_sec_per_gpu": 3265.49,
+      "tokens/trainable": 7590285
+    },
+    {
+      "epoch": 1.7356687898089171,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 4.130267079443938e-05,
+      "loss": 0.0074127367697656155,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00744,
+      "step": 545,
+      "tokens/total": 71376896,
+      "tokens/train_per_sec_per_gpu": 3477.95,
+      "tokens/trainable": 7604842
+    },
+    {
+      "epoch": 1.7388535031847132,
+      "grad_norm": 0.197265625,
+      "learning_rate": 4.1260491000376446e-05,
+      "loss": 0.007608677726238966,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00764,
+      "step": 546,
+      "tokens/total": 71507968,
+      "tokens/train_per_sec_per_gpu": 3118.58,
+      "tokens/trainable": 7617958
+    },
+    {
+      "epoch": 1.7420382165605095,
+      "grad_norm": 0.21484375,
+      "learning_rate": 4.1218230826147615e-05,
+      "loss": 0.01108642015606165,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01115,
+      "step": 547,
+      "tokens/total": 71639040,
+      "tokens/train_per_sec_per_gpu": 3443.01,
+      "tokens/trainable": 7632387
+    },
+    {
+      "epoch": 1.7452229299363058,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 4.117589048065677e-05,
+      "loss": 0.006157029885798693,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00618,
+      "step": 548,
+      "tokens/total": 71770112,
+      "tokens/train_per_sec_per_gpu": 3439.27,
+      "tokens/trainable": 7646780
+    },
+    {
+      "epoch": 1.7484076433121019,
+      "grad_norm": 0.138671875,
+      "learning_rate": 4.113347017320414e-05,
+      "loss": 0.005342322401702404,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00536,
+      "step": 549,
+      "tokens/total": 71901184,
+      "tokens/train_per_sec_per_gpu": 3001.79,
+      "tokens/trainable": 7659368
+    },
+    {
+      "epoch": 1.7515923566878981,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 4.1090970113485184e-05,
+      "loss": 0.0040708379819989204,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00408,
+      "step": 550,
+      "tokens/total": 72032256,
+      "tokens/train_per_sec_per_gpu": 3546.66,
+      "tokens/trainable": 7674195
+    },
+    {
+      "epoch": 1.7547770700636942,
+      "grad_norm": 0.19921875,
+      "learning_rate": 4.1048390511589595e-05,
+      "loss": 0.01067125890403986,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01073,
+      "step": 551,
+      "tokens/total": 72163328,
+      "tokens/train_per_sec_per_gpu": 3415.06,
+      "tokens/trainable": 7688507
+    },
+    {
+      "epoch": 1.7579617834394905,
+      "grad_norm": 0.171875,
+      "learning_rate": 4.1005731578000305e-05,
+      "loss": 0.008569694124162197,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00861,
+      "step": 552,
+      "tokens/total": 72294400,
+      "tokens/train_per_sec_per_gpu": 3627.75,
+      "tokens/trainable": 7703620
+    },
+    {
+      "epoch": 1.7611464968152868,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 4.0962993523592374e-05,
+      "loss": 0.009042307734489441,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00908,
+      "step": 553,
+      "tokens/total": 72425472,
+      "tokens/train_per_sec_per_gpu": 3269.54,
+      "tokens/trainable": 7717318
+    },
+    {
+      "epoch": 1.7643312101910829,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 4.092017655963198e-05,
+      "loss": 0.007899527437984943,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00793,
+      "step": 554,
+      "tokens/total": 72556544,
+      "tokens/train_per_sec_per_gpu": 3212.13,
+      "tokens/trainable": 7730777
+    },
+    {
+      "epoch": 1.767515923566879,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.0877280897775406e-05,
+      "loss": 0.010296393185853958,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01035,
+      "step": 555,
+      "tokens/total": 72687616,
+      "tokens/train_per_sec_per_gpu": 3337.93,
+      "tokens/trainable": 7744761
+    },
+    {
+      "epoch": 1.7707006369426752,
+      "grad_norm": 0.146484375,
+      "learning_rate": 4.083430675006791e-05,
+      "loss": 0.009942286647856236,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00999,
+      "step": 556,
+      "tokens/total": 72818688,
+      "tokens/train_per_sec_per_gpu": 3400.69,
+      "tokens/trainable": 7759000
+    },
+    {
+      "epoch": 1.7738853503184715,
+      "grad_norm": 0.197265625,
+      "learning_rate": 4.0791254328942756e-05,
+      "loss": 0.00717775197699666,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0072,
+      "step": 557,
+      "tokens/total": 72949760,
+      "tokens/train_per_sec_per_gpu": 3122.89,
+      "tokens/trainable": 7772100
+    },
+    {
+      "epoch": 1.7770700636942676,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 4.074812384722014e-05,
+      "loss": 0.008067919872701168,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0081,
+      "step": 558,
+      "tokens/total": 73080832,
+      "tokens/train_per_sec_per_gpu": 3519.68,
+      "tokens/trainable": 7786740
+    },
+    {
+      "epoch": 1.7802547770700636,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.0704915518106125e-05,
+      "loss": 0.007346912752836943,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00737,
+      "step": 559,
+      "tokens/total": 73211904,
+      "tokens/train_per_sec_per_gpu": 3104.83,
+      "tokens/trainable": 7799749
+    },
+    {
+      "epoch": 1.78343949044586,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 4.066162955519159e-05,
+      "loss": 0.0073562380857765675,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00738,
+      "step": 560,
+      "tokens/total": 73342976,
+      "tokens/train_per_sec_per_gpu": 3360.32,
+      "tokens/trainable": 7813821
+    },
+    {
+      "epoch": 1.7866242038216562,
+      "grad_norm": 0.19921875,
+      "learning_rate": 4.061826617245119e-05,
+      "loss": 0.007865460589528084,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0079,
+      "step": 561,
+      "tokens/total": 73474048,
+      "tokens/train_per_sec_per_gpu": 2859.48,
+      "tokens/trainable": 7825883
+    },
+    {
+      "epoch": 1.7898089171974523,
+      "grad_norm": 0.189453125,
+      "learning_rate": 4.0574825584242275e-05,
+      "loss": 0.008709411136806011,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00875,
+      "step": 562,
+      "tokens/total": 73605120,
+      "tokens/train_per_sec_per_gpu": 3470.03,
+      "tokens/trainable": 7840437
+    },
+    {
+      "epoch": 1.7929936305732483,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 4.053130800530386e-05,
+      "loss": 0.010312874801456928,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01037,
+      "step": 563,
+      "tokens/total": 73736192,
+      "tokens/train_per_sec_per_gpu": 3416.83,
+      "tokens/trainable": 7854748
+    },
+    {
+      "epoch": 1.7961783439490446,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.048771365075554e-05,
+      "loss": 0.006712635047733784,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00674,
+      "step": 564,
+      "tokens/total": 73867264,
+      "tokens/train_per_sec_per_gpu": 3008.08,
+      "tokens/trainable": 7867356
+    },
+    {
+      "epoch": 1.799363057324841,
+      "grad_norm": 0.216796875,
+      "learning_rate": 4.0444042736096435e-05,
+      "loss": 0.012959079816937447,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01304,
+      "step": 565,
+      "tokens/total": 73998336,
+      "tokens/train_per_sec_per_gpu": 3455.78,
+      "tokens/trainable": 7881813
+    },
+    {
+      "epoch": 1.802547770700637,
+      "grad_norm": 0.142578125,
+      "learning_rate": 4.0400295477204105e-05,
+      "loss": 0.006475909147411585,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0065,
+      "step": 566,
+      "tokens/total": 74129408,
+      "tokens/train_per_sec_per_gpu": 3215.58,
+      "tokens/trainable": 7895287
+    },
+    {
+      "epoch": 1.805732484076433,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 4.035647209033353e-05,
+      "loss": 0.009855620563030243,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0099,
+      "step": 567,
+      "tokens/total": 74260480,
+      "tokens/train_per_sec_per_gpu": 3762.07,
+      "tokens/trainable": 7910983
+    },
+    {
+      "epoch": 1.8089171974522293,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.031257279211599e-05,
+      "loss": 0.007472330704331398,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0075,
+      "step": 568,
+      "tokens/total": 74391552,
+      "tokens/train_per_sec_per_gpu": 3163.77,
+      "tokens/trainable": 7924299
+    },
+    {
+      "epoch": 1.8121019108280256,
+      "grad_norm": 0.16796875,
+      "learning_rate": 4.026859779955802e-05,
+      "loss": 0.008227458223700523,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00826,
+      "step": 569,
+      "tokens/total": 74522624,
+      "tokens/train_per_sec_per_gpu": 3363.95,
+      "tokens/trainable": 7938293
+    },
+    {
+      "epoch": 1.8152866242038217,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 4.022454733004035e-05,
+      "loss": 0.0075818696059286594,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00761,
+      "step": 570,
+      "tokens/total": 74653696,
+      "tokens/train_per_sec_per_gpu": 2956.76,
+      "tokens/trainable": 7950764
+    },
+    {
+      "epoch": 1.8184713375796178,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 4.01804216013168e-05,
+      "loss": 0.009609042666852474,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00966,
+      "step": 571,
+      "tokens/total": 74784768,
+      "tokens/train_per_sec_per_gpu": 3564.99,
+      "tokens/trainable": 7965654
+    },
+    {
+      "epoch": 1.821656050955414,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 4.013622083151321e-05,
+      "loss": 0.011200753971934319,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01126,
+      "step": 572,
+      "tokens/total": 74915840,
+      "tokens/train_per_sec_per_gpu": 3175.53,
+      "tokens/trainable": 7978976
+    },
+    {
+      "epoch": 1.8248407643312103,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 4.009194523912638e-05,
+      "loss": 0.011081540025770664,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01114,
+      "step": 573,
+      "tokens/total": 75046912,
+      "tokens/train_per_sec_per_gpu": 3512.32,
+      "tokens/trainable": 7993605
+    },
+    {
+      "epoch": 1.8280254777070064,
+      "grad_norm": 0.150390625,
+      "learning_rate": 4.004759504302297e-05,
+      "loss": 0.007977863773703575,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00801,
+      "step": 574,
+      "tokens/total": 75177984,
+      "tokens/train_per_sec_per_gpu": 3555.25,
+      "tokens/trainable": 8008422
+    },
+    {
+      "epoch": 1.8312101910828025,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 4.000317046243845e-05,
+      "loss": 0.005992071703076363,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00601,
+      "step": 575,
+      "tokens/total": 75309056,
+      "tokens/train_per_sec_per_gpu": 2628.73,
+      "tokens/trainable": 8019575
+    },
+    {
+      "epoch": 1.8343949044585988,
+      "grad_norm": 0.134765625,
+      "learning_rate": 3.9958671716975966e-05,
+      "loss": 0.005763609427958727,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00578,
+      "step": 576,
+      "tokens/total": 75440128,
+      "tokens/train_per_sec_per_gpu": 3099.03,
+      "tokens/trainable": 8032607
+    },
+    {
+      "epoch": 1.837579617834395,
+      "grad_norm": 0.18359375,
+      "learning_rate": 3.9914099026605286e-05,
+      "loss": 0.00910909567028284,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00915,
+      "step": 577,
+      "tokens/total": 75571200,
+      "tokens/train_per_sec_per_gpu": 3311.4,
+      "tokens/trainable": 8046493
+    },
+    {
+      "epoch": 1.8407643312101911,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 3.986945261166174e-05,
+      "loss": 0.0058432393707334995,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00586,
+      "step": 578,
+      "tokens/total": 75702272,
+      "tokens/train_per_sec_per_gpu": 2863.58,
+      "tokens/trainable": 8058562
+    },
+    {
+      "epoch": 1.8439490445859872,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 3.9824732692845045e-05,
+      "loss": 0.006885102018713951,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00691,
+      "step": 579,
+      "tokens/total": 75833344,
+      "tokens/train_per_sec_per_gpu": 3605.27,
+      "tokens/trainable": 8073625
+    },
+    {
+      "epoch": 1.8471337579617835,
+      "grad_norm": 0.150390625,
+      "learning_rate": 3.977993949121831e-05,
+      "loss": 0.007448772434145212,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00748,
+      "step": 580,
+      "tokens/total": 75964416,
+      "tokens/train_per_sec_per_gpu": 3178.81,
+      "tokens/trainable": 8086952
+    },
+    {
+      "epoch": 1.8503184713375798,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 3.9735073228206896e-05,
+      "loss": 0.01037865225225687,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01043,
+      "step": 581,
+      "tokens/total": 76095488,
+      "tokens/train_per_sec_per_gpu": 3771.77,
+      "tokens/trainable": 8102605
+    },
+    {
+      "epoch": 1.8535031847133758,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 3.9690134125597315e-05,
+      "loss": 0.005139034241437912,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00515,
+      "step": 582,
+      "tokens/total": 76226560,
+      "tokens/train_per_sec_per_gpu": 3043.71,
+      "tokens/trainable": 8115368
+    },
+    {
+      "epoch": 1.856687898089172,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 3.9645122405536144e-05,
+      "loss": 0.006013063248246908,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00603,
+      "step": 583,
+      "tokens/total": 76357632,
+      "tokens/train_per_sec_per_gpu": 3251.38,
+      "tokens/trainable": 8129041
+    },
+    {
+      "epoch": 1.8598726114649682,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 3.9600038290528944e-05,
+      "loss": 0.00799723993986845,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00803,
+      "step": 584,
+      "tokens/total": 76488704,
+      "tokens/train_per_sec_per_gpu": 3320.59,
+      "tokens/trainable": 8142910
+    },
+    {
+      "epoch": 1.8630573248407645,
+      "grad_norm": 0.142578125,
+      "learning_rate": 3.955488200343913e-05,
+      "loss": 0.00701179401949048,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00704,
+      "step": 585,
+      "tokens/total": 76619776,
+      "tokens/train_per_sec_per_gpu": 3561.59,
+      "tokens/trainable": 8157807
+    },
+    {
+      "epoch": 1.8662420382165605,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 3.950965376748689e-05,
+      "loss": 0.00536251999437809,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00538,
+      "step": 586,
+      "tokens/total": 76750848,
+      "tokens/train_per_sec_per_gpu": 3253.01,
+      "tokens/trainable": 8171483
+    },
+    {
+      "epoch": 1.8694267515923566,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 3.946435380624808e-05,
+      "loss": 0.005477463360875845,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00549,
+      "step": 587,
+      "tokens/total": 76881920,
+      "tokens/train_per_sec_per_gpu": 2986.43,
+      "tokens/trainable": 8184075
+    },
+    {
+      "epoch": 1.872611464968153,
+      "grad_norm": 0.154296875,
+      "learning_rate": 3.94189823436531e-05,
+      "loss": 0.007740751840174198,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00777,
+      "step": 588,
+      "tokens/total": 77012992,
+      "tokens/train_per_sec_per_gpu": 3302.5,
+      "tokens/trainable": 8197922
+    },
+    {
+      "epoch": 1.8757961783439492,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 3.937353960398581e-05,
+      "loss": 0.007541216444224119,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00757,
+      "step": 589,
+      "tokens/total": 77144064,
+      "tokens/train_per_sec_per_gpu": 3178.73,
+      "tokens/trainable": 8211342
+    },
+    {
+      "epoch": 1.8789808917197452,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 3.932802581188243e-05,
+      "loss": 0.006363678723573685,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00638,
+      "step": 590,
+      "tokens/total": 77275136,
+      "tokens/train_per_sec_per_gpu": 3260.49,
+      "tokens/trainable": 8225038
+    },
+    {
+      "epoch": 1.8821656050955413,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 3.928244119233038e-05,
+      "loss": 0.010623229667544365,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01068,
+      "step": 591,
+      "tokens/total": 77406208,
+      "tokens/train_per_sec_per_gpu": 3108.47,
+      "tokens/trainable": 8238178
+    },
+    {
+      "epoch": 1.8853503184713376,
+      "grad_norm": 0.205078125,
+      "learning_rate": 3.9236785970667214e-05,
+      "loss": 0.008010565303266048,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00804,
+      "step": 592,
+      "tokens/total": 77537280,
+      "tokens/train_per_sec_per_gpu": 3537.95,
+      "tokens/trainable": 8252970
+    },
+    {
+      "epoch": 1.888535031847134,
+      "grad_norm": 0.130859375,
+      "learning_rate": 3.91910603725795e-05,
+      "loss": 0.006147422362118959,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00617,
+      "step": 593,
+      "tokens/total": 77668352,
+      "tokens/train_per_sec_per_gpu": 3429.45,
+      "tokens/trainable": 8267246
+    },
+    {
+      "epoch": 1.89171974522293,
+      "grad_norm": 0.162109375,
+      "learning_rate": 3.9145264624101676e-05,
+      "loss": 0.007066651247441769,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00709,
+      "step": 594,
+      "tokens/total": 77799424,
+      "tokens/train_per_sec_per_gpu": 3344.2,
+      "tokens/trainable": 8281242
+    },
+    {
+      "epoch": 1.894904458598726,
+      "grad_norm": 0.169921875,
+      "learning_rate": 3.909939895161498e-05,
+      "loss": 0.007343544624745846,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00737,
+      "step": 595,
+      "tokens/total": 77930496,
+      "tokens/train_per_sec_per_gpu": 3615.15,
+      "tokens/trainable": 8296272
+    },
+    {
+      "epoch": 1.8980891719745223,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 3.905346358184629e-05,
+      "loss": 0.006192365661263466,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00621,
+      "step": 596,
+      "tokens/total": 78061568,
+      "tokens/train_per_sec_per_gpu": 3043.7,
+      "tokens/trainable": 8309000
+    },
+    {
+      "epoch": 1.9012738853503186,
+      "grad_norm": 0.201171875,
+      "learning_rate": 3.900745874186701e-05,
+      "loss": 0.008313626050949097,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00835,
+      "step": 597,
+      "tokens/total": 78192640,
+      "tokens/train_per_sec_per_gpu": 3567.72,
+      "tokens/trainable": 8323863
+    },
+    {
+      "epoch": 1.9044585987261147,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 3.896138465909196e-05,
+      "loss": 0.006214370485395193,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00623,
+      "step": 598,
+      "tokens/total": 78323712,
+      "tokens/train_per_sec_per_gpu": 3365.54,
+      "tokens/trainable": 8337954
+    },
+    {
+      "epoch": 1.9076433121019107,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 3.8915241561278266e-05,
+      "loss": 0.007558876648545265,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00759,
+      "step": 599,
+      "tokens/total": 78454784,
+      "tokens/train_per_sec_per_gpu": 3425.9,
+      "tokens/trainable": 8352302
+    },
+    {
+      "epoch": 1.910828025477707,
+      "grad_norm": 0.166015625,
+      "learning_rate": 3.8869029676524174e-05,
+      "loss": 0.005686955992132425,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0057,
+      "step": 600,
+      "tokens/total": 78585856,
+      "tokens/train_per_sec_per_gpu": 3357.73,
+      "tokens/trainable": 8366310
+    },
+    {
+      "epoch": 1.9140127388535033,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 3.8822749233268006e-05,
+      "loss": 0.0077353366650640965,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00777,
+      "step": 601,
+      "tokens/total": 78716928,
+      "tokens/train_per_sec_per_gpu": 3165.47,
+      "tokens/trainable": 8379573
+    },
+    {
+      "epoch": 1.9171974522292994,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 3.877640046028696e-05,
+      "loss": 0.0062081338837742805,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00623,
+      "step": 602,
+      "tokens/total": 78848000,
+      "tokens/train_per_sec_per_gpu": 3562.58,
+      "tokens/trainable": 8394385
+    },
+    {
+      "epoch": 1.9203821656050954,
+      "grad_norm": 0.154296875,
+      "learning_rate": 3.872998358669601e-05,
+      "loss": 0.006809039041399956,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00683,
+      "step": 603,
+      "tokens/total": 78979072,
+      "tokens/train_per_sec_per_gpu": 3598.12,
+      "tokens/trainable": 8409414
+    },
+    {
+      "epoch": 1.9235668789808917,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 3.868349884194678e-05,
+      "loss": 0.004747939296066761,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00476,
+      "step": 604,
+      "tokens/total": 79110144,
+      "tokens/train_per_sec_per_gpu": 3006.34,
+      "tokens/trainable": 8422068
+    },
+    {
+      "epoch": 1.926751592356688,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 3.863694645582642e-05,
+      "loss": 0.007029777858406305,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00705,
+      "step": 605,
+      "tokens/total": 79241216,
+      "tokens/train_per_sec_per_gpu": 3244.04,
+      "tokens/trainable": 8435649
+    },
+    {
+      "epoch": 1.929936305732484,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 3.8590326658456376e-05,
+      "loss": 0.006050920579582453,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00607,
+      "step": 606,
+      "tokens/total": 79372288,
+      "tokens/train_per_sec_per_gpu": 3305.31,
+      "tokens/trainable": 8449489
+    },
+    {
+      "epoch": 1.9331210191082802,
+      "grad_norm": 0.17578125,
+      "learning_rate": 3.854363968029142e-05,
+      "loss": 0.0075315129943192005,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00756,
+      "step": 607,
+      "tokens/total": 79503360,
+      "tokens/train_per_sec_per_gpu": 3186.77,
+      "tokens/trainable": 8462838
+    },
+    {
+      "epoch": 1.9363057324840764,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.849688575211836e-05,
+      "loss": 0.006646899972110987,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00667,
+      "step": 608,
+      "tokens/total": 79634432,
+      "tokens/train_per_sec_per_gpu": 3410.5,
+      "tokens/trainable": 8477093
+    },
+    {
+      "epoch": 1.9394904458598727,
+      "grad_norm": 0.189453125,
+      "learning_rate": 3.8450065105054966e-05,
+      "loss": 0.00727155851200223,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0073,
+      "step": 609,
+      "tokens/total": 79765504,
+      "tokens/train_per_sec_per_gpu": 3308.9,
+      "tokens/trainable": 8490972
+    },
+    {
+      "epoch": 1.9426751592356688,
+      "grad_norm": 0.1875,
+      "learning_rate": 3.840317797054882e-05,
+      "loss": 0.009210377931594849,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00925,
+      "step": 610,
+      "tokens/total": 79896576,
+      "tokens/train_per_sec_per_gpu": 3480.61,
+      "tokens/trainable": 8505449
+    },
+    {
+      "epoch": 1.9458598726114649,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 3.83562245803762e-05,
+      "loss": 0.008728603832423687,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00877,
+      "step": 611,
+      "tokens/total": 80027648,
+      "tokens/train_per_sec_per_gpu": 3432.92,
+      "tokens/trainable": 8519873
+    },
+    {
+      "epoch": 1.9490445859872612,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 3.830920516664085e-05,
+      "loss": 0.00592332798987627,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00594,
+      "step": 612,
+      "tokens/total": 80158720,
+      "tokens/train_per_sec_per_gpu": 2869.74,
+      "tokens/trainable": 8531911
+    },
+    {
+      "epoch": 1.9522292993630574,
+      "grad_norm": 0.162109375,
+      "learning_rate": 3.826211996177291e-05,
+      "loss": 0.00876440480351448,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0088,
+      "step": 613,
+      "tokens/total": 80289792,
+      "tokens/train_per_sec_per_gpu": 3232.65,
+      "tokens/trainable": 8545475
+    },
+    {
+      "epoch": 1.9554140127388535,
+      "grad_norm": 0.173828125,
+      "learning_rate": 3.8214969198527787e-05,
+      "loss": 0.010759076103568077,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.01082,
+      "step": 614,
+      "tokens/total": 80420864,
+      "tokens/train_per_sec_per_gpu": 3370.92,
+      "tokens/trainable": 8559559
+    },
+    {
+      "epoch": 1.9585987261146496,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.8167753109984886e-05,
+      "loss": 0.007340329699218273,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00737,
+      "step": 615,
+      "tokens/total": 80551936,
+      "tokens/train_per_sec_per_gpu": 3167.2,
+      "tokens/trainable": 8572860
+    },
+    {
+      "epoch": 1.9617834394904459,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 3.8120471929546576e-05,
+      "loss": 0.009786421433091164,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00983,
+      "step": 616,
+      "tokens/total": 80683008,
+      "tokens/train_per_sec_per_gpu": 3398.38,
+      "tokens/trainable": 8587062
+    },
+    {
+      "epoch": 1.9649681528662422,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 3.807312589093701e-05,
+      "loss": 0.007565875072032213,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00759,
+      "step": 617,
+      "tokens/total": 80814080,
+      "tokens/train_per_sec_per_gpu": 3589.95,
+      "tokens/trainable": 8602027
+    },
+    {
+      "epoch": 1.9681528662420382,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.802571522820091e-05,
+      "loss": 0.005681060254573822,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0057,
+      "step": 618,
+      "tokens/total": 80945152,
+      "tokens/train_per_sec_per_gpu": 3097.22,
+      "tokens/trainable": 8615057
+    },
+    {
+      "epoch": 1.9713375796178343,
+      "grad_norm": 0.140625,
+      "learning_rate": 3.7978240175702475e-05,
+      "loss": 0.007764302659779787,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00779,
+      "step": 619,
+      "tokens/total": 81076224,
+      "tokens/train_per_sec_per_gpu": 3585.75,
+      "tokens/trainable": 8630164
+    },
+    {
+      "epoch": 1.9745222929936306,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 3.7930700968124214e-05,
+      "loss": 0.007851460948586464,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00788,
+      "step": 620,
+      "tokens/total": 81207296,
+      "tokens/train_per_sec_per_gpu": 3407.67,
+      "tokens/trainable": 8644417
+    },
+    {
+      "epoch": 1.9777070063694269,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.788309784046574e-05,
+      "loss": 0.007941392250359058,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00797,
+      "step": 621,
+      "tokens/total": 81338368,
+      "tokens/train_per_sec_per_gpu": 3178.19,
+      "tokens/trainable": 8657791
+    },
+    {
+      "epoch": 1.980891719745223,
+      "grad_norm": 0.19921875,
+      "learning_rate": 3.7835431028042664e-05,
+      "loss": 0.008540787734091282,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00858,
+      "step": 622,
+      "tokens/total": 81469440,
+      "tokens/train_per_sec_per_gpu": 3471.65,
+      "tokens/trainable": 8672330
+    },
+    {
+      "epoch": 1.984076433121019,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 3.778770076648543e-05,
+      "loss": 0.008716538548469543,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00875,
+      "step": 623,
+      "tokens/total": 81600512,
+      "tokens/train_per_sec_per_gpu": 3219.98,
+      "tokens/trainable": 8685828
+    },
+    {
+      "epoch": 1.9872611464968153,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.773990729173807e-05,
+      "loss": 0.007769486866891384,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0078,
+      "step": 624,
+      "tokens/total": 81731584,
+      "tokens/train_per_sec_per_gpu": 3473.48,
+      "tokens/trainable": 8700372
+    },
+    {
+      "epoch": 1.9904458598726116,
+      "grad_norm": 0.1484375,
+      "learning_rate": 3.769205084005714e-05,
+      "loss": 0.008443665690720081,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00848,
+      "step": 625,
+      "tokens/total": 81862656,
+      "tokens/train_per_sec_per_gpu": 3400.54,
+      "tokens/trainable": 8714943
+    },
+    {
+      "epoch": 1.9936305732484076,
+      "grad_norm": 0.201171875,
+      "learning_rate": 3.7644131648010494e-05,
+      "loss": 0.009850014001131058,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0099,
+      "step": 626,
+      "tokens/total": 81993728,
+      "tokens/train_per_sec_per_gpu": 3193.43,
+      "tokens/trainable": 8728328
+    },
+    {
+      "epoch": 1.9968152866242037,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 3.759614995247615e-05,
+      "loss": 0.0070216236636042595,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00705,
+      "step": 627,
+      "tokens/total": 82124800,
+      "tokens/train_per_sec_per_gpu": 3280.91,
+      "tokens/trainable": 8742135
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 3.7548105990641055e-05,
+      "loss": 0.008461863733828068,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 39.25,
+      "memory/max_allocated (GiB)": 39.25,
+      "ppl": 1.0085,
+      "step": 628,
+      "tokens/total": 82198528,
+      "tokens/train_per_sec_per_gpu": 3127.15,
+      "tokens/trainable": 8749352
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.00880392361432314,
+      "eval_ppl": 1.00884,
+      "eval_runtime": 41.5789,
+      "eval_samples_per_second": 64.961,
+      "eval_steps_per_second": 4.065,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 628
+    },
+    {
+      "epoch": 2.0031847133757963,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 0.00489779282361269,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00491,
+      "step": 629,
+      "tokens/total": 82329600,
+      "tokens/train_per_sec_per_gpu": 3227.88,
+      "tokens/trainable": 8762717
+    },
+    {
+      "epoch": 2.0063694267515926,
+      "grad_norm": 0.140625,
+      "learning_rate": 3.745183221835439e-05,
+      "loss": 0.0062369704246521,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00626,
+      "step": 630,
+      "tokens/total": 82460672,
+      "tokens/train_per_sec_per_gpu": 3063.9,
+      "tokens/trainable": 8775458
+    },
+    {
+      "epoch": 2.0095541401273884,
+      "grad_norm": 0.1015625,
+      "learning_rate": 3.740360288381105e-05,
+      "loss": 0.004345161374658346,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00435,
+      "step": 631,
+      "tokens/total": 82591744,
+      "tokens/train_per_sec_per_gpu": 3775.58,
+      "tokens/trainable": 8791109
+    },
+    {
+      "epoch": 2.0127388535031847,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 3.735531223478113e-05,
+      "loss": 0.003698494518175721,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00371,
+      "step": 632,
+      "tokens/total": 82722816,
+      "tokens/train_per_sec_per_gpu": 3064.96,
+      "tokens/trainable": 8803927
+    },
+    {
+      "epoch": 2.015923566878981,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 3.730696050997883e-05,
+      "loss": 0.006370588671416044,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00639,
+      "step": 633,
+      "tokens/total": 82853888,
+      "tokens/train_per_sec_per_gpu": 3757.38,
+      "tokens/trainable": 8819539
+    },
+    {
+      "epoch": 2.0191082802547773,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 3.725854794842028e-05,
+      "loss": 0.004867184441536665,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00488,
+      "step": 634,
+      "tokens/total": 82984960,
+      "tokens/train_per_sec_per_gpu": 3367.17,
+      "tokens/trainable": 8833596
+    },
+    {
+      "epoch": 2.022292993630573,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 3.721007478942236e-05,
+      "loss": 0.005630412604659796,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00565,
+      "step": 635,
+      "tokens/total": 83116032,
+      "tokens/train_per_sec_per_gpu": 3394.97,
+      "tokens/trainable": 8847823
+    },
+    {
+      "epoch": 2.0254777070063694,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.716154127260147e-05,
+      "loss": 0.0077174571342766285,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00775,
+      "step": 636,
+      "tokens/total": 83247104,
+      "tokens/train_per_sec_per_gpu": 3334.91,
+      "tokens/trainable": 8861774
+    },
+    {
+      "epoch": 2.0286624203821657,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 3.7112947637872395e-05,
+      "loss": 0.0045103938318789005,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00452,
+      "step": 637,
+      "tokens/total": 83378176,
+      "tokens/train_per_sec_per_gpu": 3076.34,
+      "tokens/trainable": 8874743
+    },
+    {
+      "epoch": 2.031847133757962,
+      "grad_norm": 0.126953125,
+      "learning_rate": 3.706429412544711e-05,
+      "loss": 0.005497838370501995,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00551,
+      "step": 638,
+      "tokens/total": 83509248,
+      "tokens/train_per_sec_per_gpu": 3103.99,
+      "tokens/trainable": 8887731
+    },
+    {
+      "epoch": 2.035031847133758,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 3.701558097583355e-05,
+      "loss": 0.004869392607361078,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00488,
+      "step": 639,
+      "tokens/total": 83640320,
+      "tokens/train_per_sec_per_gpu": 3107.44,
+      "tokens/trainable": 8900746
+    },
+    {
+      "epoch": 2.038216560509554,
+      "grad_norm": 0.13671875,
+      "learning_rate": 3.696680842983447e-05,
+      "loss": 0.006643592845648527,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00667,
+      "step": 640,
+      "tokens/total": 83771392,
+      "tokens/train_per_sec_per_gpu": 3319.13,
+      "tokens/trainable": 8914631
+    },
+    {
+      "epoch": 2.0414012738853504,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 3.691797672854625e-05,
+      "loss": 0.005362721625715494,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00538,
+      "step": 641,
+      "tokens/total": 83902464,
+      "tokens/train_per_sec_per_gpu": 3013.92,
+      "tokens/trainable": 8927265
+    },
+    {
+      "epoch": 2.0445859872611467,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 3.686908611335768e-05,
+      "loss": 0.005462102126330137,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00548,
+      "step": 642,
+      "tokens/total": 84033536,
+      "tokens/train_per_sec_per_gpu": 3114.98,
+      "tokens/trainable": 8940288
+    },
+    {
+      "epoch": 2.0477707006369426,
+      "grad_norm": 0.125,
+      "learning_rate": 3.682013682594876e-05,
+      "loss": 0.0043016825802624226,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00431,
+      "step": 643,
+      "tokens/total": 84164608,
+      "tokens/train_per_sec_per_gpu": 3431.73,
+      "tokens/trainable": 8954653
+    },
+    {
+      "epoch": 2.050955414012739,
+      "grad_norm": 0.19140625,
+      "learning_rate": 3.677112910828957e-05,
+      "loss": 0.0066072107292711735,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00663,
+      "step": 644,
+      "tokens/total": 84295680,
+      "tokens/train_per_sec_per_gpu": 2967.06,
+      "tokens/trainable": 8967100
+    },
+    {
+      "epoch": 2.054140127388535,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 3.672206320263897e-05,
+      "loss": 0.0054827104322612286,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0055,
+      "step": 645,
+      "tokens/total": 84426752,
+      "tokens/train_per_sec_per_gpu": 3194.77,
+      "tokens/trainable": 8980536
+    },
+    {
+      "epoch": 2.0573248407643314,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 3.66729393515435e-05,
+      "loss": 0.005452790763229132,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00547,
+      "step": 646,
+      "tokens/total": 84557824,
+      "tokens/train_per_sec_per_gpu": 3109.68,
+      "tokens/trainable": 8993576
+    },
+    {
+      "epoch": 2.0605095541401273,
+      "grad_norm": 0.181640625,
+      "learning_rate": 3.662375779783614e-05,
+      "loss": 0.0072727687656879425,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0073,
+      "step": 647,
+      "tokens/total": 84688896,
+      "tokens/train_per_sec_per_gpu": 3147.5,
+      "tokens/trainable": 9006855
+    },
+    {
+      "epoch": 2.0636942675159236,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 3.657451878463508e-05,
+      "loss": 0.003491069655865431,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0035,
+      "step": 648,
+      "tokens/total": 84819968,
+      "tokens/train_per_sec_per_gpu": 3224.99,
+      "tokens/trainable": 9020369
+    },
+    {
+      "epoch": 2.06687898089172,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 3.652522255534258e-05,
+      "loss": 0.005467304494231939,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00548,
+      "step": 649,
+      "tokens/total": 84951040,
+      "tokens/train_per_sec_per_gpu": 3640.83,
+      "tokens/trainable": 9035605
+    },
+    {
+      "epoch": 2.070063694267516,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 3.647586935364372e-05,
+      "loss": 0.004504749551415443,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00451,
+      "step": 650,
+      "tokens/total": 85082112,
+      "tokens/train_per_sec_per_gpu": 3394.33,
+      "tokens/trainable": 9049828
+    },
+    {
+      "epoch": 2.073248407643312,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 3.6426459423505214e-05,
+      "loss": 0.007018570322543383,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00704,
+      "step": 651,
+      "tokens/total": 85213184,
+      "tokens/train_per_sec_per_gpu": 2787.07,
+      "tokens/trainable": 9061509
+    },
+    {
+      "epoch": 2.0764331210191083,
+      "grad_norm": 0.12890625,
+      "learning_rate": 3.637699300917418e-05,
+      "loss": 0.005671660415828228,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00569,
+      "step": 652,
+      "tokens/total": 85344256,
+      "tokens/train_per_sec_per_gpu": 3667.49,
+      "tokens/trainable": 9076828
+    },
+    {
+      "epoch": 2.0796178343949046,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 3.632747035517701e-05,
+      "loss": 0.005398279055953026,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00541,
+      "step": 653,
+      "tokens/total": 85475328,
+      "tokens/train_per_sec_per_gpu": 3551.7,
+      "tokens/trainable": 9091646
+    },
+    {
+      "epoch": 2.082802547770701,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 3.6277891706318036e-05,
+      "loss": 0.007613079622387886,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00764,
+      "step": 654,
+      "tokens/total": 85606400,
+      "tokens/train_per_sec_per_gpu": 3571.21,
+      "tokens/trainable": 9106545
+    },
+    {
+      "epoch": 2.0859872611464967,
+      "grad_norm": 0.1640625,
+      "learning_rate": 3.622825730767842e-05,
+      "loss": 0.0069300332106649876,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00695,
+      "step": 655,
+      "tokens/total": 85737472,
+      "tokens/train_per_sec_per_gpu": 3786.4,
+      "tokens/trainable": 9122295
+    },
+    {
+      "epoch": 2.089171974522293,
+      "grad_norm": 0.19140625,
+      "learning_rate": 3.6178567404614936e-05,
+      "loss": 0.006750217638909817,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00677,
+      "step": 656,
+      "tokens/total": 85868544,
+      "tokens/train_per_sec_per_gpu": 3589.29,
+      "tokens/trainable": 9137329
+    },
+    {
+      "epoch": 2.0923566878980893,
+      "grad_norm": 0.162109375,
+      "learning_rate": 3.6128822242758686e-05,
+      "loss": 0.0060827480629086494,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0061,
+      "step": 657,
+      "tokens/total": 85999616,
+      "tokens/train_per_sec_per_gpu": 3096.84,
+      "tokens/trainable": 9150353
+    },
+    {
+      "epoch": 2.0955414012738856,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 3.6079022068013945e-05,
+      "loss": 0.006425363477319479,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00645,
+      "step": 658,
+      "tokens/total": 86130688,
+      "tokens/train_per_sec_per_gpu": 3687.66,
+      "tokens/trainable": 9165791
+    },
+    {
+      "epoch": 2.0987261146496814,
+      "grad_norm": 0.13671875,
+      "learning_rate": 3.602916712655697e-05,
+      "loss": 0.004524726886302233,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00453,
+      "step": 659,
+      "tokens/total": 86261760,
+      "tokens/train_per_sec_per_gpu": 3224.45,
+      "tokens/trainable": 9179333
+    },
+    {
+      "epoch": 2.1019108280254777,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 3.597925766483468e-05,
+      "loss": 0.008739529177546501,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00878,
+      "step": 660,
+      "tokens/total": 86392832,
+      "tokens/train_per_sec_per_gpu": 3380.82,
+      "tokens/trainable": 9193503
+    },
+    {
+      "epoch": 2.105095541401274,
+      "grad_norm": 0.125,
+      "learning_rate": 3.592929392956355e-05,
+      "loss": 0.003972796723246574,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00398,
+      "step": 661,
+      "tokens/total": 86523904,
+      "tokens/train_per_sec_per_gpu": 3337.39,
+      "tokens/trainable": 9207523
+    },
+    {
+      "epoch": 2.1082802547770703,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 3.587927616772834e-05,
+      "loss": 0.00485801137983799,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00487,
+      "step": 662,
+      "tokens/total": 86654976,
+      "tokens/train_per_sec_per_gpu": 3418.39,
+      "tokens/trainable": 9221801
+    },
+    {
+      "epoch": 2.111464968152866,
+      "grad_norm": 0.14453125,
+      "learning_rate": 3.5829204626580856e-05,
+      "loss": 0.005488412454724312,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0055,
+      "step": 663,
+      "tokens/total": 86786048,
+      "tokens/train_per_sec_per_gpu": 3308.78,
+      "tokens/trainable": 9235658
+    },
+    {
+      "epoch": 2.1146496815286624,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 3.577907955363877e-05,
+      "loss": 0.007495546247810125,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00752,
+      "step": 664,
+      "tokens/total": 86917120,
+      "tokens/train_per_sec_per_gpu": 3508.8,
+      "tokens/trainable": 9250377
+    },
+    {
+      "epoch": 2.1178343949044587,
+      "grad_norm": 0.185546875,
+      "learning_rate": 3.572890119668439e-05,
+      "loss": 0.007228251546621323,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00725,
+      "step": 665,
+      "tokens/total": 87048192,
+      "tokens/train_per_sec_per_gpu": 3480.87,
+      "tokens/trainable": 9264939
+    },
+    {
+      "epoch": 2.121019108280255,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 3.567866980376337e-05,
+      "loss": 0.005014233291149139,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00503,
+      "step": 666,
+      "tokens/total": 87179264,
+      "tokens/train_per_sec_per_gpu": 3039.74,
+      "tokens/trainable": 9277680
+    },
+    {
+      "epoch": 2.124203821656051,
+      "grad_norm": 0.16015625,
+      "learning_rate": 3.562838562318358e-05,
+      "loss": 0.004775107838213444,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00479,
+      "step": 667,
+      "tokens/total": 87310336,
+      "tokens/train_per_sec_per_gpu": 3171.11,
+      "tokens/trainable": 9291025
+    },
+    {
+      "epoch": 2.127388535031847,
+      "grad_norm": 0.146484375,
+      "learning_rate": 3.557804890351383e-05,
+      "loss": 0.006139194592833519,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00616,
+      "step": 668,
+      "tokens/total": 87441408,
+      "tokens/train_per_sec_per_gpu": 3193.18,
+      "tokens/trainable": 9304405
+    },
+    {
+      "epoch": 2.1305732484076434,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 3.5527659893582635e-05,
+      "loss": 0.004298456013202667,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00431,
+      "step": 669,
+      "tokens/total": 87572480,
+      "tokens/train_per_sec_per_gpu": 3227.18,
+      "tokens/trainable": 9317913
+    },
+    {
+      "epoch": 2.1337579617834397,
+      "grad_norm": 0.162109375,
+      "learning_rate": 3.547721884247699e-05,
+      "loss": 0.005037225782871246,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00505,
+      "step": 670,
+      "tokens/total": 87703552,
+      "tokens/train_per_sec_per_gpu": 3102.83,
+      "tokens/trainable": 9331033
+    },
+    {
+      "epoch": 2.1369426751592355,
+      "grad_norm": 0.1640625,
+      "learning_rate": 3.5426725999541174e-05,
+      "loss": 0.005763325374573469,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00578,
+      "step": 671,
+      "tokens/total": 87834624,
+      "tokens/train_per_sec_per_gpu": 3489.71,
+      "tokens/trainable": 9345625
+    },
+    {
+      "epoch": 2.140127388535032,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 3.5376181614375436e-05,
+      "loss": 0.005982933100312948,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.006,
+      "step": 672,
+      "tokens/total": 87965696,
+      "tokens/train_per_sec_per_gpu": 3141.99,
+      "tokens/trainable": 9358787
+    },
+    {
+      "epoch": 2.143312101910828,
+      "grad_norm": 0.140625,
+      "learning_rate": 3.532558593683486e-05,
+      "loss": 0.005526629742234945,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00554,
+      "step": 673,
+      "tokens/total": 88096768,
+      "tokens/train_per_sec_per_gpu": 3602.99,
+      "tokens/trainable": 9373882
+    },
+    {
+      "epoch": 2.1464968152866244,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 3.527493921702807e-05,
+      "loss": 0.0037272945046424866,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00373,
+      "step": 674,
+      "tokens/total": 88227840,
+      "tokens/train_per_sec_per_gpu": 3350.15,
+      "tokens/trainable": 9387904
+    },
+    {
+      "epoch": 2.1496815286624202,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.5224241705316e-05,
+      "loss": 0.006022762041538954,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00604,
+      "step": 675,
+      "tokens/total": 88358912,
+      "tokens/train_per_sec_per_gpu": 3348.71,
+      "tokens/trainable": 9401921
+    },
+    {
+      "epoch": 2.1528662420382165,
+      "grad_norm": 0.138671875,
+      "learning_rate": 3.517349365231065e-05,
+      "loss": 0.005744612775743008,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00576,
+      "step": 676,
+      "tokens/total": 88489984,
+      "tokens/train_per_sec_per_gpu": 3430.41,
+      "tokens/trainable": 9416291
+    },
+    {
+      "epoch": 2.156050955414013,
+      "grad_norm": 0.1484375,
+      "learning_rate": 3.5122695308873886e-05,
+      "loss": 0.005131675861775875,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00514,
+      "step": 677,
+      "tokens/total": 88621056,
+      "tokens/train_per_sec_per_gpu": 3279.37,
+      "tokens/trainable": 9430037
+    },
+    {
+      "epoch": 2.159235668789809,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 3.5071846926116156e-05,
+      "loss": 0.007699973881244659,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00773,
+      "step": 678,
+      "tokens/total": 88752128,
+      "tokens/train_per_sec_per_gpu": 3222.97,
+      "tokens/trainable": 9443541
+    },
+    {
+      "epoch": 2.162420382165605,
+      "grad_norm": 0.142578125,
+      "learning_rate": 3.502094875539528e-05,
+      "loss": 0.004470378626137972,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00448,
+      "step": 679,
+      "tokens/total": 88883200,
+      "tokens/train_per_sec_per_gpu": 3648.92,
+      "tokens/trainable": 9458725
+    },
+    {
+      "epoch": 2.1656050955414012,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 3.497000104831518e-05,
+      "loss": 0.00871230848133564,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00875,
+      "step": 680,
+      "tokens/total": 89014272,
+      "tokens/train_per_sec_per_gpu": 3137.55,
+      "tokens/trainable": 9471880
+    },
+    {
+      "epoch": 2.1687898089171975,
+      "grad_norm": 0.130859375,
+      "learning_rate": 3.491900405672466e-05,
+      "loss": 0.0037058612797409296,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00371,
+      "step": 681,
+      "tokens/total": 89145344,
+      "tokens/train_per_sec_per_gpu": 3191.78,
+      "tokens/trainable": 9485245
+    },
+    {
+      "epoch": 2.171974522292994,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 3.486795803271614e-05,
+      "loss": 0.004613788798451424,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00462,
+      "step": 682,
+      "tokens/total": 89276416,
+      "tokens/train_per_sec_per_gpu": 3499.78,
+      "tokens/trainable": 9499844
+    },
+    {
+      "epoch": 2.1751592356687897,
+      "grad_norm": 0.12109375,
+      "learning_rate": 3.481686322862443e-05,
+      "loss": 0.003956732805818319,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00396,
+      "step": 683,
+      "tokens/total": 89407488,
+      "tokens/train_per_sec_per_gpu": 3088.9,
+      "tokens/trainable": 9512840
+    },
+    {
+      "epoch": 2.178343949044586,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 3.476571989702548e-05,
+      "loss": 0.006073053926229477,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00609,
+      "step": 684,
+      "tokens/total": 89538560,
+      "tokens/train_per_sec_per_gpu": 3425.79,
+      "tokens/trainable": 9527160
+    },
+    {
+      "epoch": 2.1815286624203822,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 3.4714528290735105e-05,
+      "loss": 0.005430576391518116,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00545,
+      "step": 685,
+      "tokens/total": 89669632,
+      "tokens/train_per_sec_per_gpu": 3295.01,
+      "tokens/trainable": 9540964
+    },
+    {
+      "epoch": 2.1847133757961785,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 3.466328866280778e-05,
+      "loss": 0.003143883775919676,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00315,
+      "step": 686,
+      "tokens/total": 89800704,
+      "tokens/train_per_sec_per_gpu": 3469.02,
+      "tokens/trainable": 9555485
+    },
+    {
+      "epoch": 2.1878980891719744,
+      "grad_norm": 0.1328125,
+      "learning_rate": 3.4612001266535345e-05,
+      "loss": 0.005530213471502066,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00555,
+      "step": 687,
+      "tokens/total": 89931776,
+      "tokens/train_per_sec_per_gpu": 3563.1,
+      "tokens/trainable": 9570400
+    },
+    {
+      "epoch": 2.1910828025477707,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 3.456066635544577e-05,
+      "loss": 0.004905232228338718,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00492,
+      "step": 688,
+      "tokens/total": 90062848,
+      "tokens/train_per_sec_per_gpu": 3391.03,
+      "tokens/trainable": 9584608
+    },
+    {
+      "epoch": 2.194267515923567,
+      "grad_norm": 0.146484375,
+      "learning_rate": 3.450928418330193e-05,
+      "loss": 0.006313517689704895,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00633,
+      "step": 689,
+      "tokens/total": 90193920,
+      "tokens/train_per_sec_per_gpu": 3288.42,
+      "tokens/trainable": 9598448
+    },
+    {
+      "epoch": 2.1974522292993632,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 3.44578550041003e-05,
+      "loss": 0.0040069082751870155,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00401,
+      "step": 690,
+      "tokens/total": 90324992,
+      "tokens/train_per_sec_per_gpu": 3679.5,
+      "tokens/trainable": 9613777
+    },
+    {
+      "epoch": 2.200636942675159,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 3.440637907206973e-05,
+      "loss": 0.0068097589537501335,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00683,
+      "step": 691,
+      "tokens/total": 90456064,
+      "tokens/train_per_sec_per_gpu": 3546.91,
+      "tokens/trainable": 9628632
+    },
+    {
+      "epoch": 2.2038216560509554,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 3.435485664167019e-05,
+      "loss": 0.004060130566358566,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00407,
+      "step": 692,
+      "tokens/total": 90587136,
+      "tokens/train_per_sec_per_gpu": 3218.86,
+      "tokens/trainable": 9642131
+    },
+    {
+      "epoch": 2.2070063694267517,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 3.4303287967591484e-05,
+      "loss": 0.008195128291845322,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00823,
+      "step": 693,
+      "tokens/total": 90718208,
+      "tokens/train_per_sec_per_gpu": 3409.39,
+      "tokens/trainable": 9656400
+    },
+    {
+      "epoch": 2.210191082802548,
+      "grad_norm": 0.1796875,
+      "learning_rate": 3.425167330475205e-05,
+      "loss": 0.0061119189485907555,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00613,
+      "step": 694,
+      "tokens/total": 90849280,
+      "tokens/train_per_sec_per_gpu": 3495.48,
+      "tokens/trainable": 9670962
+    },
+    {
+      "epoch": 2.213375796178344,
+      "grad_norm": 0.13671875,
+      "learning_rate": 3.420001290829761e-05,
+      "loss": 0.004308244213461876,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00432,
+      "step": 695,
+      "tokens/total": 90980352,
+      "tokens/train_per_sec_per_gpu": 3283.37,
+      "tokens/trainable": 9684728
+    },
+    {
+      "epoch": 2.21656050955414,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 3.4148307033600014e-05,
+      "loss": 0.006343189161270857,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00636,
+      "step": 696,
+      "tokens/total": 91111424,
+      "tokens/train_per_sec_per_gpu": 3576.41,
+      "tokens/trainable": 9699704
+    },
+    {
+      "epoch": 2.2197452229299364,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 3.409655593625587e-05,
+      "loss": 0.006463784724473953,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00648,
+      "step": 697,
+      "tokens/total": 91242496,
+      "tokens/train_per_sec_per_gpu": 3253.16,
+      "tokens/trainable": 9713318
+    },
+    {
+      "epoch": 2.2229299363057327,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 3.404475987208539e-05,
+      "loss": 0.0030284496024250984,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00303,
+      "step": 698,
+      "tokens/total": 91373568,
+      "tokens/train_per_sec_per_gpu": 3353.96,
+      "tokens/trainable": 9727366
+    },
+    {
+      "epoch": 2.2261146496815285,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 3.399291909713101e-05,
+      "loss": 0.004884797614067793,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0049,
+      "step": 699,
+      "tokens/total": 91504640,
+      "tokens/train_per_sec_per_gpu": 3792.66,
+      "tokens/trainable": 9743134
+    },
+    {
+      "epoch": 2.229299363057325,
+      "grad_norm": 0.1875,
+      "learning_rate": 3.394103386765625e-05,
+      "loss": 0.005894185043871403,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00591,
+      "step": 700,
+      "tokens/total": 91635712,
+      "tokens/train_per_sec_per_gpu": 2956.26,
+      "tokens/trainable": 9755576
+    },
+    {
+      "epoch": 2.232484076433121,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 3.388910444014432e-05,
+      "loss": 0.0050967601127922535,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00511,
+      "step": 701,
+      "tokens/total": 91766784,
+      "tokens/train_per_sec_per_gpu": 3181.89,
+      "tokens/trainable": 9768924
+    },
+    {
+      "epoch": 2.2356687898089174,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 3.3837131071296945e-05,
+      "loss": 0.004923132713884115,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00494,
+      "step": 702,
+      "tokens/total": 91897856,
+      "tokens/train_per_sec_per_gpu": 3211.71,
+      "tokens/trainable": 9782384
+    },
+    {
+      "epoch": 2.238853503184713,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 3.378511401803307e-05,
+      "loss": 0.005397360771894455,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00541,
+      "step": 703,
+      "tokens/total": 92028928,
+      "tokens/train_per_sec_per_gpu": 3335.08,
+      "tokens/trainable": 9796325
+    },
+    {
+      "epoch": 2.2420382165605095,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 3.373305353748755e-05,
+      "loss": 0.004327027127146721,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00434,
+      "step": 704,
+      "tokens/total": 92160000,
+      "tokens/train_per_sec_per_gpu": 3315.25,
+      "tokens/trainable": 9810212
+    },
+    {
+      "epoch": 2.245222929936306,
+      "grad_norm": 0.15625,
+      "learning_rate": 3.368094988700996e-05,
+      "loss": 0.007469909265637398,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0075,
+      "step": 705,
+      "tokens/total": 92291072,
+      "tokens/train_per_sec_per_gpu": 3489.88,
+      "tokens/trainable": 9824826
+    },
+    {
+      "epoch": 2.248407643312102,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 3.3628803324163236e-05,
+      "loss": 0.005583882797509432,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0056,
+      "step": 706,
+      "tokens/total": 92422144,
+      "tokens/train_per_sec_per_gpu": 3391.08,
+      "tokens/trainable": 9839023
+    },
+    {
+      "epoch": 2.251592356687898,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 3.357661410672247e-05,
+      "loss": 0.004044718574732542,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00405,
+      "step": 707,
+      "tokens/total": 92553216,
+      "tokens/train_per_sec_per_gpu": 3547.4,
+      "tokens/trainable": 9853877
+    },
+    {
+      "epoch": 2.254777070063694,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 3.352438249267359e-05,
+      "loss": 0.005919166840612888,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00594,
+      "step": 708,
+      "tokens/total": 92684288,
+      "tokens/train_per_sec_per_gpu": 3416.57,
+      "tokens/trainable": 9868162
+    },
+    {
+      "epoch": 2.2579617834394905,
+      "grad_norm": 0.150390625,
+      "learning_rate": 3.347210874021211e-05,
+      "loss": 0.005268896464258432,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00528,
+      "step": 709,
+      "tokens/total": 92815360,
+      "tokens/train_per_sec_per_gpu": 3315.44,
+      "tokens/trainable": 9882010
+    },
+    {
+      "epoch": 2.261146496815287,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 3.3419793107741834e-05,
+      "loss": 0.0063535538502037525,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00637,
+      "step": 710,
+      "tokens/total": 92946432,
+      "tokens/train_per_sec_per_gpu": 3215.77,
+      "tokens/trainable": 9895483
+    },
+    {
+      "epoch": 2.2643312101910826,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 3.336743585387362e-05,
+      "loss": 0.0036360113881528378,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00364,
+      "step": 711,
+      "tokens/total": 93077504,
+      "tokens/train_per_sec_per_gpu": 3574.66,
+      "tokens/trainable": 9910386
+    },
+    {
+      "epoch": 2.267515923566879,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 3.3315037237424036e-05,
+      "loss": 0.0054854946210980415,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0055,
+      "step": 712,
+      "tokens/total": 93208576,
+      "tokens/train_per_sec_per_gpu": 3491.98,
+      "tokens/trainable": 9924935
+    },
+    {
+      "epoch": 2.270700636942675,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 3.326259751741414e-05,
+      "loss": 0.0039428528398275375,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00395,
+      "step": 713,
+      "tokens/total": 93339648,
+      "tokens/train_per_sec_per_gpu": 3219.92,
+      "tokens/trainable": 9938434
+    },
+    {
+      "epoch": 2.2738853503184715,
+      "grad_norm": 0.169921875,
+      "learning_rate": 3.321011695306818e-05,
+      "loss": 0.007426953874528408,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00745,
+      "step": 714,
+      "tokens/total": 93470720,
+      "tokens/train_per_sec_per_gpu": 3424.44,
+      "tokens/trainable": 9952758
+    },
+    {
+      "epoch": 2.2770700636942673,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 3.315759580381228e-05,
+      "loss": 0.006136072333902121,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00615,
+      "step": 715,
+      "tokens/total": 93601792,
+      "tokens/train_per_sec_per_gpu": 3058.02,
+      "tokens/trainable": 9965569
+    },
+    {
+      "epoch": 2.2802547770700636,
+      "grad_norm": 0.140625,
+      "learning_rate": 3.310503432927322e-05,
+      "loss": 0.004970425274223089,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00498,
+      "step": 716,
+      "tokens/total": 93732864,
+      "tokens/train_per_sec_per_gpu": 3320.74,
+      "tokens/trainable": 9979461
+    },
+    {
+      "epoch": 2.28343949044586,
+      "grad_norm": 0.201171875,
+      "learning_rate": 3.305243278927711e-05,
+      "loss": 0.006117875222116709,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00614,
+      "step": 717,
+      "tokens/total": 93863936,
+      "tokens/train_per_sec_per_gpu": 3419.81,
+      "tokens/trainable": 9993781
+    },
+    {
+      "epoch": 2.286624203821656,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 3.299979144384808e-05,
+      "loss": 0.005094599910080433,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00511,
+      "step": 718,
+      "tokens/total": 93995008,
+      "tokens/train_per_sec_per_gpu": 3621.22,
+      "tokens/trainable": 10008873
+    },
+    {
+      "epoch": 2.289808917197452,
+      "grad_norm": 0.150390625,
+      "learning_rate": 3.29471105532071e-05,
+      "loss": 0.005003094207495451,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00502,
+      "step": 719,
+      "tokens/total": 94126080,
+      "tokens/train_per_sec_per_gpu": 3296.22,
+      "tokens/trainable": 10022678
+    },
+    {
+      "epoch": 2.2929936305732483,
+      "grad_norm": 0.16015625,
+      "learning_rate": 3.2894390377770556e-05,
+      "loss": 0.005475780460983515,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00549,
+      "step": 720,
+      "tokens/total": 94257152,
+      "tokens/train_per_sec_per_gpu": 3130.91,
+      "tokens/trainable": 10035849
+    },
+    {
+      "epoch": 2.2961783439490446,
+      "grad_norm": 0.1796875,
+      "learning_rate": 3.284163117814906e-05,
+      "loss": 0.005412337835878134,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00543,
+      "step": 721,
+      "tokens/total": 94388224,
+      "tokens/train_per_sec_per_gpu": 3388.52,
+      "tokens/trainable": 10050035
+    },
+    {
+      "epoch": 2.299363057324841,
+      "grad_norm": 0.15625,
+      "learning_rate": 3.278883321514613e-05,
+      "loss": 0.005983334966003895,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.006,
+      "step": 722,
+      "tokens/total": 94519296,
+      "tokens/train_per_sec_per_gpu": 3388.95,
+      "tokens/trainable": 10064219
+    },
+    {
+      "epoch": 2.3025477707006368,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 3.27359967497569e-05,
+      "loss": 0.006622286047786474,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00664,
+      "step": 723,
+      "tokens/total": 94650368,
+      "tokens/train_per_sec_per_gpu": 3043.49,
+      "tokens/trainable": 10077049
+    },
+    {
+      "epoch": 2.305732484076433,
+      "grad_norm": 0.15234375,
+      "learning_rate": 3.268312204316684e-05,
+      "loss": 0.005963774397969246,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00598,
+      "step": 724,
+      "tokens/total": 94781440,
+      "tokens/train_per_sec_per_gpu": 3577.18,
+      "tokens/trainable": 10091953
+    },
+    {
+      "epoch": 2.3089171974522293,
+      "grad_norm": 0.15625,
+      "learning_rate": 3.263020935675043e-05,
+      "loss": 0.003999189008027315,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00401,
+      "step": 725,
+      "tokens/total": 94912512,
+      "tokens/train_per_sec_per_gpu": 3227.08,
+      "tokens/trainable": 10105451
+    },
+    {
+      "epoch": 2.3121019108280256,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 3.2577258952069934e-05,
+      "loss": 0.0032455208711326122,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00325,
+      "step": 726,
+      "tokens/total": 95043584,
+      "tokens/train_per_sec_per_gpu": 3151.03,
+      "tokens/trainable": 10118668
+    },
+    {
+      "epoch": 2.3152866242038215,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 3.252427109087403e-05,
+      "loss": 0.004745165351778269,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00476,
+      "step": 727,
+      "tokens/total": 95174656,
+      "tokens/train_per_sec_per_gpu": 3383.32,
+      "tokens/trainable": 10132813
+    },
+    {
+      "epoch": 2.3184713375796178,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 3.247124603509659e-05,
+      "loss": 0.004897519946098328,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00491,
+      "step": 728,
+      "tokens/total": 95305728,
+      "tokens/train_per_sec_per_gpu": 3389.91,
+      "tokens/trainable": 10147011
+    },
+    {
+      "epoch": 2.321656050955414,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 3.241818404685531e-05,
+      "loss": 0.0032559458632022142,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00326,
+      "step": 729,
+      "tokens/total": 95436800,
+      "tokens/train_per_sec_per_gpu": 3355.7,
+      "tokens/trainable": 10161054
+    },
+    {
+      "epoch": 2.3248407643312103,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 3.236508538845049e-05,
+      "loss": 0.007957718335092068,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00799,
+      "step": 730,
+      "tokens/total": 95567872,
+      "tokens/train_per_sec_per_gpu": 3374.92,
+      "tokens/trainable": 10175261
+    },
+    {
+      "epoch": 2.328025477707006,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 3.2311950322363685e-05,
+      "loss": 0.004248796030879021,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00426,
+      "step": 731,
+      "tokens/total": 95698944,
+      "tokens/train_per_sec_per_gpu": 2920.63,
+      "tokens/trainable": 10187549
+    },
+    {
+      "epoch": 2.3312101910828025,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 3.225877911125642e-05,
+      "loss": 0.0069992574863135815,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00702,
+      "step": 732,
+      "tokens/total": 95830016,
+      "tokens/train_per_sec_per_gpu": 3329.17,
+      "tokens/trainable": 10201476
+    },
+    {
+      "epoch": 2.3343949044585988,
+      "grad_norm": 0.142578125,
+      "learning_rate": 3.2205572017968895e-05,
+      "loss": 0.0038090457674115896,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00382,
+      "step": 733,
+      "tokens/total": 95961088,
+      "tokens/train_per_sec_per_gpu": 3853.91,
+      "tokens/trainable": 10217517
+    },
+    {
+      "epoch": 2.337579617834395,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 3.21523293055187e-05,
+      "loss": 0.005244470667093992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00526,
+      "step": 734,
+      "tokens/total": 96092160,
+      "tokens/train_per_sec_per_gpu": 3372.34,
+      "tokens/trainable": 10231610
+    },
+    {
+      "epoch": 2.340764331210191,
+      "grad_norm": 0.224609375,
+      "learning_rate": 3.2099051237099475e-05,
+      "loss": 0.007509202696382999,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00754,
+      "step": 735,
+      "tokens/total": 96223232,
+      "tokens/train_per_sec_per_gpu": 3226.63,
+      "tokens/trainable": 10245132
+    },
+    {
+      "epoch": 2.343949044585987,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 3.204573807607967e-05,
+      "loss": 0.004419627133756876,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00443,
+      "step": 736,
+      "tokens/total": 96354304,
+      "tokens/train_per_sec_per_gpu": 3439.56,
+      "tokens/trainable": 10259460
+    },
+    {
+      "epoch": 2.3471337579617835,
+      "grad_norm": 0.125,
+      "learning_rate": 3.199239008600117e-05,
+      "loss": 0.0039891735650599,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.004,
+      "step": 737,
+      "tokens/total": 96485376,
+      "tokens/train_per_sec_per_gpu": 3495.62,
+      "tokens/trainable": 10274090
+    },
+    {
+      "epoch": 2.3503184713375798,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 3.193900753057805e-05,
+      "loss": 0.004535307642072439,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00455,
+      "step": 738,
+      "tokens/total": 96616448,
+      "tokens/train_per_sec_per_gpu": 3314.96,
+      "tokens/trainable": 10287987
+    },
+    {
+      "epoch": 2.3535031847133756,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 3.188559067369525e-05,
+      "loss": 0.004258223343640566,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00427,
+      "step": 739,
+      "tokens/total": 96747520,
+      "tokens/train_per_sec_per_gpu": 3314.93,
+      "tokens/trainable": 10301858
+    },
+    {
+      "epoch": 2.356687898089172,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 3.183213977940726e-05,
+      "loss": 0.00545046990737319,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00547,
+      "step": 740,
+      "tokens/total": 96878592,
+      "tokens/train_per_sec_per_gpu": 3349.75,
+      "tokens/trainable": 10315882
+    },
+    {
+      "epoch": 2.359872611464968,
+      "grad_norm": 0.255859375,
+      "learning_rate": 3.1778655111936866e-05,
+      "loss": 0.005119058303534985,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00513,
+      "step": 741,
+      "tokens/total": 97009664,
+      "tokens/train_per_sec_per_gpu": 3190.1,
+      "tokens/trainable": 10329249
+    },
+    {
+      "epoch": 2.3630573248407645,
+      "grad_norm": 0.150390625,
+      "learning_rate": 3.172513693567375e-05,
+      "loss": 0.004317954182624817,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00433,
+      "step": 742,
+      "tokens/total": 97140736,
+      "tokens/train_per_sec_per_gpu": 3570.82,
+      "tokens/trainable": 10344194
+    },
+    {
+      "epoch": 2.3662420382165603,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 3.167158551517326e-05,
+      "loss": 0.004607304465025663,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00462,
+      "step": 743,
+      "tokens/total": 97271808,
+      "tokens/train_per_sec_per_gpu": 2783.4,
+      "tokens/trainable": 10355961
+    },
+    {
+      "epoch": 2.3694267515923566,
+      "grad_norm": 0.185546875,
+      "learning_rate": 3.1618001115155095e-05,
+      "loss": 0.00533033162355423,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00534,
+      "step": 744,
+      "tokens/total": 97402880,
+      "tokens/train_per_sec_per_gpu": 3466.49,
+      "tokens/trainable": 10370470
+    },
+    {
+      "epoch": 2.372611464968153,
+      "grad_norm": 0.154296875,
+      "learning_rate": 3.1564384000501954e-05,
+      "loss": 0.003959702793508768,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00397,
+      "step": 745,
+      "tokens/total": 97533952,
+      "tokens/train_per_sec_per_gpu": 3617.49,
+      "tokens/trainable": 10385521
+    },
+    {
+      "epoch": 2.375796178343949,
+      "grad_norm": 0.166015625,
+      "learning_rate": 3.151073443625828e-05,
+      "loss": 0.006154323928058147,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00617,
+      "step": 746,
+      "tokens/total": 97665024,
+      "tokens/train_per_sec_per_gpu": 3457.18,
+      "tokens/trainable": 10400021
+    },
+    {
+      "epoch": 2.3789808917197455,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 3.1457052687628905e-05,
+      "loss": 0.0052504888735711575,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00526,
+      "step": 747,
+      "tokens/total": 97796096,
+      "tokens/train_per_sec_per_gpu": 2728.56,
+      "tokens/trainable": 10411548
+    },
+    {
+      "epoch": 2.3821656050955413,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 3.140333901997776e-05,
+      "loss": 0.004432502668350935,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00444,
+      "step": 748,
+      "tokens/total": 97927168,
+      "tokens/train_per_sec_per_gpu": 3271.0,
+      "tokens/trainable": 10425216
+    },
+    {
+      "epoch": 2.3853503184713376,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 3.1349593698826566e-05,
+      "loss": 0.006921032909303904,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00695,
+      "step": 749,
+      "tokens/total": 98058240,
+      "tokens/train_per_sec_per_gpu": 3261.31,
+      "tokens/trainable": 10438873
+    },
+    {
+      "epoch": 2.388535031847134,
+      "grad_norm": 0.216796875,
+      "learning_rate": 3.1295816989853514e-05,
+      "loss": 0.004738848190754652,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00475,
+      "step": 750,
+      "tokens/total": 98189312,
+      "tokens/train_per_sec_per_gpu": 3321.82,
+      "tokens/trainable": 10452755
+    },
+    {
+      "epoch": 2.3917197452229297,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 3.124200915889195e-05,
+      "loss": 0.0069868722930550575,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00701,
+      "step": 751,
+      "tokens/total": 98320384,
+      "tokens/train_per_sec_per_gpu": 3436.55,
+      "tokens/trainable": 10467133
+    },
+    {
+      "epoch": 2.394904458598726,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 3.118817047192907e-05,
+      "loss": 0.0037817361298948526,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00379,
+      "step": 752,
+      "tokens/total": 98451456,
+      "tokens/train_per_sec_per_gpu": 3295.93,
+      "tokens/trainable": 10480957
+    },
+    {
+      "epoch": 2.3980891719745223,
+      "grad_norm": 0.2109375,
+      "learning_rate": 3.11343011951046e-05,
+      "loss": 0.006747876293957233,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00677,
+      "step": 753,
+      "tokens/total": 98582528,
+      "tokens/train_per_sec_per_gpu": 3181.77,
+      "tokens/trainable": 10494276
+    },
+    {
+      "epoch": 2.4012738853503186,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 3.108040159470949e-05,
+      "loss": 0.005729879718273878,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00575,
+      "step": 754,
+      "tokens/total": 98713600,
+      "tokens/train_per_sec_per_gpu": 3542.0,
+      "tokens/trainable": 10509041
+    },
+    {
+      "epoch": 2.404458598726115,
+      "grad_norm": 0.193359375,
+      "learning_rate": 3.1026471937184554e-05,
+      "loss": 0.005885195918381214,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0059,
+      "step": 755,
+      "tokens/total": 98844672,
+      "tokens/train_per_sec_per_gpu": 3159.44,
+      "tokens/trainable": 10522288
+    },
+    {
+      "epoch": 2.4076433121019107,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 3.097251248911922e-05,
+      "loss": 0.005482829641550779,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0055,
+      "step": 756,
+      "tokens/total": 98975744,
+      "tokens/train_per_sec_per_gpu": 3443.41,
+      "tokens/trainable": 10536684
+    },
+    {
+      "epoch": 2.410828025477707,
+      "grad_norm": 0.150390625,
+      "learning_rate": 3.091852351725018e-05,
+      "loss": 0.003930831328034401,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00394,
+      "step": 757,
+      "tokens/total": 99106816,
+      "tokens/train_per_sec_per_gpu": 3398.83,
+      "tokens/trainable": 10550908
+    },
+    {
+      "epoch": 2.4140127388535033,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 3.0864505288460034e-05,
+      "loss": 0.006072892341762781,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00609,
+      "step": 758,
+      "tokens/total": 99237888,
+      "tokens/train_per_sec_per_gpu": 3411.03,
+      "tokens/trainable": 10565215
+    },
+    {
+      "epoch": 2.417197452229299,
+      "grad_norm": 0.166015625,
+      "learning_rate": 3.0810458069776044e-05,
+      "loss": 0.0038501895032823086,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00386,
+      "step": 759,
+      "tokens/total": 99368960,
+      "tokens/train_per_sec_per_gpu": 3448.42,
+      "tokens/trainable": 10579654
+    },
+    {
+      "epoch": 2.4203821656050954,
+      "grad_norm": 0.1796875,
+      "learning_rate": 3.0756382128368765e-05,
+      "loss": 0.006182640325278044,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0062,
+      "step": 760,
+      "tokens/total": 99500032,
+      "tokens/train_per_sec_per_gpu": 3253.44,
+      "tokens/trainable": 10593291
+    },
+    {
+      "epoch": 2.4235668789808917,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 3.070227773155074e-05,
+      "loss": 0.0059751239605247974,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00599,
+      "step": 761,
+      "tokens/total": 99631104,
+      "tokens/train_per_sec_per_gpu": 3587.67,
+      "tokens/trainable": 10608279
+    },
+    {
+      "epoch": 2.426751592356688,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 3.064814514677517e-05,
+      "loss": 0.005476124584674835,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00549,
+      "step": 762,
+      "tokens/total": 99762176,
+      "tokens/train_per_sec_per_gpu": 3337.14,
+      "tokens/trainable": 10622276
+    },
+    {
+      "epoch": 2.4299363057324843,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 3.0593984641634595e-05,
+      "loss": 0.007891716435551643,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00792,
+      "step": 763,
+      "tokens/total": 99893248,
+      "tokens/train_per_sec_per_gpu": 2999.6,
+      "tokens/trainable": 10634845
+    },
+    {
+      "epoch": 2.43312101910828,
+      "grad_norm": 0.130859375,
+      "learning_rate": 3.053979648385957e-05,
+      "loss": 0.004688839428126812,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0047,
+      "step": 764,
+      "tokens/total": 100024320,
+      "tokens/train_per_sec_per_gpu": 3484.41,
+      "tokens/trainable": 10649410
+    },
+    {
+      "epoch": 2.4363057324840764,
+      "grad_norm": 0.150390625,
+      "learning_rate": 3.048558094131737e-05,
+      "loss": 0.004935243632644415,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00495,
+      "step": 765,
+      "tokens/total": 100155392,
+      "tokens/train_per_sec_per_gpu": 3011.34,
+      "tokens/trainable": 10662117
+    },
+    {
+      "epoch": 2.4394904458598727,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 3.0431338282010606e-05,
+      "loss": 0.004069786984473467,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00408,
+      "step": 766,
+      "tokens/total": 100286464,
+      "tokens/train_per_sec_per_gpu": 3252.38,
+      "tokens/trainable": 10675770
+    },
+    {
+      "epoch": 2.4426751592356686,
+      "grad_norm": 0.16796875,
+      "learning_rate": 3.0377068774075957e-05,
+      "loss": 0.005909848026931286,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00593,
+      "step": 767,
+      "tokens/total": 100417536,
+      "tokens/train_per_sec_per_gpu": 3084.65,
+      "tokens/trainable": 10688759
+    },
+    {
+      "epoch": 2.445859872611465,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 3.0322772685782815e-05,
+      "loss": 0.005527772940695286,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00554,
+      "step": 768,
+      "tokens/total": 100548608,
+      "tokens/train_per_sec_per_gpu": 3143.83,
+      "tokens/trainable": 10701925
+    },
+    {
+      "epoch": 2.449044585987261,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 3.0268450285531967e-05,
+      "loss": 0.005178853869438171,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00519,
+      "step": 769,
+      "tokens/total": 100679680,
+      "tokens/train_per_sec_per_gpu": 3510.23,
+      "tokens/trainable": 10716553
+    },
+    {
+      "epoch": 2.4522292993630574,
+      "grad_norm": 0.115234375,
+      "learning_rate": 3.021410184185427e-05,
+      "loss": 0.0034743379801511765,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00348,
+      "step": 770,
+      "tokens/total": 100810752,
+      "tokens/train_per_sec_per_gpu": 3316.12,
+      "tokens/trainable": 10730411
+    },
+    {
+      "epoch": 2.4554140127388537,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 3.0159727623409313e-05,
+      "loss": 0.0041341050527989864,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00414,
+      "step": 771,
+      "tokens/total": 100941824,
+      "tokens/train_per_sec_per_gpu": 3027.88,
+      "tokens/trainable": 10743149
+    },
+    {
+      "epoch": 2.4585987261146496,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 3.0105327898984102e-05,
+      "loss": 0.004606778733432293,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00462,
+      "step": 772,
+      "tokens/total": 101072896,
+      "tokens/train_per_sec_per_gpu": 3423.19,
+      "tokens/trainable": 10757437
+    },
+    {
+      "epoch": 2.461783439490446,
+      "grad_norm": 0.193359375,
+      "learning_rate": 3.005090293749174e-05,
+      "loss": 0.006537875160574913,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00656,
+      "step": 773,
+      "tokens/total": 101203968,
+      "tokens/train_per_sec_per_gpu": 3675.05,
+      "tokens/trainable": 10772736
+    },
+    {
+      "epoch": 2.464968152866242,
+      "grad_norm": 0.181640625,
+      "learning_rate": 2.9996453007970056e-05,
+      "loss": 0.006382662802934647,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0064,
+      "step": 774,
+      "tokens/total": 101335040,
+      "tokens/train_per_sec_per_gpu": 3821.84,
+      "tokens/trainable": 10788651
+    },
+    {
+      "epoch": 2.468152866242038,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.994197837958032e-05,
+      "loss": 0.005575335118919611,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00559,
+      "step": 775,
+      "tokens/total": 101466112,
+      "tokens/train_per_sec_per_gpu": 3358.0,
+      "tokens/trainable": 10802732
+    },
+    {
+      "epoch": 2.4713375796178343,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.9887479321605895e-05,
+      "loss": 0.005272061098366976,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00529,
+      "step": 776,
+      "tokens/total": 101597184,
+      "tokens/train_per_sec_per_gpu": 3377.54,
+      "tokens/trainable": 10816888
+    },
+    {
+      "epoch": 2.4745222929936306,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 2.9832956103450905e-05,
+      "loss": 0.0034832200035452843,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00349,
+      "step": 777,
+      "tokens/total": 101728256,
+      "tokens/train_per_sec_per_gpu": 3313.99,
+      "tokens/trainable": 10830748
+    },
+    {
+      "epoch": 2.477707006369427,
+      "grad_norm": 0.166015625,
+      "learning_rate": 2.9778408994638906e-05,
+      "loss": 0.005426026880741119,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00544,
+      "step": 778,
+      "tokens/total": 101859328,
+      "tokens/train_per_sec_per_gpu": 3283.57,
+      "tokens/trainable": 10844538
+    },
+    {
+      "epoch": 2.480891719745223,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.9723838264811545e-05,
+      "loss": 0.00458392733708024,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00459,
+      "step": 779,
+      "tokens/total": 101990400,
+      "tokens/train_per_sec_per_gpu": 3438.29,
+      "tokens/trainable": 10858901
+    },
+    {
+      "epoch": 2.484076433121019,
+      "grad_norm": 0.1875,
+      "learning_rate": 2.966924418372724e-05,
+      "loss": 0.006339904386550188,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00636,
+      "step": 780,
+      "tokens/total": 102121472,
+      "tokens/train_per_sec_per_gpu": 3321.39,
+      "tokens/trainable": 10873475
+    },
+    {
+      "epoch": 2.4872611464968153,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 2.9614627021259846e-05,
+      "loss": 0.006326707080006599,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00635,
+      "step": 781,
+      "tokens/total": 102252544,
+      "tokens/train_per_sec_per_gpu": 3305.16,
+      "tokens/trainable": 10887308
+    },
+    {
+      "epoch": 2.4904458598726116,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 2.9559987047397303e-05,
+      "loss": 0.006832771003246307,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00686,
+      "step": 782,
+      "tokens/total": 102383616,
+      "tokens/train_per_sec_per_gpu": 2932.75,
+      "tokens/trainable": 10899670
+    },
+    {
+      "epoch": 2.4936305732484074,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 2.950532453224032e-05,
+      "loss": 0.003962225280702114,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00397,
+      "step": 783,
+      "tokens/total": 102514688,
+      "tokens/train_per_sec_per_gpu": 3082.59,
+      "tokens/trainable": 10912648
+    },
+    {
+      "epoch": 2.4968152866242037,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 2.945063974600104e-05,
+      "loss": 0.005036994814872742,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00505,
+      "step": 784,
+      "tokens/total": 102645760,
+      "tokens/train_per_sec_per_gpu": 3643.04,
+      "tokens/trainable": 10927870
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.173828125,
+      "learning_rate": 2.9395932959001692e-05,
+      "loss": 0.0055970605462789536,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00561,
+      "step": 785,
+      "tokens/total": 102776832,
+      "tokens/train_per_sec_per_gpu": 3510.01,
+      "tokens/trainable": 10942573
+    },
+    {
+      "epoch": 2.5,
+      "eval_loss": 0.00919391866773367,
+      "eval_ppl": 1.00924,
+      "eval_runtime": 41.9998,
+      "eval_samples_per_second": 64.31,
+      "eval_steps_per_second": 4.024,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 785
+    },
+    {
+      "epoch": 2.5031847133757963,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 2.9341204441673266e-05,
+      "loss": 0.004385429434478283,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0044,
+      "step": 786,
+      "tokens/total": 102907904,
+      "tokens/train_per_sec_per_gpu": 3007.8,
+      "tokens/trainable": 10955164
+    },
+    {
+      "epoch": 2.5063694267515926,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 2.9286454464554152e-05,
+      "loss": 0.006849427707493305,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00687,
+      "step": 787,
+      "tokens/total": 103038976,
+      "tokens/train_per_sec_per_gpu": 3371.47,
+      "tokens/trainable": 10969219
+    },
+    {
+      "epoch": 2.5095541401273884,
+      "grad_norm": 0.150390625,
+      "learning_rate": 2.9231683298288853e-05,
+      "loss": 0.005230756010860205,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00524,
+      "step": 788,
+      "tokens/total": 103170048,
+      "tokens/train_per_sec_per_gpu": 3590.97,
+      "tokens/trainable": 10984159
+    },
+    {
+      "epoch": 2.5127388535031847,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 2.9176891213626595e-05,
+      "loss": 0.00515084620565176,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00516,
+      "step": 789,
+      "tokens/total": 103301120,
+      "tokens/train_per_sec_per_gpu": 3471.15,
+      "tokens/trainable": 10998703
+    },
+    {
+      "epoch": 2.515923566878981,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 2.9122078481420012e-05,
+      "loss": 0.005567297339439392,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00558,
+      "step": 790,
+      "tokens/total": 103432192,
+      "tokens/train_per_sec_per_gpu": 3566.36,
+      "tokens/trainable": 11013580
+    },
+    {
+      "epoch": 2.519108280254777,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 2.906724537262381e-05,
+      "loss": 0.005145716480910778,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00516,
+      "step": 791,
+      "tokens/total": 103563264,
+      "tokens/train_per_sec_per_gpu": 3399.88,
+      "tokens/trainable": 11027822
+    },
+    {
+      "epoch": 2.522292993630573,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 2.901239215829341e-05,
+      "loss": 0.0032891561277210712,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00329,
+      "step": 792,
+      "tokens/total": 103694336,
+      "tokens/train_per_sec_per_gpu": 3050.99,
+      "tokens/trainable": 11040610
+    },
+    {
+      "epoch": 2.5254777070063694,
+      "grad_norm": 0.166015625,
+      "learning_rate": 2.895751910958364e-05,
+      "loss": 0.005250695627182722,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00526,
+      "step": 793,
+      "tokens/total": 103825408,
+      "tokens/train_per_sec_per_gpu": 3579.93,
+      "tokens/trainable": 11055502
+    },
+    {
+      "epoch": 2.5286624203821657,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 2.8902626497747366e-05,
+      "loss": 0.005496595986187458,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00551,
+      "step": 794,
+      "tokens/total": 103956480,
+      "tokens/train_per_sec_per_gpu": 3718.84,
+      "tokens/trainable": 11070940
+    },
+    {
+      "epoch": 2.531847133757962,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 2.8847714594134144e-05,
+      "loss": 0.006310721859335899,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00633,
+      "step": 795,
+      "tokens/total": 104087552,
+      "tokens/train_per_sec_per_gpu": 3917.97,
+      "tokens/trainable": 11087217
+    },
+    {
+      "epoch": 2.535031847133758,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 2.8792783670188927e-05,
+      "loss": 0.005432881880551577,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00545,
+      "step": 796,
+      "tokens/total": 104218624,
+      "tokens/train_per_sec_per_gpu": 3115.17,
+      "tokens/trainable": 11100308
+    },
+    {
+      "epoch": 2.538216560509554,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.873783399745066e-05,
+      "loss": 0.005197789054363966,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00521,
+      "step": 797,
+      "tokens/total": 104349696,
+      "tokens/train_per_sec_per_gpu": 3361.6,
+      "tokens/trainable": 11114376
+    },
+    {
+      "epoch": 2.5414012738853504,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 2.868286584755099e-05,
+      "loss": 0.005434297490864992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00545,
+      "step": 798,
+      "tokens/total": 104480768,
+      "tokens/train_per_sec_per_gpu": 3144.12,
+      "tokens/trainable": 11127547
+    },
+    {
+      "epoch": 2.5445859872611463,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 2.862787949221288e-05,
+      "loss": 0.0028167557902634144,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00282,
+      "step": 799,
+      "tokens/total": 104611840,
+      "tokens/train_per_sec_per_gpu": 3311.57,
+      "tokens/trainable": 11141342
+    },
+    {
+      "epoch": 2.5477707006369426,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 2.857287520324931e-05,
+      "loss": 0.0033000826369971037,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00331,
+      "step": 800,
+      "tokens/total": 104742912,
+      "tokens/train_per_sec_per_gpu": 3334.18,
+      "tokens/trainable": 11155303
+    },
+    {
+      "epoch": 2.550955414012739,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 2.8517853252561906e-05,
+      "loss": 0.004212173167616129,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00422,
+      "step": 801,
+      "tokens/total": 104873984,
+      "tokens/train_per_sec_per_gpu": 3504.45,
+      "tokens/trainable": 11169930
+    },
+    {
+      "epoch": 2.554140127388535,
+      "grad_norm": 0.166015625,
+      "learning_rate": 2.8462813912139586e-05,
+      "loss": 0.005329788196831942,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00534,
+      "step": 802,
+      "tokens/total": 105005056,
+      "tokens/train_per_sec_per_gpu": 3283.1,
+      "tokens/trainable": 11183661
+    },
+    {
+      "epoch": 2.5573248407643314,
+      "grad_norm": 0.134765625,
+      "learning_rate": 2.8407757454057248e-05,
+      "loss": 0.0038679102435708046,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00388,
+      "step": 803,
+      "tokens/total": 105136128,
+      "tokens/train_per_sec_per_gpu": 3540.39,
+      "tokens/trainable": 11198409
+    },
+    {
+      "epoch": 2.5605095541401273,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 2.83526841504744e-05,
+      "loss": 0.004407938569784164,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00442,
+      "step": 804,
+      "tokens/total": 105267200,
+      "tokens/train_per_sec_per_gpu": 3017.63,
+      "tokens/trainable": 11211070
+    },
+    {
+      "epoch": 2.5636942675159236,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.8297594273633816e-05,
+      "loss": 0.004717926029115915,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00473,
+      "step": 805,
+      "tokens/total": 105398272,
+      "tokens/train_per_sec_per_gpu": 3319.15,
+      "tokens/trainable": 11224899
+    },
+    {
+      "epoch": 2.56687898089172,
+      "grad_norm": 0.1875,
+      "learning_rate": 2.824248809586021e-05,
+      "loss": 0.005949638783931732,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00597,
+      "step": 806,
+      "tokens/total": 105529344,
+      "tokens/train_per_sec_per_gpu": 3475.48,
+      "tokens/trainable": 11239401
+    },
+    {
+      "epoch": 2.5700636942675157,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 2.8187365889558858e-05,
+      "loss": 0.004526551812887192,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00454,
+      "step": 807,
+      "tokens/total": 105660416,
+      "tokens/train_per_sec_per_gpu": 3481.42,
+      "tokens/trainable": 11253859
+    },
+    {
+      "epoch": 2.573248407643312,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 2.81322279272143e-05,
+      "loss": 0.005305514670908451,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00532,
+      "step": 808,
+      "tokens/total": 105791488,
+      "tokens/train_per_sec_per_gpu": 3355.57,
+      "tokens/trainable": 11267936
+    },
+    {
+      "epoch": 2.5764331210191083,
+      "grad_norm": 0.189453125,
+      "learning_rate": 2.8077074481388927e-05,
+      "loss": 0.003922187723219395,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00393,
+      "step": 809,
+      "tokens/total": 105922560,
+      "tokens/train_per_sec_per_gpu": 3053.79,
+      "tokens/trainable": 11280737
+    },
+    {
+      "epoch": 2.5796178343949046,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 2.802190582472168e-05,
+      "loss": 0.004988102242350578,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.005,
+      "step": 810,
+      "tokens/total": 106053632,
+      "tokens/train_per_sec_per_gpu": 3417.76,
+      "tokens/trainable": 11295020
+    },
+    {
+      "epoch": 2.582802547770701,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 2.7966722229926712e-05,
+      "loss": 0.002851355355232954,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00286,
+      "step": 811,
+      "tokens/total": 106184704,
+      "tokens/train_per_sec_per_gpu": 3055.95,
+      "tokens/trainable": 11307828
+    },
+    {
+      "epoch": 2.5859872611464967,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.7911523969791997e-05,
+      "loss": 0.00479587959125638,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00481,
+      "step": 812,
+      "tokens/total": 106315776,
+      "tokens/train_per_sec_per_gpu": 3332.89,
+      "tokens/trainable": 11321718
+    },
+    {
+      "epoch": 2.589171974522293,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 2.7856311317178002e-05,
+      "loss": 0.00479497155174613,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00481,
+      "step": 813,
+      "tokens/total": 106446848,
+      "tokens/train_per_sec_per_gpu": 3224.61,
+      "tokens/trainable": 11335234
+    },
+    {
+      "epoch": 2.5923566878980893,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 2.7801084545016364e-05,
+      "loss": 0.005322256591171026,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00534,
+      "step": 814,
+      "tokens/total": 106577920,
+      "tokens/train_per_sec_per_gpu": 3067.32,
+      "tokens/trainable": 11348079
+    },
+    {
+      "epoch": 2.595541401273885,
+      "grad_norm": 0.162109375,
+      "learning_rate": 2.774584392630849e-05,
+      "loss": 0.004532738588750362,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00454,
+      "step": 815,
+      "tokens/total": 106708992,
+      "tokens/train_per_sec_per_gpu": 3239.47,
+      "tokens/trainable": 11361632
+    },
+    {
+      "epoch": 2.5987261146496814,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.769058973412424e-05,
+      "loss": 0.005558821838349104,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00557,
+      "step": 816,
+      "tokens/total": 106840064,
+      "tokens/train_per_sec_per_gpu": 3432.78,
+      "tokens/trainable": 11376015
+    },
+    {
+      "epoch": 2.6019108280254777,
+      "grad_norm": 0.23046875,
+      "learning_rate": 2.7635322241600603e-05,
+      "loss": 0.008326980285346508,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00836,
+      "step": 817,
+      "tokens/total": 106971136,
+      "tokens/train_per_sec_per_gpu": 3064.13,
+      "tokens/trainable": 11388898
+    },
+    {
+      "epoch": 2.605095541401274,
+      "grad_norm": 0.1796875,
+      "learning_rate": 2.7580041721940264e-05,
+      "loss": 0.005567263346165419,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00558,
+      "step": 818,
+      "tokens/total": 107102208,
+      "tokens/train_per_sec_per_gpu": 3339.15,
+      "tokens/trainable": 11402886
+    },
+    {
+      "epoch": 2.6082802547770703,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 2.7524748448410337e-05,
+      "loss": 0.0028434821870177984,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00285,
+      "step": 819,
+      "tokens/total": 107233280,
+      "tokens/train_per_sec_per_gpu": 3304.29,
+      "tokens/trainable": 11416722
+    },
+    {
+      "epoch": 2.611464968152866,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.7469442694340984e-05,
+      "loss": 0.0058287507854402065,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00585,
+      "step": 820,
+      "tokens/total": 107364352,
+      "tokens/train_per_sec_per_gpu": 3313.02,
+      "tokens/trainable": 11430598
+    },
+    {
+      "epoch": 2.6146496815286624,
+      "grad_norm": 0.15625,
+      "learning_rate": 2.7414124733124046e-05,
+      "loss": 0.004522873554378748,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00453,
+      "step": 821,
+      "tokens/total": 107495424,
+      "tokens/train_per_sec_per_gpu": 3603.28,
+      "tokens/trainable": 11445614
+    },
+    {
+      "epoch": 2.6178343949044587,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 2.735879483821171e-05,
+      "loss": 0.004399726167321205,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00441,
+      "step": 822,
+      "tokens/total": 107626496,
+      "tokens/train_per_sec_per_gpu": 3592.02,
+      "tokens/trainable": 11460570
+    },
+    {
+      "epoch": 2.6210191082802545,
+      "grad_norm": 0.13671875,
+      "learning_rate": 2.7303453283115177e-05,
+      "loss": 0.004378693178296089,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00439,
+      "step": 823,
+      "tokens/total": 107757568,
+      "tokens/train_per_sec_per_gpu": 3491.55,
+      "tokens/trainable": 11475144
+    },
+    {
+      "epoch": 2.624203821656051,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 2.7248100341403247e-05,
+      "loss": 0.0058170161210000515,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00583,
+      "step": 824,
+      "tokens/total": 107888640,
+      "tokens/train_per_sec_per_gpu": 3439.8,
+      "tokens/trainable": 11489495
+    },
+    {
+      "epoch": 2.627388535031847,
+      "grad_norm": 0.13671875,
+      "learning_rate": 2.7192736286701042e-05,
+      "loss": 0.0035439918283373117,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00355,
+      "step": 825,
+      "tokens/total": 108019712,
+      "tokens/train_per_sec_per_gpu": 3559.22,
+      "tokens/trainable": 11504322
+    },
+    {
+      "epoch": 2.6305732484076434,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 2.7137361392688613e-05,
+      "loss": 0.004517707973718643,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00453,
+      "step": 826,
+      "tokens/total": 108150784,
+      "tokens/train_per_sec_per_gpu": 3300.49,
+      "tokens/trainable": 11518136
+    },
+    {
+      "epoch": 2.6337579617834397,
+      "grad_norm": 0.13671875,
+      "learning_rate": 2.7081975933099573e-05,
+      "loss": 0.005291810724884272,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00531,
+      "step": 827,
+      "tokens/total": 108281856,
+      "tokens/train_per_sec_per_gpu": 3485.86,
+      "tokens/trainable": 11532728
+    },
+    {
+      "epoch": 2.6369426751592355,
+      "grad_norm": 0.123046875,
+      "learning_rate": 2.7026580181719774e-05,
+      "loss": 0.0031160882208496332,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00312,
+      "step": 828,
+      "tokens/total": 108412928,
+      "tokens/train_per_sec_per_gpu": 3088.88,
+      "tokens/trainable": 11545669
+    },
+    {
+      "epoch": 2.640127388535032,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.697117441238597e-05,
+      "loss": 0.004703770391643047,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00471,
+      "step": 829,
+      "tokens/total": 108544000,
+      "tokens/train_per_sec_per_gpu": 3390.81,
+      "tokens/trainable": 11559794
+    },
+    {
+      "epoch": 2.643312101910828,
+      "grad_norm": 0.1875,
+      "learning_rate": 2.6915758898984384e-05,
+      "loss": 0.006808799225836992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00683,
+      "step": 830,
+      "tokens/total": 108675072,
+      "tokens/train_per_sec_per_gpu": 3340.4,
+      "tokens/trainable": 11573796
+    },
+    {
+      "epoch": 2.646496815286624,
+      "grad_norm": 0.212890625,
+      "learning_rate": 2.686033391544945e-05,
+      "loss": 0.005405929870903492,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00542,
+      "step": 831,
+      "tokens/total": 108806144,
+      "tokens/train_per_sec_per_gpu": 3397.97,
+      "tokens/trainable": 11588025
+    },
+    {
+      "epoch": 2.6496815286624202,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 2.6804899735762405e-05,
+      "loss": 0.006530239712446928,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00655,
+      "step": 832,
+      "tokens/total": 108937216,
+      "tokens/train_per_sec_per_gpu": 3240.0,
+      "tokens/trainable": 11601588
+    },
+    {
+      "epoch": 2.6528662420382165,
+      "grad_norm": 0.142578125,
+      "learning_rate": 2.6749456633949932e-05,
+      "loss": 0.0037627576384693384,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00377,
+      "step": 833,
+      "tokens/total": 109068288,
+      "tokens/train_per_sec_per_gpu": 3152.61,
+      "tokens/trainable": 11614787
+    },
+    {
+      "epoch": 2.656050955414013,
+      "grad_norm": 0.158203125,
+      "learning_rate": 2.6694004884082825e-05,
+      "loss": 0.0034914424177259207,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0035,
+      "step": 834,
+      "tokens/total": 109199360,
+      "tokens/train_per_sec_per_gpu": 3518.76,
+      "tokens/trainable": 11629463
+    },
+    {
+      "epoch": 2.659235668789809,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 2.663854476027465e-05,
+      "loss": 0.004583639558404684,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00459,
+      "step": 835,
+      "tokens/total": 109330432,
+      "tokens/train_per_sec_per_gpu": 3674.53,
+      "tokens/trainable": 11644760
+    },
+    {
+      "epoch": 2.662420382165605,
+      "grad_norm": 0.21875,
+      "learning_rate": 2.6583076536680323e-05,
+      "loss": 0.007365885656327009,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00739,
+      "step": 836,
+      "tokens/total": 109461504,
+      "tokens/train_per_sec_per_gpu": 3133.81,
+      "tokens/trainable": 11657934
+    },
+    {
+      "epoch": 2.6656050955414012,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 2.652760048749483e-05,
+      "loss": 0.004122959915548563,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00413,
+      "step": 837,
+      "tokens/total": 109592576,
+      "tokens/train_per_sec_per_gpu": 3445.42,
+      "tokens/trainable": 11672312
+    },
+    {
+      "epoch": 2.6687898089171975,
+      "grad_norm": 0.150390625,
+      "learning_rate": 2.647211688695186e-05,
+      "loss": 0.005676808767020702,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00569,
+      "step": 838,
+      "tokens/total": 109723648,
+      "tokens/train_per_sec_per_gpu": 3626.52,
+      "tokens/trainable": 11687353
+    },
+    {
+      "epoch": 2.6719745222929934,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 2.6416626009322375e-05,
+      "loss": 0.005739385262131691,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00576,
+      "step": 839,
+      "tokens/total": 109854720,
+      "tokens/train_per_sec_per_gpu": 3288.19,
+      "tokens/trainable": 11701118
+    },
+    {
+      "epoch": 2.6751592356687897,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 2.6361128128913347e-05,
+      "loss": 0.00492321141064167,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00494,
+      "step": 840,
+      "tokens/total": 109985792,
+      "tokens/train_per_sec_per_gpu": 3438.3,
+      "tokens/trainable": 11715518
+    },
+    {
+      "epoch": 2.678343949044586,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 2.6305623520066382e-05,
+      "loss": 0.0048889112658798695,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0049,
+      "step": 841,
+      "tokens/total": 110116864,
+      "tokens/train_per_sec_per_gpu": 3362.44,
+      "tokens/trainable": 11729606
+    },
+    {
+      "epoch": 2.6815286624203822,
+      "grad_norm": 0.189453125,
+      "learning_rate": 2.6250112457156296e-05,
+      "loss": 0.005592016503214836,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00561,
+      "step": 842,
+      "tokens/total": 110247936,
+      "tokens/train_per_sec_per_gpu": 2882.24,
+      "tokens/trainable": 11741702
+    },
+    {
+      "epoch": 2.6847133757961785,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 2.619459521458984e-05,
+      "loss": 0.0058587053790688515,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00588,
+      "step": 843,
+      "tokens/total": 110379008,
+      "tokens/train_per_sec_per_gpu": 3463.6,
+      "tokens/trainable": 11756164
+    },
+    {
+      "epoch": 2.6878980891719744,
+      "grad_norm": 0.1796875,
+      "learning_rate": 2.6139072066804332e-05,
+      "loss": 0.004927500616759062,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00494,
+      "step": 844,
+      "tokens/total": 110510080,
+      "tokens/train_per_sec_per_gpu": 3446.83,
+      "tokens/trainable": 11770544
+    },
+    {
+      "epoch": 2.6910828025477707,
+      "grad_norm": 0.2109375,
+      "learning_rate": 2.6083543288266233e-05,
+      "loss": 0.007675695698708296,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00771,
+      "step": 845,
+      "tokens/total": 110641152,
+      "tokens/train_per_sec_per_gpu": 3146.49,
+      "tokens/trainable": 11783721
+    },
+    {
+      "epoch": 2.694267515923567,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 2.602800915346986e-05,
+      "loss": 0.004762662574648857,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00477,
+      "step": 846,
+      "tokens/total": 110772224,
+      "tokens/train_per_sec_per_gpu": 3111.12,
+      "tokens/trainable": 11796753
+    },
+    {
+      "epoch": 2.697452229299363,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 2.5972469936936046e-05,
+      "loss": 0.006559155881404877,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00658,
+      "step": 847,
+      "tokens/total": 110903296,
+      "tokens/train_per_sec_per_gpu": 3445.43,
+      "tokens/trainable": 11811161
+    },
+    {
+      "epoch": 2.700636942675159,
+      "grad_norm": 0.185546875,
+      "learning_rate": 2.5916925913210677e-05,
+      "loss": 0.005181832704693079,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0052,
+      "step": 848,
+      "tokens/total": 111034368,
+      "tokens/train_per_sec_per_gpu": 3072.78,
+      "tokens/trainable": 11824046
+    },
+    {
+      "epoch": 2.7038216560509554,
+      "grad_norm": 0.1640625,
+      "learning_rate": 2.5861377356863437e-05,
+      "loss": 0.005784741137176752,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0058,
+      "step": 849,
+      "tokens/total": 111165440,
+      "tokens/train_per_sec_per_gpu": 3386.17,
+      "tokens/trainable": 11838220
+    },
+    {
+      "epoch": 2.7070063694267517,
+      "grad_norm": 0.181640625,
+      "learning_rate": 2.5805824542486434e-05,
+      "loss": 0.006970499642193317,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00699,
+      "step": 850,
+      "tokens/total": 111296512,
+      "tokens/train_per_sec_per_gpu": 3653.3,
+      "tokens/trainable": 11853456
+    },
+    {
+      "epoch": 2.710191082802548,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 2.5750267744692786e-05,
+      "loss": 0.005797088146209717,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00581,
+      "step": 851,
+      "tokens/total": 111427584,
+      "tokens/train_per_sec_per_gpu": 3852.62,
+      "tokens/trainable": 11869442
+    },
+    {
+      "epoch": 2.713375796178344,
+      "grad_norm": 0.138671875,
+      "learning_rate": 2.5694707238115323e-05,
+      "loss": 0.003937084693461657,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00394,
+      "step": 852,
+      "tokens/total": 111558656,
+      "tokens/train_per_sec_per_gpu": 3164.69,
+      "tokens/trainable": 11882674
+    },
+    {
+      "epoch": 2.71656050955414,
+      "grad_norm": 0.177734375,
+      "learning_rate": 2.5639143297405222e-05,
+      "loss": 0.004891657270491123,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0049,
+      "step": 853,
+      "tokens/total": 111689728,
+      "tokens/train_per_sec_per_gpu": 3223.75,
+      "tokens/trainable": 11896189
+    },
+    {
+      "epoch": 2.7197452229299364,
+      "grad_norm": 0.14453125,
+      "learning_rate": 2.5583576197230603e-05,
+      "loss": 0.003982385154813528,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00399,
+      "step": 854,
+      "tokens/total": 111820800,
+      "tokens/train_per_sec_per_gpu": 3223.56,
+      "tokens/trainable": 11909688
+    },
+    {
+      "epoch": 2.722929936305732,
+      "grad_norm": 0.1484375,
+      "learning_rate": 2.5528006212275218e-05,
+      "loss": 0.0039648148231208324,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00397,
+      "step": 855,
+      "tokens/total": 111951872,
+      "tokens/train_per_sec_per_gpu": 3301.63,
+      "tokens/trainable": 11923437
+    },
+    {
+      "epoch": 2.7261146496815285,
+      "grad_norm": 0.193359375,
+      "learning_rate": 2.5472433617237107e-05,
+      "loss": 0.006385331507772207,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00641,
+      "step": 856,
+      "tokens/total": 112082944,
+      "tokens/train_per_sec_per_gpu": 3430.81,
+      "tokens/trainable": 11937805
+    },
+    {
+      "epoch": 2.729299363057325,
+      "grad_norm": 0.162109375,
+      "learning_rate": 2.541685868682716e-05,
+      "loss": 0.005221599247306585,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00524,
+      "step": 857,
+      "tokens/total": 112214016,
+      "tokens/train_per_sec_per_gpu": 3535.81,
+      "tokens/trainable": 11952610
+    },
+    {
+      "epoch": 2.732484076433121,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 2.5361281695767854e-05,
+      "loss": 0.004517777357250452,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00453,
+      "step": 858,
+      "tokens/total": 112345088,
+      "tokens/train_per_sec_per_gpu": 3097.23,
+      "tokens/trainable": 11965620
+    },
+    {
+      "epoch": 2.7356687898089174,
+      "grad_norm": 0.1328125,
+      "learning_rate": 2.530570291879184e-05,
+      "loss": 0.003382981289178133,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00339,
+      "step": 859,
+      "tokens/total": 112476160,
+      "tokens/train_per_sec_per_gpu": 3500.78,
+      "tokens/trainable": 11980302
+    },
+    {
+      "epoch": 2.738853503184713,
+      "grad_norm": 0.162109375,
+      "learning_rate": 2.5250122630640587e-05,
+      "loss": 0.005662713665515184,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00568,
+      "step": 860,
+      "tokens/total": 112607232,
+      "tokens/train_per_sec_per_gpu": 3446.65,
+      "tokens/trainable": 11994679
+    },
+    {
+      "epoch": 2.7420382165605095,
+      "grad_norm": 0.15625,
+      "learning_rate": 2.519454110606304e-05,
+      "loss": 0.004983518272638321,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.005,
+      "step": 861,
+      "tokens/total": 112738304,
+      "tokens/train_per_sec_per_gpu": 3673.71,
+      "tokens/trainable": 12009986
+    },
+    {
+      "epoch": 2.745222929936306,
+      "grad_norm": 0.146484375,
+      "learning_rate": 2.5138958619814275e-05,
+      "loss": 0.004905369598418474,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00492,
+      "step": 862,
+      "tokens/total": 112869376,
+      "tokens/train_per_sec_per_gpu": 3063.86,
+      "tokens/trainable": 12022816
+    },
+    {
+      "epoch": 2.7484076433121016,
+      "grad_norm": 0.16796875,
+      "learning_rate": 2.5083375446654083e-05,
+      "loss": 0.006565258372575045,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00659,
+      "step": 863,
+      "tokens/total": 113000448,
+      "tokens/train_per_sec_per_gpu": 3636.75,
+      "tokens/trainable": 12037957
+    },
+    {
+      "epoch": 2.7515923566878984,
+      "grad_norm": 0.142578125,
+      "learning_rate": 2.502779186134568e-05,
+      "loss": 0.004305466078221798,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00431,
+      "step": 864,
+      "tokens/total": 113131520,
+      "tokens/train_per_sec_per_gpu": 3370.27,
+      "tokens/trainable": 12052017
+    },
+    {
+      "epoch": 2.754777070063694,
+      "grad_norm": 0.130859375,
+      "learning_rate": 2.497220813865432e-05,
+      "loss": 0.0037764415610581636,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00378,
+      "step": 865,
+      "tokens/total": 113262592,
+      "tokens/train_per_sec_per_gpu": 3212.97,
+      "tokens/trainable": 12065431
+    },
+    {
+      "epoch": 2.7579617834394905,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 2.491662455334592e-05,
+      "loss": 0.005136569030582905,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00515,
+      "step": 866,
+      "tokens/total": 113393664,
+      "tokens/train_per_sec_per_gpu": 3273.22,
+      "tokens/trainable": 12079122
+    },
+    {
+      "epoch": 2.761146496815287,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 2.4861041380185738e-05,
+      "loss": 0.003261574311181903,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00327,
+      "step": 867,
+      "tokens/total": 113524736,
+      "tokens/train_per_sec_per_gpu": 3109.77,
+      "tokens/trainable": 12092147
+    },
+    {
+      "epoch": 2.7643312101910826,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 2.4805458893936963e-05,
+      "loss": 0.0064933402463793755,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00651,
+      "step": 868,
+      "tokens/total": 113655808,
+      "tokens/train_per_sec_per_gpu": 3269.96,
+      "tokens/trainable": 12105826
+    },
+    {
+      "epoch": 2.767515923566879,
+      "grad_norm": 0.140625,
+      "learning_rate": 2.474987736935942e-05,
+      "loss": 0.004877043422311544,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00489,
+      "step": 869,
+      "tokens/total": 113786880,
+      "tokens/train_per_sec_per_gpu": 3459.54,
+      "tokens/trainable": 12120248
+    },
+    {
+      "epoch": 2.770700636942675,
+      "grad_norm": 0.15625,
+      "learning_rate": 2.469429708120817e-05,
+      "loss": 0.004386639688163996,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0044,
+      "step": 870,
+      "tokens/total": 113917952,
+      "tokens/train_per_sec_per_gpu": 3176.3,
+      "tokens/trainable": 12133552
+    },
+    {
+      "epoch": 2.7738853503184715,
+      "grad_norm": 0.166015625,
+      "learning_rate": 2.463871830423215e-05,
+      "loss": 0.00508409459143877,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0051,
+      "step": 871,
+      "tokens/total": 114049024,
+      "tokens/train_per_sec_per_gpu": 3403.46,
+      "tokens/trainable": 12147799
+    },
+    {
+      "epoch": 2.777070063694268,
+      "grad_norm": 0.17578125,
+      "learning_rate": 2.4583141313172842e-05,
+      "loss": 0.003352643456310034,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00336,
+      "step": 872,
+      "tokens/total": 114180096,
+      "tokens/train_per_sec_per_gpu": 3192.03,
+      "tokens/trainable": 12161167
+    },
+    {
+      "epoch": 2.7802547770700636,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 2.4527566382762902e-05,
+      "loss": 0.005316773895174265,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00533,
+      "step": 873,
+      "tokens/total": 114311168,
+      "tokens/train_per_sec_per_gpu": 3192.46,
+      "tokens/trainable": 12174546
+    },
+    {
+      "epoch": 2.78343949044586,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 2.4471993787724777e-05,
+      "loss": 0.00329143856652081,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0033,
+      "step": 874,
+      "tokens/total": 114442240,
+      "tokens/train_per_sec_per_gpu": 3208.11,
+      "tokens/trainable": 12187974
+    },
+    {
+      "epoch": 2.786624203821656,
+      "grad_norm": 0.162109375,
+      "learning_rate": 2.4416423802769403e-05,
+      "loss": 0.0036203130148351192,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00363,
+      "step": 875,
+      "tokens/total": 114573312,
+      "tokens/train_per_sec_per_gpu": 2915.58,
+      "tokens/trainable": 12200203
+    },
+    {
+      "epoch": 2.789808917197452,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 2.436085670259479e-05,
+      "loss": 0.003102727932855487,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00311,
+      "step": 876,
+      "tokens/total": 114704384,
+      "tokens/train_per_sec_per_gpu": 3023.1,
+      "tokens/trainable": 12212847
+    },
+    {
+      "epoch": 2.7929936305732483,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 2.4305292761884676e-05,
+      "loss": 0.005169394891709089,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00518,
+      "step": 877,
+      "tokens/total": 114835456,
+      "tokens/train_per_sec_per_gpu": 3140.34,
+      "tokens/trainable": 12226003
+    },
+    {
+      "epoch": 2.7961783439490446,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 2.4249732255307216e-05,
+      "loss": 0.004676941316574812,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00469,
+      "step": 878,
+      "tokens/total": 114966528,
+      "tokens/train_per_sec_per_gpu": 2960.89,
+      "tokens/trainable": 12238405
+    },
+    {
+      "epoch": 2.799363057324841,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 2.4194175457513575e-05,
+      "loss": 0.005923910532146692,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00594,
+      "step": 879,
+      "tokens/total": 115097600,
+      "tokens/train_per_sec_per_gpu": 3330.91,
+      "tokens/trainable": 12252333
+    },
+    {
+      "epoch": 2.802547770700637,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.4138622643136562e-05,
+      "loss": 0.004777503665536642,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00479,
+      "step": 880,
+      "tokens/total": 115228672,
+      "tokens/train_per_sec_per_gpu": 3471.55,
+      "tokens/trainable": 12266874
+    },
+    {
+      "epoch": 2.805732484076433,
+      "grad_norm": 0.15234375,
+      "learning_rate": 2.4083074086789332e-05,
+      "loss": 0.004388585686683655,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0044,
+      "step": 881,
+      "tokens/total": 115359744,
+      "tokens/train_per_sec_per_gpu": 3109.72,
+      "tokens/trainable": 12279904
+    },
+    {
+      "epoch": 2.8089171974522293,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 2.4027530063063966e-05,
+      "loss": 0.00651566544547677,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00654,
+      "step": 882,
+      "tokens/total": 115490816,
+      "tokens/train_per_sec_per_gpu": 3355.68,
+      "tokens/trainable": 12293954
+    },
+    {
+      "epoch": 2.8121019108280256,
+      "grad_norm": 0.14453125,
+      "learning_rate": 2.3971990846530134e-05,
+      "loss": 0.0046853781677782536,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0047,
+      "step": 883,
+      "tokens/total": 115621888,
+      "tokens/train_per_sec_per_gpu": 3459.29,
+      "tokens/trainable": 12308453
+    },
+    {
+      "epoch": 2.8152866242038215,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.3916456711733776e-05,
+      "loss": 0.004514369182288647,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00452,
+      "step": 884,
+      "tokens/total": 115752960,
+      "tokens/train_per_sec_per_gpu": 3622.31,
+      "tokens/trainable": 12323539
+    },
+    {
+      "epoch": 2.8184713375796178,
+      "grad_norm": 0.130859375,
+      "learning_rate": 2.386092793319568e-05,
+      "loss": 0.004971818067133427,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00498,
+      "step": 885,
+      "tokens/total": 115884032,
+      "tokens/train_per_sec_per_gpu": 3500.64,
+      "tokens/trainable": 12338133
+    },
+    {
+      "epoch": 2.821656050955414,
+      "grad_norm": 0.150390625,
+      "learning_rate": 2.3805404785410157e-05,
+      "loss": 0.004273276310414076,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00428,
+      "step": 886,
+      "tokens/total": 116015104,
+      "tokens/train_per_sec_per_gpu": 3731.5,
+      "tokens/trainable": 12353671
+    },
+    {
+      "epoch": 2.8248407643312103,
+      "grad_norm": 0.130859375,
+      "learning_rate": 2.374988754284371e-05,
+      "loss": 0.0031330641359090805,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00314,
+      "step": 887,
+      "tokens/total": 116146176,
+      "tokens/train_per_sec_per_gpu": 3214.83,
+      "tokens/trainable": 12367172
+    },
+    {
+      "epoch": 2.8280254777070066,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 2.369437647993363e-05,
+      "loss": 0.007122378330677748,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00715,
+      "step": 888,
+      "tokens/total": 116277248,
+      "tokens/train_per_sec_per_gpu": 3830.69,
+      "tokens/trainable": 12383092
+    },
+    {
+      "epoch": 2.8312101910828025,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 2.3638871871086652e-05,
+      "loss": 0.003396370681002736,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0034,
+      "step": 889,
+      "tokens/total": 116408320,
+      "tokens/train_per_sec_per_gpu": 3247.86,
+      "tokens/trainable": 12396628
+    },
+    {
+      "epoch": 2.8343949044585988,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 2.358337399067763e-05,
+      "loss": 0.00505115557461977,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00506,
+      "step": 890,
+      "tokens/total": 116539392,
+      "tokens/train_per_sec_per_gpu": 3332.28,
+      "tokens/trainable": 12410665
+    },
+    {
+      "epoch": 2.837579617834395,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 2.3527883113048154e-05,
+      "loss": 0.0035984639544039965,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0036,
+      "step": 891,
+      "tokens/total": 116670464,
+      "tokens/train_per_sec_per_gpu": 3242.03,
+      "tokens/trainable": 12424229
+    },
+    {
+      "epoch": 2.840764331210191,
+      "grad_norm": 0.2109375,
+      "learning_rate": 2.3472399512505165e-05,
+      "loss": 0.007709989324212074,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00774,
+      "step": 892,
+      "tokens/total": 116801536,
+      "tokens/train_per_sec_per_gpu": 3025.59,
+      "tokens/trainable": 12436996
+    },
+    {
+      "epoch": 2.843949044585987,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 2.3416923463319686e-05,
+      "loss": 0.00600704038515687,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00603,
+      "step": 893,
+      "tokens/total": 116932608,
+      "tokens/train_per_sec_per_gpu": 3495.42,
+      "tokens/trainable": 12451599
+    },
+    {
+      "epoch": 2.8471337579617835,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 2.3361455239725364e-05,
+      "loss": 0.0037581382784992456,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00377,
+      "step": 894,
+      "tokens/total": 117063680,
+      "tokens/train_per_sec_per_gpu": 3183.5,
+      "tokens/trainable": 12464960
+    },
+    {
+      "epoch": 2.8503184713375798,
+      "grad_norm": 0.177734375,
+      "learning_rate": 2.3305995115917177e-05,
+      "loss": 0.004449051804840565,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00446,
+      "step": 895,
+      "tokens/total": 117194752,
+      "tokens/train_per_sec_per_gpu": 3342.72,
+      "tokens/trainable": 12478964
+    },
+    {
+      "epoch": 2.853503184713376,
+      "grad_norm": 0.12890625,
+      "learning_rate": 2.3250543366050074e-05,
+      "loss": 0.004355857148766518,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00437,
+      "step": 896,
+      "tokens/total": 117325824,
+      "tokens/train_per_sec_per_gpu": 3500.36,
+      "tokens/trainable": 12493585
+    },
+    {
+      "epoch": 2.856687898089172,
+      "grad_norm": 0.138671875,
+      "learning_rate": 2.3195100264237607e-05,
+      "loss": 0.004324641078710556,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00433,
+      "step": 897,
+      "tokens/total": 117456896,
+      "tokens/train_per_sec_per_gpu": 3278.5,
+      "tokens/trainable": 12507318
+    },
+    {
+      "epoch": 2.859872611464968,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 2.3139666084550553e-05,
+      "loss": 0.005408423021435738,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00542,
+      "step": 898,
+      "tokens/total": 117587968,
+      "tokens/train_per_sec_per_gpu": 3103.8,
+      "tokens/trainable": 12520317
+    },
+    {
+      "epoch": 2.8630573248407645,
+      "grad_norm": 0.1796875,
+      "learning_rate": 2.308424110101562e-05,
+      "loss": 0.005885708145797253,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0059,
+      "step": 899,
+      "tokens/total": 117719040,
+      "tokens/train_per_sec_per_gpu": 3937.35,
+      "tokens/trainable": 12536633
+    },
+    {
+      "epoch": 2.8662420382165603,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 2.3028825587614044e-05,
+      "loss": 0.0059039052575826645,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00592,
+      "step": 900,
+      "tokens/total": 117850112,
+      "tokens/train_per_sec_per_gpu": 3269.26,
+      "tokens/trainable": 12550322
+    },
+    {
+      "epoch": 2.8694267515923566,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 2.2973419818280225e-05,
+      "loss": 0.004266998264938593,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00428,
+      "step": 901,
+      "tokens/total": 117981184,
+      "tokens/train_per_sec_per_gpu": 2909.52,
+      "tokens/trainable": 12562584
+    },
+    {
+      "epoch": 2.872611464968153,
+      "grad_norm": 0.19140625,
+      "learning_rate": 2.2918024066900433e-05,
+      "loss": 0.005715237930417061,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00573,
+      "step": 902,
+      "tokens/total": 118112256,
+      "tokens/train_per_sec_per_gpu": 3359.95,
+      "tokens/trainable": 12576629
+    },
+    {
+      "epoch": 2.875796178343949,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 2.28626386073114e-05,
+      "loss": 0.0025465991348028183,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00255,
+      "step": 903,
+      "tokens/total": 118243328,
+      "tokens/train_per_sec_per_gpu": 3119.57,
+      "tokens/trainable": 12589699
+    },
+    {
+      "epoch": 2.8789808917197455,
+      "grad_norm": 0.1328125,
+      "learning_rate": 2.2807263713298957e-05,
+      "loss": 0.003974359482526779,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00398,
+      "step": 904,
+      "tokens/total": 118374400,
+      "tokens/train_per_sec_per_gpu": 3276.02,
+      "tokens/trainable": 12603410
+    },
+    {
+      "epoch": 2.8821656050955413,
+      "grad_norm": 0.1328125,
+      "learning_rate": 2.2751899658596755e-05,
+      "loss": 0.004021751694381237,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00403,
+      "step": 905,
+      "tokens/total": 118505472,
+      "tokens/train_per_sec_per_gpu": 3674.77,
+      "tokens/trainable": 12618803
+    },
+    {
+      "epoch": 2.8853503184713376,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 2.2696546716884835e-05,
+      "loss": 0.003338857088238001,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00334,
+      "step": 906,
+      "tokens/total": 118636544,
+      "tokens/train_per_sec_per_gpu": 2909.02,
+      "tokens/trainable": 12631025
+    },
+    {
+      "epoch": 2.888535031847134,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 2.2641205161788287e-05,
+      "loss": 0.0033922025468200445,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0034,
+      "step": 907,
+      "tokens/total": 118767616,
+      "tokens/train_per_sec_per_gpu": 3269.87,
+      "tokens/trainable": 12644738
+    },
+    {
+      "epoch": 2.8917197452229297,
+      "grad_norm": 0.1484375,
+      "learning_rate": 2.2585875266875956e-05,
+      "loss": 0.005157338920980692,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00517,
+      "step": 908,
+      "tokens/total": 118898688,
+      "tokens/train_per_sec_per_gpu": 3300.06,
+      "tokens/trainable": 12658494
+    },
+    {
+      "epoch": 2.894904458598726,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 2.253055730565902e-05,
+      "loss": 0.006811058614403009,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00683,
+      "step": 909,
+      "tokens/total": 119029760,
+      "tokens/train_per_sec_per_gpu": 3546.15,
+      "tokens/trainable": 12673250
+    },
+    {
+      "epoch": 2.8980891719745223,
+      "grad_norm": 0.14453125,
+      "learning_rate": 2.2475251551589662e-05,
+      "loss": 0.003177374368533492,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00318,
+      "step": 910,
+      "tokens/total": 119160832,
+      "tokens/train_per_sec_per_gpu": 3016.5,
+      "tokens/trainable": 12685906
+    },
+    {
+      "epoch": 2.9012738853503186,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 2.241995827805974e-05,
+      "loss": 0.005059496965259314,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00507,
+      "step": 911,
+      "tokens/total": 119291904,
+      "tokens/train_per_sec_per_gpu": 3674.19,
+      "tokens/trainable": 12701221
+    },
+    {
+      "epoch": 2.904458598726115,
+      "grad_norm": 0.126953125,
+      "learning_rate": 2.2364677758399406e-05,
+      "loss": 0.0032712582033127546,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00328,
+      "step": 912,
+      "tokens/total": 119422976,
+      "tokens/train_per_sec_per_gpu": 3246.71,
+      "tokens/trainable": 12714734
+    },
+    {
+      "epoch": 2.9076433121019107,
+      "grad_norm": 0.212890625,
+      "learning_rate": 2.230941026587576e-05,
+      "loss": 0.007138803135603666,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00716,
+      "step": 913,
+      "tokens/total": 119554048,
+      "tokens/train_per_sec_per_gpu": 3265.13,
+      "tokens/trainable": 12728429
+    },
+    {
+      "epoch": 2.910828025477707,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 2.2254156073691518e-05,
+      "loss": 0.00541570782661438,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00543,
+      "step": 914,
+      "tokens/total": 119685120,
+      "tokens/train_per_sec_per_gpu": 3389.26,
+      "tokens/trainable": 12742539
+    },
+    {
+      "epoch": 2.9140127388535033,
+      "grad_norm": 0.1640625,
+      "learning_rate": 2.219891545498365e-05,
+      "loss": 0.0042840586975216866,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00429,
+      "step": 915,
+      "tokens/total": 119816192,
+      "tokens/train_per_sec_per_gpu": 3318.34,
+      "tokens/trainable": 12756353
+    },
+    {
+      "epoch": 2.917197452229299,
+      "grad_norm": 0.17578125,
+      "learning_rate": 2.2143688682822e-05,
+      "loss": 0.005752744618803263,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00577,
+      "step": 916,
+      "tokens/total": 119947264,
+      "tokens/train_per_sec_per_gpu": 3560.06,
+      "tokens/trainable": 12771211
+    },
+    {
+      "epoch": 2.9203821656050954,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 2.2088476030208012e-05,
+      "loss": 0.003762285690754652,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00377,
+      "step": 917,
+      "tokens/total": 120078336,
+      "tokens/train_per_sec_per_gpu": 2930.2,
+      "tokens/trainable": 12783504
+    },
+    {
+      "epoch": 2.9235668789808917,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 2.2033277770073297e-05,
+      "loss": 0.0025295563973486423,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00253,
+      "step": 918,
+      "tokens/total": 120209408,
+      "tokens/train_per_sec_per_gpu": 3098.34,
+      "tokens/trainable": 12796457
+    },
+    {
+      "epoch": 2.926751592356688,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 2.1978094175278323e-05,
+      "loss": 0.004149306565523148,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00416,
+      "step": 919,
+      "tokens/total": 120340480,
+      "tokens/train_per_sec_per_gpu": 3238.11,
+      "tokens/trainable": 12810005
+    },
+    {
+      "epoch": 2.9299363057324843,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 2.192292551861108e-05,
+      "loss": 0.006155917886644602,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00617,
+      "step": 920,
+      "tokens/total": 120471552,
+      "tokens/train_per_sec_per_gpu": 3351.96,
+      "tokens/trainable": 12824046
+    },
+    {
+      "epoch": 2.93312101910828,
+      "grad_norm": 0.140625,
+      "learning_rate": 2.1867772072785708e-05,
+      "loss": 0.005103899631649256,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00512,
+      "step": 921,
+      "tokens/total": 120602624,
+      "tokens/train_per_sec_per_gpu": 3263.43,
+      "tokens/trainable": 12837714
+    },
+    {
+      "epoch": 2.9363057324840764,
+      "grad_norm": 0.171875,
+      "learning_rate": 2.181263411044114e-05,
+      "loss": 0.004437371157109737,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00445,
+      "step": 922,
+      "tokens/total": 120733696,
+      "tokens/train_per_sec_per_gpu": 3276.05,
+      "tokens/trainable": 12851431
+    },
+    {
+      "epoch": 2.9394904458598727,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 2.1757511904139793e-05,
+      "loss": 0.005264171864837408,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00528,
+      "step": 923,
+      "tokens/total": 120864768,
+      "tokens/train_per_sec_per_gpu": 3525.14,
+      "tokens/trainable": 12866186
+    },
+    {
+      "epoch": 2.9426751592356686,
+      "grad_norm": 0.16796875,
+      "learning_rate": 2.1702405726366193e-05,
+      "loss": 0.0048398361541330814,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00485,
+      "step": 924,
+      "tokens/total": 120995840,
+      "tokens/train_per_sec_per_gpu": 3474.68,
+      "tokens/trainable": 12880741
+    },
+    {
+      "epoch": 2.945859872611465,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 2.1647315849525606e-05,
+      "loss": 0.0037978398613631725,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00381,
+      "step": 925,
+      "tokens/total": 121126912,
+      "tokens/train_per_sec_per_gpu": 3132.29,
+      "tokens/trainable": 12893946
+    },
+    {
+      "epoch": 2.949044585987261,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 2.1592242545942755e-05,
+      "loss": 0.005401961971074343,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00542,
+      "step": 926,
+      "tokens/total": 121257984,
+      "tokens/train_per_sec_per_gpu": 3184.76,
+      "tokens/trainable": 12907278
+    },
+    {
+      "epoch": 2.9522292993630574,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 2.1537186087860423e-05,
+      "loss": 0.005091848783195019,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0051,
+      "step": 927,
+      "tokens/total": 121389056,
+      "tokens/train_per_sec_per_gpu": 3525.92,
+      "tokens/trainable": 12921953
+    },
+    {
+      "epoch": 2.9554140127388537,
+      "grad_norm": 0.162109375,
+      "learning_rate": 2.14821467474381e-05,
+      "loss": 0.005307201761752367,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00532,
+      "step": 928,
+      "tokens/total": 121520128,
+      "tokens/train_per_sec_per_gpu": 3465.19,
+      "tokens/trainable": 12936423
+    },
+    {
+      "epoch": 2.9585987261146496,
+      "grad_norm": 0.12109375,
+      "learning_rate": 2.1427124796750696e-05,
+      "loss": 0.002976613584905863,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00298,
+      "step": 929,
+      "tokens/total": 121651200,
+      "tokens/train_per_sec_per_gpu": 3415.41,
+      "tokens/trainable": 12950697
+    },
+    {
+      "epoch": 2.961783439490446,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 2.1372120507787134e-05,
+      "loss": 0.004961484577506781,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00497,
+      "step": 930,
+      "tokens/total": 121782272,
+      "tokens/train_per_sec_per_gpu": 3237.03,
+      "tokens/trainable": 12964260
+    },
+    {
+      "epoch": 2.964968152866242,
+      "grad_norm": 0.193359375,
+      "learning_rate": 2.131713415244902e-05,
+      "loss": 0.0067651160061359406,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00679,
+      "step": 931,
+      "tokens/total": 121913344,
+      "tokens/train_per_sec_per_gpu": 3323.81,
+      "tokens/trainable": 12978164
+    },
+    {
+      "epoch": 2.968152866242038,
+      "grad_norm": 0.166015625,
+      "learning_rate": 2.1262166002549344e-05,
+      "loss": 0.005593163892626762,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00561,
+      "step": 932,
+      "tokens/total": 122044416,
+      "tokens/train_per_sec_per_gpu": 3178.79,
+      "tokens/trainable": 12991495
+    },
+    {
+      "epoch": 2.9713375796178343,
+      "grad_norm": 0.177734375,
+      "learning_rate": 2.1207216329811082e-05,
+      "loss": 0.0055503519251942635,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00557,
+      "step": 933,
+      "tokens/total": 122175488,
+      "tokens/train_per_sec_per_gpu": 2983.66,
+      "tokens/trainable": 13003996
+    },
+    {
+      "epoch": 2.9745222929936306,
+      "grad_norm": 0.162109375,
+      "learning_rate": 2.115228540586586e-05,
+      "loss": 0.004628556780517101,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00464,
+      "step": 934,
+      "tokens/total": 122306560,
+      "tokens/train_per_sec_per_gpu": 3348.38,
+      "tokens/trainable": 13017998
+    },
+    {
+      "epoch": 2.977707006369427,
+      "grad_norm": 0.146484375,
+      "learning_rate": 2.109737350225264e-05,
+      "loss": 0.0036150780506432056,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00362,
+      "step": 935,
+      "tokens/total": 122437632,
+      "tokens/train_per_sec_per_gpu": 3386.9,
+      "tokens/trainable": 13032100
+    },
+    {
+      "epoch": 2.980891719745223,
+      "grad_norm": 0.15234375,
+      "learning_rate": 2.1042480890416368e-05,
+      "loss": 0.004233770538121462,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00424,
+      "step": 936,
+      "tokens/total": 122568704,
+      "tokens/train_per_sec_per_gpu": 3171.53,
+      "tokens/trainable": 13045341
+    },
+    {
+      "epoch": 2.984076433121019,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 2.0987607841706595e-05,
+      "loss": 0.004372127819806337,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00438,
+      "step": 937,
+      "tokens/total": 122699776,
+      "tokens/train_per_sec_per_gpu": 3077.26,
+      "tokens/trainable": 13058291
+    },
+    {
+      "epoch": 2.9872611464968153,
+      "grad_norm": 0.154296875,
+      "learning_rate": 2.09327546273762e-05,
+      "loss": 0.005242812447249889,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00526,
+      "step": 938,
+      "tokens/total": 122830848,
+      "tokens/train_per_sec_per_gpu": 3362.79,
+      "tokens/trainable": 13072317
+    },
+    {
+      "epoch": 2.9904458598726116,
+      "grad_norm": 0.150390625,
+      "learning_rate": 2.087792151858e-05,
+      "loss": 0.0044011822901666164,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00441,
+      "step": 939,
+      "tokens/total": 122961920,
+      "tokens/train_per_sec_per_gpu": 3314.79,
+      "tokens/trainable": 13086158
+    },
+    {
+      "epoch": 2.9936305732484074,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 2.0823108786373414e-05,
+      "loss": 0.004296471830457449,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00431,
+      "step": 940,
+      "tokens/total": 123092992,
+      "tokens/train_per_sec_per_gpu": 3532.4,
+      "tokens/trainable": 13100899
+    },
+    {
+      "epoch": 2.9968152866242037,
+      "grad_norm": 0.134765625,
+      "learning_rate": 2.0768316701711153e-05,
+      "loss": 0.0038203117437660694,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00383,
+      "step": 941,
+      "tokens/total": 123224064,
+      "tokens/train_per_sec_per_gpu": 3339.11,
+      "tokens/trainable": 13115218
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 2.0713545535445857e-05,
+      "loss": 0.005111368373036385,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 39.25,
+      "memory/max_allocated (GiB)": 39.25,
+      "ppl": 1.00512,
+      "step": 942,
+      "tokens/total": 123297792,
+      "tokens/train_per_sec_per_gpu": 3851.53,
+      "tokens/trainable": 13124028
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.008717856369912624,
+      "eval_ppl": 1.00876,
+      "eval_runtime": 41.6707,
+      "eval_samples_per_second": 64.818,
+      "eval_steps_per_second": 4.056,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 942
+    },
+    {
+      "epoch": 3.0031847133757963,
+      "grad_norm": 0.111328125,
+      "learning_rate": 2.0658795558326743e-05,
+      "loss": 0.0027498805429786444,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00275,
+      "step": 943,
+      "tokens/total": 123428864,
+      "tokens/train_per_sec_per_gpu": 3250.23,
+      "tokens/trainable": 13137492
+    },
+    {
+      "epoch": 3.0063694267515926,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 2.0604067040998314e-05,
+      "loss": 0.002591141266748309,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00259,
+      "step": 944,
+      "tokens/total": 123559936,
+      "tokens/train_per_sec_per_gpu": 3658.64,
+      "tokens/trainable": 13152727
+    },
+    {
+      "epoch": 3.0095541401273884,
+      "grad_norm": 0.11328125,
+      "learning_rate": 2.054936025399897e-05,
+      "loss": 0.0033186483196914196,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00332,
+      "step": 945,
+      "tokens/total": 123691008,
+      "tokens/train_per_sec_per_gpu": 3830.57,
+      "tokens/trainable": 13168699
+    },
+    {
+      "epoch": 3.0127388535031847,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 2.049467546775968e-05,
+      "loss": 0.0039662388153374195,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00397,
+      "step": 946,
+      "tokens/total": 123822080,
+      "tokens/train_per_sec_per_gpu": 3532.79,
+      "tokens/trainable": 13183492
+    },
+    {
+      "epoch": 3.015923566878981,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 2.0440012952602706e-05,
+      "loss": 0.003088605822995305,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00309,
+      "step": 947,
+      "tokens/total": 123953152,
+      "tokens/train_per_sec_per_gpu": 3257.92,
+      "tokens/trainable": 13197146
+    },
+    {
+      "epoch": 3.0191082802547773,
+      "grad_norm": 0.12890625,
+      "learning_rate": 2.0385372978740167e-05,
+      "loss": 0.0031338452827185392,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00314,
+      "step": 948,
+      "tokens/total": 124084224,
+      "tokens/train_per_sec_per_gpu": 3231.23,
+      "tokens/trainable": 13210673
+    },
+    {
+      "epoch": 3.022292993630573,
+      "grad_norm": 0.123046875,
+      "learning_rate": 2.033075581627276e-05,
+      "loss": 0.0032858422491699457,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00329,
+      "step": 949,
+      "tokens/total": 124215296,
+      "tokens/train_per_sec_per_gpu": 3298.75,
+      "tokens/trainable": 13224347
+    },
+    {
+      "epoch": 3.0254777070063694,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 2.0276161735188458e-05,
+      "loss": 0.0026432094164192677,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00265,
+      "step": 950,
+      "tokens/total": 124346368,
+      "tokens/train_per_sec_per_gpu": 3518.56,
+      "tokens/trainable": 13238926
+    },
+    {
+      "epoch": 3.0286624203821657,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 2.0221591005361104e-05,
+      "loss": 0.0035607037134468555,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00357,
+      "step": 951,
+      "tokens/total": 124477440,
+      "tokens/train_per_sec_per_gpu": 3364.92,
+      "tokens/trainable": 13252966
+    },
+    {
+      "epoch": 3.031847133757962,
+      "grad_norm": 0.140625,
+      "learning_rate": 2.0167043896549097e-05,
+      "loss": 0.004281069617718458,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00429,
+      "step": 952,
+      "tokens/total": 124608512,
+      "tokens/train_per_sec_per_gpu": 3140.17,
+      "tokens/trainable": 13266012
+    },
+    {
+      "epoch": 3.035031847133758,
+      "grad_norm": 0.140625,
+      "learning_rate": 2.0112520678394107e-05,
+      "loss": 0.003244205377995968,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00325,
+      "step": 953,
+      "tokens/total": 124739584,
+      "tokens/train_per_sec_per_gpu": 3319.22,
+      "tokens/trainable": 13279830
+    },
+    {
+      "epoch": 3.038216560509554,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 2.005802162041969e-05,
+      "loss": 0.0033878230024129152,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00339,
+      "step": 954,
+      "tokens/total": 124870656,
+      "tokens/train_per_sec_per_gpu": 3384.16,
+      "tokens/trainable": 13293926
+    },
+    {
+      "epoch": 3.0414012738853504,
+      "grad_norm": 0.134765625,
+      "learning_rate": 2.0003546992029953e-05,
+      "loss": 0.002641953295096755,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00265,
+      "step": 955,
+      "tokens/total": 125001728,
+      "tokens/train_per_sec_per_gpu": 2720.72,
+      "tokens/trainable": 13305413
+    },
+    {
+      "epoch": 3.0445859872611467,
+      "grad_norm": 0.138671875,
+      "learning_rate": 1.9949097062508267e-05,
+      "loss": 0.003417475149035454,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00342,
+      "step": 956,
+      "tokens/total": 125132800,
+      "tokens/train_per_sec_per_gpu": 3595.19,
+      "tokens/trainable": 13320381
+    },
+    {
+      "epoch": 3.0477707006369426,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 1.9894672101015904e-05,
+      "loss": 0.002634722040966153,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00264,
+      "step": 957,
+      "tokens/total": 125263872,
+      "tokens/train_per_sec_per_gpu": 3285.4,
+      "tokens/trainable": 13334096
+    },
+    {
+      "epoch": 3.050955414012739,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 1.9840272376590693e-05,
+      "loss": 0.0045495470985770226,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00456,
+      "step": 958,
+      "tokens/total": 125394944,
+      "tokens/train_per_sec_per_gpu": 3160.53,
+      "tokens/trainable": 13347392
+    },
+    {
+      "epoch": 3.054140127388535,
+      "grad_norm": 0.126953125,
+      "learning_rate": 1.9785898158145738e-05,
+      "loss": 0.0035640057176351547,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00357,
+      "step": 959,
+      "tokens/total": 125526016,
+      "tokens/train_per_sec_per_gpu": 3402.01,
+      "tokens/trainable": 13361641
+    },
+    {
+      "epoch": 3.0573248407643314,
+      "grad_norm": 0.12890625,
+      "learning_rate": 1.9731549714468045e-05,
+      "loss": 0.003452250501140952,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00346,
+      "step": 960,
+      "tokens/total": 125657088,
+      "tokens/train_per_sec_per_gpu": 3116.14,
+      "tokens/trainable": 13374682
+    },
+    {
+      "epoch": 3.0605095541401273,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 1.9677227314217188e-05,
+      "loss": 0.0024322110693901777,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00244,
+      "step": 961,
+      "tokens/total": 125788160,
+      "tokens/train_per_sec_per_gpu": 2974.51,
+      "tokens/trainable": 13387164
+    },
+    {
+      "epoch": 3.0636942675159236,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 1.962293122592405e-05,
+      "loss": 0.00328466366045177,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00329,
+      "step": 962,
+      "tokens/total": 125919232,
+      "tokens/train_per_sec_per_gpu": 3223.72,
+      "tokens/trainable": 13400626
+    },
+    {
+      "epoch": 3.06687898089172,
+      "grad_norm": 0.1171875,
+      "learning_rate": 1.9568661717989407e-05,
+      "loss": 0.0021802615374326706,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00218,
+      "step": 963,
+      "tokens/total": 126050304,
+      "tokens/train_per_sec_per_gpu": 3537.27,
+      "tokens/trainable": 13415382
+    },
+    {
+      "epoch": 3.070063694267516,
+      "grad_norm": 0.150390625,
+      "learning_rate": 1.951441905868264e-05,
+      "loss": 0.003219526493921876,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00322,
+      "step": 964,
+      "tokens/total": 126181376,
+      "tokens/train_per_sec_per_gpu": 3173.36,
+      "tokens/trainable": 13428745
+    },
+    {
+      "epoch": 3.073248407643312,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.9460203516140433e-05,
+      "loss": 0.0025150931905955076,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00252,
+      "step": 965,
+      "tokens/total": 126312448,
+      "tokens/train_per_sec_per_gpu": 3357.46,
+      "tokens/trainable": 13442783
+    },
+    {
+      "epoch": 3.0764331210191083,
+      "grad_norm": 0.13671875,
+      "learning_rate": 1.940601535836542e-05,
+      "loss": 0.002752315253019333,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00276,
+      "step": 966,
+      "tokens/total": 126443520,
+      "tokens/train_per_sec_per_gpu": 3434.62,
+      "tokens/trainable": 13457099
+    },
+    {
+      "epoch": 3.0796178343949046,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 1.9351854853224837e-05,
+      "loss": 0.002302248729392886,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0023,
+      "step": 967,
+      "tokens/total": 126574592,
+      "tokens/train_per_sec_per_gpu": 3069.39,
+      "tokens/trainable": 13470035
+    },
+    {
+      "epoch": 3.082802547770701,
+      "grad_norm": 0.138671875,
+      "learning_rate": 1.9297722268449264e-05,
+      "loss": 0.00326096941716969,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00327,
+      "step": 968,
+      "tokens/total": 126705664,
+      "tokens/train_per_sec_per_gpu": 3547.61,
+      "tokens/trainable": 13484891
+    },
+    {
+      "epoch": 3.0859872611464967,
+      "grad_norm": 0.14453125,
+      "learning_rate": 1.9243617871631245e-05,
+      "loss": 0.0029772731941193342,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00298,
+      "step": 969,
+      "tokens/total": 126836736,
+      "tokens/train_per_sec_per_gpu": 3593.08,
+      "tokens/trainable": 13499838
+    },
+    {
+      "epoch": 3.089171974522293,
+      "grad_norm": 0.12890625,
+      "learning_rate": 1.9189541930223965e-05,
+      "loss": 0.0024753999896347523,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00248,
+      "step": 970,
+      "tokens/total": 126967808,
+      "tokens/train_per_sec_per_gpu": 3311.76,
+      "tokens/trainable": 13513723
+    },
+    {
+      "epoch": 3.0923566878980893,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.9135494711539975e-05,
+      "loss": 0.003328888211399317,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00333,
+      "step": 971,
+      "tokens/total": 127098880,
+      "tokens/train_per_sec_per_gpu": 3188.59,
+      "tokens/trainable": 13527089
+    },
+    {
+      "epoch": 3.0955414012738856,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 1.9081476482749838e-05,
+      "loss": 0.0020992374047636986,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0021,
+      "step": 972,
+      "tokens/total": 127229952,
+      "tokens/train_per_sec_per_gpu": 3319.78,
+      "tokens/trainable": 13540974
+    },
+    {
+      "epoch": 3.0987261146496814,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 1.902748751088078e-05,
+      "loss": 0.0023126029409468174,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 973,
+      "tokens/total": 127361024,
+      "tokens/train_per_sec_per_gpu": 3187.82,
+      "tokens/trainable": 13554328
+    },
+    {
+      "epoch": 3.1019108280254777,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 1.8973528062815452e-05,
+      "loss": 0.001823435421101749,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00183,
+      "step": 974,
+      "tokens/total": 127492096,
+      "tokens/train_per_sec_per_gpu": 2959.79,
+      "tokens/trainable": 13566755
+    },
+    {
+      "epoch": 3.105095541401274,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.8919598405290522e-05,
+      "loss": 0.002975163981318474,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00298,
+      "step": 975,
+      "tokens/total": 127623168,
+      "tokens/train_per_sec_per_gpu": 3605.57,
+      "tokens/trainable": 13581801
+    },
+    {
+      "epoch": 3.1082802547770703,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.88656988048954e-05,
+      "loss": 0.0033629476092755795,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00337,
+      "step": 976,
+      "tokens/total": 127754240,
+      "tokens/train_per_sec_per_gpu": 3305.94,
+      "tokens/trainable": 13595673
+    },
+    {
+      "epoch": 3.111464968152866,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 1.8811829528070935e-05,
+      "loss": 0.0019825787749141455,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00198,
+      "step": 977,
+      "tokens/total": 127885312,
+      "tokens/train_per_sec_per_gpu": 3321.78,
+      "tokens/trainable": 13609562
+    },
+    {
+      "epoch": 3.1146496815286624,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.8757990841108065e-05,
+      "loss": 0.00240930519066751,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 978,
+      "tokens/total": 128016384,
+      "tokens/train_per_sec_per_gpu": 3254.37,
+      "tokens/trainable": 13623198
+    },
+    {
+      "epoch": 3.1178343949044587,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.87041830101465e-05,
+      "loss": 0.0034942845813930035,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0035,
+      "step": 979,
+      "tokens/total": 128147456,
+      "tokens/train_per_sec_per_gpu": 3162.68,
+      "tokens/trainable": 13636512
+    },
+    {
+      "epoch": 3.121019108280255,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.8650406301173447e-05,
+      "loss": 0.0034091034904122353,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00341,
+      "step": 980,
+      "tokens/total": 128278528,
+      "tokens/train_per_sec_per_gpu": 3456.01,
+      "tokens/trainable": 13650976
+    },
+    {
+      "epoch": 3.124203821656051,
+      "grad_norm": 0.15625,
+      "learning_rate": 1.8596660980022258e-05,
+      "loss": 0.0025934309232980013,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0026,
+      "step": 981,
+      "tokens/total": 128409600,
+      "tokens/train_per_sec_per_gpu": 3098.66,
+      "tokens/trainable": 13663952
+    },
+    {
+      "epoch": 3.127388535031847,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 1.8542947312371108e-05,
+      "loss": 0.0022293792571872473,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 982,
+      "tokens/total": 128540672,
+      "tokens/train_per_sec_per_gpu": 3308.14,
+      "tokens/trainable": 13677809
+    },
+    {
+      "epoch": 3.1305732484076434,
+      "grad_norm": 0.220703125,
+      "learning_rate": 1.8489265563741725e-05,
+      "loss": 0.0036684228107333183,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00368,
+      "step": 983,
+      "tokens/total": 128671744,
+      "tokens/train_per_sec_per_gpu": 2671.82,
+      "tokens/trainable": 13689205
+    },
+    {
+      "epoch": 3.1337579617834397,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.8435615999498045e-05,
+      "loss": 0.003023945726454258,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00303,
+      "step": 984,
+      "tokens/total": 128802816,
+      "tokens/train_per_sec_per_gpu": 3353.12,
+      "tokens/trainable": 13703248
+    },
+    {
+      "epoch": 3.1369426751592355,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 1.8381998884844914e-05,
+      "loss": 0.0030851985793560743,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00309,
+      "step": 985,
+      "tokens/total": 128933888,
+      "tokens/train_per_sec_per_gpu": 3504.98,
+      "tokens/trainable": 13717913
+    },
+    {
+      "epoch": 3.140127388535032,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 1.8328414484826745e-05,
+      "loss": 0.002863124944269657,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00287,
+      "step": 986,
+      "tokens/total": 129064960,
+      "tokens/train_per_sec_per_gpu": 3309.73,
+      "tokens/trainable": 13731778
+    },
+    {
+      "epoch": 3.143312101910828,
+      "grad_norm": 0.138671875,
+      "learning_rate": 1.8274863064326253e-05,
+      "loss": 0.0033043615985661745,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00331,
+      "step": 987,
+      "tokens/total": 129196032,
+      "tokens/train_per_sec_per_gpu": 3489.76,
+      "tokens/trainable": 13746393
+    },
+    {
+      "epoch": 3.1464968152866244,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 1.822134488806314e-05,
+      "loss": 0.003721470246091485,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00373,
+      "step": 988,
+      "tokens/total": 129327104,
+      "tokens/train_per_sec_per_gpu": 3260.94,
+      "tokens/trainable": 13760070
+    },
+    {
+      "epoch": 3.1496815286624202,
+      "grad_norm": 0.107421875,
+      "learning_rate": 1.8167860220592736e-05,
+      "loss": 0.002208119258284569,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00221,
+      "step": 989,
+      "tokens/total": 129458176,
+      "tokens/train_per_sec_per_gpu": 3466.56,
+      "tokens/trainable": 13774565
+    },
+    {
+      "epoch": 3.1528662420382165,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 1.8114409326304754e-05,
+      "loss": 0.0030963195022195578,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 990,
+      "tokens/total": 129589248,
+      "tokens/train_per_sec_per_gpu": 3297.94,
+      "tokens/trainable": 13788405
+    },
+    {
+      "epoch": 3.156050955414013,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.806099246942196e-05,
+      "loss": 0.0031601302325725555,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00317,
+      "step": 991,
+      "tokens/total": 129720320,
+      "tokens/train_per_sec_per_gpu": 3314.16,
+      "tokens/trainable": 13802332
+    },
+    {
+      "epoch": 3.159235668789809,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 1.800760991399884e-05,
+      "loss": 0.003068044548854232,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00307,
+      "step": 992,
+      "tokens/total": 129851392,
+      "tokens/train_per_sec_per_gpu": 3131.59,
+      "tokens/trainable": 13815450
+    },
+    {
+      "epoch": 3.162420382165605,
+      "grad_norm": 0.142578125,
+      "learning_rate": 1.7954261923920335e-05,
+      "loss": 0.003088792786002159,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00309,
+      "step": 993,
+      "tokens/total": 129982464,
+      "tokens/train_per_sec_per_gpu": 3446.24,
+      "tokens/trainable": 13829844
+    },
+    {
+      "epoch": 3.1656050955414012,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 1.7900948762900527e-05,
+      "loss": 0.002409819047898054,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 994,
+      "tokens/total": 130113536,
+      "tokens/train_per_sec_per_gpu": 3166.51,
+      "tokens/trainable": 13843168
+    },
+    {
+      "epoch": 3.1687898089171975,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 1.7847670694481307e-05,
+      "loss": 0.004092029761523008,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0041,
+      "step": 995,
+      "tokens/total": 130244608,
+      "tokens/train_per_sec_per_gpu": 3606.98,
+      "tokens/trainable": 13858161
+    },
+    {
+      "epoch": 3.171974522292994,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 1.7794427982031104e-05,
+      "loss": 0.001977186882868409,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00198,
+      "step": 996,
+      "tokens/total": 130375680,
+      "tokens/train_per_sec_per_gpu": 3153.56,
+      "tokens/trainable": 13871441
+    },
+    {
+      "epoch": 3.1751592356687897,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 1.7741220888743587e-05,
+      "loss": 0.0029397865291684866,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00294,
+      "step": 997,
+      "tokens/total": 130506752,
+      "tokens/train_per_sec_per_gpu": 3096.09,
+      "tokens/trainable": 13884512
+    },
+    {
+      "epoch": 3.178343949044586,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 1.768804967763632e-05,
+      "loss": 0.0025828841608017683,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00259,
+      "step": 998,
+      "tokens/total": 130637824,
+      "tokens/train_per_sec_per_gpu": 3237.57,
+      "tokens/trainable": 13898147
+    },
+    {
+      "epoch": 3.1815286624203822,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.763491461154951e-05,
+      "loss": 0.002550513716414571,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00255,
+      "step": 999,
+      "tokens/total": 130768896,
+      "tokens/train_per_sec_per_gpu": 3248.32,
+      "tokens/trainable": 13911818
+    },
+    {
+      "epoch": 3.1847133757961785,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 1.7581815953144694e-05,
+      "loss": 0.0023207683116197586,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 1000,
+      "tokens/total": 130899968,
+      "tokens/train_per_sec_per_gpu": 3189.52,
+      "tokens/trainable": 13925184
+    },
+    {
+      "epoch": 3.1878980891719744,
+      "grad_norm": 0.14453125,
+      "learning_rate": 1.7528753964903422e-05,
+      "loss": 0.0033754960168153048,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00338,
+      "step": 1001,
+      "tokens/total": 131031040,
+      "tokens/train_per_sec_per_gpu": 3425.19,
+      "tokens/trainable": 13939522
+    },
+    {
+      "epoch": 3.1910828025477707,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 1.7475728909125967e-05,
+      "loss": 0.0025386540219187737,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00254,
+      "step": 1002,
+      "tokens/total": 131162112,
+      "tokens/train_per_sec_per_gpu": 3600.39,
+      "tokens/trainable": 13954592
+    },
+    {
+      "epoch": 3.194267515923567,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 1.7422741047930075e-05,
+      "loss": 0.00221554609015584,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00222,
+      "step": 1003,
+      "tokens/total": 131293184,
+      "tokens/train_per_sec_per_gpu": 3073.34,
+      "tokens/trainable": 13967458
+    },
+    {
+      "epoch": 3.1974522292993632,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 1.7369790643249573e-05,
+      "loss": 0.0035816675517708063,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00359,
+      "step": 1004,
+      "tokens/total": 131424256,
+      "tokens/train_per_sec_per_gpu": 3426.39,
+      "tokens/trainable": 13981803
+    },
+    {
+      "epoch": 3.200636942675159,
+      "grad_norm": 0.15234375,
+      "learning_rate": 1.731687795683316e-05,
+      "loss": 0.0033436615485697985,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00335,
+      "step": 1005,
+      "tokens/total": 131555328,
+      "tokens/train_per_sec_per_gpu": 3313.83,
+      "tokens/trainable": 13995695
+    },
+    {
+      "epoch": 3.2038216560509554,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 1.7264003250243102e-05,
+      "loss": 0.002780565060675144,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00278,
+      "step": 1006,
+      "tokens/total": 131686400,
+      "tokens/train_per_sec_per_gpu": 3199.71,
+      "tokens/trainable": 14009116
+    },
+    {
+      "epoch": 3.2070063694267517,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 1.7211166784853874e-05,
+      "loss": 0.003775578923523426,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00378,
+      "step": 1007,
+      "tokens/total": 131817472,
+      "tokens/train_per_sec_per_gpu": 3328.07,
+      "tokens/trainable": 14023153
+    },
+    {
+      "epoch": 3.210191082802548,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 1.715836882185094e-05,
+      "loss": 0.0018264808459207416,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00183,
+      "step": 1008,
+      "tokens/total": 131948544,
+      "tokens/train_per_sec_per_gpu": 2972.58,
+      "tokens/trainable": 14035645
+    },
+    {
+      "epoch": 3.213375796178344,
+      "grad_norm": 0.1171875,
+      "learning_rate": 1.710560962222945e-05,
+      "loss": 0.0018301783129572868,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00183,
+      "step": 1009,
+      "tokens/total": 132079616,
+      "tokens/train_per_sec_per_gpu": 3211.08,
+      "tokens/trainable": 14049113
+    },
+    {
+      "epoch": 3.21656050955414,
+      "grad_norm": 0.11328125,
+      "learning_rate": 1.705288944679291e-05,
+      "loss": 0.002403366146609187,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 1010,
+      "tokens/total": 132210688,
+      "tokens/train_per_sec_per_gpu": 3364.87,
+      "tokens/trainable": 14063206
+    },
+    {
+      "epoch": 3.2197452229299364,
+      "grad_norm": 0.1640625,
+      "learning_rate": 1.7000208556151915e-05,
+      "loss": 0.00280455662868917,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00281,
+      "step": 1011,
+      "tokens/total": 132341760,
+      "tokens/train_per_sec_per_gpu": 3264.32,
+      "tokens/trainable": 14076868
+    },
+    {
+      "epoch": 3.2229299363057327,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 1.6947567210722905e-05,
+      "loss": 0.0029342826455831528,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00294,
+      "step": 1012,
+      "tokens/total": 132472832,
+      "tokens/train_per_sec_per_gpu": 3305.91,
+      "tokens/trainable": 14090726
+    },
+    {
+      "epoch": 3.2261146496815285,
+      "grad_norm": 0.1875,
+      "learning_rate": 1.689496567072678e-05,
+      "loss": 0.0028477348387241364,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00285,
+      "step": 1013,
+      "tokens/total": 132603904,
+      "tokens/train_per_sec_per_gpu": 3194.5,
+      "tokens/trainable": 14104098
+    },
+    {
+      "epoch": 3.229299363057325,
+      "grad_norm": 0.15625,
+      "learning_rate": 1.6842404196187715e-05,
+      "loss": 0.002830425277352333,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00283,
+      "step": 1014,
+      "tokens/total": 132734976,
+      "tokens/train_per_sec_per_gpu": 3606.8,
+      "tokens/trainable": 14119202
+    },
+    {
+      "epoch": 3.232484076433121,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 1.678988304693183e-05,
+      "loss": 0.002606867579743266,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00261,
+      "step": 1015,
+      "tokens/total": 132866048,
+      "tokens/train_per_sec_per_gpu": 3574.5,
+      "tokens/trainable": 14134144
+    },
+    {
+      "epoch": 3.2356687898089174,
+      "grad_norm": 0.1484375,
+      "learning_rate": 1.6737402482585863e-05,
+      "loss": 0.0034160753712058067,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00342,
+      "step": 1016,
+      "tokens/total": 132997120,
+      "tokens/train_per_sec_per_gpu": 3134.2,
+      "tokens/trainable": 14147367
+    },
+    {
+      "epoch": 3.238853503184713,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 1.6684962762575966e-05,
+      "loss": 0.0016203324776142836,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00162,
+      "step": 1017,
+      "tokens/total": 133128192,
+      "tokens/train_per_sec_per_gpu": 3101.12,
+      "tokens/trainable": 14160359
+    },
+    {
+      "epoch": 3.2420382165605095,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 1.663256414612639e-05,
+      "loss": 0.0028734614606946707,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00288,
+      "step": 1018,
+      "tokens/total": 133259264,
+      "tokens/train_per_sec_per_gpu": 2813.19,
+      "tokens/trainable": 14172273
+    },
+    {
+      "epoch": 3.245222929936306,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.658020689225817e-05,
+      "loss": 0.0035582587588578463,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00356,
+      "step": 1019,
+      "tokens/total": 133390336,
+      "tokens/train_per_sec_per_gpu": 3006.37,
+      "tokens/trainable": 14184925
+    },
+    {
+      "epoch": 3.248407643312102,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.6527891259787895e-05,
+      "loss": 0.0026477861683815718,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00265,
+      "step": 1020,
+      "tokens/total": 133521408,
+      "tokens/train_per_sec_per_gpu": 3004.12,
+      "tokens/trainable": 14197554
+    },
+    {
+      "epoch": 3.251592356687898,
+      "grad_norm": 0.15234375,
+      "learning_rate": 1.6475617507326418e-05,
+      "loss": 0.0031140560749918222,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00312,
+      "step": 1021,
+      "tokens/total": 133652480,
+      "tokens/train_per_sec_per_gpu": 3175.24,
+      "tokens/trainable": 14210893
+    },
+    {
+      "epoch": 3.254777070063694,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 1.6423385893277536e-05,
+      "loss": 0.003689323551952839,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0037,
+      "step": 1022,
+      "tokens/total": 133783552,
+      "tokens/train_per_sec_per_gpu": 3444.39,
+      "tokens/trainable": 14225297
+    },
+    {
+      "epoch": 3.2579617834394905,
+      "grad_norm": 0.13671875,
+      "learning_rate": 1.6371196675836763e-05,
+      "loss": 0.0028125548269599676,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00282,
+      "step": 1023,
+      "tokens/total": 133914624,
+      "tokens/train_per_sec_per_gpu": 3577.78,
+      "tokens/trainable": 14240285
+    },
+    {
+      "epoch": 3.261146496815287,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 1.631905011299005e-05,
+      "loss": 0.003101219655945897,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00311,
+      "step": 1024,
+      "tokens/total": 134045696,
+      "tokens/train_per_sec_per_gpu": 3314.34,
+      "tokens/trainable": 14254160
+    },
+    {
+      "epoch": 3.2643312101910826,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 1.6266946462512455e-05,
+      "loss": 0.002571912482380867,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00258,
+      "step": 1025,
+      "tokens/total": 134176768,
+      "tokens/train_per_sec_per_gpu": 3129.65,
+      "tokens/trainable": 14267272
+    },
+    {
+      "epoch": 3.267515923566879,
+      "grad_norm": 0.126953125,
+      "learning_rate": 1.6214885981966937e-05,
+      "loss": 0.002030417090281844,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00203,
+      "step": 1026,
+      "tokens/total": 134307840,
+      "tokens/train_per_sec_per_gpu": 3312.29,
+      "tokens/trainable": 14281152
+    },
+    {
+      "epoch": 3.270700636942675,
+      "grad_norm": 0.142578125,
+      "learning_rate": 1.6162868928703057e-05,
+      "loss": 0.0021212187130004168,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00212,
+      "step": 1027,
+      "tokens/total": 134438912,
+      "tokens/train_per_sec_per_gpu": 3278.18,
+      "tokens/trainable": 14294941
+    },
+    {
+      "epoch": 3.2738853503184715,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 1.6110895559855684e-05,
+      "loss": 0.0034488090313971043,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00345,
+      "step": 1028,
+      "tokens/total": 134569984,
+      "tokens/train_per_sec_per_gpu": 3722.82,
+      "tokens/trainable": 14310525
+    },
+    {
+      "epoch": 3.2770700636942673,
+      "grad_norm": 0.138671875,
+      "learning_rate": 1.605896613234375e-05,
+      "loss": 0.002809841651469469,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00281,
+      "step": 1029,
+      "tokens/total": 134701056,
+      "tokens/train_per_sec_per_gpu": 3356.33,
+      "tokens/trainable": 14324590
+    },
+    {
+      "epoch": 3.2802547770700636,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 1.6007080902868986e-05,
+      "loss": 0.003251892514526844,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00326,
+      "step": 1030,
+      "tokens/total": 134832128,
+      "tokens/train_per_sec_per_gpu": 3390.28,
+      "tokens/trainable": 14338793
+    },
+    {
+      "epoch": 3.28343949044586,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 1.5955240127914618e-05,
+      "loss": 0.003499697893857956,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00351,
+      "step": 1031,
+      "tokens/total": 134963200,
+      "tokens/train_per_sec_per_gpu": 3280.67,
+      "tokens/trainable": 14352526
+    },
+    {
+      "epoch": 3.286624203821656,
+      "grad_norm": 0.126953125,
+      "learning_rate": 1.5903444063744126e-05,
+      "loss": 0.0027691691648215055,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00277,
+      "step": 1032,
+      "tokens/total": 135094272,
+      "tokens/train_per_sec_per_gpu": 3269.79,
+      "tokens/trainable": 14366213
+    },
+    {
+      "epoch": 3.289808917197452,
+      "grad_norm": 0.1640625,
+      "learning_rate": 1.5851692966399996e-05,
+      "loss": 0.004021272994577885,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00403,
+      "step": 1033,
+      "tokens/total": 135225344,
+      "tokens/train_per_sec_per_gpu": 3501.62,
+      "tokens/trainable": 14380810
+    },
+    {
+      "epoch": 3.2929936305732483,
+      "grad_norm": 0.1484375,
+      "learning_rate": 1.579998709170239e-05,
+      "loss": 0.003093718783929944,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1034,
+      "tokens/total": 135356416,
+      "tokens/train_per_sec_per_gpu": 3052.87,
+      "tokens/trainable": 14393602
+    },
+    {
+      "epoch": 3.2961783439490446,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 1.5748326695247957e-05,
+      "loss": 0.003595340298488736,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0036,
+      "step": 1035,
+      "tokens/total": 135487488,
+      "tokens/train_per_sec_per_gpu": 3610.85,
+      "tokens/trainable": 14408657
+    },
+    {
+      "epoch": 3.299363057324841,
+      "grad_norm": 0.17578125,
+      "learning_rate": 1.569671203240852e-05,
+      "loss": 0.0037980927154421806,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00381,
+      "step": 1036,
+      "tokens/total": 135618560,
+      "tokens/train_per_sec_per_gpu": 3399.48,
+      "tokens/trainable": 14422876
+    },
+    {
+      "epoch": 3.3025477707006368,
+      "grad_norm": 0.1796875,
+      "learning_rate": 1.5645143358329815e-05,
+      "loss": 0.003825873602181673,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00383,
+      "step": 1037,
+      "tokens/total": 135749632,
+      "tokens/train_per_sec_per_gpu": 3345.54,
+      "tokens/trainable": 14436870
+    },
+    {
+      "epoch": 3.305732484076433,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 1.559362092793027e-05,
+      "loss": 0.002097800839692354,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0021,
+      "step": 1038,
+      "tokens/total": 135880704,
+      "tokens/train_per_sec_per_gpu": 3530.34,
+      "tokens/trainable": 14451577
+    },
+    {
+      "epoch": 3.3089171974522293,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 1.5542144995899698e-05,
+      "loss": 0.003578023286536336,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00358,
+      "step": 1039,
+      "tokens/total": 136011776,
+      "tokens/train_per_sec_per_gpu": 3208.09,
+      "tokens/trainable": 14465046
+    },
+    {
+      "epoch": 3.3121019108280256,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 1.5490715816698077e-05,
+      "loss": 0.002384308958426118,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00239,
+      "step": 1040,
+      "tokens/total": 136142848,
+      "tokens/train_per_sec_per_gpu": 3313.93,
+      "tokens/trainable": 14478889
+    },
+    {
+      "epoch": 3.3152866242038215,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 1.5439333644554227e-05,
+      "loss": 0.0023124567233026028,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 1041,
+      "tokens/total": 136273920,
+      "tokens/train_per_sec_per_gpu": 3490.61,
+      "tokens/trainable": 14493436
+    },
+    {
+      "epoch": 3.3184713375796178,
+      "grad_norm": 0.1640625,
+      "learning_rate": 1.538799873346466e-05,
+      "loss": 0.004312054719775915,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00432,
+      "step": 1042,
+      "tokens/total": 136404992,
+      "tokens/train_per_sec_per_gpu": 3468.79,
+      "tokens/trainable": 14508009
+    },
+    {
+      "epoch": 3.321656050955414,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 1.5336711337192227e-05,
+      "loss": 0.0034810621291399,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00349,
+      "step": 1043,
+      "tokens/total": 136536064,
+      "tokens/train_per_sec_per_gpu": 3681.02,
+      "tokens/trainable": 14523389
+    },
+    {
+      "epoch": 3.3248407643312103,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 1.5285471709264897e-05,
+      "loss": 0.0020460544619709253,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1044,
+      "tokens/total": 136667136,
+      "tokens/train_per_sec_per_gpu": 3329.86,
+      "tokens/trainable": 14537340
+    },
+    {
+      "epoch": 3.328025477707006,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.5234280102974525e-05,
+      "loss": 0.003296096809208393,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0033,
+      "step": 1045,
+      "tokens/total": 136798208,
+      "tokens/train_per_sec_per_gpu": 3446.83,
+      "tokens/trainable": 14551699
+    },
+    {
+      "epoch": 3.3312101910828025,
+      "grad_norm": 0.1328125,
+      "learning_rate": 1.5183136771375579e-05,
+      "loss": 0.0019932978320866823,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.002,
+      "step": 1046,
+      "tokens/total": 136929280,
+      "tokens/train_per_sec_per_gpu": 3210.34,
+      "tokens/trainable": 14565142
+    },
+    {
+      "epoch": 3.3343949044585988,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 1.5132041967283866e-05,
+      "loss": 0.001847305684350431,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00185,
+      "step": 1047,
+      "tokens/total": 137060352,
+      "tokens/train_per_sec_per_gpu": 3505.73,
+      "tokens/trainable": 14579823
+    },
+    {
+      "epoch": 3.337579617834395,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 1.5080995943275348e-05,
+      "loss": 0.00248389202170074,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00249,
+      "step": 1048,
+      "tokens/total": 137191424,
+      "tokens/train_per_sec_per_gpu": 3588.55,
+      "tokens/trainable": 14594782
+    },
+    {
+      "epoch": 3.340764331210191,
+      "grad_norm": 0.18359375,
+      "learning_rate": 1.5029998951684828e-05,
+      "loss": 0.00269156857393682,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0027,
+      "step": 1049,
+      "tokens/total": 137322496,
+      "tokens/train_per_sec_per_gpu": 3464.55,
+      "tokens/trainable": 14609308
+    },
+    {
+      "epoch": 3.343949044585987,
+      "grad_norm": 0.173828125,
+      "learning_rate": 1.4979051244604722e-05,
+      "loss": 0.003072477411478758,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00308,
+      "step": 1050,
+      "tokens/total": 137453568,
+      "tokens/train_per_sec_per_gpu": 3052.82,
+      "tokens/trainable": 14622170
+    },
+    {
+      "epoch": 3.3471337579617835,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.4928153073883843e-05,
+      "loss": 0.003987753763794899,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.004,
+      "step": 1051,
+      "tokens/total": 137584640,
+      "tokens/train_per_sec_per_gpu": 3233.94,
+      "tokens/trainable": 14635795
+    },
+    {
+      "epoch": 3.3503184713375798,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.4877304691126123e-05,
+      "loss": 0.0029561547562479973,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00296,
+      "step": 1052,
+      "tokens/total": 137715712,
+      "tokens/train_per_sec_per_gpu": 3268.66,
+      "tokens/trainable": 14649498
+    },
+    {
+      "epoch": 3.3535031847133756,
+      "grad_norm": 0.150390625,
+      "learning_rate": 1.4826506347689353e-05,
+      "loss": 0.0022640160750597715,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00227,
+      "step": 1053,
+      "tokens/total": 137846784,
+      "tokens/train_per_sec_per_gpu": 3172.82,
+      "tokens/trainable": 14662788
+    },
+    {
+      "epoch": 3.356687898089172,
+      "grad_norm": 0.181640625,
+      "learning_rate": 1.4775758294684006e-05,
+      "loss": 0.0038375440053641796,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00384,
+      "step": 1054,
+      "tokens/total": 137977856,
+      "tokens/train_per_sec_per_gpu": 3000.65,
+      "tokens/trainable": 14675379
+    },
+    {
+      "epoch": 3.359872611464968,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.4725060782971933e-05,
+      "loss": 0.0024567164946347475,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00246,
+      "step": 1055,
+      "tokens/total": 138108928,
+      "tokens/train_per_sec_per_gpu": 3533.93,
+      "tokens/trainable": 14690140
+    },
+    {
+      "epoch": 3.3630573248407645,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 1.4674414063165137e-05,
+      "loss": 0.0013129838043823838,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00131,
+      "step": 1056,
+      "tokens/total": 138240000,
+      "tokens/train_per_sec_per_gpu": 3290.43,
+      "tokens/trainable": 14703961
+    },
+    {
+      "epoch": 3.3662420382165603,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 1.4623818385624566e-05,
+      "loss": 0.003262344980612397,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00327,
+      "step": 1057,
+      "tokens/total": 138371072,
+      "tokens/train_per_sec_per_gpu": 3399.17,
+      "tokens/trainable": 14718152
+    },
+    {
+      "epoch": 3.3694267515923566,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.457327400045884e-05,
+      "loss": 0.0037125989329069853,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00372,
+      "step": 1058,
+      "tokens/total": 138502144,
+      "tokens/train_per_sec_per_gpu": 3413.99,
+      "tokens/trainable": 14732369
+    },
+    {
+      "epoch": 3.372611464968153,
+      "grad_norm": 0.171875,
+      "learning_rate": 1.4522781157523008e-05,
+      "loss": 0.003059735056012869,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00306,
+      "step": 1059,
+      "tokens/total": 138633216,
+      "tokens/train_per_sec_per_gpu": 3170.31,
+      "tokens/trainable": 14745664
+    },
+    {
+      "epoch": 3.375796178343949,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.4472340106417375e-05,
+      "loss": 0.0033829023595899343,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00339,
+      "step": 1060,
+      "tokens/total": 138764288,
+      "tokens/train_per_sec_per_gpu": 3121.95,
+      "tokens/trainable": 14758786
+    },
+    {
+      "epoch": 3.3789808917197455,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 1.4421951096486171e-05,
+      "loss": 0.0024168547242879868,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00242,
+      "step": 1061,
+      "tokens/total": 138895360,
+      "tokens/train_per_sec_per_gpu": 3410.32,
+      "tokens/trainable": 14773023
+    },
+    {
+      "epoch": 3.3821656050955413,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 1.4371614376816416e-05,
+      "loss": 0.0038187310565263033,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00383,
+      "step": 1062,
+      "tokens/total": 139026432,
+      "tokens/train_per_sec_per_gpu": 3277.49,
+      "tokens/trainable": 14786713
+    },
+    {
+      "epoch": 3.3853503184713376,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.4321330196236638e-05,
+      "loss": 0.002092313254252076,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00209,
+      "step": 1063,
+      "tokens/total": 139157504,
+      "tokens/train_per_sec_per_gpu": 3363.98,
+      "tokens/trainable": 14800746
+    },
+    {
+      "epoch": 3.388535031847134,
+      "grad_norm": 0.16015625,
+      "learning_rate": 1.4271098803315624e-05,
+      "loss": 0.0034465331118553877,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00345,
+      "step": 1064,
+      "tokens/total": 139288576,
+      "tokens/train_per_sec_per_gpu": 3553.56,
+      "tokens/trainable": 14815617
+    },
+    {
+      "epoch": 3.3917197452229297,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 1.4220920446361224e-05,
+      "loss": 0.003886766964569688,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00389,
+      "step": 1065,
+      "tokens/total": 139419648,
+      "tokens/train_per_sec_per_gpu": 3092.87,
+      "tokens/trainable": 14828591
+    },
+    {
+      "epoch": 3.394904458598726,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 1.4170795373419148e-05,
+      "loss": 0.0024511385709047318,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00245,
+      "step": 1066,
+      "tokens/total": 139550720,
+      "tokens/train_per_sec_per_gpu": 3130.21,
+      "tokens/trainable": 14841695
+    },
+    {
+      "epoch": 3.3980891719745223,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 1.4120723832271665e-05,
+      "loss": 0.0035048723220825195,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00351,
+      "step": 1067,
+      "tokens/total": 139681792,
+      "tokens/train_per_sec_per_gpu": 3774.26,
+      "tokens/trainable": 14857394
+    },
+    {
+      "epoch": 3.4012738853503186,
+      "grad_norm": 0.154296875,
+      "learning_rate": 1.4070706070436446e-05,
+      "loss": 0.0028158228378742933,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00282,
+      "step": 1068,
+      "tokens/total": 139812864,
+      "tokens/train_per_sec_per_gpu": 3417.94,
+      "tokens/trainable": 14871671
+    },
+    {
+      "epoch": 3.404458598726115,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 1.4020742335165326e-05,
+      "loss": 0.003797327633947134,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0038,
+      "step": 1069,
+      "tokens/total": 139943936,
+      "tokens/train_per_sec_per_gpu": 3477.81,
+      "tokens/trainable": 14886204
+    },
+    {
+      "epoch": 3.4076433121019107,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 1.3970832873443043e-05,
+      "loss": 0.0019341235747560859,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00194,
+      "step": 1070,
+      "tokens/total": 140075008,
+      "tokens/train_per_sec_per_gpu": 3491.7,
+      "tokens/trainable": 14900766
+    },
+    {
+      "epoch": 3.410828025477707,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 1.392097793198605e-05,
+      "loss": 0.0030175955034792423,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00302,
+      "step": 1071,
+      "tokens/total": 140206080,
+      "tokens/train_per_sec_per_gpu": 3393.54,
+      "tokens/trainable": 14914981
+    },
+    {
+      "epoch": 3.4140127388535033,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 1.3871177757241326e-05,
+      "loss": 0.001799887279048562,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0018,
+      "step": 1072,
+      "tokens/total": 140337152,
+      "tokens/train_per_sec_per_gpu": 3339.98,
+      "tokens/trainable": 14928954
+    },
+    {
+      "epoch": 3.417197452229299,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 1.382143259538507e-05,
+      "loss": 0.001962024951353669,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00196,
+      "step": 1073,
+      "tokens/total": 140468224,
+      "tokens/train_per_sec_per_gpu": 3376.61,
+      "tokens/trainable": 14943033
+    },
+    {
+      "epoch": 3.4203821656050954,
+      "grad_norm": 0.16015625,
+      "learning_rate": 1.3771742692321574e-05,
+      "loss": 0.0027512316592037678,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00276,
+      "step": 1074,
+      "tokens/total": 140599296,
+      "tokens/train_per_sec_per_gpu": 3139.25,
+      "tokens/trainable": 14956205
+    },
+    {
+      "epoch": 3.4235668789808917,
+      "grad_norm": 0.15625,
+      "learning_rate": 1.3722108293681973e-05,
+      "loss": 0.0029566381126642227,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00296,
+      "step": 1075,
+      "tokens/total": 140730368,
+      "tokens/train_per_sec_per_gpu": 3445.88,
+      "tokens/trainable": 14970584
+    },
+    {
+      "epoch": 3.426751592356688,
+      "grad_norm": 0.1640625,
+      "learning_rate": 1.3672529644823004e-05,
+      "loss": 0.0029452519956976175,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00295,
+      "step": 1076,
+      "tokens/total": 140861440,
+      "tokens/train_per_sec_per_gpu": 3354.99,
+      "tokens/trainable": 14984596
+    },
+    {
+      "epoch": 3.4299363057324843,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 1.362300699082582e-05,
+      "loss": 0.0017804743256419897,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00178,
+      "step": 1077,
+      "tokens/total": 140992512,
+      "tokens/train_per_sec_per_gpu": 3354.98,
+      "tokens/trainable": 14998636
+    },
+    {
+      "epoch": 3.43312101910828,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 1.35735405764948e-05,
+      "loss": 0.003846959676593542,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00385,
+      "step": 1078,
+      "tokens/total": 141123584,
+      "tokens/train_per_sec_per_gpu": 3226.73,
+      "tokens/trainable": 15012165
+    },
+    {
+      "epoch": 3.4363057324840764,
+      "grad_norm": 0.166015625,
+      "learning_rate": 1.3524130646356283e-05,
+      "loss": 0.0025776573456823826,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00258,
+      "step": 1079,
+      "tokens/total": 141254656,
+      "tokens/train_per_sec_per_gpu": 3226.06,
+      "tokens/trainable": 15025665
+    },
+    {
+      "epoch": 3.4394904458598727,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.3474777444657415e-05,
+      "loss": 0.0029838993214070797,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00299,
+      "step": 1080,
+      "tokens/total": 141385728,
+      "tokens/train_per_sec_per_gpu": 3689.69,
+      "tokens/trainable": 15041028
+    },
+    {
+      "epoch": 3.4426751592356686,
+      "grad_norm": 0.14453125,
+      "learning_rate": 1.3425481215364922e-05,
+      "loss": 0.0022048731334507465,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00221,
+      "step": 1081,
+      "tokens/total": 141516800,
+      "tokens/train_per_sec_per_gpu": 3238.87,
+      "tokens/trainable": 15054618
+    },
+    {
+      "epoch": 3.445859872611465,
+      "grad_norm": 0.185546875,
+      "learning_rate": 1.3376242202163868e-05,
+      "loss": 0.004590876400470734,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0046,
+      "step": 1082,
+      "tokens/total": 141647872,
+      "tokens/train_per_sec_per_gpu": 3402.12,
+      "tokens/trainable": 15068791
+    },
+    {
+      "epoch": 3.449044585987261,
+      "grad_norm": 0.15625,
+      "learning_rate": 1.3327060648456502e-05,
+      "loss": 0.0026096594519913197,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00261,
+      "step": 1083,
+      "tokens/total": 141778944,
+      "tokens/train_per_sec_per_gpu": 3599.47,
+      "tokens/trainable": 15083794
+    },
+    {
+      "epoch": 3.4522292993630574,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 1.3277936797361043e-05,
+      "loss": 0.0020494635682553053,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1084,
+      "tokens/total": 141910016,
+      "tokens/train_per_sec_per_gpu": 3307.62,
+      "tokens/trainable": 15097640
+    },
+    {
+      "epoch": 3.4554140127388537,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 1.3228870891710443e-05,
+      "loss": 0.003234599716961384,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00324,
+      "step": 1085,
+      "tokens/total": 142041088,
+      "tokens/train_per_sec_per_gpu": 3210.5,
+      "tokens/trainable": 15111127
+    },
+    {
+      "epoch": 3.4585987261146496,
+      "grad_norm": 0.14453125,
+      "learning_rate": 1.3179863174051238e-05,
+      "loss": 0.002322172513231635,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 1086,
+      "tokens/total": 142172160,
+      "tokens/train_per_sec_per_gpu": 3065.87,
+      "tokens/trainable": 15123986
+    },
+    {
+      "epoch": 3.461783439490446,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.3130913886642333e-05,
+      "loss": 0.003022089833393693,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00303,
+      "step": 1087,
+      "tokens/total": 142303232,
+      "tokens/train_per_sec_per_gpu": 3611.73,
+      "tokens/trainable": 15139047
+    },
+    {
+      "epoch": 3.464968152866242,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 1.3082023271453759e-05,
+      "loss": 0.0020968448370695114,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0021,
+      "step": 1088,
+      "tokens/total": 142434304,
+      "tokens/train_per_sec_per_gpu": 3221.7,
+      "tokens/trainable": 15152542
+    },
+    {
+      "epoch": 3.468152866242038,
+      "grad_norm": 0.171875,
+      "learning_rate": 1.3033191570165532e-05,
+      "loss": 0.00432826392352581,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00434,
+      "step": 1089,
+      "tokens/total": 142565376,
+      "tokens/train_per_sec_per_gpu": 3192.76,
+      "tokens/trainable": 15165913
+    },
+    {
+      "epoch": 3.4713375796178343,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 1.298441902416646e-05,
+      "loss": 0.0018635153537616134,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00187,
+      "step": 1090,
+      "tokens/total": 142696448,
+      "tokens/train_per_sec_per_gpu": 3635.08,
+      "tokens/trainable": 15181017
+    },
+    {
+      "epoch": 3.4745222929936306,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 1.2935705874552894e-05,
+      "loss": 0.0037171547301113605,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00372,
+      "step": 1091,
+      "tokens/total": 142827520,
+      "tokens/train_per_sec_per_gpu": 3549.32,
+      "tokens/trainable": 15195900
+    },
+    {
+      "epoch": 3.477707006369427,
+      "grad_norm": 0.154296875,
+      "learning_rate": 1.2887052362127594e-05,
+      "loss": 0.0025141574442386627,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00252,
+      "step": 1092,
+      "tokens/total": 142958592,
+      "tokens/train_per_sec_per_gpu": 3427.88,
+      "tokens/trainable": 15210182
+    },
+    {
+      "epoch": 3.480891719745223,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.2838458727398531e-05,
+      "loss": 0.0030665546655654907,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00307,
+      "step": 1093,
+      "tokens/total": 143089664,
+      "tokens/train_per_sec_per_gpu": 4042.03,
+      "tokens/trainable": 15226897
+    },
+    {
+      "epoch": 3.484076433121019,
+      "grad_norm": 0.12890625,
+      "learning_rate": 1.2789925210577647e-05,
+      "loss": 0.0020227362401783466,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00202,
+      "step": 1094,
+      "tokens/total": 143220736,
+      "tokens/train_per_sec_per_gpu": 3736.82,
+      "tokens/trainable": 15242382
+    },
+    {
+      "epoch": 3.4872611464968153,
+      "grad_norm": 0.158203125,
+      "learning_rate": 1.274145205157972e-05,
+      "loss": 0.0027202137280255556,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00272,
+      "step": 1095,
+      "tokens/total": 143351808,
+      "tokens/train_per_sec_per_gpu": 3200.5,
+      "tokens/trainable": 15255782
+    },
+    {
+      "epoch": 3.4904458598726116,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 1.269303949002118e-05,
+      "loss": 0.0031496393494307995,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00315,
+      "step": 1096,
+      "tokens/total": 143482880,
+      "tokens/train_per_sec_per_gpu": 3206.14,
+      "tokens/trainable": 15269719
+    },
+    {
+      "epoch": 3.4936305732484074,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 1.2644687765218874e-05,
+      "loss": 0.0028139406349509954,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00282,
+      "step": 1097,
+      "tokens/total": 143613952,
+      "tokens/train_per_sec_per_gpu": 3399.76,
+      "tokens/trainable": 15283962
+    },
+    {
+      "epoch": 3.4968152866242037,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.2596397116188946e-05,
+      "loss": 0.0032941231038421392,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0033,
+      "step": 1098,
+      "tokens/total": 143745024,
+      "tokens/train_per_sec_per_gpu": 3149.95,
+      "tokens/trainable": 15297099
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 1.2548167781645616e-05,
+      "loss": 0.00317127862945199,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00318,
+      "step": 1099,
+      "tokens/total": 143876096,
+      "tokens/train_per_sec_per_gpu": 3666.83,
+      "tokens/trainable": 15312299
+    },
+    {
+      "epoch": 3.5,
+      "eval_loss": 0.010016990825533867,
+      "eval_ppl": 1.01007,
+      "eval_runtime": 43.0422,
+      "eval_samples_per_second": 62.752,
+      "eval_steps_per_second": 3.926,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 1099
+    },
+    {
+      "epoch": 3.5031847133757963,
+      "grad_norm": 0.162109375,
+      "learning_rate": 1.2500000000000006e-05,
+      "loss": 0.0022175521589815617,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00222,
+      "step": 1100,
+      "tokens/total": 144007168,
+      "tokens/train_per_sec_per_gpu": 3427.84,
+      "tokens/trainable": 15326710
+    },
+    {
+      "epoch": 3.5063694267515926,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 1.245189400935895e-05,
+      "loss": 0.005054910201579332,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00507,
+      "step": 1101,
+      "tokens/total": 144138240,
+      "tokens/train_per_sec_per_gpu": 3351.14,
+      "tokens/trainable": 15340735
+    },
+    {
+      "epoch": 3.5095541401273884,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.2403850047523866e-05,
+      "loss": 0.0027237918693572283,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00273,
+      "step": 1102,
+      "tokens/total": 144269312,
+      "tokens/train_per_sec_per_gpu": 3436.73,
+      "tokens/trainable": 15355132
+    },
+    {
+      "epoch": 3.5127388535031847,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 1.2355868351989509e-05,
+      "loss": 0.0029630253557115793,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00297,
+      "step": 1103,
+      "tokens/total": 144400384,
+      "tokens/train_per_sec_per_gpu": 3214.66,
+      "tokens/trainable": 15368489
+    },
+    {
+      "epoch": 3.515923566878981,
+      "grad_norm": 0.142578125,
+      "learning_rate": 1.2307949159942862e-05,
+      "loss": 0.0033542895689606667,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00336,
+      "step": 1104,
+      "tokens/total": 144531456,
+      "tokens/train_per_sec_per_gpu": 3198.14,
+      "tokens/trainable": 15381840
+    },
+    {
+      "epoch": 3.519108280254777,
+      "grad_norm": 0.17578125,
+      "learning_rate": 1.2260092708261936e-05,
+      "loss": 0.0038351963739842176,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00384,
+      "step": 1105,
+      "tokens/total": 144662528,
+      "tokens/train_per_sec_per_gpu": 3503.01,
+      "tokens/trainable": 15396418
+    },
+    {
+      "epoch": 3.522292993630573,
+      "grad_norm": 0.154296875,
+      "learning_rate": 1.2212299233514582e-05,
+      "loss": 0.0025412808172404766,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00254,
+      "step": 1106,
+      "tokens/total": 144793600,
+      "tokens/train_per_sec_per_gpu": 3919.55,
+      "tokens/trainable": 15412594
+    },
+    {
+      "epoch": 3.5254777070063694,
+      "grad_norm": 0.1796875,
+      "learning_rate": 1.216456897195733e-05,
+      "loss": 0.0032449497375637293,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00325,
+      "step": 1107,
+      "tokens/total": 144924672,
+      "tokens/train_per_sec_per_gpu": 3382.55,
+      "tokens/trainable": 15426656
+    },
+    {
+      "epoch": 3.5286624203821657,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.211690215953427e-05,
+      "loss": 0.0023905187845230103,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00239,
+      "step": 1108,
+      "tokens/total": 145055744,
+      "tokens/train_per_sec_per_gpu": 3011.33,
+      "tokens/trainable": 15439226
+    },
+    {
+      "epoch": 3.531847133757962,
+      "grad_norm": 0.15625,
+      "learning_rate": 1.2069299031875795e-05,
+      "loss": 0.0024083037860691547,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 1109,
+      "tokens/total": 145186816,
+      "tokens/train_per_sec_per_gpu": 2939.76,
+      "tokens/trainable": 15451512
+    },
+    {
+      "epoch": 3.535031847133758,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 1.2021759824297524e-05,
+      "loss": 0.004423599690198898,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00443,
+      "step": 1110,
+      "tokens/total": 145317888,
+      "tokens/train_per_sec_per_gpu": 3466.29,
+      "tokens/trainable": 15465910
+    },
+    {
+      "epoch": 3.538216560509554,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.1974284771799096e-05,
+      "loss": 0.002882221946492791,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00289,
+      "step": 1111,
+      "tokens/total": 145448960,
+      "tokens/train_per_sec_per_gpu": 3506.98,
+      "tokens/trainable": 15480477
+    },
+    {
+      "epoch": 3.5414012738853504,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 1.1926874109063e-05,
+      "loss": 0.003006345359608531,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00301,
+      "step": 1112,
+      "tokens/total": 145580032,
+      "tokens/train_per_sec_per_gpu": 3365.33,
+      "tokens/trainable": 15494478
+    },
+    {
+      "epoch": 3.5445859872611463,
+      "grad_norm": 0.154296875,
+      "learning_rate": 1.1879528070453423e-05,
+      "loss": 0.0027234896551817656,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00273,
+      "step": 1113,
+      "tokens/total": 145711104,
+      "tokens/train_per_sec_per_gpu": 3535.63,
+      "tokens/trainable": 15509199
+    },
+    {
+      "epoch": 3.5477707006369426,
+      "grad_norm": 0.177734375,
+      "learning_rate": 1.1832246890015125e-05,
+      "loss": 0.0036931924987584352,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0037,
+      "step": 1114,
+      "tokens/total": 145842176,
+      "tokens/train_per_sec_per_gpu": 3246.79,
+      "tokens/trainable": 15522710
+    },
+    {
+      "epoch": 3.550955414012739,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 1.1785030801472221e-05,
+      "loss": 0.0028704549185931683,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00287,
+      "step": 1115,
+      "tokens/total": 145973248,
+      "tokens/train_per_sec_per_gpu": 3848.56,
+      "tokens/trainable": 15538730
+    },
+    {
+      "epoch": 3.554140127388535,
+      "grad_norm": 0.15625,
+      "learning_rate": 1.1737880038227082e-05,
+      "loss": 0.00254430272616446,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00255,
+      "step": 1116,
+      "tokens/total": 146104320,
+      "tokens/train_per_sec_per_gpu": 3397.78,
+      "tokens/trainable": 15552911
+    },
+    {
+      "epoch": 3.5573248407643314,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.1690794833359159e-05,
+      "loss": 0.0025816336274147034,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00258,
+      "step": 1117,
+      "tokens/total": 146235392,
+      "tokens/train_per_sec_per_gpu": 2881.17,
+      "tokens/trainable": 15564987
+    },
+    {
+      "epoch": 3.5605095541401273,
+      "grad_norm": 0.19140625,
+      "learning_rate": 1.1643775419623812e-05,
+      "loss": 0.003014686517417431,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00302,
+      "step": 1118,
+      "tokens/total": 146366464,
+      "tokens/train_per_sec_per_gpu": 3324.99,
+      "tokens/trainable": 15578834
+    },
+    {
+      "epoch": 3.5636942675159236,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.1596822029451177e-05,
+      "loss": 0.0020668318029493093,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00207,
+      "step": 1119,
+      "tokens/total": 146497536,
+      "tokens/train_per_sec_per_gpu": 3575.91,
+      "tokens/trainable": 15593729
+    },
+    {
+      "epoch": 3.56687898089172,
+      "grad_norm": 0.142578125,
+      "learning_rate": 1.1549934894945045e-05,
+      "loss": 0.002621435560286045,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00262,
+      "step": 1120,
+      "tokens/total": 146628608,
+      "tokens/train_per_sec_per_gpu": 3223.75,
+      "tokens/trainable": 15607251
+    },
+    {
+      "epoch": 3.5700636942675157,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.1503114247881648e-05,
+      "loss": 0.002985800849273801,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00299,
+      "step": 1121,
+      "tokens/total": 146759680,
+      "tokens/train_per_sec_per_gpu": 3585.21,
+      "tokens/trainable": 15622149
+    },
+    {
+      "epoch": 3.573248407643312,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 1.1456360319708578e-05,
+      "loss": 0.0013212183257564902,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00132,
+      "step": 1122,
+      "tokens/total": 146890752,
+      "tokens/train_per_sec_per_gpu": 3312.34,
+      "tokens/trainable": 15636033
+    },
+    {
+      "epoch": 3.5764331210191083,
+      "grad_norm": 0.17578125,
+      "learning_rate": 1.1409673341543625e-05,
+      "loss": 0.0023485145065933466,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00235,
+      "step": 1123,
+      "tokens/total": 147021824,
+      "tokens/train_per_sec_per_gpu": 3184.56,
+      "tokens/trainable": 15649372
+    },
+    {
+      "epoch": 3.5796178343949046,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.1363053544173596e-05,
+      "loss": 0.002514764666557312,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00252,
+      "step": 1124,
+      "tokens/total": 147152896,
+      "tokens/train_per_sec_per_gpu": 3358.29,
+      "tokens/trainable": 15663368
+    },
+    {
+      "epoch": 3.582802547770701,
+      "grad_norm": 0.13671875,
+      "learning_rate": 1.1316501158053216e-05,
+      "loss": 0.002817730186507106,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00282,
+      "step": 1125,
+      "tokens/total": 147283968,
+      "tokens/train_per_sec_per_gpu": 3488.28,
+      "tokens/trainable": 15677861
+    },
+    {
+      "epoch": 3.5859872611464967,
+      "grad_norm": 0.150390625,
+      "learning_rate": 1.1270016413303997e-05,
+      "loss": 0.0023807904217392206,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00238,
+      "step": 1126,
+      "tokens/total": 147415040,
+      "tokens/train_per_sec_per_gpu": 3351.72,
+      "tokens/trainable": 15691892
+    },
+    {
+      "epoch": 3.589171974522293,
+      "grad_norm": 0.13671875,
+      "learning_rate": 1.1223599539713046e-05,
+      "loss": 0.0022236828226596117,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1127,
+      "tokens/total": 147546112,
+      "tokens/train_per_sec_per_gpu": 3133.35,
+      "tokens/trainable": 15705012
+    },
+    {
+      "epoch": 3.5923566878980893,
+      "grad_norm": 0.169921875,
+      "learning_rate": 1.1177250766731992e-05,
+      "loss": 0.0034954429138451815,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0035,
+      "step": 1128,
+      "tokens/total": 147677184,
+      "tokens/train_per_sec_per_gpu": 3390.97,
+      "tokens/trainable": 15719238
+    },
+    {
+      "epoch": 3.595541401273885,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 1.1130970323475825e-05,
+      "loss": 0.0024684793315827847,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00247,
+      "step": 1129,
+      "tokens/total": 147808256,
+      "tokens/train_per_sec_per_gpu": 3373.56,
+      "tokens/trainable": 15733335
+    },
+    {
+      "epoch": 3.5987261146496814,
+      "grad_norm": 0.177734375,
+      "learning_rate": 1.1084758438721743e-05,
+      "loss": 0.003184695728123188,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00319,
+      "step": 1130,
+      "tokens/total": 147939328,
+      "tokens/train_per_sec_per_gpu": 3255.08,
+      "tokens/trainable": 15746979
+    },
+    {
+      "epoch": 3.6019108280254777,
+      "grad_norm": 0.154296875,
+      "learning_rate": 1.103861534090804e-05,
+      "loss": 0.00223728409036994,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00224,
+      "step": 1131,
+      "tokens/total": 148070400,
+      "tokens/train_per_sec_per_gpu": 3094.33,
+      "tokens/trainable": 15759937
+    },
+    {
+      "epoch": 3.605095541401274,
+      "grad_norm": 0.244140625,
+      "learning_rate": 1.0992541258132998e-05,
+      "loss": 0.0025429693050682545,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00255,
+      "step": 1132,
+      "tokens/total": 148201472,
+      "tokens/train_per_sec_per_gpu": 3264.14,
+      "tokens/trainable": 15773601
+    },
+    {
+      "epoch": 3.6082802547770703,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.0946536418153716e-05,
+      "loss": 0.0037906889338046312,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0038,
+      "step": 1133,
+      "tokens/total": 148332544,
+      "tokens/train_per_sec_per_gpu": 2941.44,
+      "tokens/trainable": 15785963
+    },
+    {
+      "epoch": 3.611464968152866,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.0900601048385017e-05,
+      "loss": 0.0023014359176158905,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0023,
+      "step": 1134,
+      "tokens/total": 148463616,
+      "tokens/train_per_sec_per_gpu": 2661.35,
+      "tokens/trainable": 15797186
+    },
+    {
+      "epoch": 3.6146496815286624,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 1.0854735375898328e-05,
+      "loss": 0.004023172426968813,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00403,
+      "step": 1135,
+      "tokens/total": 148594688,
+      "tokens/train_per_sec_per_gpu": 3525.3,
+      "tokens/trainable": 15811891
+    },
+    {
+      "epoch": 3.6178343949044587,
+      "grad_norm": 0.14453125,
+      "learning_rate": 1.0808939627420514e-05,
+      "loss": 0.0020967398304492235,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0021,
+      "step": 1136,
+      "tokens/total": 148725760,
+      "tokens/train_per_sec_per_gpu": 3402.16,
+      "tokens/trainable": 15826103
+    },
+    {
+      "epoch": 3.6210191082802545,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.076321402933279e-05,
+      "loss": 0.002463690470904112,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00247,
+      "step": 1137,
+      "tokens/total": 148856832,
+      "tokens/train_per_sec_per_gpu": 3459.93,
+      "tokens/trainable": 15840539
+    },
+    {
+      "epoch": 3.624203821656051,
+      "grad_norm": 0.201171875,
+      "learning_rate": 1.0717558807669631e-05,
+      "loss": 0.0030937506817281246,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1138,
+      "tokens/total": 148987904,
+      "tokens/train_per_sec_per_gpu": 3333.74,
+      "tokens/trainable": 15854495
+    },
+    {
+      "epoch": 3.627388535031847,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.0671974188117572e-05,
+      "loss": 0.002224976196885109,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1139,
+      "tokens/total": 149118976,
+      "tokens/train_per_sec_per_gpu": 3179.62,
+      "tokens/trainable": 15867806
+    },
+    {
+      "epoch": 3.6305732484076434,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.0626460396014182e-05,
+      "loss": 0.0029444252140820026,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00295,
+      "step": 1140,
+      "tokens/total": 149250048,
+      "tokens/train_per_sec_per_gpu": 3422.65,
+      "tokens/trainable": 15882044
+    },
+    {
+      "epoch": 3.6337579617834397,
+      "grad_norm": 0.185546875,
+      "learning_rate": 1.0581017656346904e-05,
+      "loss": 0.0034989488776773214,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00351,
+      "step": 1141,
+      "tokens/total": 149381120,
+      "tokens/train_per_sec_per_gpu": 3507.73,
+      "tokens/trainable": 15896741
+    },
+    {
+      "epoch": 3.6369426751592355,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 1.053564619375193e-05,
+      "loss": 0.002628948539495468,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00263,
+      "step": 1142,
+      "tokens/total": 149512192,
+      "tokens/train_per_sec_per_gpu": 3219.49,
+      "tokens/trainable": 15910183
+    },
+    {
+      "epoch": 3.640127388535032,
+      "grad_norm": 0.263671875,
+      "learning_rate": 1.0490346232513113e-05,
+      "loss": 0.0031747568864375353,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00318,
+      "step": 1143,
+      "tokens/total": 149643264,
+      "tokens/train_per_sec_per_gpu": 3370.78,
+      "tokens/trainable": 15924212
+    },
+    {
+      "epoch": 3.643312101910828,
+      "grad_norm": 0.208984375,
+      "learning_rate": 1.0445117996560877e-05,
+      "loss": 0.003914204426109791,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00392,
+      "step": 1144,
+      "tokens/total": 149774336,
+      "tokens/train_per_sec_per_gpu": 3173.12,
+      "tokens/trainable": 15937505
+    },
+    {
+      "epoch": 3.646496815286624,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.039996170947106e-05,
+      "loss": 0.002363776322454214,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00237,
+      "step": 1145,
+      "tokens/total": 149905408,
+      "tokens/train_per_sec_per_gpu": 3147.56,
+      "tokens/trainable": 15950698
+    },
+    {
+      "epoch": 3.6496815286624202,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.0354877594463852e-05,
+      "loss": 0.0031070299446582794,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00311,
+      "step": 1146,
+      "tokens/total": 150036480,
+      "tokens/train_per_sec_per_gpu": 3364.36,
+      "tokens/trainable": 15964717
+    },
+    {
+      "epoch": 3.6528662420382165,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 1.0309865874402688e-05,
+      "loss": 0.001972392201423645,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00197,
+      "step": 1147,
+      "tokens/total": 150167552,
+      "tokens/train_per_sec_per_gpu": 3018.99,
+      "tokens/trainable": 15977365
+    },
+    {
+      "epoch": 3.656050955414013,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 1.026492677179311e-05,
+      "loss": 0.0011499158572405577,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00115,
+      "step": 1148,
+      "tokens/total": 150298624,
+      "tokens/train_per_sec_per_gpu": 3220.38,
+      "tokens/trainable": 15990834
+    },
+    {
+      "epoch": 3.659235668789809,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 1.022006050878169e-05,
+      "loss": 0.001693375059403479,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00169,
+      "step": 1149,
+      "tokens/total": 150429696,
+      "tokens/train_per_sec_per_gpu": 3186.29,
+      "tokens/trainable": 16004194
+    },
+    {
+      "epoch": 3.662420382165605,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.0175267307154962e-05,
+      "loss": 0.0017610186478123069,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00176,
+      "step": 1150,
+      "tokens/total": 150560768,
+      "tokens/train_per_sec_per_gpu": 3312.55,
+      "tokens/trainable": 16018057
+    },
+    {
+      "epoch": 3.6656050955414012,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 1.0130547388338268e-05,
+      "loss": 0.003534915391355753,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00354,
+      "step": 1151,
+      "tokens/total": 150691840,
+      "tokens/train_per_sec_per_gpu": 3383.89,
+      "tokens/trainable": 16032153
+    },
+    {
+      "epoch": 3.6687898089171975,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 1.0085900973394708e-05,
+      "loss": 0.0027439731638878584,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00275,
+      "step": 1152,
+      "tokens/total": 150822912,
+      "tokens/train_per_sec_per_gpu": 3256.01,
+      "tokens/trainable": 16045798
+    },
+    {
+      "epoch": 3.6719745222929934,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 1.004132828302404e-05,
+      "loss": 0.0019469019025564194,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00195,
+      "step": 1153,
+      "tokens/total": 150953984,
+      "tokens/train_per_sec_per_gpu": 3687.58,
+      "tokens/trainable": 16061140
+    },
+    {
+      "epoch": 3.6751592356687897,
+      "grad_norm": 0.15234375,
+      "learning_rate": 9.996829537561559e-06,
+      "loss": 0.0025109422858804464,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00251,
+      "step": 1154,
+      "tokens/total": 151085056,
+      "tokens/train_per_sec_per_gpu": 3535.46,
+      "tokens/trainable": 16075875
+    },
+    {
+      "epoch": 3.678343949044586,
+      "grad_norm": 0.134765625,
+      "learning_rate": 9.952404956977032e-06,
+      "loss": 0.0022808697540313005,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00228,
+      "step": 1155,
+      "tokens/total": 151216128,
+      "tokens/train_per_sec_per_gpu": 3050.84,
+      "tokens/trainable": 16088674
+    },
+    {
+      "epoch": 3.6815286624203822,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 9.908054760873633e-06,
+      "loss": 0.003984857816249132,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00399,
+      "step": 1156,
+      "tokens/total": 151347200,
+      "tokens/train_per_sec_per_gpu": 3167.18,
+      "tokens/trainable": 16101976
+    },
+    {
+      "epoch": 3.6847133757961785,
+      "grad_norm": 0.16015625,
+      "learning_rate": 9.863779168486798e-06,
+      "loss": 0.002358327154070139,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00236,
+      "step": 1157,
+      "tokens/total": 151478272,
+      "tokens/train_per_sec_per_gpu": 3305.89,
+      "tokens/trainable": 16115788
+    },
+    {
+      "epoch": 3.6878980891719744,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 9.819578398683202e-06,
+      "loss": 0.0030925837345421314,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1158,
+      "tokens/total": 151609344,
+      "tokens/train_per_sec_per_gpu": 3627.75,
+      "tokens/trainable": 16130893
+    },
+    {
+      "epoch": 3.6910828025477707,
+      "grad_norm": 0.142578125,
+      "learning_rate": 9.775452669959651e-06,
+      "loss": 0.00236108573153615,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00236,
+      "step": 1159,
+      "tokens/total": 151740416,
+      "tokens/train_per_sec_per_gpu": 3497.63,
+      "tokens/trainable": 16145461
+    },
+    {
+      "epoch": 3.694267515923567,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 9.731402200441985e-06,
+      "loss": 0.0027799042873084545,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00278,
+      "step": 1160,
+      "tokens/total": 151871488,
+      "tokens/train_per_sec_per_gpu": 3288.32,
+      "tokens/trainable": 16159217
+    },
+    {
+      "epoch": 3.697452229299363,
+      "grad_norm": 0.197265625,
+      "learning_rate": 9.687427207884017e-06,
+      "loss": 0.004562960006296635,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00457,
+      "step": 1161,
+      "tokens/total": 152002560,
+      "tokens/train_per_sec_per_gpu": 3425.43,
+      "tokens/trainable": 16173551
+    },
+    {
+      "epoch": 3.700636942675159,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 9.643527909666484e-06,
+      "loss": 0.003357633948326111,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00336,
+      "step": 1162,
+      "tokens/total": 152133632,
+      "tokens/train_per_sec_per_gpu": 3255.39,
+      "tokens/trainable": 16187139
+    },
+    {
+      "epoch": 3.7038216560509554,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 9.599704522795899e-06,
+      "loss": 0.0035241839941591024,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00353,
+      "step": 1163,
+      "tokens/total": 152264704,
+      "tokens/train_per_sec_per_gpu": 3360.97,
+      "tokens/trainable": 16201246
+    },
+    {
+      "epoch": 3.7070063694267517,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 9.55595726390357e-06,
+      "loss": 0.0019289179472252727,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00193,
+      "step": 1164,
+      "tokens/total": 152395776,
+      "tokens/train_per_sec_per_gpu": 3708.68,
+      "tokens/trainable": 16216642
+    },
+    {
+      "epoch": 3.710191082802548,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 9.512286349244461e-06,
+      "loss": 0.0024172349367290735,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00242,
+      "step": 1165,
+      "tokens/total": 152526848,
+      "tokens/train_per_sec_per_gpu": 3152.27,
+      "tokens/trainable": 16229792
+    },
+    {
+      "epoch": 3.713375796178344,
+      "grad_norm": 0.1484375,
+      "learning_rate": 9.468691994696147e-06,
+      "loss": 0.0027571492828428745,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00276,
+      "step": 1166,
+      "tokens/total": 152657920,
+      "tokens/train_per_sec_per_gpu": 3544.62,
+      "tokens/trainable": 16244524
+    },
+    {
+      "epoch": 3.71656050955414,
+      "grad_norm": 0.171875,
+      "learning_rate": 9.42517441575773e-06,
+      "loss": 0.002144938800483942,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00215,
+      "step": 1167,
+      "tokens/total": 152788992,
+      "tokens/train_per_sec_per_gpu": 3187.0,
+      "tokens/trainable": 16257897
+    },
+    {
+      "epoch": 3.7197452229299364,
+      "grad_norm": 0.166015625,
+      "learning_rate": 9.381733827548825e-06,
+      "loss": 0.002875394420698285,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00288,
+      "step": 1168,
+      "tokens/total": 152920064,
+      "tokens/train_per_sec_per_gpu": 3371.36,
+      "tokens/trainable": 16271956
+    },
+    {
+      "epoch": 3.722929936305732,
+      "grad_norm": 0.146484375,
+      "learning_rate": 9.338370444808417e-06,
+      "loss": 0.0024918625131249428,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00249,
+      "step": 1169,
+      "tokens/total": 153051136,
+      "tokens/train_per_sec_per_gpu": 3125.56,
+      "tokens/trainable": 16285073
+    },
+    {
+      "epoch": 3.7261146496815285,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 9.295084481893876e-06,
+      "loss": 0.0020116898231208324,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00201,
+      "step": 1170,
+      "tokens/total": 153182208,
+      "tokens/train_per_sec_per_gpu": 3620.37,
+      "tokens/trainable": 16300140
+    },
+    {
+      "epoch": 3.729299363057325,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 9.251876152779863e-06,
+      "loss": 0.002456206362694502,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00246,
+      "step": 1171,
+      "tokens/total": 153313280,
+      "tokens/train_per_sec_per_gpu": 3421.83,
+      "tokens/trainable": 16314413
+    },
+    {
+      "epoch": 3.732484076433121,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 9.20874567105725e-06,
+      "loss": 0.002665320411324501,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00267,
+      "step": 1172,
+      "tokens/total": 153444352,
+      "tokens/train_per_sec_per_gpu": 3586.11,
+      "tokens/trainable": 16329319
+    },
+    {
+      "epoch": 3.7356687898089174,
+      "grad_norm": 0.150390625,
+      "learning_rate": 9.165693249932098e-06,
+      "loss": 0.002760200994089246,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00276,
+      "step": 1173,
+      "tokens/total": 153575424,
+      "tokens/train_per_sec_per_gpu": 3499.69,
+      "tokens/trainable": 16343957
+    },
+    {
+      "epoch": 3.738853503184713,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 9.122719102224603e-06,
+      "loss": 0.003271646797657013,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00328,
+      "step": 1174,
+      "tokens/total": 153706496,
+      "tokens/train_per_sec_per_gpu": 3357.83,
+      "tokens/trainable": 16358001
+    },
+    {
+      "epoch": 3.7420382165605095,
+      "grad_norm": 0.140625,
+      "learning_rate": 9.079823440368018e-06,
+      "loss": 0.0022282477002590895,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1175,
+      "tokens/total": 153837568,
+      "tokens/train_per_sec_per_gpu": 3662.02,
+      "tokens/trainable": 16373253
+    },
+    {
+      "epoch": 3.745222929936306,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 9.037006476407628e-06,
+      "loss": 0.003906633704900742,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00391,
+      "step": 1176,
+      "tokens/total": 153968640,
+      "tokens/train_per_sec_per_gpu": 3414.99,
+      "tokens/trainable": 16387539
+    },
+    {
+      "epoch": 3.7484076433121016,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 8.994268421999702e-06,
+      "loss": 0.0046704974956810474,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00468,
+      "step": 1177,
+      "tokens/total": 154099712,
+      "tokens/train_per_sec_per_gpu": 3318.42,
+      "tokens/trainable": 16401436
+    },
+    {
+      "epoch": 3.7515923566878984,
+      "grad_norm": 0.154296875,
+      "learning_rate": 8.951609488410414e-06,
+      "loss": 0.0023519096430391073,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00235,
+      "step": 1178,
+      "tokens/total": 154230784,
+      "tokens/train_per_sec_per_gpu": 3442.84,
+      "tokens/trainable": 16415791
+    },
+    {
+      "epoch": 3.754777070063694,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 8.909029886514828e-06,
+      "loss": 0.001595214824192226,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1179,
+      "tokens/total": 154361856,
+      "tokens/train_per_sec_per_gpu": 3627.66,
+      "tokens/trainable": 16430902
+    },
+    {
+      "epoch": 3.7579617834394905,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 8.866529826795866e-06,
+      "loss": 0.002106869127601385,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00211,
+      "step": 1180,
+      "tokens/total": 154492928,
+      "tokens/train_per_sec_per_gpu": 3379.91,
+      "tokens/trainable": 16445017
+    },
+    {
+      "epoch": 3.761146496815287,
+      "grad_norm": 0.150390625,
+      "learning_rate": 8.824109519343227e-06,
+      "loss": 0.0035120132379233837,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00352,
+      "step": 1181,
+      "tokens/total": 154624000,
+      "tokens/train_per_sec_per_gpu": 3416.65,
+      "tokens/trainable": 16459298
+    },
+    {
+      "epoch": 3.7643312101910826,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 8.781769173852392e-06,
+      "loss": 0.002475301967933774,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00248,
+      "step": 1182,
+      "tokens/total": 154755072,
+      "tokens/train_per_sec_per_gpu": 3225.84,
+      "tokens/trainable": 16472813
+    },
+    {
+      "epoch": 3.767515923566879,
+      "grad_norm": 0.134765625,
+      "learning_rate": 8.739508999623563e-06,
+      "loss": 0.0018928756471723318,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00189,
+      "step": 1183,
+      "tokens/total": 154886144,
+      "tokens/train_per_sec_per_gpu": 3393.68,
+      "tokens/trainable": 16487035
+    },
+    {
+      "epoch": 3.770700636942675,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 8.697329205560625e-06,
+      "loss": 0.0019152449676766992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00192,
+      "step": 1184,
+      "tokens/total": 155017216,
+      "tokens/train_per_sec_per_gpu": 3331.91,
+      "tokens/trainable": 16500924
+    },
+    {
+      "epoch": 3.7738853503184715,
+      "grad_norm": 0.1328125,
+      "learning_rate": 8.655230000170117e-06,
+      "loss": 0.0024345512501895428,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00244,
+      "step": 1185,
+      "tokens/total": 155148288,
+      "tokens/train_per_sec_per_gpu": 3450.15,
+      "tokens/trainable": 16515278
+    },
+    {
+      "epoch": 3.777070063694268,
+      "grad_norm": 0.125,
+      "learning_rate": 8.61321159156023e-06,
+      "loss": 0.0017270749667659402,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00173,
+      "step": 1186,
+      "tokens/total": 155279360,
+      "tokens/train_per_sec_per_gpu": 2874.69,
+      "tokens/trainable": 16527348
+    },
+    {
+      "epoch": 3.7802547770700636,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 8.571274187439724e-06,
+      "loss": 0.0030203748028725386,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00302,
+      "step": 1187,
+      "tokens/total": 155410432,
+      "tokens/train_per_sec_per_gpu": 3409.54,
+      "tokens/trainable": 16541593
+    },
+    {
+      "epoch": 3.78343949044586,
+      "grad_norm": 0.138671875,
+      "learning_rate": 8.529417995116947e-06,
+      "loss": 0.0022753621451556683,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00228,
+      "step": 1188,
+      "tokens/total": 155541504,
+      "tokens/train_per_sec_per_gpu": 3344.52,
+      "tokens/trainable": 16555605
+    },
+    {
+      "epoch": 3.786624203821656,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 8.487643221498812e-06,
+      "loss": 0.0021583903580904007,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00216,
+      "step": 1189,
+      "tokens/total": 155672576,
+      "tokens/train_per_sec_per_gpu": 3003.62,
+      "tokens/trainable": 16568186
+    },
+    {
+      "epoch": 3.789808917197452,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 8.445950073089721e-06,
+      "loss": 0.002155636204406619,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00216,
+      "step": 1190,
+      "tokens/total": 155803648,
+      "tokens/train_per_sec_per_gpu": 3463.82,
+      "tokens/trainable": 16582617
+    },
+    {
+      "epoch": 3.7929936305732483,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 8.404338755990587e-06,
+      "loss": 0.003606649348512292,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00361,
+      "step": 1191,
+      "tokens/total": 155934720,
+      "tokens/train_per_sec_per_gpu": 3331.8,
+      "tokens/trainable": 16596564
+    },
+    {
+      "epoch": 3.7961783439490446,
+      "grad_norm": 0.1484375,
+      "learning_rate": 8.362809475897837e-06,
+      "loss": 0.0030233021825551987,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00303,
+      "step": 1192,
+      "tokens/total": 156065792,
+      "tokens/train_per_sec_per_gpu": 3466.06,
+      "tokens/trainable": 16611016
+    },
+    {
+      "epoch": 3.799363057324841,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 8.32136243810233e-06,
+      "loss": 0.003034008899703622,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00304,
+      "step": 1193,
+      "tokens/total": 156196864,
+      "tokens/train_per_sec_per_gpu": 3277.31,
+      "tokens/trainable": 16624717
+    },
+    {
+      "epoch": 3.802547770700637,
+      "grad_norm": 0.126953125,
+      "learning_rate": 8.279997847488399e-06,
+      "loss": 0.0017860046355053782,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00179,
+      "step": 1194,
+      "tokens/total": 156327936,
+      "tokens/train_per_sec_per_gpu": 3192.19,
+      "tokens/trainable": 16638031
+    },
+    {
+      "epoch": 3.805732484076433,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 8.238715908532824e-06,
+      "loss": 0.003182856598868966,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00319,
+      "step": 1195,
+      "tokens/total": 156459008,
+      "tokens/train_per_sec_per_gpu": 3312.32,
+      "tokens/trainable": 16651920
+    },
+    {
+      "epoch": 3.8089171974522293,
+      "grad_norm": 0.134765625,
+      "learning_rate": 8.197516825303792e-06,
+      "loss": 0.0023445822298526764,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00235,
+      "step": 1196,
+      "tokens/total": 156590080,
+      "tokens/train_per_sec_per_gpu": 3594.12,
+      "tokens/trainable": 16666821
+    },
+    {
+      "epoch": 3.8121019108280256,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 8.156400801459912e-06,
+      "loss": 0.002362563507631421,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00237,
+      "step": 1197,
+      "tokens/total": 156721152,
+      "tokens/train_per_sec_per_gpu": 2878.74,
+      "tokens/trainable": 16679031
+    },
+    {
+      "epoch": 3.8152866242038215,
+      "grad_norm": 0.173828125,
+      "learning_rate": 8.115368040249242e-06,
+      "loss": 0.0029479744844138622,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00295,
+      "step": 1198,
+      "tokens/total": 156852224,
+      "tokens/train_per_sec_per_gpu": 3403.84,
+      "tokens/trainable": 16693210
+    },
+    {
+      "epoch": 3.8184713375796178,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 8.074418744508202e-06,
+      "loss": 0.001919899950735271,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00192,
+      "step": 1199,
+      "tokens/total": 156983296,
+      "tokens/train_per_sec_per_gpu": 3656.6,
+      "tokens/trainable": 16708430
+    },
+    {
+      "epoch": 3.821656050955414,
+      "grad_norm": 0.1328125,
+      "learning_rate": 8.03355311666065e-06,
+      "loss": 0.0024780076928436756,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00248,
+      "step": 1200,
+      "tokens/total": 157114368,
+      "tokens/train_per_sec_per_gpu": 3218.15,
+      "tokens/trainable": 16721832
+    },
+    {
+      "epoch": 3.8248407643312103,
+      "grad_norm": 0.169921875,
+      "learning_rate": 7.992771358716852e-06,
+      "loss": 0.003482515923678875,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00349,
+      "step": 1201,
+      "tokens/total": 157245440,
+      "tokens/train_per_sec_per_gpu": 3264.75,
+      "tokens/trainable": 16735505
+    },
+    {
+      "epoch": 3.8280254777070066,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 7.952073672272465e-06,
+      "loss": 0.002318483777344227,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 1202,
+      "tokens/total": 157376512,
+      "tokens/train_per_sec_per_gpu": 3205.3,
+      "tokens/trainable": 16748926
+    },
+    {
+      "epoch": 3.8312101910828025,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 7.91146025850755e-06,
+      "loss": 0.0027267371769994497,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00273,
+      "step": 1203,
+      "tokens/total": 157507584,
+      "tokens/train_per_sec_per_gpu": 3508.37,
+      "tokens/trainable": 16763595
+    },
+    {
+      "epoch": 3.8343949044585988,
+      "grad_norm": 0.142578125,
+      "learning_rate": 7.870931318185615e-06,
+      "loss": 0.0021403185091912746,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00214,
+      "step": 1204,
+      "tokens/total": 157638656,
+      "tokens/train_per_sec_per_gpu": 3252.09,
+      "tokens/trainable": 16777230
+    },
+    {
+      "epoch": 3.837579617834395,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 7.830487051652562e-06,
+      "loss": 0.0029888248536735773,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00299,
+      "step": 1205,
+      "tokens/total": 157769728,
+      "tokens/train_per_sec_per_gpu": 3605.6,
+      "tokens/trainable": 16792264
+    },
+    {
+      "epoch": 3.840764331210191,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 7.790127658835747e-06,
+      "loss": 0.0014124944573268294,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00141,
+      "step": 1206,
+      "tokens/total": 157900800,
+      "tokens/train_per_sec_per_gpu": 3439.82,
+      "tokens/trainable": 16806652
+    },
+    {
+      "epoch": 3.843949044585987,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 7.749853339242972e-06,
+      "loss": 0.0024581162724643946,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00246,
+      "step": 1207,
+      "tokens/total": 158031872,
+      "tokens/train_per_sec_per_gpu": 3496.89,
+      "tokens/trainable": 16821214
+    },
+    {
+      "epoch": 3.8471337579617835,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 7.70966429196148e-06,
+      "loss": 0.0028864797204732895,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00289,
+      "step": 1208,
+      "tokens/total": 158162944,
+      "tokens/train_per_sec_per_gpu": 3388.5,
+      "tokens/trainable": 16835398
+    },
+    {
+      "epoch": 3.8503184713375798,
+      "grad_norm": 0.154296875,
+      "learning_rate": 7.669560715656993e-06,
+      "loss": 0.0023927215952426195,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0024,
+      "step": 1209,
+      "tokens/total": 158294016,
+      "tokens/train_per_sec_per_gpu": 3419.53,
+      "tokens/trainable": 16849636
+    },
+    {
+      "epoch": 3.853503184713376,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 7.629542808572746e-06,
+      "loss": 0.0018501668237149715,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00185,
+      "step": 1210,
+      "tokens/total": 158425088,
+      "tokens/train_per_sec_per_gpu": 3232.39,
+      "tokens/trainable": 16863144
+    },
+    {
+      "epoch": 3.856687898089172,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 7.58961076852846e-06,
+      "loss": 0.0026476646307855844,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00265,
+      "step": 1211,
+      "tokens/total": 158556160,
+      "tokens/train_per_sec_per_gpu": 3319.46,
+      "tokens/trainable": 16877036
+    },
+    {
+      "epoch": 3.859872611464968,
+      "grad_norm": 0.162109375,
+      "learning_rate": 7.549764792919414e-06,
+      "loss": 0.0031769457273185253,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00318,
+      "step": 1212,
+      "tokens/total": 158687232,
+      "tokens/train_per_sec_per_gpu": 3138.51,
+      "tokens/trainable": 16890132
+    },
+    {
+      "epoch": 3.8630573248407645,
+      "grad_norm": 0.103515625,
+      "learning_rate": 7.510005078715443e-06,
+      "loss": 0.00180210976395756,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0018,
+      "step": 1213,
+      "tokens/total": 158818304,
+      "tokens/train_per_sec_per_gpu": 3359.16,
+      "tokens/trainable": 16904120
+    },
+    {
+      "epoch": 3.8662420382165603,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 7.47033182245995e-06,
+      "loss": 0.003394015831872821,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0034,
+      "step": 1214,
+      "tokens/total": 158949376,
+      "tokens/train_per_sec_per_gpu": 3097.6,
+      "tokens/trainable": 16917088
+    },
+    {
+      "epoch": 3.8694267515923566,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 7.430745220268962e-06,
+      "loss": 0.0019503788789734244,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00195,
+      "step": 1215,
+      "tokens/total": 159080448,
+      "tokens/train_per_sec_per_gpu": 3410.15,
+      "tokens/trainable": 16931308
+    },
+    {
+      "epoch": 3.872611464968153,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 7.391245467830163e-06,
+      "loss": 0.002893456257879734,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0029,
+      "step": 1216,
+      "tokens/total": 159211520,
+      "tokens/train_per_sec_per_gpu": 3457.29,
+      "tokens/trainable": 16945696
+    },
+    {
+      "epoch": 3.875796178343949,
+      "grad_norm": 0.1640625,
+      "learning_rate": 7.351832760401892e-06,
+      "loss": 0.0023777689784765244,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00238,
+      "step": 1217,
+      "tokens/total": 159342592,
+      "tokens/train_per_sec_per_gpu": 2878.05,
+      "tokens/trainable": 16957864
+    },
+    {
+      "epoch": 3.8789808917197455,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 7.312507292812215e-06,
+      "loss": 0.00224723806604743,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00225,
+      "step": 1218,
+      "tokens/total": 159473664,
+      "tokens/train_per_sec_per_gpu": 3022.25,
+      "tokens/trainable": 16970516
+    },
+    {
+      "epoch": 3.8821656050955413,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 7.273269259457957e-06,
+      "loss": 0.0017601789440959692,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00176,
+      "step": 1219,
+      "tokens/total": 159604736,
+      "tokens/train_per_sec_per_gpu": 3153.66,
+      "tokens/trainable": 16983660
+    },
+    {
+      "epoch": 3.8853503184713376,
+      "grad_norm": 0.15625,
+      "learning_rate": 7.2341188543036985e-06,
+      "loss": 0.0024489860516041517,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00245,
+      "step": 1220,
+      "tokens/total": 159735808,
+      "tokens/train_per_sec_per_gpu": 3174.42,
+      "tokens/trainable": 16996950
+    },
+    {
+      "epoch": 3.888535031847134,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 7.195056270880887e-06,
+      "loss": 0.0038972869515419006,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0039,
+      "step": 1221,
+      "tokens/total": 159866880,
+      "tokens/train_per_sec_per_gpu": 3474.13,
+      "tokens/trainable": 17011372
+    },
+    {
+      "epoch": 3.8917197452229297,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 7.156081702286813e-06,
+      "loss": 0.0033518727868795395,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00336,
+      "step": 1222,
+      "tokens/total": 159997952,
+      "tokens/train_per_sec_per_gpu": 3190.66,
+      "tokens/trainable": 17024744
+    },
+    {
+      "epoch": 3.894904458598726,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 7.11719534118368e-06,
+      "loss": 0.00255336775444448,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00256,
+      "step": 1223,
+      "tokens/total": 160129024,
+      "tokens/train_per_sec_per_gpu": 3465.79,
+      "tokens/trainable": 17039162
+    },
+    {
+      "epoch": 3.8980891719745223,
+      "grad_norm": 0.154296875,
+      "learning_rate": 7.078397379797711e-06,
+      "loss": 0.0020744046196341515,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00208,
+      "step": 1224,
+      "tokens/total": 160260096,
+      "tokens/train_per_sec_per_gpu": 3387.57,
+      "tokens/trainable": 17053370
+    },
+    {
+      "epoch": 3.9012738853503186,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 7.039688009918083e-06,
+      "loss": 0.0021676502656191587,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00217,
+      "step": 1225,
+      "tokens/total": 160391168,
+      "tokens/train_per_sec_per_gpu": 3407.72,
+      "tokens/trainable": 17067644
+    },
+    {
+      "epoch": 3.904458598726115,
+      "grad_norm": 0.171875,
+      "learning_rate": 7.001067422896063e-06,
+      "loss": 0.002485244534909725,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00249,
+      "step": 1226,
+      "tokens/total": 160522240,
+      "tokens/train_per_sec_per_gpu": 3749.29,
+      "tokens/trainable": 17083264
+    },
+    {
+      "epoch": 3.9076433121019107,
+      "grad_norm": 0.173828125,
+      "learning_rate": 6.9625358096440496e-06,
+      "loss": 0.0030091169755905867,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00301,
+      "step": 1227,
+      "tokens/total": 160653312,
+      "tokens/train_per_sec_per_gpu": 3670.67,
+      "tokens/trainable": 17098510
+    },
+    {
+      "epoch": 3.910828025477707,
+      "grad_norm": 0.14453125,
+      "learning_rate": 6.924093360634601e-06,
+      "loss": 0.0025889542885124683,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00259,
+      "step": 1228,
+      "tokens/total": 160784384,
+      "tokens/train_per_sec_per_gpu": 3679.38,
+      "tokens/trainable": 17113820
+    },
+    {
+      "epoch": 3.9140127388535033,
+      "grad_norm": 0.1875,
+      "learning_rate": 6.885740265899526e-06,
+      "loss": 0.0027112660463899374,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00271,
+      "step": 1229,
+      "tokens/total": 160915456,
+      "tokens/train_per_sec_per_gpu": 3136.64,
+      "tokens/trainable": 17126964
+    },
+    {
+      "epoch": 3.917197452229299,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 6.84747671502893e-06,
+      "loss": 0.002578144893050194,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00258,
+      "step": 1230,
+      "tokens/total": 161046528,
+      "tokens/train_per_sec_per_gpu": 3162.79,
+      "tokens/trainable": 17140190
+    },
+    {
+      "epoch": 3.9203821656050954,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 6.809302897170266e-06,
+      "loss": 0.00427253358066082,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00428,
+      "step": 1231,
+      "tokens/total": 161177600,
+      "tokens/train_per_sec_per_gpu": 3541.36,
+      "tokens/trainable": 17154952
+    },
+    {
+      "epoch": 3.9235668789808917,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 6.771219001027415e-06,
+      "loss": 0.002364278305321932,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00237,
+      "step": 1232,
+      "tokens/total": 161308672,
+      "tokens/train_per_sec_per_gpu": 3451.33,
+      "tokens/trainable": 17169330
+    },
+    {
+      "epoch": 3.926751592356688,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 6.733225214859762e-06,
+      "loss": 0.0026184916496276855,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00262,
+      "step": 1233,
+      "tokens/total": 161439744,
+      "tokens/train_per_sec_per_gpu": 3611.52,
+      "tokens/trainable": 17184330
+    },
+    {
+      "epoch": 3.9299363057324843,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 6.695321726481232e-06,
+      "loss": 0.0022467318922281265,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00225,
+      "step": 1234,
+      "tokens/total": 161570816,
+      "tokens/train_per_sec_per_gpu": 3270.16,
+      "tokens/trainable": 17198012
+    },
+    {
+      "epoch": 3.93312101910828,
+      "grad_norm": 0.1484375,
+      "learning_rate": 6.657508723259404e-06,
+      "loss": 0.0020928632002323866,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0021,
+      "step": 1235,
+      "tokens/total": 161701888,
+      "tokens/train_per_sec_per_gpu": 3467.93,
+      "tokens/trainable": 17212436
+    },
+    {
+      "epoch": 3.9363057324840764,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 6.619786392114557e-06,
+      "loss": 0.0016596732893958688,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00166,
+      "step": 1236,
+      "tokens/total": 161832960,
+      "tokens/train_per_sec_per_gpu": 3175.9,
+      "tokens/trainable": 17225718
+    },
+    {
+      "epoch": 3.9394904458598727,
+      "grad_norm": 0.177734375,
+      "learning_rate": 6.582154919518746e-06,
+      "loss": 0.0028763054870069027,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00288,
+      "step": 1237,
+      "tokens/total": 161964032,
+      "tokens/train_per_sec_per_gpu": 3459.65,
+      "tokens/trainable": 17240132
+    },
+    {
+      "epoch": 3.9426751592356686,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 6.544614491494885e-06,
+      "loss": 0.0023539350368082523,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00236,
+      "step": 1238,
+      "tokens/total": 162095104,
+      "tokens/train_per_sec_per_gpu": 3782.05,
+      "tokens/trainable": 17255722
+    },
+    {
+      "epoch": 3.945859872611465,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 6.507165293615847e-06,
+      "loss": 0.001856530667282641,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00186,
+      "step": 1239,
+      "tokens/total": 162226176,
+      "tokens/train_per_sec_per_gpu": 3107.57,
+      "tokens/trainable": 17268686
+    },
+    {
+      "epoch": 3.949044585987261,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 6.469807511003501e-06,
+      "loss": 0.0025471888948231936,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00255,
+      "step": 1240,
+      "tokens/total": 162357248,
+      "tokens/train_per_sec_per_gpu": 3185.35,
+      "tokens/trainable": 17282018
+    },
+    {
+      "epoch": 3.9522292993630574,
+      "grad_norm": 0.185546875,
+      "learning_rate": 6.432541328327848e-06,
+      "loss": 0.0031703345011919737,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00318,
+      "step": 1241,
+      "tokens/total": 162488320,
+      "tokens/train_per_sec_per_gpu": 3523.34,
+      "tokens/trainable": 17296706
+    },
+    {
+      "epoch": 3.9554140127388537,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 6.395366929806084e-06,
+      "loss": 0.002728913212195039,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00273,
+      "step": 1242,
+      "tokens/total": 162619392,
+      "tokens/train_per_sec_per_gpu": 3361.25,
+      "tokens/trainable": 17310780
+    },
+    {
+      "epoch": 3.9585987261146496,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 6.358284499201681e-06,
+      "loss": 0.00209011766128242,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00209,
+      "step": 1243,
+      "tokens/total": 162750464,
+      "tokens/train_per_sec_per_gpu": 3299.45,
+      "tokens/trainable": 17324532
+    },
+    {
+      "epoch": 3.961783439490446,
+      "grad_norm": 0.18359375,
+      "learning_rate": 6.3212942198234755e-06,
+      "loss": 0.003096578875556588,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1244,
+      "tokens/total": 162881536,
+      "tokens/train_per_sec_per_gpu": 3590.65,
+      "tokens/trainable": 17339484
+    },
+    {
+      "epoch": 3.964968152866242,
+      "grad_norm": 0.177734375,
+      "learning_rate": 6.284396274524809e-06,
+      "loss": 0.002964367624372244,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00297,
+      "step": 1245,
+      "tokens/total": 163012608,
+      "tokens/train_per_sec_per_gpu": 3356.72,
+      "tokens/trainable": 17353532
+    },
+    {
+      "epoch": 3.968152866242038,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 6.247590845702553e-06,
+      "loss": 0.0029587389435619116,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00296,
+      "step": 1246,
+      "tokens/total": 163143680,
+      "tokens/train_per_sec_per_gpu": 3114.15,
+      "tokens/trainable": 17366524
+    },
+    {
+      "epoch": 3.9713375796178343,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 6.210878115296267e-06,
+      "loss": 0.0023161745630204678,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 1247,
+      "tokens/total": 163274752,
+      "tokens/train_per_sec_per_gpu": 3507.4,
+      "tokens/trainable": 17381160
+    },
+    {
+      "epoch": 3.9745222929936306,
+      "grad_norm": 0.15234375,
+      "learning_rate": 6.174258264787283e-06,
+      "loss": 0.002960086800158024,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00296,
+      "step": 1248,
+      "tokens/total": 163405824,
+      "tokens/train_per_sec_per_gpu": 3401.28,
+      "tokens/trainable": 17395386
+    },
+    {
+      "epoch": 3.977707006369427,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 6.137731475197775e-06,
+      "loss": 0.0018720726948231459,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00187,
+      "step": 1249,
+      "tokens/total": 163536896,
+      "tokens/train_per_sec_per_gpu": 3215.48,
+      "tokens/trainable": 17408856
+    },
+    {
+      "epoch": 3.980891719745223,
+      "grad_norm": 0.142578125,
+      "learning_rate": 6.101297927089905e-06,
+      "loss": 0.0030803410336375237,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00309,
+      "step": 1250,
+      "tokens/total": 163667968,
+      "tokens/train_per_sec_per_gpu": 3254.38,
+      "tokens/trainable": 17422484
+    },
+    {
+      "epoch": 3.984076433121019,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 6.064957800564924e-06,
+      "loss": 0.0036575605627149343,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00366,
+      "step": 1251,
+      "tokens/total": 163799040,
+      "tokens/train_per_sec_per_gpu": 3076.57,
+      "tokens/trainable": 17435390
+    },
+    {
+      "epoch": 3.9872611464968153,
+      "grad_norm": 0.16796875,
+      "learning_rate": 6.028711275262252e-06,
+      "loss": 0.002414201619103551,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00242,
+      "step": 1252,
+      "tokens/total": 163930112,
+      "tokens/train_per_sec_per_gpu": 3358.96,
+      "tokens/trainable": 17449432
+    },
+    {
+      "epoch": 3.9904458598726116,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 5.992558530358638e-06,
+      "loss": 0.002453506924211979,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00246,
+      "step": 1253,
+      "tokens/total": 164061184,
+      "tokens/train_per_sec_per_gpu": 3465.43,
+      "tokens/trainable": 17463888
+    },
+    {
+      "epoch": 3.9936305732484074,
+      "grad_norm": 0.173828125,
+      "learning_rate": 5.95649974456724e-06,
+      "loss": 0.0030917448457330465,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1254,
+      "tokens/total": 164192256,
+      "tokens/train_per_sec_per_gpu": 3406.16,
+      "tokens/trainable": 17478118
+    },
+    {
+      "epoch": 3.9968152866242037,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 5.920535096136737e-06,
+      "loss": 0.003019727533683181,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00302,
+      "step": 1255,
+      "tokens/total": 164323328,
+      "tokens/train_per_sec_per_gpu": 3013.86,
+      "tokens/trainable": 17491116
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 5.884664762850467e-06,
+      "loss": 0.0035042453091591597,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 39.25,
+      "memory/max_allocated (GiB)": 39.25,
+      "ppl": 1.00351,
+      "step": 1256,
+      "tokens/total": 164397056,
+      "tokens/train_per_sec_per_gpu": 3307.41,
+      "tokens/trainable": 17498700
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.010103554464876652,
+      "eval_ppl": 1.01015,
+      "eval_runtime": 43.1815,
+      "eval_samples_per_second": 62.55,
+      "eval_steps_per_second": 3.914,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 1256
+    },
+    {
+      "epoch": 4.003184713375796,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 5.848888922025553e-06,
+      "loss": 0.0019946754910051823,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.002,
+      "step": 1257,
+      "tokens/total": 164528128,
+      "tokens/train_per_sec_per_gpu": 3365.43,
+      "tokens/trainable": 17512708
+    },
+    {
+      "epoch": 4.006369426751593,
+      "grad_norm": 0.125,
+      "learning_rate": 5.813207750511995e-06,
+      "loss": 0.002120796823874116,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00212,
+      "step": 1258,
+      "tokens/total": 164659200,
+      "tokens/train_per_sec_per_gpu": 3244.01,
+      "tokens/trainable": 17526132
+    },
+    {
+      "epoch": 4.009554140127388,
+      "grad_norm": 0.142578125,
+      "learning_rate": 5.777621424691834e-06,
+      "loss": 0.0018959781154990196,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0019,
+      "step": 1259,
+      "tokens/total": 164790272,
+      "tokens/train_per_sec_per_gpu": 2857.78,
+      "tokens/trainable": 17538062
+    },
+    {
+      "epoch": 4.012738853503185,
+      "grad_norm": 0.126953125,
+      "learning_rate": 5.742130120478265e-06,
+      "loss": 0.002416697796434164,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00242,
+      "step": 1260,
+      "tokens/total": 164921344,
+      "tokens/train_per_sec_per_gpu": 3561.58,
+      "tokens/trainable": 17552824
+    },
+    {
+      "epoch": 4.015923566878981,
+      "grad_norm": 0.109375,
+      "learning_rate": 5.706734013314746e-06,
+      "loss": 0.00218612770549953,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00219,
+      "step": 1261,
+      "tokens/total": 165052416,
+      "tokens/train_per_sec_per_gpu": 3426.83,
+      "tokens/trainable": 17567024
+    },
+    {
+      "epoch": 4.019108280254777,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 5.671433278174151e-06,
+      "loss": 0.0017273772973567247,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00173,
+      "step": 1262,
+      "tokens/total": 165183488,
+      "tokens/train_per_sec_per_gpu": 3392.24,
+      "tokens/trainable": 17581128
+    },
+    {
+      "epoch": 4.022292993630574,
+      "grad_norm": 0.11328125,
+      "learning_rate": 5.636228089557926e-06,
+      "loss": 0.0017078241799026728,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00171,
+      "step": 1263,
+      "tokens/total": 165314560,
+      "tokens/train_per_sec_per_gpu": 3547.78,
+      "tokens/trainable": 17595852
+    },
+    {
+      "epoch": 4.025477707006369,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 5.601118621495175e-06,
+      "loss": 0.0014550643973052502,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00146,
+      "step": 1264,
+      "tokens/total": 165445632,
+      "tokens/train_per_sec_per_gpu": 3380.55,
+      "tokens/trainable": 17609924
+    },
+    {
+      "epoch": 4.028662420382165,
+      "grad_norm": 0.158203125,
+      "learning_rate": 5.566105047541847e-06,
+      "loss": 0.0025803535245358944,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00258,
+      "step": 1265,
+      "tokens/total": 165576704,
+      "tokens/train_per_sec_per_gpu": 3404.37,
+      "tokens/trainable": 17624118
+    },
+    {
+      "epoch": 4.031847133757962,
+      "grad_norm": 0.12890625,
+      "learning_rate": 5.531187540779864e-06,
+      "loss": 0.0025620046071708202,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00257,
+      "step": 1266,
+      "tokens/total": 165707776,
+      "tokens/train_per_sec_per_gpu": 3185.3,
+      "tokens/trainable": 17637446
+    },
+    {
+      "epoch": 4.035031847133758,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 5.4963662738162445e-06,
+      "loss": 0.0018056983826681972,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00181,
+      "step": 1267,
+      "tokens/total": 165838848,
+      "tokens/train_per_sec_per_gpu": 3696.5,
+      "tokens/trainable": 17652856
+    },
+    {
+      "epoch": 4.038216560509555,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 5.461641418782268e-06,
+      "loss": 0.0014057126827538013,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00141,
+      "step": 1268,
+      "tokens/total": 165969920,
+      "tokens/train_per_sec_per_gpu": 3383.57,
+      "tokens/trainable": 17666958
+    },
+    {
+      "epoch": 4.04140127388535,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 5.427013147332638e-06,
+      "loss": 0.0026509405579417944,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00265,
+      "step": 1269,
+      "tokens/total": 166100992,
+      "tokens/train_per_sec_per_gpu": 3482.02,
+      "tokens/trainable": 17681546
+    },
+    {
+      "epoch": 4.044585987261146,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 5.392481630644597e-06,
+      "loss": 0.002696407027542591,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0027,
+      "step": 1270,
+      "tokens/total": 166232064,
+      "tokens/train_per_sec_per_gpu": 3270.12,
+      "tokens/trainable": 17695240
+    },
+    {
+      "epoch": 4.047770700636943,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 5.358047039417122e-06,
+      "loss": 0.0018320954404771328,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00183,
+      "step": 1271,
+      "tokens/total": 166363136,
+      "tokens/train_per_sec_per_gpu": 3274.88,
+      "tokens/trainable": 17708940
+    },
+    {
+      "epoch": 4.050955414012739,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 5.323709543870059e-06,
+      "loss": 0.0021537388674914837,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00216,
+      "step": 1272,
+      "tokens/total": 166494208,
+      "tokens/train_per_sec_per_gpu": 3453.25,
+      "tokens/trainable": 17723348
+    },
+    {
+      "epoch": 4.054140127388535,
+      "grad_norm": 0.130859375,
+      "learning_rate": 5.2894693137432645e-06,
+      "loss": 0.0018690548604354262,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00187,
+      "step": 1273,
+      "tokens/total": 166625280,
+      "tokens/train_per_sec_per_gpu": 3002.21,
+      "tokens/trainable": 17735952
+    },
+    {
+      "epoch": 4.057324840764331,
+      "grad_norm": 0.162109375,
+      "learning_rate": 5.255326518295792e-06,
+      "loss": 0.002879355102777481,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00288,
+      "step": 1274,
+      "tokens/total": 166756352,
+      "tokens/train_per_sec_per_gpu": 3683.9,
+      "tokens/trainable": 17751344
+    },
+    {
+      "epoch": 4.060509554140127,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 5.221281326305066e-06,
+      "loss": 0.0022269003093242645,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1275,
+      "tokens/total": 166887424,
+      "tokens/train_per_sec_per_gpu": 3651.91,
+      "tokens/trainable": 17766626
+    },
+    {
+      "epoch": 4.063694267515924,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 5.187333906065999e-06,
+      "loss": 0.001456463593058288,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00146,
+      "step": 1276,
+      "tokens/total": 167018496,
+      "tokens/train_per_sec_per_gpu": 3273.46,
+      "tokens/trainable": 17780338
+    },
+    {
+      "epoch": 4.06687898089172,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 5.15348442539022e-06,
+      "loss": 0.0010698458645492792,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00107,
+      "step": 1277,
+      "tokens/total": 167149568,
+      "tokens/train_per_sec_per_gpu": 3378.38,
+      "tokens/trainable": 17794556
+    },
+    {
+      "epoch": 4.070063694267516,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 5.1197330516052025e-06,
+      "loss": 0.002229275880381465,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1278,
+      "tokens/total": 167280640,
+      "tokens/train_per_sec_per_gpu": 3141.49,
+      "tokens/trainable": 17807720
+    },
+    {
+      "epoch": 4.073248407643312,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 5.086079951553444e-06,
+      "loss": 0.0030983053147792816,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1279,
+      "tokens/total": 167411712,
+      "tokens/train_per_sec_per_gpu": 3466.5,
+      "tokens/trainable": 17822220
+    },
+    {
+      "epoch": 4.076433121019108,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 5.052525291591651e-06,
+      "loss": 0.0031875702552497387,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00319,
+      "step": 1280,
+      "tokens/total": 167542784,
+      "tokens/train_per_sec_per_gpu": 3276.44,
+      "tokens/trainable": 17835898
+    },
+    {
+      "epoch": 4.079617834394904,
+      "grad_norm": 0.111328125,
+      "learning_rate": 5.019069237589921e-06,
+      "loss": 0.0019920531194657087,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00199,
+      "step": 1281,
+      "tokens/total": 167673856,
+      "tokens/train_per_sec_per_gpu": 3318.63,
+      "tokens/trainable": 17849768
+    },
+    {
+      "epoch": 4.082802547770701,
+      "grad_norm": 0.1328125,
+      "learning_rate": 4.985711954930902e-06,
+      "loss": 0.0015500528970733285,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00155,
+      "step": 1282,
+      "tokens/total": 167804928,
+      "tokens/train_per_sec_per_gpu": 3025.08,
+      "tokens/trainable": 17862448
+    },
+    {
+      "epoch": 4.085987261146497,
+      "grad_norm": 0.138671875,
+      "learning_rate": 4.952453608509e-06,
+      "loss": 0.0018041220027953386,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00181,
+      "step": 1283,
+      "tokens/total": 167936000,
+      "tokens/train_per_sec_per_gpu": 3421.79,
+      "tokens/trainable": 17876746
+    },
+    {
+      "epoch": 4.089171974522293,
+      "grad_norm": 0.109375,
+      "learning_rate": 4.919294362729551e-06,
+      "loss": 0.0015523162437602878,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00155,
+      "step": 1284,
+      "tokens/total": 168067072,
+      "tokens/train_per_sec_per_gpu": 3216.81,
+      "tokens/trainable": 17890232
+    },
+    {
+      "epoch": 4.092356687898089,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 4.886234381507998e-06,
+      "loss": 0.0025541428476572037,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00256,
+      "step": 1285,
+      "tokens/total": 168198144,
+      "tokens/train_per_sec_per_gpu": 3474.88,
+      "tokens/trainable": 17904764
+    },
+    {
+      "epoch": 4.095541401273885,
+      "grad_norm": 0.181640625,
+      "learning_rate": 4.853273828269089e-06,
+      "loss": 0.0028677769005298615,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00287,
+      "step": 1286,
+      "tokens/total": 168329216,
+      "tokens/train_per_sec_per_gpu": 3678.61,
+      "tokens/trainable": 17920040
+    },
+    {
+      "epoch": 4.098726114649682,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 4.820412865946092e-06,
+      "loss": 0.003095669439062476,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1287,
+      "tokens/total": 168460288,
+      "tokens/train_per_sec_per_gpu": 3226.04,
+      "tokens/trainable": 17933544
+    },
+    {
+      "epoch": 4.101910828025478,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 4.787651656979949e-06,
+      "loss": 0.001217160257510841,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00122,
+      "step": 1288,
+      "tokens/total": 168591360,
+      "tokens/train_per_sec_per_gpu": 3346.97,
+      "tokens/trainable": 17947562
+    },
+    {
+      "epoch": 4.1050955414012735,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 4.754990363318501e-06,
+      "loss": 0.0015003203880041838,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0015,
+      "step": 1289,
+      "tokens/total": 168722432,
+      "tokens/train_per_sec_per_gpu": 3462.58,
+      "tokens/trainable": 17962074
+    },
+    {
+      "epoch": 4.10828025477707,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 4.722429146415691e-06,
+      "loss": 0.001935549546033144,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00194,
+      "step": 1290,
+      "tokens/total": 168853504,
+      "tokens/train_per_sec_per_gpu": 3245.6,
+      "tokens/trainable": 17975652
+    },
+    {
+      "epoch": 4.111464968152866,
+      "grad_norm": 0.1328125,
+      "learning_rate": 4.6899681672307346e-06,
+      "loss": 0.00210759905166924,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00211,
+      "step": 1291,
+      "tokens/total": 168984576,
+      "tokens/train_per_sec_per_gpu": 3417.61,
+      "tokens/trainable": 17989954
+    },
+    {
+      "epoch": 4.114649681528663,
+      "grad_norm": 0.111328125,
+      "learning_rate": 4.657607586227345e-06,
+      "loss": 0.0014702447224408388,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00147,
+      "step": 1292,
+      "tokens/total": 169115648,
+      "tokens/train_per_sec_per_gpu": 3709.61,
+      "tokens/trainable": 18005404
+    },
+    {
+      "epoch": 4.117834394904459,
+      "grad_norm": 0.12890625,
+      "learning_rate": 4.625347563372964e-06,
+      "loss": 0.0019532586447894573,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00196,
+      "step": 1293,
+      "tokens/total": 169246720,
+      "tokens/train_per_sec_per_gpu": 3349.32,
+      "tokens/trainable": 18019456
+    },
+    {
+      "epoch": 4.1210191082802545,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 4.593188258137912e-06,
+      "loss": 0.0014989221235737205,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0015,
+      "step": 1294,
+      "tokens/total": 169377792,
+      "tokens/train_per_sec_per_gpu": 3412.09,
+      "tokens/trainable": 18033708
+    },
+    {
+      "epoch": 4.124203821656051,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 4.5611298294946596e-06,
+      "loss": 0.0016446541994810104,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00165,
+      "step": 1295,
+      "tokens/total": 169508864,
+      "tokens/train_per_sec_per_gpu": 3299.98,
+      "tokens/trainable": 18047556
+    },
+    {
+      "epoch": 4.127388535031847,
+      "grad_norm": 0.12890625,
+      "learning_rate": 4.529172435917012e-06,
+      "loss": 0.001521661994047463,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00152,
+      "step": 1296,
+      "tokens/total": 169639936,
+      "tokens/train_per_sec_per_gpu": 3090.9,
+      "tokens/trainable": 18060560
+    },
+    {
+      "epoch": 4.130573248407643,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 4.497316235379323e-06,
+      "loss": 0.002716638380661607,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00272,
+      "step": 1297,
+      "tokens/total": 169771008,
+      "tokens/train_per_sec_per_gpu": 3121.65,
+      "tokens/trainable": 18073696
+    },
+    {
+      "epoch": 4.13375796178344,
+      "grad_norm": 0.1484375,
+      "learning_rate": 4.465561385355712e-06,
+      "loss": 0.0017709597013890743,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00177,
+      "step": 1298,
+      "tokens/total": 169902080,
+      "tokens/train_per_sec_per_gpu": 3532.51,
+      "tokens/trainable": 18088448
+    },
+    {
+      "epoch": 4.1369426751592355,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 4.433908042819323e-06,
+      "loss": 0.0015186622040346265,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00152,
+      "step": 1299,
+      "tokens/total": 170033152,
+      "tokens/train_per_sec_per_gpu": 3144.35,
+      "tokens/trainable": 18101652
+    },
+    {
+      "epoch": 4.140127388535032,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 4.402356364241489e-06,
+      "loss": 0.001659161178395152,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00166,
+      "step": 1300,
+      "tokens/total": 170164224,
+      "tokens/train_per_sec_per_gpu": 3339.31,
+      "tokens/trainable": 18115704
+    },
+    {
+      "epoch": 4.143312101910828,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 4.370906505591007e-06,
+      "loss": 0.0014578705886378884,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00146,
+      "step": 1301,
+      "tokens/total": 170295296,
+      "tokens/train_per_sec_per_gpu": 3235.03,
+      "tokens/trainable": 18129240
+    },
+    {
+      "epoch": 4.146496815286624,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.339558622333353e-06,
+      "loss": 0.0024085917975753546,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 1302,
+      "tokens/total": 170426368,
+      "tokens/train_per_sec_per_gpu": 3590.36,
+      "tokens/trainable": 18144174
+    },
+    {
+      "epoch": 4.149681528662421,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 4.308312869429898e-06,
+      "loss": 0.0028695266228169203,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00287,
+      "step": 1303,
+      "tokens/total": 170557440,
+      "tokens/train_per_sec_per_gpu": 3280.59,
+      "tokens/trainable": 18157882
+    },
+    {
+      "epoch": 4.1528662420382165,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.27716940133715e-06,
+      "loss": 0.0024525150656700134,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00246,
+      "step": 1304,
+      "tokens/total": 170688512,
+      "tokens/train_per_sec_per_gpu": 3221.62,
+      "tokens/trainable": 18171456
+    },
+    {
+      "epoch": 4.156050955414012,
+      "grad_norm": 0.138671875,
+      "learning_rate": 4.246128372006017e-06,
+      "loss": 0.00208856794051826,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00209,
+      "step": 1305,
+      "tokens/total": 170819584,
+      "tokens/train_per_sec_per_gpu": 3030.29,
+      "tokens/trainable": 18184178
+    },
+    {
+      "epoch": 4.159235668789809,
+      "grad_norm": 0.119140625,
+      "learning_rate": 4.215189934881001e-06,
+      "loss": 0.0016645672731101513,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00167,
+      "step": 1306,
+      "tokens/total": 170950656,
+      "tokens/train_per_sec_per_gpu": 3509.84,
+      "tokens/trainable": 18198820
+    },
+    {
+      "epoch": 4.162420382165605,
+      "grad_norm": 0.080078125,
+      "learning_rate": 4.1843542428994685e-06,
+      "loss": 0.0010691338684409857,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00107,
+      "step": 1307,
+      "tokens/total": 171081728,
+      "tokens/train_per_sec_per_gpu": 2998.84,
+      "tokens/trainable": 18211426
+    },
+    {
+      "epoch": 4.165605095541402,
+      "grad_norm": 0.140625,
+      "learning_rate": 4.153621448490905e-06,
+      "loss": 0.0030363069381564856,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00304,
+      "step": 1308,
+      "tokens/total": 171212800,
+      "tokens/train_per_sec_per_gpu": 3613.64,
+      "tokens/trainable": 18226492
+    },
+    {
+      "epoch": 4.1687898089171975,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 4.122991703576121e-06,
+      "loss": 0.0039181094616651535,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00393,
+      "step": 1309,
+      "tokens/total": 171343872,
+      "tokens/train_per_sec_per_gpu": 3326.43,
+      "tokens/trainable": 18240364
+    },
+    {
+      "epoch": 4.171974522292993,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 4.092465159566525e-06,
+      "loss": 0.0018522969912737608,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00185,
+      "step": 1310,
+      "tokens/total": 171474944,
+      "tokens/train_per_sec_per_gpu": 3142.02,
+      "tokens/trainable": 18253528
+    },
+    {
+      "epoch": 4.17515923566879,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 4.062041967363395e-06,
+      "loss": 0.0022721600253134966,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00227,
+      "step": 1311,
+      "tokens/total": 171606016,
+      "tokens/train_per_sec_per_gpu": 3346.0,
+      "tokens/trainable": 18267536
+    },
+    {
+      "epoch": 4.178343949044586,
+      "grad_norm": 0.1328125,
+      "learning_rate": 4.031722277357086e-06,
+      "loss": 0.0017200830625370145,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00172,
+      "step": 1312,
+      "tokens/total": 171737088,
+      "tokens/train_per_sec_per_gpu": 3265.04,
+      "tokens/trainable": 18281204
+    },
+    {
+      "epoch": 4.181528662420382,
+      "grad_norm": 0.119140625,
+      "learning_rate": 4.001506239426339e-06,
+      "loss": 0.0018201316706836224,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00182,
+      "step": 1313,
+      "tokens/total": 171868160,
+      "tokens/train_per_sec_per_gpu": 3661.03,
+      "tokens/trainable": 18296460
+    },
+    {
+      "epoch": 4.1847133757961785,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 3.971394002937501e-06,
+      "loss": 0.0008904569549486041,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00089,
+      "step": 1314,
+      "tokens/total": 171999232,
+      "tokens/train_per_sec_per_gpu": 2682.19,
+      "tokens/trainable": 18307764
+    },
+    {
+      "epoch": 4.187898089171974,
+      "grad_norm": 0.115234375,
+      "learning_rate": 3.941385716743795e-06,
+      "loss": 0.0016649002209305763,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00167,
+      "step": 1315,
+      "tokens/total": 172130304,
+      "tokens/train_per_sec_per_gpu": 3367.15,
+      "tokens/trainable": 18321816
+    },
+    {
+      "epoch": 4.191082802547771,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 3.911481529184588e-06,
+      "loss": 0.0019004822243005037,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0019,
+      "step": 1316,
+      "tokens/total": 172261376,
+      "tokens/train_per_sec_per_gpu": 3450.07,
+      "tokens/trainable": 18336222
+    },
+    {
+      "epoch": 4.194267515923567,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 3.881681588084674e-06,
+      "loss": 0.0020820728968828917,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00208,
+      "step": 1317,
+      "tokens/total": 172392448,
+      "tokens/train_per_sec_per_gpu": 3549.92,
+      "tokens/trainable": 18351078
+    },
+    {
+      "epoch": 4.197452229299363,
+      "grad_norm": 0.16015625,
+      "learning_rate": 3.851986040753505e-06,
+      "loss": 0.002381009515374899,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00238,
+      "step": 1318,
+      "tokens/total": 172523520,
+      "tokens/train_per_sec_per_gpu": 3442.29,
+      "tokens/trainable": 18365480
+    },
+    {
+      "epoch": 4.2006369426751595,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 3.822395033984502e-06,
+      "loss": 0.0012018627021461725,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0012,
+      "step": 1319,
+      "tokens/total": 172654592,
+      "tokens/train_per_sec_per_gpu": 3791.81,
+      "tokens/trainable": 18381260
+    },
+    {
+      "epoch": 4.203821656050955,
+      "grad_norm": 0.142578125,
+      "learning_rate": 3.792908714054316e-06,
+      "loss": 0.002608443144708872,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00261,
+      "step": 1320,
+      "tokens/total": 172785664,
+      "tokens/train_per_sec_per_gpu": 3220.83,
+      "tokens/trainable": 18394756
+    },
+    {
+      "epoch": 4.207006369426751,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 3.7635272267220858e-06,
+      "loss": 0.0018472287338227034,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00185,
+      "step": 1321,
+      "tokens/total": 172916736,
+      "tokens/train_per_sec_per_gpu": 3254.63,
+      "tokens/trainable": 18408414
+    },
+    {
+      "epoch": 4.210191082802548,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 3.734250717228735e-06,
+      "loss": 0.00441823760047555,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00443,
+      "step": 1322,
+      "tokens/total": 173047808,
+      "tokens/train_per_sec_per_gpu": 3065.51,
+      "tokens/trainable": 18421262
+    },
+    {
+      "epoch": 4.213375796178344,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 3.7050793302962685e-06,
+      "loss": 0.0016929913545027375,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00169,
+      "step": 1323,
+      "tokens/total": 173178880,
+      "tokens/train_per_sec_per_gpu": 3282.3,
+      "tokens/trainable": 18434992
+    },
+    {
+      "epoch": 4.2165605095541405,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 3.676013210127022e-06,
+      "loss": 0.0025385431945323944,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00254,
+      "step": 1324,
+      "tokens/total": 173309952,
+      "tokens/train_per_sec_per_gpu": 3103.93,
+      "tokens/trainable": 18447992
+    },
+    {
+      "epoch": 4.219745222929936,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 3.647052500402981e-06,
+      "loss": 0.0015956538263708353,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1325,
+      "tokens/total": 173441024,
+      "tokens/train_per_sec_per_gpu": 3147.17,
+      "tokens/trainable": 18461184
+    },
+    {
+      "epoch": 4.222929936305732,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 3.6181973442850597e-06,
+      "loss": 0.001635034685023129,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00164,
+      "step": 1326,
+      "tokens/total": 173572096,
+      "tokens/train_per_sec_per_gpu": 3715.4,
+      "tokens/trainable": 18476658
+    },
+    {
+      "epoch": 4.226114649681529,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 3.589447884412378e-06,
+      "loss": 0.0025338195264339447,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00254,
+      "step": 1327,
+      "tokens/total": 173703168,
+      "tokens/train_per_sec_per_gpu": 3364.82,
+      "tokens/trainable": 18490728
+    },
+    {
+      "epoch": 4.229299363057325,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 3.5608042629015707e-06,
+      "loss": 0.001267962739802897,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00127,
+      "step": 1328,
+      "tokens/total": 173834240,
+      "tokens/train_per_sec_per_gpu": 3268.7,
+      "tokens/trainable": 18504432
+    },
+    {
+      "epoch": 4.232484076433121,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 3.532266621346103e-06,
+      "loss": 0.0019486568635329604,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00195,
+      "step": 1329,
+      "tokens/total": 173965312,
+      "tokens/train_per_sec_per_gpu": 3287.05,
+      "tokens/trainable": 18518208
+    },
+    {
+      "epoch": 4.235668789808917,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 3.5038351008155226e-06,
+      "loss": 0.002935834927484393,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00294,
+      "step": 1330,
+      "tokens/total": 174096384,
+      "tokens/train_per_sec_per_gpu": 3210.37,
+      "tokens/trainable": 18531630
+    },
+    {
+      "epoch": 4.238853503184713,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 3.4755098418548155e-06,
+      "loss": 0.0018774013733491302,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00188,
+      "step": 1331,
+      "tokens/total": 174227456,
+      "tokens/train_per_sec_per_gpu": 3384.93,
+      "tokens/trainable": 18545764
+    },
+    {
+      "epoch": 4.24203821656051,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 3.4472909844836837e-06,
+      "loss": 0.001764771994203329,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00177,
+      "step": 1332,
+      "tokens/total": 174358528,
+      "tokens/train_per_sec_per_gpu": 3577.08,
+      "tokens/trainable": 18560662
+    },
+    {
+      "epoch": 4.245222929936306,
+      "grad_norm": 0.130859375,
+      "learning_rate": 3.4191786681958437e-06,
+      "loss": 0.0026986815501004457,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0027,
+      "step": 1333,
+      "tokens/total": 174489600,
+      "tokens/train_per_sec_per_gpu": 3282.73,
+      "tokens/trainable": 18574404
+    },
+    {
+      "epoch": 4.248407643312102,
+      "grad_norm": 0.138671875,
+      "learning_rate": 3.39117303195835e-06,
+      "loss": 0.0022144122049212456,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00222,
+      "step": 1334,
+      "tokens/total": 174620672,
+      "tokens/train_per_sec_per_gpu": 3194.16,
+      "tokens/trainable": 18587768
+    },
+    {
+      "epoch": 4.251592356687898,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 3.3632742142109293e-06,
+      "loss": 0.00270890723913908,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00271,
+      "step": 1335,
+      "tokens/total": 174751744,
+      "tokens/train_per_sec_per_gpu": 3411.3,
+      "tokens/trainable": 18602002
+    },
+    {
+      "epoch": 4.254777070063694,
+      "grad_norm": 0.16796875,
+      "learning_rate": 3.3354823528652463e-06,
+      "loss": 0.0023235008120536804,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00233,
+      "step": 1336,
+      "tokens/total": 174882816,
+      "tokens/train_per_sec_per_gpu": 3230.27,
+      "tokens/trainable": 18615552
+    },
+    {
+      "epoch": 4.25796178343949,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 3.3077975853042703e-06,
+      "loss": 0.002815892221406102,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00282,
+      "step": 1337,
+      "tokens/total": 175013888,
+      "tokens/train_per_sec_per_gpu": 3239.06,
+      "tokens/trainable": 18629086
+    },
+    {
+      "epoch": 4.261146496815287,
+      "grad_norm": 0.166015625,
+      "learning_rate": 3.280220048381574e-06,
+      "loss": 0.002695944393053651,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0027,
+      "step": 1338,
+      "tokens/total": 175144960,
+      "tokens/train_per_sec_per_gpu": 3597.1,
+      "tokens/trainable": 18644090
+    },
+    {
+      "epoch": 4.264331210191083,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 3.252749878420647e-06,
+      "loss": 0.0021448852494359016,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00215,
+      "step": 1339,
+      "tokens/total": 175276032,
+      "tokens/train_per_sec_per_gpu": 3364.52,
+      "tokens/trainable": 18658172
+    },
+    {
+      "epoch": 4.267515923566879,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 3.225387211214237e-06,
+      "loss": 0.001379702938720584,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00138,
+      "step": 1340,
+      "tokens/total": 175407104,
+      "tokens/train_per_sec_per_gpu": 3324.64,
+      "tokens/trainable": 18672098
+    },
+    {
+      "epoch": 4.270700636942675,
+      "grad_norm": 0.146484375,
+      "learning_rate": 3.1981321820236885e-06,
+      "loss": 0.002582112792879343,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00259,
+      "step": 1341,
+      "tokens/total": 175538176,
+      "tokens/train_per_sec_per_gpu": 3665.7,
+      "tokens/trainable": 18687300
+    },
+    {
+      "epoch": 4.273885350318471,
+      "grad_norm": 0.126953125,
+      "learning_rate": 3.1709849255782466e-06,
+      "loss": 0.0017478655790910125,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00175,
+      "step": 1342,
+      "tokens/total": 175669248,
+      "tokens/train_per_sec_per_gpu": 3348.03,
+      "tokens/trainable": 18701322
+    },
+    {
+      "epoch": 4.277070063694268,
+      "grad_norm": 0.12890625,
+      "learning_rate": 3.1439455760744112e-06,
+      "loss": 0.0016232930356636643,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00162,
+      "step": 1343,
+      "tokens/total": 175800320,
+      "tokens/train_per_sec_per_gpu": 3355.1,
+      "tokens/trainable": 18715342
+    },
+    {
+      "epoch": 4.280254777070064,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 3.117014267175275e-06,
+      "loss": 0.0013508808333426714,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00135,
+      "step": 1344,
+      "tokens/total": 175931392,
+      "tokens/train_per_sec_per_gpu": 3356.71,
+      "tokens/trainable": 18729364
+    },
+    {
+      "epoch": 4.2834394904458595,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.0901911320098426e-06,
+      "loss": 0.002793082967400551,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0028,
+      "step": 1345,
+      "tokens/total": 176062464,
+      "tokens/train_per_sec_per_gpu": 3247.41,
+      "tokens/trainable": 18742944
+    },
+    {
+      "epoch": 4.286624203821656,
+      "grad_norm": 0.130859375,
+      "learning_rate": 3.0634763031723882e-06,
+      "loss": 0.0016741371946409345,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00168,
+      "step": 1346,
+      "tokens/total": 176193536,
+      "tokens/train_per_sec_per_gpu": 3132.92,
+      "tokens/trainable": 18756064
+    },
+    {
+      "epoch": 4.289808917197452,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 3.036869912721807e-06,
+      "loss": 0.0012669205898419023,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00127,
+      "step": 1347,
+      "tokens/total": 176324608,
+      "tokens/train_per_sec_per_gpu": 3626.61,
+      "tokens/trainable": 18771148
+    },
+    {
+      "epoch": 4.292993630573249,
+      "grad_norm": 0.095703125,
+      "learning_rate": 3.010372092180941e-06,
+      "loss": 0.0014189573703333735,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00142,
+      "step": 1348,
+      "tokens/total": 176455680,
+      "tokens/train_per_sec_per_gpu": 3173.45,
+      "tokens/trainable": 18784398
+    },
+    {
+      "epoch": 4.296178343949045,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 2.983982972535948e-06,
+      "loss": 0.0028286417946219444,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00283,
+      "step": 1349,
+      "tokens/total": 176586752,
+      "tokens/train_per_sec_per_gpu": 3281.6,
+      "tokens/trainable": 18798140
+    },
+    {
+      "epoch": 4.2993630573248405,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 2.9577026842356527e-06,
+      "loss": 0.002894408069550991,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0029,
+      "step": 1350,
+      "tokens/total": 176717824,
+      "tokens/train_per_sec_per_gpu": 3578.34,
+      "tokens/trainable": 18813032
+    },
+    {
+      "epoch": 4.302547770700637,
+      "grad_norm": 0.154296875,
+      "learning_rate": 2.931531357190881e-06,
+      "loss": 0.0021069981157779694,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00211,
+      "step": 1351,
+      "tokens/total": 176848896,
+      "tokens/train_per_sec_per_gpu": 3439.97,
+      "tokens/trainable": 18827370
+    },
+    {
+      "epoch": 4.305732484076433,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 2.905469120773835e-06,
+      "loss": 0.002290198812261224,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00229,
+      "step": 1352,
+      "tokens/total": 176979968,
+      "tokens/train_per_sec_per_gpu": 3402.53,
+      "tokens/trainable": 18841572
+    },
+    {
+      "epoch": 4.308917197452229,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.8795161038174675e-06,
+      "loss": 0.0023499338421970606,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00235,
+      "step": 1353,
+      "tokens/total": 177111040,
+      "tokens/train_per_sec_per_gpu": 3406.22,
+      "tokens/trainable": 18855776
+    },
+    {
+      "epoch": 4.312101910828026,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 2.853672434614807e-06,
+      "loss": 0.0013938483316451311,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00139,
+      "step": 1354,
+      "tokens/total": 177242112,
+      "tokens/train_per_sec_per_gpu": 3788.18,
+      "tokens/trainable": 18871550
+    },
+    {
+      "epoch": 4.3152866242038215,
+      "grad_norm": 0.1328125,
+      "learning_rate": 2.8279382409183598e-06,
+      "loss": 0.0020433831959962845,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1355,
+      "tokens/total": 177373184,
+      "tokens/train_per_sec_per_gpu": 3264.03,
+      "tokens/trainable": 18885240
+    },
+    {
+      "epoch": 4.318471337579618,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 2.802313649939467e-06,
+      "loss": 0.0011658279690891504,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00117,
+      "step": 1356,
+      "tokens/total": 177504256,
+      "tokens/train_per_sec_per_gpu": 3001.88,
+      "tokens/trainable": 18897798
+    },
+    {
+      "epoch": 4.321656050955414,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 2.7767987883476622e-06,
+      "loss": 0.0021784165874123573,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00218,
+      "step": 1357,
+      "tokens/total": 177635328,
+      "tokens/train_per_sec_per_gpu": 3337.15,
+      "tokens/trainable": 18911724
+    },
+    {
+      "epoch": 4.32484076433121,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 2.7513937822700508e-06,
+      "loss": 0.002125969622284174,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00213,
+      "step": 1358,
+      "tokens/total": 177766400,
+      "tokens/train_per_sec_per_gpu": 3060.55,
+      "tokens/trainable": 18924596
+    },
+    {
+      "epoch": 4.328025477707007,
+      "grad_norm": 0.142578125,
+      "learning_rate": 2.7260987572907153e-06,
+      "loss": 0.0018263484816998243,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00183,
+      "step": 1359,
+      "tokens/total": 177897472,
+      "tokens/train_per_sec_per_gpu": 3272.47,
+      "tokens/trainable": 18938296
+    },
+    {
+      "epoch": 4.3312101910828025,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 2.700913838450042e-06,
+      "loss": 0.0014197917189449072,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00142,
+      "step": 1360,
+      "tokens/total": 178028544,
+      "tokens/train_per_sec_per_gpu": 3245.66,
+      "tokens/trainable": 18951818
+    },
+    {
+      "epoch": 4.334394904458598,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 2.675839150244153e-06,
+      "loss": 0.0016245257575064898,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00163,
+      "step": 1361,
+      "tokens/total": 178159616,
+      "tokens/train_per_sec_per_gpu": 3313.42,
+      "tokens/trainable": 18965684
+    },
+    {
+      "epoch": 4.337579617834395,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 2.650874816624266e-06,
+      "loss": 0.0019813040271401405,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00198,
+      "step": 1362,
+      "tokens/total": 178290688,
+      "tokens/train_per_sec_per_gpu": 3158.46,
+      "tokens/trainable": 18978904
+    },
+    {
+      "epoch": 4.340764331210191,
+      "grad_norm": 0.130859375,
+      "learning_rate": 2.6260209609960757e-06,
+      "loss": 0.0024794619530439377,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00248,
+      "step": 1363,
+      "tokens/total": 178421760,
+      "tokens/train_per_sec_per_gpu": 3507.13,
+      "tokens/trainable": 18993500
+    },
+    {
+      "epoch": 4.343949044585988,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 2.6012777062191547e-06,
+      "loss": 0.002862154971808195,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00287,
+      "step": 1364,
+      "tokens/total": 178552832,
+      "tokens/train_per_sec_per_gpu": 3531.99,
+      "tokens/trainable": 19008250
+    },
+    {
+      "epoch": 4.3471337579617835,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 2.5766451746063598e-06,
+      "loss": 0.0013462984934449196,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00135,
+      "step": 1365,
+      "tokens/total": 178683904,
+      "tokens/train_per_sec_per_gpu": 3460.1,
+      "tokens/trainable": 19022668
+    },
+    {
+      "epoch": 4.350318471337579,
+      "grad_norm": 0.142578125,
+      "learning_rate": 2.5521234879231887e-06,
+      "loss": 0.002731763059273362,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00274,
+      "step": 1366,
+      "tokens/total": 178814976,
+      "tokens/train_per_sec_per_gpu": 3570.61,
+      "tokens/trainable": 19037614
+    },
+    {
+      "epoch": 4.353503184713376,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 2.527712767387222e-06,
+      "loss": 0.0014442024985328317,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00145,
+      "step": 1367,
+      "tokens/total": 178946048,
+      "tokens/train_per_sec_per_gpu": 3629.25,
+      "tokens/trainable": 19052728
+    },
+    {
+      "epoch": 4.356687898089172,
+      "grad_norm": 0.123046875,
+      "learning_rate": 2.5034131336674956e-06,
+      "loss": 0.0018038805574178696,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00181,
+      "step": 1368,
+      "tokens/total": 179077120,
+      "tokens/train_per_sec_per_gpu": 3323.04,
+      "tokens/trainable": 19066614
+    },
+    {
+      "epoch": 4.359872611464968,
+      "grad_norm": 0.177734375,
+      "learning_rate": 2.4792247068839064e-06,
+      "loss": 0.0023225173354148865,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00233,
+      "step": 1369,
+      "tokens/total": 179208192,
+      "tokens/train_per_sec_per_gpu": 3675.14,
+      "tokens/trainable": 19081856
+    },
+    {
+      "epoch": 4.3630573248407645,
+      "grad_norm": 0.158203125,
+      "learning_rate": 2.4551476066066307e-06,
+      "loss": 0.003056393703445792,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00306,
+      "step": 1370,
+      "tokens/total": 179339264,
+      "tokens/train_per_sec_per_gpu": 3829.68,
+      "tokens/trainable": 19097794
+    },
+    {
+      "epoch": 4.36624203821656,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 2.4311819518555295e-06,
+      "loss": 0.0030934589449316263,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0031,
+      "step": 1371,
+      "tokens/total": 179470336,
+      "tokens/train_per_sec_per_gpu": 3060.11,
+      "tokens/trainable": 19110620
+    },
+    {
+      "epoch": 4.369426751592357,
+      "grad_norm": 0.14453125,
+      "learning_rate": 2.407327861099548e-06,
+      "loss": 0.0017585513414815068,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00176,
+      "step": 1372,
+      "tokens/total": 179601408,
+      "tokens/train_per_sec_per_gpu": 3435.78,
+      "tokens/trainable": 19125008
+    },
+    {
+      "epoch": 4.372611464968153,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 2.383585452256146e-06,
+      "loss": 0.0014080167748034,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00141,
+      "step": 1373,
+      "tokens/total": 179732480,
+      "tokens/train_per_sec_per_gpu": 3677.42,
+      "tokens/trainable": 19140292
+    },
+    {
+      "epoch": 4.375796178343949,
+      "grad_norm": 0.12109375,
+      "learning_rate": 2.359954842690712e-06,
+      "loss": 0.0016012933338060975,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1374,
+      "tokens/total": 179863552,
+      "tokens/train_per_sec_per_gpu": 2983.63,
+      "tokens/trainable": 19152852
+    },
+    {
+      "epoch": 4.3789808917197455,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 2.336436149215973e-06,
+      "loss": 0.00259294337593019,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0026,
+      "step": 1375,
+      "tokens/total": 179994624,
+      "tokens/train_per_sec_per_gpu": 3180.68,
+      "tokens/trainable": 19166198
+    },
+    {
+      "epoch": 4.382165605095541,
+      "grad_norm": 0.1171875,
+      "learning_rate": 2.3130294880914173e-06,
+      "loss": 0.0015589562244713306,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00156,
+      "step": 1376,
+      "tokens/total": 180125696,
+      "tokens/train_per_sec_per_gpu": 3318.63,
+      "tokens/trainable": 19180092
+    },
+    {
+      "epoch": 4.385350318471337,
+      "grad_norm": 0.146484375,
+      "learning_rate": 2.289734975022742e-06,
+      "loss": 0.0024165399372577667,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00242,
+      "step": 1377,
+      "tokens/total": 180256768,
+      "tokens/train_per_sec_per_gpu": 3487.98,
+      "tokens/trainable": 19194700
+    },
+    {
+      "epoch": 4.388535031847134,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 2.266552725161247e-06,
+      "loss": 0.0012368856696411967,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00124,
+      "step": 1378,
+      "tokens/total": 180387840,
+      "tokens/train_per_sec_per_gpu": 3407.24,
+      "tokens/trainable": 19208968
+    },
+    {
+      "epoch": 4.39171974522293,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 2.2434828531032988e-06,
+      "loss": 0.002780412556603551,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00278,
+      "step": 1379,
+      "tokens/total": 180518912,
+      "tokens/train_per_sec_per_gpu": 3096.75,
+      "tokens/trainable": 19221936
+    },
+    {
+      "epoch": 4.3949044585987265,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 2.220525472889748e-06,
+      "loss": 0.0017908208537846804,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00179,
+      "step": 1380,
+      "tokens/total": 180649984,
+      "tokens/train_per_sec_per_gpu": 3574.84,
+      "tokens/trainable": 19236852
+    },
+    {
+      "epoch": 4.398089171974522,
+      "grad_norm": 0.150390625,
+      "learning_rate": 2.1976806980053556e-06,
+      "loss": 0.0019308646442368627,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00193,
+      "step": 1381,
+      "tokens/total": 180781056,
+      "tokens/train_per_sec_per_gpu": 3120.85,
+      "tokens/trainable": 19249910
+    },
+    {
+      "epoch": 4.401273885350318,
+      "grad_norm": 0.142578125,
+      "learning_rate": 2.1749486413782437e-06,
+      "loss": 0.001861095312051475,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00186,
+      "step": 1382,
+      "tokens/total": 180912128,
+      "tokens/train_per_sec_per_gpu": 3109.88,
+      "tokens/trainable": 19262932
+    },
+    {
+      "epoch": 4.404458598726115,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 2.1523294153793532e-06,
+      "loss": 0.0020333300344645977,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00204,
+      "step": 1383,
+      "tokens/total": 181043200,
+      "tokens/train_per_sec_per_gpu": 3773.27,
+      "tokens/trainable": 19278666
+    },
+    {
+      "epoch": 4.407643312101911,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 2.129823131821848e-06,
+      "loss": 0.0016740905120968819,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00168,
+      "step": 1384,
+      "tokens/total": 181174272,
+      "tokens/train_per_sec_per_gpu": 3304.58,
+      "tokens/trainable": 19292464
+    },
+    {
+      "epoch": 4.4108280254777075,
+      "grad_norm": 0.134765625,
+      "learning_rate": 2.107429901960603e-06,
+      "loss": 0.0017093883361667395,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00171,
+      "step": 1385,
+      "tokens/total": 181305344,
+      "tokens/train_per_sec_per_gpu": 3359.66,
+      "tokens/trainable": 19306468
+    },
+    {
+      "epoch": 4.414012738853503,
+      "grad_norm": 0.140625,
+      "learning_rate": 2.0851498364916345e-06,
+      "loss": 0.002314978279173374,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00232,
+      "step": 1386,
+      "tokens/total": 181436416,
+      "tokens/train_per_sec_per_gpu": 3591.52,
+      "tokens/trainable": 19321404
+    },
+    {
+      "epoch": 4.417197452229299,
+      "grad_norm": 0.10546875,
+      "learning_rate": 2.062983045551553e-06,
+      "loss": 0.0015969820087775588,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1387,
+      "tokens/total": 181567488,
+      "tokens/train_per_sec_per_gpu": 2748.49,
+      "tokens/trainable": 19334018
+    },
+    {
+      "epoch": 4.420382165605096,
+      "grad_norm": 0.12890625,
+      "learning_rate": 2.0409296387170125e-06,
+      "loss": 0.002041134750470519,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00204,
+      "step": 1388,
+      "tokens/total": 181698560,
+      "tokens/train_per_sec_per_gpu": 3103.87,
+      "tokens/trainable": 19347006
+    },
+    {
+      "epoch": 4.423566878980892,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 2.0189897250041945e-06,
+      "loss": 0.002131557324901223,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00213,
+      "step": 1389,
+      "tokens/total": 181829632,
+      "tokens/train_per_sec_per_gpu": 3187.5,
+      "tokens/trainable": 19360352
+    },
+    {
+      "epoch": 4.426751592356688,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 1.997163412868239e-06,
+      "loss": 0.002050690818578005,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1390,
+      "tokens/total": 181960704,
+      "tokens/train_per_sec_per_gpu": 3264.66,
+      "tokens/trainable": 19374028
+    },
+    {
+      "epoch": 4.429936305732484,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 1.975450810202725e-06,
+      "loss": 0.002214430132880807,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00222,
+      "step": 1391,
+      "tokens/total": 182091776,
+      "tokens/train_per_sec_per_gpu": 3059.02,
+      "tokens/trainable": 19386836
+    },
+    {
+      "epoch": 4.43312101910828,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.953852024339145e-06,
+      "loss": 0.0023007793352007866,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0023,
+      "step": 1392,
+      "tokens/total": 182222848,
+      "tokens/train_per_sec_per_gpu": 3096.57,
+      "tokens/trainable": 19399808
+    },
+    {
+      "epoch": 4.436305732484076,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 1.9323671620463446e-06,
+      "loss": 0.002242110203951597,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00224,
+      "step": 1393,
+      "tokens/total": 182353920,
+      "tokens/train_per_sec_per_gpu": 3424.27,
+      "tokens/trainable": 19414156
+    },
+    {
+      "epoch": 4.439490445859873,
+      "grad_norm": 0.142578125,
+      "learning_rate": 1.9109963295300183e-06,
+      "loss": 0.002074864227324724,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00208,
+      "step": 1394,
+      "tokens/total": 182484992,
+      "tokens/train_per_sec_per_gpu": 2963.93,
+      "tokens/trainable": 19426592
+    },
+    {
+      "epoch": 4.442675159235669,
+      "grad_norm": 0.158203125,
+      "learning_rate": 1.8897396324321914e-06,
+      "loss": 0.0032230939250439405,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00323,
+      "step": 1395,
+      "tokens/total": 182616064,
+      "tokens/train_per_sec_per_gpu": 3361.36,
+      "tokens/trainable": 19440628
+    },
+    {
+      "epoch": 4.445859872611465,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 1.8685971758306691e-06,
+      "loss": 0.0027499471325427294,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00275,
+      "step": 1396,
+      "tokens/total": 182747136,
+      "tokens/train_per_sec_per_gpu": 3511.89,
+      "tokens/trainable": 19455336
+    },
+    {
+      "epoch": 4.449044585987261,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.8475690642385468e-06,
+      "loss": 0.0020744299981743097,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00208,
+      "step": 1397,
+      "tokens/total": 182878208,
+      "tokens/train_per_sec_per_gpu": 3570.48,
+      "tokens/trainable": 19470248
+    },
+    {
+      "epoch": 4.452229299363057,
+      "grad_norm": 0.123046875,
+      "learning_rate": 1.8266554016036803e-06,
+      "loss": 0.0015029326314106584,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0015,
+      "step": 1398,
+      "tokens/total": 183009280,
+      "tokens/train_per_sec_per_gpu": 3170.84,
+      "tokens/trainable": 19483584
+    },
+    {
+      "epoch": 4.455414012738854,
+      "grad_norm": 0.1171875,
+      "learning_rate": 1.805856291308161e-06,
+      "loss": 0.0015301044331863523,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00153,
+      "step": 1399,
+      "tokens/total": 183140352,
+      "tokens/train_per_sec_per_gpu": 3104.15,
+      "tokens/trainable": 19496574
+    },
+    {
+      "epoch": 4.45859872611465,
+      "grad_norm": 0.1328125,
+      "learning_rate": 1.7851718361678205e-06,
+      "loss": 0.0024863574653863907,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00249,
+      "step": 1400,
+      "tokens/total": 183271424,
+      "tokens/train_per_sec_per_gpu": 3049.29,
+      "tokens/trainable": 19509346
+    },
+    {
+      "epoch": 4.461783439490446,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.7646021384317201e-06,
+      "loss": 0.0017364751547574997,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00174,
+      "step": 1401,
+      "tokens/total": 183402496,
+      "tokens/train_per_sec_per_gpu": 2926.42,
+      "tokens/trainable": 19521620
+    },
+    {
+      "epoch": 4.464968152866242,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.7441472997816538e-06,
+      "loss": 0.0021231744904071093,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00213,
+      "step": 1402,
+      "tokens/total": 183533568,
+      "tokens/train_per_sec_per_gpu": 3355.02,
+      "tokens/trainable": 19535636
+    },
+    {
+      "epoch": 4.468152866242038,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 1.7238074213316107e-06,
+      "loss": 0.0017344644293189049,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00174,
+      "step": 1403,
+      "tokens/total": 183664640,
+      "tokens/train_per_sec_per_gpu": 3171.71,
+      "tokens/trainable": 19548924
+    },
+    {
+      "epoch": 4.471337579617835,
+      "grad_norm": 0.119140625,
+      "learning_rate": 1.703582603627321e-06,
+      "loss": 0.0015079887816682458,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00151,
+      "step": 1404,
+      "tokens/total": 183795712,
+      "tokens/train_per_sec_per_gpu": 3552.52,
+      "tokens/trainable": 19563736
+    },
+    {
+      "epoch": 4.474522292993631,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 1.6834729466457256e-06,
+      "loss": 0.0015849830815568566,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00159,
+      "step": 1405,
+      "tokens/total": 183926784,
+      "tokens/train_per_sec_per_gpu": 3232.9,
+      "tokens/trainable": 19577276
+    },
+    {
+      "epoch": 4.477707006369426,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 1.6634785497944922e-06,
+      "loss": 0.002168088685721159,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00217,
+      "step": 1406,
+      "tokens/total": 184057856,
+      "tokens/train_per_sec_per_gpu": 3279.65,
+      "tokens/trainable": 19591000
+    },
+    {
+      "epoch": 4.480891719745223,
+      "grad_norm": 0.15234375,
+      "learning_rate": 1.6435995119115367e-06,
+      "loss": 0.0026027678977698088,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00261,
+      "step": 1407,
+      "tokens/total": 184188928,
+      "tokens/train_per_sec_per_gpu": 3467.93,
+      "tokens/trainable": 19605518
+    },
+    {
+      "epoch": 4.484076433121019,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.6238359312645168e-06,
+      "loss": 0.0017946372972801328,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0018,
+      "step": 1408,
+      "tokens/total": 184320000,
+      "tokens/train_per_sec_per_gpu": 3080.27,
+      "tokens/trainable": 19618436
+    },
+    {
+      "epoch": 4.487261146496815,
+      "grad_norm": 0.126953125,
+      "learning_rate": 1.6041879055503473e-06,
+      "loss": 0.002403007121756673,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 1409,
+      "tokens/total": 184451072,
+      "tokens/train_per_sec_per_gpu": 3474.59,
+      "tokens/trainable": 19632904
+    },
+    {
+      "epoch": 4.490445859872612,
+      "grad_norm": 0.12890625,
+      "learning_rate": 1.5846555318947353e-06,
+      "loss": 0.0019406620413064957,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00194,
+      "step": 1410,
+      "tokens/total": 184582144,
+      "tokens/train_per_sec_per_gpu": 3262.74,
+      "tokens/trainable": 19646574
+    },
+    {
+      "epoch": 4.493630573248407,
+      "grad_norm": 0.123046875,
+      "learning_rate": 1.5652389068516765e-06,
+      "loss": 0.0018433219520375133,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00185,
+      "step": 1411,
+      "tokens/total": 184713216,
+      "tokens/train_per_sec_per_gpu": 3327.14,
+      "tokens/trainable": 19660446
+    },
+    {
+      "epoch": 4.496815286624204,
+      "grad_norm": 0.1171875,
+      "learning_rate": 1.5459381264029904e-06,
+      "loss": 0.0018597168382257223,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00186,
+      "step": 1412,
+      "tokens/total": 184844288,
+      "tokens/train_per_sec_per_gpu": 3955.39,
+      "tokens/trainable": 19676842
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.1328125,
+      "learning_rate": 1.5267532859578437e-06,
+      "loss": 0.0019280803389847279,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00193,
+      "step": 1413,
+      "tokens/total": 184975360,
+      "tokens/train_per_sec_per_gpu": 3216.74,
+      "tokens/trainable": 19690306
+    },
+    {
+      "epoch": 4.5,
+      "eval_loss": 0.010314718820154667,
+      "eval_ppl": 1.01037,
+      "eval_runtime": 41.6339,
+      "eval_samples_per_second": 64.875,
+      "eval_steps_per_second": 4.059,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 1413
+    },
+    {
+      "epoch": 4.503184713375796,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 1.5076844803522922e-06,
+      "loss": 0.0020255008712410927,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00203,
+      "step": 1414,
+      "tokens/total": 185106432,
+      "tokens/train_per_sec_per_gpu": 3418.68,
+      "tokens/trainable": 19704608
+    },
+    {
+      "epoch": 4.506369426751593,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 1.4887318038487752e-06,
+      "loss": 0.0020268706139177084,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00203,
+      "step": 1415,
+      "tokens/total": 185237504,
+      "tokens/train_per_sec_per_gpu": 3371.87,
+      "tokens/trainable": 19718710
+    },
+    {
+      "epoch": 4.509554140127388,
+      "grad_norm": 0.12890625,
+      "learning_rate": 1.4698953501356972e-06,
+      "loss": 0.00200156238861382,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.002,
+      "step": 1416,
+      "tokens/total": 185368576,
+      "tokens/train_per_sec_per_gpu": 3222.46,
+      "tokens/trainable": 19732218
+    },
+    {
+      "epoch": 4.512738853503185,
+      "grad_norm": 0.115234375,
+      "learning_rate": 1.4511752123269245e-06,
+      "loss": 0.0017808325355872512,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00178,
+      "step": 1417,
+      "tokens/total": 185499648,
+      "tokens/train_per_sec_per_gpu": 3317.65,
+      "tokens/trainable": 19746108
+    },
+    {
+      "epoch": 4.515923566878981,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 1.432571482961345e-06,
+      "loss": 0.0017151820939034224,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00172,
+      "step": 1418,
+      "tokens/total": 185630720,
+      "tokens/train_per_sec_per_gpu": 3182.32,
+      "tokens/trainable": 19759432
+    },
+    {
+      "epoch": 4.519108280254777,
+      "grad_norm": 0.17578125,
+      "learning_rate": 1.4140842540024123e-06,
+      "loss": 0.002563396468758583,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00257,
+      "step": 1419,
+      "tokens/total": 185761792,
+      "tokens/train_per_sec_per_gpu": 3137.69,
+      "tokens/trainable": 19772572
+    },
+    {
+      "epoch": 4.522292993630574,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 1.3957136168376822e-06,
+      "loss": 0.0014816210605204105,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00148,
+      "step": 1420,
+      "tokens/total": 185892864,
+      "tokens/train_per_sec_per_gpu": 3152.7,
+      "tokens/trainable": 19785788
+    },
+    {
+      "epoch": 4.525477707006369,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 1.3774596622783604e-06,
+      "loss": 0.0015394608490169048,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00154,
+      "step": 1421,
+      "tokens/total": 186023936,
+      "tokens/train_per_sec_per_gpu": 3228.29,
+      "tokens/trainable": 19799284
+    },
+    {
+      "epoch": 4.528662420382165,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 1.3593224805588722e-06,
+      "loss": 0.0022464555222541094,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00225,
+      "step": 1422,
+      "tokens/total": 186155008,
+      "tokens/train_per_sec_per_gpu": 3587.26,
+      "tokens/trainable": 19814314
+    },
+    {
+      "epoch": 4.531847133757962,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 1.341302161336383e-06,
+      "loss": 0.0008602555026300251,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00086,
+      "step": 1423,
+      "tokens/total": 186286080,
+      "tokens/train_per_sec_per_gpu": 3203.52,
+      "tokens/trainable": 19827672
+    },
+    {
+      "epoch": 4.535031847133758,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 1.3233987936903808e-06,
+      "loss": 0.0015553171979263425,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00156,
+      "step": 1424,
+      "tokens/total": 186417152,
+      "tokens/train_per_sec_per_gpu": 3742.26,
+      "tokens/trainable": 19843252
+    },
+    {
+      "epoch": 4.538216560509554,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 1.3056124661222357e-06,
+      "loss": 0.0025198939256370068,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00252,
+      "step": 1425,
+      "tokens/total": 186548224,
+      "tokens/train_per_sec_per_gpu": 3418.51,
+      "tokens/trainable": 19857490
+    },
+    {
+      "epoch": 4.54140127388535,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 1.2879432665547558e-06,
+      "loss": 0.002200118498876691,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0022,
+      "step": 1426,
+      "tokens/total": 186679296,
+      "tokens/train_per_sec_per_gpu": 3288.15,
+      "tokens/trainable": 19871216
+    },
+    {
+      "epoch": 4.544585987261146,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.27039128233174e-06,
+      "loss": 0.0021478794515132904,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00215,
+      "step": 1427,
+      "tokens/total": 186810368,
+      "tokens/train_per_sec_per_gpu": 3148.41,
+      "tokens/trainable": 19884396
+    },
+    {
+      "epoch": 4.547770700636943,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 1.2529566002175753e-06,
+      "loss": 0.002553946105763316,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00256,
+      "step": 1428,
+      "tokens/total": 186941440,
+      "tokens/train_per_sec_per_gpu": 3403.73,
+      "tokens/trainable": 19898644
+    },
+    {
+      "epoch": 4.550955414012739,
+      "grad_norm": 0.12890625,
+      "learning_rate": 1.2356393063967798e-06,
+      "loss": 0.001968652941286564,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00197,
+      "step": 1429,
+      "tokens/total": 187072512,
+      "tokens/train_per_sec_per_gpu": 3468.59,
+      "tokens/trainable": 19913104
+    },
+    {
+      "epoch": 4.554140127388535,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 1.2184394864735881e-06,
+      "loss": 0.00198244652710855,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00198,
+      "step": 1430,
+      "tokens/total": 187203584,
+      "tokens/train_per_sec_per_gpu": 3461.21,
+      "tokens/trainable": 19927596
+    },
+    {
+      "epoch": 4.557324840764331,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.201357225471536e-06,
+      "loss": 0.0016815853305161,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00168,
+      "step": 1431,
+      "tokens/total": 187334656,
+      "tokens/train_per_sec_per_gpu": 3043.87,
+      "tokens/trainable": 19940348
+    },
+    {
+      "epoch": 4.560509554140127,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 1.184392607833032e-06,
+      "loss": 0.0021309617441147566,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00213,
+      "step": 1432,
+      "tokens/total": 187465728,
+      "tokens/train_per_sec_per_gpu": 3664.84,
+      "tokens/trainable": 19955600
+    },
+    {
+      "epoch": 4.563694267515924,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.1675457174189302e-06,
+      "loss": 0.00207577389664948,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00208,
+      "step": 1433,
+      "tokens/total": 187596800,
+      "tokens/train_per_sec_per_gpu": 3301.91,
+      "tokens/trainable": 19969374
+    },
+    {
+      "epoch": 4.56687898089172,
+      "grad_norm": 0.138671875,
+      "learning_rate": 1.1508166375081424e-06,
+      "loss": 0.0015523422043770552,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00155,
+      "step": 1434,
+      "tokens/total": 187727872,
+      "tokens/train_per_sec_per_gpu": 3163.05,
+      "tokens/trainable": 19982636
+    },
+    {
+      "epoch": 4.570063694267516,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.1342054507971933e-06,
+      "loss": 0.0017875637859106064,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00179,
+      "step": 1435,
+      "tokens/total": 187858944,
+      "tokens/train_per_sec_per_gpu": 3297.76,
+      "tokens/trainable": 19996446
+    },
+    {
+      "epoch": 4.573248407643312,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 1.1177122393998374e-06,
+      "loss": 0.0017204630421474576,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00172,
+      "step": 1436,
+      "tokens/total": 187990016,
+      "tokens/train_per_sec_per_gpu": 3545.62,
+      "tokens/trainable": 20011220
+    },
+    {
+      "epoch": 4.576433121019108,
+      "grad_norm": 0.1171875,
+      "learning_rate": 1.101337084846643e-06,
+      "loss": 0.0016106198308989406,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00161,
+      "step": 1437,
+      "tokens/total": 188121088,
+      "tokens/train_per_sec_per_gpu": 3421.36,
+      "tokens/trainable": 20025440
+    },
+    {
+      "epoch": 4.579617834394904,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 1.0850800680845929e-06,
+      "loss": 0.0017103978898376226,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00171,
+      "step": 1438,
+      "tokens/total": 188252160,
+      "tokens/train_per_sec_per_gpu": 3540.04,
+      "tokens/trainable": 20040228
+    },
+    {
+      "epoch": 4.582802547770701,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 1.0689412694766753e-06,
+      "loss": 0.0013984747929498553,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0014,
+      "step": 1439,
+      "tokens/total": 188383232,
+      "tokens/train_per_sec_per_gpu": 3402.4,
+      "tokens/trainable": 20054474
+    },
+    {
+      "epoch": 4.585987261146497,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 1.0529207688015018e-06,
+      "loss": 0.0012951147509738803,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0013,
+      "step": 1440,
+      "tokens/total": 188514304,
+      "tokens/train_per_sec_per_gpu": 3638.2,
+      "tokens/trainable": 20069706
+    },
+    {
+      "epoch": 4.5891719745222925,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 1.0370186452528935e-06,
+      "loss": 0.0015985879581421614,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1441,
+      "tokens/total": 188645376,
+      "tokens/train_per_sec_per_gpu": 3136.96,
+      "tokens/trainable": 20082850
+    },
+    {
+      "epoch": 4.592356687898089,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 1.021234977439503e-06,
+      "loss": 0.0018211111892014742,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00182,
+      "step": 1442,
+      "tokens/total": 188776448,
+      "tokens/train_per_sec_per_gpu": 3227.61,
+      "tokens/trainable": 20096360
+    },
+    {
+      "epoch": 4.595541401273885,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.0055698433844324e-06,
+      "loss": 0.002404790371656418,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 1443,
+      "tokens/total": 188907520,
+      "tokens/train_per_sec_per_gpu": 3174.22,
+      "tokens/trainable": 20109664
+    },
+    {
+      "epoch": 4.598726114649682,
+      "grad_norm": 0.13671875,
+      "learning_rate": 9.9002332052483e-07,
+      "loss": 0.0017899831291288137,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00179,
+      "step": 1444,
+      "tokens/total": 189038592,
+      "tokens/train_per_sec_per_gpu": 3046.84,
+      "tokens/trainable": 20122424
+    },
+    {
+      "epoch": 4.601910828025478,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 9.745954857115102e-07,
+      "loss": 0.0016956630861386657,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0017,
+      "step": 1445,
+      "tokens/total": 189169664,
+      "tokens/train_per_sec_per_gpu": 3263.68,
+      "tokens/trainable": 20136088
+    },
+    {
+      "epoch": 4.6050955414012735,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 9.592864152085963e-07,
+      "loss": 0.0015517222927883267,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00155,
+      "step": 1446,
+      "tokens/total": 189300736,
+      "tokens/train_per_sec_per_gpu": 3336.28,
+      "tokens/trainable": 20150052
+    },
+    {
+      "epoch": 4.60828025477707,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 9.440961846931107e-07,
+      "loss": 0.0015380029799416661,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00154,
+      "step": 1447,
+      "tokens/total": 189431808,
+      "tokens/train_per_sec_per_gpu": 3506.95,
+      "tokens/trainable": 20164734
+    },
+    {
+      "epoch": 4.611464968152866,
+      "grad_norm": 0.119140625,
+      "learning_rate": 9.290248692546189e-07,
+      "loss": 0.0016031761188060045,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1448,
+      "tokens/total": 189562880,
+      "tokens/train_per_sec_per_gpu": 3064.35,
+      "tokens/trainable": 20177570
+    },
+    {
+      "epoch": 4.614649681528663,
+      "grad_norm": 0.12109375,
+      "learning_rate": 9.140725433948616e-07,
+      "loss": 0.002197918714955449,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0022,
+      "step": 1449,
+      "tokens/total": 189693952,
+      "tokens/train_per_sec_per_gpu": 3280.73,
+      "tokens/trainable": 20191308
+    },
+    {
+      "epoch": 4.617834394904459,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 8.992392810273781e-07,
+      "loss": 0.0015633050352334976,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00156,
+      "step": 1450,
+      "tokens/total": 189825024,
+      "tokens/train_per_sec_per_gpu": 3466.66,
+      "tokens/trainable": 20205824
+    },
+    {
+      "epoch": 4.6210191082802545,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 8.845251554771422e-07,
+      "loss": 0.0020091324113309383,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00201,
+      "step": 1451,
+      "tokens/total": 189956096,
+      "tokens/train_per_sec_per_gpu": 3548.93,
+      "tokens/trainable": 20220694
+    },
+    {
+      "epoch": 4.624203821656051,
+      "grad_norm": 0.10546875,
+      "learning_rate": 8.699302394802016e-07,
+      "loss": 0.0017181969014927745,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00172,
+      "step": 1452,
+      "tokens/total": 190087168,
+      "tokens/train_per_sec_per_gpu": 3437.21,
+      "tokens/trainable": 20235080
+    },
+    {
+      "epoch": 4.627388535031847,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 8.554546051833201e-07,
+      "loss": 0.0018156894948333502,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00182,
+      "step": 1453,
+      "tokens/total": 190218240,
+      "tokens/train_per_sec_per_gpu": 3566.23,
+      "tokens/trainable": 20250016
+    },
+    {
+      "epoch": 4.630573248407643,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 8.410983241436132e-07,
+      "loss": 0.002036329824477434,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00204,
+      "step": 1454,
+      "tokens/total": 190349312,
+      "tokens/train_per_sec_per_gpu": 3178.74,
+      "tokens/trainable": 20263338
+    },
+    {
+      "epoch": 4.63375796178344,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 8.268614673282021e-07,
+      "loss": 0.0012238912750035524,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00122,
+      "step": 1455,
+      "tokens/total": 190480384,
+      "tokens/train_per_sec_per_gpu": 3217.83,
+      "tokens/trainable": 20276824
+    },
+    {
+      "epoch": 4.6369426751592355,
+      "grad_norm": 0.162109375,
+      "learning_rate": 8.127441051138662e-07,
+      "loss": 0.0029940090607851744,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.003,
+      "step": 1456,
+      "tokens/total": 190611456,
+      "tokens/train_per_sec_per_gpu": 3579.23,
+      "tokens/trainable": 20291792
+    },
+    {
+      "epoch": 4.640127388535031,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 7.987463072866852e-07,
+      "loss": 0.001104258350096643,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0011,
+      "step": 1457,
+      "tokens/total": 190742528,
+      "tokens/train_per_sec_per_gpu": 3201.93,
+      "tokens/trainable": 20305198
+    },
+    {
+      "epoch": 4.643312101910828,
+      "grad_norm": 0.126953125,
+      "learning_rate": 7.848681430416948e-07,
+      "loss": 0.0020911027677357197,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00209,
+      "step": 1458,
+      "tokens/total": 190873600,
+      "tokens/train_per_sec_per_gpu": 3356.29,
+      "tokens/trainable": 20319196
+    },
+    {
+      "epoch": 4.646496815286624,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 7.711096809825513e-07,
+      "loss": 0.0017163840821012855,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00172,
+      "step": 1459,
+      "tokens/total": 191004672,
+      "tokens/train_per_sec_per_gpu": 3241.44,
+      "tokens/trainable": 20332736
+    },
+    {
+      "epoch": 4.649681528662421,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 7.574709891211951e-07,
+      "loss": 0.0014391193399205804,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00144,
+      "step": 1460,
+      "tokens/total": 191135744,
+      "tokens/train_per_sec_per_gpu": 2792.75,
+      "tokens/trainable": 20344452
+    },
+    {
+      "epoch": 4.6528662420382165,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 7.439521348774959e-07,
+      "loss": 0.0014456507051363587,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00145,
+      "step": 1461,
+      "tokens/total": 191266816,
+      "tokens/train_per_sec_per_gpu": 3650.92,
+      "tokens/trainable": 20359712
+    },
+    {
+      "epoch": 4.656050955414012,
+      "grad_norm": 0.126953125,
+      "learning_rate": 7.305531850789444e-07,
+      "loss": 0.0015093558467924595,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00151,
+      "step": 1462,
+      "tokens/total": 191397888,
+      "tokens/train_per_sec_per_gpu": 3308.41,
+      "tokens/trainable": 20373528
+    },
+    {
+      "epoch": 4.659235668789809,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 7.17274205960311e-07,
+      "loss": 0.0016126552363857627,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00161,
+      "step": 1463,
+      "tokens/total": 191528960,
+      "tokens/train_per_sec_per_gpu": 3625.83,
+      "tokens/trainable": 20388600
+    },
+    {
+      "epoch": 4.662420382165605,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 7.041152631633075e-07,
+      "loss": 0.0025427560321986675,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00255,
+      "step": 1464,
+      "tokens/total": 191660032,
+      "tokens/train_per_sec_per_gpu": 3430.12,
+      "tokens/trainable": 20402944
+    },
+    {
+      "epoch": 4.665605095541402,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 6.910764217362753e-07,
+      "loss": 0.002073355484753847,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00208,
+      "step": 1465,
+      "tokens/total": 191791104,
+      "tokens/train_per_sec_per_gpu": 3534.12,
+      "tokens/trainable": 20417666
+    },
+    {
+      "epoch": 4.6687898089171975,
+      "grad_norm": 0.15234375,
+      "learning_rate": 6.781577461338673e-07,
+      "loss": 0.0026118066161870956,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00262,
+      "step": 1466,
+      "tokens/total": 191922176,
+      "tokens/train_per_sec_per_gpu": 3348.83,
+      "tokens/trainable": 20431704
+    },
+    {
+      "epoch": 4.671974522292993,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 6.653593002167168e-07,
+      "loss": 0.0018058358691632748,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00181,
+      "step": 1467,
+      "tokens/total": 192053248,
+      "tokens/train_per_sec_per_gpu": 3093.22,
+      "tokens/trainable": 20444646
+    },
+    {
+      "epoch": 4.67515923566879,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 6.526811472511302e-07,
+      "loss": 0.0014479233650490642,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00145,
+      "step": 1468,
+      "tokens/total": 192184320,
+      "tokens/train_per_sec_per_gpu": 3514.53,
+      "tokens/trainable": 20459360
+    },
+    {
+      "epoch": 4.678343949044586,
+      "grad_norm": 0.140625,
+      "learning_rate": 6.40123349908775e-07,
+      "loss": 0.002245377516373992,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00225,
+      "step": 1469,
+      "tokens/total": 192315392,
+      "tokens/train_per_sec_per_gpu": 3310.51,
+      "tokens/trainable": 20473228
+    },
+    {
+      "epoch": 4.681528662420382,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 6.276859702663618e-07,
+      "loss": 0.001856306567788124,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00186,
+      "step": 1470,
+      "tokens/total": 192446464,
+      "tokens/train_per_sec_per_gpu": 3067.13,
+      "tokens/trainable": 20486072
+    },
+    {
+      "epoch": 4.6847133757961785,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 6.153690698053438e-07,
+      "loss": 0.0019508072873577476,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00195,
+      "step": 1471,
+      "tokens/total": 192577536,
+      "tokens/train_per_sec_per_gpu": 3417.64,
+      "tokens/trainable": 20500378
+    },
+    {
+      "epoch": 4.687898089171974,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 6.031727094116175e-07,
+      "loss": 0.0022490478586405516,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00225,
+      "step": 1472,
+      "tokens/total": 192708608,
+      "tokens/train_per_sec_per_gpu": 3687.19,
+      "tokens/trainable": 20515722
+    },
+    {
+      "epoch": 4.69108280254777,
+      "grad_norm": 0.123046875,
+      "learning_rate": 5.910969493752055e-07,
+      "loss": 0.0018782130209729075,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00188,
+      "step": 1473,
+      "tokens/total": 192839680,
+      "tokens/train_per_sec_per_gpu": 3720.3,
+      "tokens/trainable": 20531248
+    },
+    {
+      "epoch": 4.694267515923567,
+      "grad_norm": 0.126953125,
+      "learning_rate": 5.791418493899803e-07,
+      "loss": 0.0018554049311205745,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00186,
+      "step": 1474,
+      "tokens/total": 192970752,
+      "tokens/train_per_sec_per_gpu": 3362.35,
+      "tokens/trainable": 20545276
+    },
+    {
+      "epoch": 4.697452229299363,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 5.673074685533547e-07,
+      "loss": 0.00283794361166656,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00284,
+      "step": 1475,
+      "tokens/total": 193101824,
+      "tokens/train_per_sec_per_gpu": 3107.71,
+      "tokens/trainable": 20558290
+    },
+    {
+      "epoch": 4.7006369426751595,
+      "grad_norm": 0.115234375,
+      "learning_rate": 5.555938653659859e-07,
+      "loss": 0.0015586434165015817,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00156,
+      "step": 1476,
+      "tokens/total": 193232896,
+      "tokens/train_per_sec_per_gpu": 3495.85,
+      "tokens/trainable": 20572836
+    },
+    {
+      "epoch": 4.703821656050955,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 5.440010977315003e-07,
+      "loss": 0.002725705737248063,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00273,
+      "step": 1477,
+      "tokens/total": 193363968,
+      "tokens/train_per_sec_per_gpu": 2926.02,
+      "tokens/trainable": 20585176
+    },
+    {
+      "epoch": 4.707006369426751,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 5.32529222956199e-07,
+      "loss": 0.003224026644602418,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00323,
+      "step": 1478,
+      "tokens/total": 193495040,
+      "tokens/train_per_sec_per_gpu": 3124.75,
+      "tokens/trainable": 20598252
+    },
+    {
+      "epoch": 4.710191082802548,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 5.211782977487728e-07,
+      "loss": 0.0022572882007807493,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00226,
+      "step": 1479,
+      "tokens/total": 193626112,
+      "tokens/train_per_sec_per_gpu": 3719.62,
+      "tokens/trainable": 20613736
+    },
+    {
+      "epoch": 4.713375796178344,
+      "grad_norm": 0.126953125,
+      "learning_rate": 5.099483782200321e-07,
+      "loss": 0.0020106916781514883,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00201,
+      "step": 1480,
+      "tokens/total": 193757184,
+      "tokens/train_per_sec_per_gpu": 3386.3,
+      "tokens/trainable": 20627916
+    },
+    {
+      "epoch": 4.7165605095541405,
+      "grad_norm": 0.150390625,
+      "learning_rate": 4.988395198826157e-07,
+      "loss": 0.002159472554922104,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00216,
+      "step": 1481,
+      "tokens/total": 193888256,
+      "tokens/train_per_sec_per_gpu": 3132.67,
+      "tokens/trainable": 20641036
+    },
+    {
+      "epoch": 4.719745222929936,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 4.878517776507247e-07,
+      "loss": 0.0026867706328630447,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00269,
+      "step": 1482,
+      "tokens/total": 194019328,
+      "tokens/train_per_sec_per_gpu": 3359.56,
+      "tokens/trainable": 20655048
+    },
+    {
+      "epoch": 4.722929936305732,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 4.7698520583985e-07,
+      "loss": 0.0017674706177785993,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00177,
+      "step": 1483,
+      "tokens/total": 194150400,
+      "tokens/train_per_sec_per_gpu": 3297.46,
+      "tokens/trainable": 20668772
+    },
+    {
+      "epoch": 4.726114649681529,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 4.662398581665006e-07,
+      "loss": 0.0014837021008133888,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00148,
+      "step": 1484,
+      "tokens/total": 194281472,
+      "tokens/train_per_sec_per_gpu": 3494.12,
+      "tokens/trainable": 20683348
+    },
+    {
+      "epoch": 4.729299363057325,
+      "grad_norm": 0.123046875,
+      "learning_rate": 4.5561578774794276e-07,
+      "loss": 0.0021369662135839462,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00214,
+      "step": 1485,
+      "tokens/total": 194412544,
+      "tokens/train_per_sec_per_gpu": 3607.71,
+      "tokens/trainable": 20698316
+    },
+    {
+      "epoch": 4.732484076433121,
+      "grad_norm": 0.158203125,
+      "learning_rate": 4.45113047101936e-07,
+      "loss": 0.002360973972827196,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00236,
+      "step": 1486,
+      "tokens/total": 194543616,
+      "tokens/train_per_sec_per_gpu": 3584.68,
+      "tokens/trainable": 20713220
+    },
+    {
+      "epoch": 4.735668789808917,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 4.3473168814647525e-07,
+      "loss": 0.0015863839071244001,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00159,
+      "step": 1487,
+      "tokens/total": 194674688,
+      "tokens/train_per_sec_per_gpu": 3400.0,
+      "tokens/trainable": 20727406
+    },
+    {
+      "epoch": 4.738853503184713,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 4.24471762199527e-07,
+      "loss": 0.0016582348616793752,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00166,
+      "step": 1488,
+      "tokens/total": 194805760,
+      "tokens/train_per_sec_per_gpu": 3258.7,
+      "tokens/trainable": 20741076
+    },
+    {
+      "epoch": 4.742038216560509,
+      "grad_norm": 0.1328125,
+      "learning_rate": 4.143333199787769e-07,
+      "loss": 0.00176681496668607,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00177,
+      "step": 1489,
+      "tokens/total": 194936832,
+      "tokens/train_per_sec_per_gpu": 2999.18,
+      "tokens/trainable": 20753696
+    },
+    {
+      "epoch": 4.745222929936306,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 4.0431641160139367e-07,
+      "loss": 0.002107662847265601,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00211,
+      "step": 1490,
+      "tokens/total": 195067904,
+      "tokens/train_per_sec_per_gpu": 3418.27,
+      "tokens/trainable": 20767910
+    },
+    {
+      "epoch": 4.748407643312102,
+      "grad_norm": 0.140625,
+      "learning_rate": 3.944210865837572e-07,
+      "loss": 0.0021030758507549763,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00211,
+      "step": 1491,
+      "tokens/total": 195198976,
+      "tokens/train_per_sec_per_gpu": 3144.49,
+      "tokens/trainable": 20781092
+    },
+    {
+      "epoch": 4.751592356687898,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 3.846473938412365e-07,
+      "loss": 0.0020006736740469933,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.002,
+      "step": 1492,
+      "tokens/total": 195330048,
+      "tokens/train_per_sec_per_gpu": 3535.97,
+      "tokens/trainable": 20795832
+    },
+    {
+      "epoch": 4.754777070063694,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 3.749953816879398e-07,
+      "loss": 0.001961378613486886,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00196,
+      "step": 1493,
+      "tokens/total": 195461120,
+      "tokens/train_per_sec_per_gpu": 3395.84,
+      "tokens/trainable": 20810046
+    },
+    {
+      "epoch": 4.757961783439491,
+      "grad_norm": 0.1328125,
+      "learning_rate": 3.654650978364649e-07,
+      "loss": 0.0024665065575391054,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00247,
+      "step": 1494,
+      "tokens/total": 195592192,
+      "tokens/train_per_sec_per_gpu": 3389.65,
+      "tokens/trainable": 20824160
+    },
+    {
+      "epoch": 4.761146496815287,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 3.560565893976742e-07,
+      "loss": 0.0024471194483339787,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00245,
+      "step": 1495,
+      "tokens/total": 195723264,
+      "tokens/train_per_sec_per_gpu": 3146.42,
+      "tokens/trainable": 20837344
+    },
+    {
+      "epoch": 4.764331210191083,
+      "grad_norm": 0.201171875,
+      "learning_rate": 3.467699028804672e-07,
+      "loss": 0.003118871245533228,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00312,
+      "step": 1496,
+      "tokens/total": 195854336,
+      "tokens/train_per_sec_per_gpu": 3068.73,
+      "tokens/trainable": 20850148
+    },
+    {
+      "epoch": 4.767515923566879,
+      "grad_norm": 0.146484375,
+      "learning_rate": 3.376050841915335e-07,
+      "loss": 0.0028909991960972548,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0029,
+      "step": 1497,
+      "tokens/total": 195985408,
+      "tokens/train_per_sec_per_gpu": 3299.09,
+      "tokens/trainable": 20863890
+    },
+    {
+      "epoch": 4.770700636942675,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 3.2856217863514727e-07,
+      "loss": 0.001599812414497137,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1498,
+      "tokens/total": 196116480,
+      "tokens/train_per_sec_per_gpu": 3543.45,
+      "tokens/trainable": 20878628
+    },
+    {
+      "epoch": 4.773885350318471,
+      "grad_norm": 0.158203125,
+      "learning_rate": 3.1964123091292595e-07,
+      "loss": 0.0027794367633759975,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00278,
+      "step": 1499,
+      "tokens/total": 196247552,
+      "tokens/train_per_sec_per_gpu": 3411.09,
+      "tokens/trainable": 20892812
+    },
+    {
+      "epoch": 4.777070063694268,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 3.108422851236137e-07,
+      "loss": 0.0011374036548659205,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00114,
+      "step": 1500,
+      "tokens/total": 196378624,
+      "tokens/train_per_sec_per_gpu": 3329.43,
+      "tokens/trainable": 20906680
+    },
+    {
+      "epoch": 4.780254777070064,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 3.0216538476286196e-07,
+      "loss": 0.0018032776424661279,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0018,
+      "step": 1501,
+      "tokens/total": 196509696,
+      "tokens/train_per_sec_per_gpu": 3433.1,
+      "tokens/trainable": 20920932
+    },
+    {
+      "epoch": 4.7834394904458595,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 2.936105727230298e-07,
+      "loss": 0.0027445517480373383,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00275,
+      "step": 1502,
+      "tokens/total": 196640768,
+      "tokens/train_per_sec_per_gpu": 3841.28,
+      "tokens/trainable": 20936896
+    },
+    {
+      "epoch": 4.786624203821656,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 2.851778912929426e-07,
+      "loss": 0.001024644705466926,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00103,
+      "step": 1503,
+      "tokens/total": 196771840,
+      "tokens/train_per_sec_per_gpu": 3599.56,
+      "tokens/trainable": 20951852
+    },
+    {
+      "epoch": 4.789808917197452,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 2.768673821577167e-07,
+      "loss": 0.0011879701633006334,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00119,
+      "step": 1504,
+      "tokens/total": 196902912,
+      "tokens/train_per_sec_per_gpu": 3009.96,
+      "tokens/trainable": 20964468
+    },
+    {
+      "epoch": 4.792993630573249,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 2.6867908639852944e-07,
+      "loss": 0.0033508751075714827,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00336,
+      "step": 1505,
+      "tokens/total": 197033984,
+      "tokens/train_per_sec_per_gpu": 3536.08,
+      "tokens/trainable": 20979168
+    },
+    {
+      "epoch": 4.796178343949045,
+      "grad_norm": 0.166015625,
+      "learning_rate": 2.6061304449241655e-07,
+      "loss": 0.0030738934874534607,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00308,
+      "step": 1506,
+      "tokens/total": 197165056,
+      "tokens/train_per_sec_per_gpu": 2876.89,
+      "tokens/trainable": 20991260
+    },
+    {
+      "epoch": 4.7993630573248405,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 2.526692963120858e-07,
+      "loss": 0.002285804832354188,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00229,
+      "step": 1507,
+      "tokens/total": 197296128,
+      "tokens/train_per_sec_per_gpu": 3400.76,
+      "tokens/trainable": 21005408
+    },
+    {
+      "epoch": 4.802547770700637,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 2.448478811257149e-07,
+      "loss": 0.002408439526334405,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00241,
+      "step": 1508,
+      "tokens/total": 197427200,
+      "tokens/train_per_sec_per_gpu": 3679.66,
+      "tokens/trainable": 21020666
+    },
+    {
+      "epoch": 4.805732484076433,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 2.3714883759674566e-07,
+      "loss": 0.0013570735463872552,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00136,
+      "step": 1509,
+      "tokens/total": 197558272,
+      "tokens/train_per_sec_per_gpu": 3549.55,
+      "tokens/trainable": 21035448
+    },
+    {
+      "epoch": 4.80891719745223,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 2.295722037837178e-07,
+      "loss": 0.0017048909794539213,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00171,
+      "step": 1510,
+      "tokens/total": 197689344,
+      "tokens/train_per_sec_per_gpu": 2969.31,
+      "tokens/trainable": 21047892
+    },
+    {
+      "epoch": 4.812101910828026,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 2.2211801714004942e-07,
+      "loss": 0.0012713008327409625,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00127,
+      "step": 1511,
+      "tokens/total": 197820416,
+      "tokens/train_per_sec_per_gpu": 3589.68,
+      "tokens/trainable": 21062844
+    },
+    {
+      "epoch": 4.8152866242038215,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 2.1478631451387898e-07,
+      "loss": 0.002427282277494669,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00243,
+      "step": 1512,
+      "tokens/total": 197951488,
+      "tokens/train_per_sec_per_gpu": 3365.34,
+      "tokens/trainable": 21076902
+    },
+    {
+      "epoch": 4.818471337579618,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 2.0757713214786533e-07,
+      "loss": 0.0020946285221725702,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0021,
+      "step": 1513,
+      "tokens/total": 198082560,
+      "tokens/train_per_sec_per_gpu": 3593.33,
+      "tokens/trainable": 21091820
+    },
+    {
+      "epoch": 4.821656050955414,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 2.0049050567902128e-07,
+      "loss": 0.0015513665275648236,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00155,
+      "step": 1514,
+      "tokens/total": 198213632,
+      "tokens/train_per_sec_per_gpu": 3683.66,
+      "tokens/trainable": 21107128
+    },
+    {
+      "epoch": 4.82484076433121,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.9352647013852477e-07,
+      "loss": 0.001911777420900762,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00191,
+      "step": 1515,
+      "tokens/total": 198344704,
+      "tokens/train_per_sec_per_gpu": 3288.64,
+      "tokens/trainable": 21120836
+    },
+    {
+      "epoch": 4.828025477707007,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 1.8668505995155515e-07,
+      "loss": 0.0022345585748553276,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00224,
+      "step": 1516,
+      "tokens/total": 198475776,
+      "tokens/train_per_sec_per_gpu": 3563.74,
+      "tokens/trainable": 21135672
+    },
+    {
+      "epoch": 4.8312101910828025,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.7996630893712675e-07,
+      "loss": 0.0015912681119516492,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00159,
+      "step": 1517,
+      "tokens/total": 198606848,
+      "tokens/train_per_sec_per_gpu": 3421.44,
+      "tokens/trainable": 21149896
+    },
+    {
+      "epoch": 4.834394904458598,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 1.7337025030790543e-07,
+      "loss": 0.0015856210375204682,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00159,
+      "step": 1518,
+      "tokens/total": 198737920,
+      "tokens/train_per_sec_per_gpu": 2977.95,
+      "tokens/trainable": 21162350
+    },
+    {
+      "epoch": 4.837579617834395,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 1.6689691667005902e-07,
+      "loss": 0.0021609310060739517,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00216,
+      "step": 1519,
+      "tokens/total": 198868992,
+      "tokens/train_per_sec_per_gpu": 3149.82,
+      "tokens/trainable": 21175516
+    },
+    {
+      "epoch": 4.840764331210191,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 1.6054634002309054e-07,
+      "loss": 0.0015277141937986016,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00153,
+      "step": 1520,
+      "tokens/total": 199000064,
+      "tokens/train_per_sec_per_gpu": 3155.1,
+      "tokens/trainable": 21188692
+    },
+    {
+      "epoch": 4.843949044585988,
+      "grad_norm": 0.162109375,
+      "learning_rate": 1.5431855175968014e-07,
+      "loss": 0.002204909920692444,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00221,
+      "step": 1521,
+      "tokens/total": 199131136,
+      "tokens/train_per_sec_per_gpu": 3326.25,
+      "tokens/trainable": 21202538
+    },
+    {
+      "epoch": 4.8471337579617835,
+      "grad_norm": 0.138671875,
+      "learning_rate": 1.4821358266553231e-07,
+      "loss": 0.002712359419092536,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00272,
+      "step": 1522,
+      "tokens/total": 199262208,
+      "tokens/train_per_sec_per_gpu": 3457.57,
+      "tokens/trainable": 21216952
+    },
+    {
+      "epoch": 4.850318471337579,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 1.4223146291922062e-07,
+      "loss": 0.0019022361375391483,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0019,
+      "step": 1523,
+      "tokens/total": 199393280,
+      "tokens/train_per_sec_per_gpu": 3419.62,
+      "tokens/trainable": 21231184
+    },
+    {
+      "epoch": 4.853503184713376,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 1.3637222209204327e-07,
+      "loss": 0.0018241211073473096,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00183,
+      "step": 1524,
+      "tokens/total": 199524352,
+      "tokens/train_per_sec_per_gpu": 3139.7,
+      "tokens/trainable": 21244294
+    },
+    {
+      "epoch": 4.856687898089172,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 1.3063588914786207e-07,
+      "loss": 0.001210428192280233,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00121,
+      "step": 1525,
+      "tokens/total": 199655424,
+      "tokens/train_per_sec_per_gpu": 3122.56,
+      "tokens/trainable": 21257322
+    },
+    {
+      "epoch": 4.859872611464969,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 1.250224924429888e-07,
+      "loss": 0.0014607764314860106,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00146,
+      "step": 1526,
+      "tokens/total": 199786496,
+      "tokens/train_per_sec_per_gpu": 3032.1,
+      "tokens/trainable": 21269998
+    },
+    {
+      "epoch": 4.8630573248407645,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 1.1953205972601022e-07,
+      "loss": 0.002046809531748295,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1527,
+      "tokens/total": 199917568,
+      "tokens/train_per_sec_per_gpu": 3360.03,
+      "tokens/trainable": 21284056
+    },
+    {
+      "epoch": 4.86624203821656,
+      "grad_norm": 0.140625,
+      "learning_rate": 1.1416461813767709e-07,
+      "loss": 0.002186344237998128,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00219,
+      "step": 1528,
+      "tokens/total": 200048640,
+      "tokens/train_per_sec_per_gpu": 3252.14,
+      "tokens/trainable": 21297784
+    },
+    {
+      "epoch": 4.869426751592357,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.0892019421075706e-07,
+      "loss": 0.002091720700263977,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00209,
+      "step": 1529,
+      "tokens/total": 200179712,
+      "tokens/train_per_sec_per_gpu": 3436.08,
+      "tokens/trainable": 21312112
+    },
+    {
+      "epoch": 4.872611464968153,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 1.0379881386990974e-07,
+      "loss": 0.001913387910462916,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00192,
+      "step": 1530,
+      "tokens/total": 200310784,
+      "tokens/train_per_sec_per_gpu": 3187.83,
+      "tokens/trainable": 21325434
+    },
+    {
+      "epoch": 4.875796178343949,
+      "grad_norm": 0.154296875,
+      "learning_rate": 9.880050243155359e-08,
+      "loss": 0.0024429503828287125,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00245,
+      "step": 1531,
+      "tokens/total": 200441856,
+      "tokens/train_per_sec_per_gpu": 3182.45,
+      "tokens/trainable": 21338740
+    },
+    {
+      "epoch": 4.8789808917197455,
+      "grad_norm": 0.130859375,
+      "learning_rate": 9.392528460374362e-08,
+      "loss": 0.0016927801771089435,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00169,
+      "step": 1532,
+      "tokens/total": 200572928,
+      "tokens/train_per_sec_per_gpu": 3158.89,
+      "tokens/trainable": 21351978
+    },
+    {
+      "epoch": 4.882165605095541,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 8.917318448604661e-08,
+      "loss": 0.0016512478468939662,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00165,
+      "step": 1533,
+      "tokens/total": 200704000,
+      "tokens/train_per_sec_per_gpu": 3721.88,
+      "tokens/trainable": 21367448
+    },
+    {
+      "epoch": 4.885350318471337,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 8.454422556942454e-08,
+      "loss": 0.0020441263914108276,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1534,
+      "tokens/total": 200835072,
+      "tokens/train_per_sec_per_gpu": 3451.49,
+      "tokens/trainable": 21381804
+    },
+    {
+      "epoch": 4.888535031847134,
+      "grad_norm": 0.12890625,
+      "learning_rate": 8.003843073612627e-08,
+      "loss": 0.0019288958283141255,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00193,
+      "step": 1535,
+      "tokens/total": 200966144,
+      "tokens/train_per_sec_per_gpu": 3544.84,
+      "tokens/trainable": 21396546
+    },
+    {
+      "epoch": 4.89171974522293,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 7.565582225955158e-08,
+      "loss": 0.0020149427000433207,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00202,
+      "step": 1536,
+      "tokens/total": 201097216,
+      "tokens/train_per_sec_per_gpu": 3206.12,
+      "tokens/trainable": 21410012
+    },
+    {
+      "epoch": 4.8949044585987265,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 7.139642180416517e-08,
+      "loss": 0.00250299577601254,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00251,
+      "step": 1537,
+      "tokens/total": 201228288,
+      "tokens/train_per_sec_per_gpu": 3708.05,
+      "tokens/trainable": 21425414
+    },
+    {
+      "epoch": 4.898089171974522,
+      "grad_norm": 0.158203125,
+      "learning_rate": 6.726025042537721e-08,
+      "loss": 0.002223816467449069,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1538,
+      "tokens/total": 201359360,
+      "tokens/train_per_sec_per_gpu": 3310.1,
+      "tokens/trainable": 21439244
+    },
+    {
+      "epoch": 4.901273885350318,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 6.324732856944349e-08,
+      "loss": 0.002602557884529233,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00261,
+      "step": 1539,
+      "tokens/total": 201490432,
+      "tokens/train_per_sec_per_gpu": 3309.88,
+      "tokens/trainable": 21453112
+    },
+    {
+      "epoch": 4.904458598726115,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 5.935767607336273e-08,
+      "loss": 0.0018828021129593253,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00188,
+      "step": 1540,
+      "tokens/total": 201621504,
+      "tokens/train_per_sec_per_gpu": 3360.0,
+      "tokens/trainable": 21467172
+    },
+    {
+      "epoch": 4.907643312101911,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 5.5591312164776646e-08,
+      "loss": 0.0018976753344759345,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0019,
+      "step": 1541,
+      "tokens/total": 201752576,
+      "tokens/train_per_sec_per_gpu": 3485.96,
+      "tokens/trainable": 21481770
+    },
+    {
+      "epoch": 4.9108280254777075,
+      "grad_norm": 0.126953125,
+      "learning_rate": 5.194825546187831e-08,
+      "loss": 0.0018805229337885976,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00188,
+      "step": 1542,
+      "tokens/total": 201883648,
+      "tokens/train_per_sec_per_gpu": 3369.24,
+      "tokens/trainable": 21495882
+    },
+    {
+      "epoch": 4.914012738853503,
+      "grad_norm": 0.1171875,
+      "learning_rate": 4.84285239733151e-08,
+      "loss": 0.0020450761076062918,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1543,
+      "tokens/total": 202014720,
+      "tokens/train_per_sec_per_gpu": 3212.2,
+      "tokens/trainable": 21509344
+    },
+    {
+      "epoch": 4.917197452229299,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 4.503213509811088e-08,
+      "loss": 0.00226628128439188,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00227,
+      "step": 1544,
+      "tokens/total": 202145792,
+      "tokens/train_per_sec_per_gpu": 3154.8,
+      "tokens/trainable": 21522558
+    },
+    {
+      "epoch": 4.920382165605096,
+      "grad_norm": 0.125,
+      "learning_rate": 4.175910562556895e-08,
+      "loss": 0.0018778032390400767,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00188,
+      "step": 1545,
+      "tokens/total": 202276864,
+      "tokens/train_per_sec_per_gpu": 3493.16,
+      "tokens/trainable": 21537180
+    },
+    {
+      "epoch": 4.923566878980892,
+      "grad_norm": 0.134765625,
+      "learning_rate": 3.860945173518593e-08,
+      "loss": 0.0019706811290234327,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00197,
+      "step": 1546,
+      "tokens/total": 202407936,
+      "tokens/train_per_sec_per_gpu": 3480.51,
+      "tokens/trainable": 21551752
+    },
+    {
+      "epoch": 4.926751592356688,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 3.5583188996587965e-08,
+      "loss": 0.001993852434679866,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.002,
+      "step": 1547,
+      "tokens/total": 202539008,
+      "tokens/train_per_sec_per_gpu": 3231.61,
+      "tokens/trainable": 21565284
+    },
+    {
+      "epoch": 4.929936305732484,
+      "grad_norm": 0.12890625,
+      "learning_rate": 3.26803323694419e-08,
+      "loss": 0.0025727523025125265,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00258,
+      "step": 1548,
+      "tokens/total": 202670080,
+      "tokens/train_per_sec_per_gpu": 3483.78,
+      "tokens/trainable": 21579872
+    },
+    {
+      "epoch": 4.93312101910828,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 2.990089620337755e-08,
+      "loss": 0.00160361104644835,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1549,
+      "tokens/total": 202801152,
+      "tokens/train_per_sec_per_gpu": 3091.07,
+      "tokens/trainable": 21592838
+    },
+    {
+      "epoch": 4.936305732484076,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 2.724489423792942e-08,
+      "loss": 0.002017256570979953,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00202,
+      "step": 1550,
+      "tokens/total": 202932224,
+      "tokens/train_per_sec_per_gpu": 3197.05,
+      "tokens/trainable": 21606214
+    },
+    {
+      "epoch": 4.939490445859873,
+      "grad_norm": 0.16015625,
+      "learning_rate": 2.4712339602461774e-08,
+      "loss": 0.0018039483111351728,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00181,
+      "step": 1551,
+      "tokens/total": 203063296,
+      "tokens/train_per_sec_per_gpu": 3119.44,
+      "tokens/trainable": 21619296
+    },
+    {
+      "epoch": 4.942675159235669,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 2.2303244816099244e-08,
+      "loss": 0.001978665590286255,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00198,
+      "step": 1552,
+      "tokens/total": 203194368,
+      "tokens/train_per_sec_per_gpu": 2961.33,
+      "tokens/trainable": 21631746
+    },
+    {
+      "epoch": 4.945859872611465,
+      "grad_norm": 0.130859375,
+      "learning_rate": 2.0017621787671303e-08,
+      "loss": 0.0023562528658658266,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00236,
+      "step": 1553,
+      "tokens/total": 203325440,
+      "tokens/train_per_sec_per_gpu": 3105.93,
+      "tokens/trainable": 21644684
+    },
+    {
+      "epoch": 4.949044585987261,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.7855481815659546e-08,
+      "loss": 0.0023984115105122328,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0024,
+      "step": 1554,
+      "tokens/total": 203456512,
+      "tokens/train_per_sec_per_gpu": 3024.15,
+      "tokens/trainable": 21657424
+    },
+    {
+      "epoch": 4.952229299363057,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 1.5816835588122748e-08,
+      "loss": 0.0020472980104386806,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00205,
+      "step": 1555,
+      "tokens/total": 203587584,
+      "tokens/train_per_sec_per_gpu": 3146.82,
+      "tokens/trainable": 21670582
+    },
+    {
+      "epoch": 4.955414012738854,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 1.3901693182660768e-08,
+      "loss": 0.002163731260225177,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00217,
+      "step": 1556,
+      "tokens/total": 203718656,
+      "tokens/train_per_sec_per_gpu": 3104.79,
+      "tokens/trainable": 21683604
+    },
+    {
+      "epoch": 4.95859872611465,
+      "grad_norm": 0.119140625,
+      "learning_rate": 1.2110064066361836e-08,
+      "loss": 0.002627151319757104,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00263,
+      "step": 1557,
+      "tokens/total": 203849728,
+      "tokens/train_per_sec_per_gpu": 3130.19,
+      "tokens/trainable": 21696704
+    },
+    {
+      "epoch": 4.961783439490446,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 1.0441957095752574e-08,
+      "loss": 0.0015952385729178786,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0016,
+      "step": 1558,
+      "tokens/total": 203980800,
+      "tokens/train_per_sec_per_gpu": 3351.63,
+      "tokens/trainable": 21710708
+    },
+    {
+      "epoch": 4.964968152866242,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 8.897380516748044e-09,
+      "loss": 0.002128974301740527,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00213,
+      "step": 1559,
+      "tokens/total": 204111872,
+      "tokens/train_per_sec_per_gpu": 3197.67,
+      "tokens/trainable": 21724200
+    },
+    {
+      "epoch": 4.968152866242038,
+      "grad_norm": 0.107421875,
+      "learning_rate": 7.476341964626766e-09,
+      "loss": 0.0021477844566106796,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00215,
+      "step": 1560,
+      "tokens/total": 204242944,
+      "tokens/train_per_sec_per_gpu": 3209.08,
+      "tokens/trainable": 21737668
+    },
+    {
+      "epoch": 4.971337579617835,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 6.178848463980758e-09,
+      "loss": 0.00202268292196095,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00202,
+      "step": 1561,
+      "tokens/total": 204374016,
+      "tokens/train_per_sec_per_gpu": 3300.82,
+      "tokens/trainable": 21751480
+    },
+    {
+      "epoch": 4.974522292993631,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 5.004906428685008e-09,
+      "loss": 0.0012176063610240817,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00122,
+      "step": 1562,
+      "tokens/total": 204505088,
+      "tokens/train_per_sec_per_gpu": 3387.1,
+      "tokens/trainable": 21765732
+    },
+    {
+      "epoch": 4.977707006369426,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 3.954521661861388e-09,
+      "loss": 0.0024965633638203144,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.0025,
+      "step": 1563,
+      "tokens/total": 204636160,
+      "tokens/train_per_sec_per_gpu": 3190.66,
+      "tokens/trainable": 21779126
+    },
+    {
+      "epoch": 4.980891719745223,
+      "grad_norm": 0.142578125,
+      "learning_rate": 3.027699355859226e-09,
+      "loss": 0.0016142029780894518,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00162,
+      "step": 1564,
+      "tokens/total": 204767232,
+      "tokens/train_per_sec_per_gpu": 3249.58,
+      "tokens/trainable": 21792812
+    },
+    {
+      "epoch": 4.984076433121019,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 2.2244440922164487e-09,
+      "loss": 0.0019386119674891233,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00194,
+      "step": 1565,
+      "tokens/total": 204898304,
+      "tokens/train_per_sec_per_gpu": 3286.69,
+      "tokens/trainable": 21806628
+    },
+    {
+      "epoch": 4.987261146496815,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 1.544759841654031e-09,
+      "loss": 0.001636000582948327,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00164,
+      "step": 1566,
+      "tokens/total": 205029376,
+      "tokens/train_per_sec_per_gpu": 3407.55,
+      "tokens/trainable": 21820912
+    },
+    {
+      "epoch": 4.990445859872612,
+      "grad_norm": 0.130859375,
+      "learning_rate": 9.886499640399116e-10,
+      "loss": 0.0022515307646244764,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00225,
+      "step": 1567,
+      "tokens/total": 205160448,
+      "tokens/train_per_sec_per_gpu": 3312.13,
+      "tokens/trainable": 21834766
+    },
+    {
+      "epoch": 4.993630573248407,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 5.561172083806688e-10,
+      "loss": 0.0021433548536151648,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00215,
+      "step": 1568,
+      "tokens/total": 205291520,
+      "tokens/train_per_sec_per_gpu": 3715.64,
+      "tokens/trainable": 21850308
+    },
+    {
+      "epoch": 4.996815286624204,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 2.4716371280764093e-10,
+      "loss": 0.002227420685812831,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 64.64,
+      "memory/max_allocated (GiB)": 64.64,
+      "ppl": 1.00223,
+      "step": 1569,
+      "tokens/total": 205422592,
+      "tokens/train_per_sec_per_gpu": 3672.72,
+      "tokens/trainable": 21865620
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 6.179100456582543e-11,
+      "loss": 0.002160376403480768,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 39.25,
+      "memory/max_allocated (GiB)": 39.25,
+      "ppl": 1.00216,
+      "step": 1570,
+      "tokens/total": 205496320,
+      "tokens/train_per_sec_per_gpu": 3367.0,
+      "tokens/trainable": 21873388
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.010312405414879322,
+      "eval_ppl": 1.01037,
+      "eval_runtime": 41.6326,
+      "eval_samples_per_second": 64.877,
+      "eval_steps_per_second": 4.059,
+      "memory/device_reserved (GiB)": 74.81,
+      "memory/max_active (GiB)": 54.61,
+      "memory/max_allocated (GiB)": 54.61,
+      "step": 1570
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1570,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 314,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2775166334468096e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}