IbrahimSalah
/

Arabic-TTS-Spark

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 44.270833333333336,
+  "eval_steps": 576,
+  "global_step": 25500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.001736111111111111,
+      "eval_loss": 9.594602584838867,
+      "eval_runtime": 41.3373,
+      "eval_samples_per_second": 90.209,
+      "eval_steps_per_second": 5.661,
+      "step": 1
+    },
+    {
+      "epoch": 0.08680555555555555,
+      "grad_norm": 12.75,
+      "learning_rate": 0.000196,
+      "loss": 7.4156,
+      "step": 50
+    },
+    {
+      "epoch": 0.1736111111111111,
+      "grad_norm": 13.3125,
+      "learning_rate": 0.0001999985665413352,
+      "loss": 4.4164,
+      "step": 100
+    },
+    {
+      "epoch": 0.2604166666666667,
+      "grad_norm": 5.78125,
+      "learning_rate": 0.00019999414859436728,
+      "loss": 4.1765,
+      "step": 150
+    },
+    {
+      "epoch": 0.3472222222222222,
+      "grad_norm": 11.0,
+      "learning_rate": 0.00019998674569395055,
+      "loss": 4.0896,
+      "step": 200
+    },
+    {
+      "epoch": 0.4340277777777778,
+      "grad_norm": 6.625,
+      "learning_rate": 0.000199976358061071,
+      "loss": 3.9586,
+      "step": 250
+    },
+    {
+      "epoch": 0.5208333333333334,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.00019996298600581287,
+      "loss": 3.9273,
+      "step": 300
+    },
+    {
+      "epoch": 0.6076388888888888,
+      "grad_norm": 13.125,
+      "learning_rate": 0.0001999466299273491,
+      "loss": 3.8612,
+      "step": 350
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 7.0625,
+      "learning_rate": 0.00019992729031392958,
+      "loss": 3.8205,
+      "step": 400
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 8.75,
+      "learning_rate": 0.00019990496774286654,
+      "loss": 3.7956,
+      "step": 450
+    },
+    {
+      "epoch": 0.8680555555555556,
+      "grad_norm": 8.75,
+      "learning_rate": 0.00019987966288051735,
+      "loss": 3.7654,
+      "step": 500
+    },
+    {
+      "epoch": 0.9548611111111112,
+      "grad_norm": 14.0625,
+      "learning_rate": 0.00019985137648226457,
+      "loss": 3.6055,
+      "step": 550
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 3.320380210876465,
+      "eval_runtime": 41.8114,
+      "eval_samples_per_second": 89.186,
+      "eval_steps_per_second": 5.597,
+      "step": 576
+    },
+    {
+      "epoch": 1.0416666666666667,
+      "grad_norm": 13.875,
+      "learning_rate": 0.00019982010939249346,
+      "loss": 3.4141,
+      "step": 600
+    },
+    {
+      "epoch": 1.1284722222222223,
+      "grad_norm": 15.125,
+      "learning_rate": 0.0001997858625445666,
+      "loss": 3.3461,
+      "step": 650
+    },
+    {
+      "epoch": 1.2152777777777777,
+      "grad_norm": 13.25,
+      "learning_rate": 0.0001997486369607964,
+      "loss": 3.2968,
+      "step": 700
+    },
+    {
+      "epoch": 1.3020833333333333,
+      "grad_norm": 11.25,
+      "learning_rate": 0.00019970843375241416,
+      "loss": 3.2924,
+      "step": 750
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "grad_norm": 12.5,
+      "learning_rate": 0.00019966525411953717,
+      "loss": 3.2577,
+      "step": 800
+    },
+    {
+      "epoch": 1.4756944444444444,
+      "grad_norm": 13.0625,
+      "learning_rate": 0.00019961909935113284,
+      "loss": 3.2544,
+      "step": 850
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 14.125,
+      "learning_rate": 0.00019956997082498009,
+      "loss": 3.2245,
+      "step": 900
+    },
+    {
+      "epoch": 1.6493055555555556,
+      "grad_norm": 9.1875,
+      "learning_rate": 0.00019951787000762835,
+      "loss": 3.2121,
+      "step": 950
+    },
+    {
+      "epoch": 1.7361111111111112,
+      "grad_norm": 13.125,
+      "learning_rate": 0.00019946279845435382,
+      "loss": 3.1861,
+      "step": 1000
+    },
+    {
+      "epoch": 1.8229166666666665,
+      "grad_norm": 8.8125,
+      "learning_rate": 0.0001994047578091129,
+      "loss": 3.1813,
+      "step": 1050
+    },
+    {
+      "epoch": 1.9097222222222223,
+      "grad_norm": 10.5,
+      "learning_rate": 0.00019934374980449325,
+      "loss": 3.1483,
+      "step": 1100
+    },
+    {
+      "epoch": 1.9965277777777777,
+      "grad_norm": 11.875,
+      "learning_rate": 0.00019927977626166193,
+      "loss": 3.1491,
+      "step": 1150
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 3.0681025981903076,
+      "eval_runtime": 41.9062,
+      "eval_samples_per_second": 88.984,
+      "eval_steps_per_second": 5.584,
+      "step": 1152
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": 10.25,
+      "learning_rate": 0.00019921283909031114,
+      "loss": 3.1364,
+      "step": 1200
+    },
+    {
+      "epoch": 2.170138888888889,
+      "grad_norm": 10.375,
+      "learning_rate": 0.00019914294028860127,
+      "loss": 3.1123,
+      "step": 1250
+    },
+    {
+      "epoch": 2.2569444444444446,
+      "grad_norm": 10.3125,
+      "learning_rate": 0.00019907008194310102,
+      "loss": 3.1234,
+      "step": 1300
+    },
+    {
+      "epoch": 2.34375,
+      "grad_norm": 12.9375,
+      "learning_rate": 0.00019899426622872543,
+      "loss": 3.1215,
+      "step": 1350
+    },
+    {
+      "epoch": 2.4305555555555554,
+      "grad_norm": 12.5,
+      "learning_rate": 0.00019891549540867066,
+      "loss": 3.0999,
+      "step": 1400
+    },
+    {
+      "epoch": 2.517361111111111,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.00019883377183434666,
+      "loss": 3.1192,
+      "step": 1450
+    },
+    {
+      "epoch": 2.6041666666666665,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.00019874909794530675,
+      "loss": 3.0983,
+      "step": 1500
+    },
+    {
+      "epoch": 2.6909722222222223,
+      "grad_norm": 8.625,
+      "learning_rate": 0.0001986614762691751,
+      "loss": 3.0853,
+      "step": 1550
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "grad_norm": 12.875,
+      "learning_rate": 0.00019857090942157092,
+      "loss": 3.0822,
+      "step": 1600
+    },
+    {
+      "epoch": 2.8645833333333335,
+      "grad_norm": 11.3125,
+      "learning_rate": 0.00019847740010603068,
+      "loss": 3.0779,
+      "step": 1650
+    },
+    {
+      "epoch": 2.951388888888889,
+      "grad_norm": 7.3125,
+      "learning_rate": 0.00019838095111392726,
+      "loss": 3.0747,
+      "step": 1700
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 3.002568483352661,
+      "eval_runtime": 40.5832,
+      "eval_samples_per_second": 91.885,
+      "eval_steps_per_second": 5.766,
+      "step": 1728
+    },
+    {
+      "epoch": 3.0381944444444446,
+      "grad_norm": 11.625,
+      "learning_rate": 0.00019828156532438666,
+      "loss": 3.0638,
+      "step": 1750
+    },
+    {
+      "epoch": 3.125,
+      "grad_norm": 10.9375,
+      "learning_rate": 0.00019817924570420198,
+      "loss": 3.0585,
+      "step": 1800
+    },
+    {
+      "epoch": 3.2118055555555554,
+      "grad_norm": 7.0625,
+      "learning_rate": 0.00019807399530774502,
+      "loss": 3.0494,
+      "step": 1850
+    },
+    {
+      "epoch": 3.298611111111111,
+      "grad_norm": 9.125,
+      "learning_rate": 0.00019796581727687493,
+      "loss": 3.0628,
+      "step": 1900
+    },
+    {
+      "epoch": 3.3854166666666665,
+      "grad_norm": 11.875,
+      "learning_rate": 0.00019785471484084458,
+      "loss": 3.0529,
+      "step": 1950
+    },
+    {
+      "epoch": 3.4722222222222223,
+      "grad_norm": 14.9375,
+      "learning_rate": 0.00019774069131620398,
+      "loss": 3.0594,
+      "step": 2000
+    },
+    {
+      "epoch": 3.5590277777777777,
+      "grad_norm": 8.4375,
+      "learning_rate": 0.00019762375010670143,
+      "loss": 3.0478,
+      "step": 2050
+    },
+    {
+      "epoch": 3.6458333333333335,
+      "grad_norm": 9.8125,
+      "learning_rate": 0.0001975038947031819,
+      "loss": 3.0401,
+      "step": 2100
+    },
+    {
+      "epoch": 3.732638888888889,
+      "grad_norm": 11.0,
+      "learning_rate": 0.0001973811286834827,
+      "loss": 3.0339,
+      "step": 2150
+    },
+    {
+      "epoch": 3.8194444444444446,
+      "grad_norm": 9.0625,
+      "learning_rate": 0.00019725545571232686,
+      "loss": 3.0461,
+      "step": 2200
+    },
+    {
+      "epoch": 3.90625,
+      "grad_norm": 7.21875,
+      "learning_rate": 0.0001971268795412135,
+      "loss": 3.0156,
+      "step": 2250
+    },
+    {
+      "epoch": 3.9930555555555554,
+      "grad_norm": 9.75,
+      "learning_rate": 0.00019699540400830616,
+      "loss": 3.0261,
+      "step": 2300
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.960036516189575,
+      "eval_runtime": 41.7286,
+      "eval_samples_per_second": 89.363,
+      "eval_steps_per_second": 5.608,
+      "step": 2304
+    },
+    {
+      "epoch": 4.079861111111111,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.00019686103303831787,
+      "loss": 3.0194,
+      "step": 2350
+    },
+    {
+      "epoch": 4.166666666666667,
+      "grad_norm": 8.0,
+      "learning_rate": 0.0001967237706423943,
+      "loss": 2.9982,
+      "step": 2400
+    },
+    {
+      "epoch": 4.253472222222222,
+      "grad_norm": 10.0,
+      "learning_rate": 0.00019658362091799374,
+      "loss": 3.0147,
+      "step": 2450
+    },
+    {
+      "epoch": 4.340277777777778,
+      "grad_norm": 8.9375,
+      "learning_rate": 0.00019644058804876513,
+      "loss": 3.0187,
+      "step": 2500
+    },
+    {
+      "epoch": 4.427083333333333,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.0001962946763044228,
+      "loss": 3.0009,
+      "step": 2550
+    },
+    {
+      "epoch": 4.513888888888889,
+      "grad_norm": 6.96875,
+      "learning_rate": 0.00019614589004061928,
+      "loss": 3.0264,
+      "step": 2600
+    },
+    {
+      "epoch": 4.600694444444445,
+      "grad_norm": 8.6875,
+      "learning_rate": 0.0001959942336988152,
+      "loss": 3.0037,
+      "step": 2650
+    },
+    {
+      "epoch": 4.6875,
+      "grad_norm": 8.25,
+      "learning_rate": 0.0001958397118061466,
+      "loss": 3.0003,
+      "step": 2700
+    },
+    {
+      "epoch": 4.774305555555555,
+      "grad_norm": 7.1875,
+      "learning_rate": 0.00019568232897529002,
+      "loss": 2.9937,
+      "step": 2750
+    },
+    {
+      "epoch": 4.861111111111111,
+      "grad_norm": 7.5,
+      "learning_rate": 0.00019552208990432457,
+      "loss": 2.9977,
+      "step": 2800
+    },
+    {
+      "epoch": 4.947916666666667,
+      "grad_norm": 11.8125,
+      "learning_rate": 0.0001953589993765918,
+      "loss": 2.992,
+      "step": 2850
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.9334027767181396,
+      "eval_runtime": 42.3875,
+      "eval_samples_per_second": 87.974,
+      "eval_steps_per_second": 5.52,
+      "step": 2880
+    },
+    {
+      "epoch": 5.034722222222222,
+      "grad_norm": 8.8125,
+      "learning_rate": 0.000195193062260553,
+      "loss": 2.9851,
+      "step": 2900
+    },
+    {
+      "epoch": 5.121527777777778,
+      "grad_norm": 7.875,
+      "learning_rate": 0.00019502428350964355,
+      "loss": 2.9796,
+      "step": 2950
+    },
+    {
+      "epoch": 5.208333333333333,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.00019485266816212548,
+      "loss": 2.977,
+      "step": 3000
+    },
+    {
+      "epoch": 5.295138888888889,
+      "grad_norm": 11.125,
+      "learning_rate": 0.00019467822134093684,
+      "loss": 2.9887,
+      "step": 3050
+    },
+    {
+      "epoch": 5.381944444444445,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.00019450094825353864,
+      "loss": 2.982,
+      "step": 3100
+    },
+    {
+      "epoch": 5.46875,
+      "grad_norm": 8.75,
+      "learning_rate": 0.00019432085419175975,
+      "loss": 2.9896,
+      "step": 3150
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.00019413794453163857,
+      "loss": 2.9854,
+      "step": 3200
+    },
+    {
+      "epoch": 5.642361111111111,
+      "grad_norm": 10.4375,
+      "learning_rate": 0.00019395222473326284,
+      "loss": 2.9749,
+      "step": 3250
+    },
+    {
+      "epoch": 5.729166666666667,
+      "grad_norm": 7.03125,
+      "learning_rate": 0.00019376370034060653,
+      "loss": 2.9705,
+      "step": 3300
+    },
+    {
+      "epoch": 5.815972222222222,
+      "grad_norm": 9.8125,
+      "learning_rate": 0.00019357237698136427,
+      "loss": 2.9855,
+      "step": 3350
+    },
+    {
+      "epoch": 5.902777777777778,
+      "grad_norm": 6.78125,
+      "learning_rate": 0.00019337826036678338,
+      "loss": 2.9596,
+      "step": 3400
+    },
+    {
+      "epoch": 5.989583333333333,
+      "grad_norm": 8.6875,
+      "learning_rate": 0.00019318135629149363,
+      "loss": 2.9692,
+      "step": 3450
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.9161436557769775,
+      "eval_runtime": 41.8777,
+      "eval_samples_per_second": 89.045,
+      "eval_steps_per_second": 5.588,
+      "step": 3456
+    },
+    {
+      "epoch": 6.076388888888889,
+      "grad_norm": 8.5625,
+      "learning_rate": 0.0001929816706333339,
+      "loss": 2.9666,
+      "step": 3500
+    },
+    {
+      "epoch": 6.163194444444445,
+      "grad_norm": 11.625,
+      "learning_rate": 0.00019277920935317688,
+      "loss": 2.9451,
+      "step": 3550
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 7.625,
+      "learning_rate": 0.00019257397849475124,
+      "loss": 2.9624,
+      "step": 3600
+    },
+    {
+      "epoch": 6.336805555555555,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.00019236598418446098,
+      "loss": 2.9722,
+      "step": 3650
+    },
+    {
+      "epoch": 6.423611111111111,
+      "grad_norm": 7.5,
+      "learning_rate": 0.00019215523263120283,
+      "loss": 2.9552,
+      "step": 3700
+    },
+    {
+      "epoch": 6.510416666666667,
+      "grad_norm": 10.625,
+      "learning_rate": 0.0001919417301261806,
+      "loss": 2.9844,
+      "step": 3750
+    },
+    {
+      "epoch": 6.597222222222222,
+      "grad_norm": 6.25,
+      "learning_rate": 0.00019172548304271768,
+      "loss": 2.9576,
+      "step": 3800
+    },
+    {
+      "epoch": 6.684027777777778,
+      "grad_norm": 8.25,
+      "learning_rate": 0.00019150649783606646,
+      "loss": 2.9598,
+      "step": 3850
+    },
+    {
+      "epoch": 6.770833333333333,
+      "grad_norm": 6.25,
+      "learning_rate": 0.00019128478104321603,
+      "loss": 2.9488,
+      "step": 3900
+    },
+    {
+      "epoch": 6.857638888888889,
+      "grad_norm": 8.25,
+      "learning_rate": 0.00019106033928269667,
+      "loss": 2.9591,
+      "step": 3950
+    },
+    {
+      "epoch": 6.944444444444445,
+      "grad_norm": 5.8125,
+      "learning_rate": 0.00019083317925438248,
+      "loss": 2.9501,
+      "step": 4000
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.90425968170166,
+      "eval_runtime": 41.3276,
+      "eval_samples_per_second": 90.23,
+      "eval_steps_per_second": 5.662,
+      "step": 4032
+    },
+    {
+      "epoch": 7.03125,
+      "grad_norm": 6.40625,
+      "learning_rate": 0.00019060330773929137,
+      "loss": 2.9478,
+      "step": 4050
+    },
+    {
+      "epoch": 7.118055555555555,
+      "grad_norm": 8.75,
+      "learning_rate": 0.00019037073159938256,
+      "loss": 2.9421,
+      "step": 4100
+    },
+    {
+      "epoch": 7.204861111111111,
+      "grad_norm": 6.1875,
+      "learning_rate": 0.00019013545777735183,
+      "loss": 2.9394,
+      "step": 4150
+    },
+    {
+      "epoch": 7.291666666666667,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.00018989749329642418,
+      "loss": 2.9519,
+      "step": 4200
+    },
+    {
+      "epoch": 7.378472222222222,
+      "grad_norm": 6.25,
+      "learning_rate": 0.00018965684526014425,
+      "loss": 2.9475,
+      "step": 4250
+    },
+    {
+      "epoch": 7.465277777777778,
+      "grad_norm": 4.46875,
+      "learning_rate": 0.00018941352085216425,
+      "loss": 2.9507,
+      "step": 4300
+    },
+    {
+      "epoch": 7.552083333333333,
+      "grad_norm": 7.5625,
+      "learning_rate": 0.0001891675273360295,
+      "loss": 2.956,
+      "step": 4350
+    },
+    {
+      "epoch": 7.638888888888889,
+      "grad_norm": 8.1875,
+      "learning_rate": 0.00018891887205496163,
+      "loss": 2.9422,
+      "step": 4400
+    },
+    {
+      "epoch": 7.725694444444445,
+      "grad_norm": 6.625,
+      "learning_rate": 0.00018866756243163938,
+      "loss": 2.9379,
+      "step": 4450
+    },
+    {
+      "epoch": 7.8125,
+      "grad_norm": 7.46875,
+      "learning_rate": 0.00018841360596797695,
+      "loss": 2.9477,
+      "step": 4500
+    },
+    {
+      "epoch": 7.899305555555555,
+      "grad_norm": 9.3125,
+      "learning_rate": 0.0001881570102449002,
+      "loss": 2.9293,
+      "step": 4550
+    },
+    {
+      "epoch": 7.986111111111111,
+      "grad_norm": 8.375,
+      "learning_rate": 0.0001878977829221201,
+      "loss": 2.9379,
+      "step": 4600
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.894627571105957,
+      "eval_runtime": 42.0326,
+      "eval_samples_per_second": 88.717,
+      "eval_steps_per_second": 5.567,
+      "step": 4608
+    },
+    {
+      "epoch": 8.072916666666666,
+      "grad_norm": 7.625,
+      "learning_rate": 0.00018763593173790454,
+      "loss": 2.9327,
+      "step": 4650
+    },
+    {
+      "epoch": 8.159722222222221,
+      "grad_norm": 6.25,
+      "learning_rate": 0.00018737146450884668,
+      "loss": 2.917,
+      "step": 4700
+    },
+    {
+      "epoch": 8.246527777777779,
+      "grad_norm": 5.28125,
+      "learning_rate": 0.00018710438912963225,
+      "loss": 2.9335,
+      "step": 4750
+    },
+    {
+      "epoch": 8.333333333333334,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.00018683471357280347,
+      "loss": 2.9416,
+      "step": 4800
+    },
+    {
+      "epoch": 8.42013888888889,
+      "grad_norm": 6.84375,
+      "learning_rate": 0.00018656244588852124,
+      "loss": 2.9256,
+      "step": 4850
+    },
+    {
+      "epoch": 8.506944444444445,
+      "grad_norm": 5.5625,
+      "learning_rate": 0.00018628759420432473,
+      "loss": 2.9525,
+      "step": 4900
+    },
+    {
+      "epoch": 8.59375,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.00018601016672488888,
+      "loss": 2.9268,
+      "step": 4950
+    },
+    {
+      "epoch": 8.680555555555555,
+      "grad_norm": 6.90625,
+      "learning_rate": 0.00018573017173177938,
+      "loss": 2.9347,
+      "step": 5000
+    },
+    {
+      "epoch": 8.76736111111111,
+      "grad_norm": 5.78125,
+      "learning_rate": 0.0001854476175832055,
+      "loss": 2.9267,
+      "step": 5050
+    },
+    {
+      "epoch": 8.854166666666666,
+      "grad_norm": 6.375,
+      "learning_rate": 0.00018516251271377064,
+      "loss": 2.9246,
+      "step": 5100
+    },
+    {
+      "epoch": 8.940972222222221,
+      "grad_norm": 6.0,
+      "learning_rate": 0.00018487486563422036,
+      "loss": 2.9221,
+      "step": 5150
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 2.8883821964263916,
+      "eval_runtime": 41.385,
+      "eval_samples_per_second": 90.105,
+      "eval_steps_per_second": 5.654,
+      "step": 5184
+    },
+    {
+      "epoch": 9.027777777777779,
+      "grad_norm": 7.875,
+      "learning_rate": 0.00018458468493118857,
+      "loss": 2.9219,
+      "step": 5200
+    },
+    {
+      "epoch": 9.114583333333334,
+      "grad_norm": 7.90625,
+      "learning_rate": 0.000184291979266941,
+      "loss": 2.9209,
+      "step": 5250
+    },
+    {
+      "epoch": 9.20138888888889,
+      "grad_norm": 6.4375,
+      "learning_rate": 0.00018399675737911677,
+      "loss": 2.9127,
+      "step": 5300
+    },
+    {
+      "epoch": 9.288194444444445,
+      "grad_norm": 5.9375,
+      "learning_rate": 0.00018369902808046748,
+      "loss": 2.9262,
+      "step": 5350
+    },
+    {
+      "epoch": 9.375,
+      "grad_norm": 6.5625,
+      "learning_rate": 0.0001833988002585941,
+      "loss": 2.9258,
+      "step": 5400
+    },
+    {
+      "epoch": 9.461805555555555,
+      "grad_norm": 5.8125,
+      "learning_rate": 0.00018309608287568182,
+      "loss": 2.9275,
+      "step": 5450
+    },
+    {
+      "epoch": 9.54861111111111,
+      "grad_norm": 6.25,
+      "learning_rate": 0.00018279088496823235,
+      "loss": 2.9312,
+      "step": 5500
+    },
+    {
+      "epoch": 9.635416666666666,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.00018248321564679425,
+      "loss": 2.9205,
+      "step": 5550
+    },
+    {
+      "epoch": 9.722222222222221,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.0001821730840956909,
+      "loss": 2.9203,
+      "step": 5600
+    },
+    {
+      "epoch": 9.809027777777779,
+      "grad_norm": 4.6875,
+      "learning_rate": 0.00018186049957274656,
+      "loss": 2.9264,
+      "step": 5650
+    },
+    {
+      "epoch": 9.895833333333334,
+      "grad_norm": 5.0,
+      "learning_rate": 0.0001815454714090096,
+      "loss": 2.9109,
+      "step": 5700
+    },
+    {
+      "epoch": 9.98263888888889,
+      "grad_norm": 5.875,
+      "learning_rate": 0.0001812280090084744,
+      "loss": 2.9139,
+      "step": 5750
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 2.8820853233337402,
+      "eval_runtime": 42.0383,
+      "eval_samples_per_second": 88.705,
+      "eval_steps_per_second": 5.566,
+      "step": 5760
+    },
+    {
+      "epoch": 10.069444444444445,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.00018090812184780032,
+      "loss": 2.9105,
+      "step": 5800
+    },
+    {
+      "epoch": 10.15625,
+      "grad_norm": 5.59375,
+      "learning_rate": 0.000180585819476029,
+      "loss": 2.9039,
+      "step": 5850
+    },
+    {
+      "epoch": 10.243055555555555,
+      "grad_norm": 5.84375,
+      "learning_rate": 0.0001802611115142991,
+      "loss": 2.9122,
+      "step": 5900
+    },
+    {
+      "epoch": 10.32986111111111,
+      "grad_norm": 6.75,
+      "learning_rate": 0.00017993400765555932,
+      "loss": 2.9233,
+      "step": 5950
+    },
+    {
+      "epoch": 10.416666666666666,
+      "grad_norm": 5.71875,
+      "learning_rate": 0.00017960451766427897,
+      "loss": 2.9075,
+      "step": 6000
+    },
+    {
+      "epoch": 10.503472222222221,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.00017927265137615637,
+      "loss": 2.937,
+      "step": 6050
+    },
+    {
+      "epoch": 10.590277777777779,
+      "grad_norm": 4.90625,
+      "learning_rate": 0.00017893841869782547,
+      "loss": 2.9075,
+      "step": 6100
+    },
+    {
+      "epoch": 10.677083333333334,
+      "grad_norm": 5.5625,
+      "learning_rate": 0.0001786018296065599,
+      "loss": 2.9184,
+      "step": 6150
+    },
+    {
+      "epoch": 10.76388888888889,
+      "grad_norm": 5.71875,
+      "learning_rate": 0.0001782628941499753,
+      "loss": 2.9093,
+      "step": 6200
+    },
+    {
+      "epoch": 10.850694444444445,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.00017792162244572928,
+      "loss": 2.911,
+      "step": 6250
+    },
+    {
+      "epoch": 10.9375,
+      "grad_norm": 8.125,
+      "learning_rate": 0.00017757802468121946,
+      "loss": 2.9023,
+      "step": 6300
+    },
+    {
+      "epoch": 11.0,
+      "eval_loss": 2.8765242099761963,
+      "eval_runtime": 40.8481,
+      "eval_samples_per_second": 91.289,
+      "eval_steps_per_second": 5.729,
+      "step": 6336
+    },
+    {
+      "epoch": 11.024305555555555,
+      "grad_norm": 4.3125,
+      "learning_rate": 0.00017723211111327934,
+      "loss": 2.9075,
+      "step": 6350
+    },
+    {
+      "epoch": 11.11111111111111,
+      "grad_norm": 4.6875,
+      "learning_rate": 0.0001768838920678721,
+      "loss": 2.9027,
+      "step": 6400
+    },
+    {
+      "epoch": 11.197916666666666,
+      "grad_norm": 8.375,
+      "learning_rate": 0.00017653337793978237,
+      "loss": 2.8971,
+      "step": 6450
+    },
+    {
+      "epoch": 11.284722222222221,
+      "grad_norm": 6.34375,
+      "learning_rate": 0.00017618057919230597,
+      "loss": 2.9095,
+      "step": 6500
+    },
+    {
+      "epoch": 11.371527777777779,
+      "grad_norm": 10.125,
+      "learning_rate": 0.00017582550635693753,
+      "loss": 2.9108,
+      "step": 6550
+    },
+    {
+      "epoch": 11.458333333333334,
+      "grad_norm": 9.375,
+      "learning_rate": 0.0001754681700330561,
+      "loss": 2.9115,
+      "step": 6600
+    },
+    {
+      "epoch": 11.54513888888889,
+      "grad_norm": 5.96875,
+      "learning_rate": 0.00017510858088760876,
+      "loss": 2.9137,
+      "step": 6650
+    },
+    {
+      "epoch": 11.631944444444445,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.00017474674965479222,
+      "loss": 2.91,
+      "step": 6700
+    },
+    {
+      "epoch": 11.71875,
+      "grad_norm": 9.8125,
+      "learning_rate": 0.00017438268713573237,
+      "loss": 2.9037,
+      "step": 6750
+    },
+    {
+      "epoch": 11.805555555555555,
+      "grad_norm": 4.75,
+      "learning_rate": 0.00017401640419816182,
+      "loss": 2.9103,
+      "step": 6800
+    },
+    {
+      "epoch": 11.89236111111111,
+      "grad_norm": 6.96875,
+      "learning_rate": 0.00017364791177609554,
+      "loss": 2.895,
+      "step": 6850
+    },
+    {
+      "epoch": 11.979166666666666,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.00017327722086950446,
+      "loss": 2.8989,
+      "step": 6900
+    },
+    {
+      "epoch": 12.0,
+      "eval_loss": 2.872136116027832,
+      "eval_runtime": 41.6305,
+      "eval_samples_per_second": 89.574,
+      "eval_steps_per_second": 5.621,
+      "step": 6912
+    },
+    {
+      "epoch": 12.065972222222221,
+      "grad_norm": 7.75,
+      "learning_rate": 0.0001729043425439871,
+      "loss": 2.8952,
+      "step": 6950
+    },
+    {
+      "epoch": 12.152777777777779,
+      "grad_norm": 5.84375,
+      "learning_rate": 0.00017252928793043916,
+      "loss": 2.8915,
+      "step": 7000
+    },
+    {
+      "epoch": 12.239583333333334,
+      "grad_norm": 6.5625,
+      "learning_rate": 0.00017215206822472143,
+      "loss": 2.8955,
+      "step": 7050
+    },
+    {
+      "epoch": 12.32638888888889,
+      "grad_norm": 5.875,
+      "learning_rate": 0.00017177269468732535,
+      "loss": 2.9131,
+      "step": 7100
+    },
+    {
+      "epoch": 12.413194444444445,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.00017139117864303714,
+      "loss": 2.8935,
+      "step": 7150
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 6.96875,
+      "learning_rate": 0.0001710075314805995,
+      "loss": 2.9223,
+      "step": 7200
+    },
+    {
+      "epoch": 12.586805555555555,
+      "grad_norm": 5.71875,
+      "learning_rate": 0.00017062176465237175,
+      "loss": 2.8979,
+      "step": 7250
+    },
+    {
+      "epoch": 12.67361111111111,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.00017023388967398796,
+      "loss": 2.9076,
+      "step": 7300
+    },
+    {
+      "epoch": 12.760416666666666,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.00016984391812401316,
+      "loss": 2.8939,
+      "step": 7350
+    },
+    {
+      "epoch": 12.847222222222221,
+      "grad_norm": 5.03125,
+      "learning_rate": 0.00016945186164359782,
+      "loss": 2.9007,
+      "step": 7400
+    },
+    {
+      "epoch": 12.934027777777779,
+      "grad_norm": 5.46875,
+      "learning_rate": 0.00016905773193613013,
+      "loss": 2.891,
+      "step": 7450
+    },
+    {
+      "epoch": 13.0,
+      "eval_loss": 2.869907855987549,
+      "eval_runtime": 41.6939,
+      "eval_samples_per_second": 89.437,
+      "eval_steps_per_second": 5.612,
+      "step": 7488
+    },
+    {
+      "epoch": 13.020833333333334,
+      "grad_norm": 5.375,
+      "learning_rate": 0.00016866154076688683,
+      "loss": 2.8958,
+      "step": 7500
+    },
+    {
+      "epoch": 13.10763888888889,
+      "grad_norm": 5.03125,
+      "learning_rate": 0.00016826329996268196,
+      "loss": 2.8938,
+      "step": 7550
+    },
+    {
+      "epoch": 13.194444444444445,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.00016786302141151368,
+      "loss": 2.8862,
+      "step": 7600
+    },
+    {
+      "epoch": 13.28125,
+      "grad_norm": 5.21875,
+      "learning_rate": 0.00016746071706220966,
+      "loss": 2.8969,
+      "step": 7650
+    },
+    {
+      "epoch": 13.368055555555555,
+      "grad_norm": 6.8125,
+      "learning_rate": 0.00016705639892407014,
+      "loss": 2.9042,
+      "step": 7700
+    },
+    {
+      "epoch": 13.45486111111111,
+      "grad_norm": 5.28125,
+      "learning_rate": 0.00016665007906650948,
+      "loss": 2.8953,
+      "step": 7750
+    },
+    {
+      "epoch": 13.541666666666666,
+      "grad_norm": 8.1875,
+      "learning_rate": 0.00016624176961869616,
+      "loss": 2.908,
+      "step": 7800
+    },
+    {
+      "epoch": 13.628472222222221,
+      "grad_norm": 5.34375,
+      "learning_rate": 0.0001658314827691902,
+      "loss": 2.8964,
+      "step": 7850
+    },
+    {
+      "epoch": 13.715277777777779,
+      "grad_norm": 5.0,
+      "learning_rate": 0.00016541923076557978,
+      "loss": 2.8924,
+      "step": 7900
+    },
+    {
+      "epoch": 13.802083333333334,
+      "grad_norm": 5.28125,
+      "learning_rate": 0.0001650050259141154,
+      "loss": 2.9024,
+      "step": 7950
+    },
+    {
+      "epoch": 13.88888888888889,
+      "grad_norm": 4.71875,
+      "learning_rate": 0.00016458888057934248,
+      "loss": 2.884,
+      "step": 8000
+    },
+    {
+      "epoch": 13.975694444444445,
+      "grad_norm": 11.4375,
+      "learning_rate": 0.0001641708071837325,
+      "loss": 2.8926,
+      "step": 8050
+    },
+    {
+      "epoch": 14.0,
+      "eval_loss": 2.8657476902008057,
+      "eval_runtime": 41.9302,
+      "eval_samples_per_second": 88.934,
+      "eval_steps_per_second": 5.581,
+      "step": 8064
+    },
+    {
+      "epoch": 14.0625,
+      "grad_norm": 6.40625,
+      "learning_rate": 0.00016375081820731193,
+      "loss": 2.8867,
+      "step": 8100
+    },
+    {
+      "epoch": 14.149305555555555,
+      "grad_norm": 4.625,
+      "learning_rate": 0.00016332892618728986,
+      "loss": 2.8829,
+      "step": 8150
+    },
+    {
+      "epoch": 14.23611111111111,
+      "grad_norm": 4.1875,
+      "learning_rate": 0.00016290514371768356,
+      "loss": 2.8852,
+      "step": 8200
+    },
+    {
+      "epoch": 14.322916666666666,
+      "grad_norm": 4.3125,
+      "learning_rate": 0.0001624794834489427,
+      "loss": 2.9058,
+      "step": 8250
+    },
+    {
+      "epoch": 14.409722222222221,
+      "grad_norm": 4.53125,
+      "learning_rate": 0.00016205195808757173,
+      "loss": 2.8848,
+      "step": 8300
+    },
+    {
+      "epoch": 14.496527777777779,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.00016162258039575033,
+      "loss": 2.9088,
+      "step": 8350
+    },
+    {
+      "epoch": 14.583333333333334,
+      "grad_norm": 7.34375,
+      "learning_rate": 0.0001611913631909528,
+      "loss": 2.8913,
+      "step": 8400
+    },
+    {
+      "epoch": 14.67013888888889,
+      "grad_norm": 6.0,
+      "learning_rate": 0.00016075831934556518,
+      "loss": 2.9013,
+      "step": 8450
+    },
+    {
+      "epoch": 14.756944444444445,
+      "grad_norm": 8.9375,
+      "learning_rate": 0.00016032346178650105,
+      "loss": 2.8843,
+      "step": 8500
+    },
+    {
+      "epoch": 14.84375,
+      "grad_norm": 4.9375,
+      "learning_rate": 0.0001598868034948157,
+      "loss": 2.8901,
+      "step": 8550
+    },
+    {
+      "epoch": 14.930555555555555,
+      "grad_norm": 5.40625,
+      "learning_rate": 0.00015944835750531858,
+      "loss": 2.8824,
+      "step": 8600
+    },
+    {
+      "epoch": 15.0,
+      "eval_loss": 2.8647055625915527,
+      "eval_runtime": 41.7282,
+      "eval_samples_per_second": 89.364,
+      "eval_steps_per_second": 5.608,
+      "step": 8640
+    },
+    {
+      "epoch": 15.01736111111111,
+      "grad_norm": 8.125,
+      "learning_rate": 0.0001590081369061842,
+      "loss": 2.8874,
+      "step": 8650
+    },
+    {
+      "epoch": 15.104166666666666,
+      "grad_norm": 6.375,
+      "learning_rate": 0.00015856615483856153,
+      "loss": 2.8822,
+      "step": 8700
+    },
+    {
+      "epoch": 15.190972222222221,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.00015812242449618147,
+      "loss": 2.8752,
+      "step": 8750
+    },
+    {
+      "epoch": 15.277777777777779,
+      "grad_norm": 7.15625,
+      "learning_rate": 0.0001576769591249633,
+      "loss": 2.8873,
+      "step": 8800
+    },
+    {
+      "epoch": 15.364583333333334,
+      "grad_norm": 5.0625,
+      "learning_rate": 0.0001572297720226191,
+      "loss": 2.8993,
+      "step": 8850
+    },
+    {
+      "epoch": 15.45138888888889,
+      "grad_norm": 5.8125,
+      "learning_rate": 0.00015678087653825675,
+      "loss": 2.8854,
+      "step": 8900
+    },
+    {
+      "epoch": 15.538194444444445,
+      "grad_norm": 4.03125,
+      "learning_rate": 0.0001563302860719816,
+      "loss": 2.8994,
+      "step": 8950
+    },
+    {
+      "epoch": 15.625,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.00015587801407449648,
+      "loss": 2.8893,
+      "step": 9000
+    },
+    {
+      "epoch": 15.711805555555555,
+      "grad_norm": 6.25,
+      "learning_rate": 0.0001554240740466998,
+      "loss": 2.8871,
+      "step": 9050
+    },
+    {
+      "epoch": 15.79861111111111,
+      "grad_norm": 10.0625,
+      "learning_rate": 0.00015496847953928313,
+      "loss": 2.8935,
+      "step": 9100
+    },
+    {
+      "epoch": 15.885416666666666,
+      "grad_norm": 5.9375,
+      "learning_rate": 0.00015451124415232615,
+      "loss": 2.8775,
+      "step": 9150
+    },
+    {
+      "epoch": 15.972222222222221,
+      "grad_norm": 7.65625,
+      "learning_rate": 0.00015405238153489096,
+      "loss": 2.8831,
+      "step": 9200
+    },
+    {
+      "epoch": 16.0,
+      "eval_loss": 2.8630547523498535,
+      "eval_runtime": 40.8978,
+      "eval_samples_per_second": 91.179,
+      "eval_steps_per_second": 5.722,
+      "step": 9216
+    },
+    {
+      "epoch": 16.05902777777778,
+      "grad_norm": 5.78125,
+      "learning_rate": 0.00015359190538461462,
+      "loss": 2.88,
+      "step": 9250
+    },
+    {
+      "epoch": 16.145833333333332,
+      "grad_norm": 5.46875,
+      "learning_rate": 0.00015312982944730018,
+      "loss": 2.8777,
+      "step": 9300
+    },
+    {
+      "epoch": 16.23263888888889,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.00015266616751650642,
+      "loss": 2.8785,
+      "step": 9350
+    },
+    {
+      "epoch": 16.319444444444443,
+      "grad_norm": 4.46875,
+      "learning_rate": 0.00015220093343313592,
+      "loss": 2.8968,
+      "step": 9400
+    },
+    {
+      "epoch": 16.40625,
+      "grad_norm": 5.125,
+      "learning_rate": 0.00015173414108502224,
+      "loss": 2.877,
+      "step": 9450
+    },
+    {
+      "epoch": 16.493055555555557,
+      "grad_norm": 5.03125,
+      "learning_rate": 0.00015126580440651496,
+      "loss": 2.9016,
+      "step": 9500
+    },
+    {
+      "epoch": 16.57986111111111,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.00015079593737806399,
+      "loss": 2.8841,
+      "step": 9550
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 5.21875,
+      "learning_rate": 0.00015032455402580217,
+      "loss": 2.8937,
+      "step": 9600
+    },
+    {
+      "epoch": 16.75347222222222,
+      "grad_norm": 5.59375,
+      "learning_rate": 0.00014985166842112644,
+      "loss": 2.8789,
+      "step": 9650
+    },
+    {
+      "epoch": 16.84027777777778,
+      "grad_norm": 5.5625,
+      "learning_rate": 0.00014937729468027797,
+      "loss": 2.8883,
+      "step": 9700
+    },
+    {
+      "epoch": 16.927083333333332,
+      "grad_norm": 5.15625,
+      "learning_rate": 0.00014890144696392074,
+      "loss": 2.8751,
+      "step": 9750
+    },
+    {
+      "epoch": 17.0,
+      "eval_loss": 2.862104892730713,
+      "eval_runtime": 41.9728,
+      "eval_samples_per_second": 88.843,
+      "eval_steps_per_second": 5.575,
+      "step": 9792
+    },
+    {
+      "epoch": 17.01388888888889,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.00014842413947671872,
+      "loss": 2.8821,
+      "step": 9800
+    },
+    {
+      "epoch": 17.100694444444443,
+      "grad_norm": 5.71875,
+      "learning_rate": 0.0001479453864669119,
+      "loss": 2.8785,
+      "step": 9850
+    },
+    {
+      "epoch": 17.1875,
+      "grad_norm": 5.4375,
+      "learning_rate": 0.00014746520222589103,
+      "loss": 2.8715,
+      "step": 9900
+    },
+    {
+      "epoch": 17.274305555555557,
+      "grad_norm": 11.8125,
+      "learning_rate": 0.00014698360108777097,
+      "loss": 2.8826,
+      "step": 9950
+    },
+    {
+      "epoch": 17.36111111111111,
+      "grad_norm": 5.96875,
+      "learning_rate": 0.00014650059742896265,
+      "loss": 2.8958,
+      "step": 10000
+    },
+    {
+      "epoch": 17.447916666666668,
+      "grad_norm": 5.75,
+      "learning_rate": 0.00014601620566774415,
+      "loss": 2.8751,
+      "step": 10050
+    },
+    {
+      "epoch": 17.53472222222222,
+      "grad_norm": 4.53125,
+      "learning_rate": 0.00014553044026383014,
+      "loss": 2.8925,
+      "step": 10100
+    },
+    {
+      "epoch": 17.62152777777778,
+      "grad_norm": 5.28125,
+      "learning_rate": 0.0001450433157179403,
+      "loss": 2.8889,
+      "step": 10150
+    },
+    {
+      "epoch": 17.708333333333332,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.00014455484657136642,
+      "loss": 2.8807,
+      "step": 10200
+    },
+    {
+      "epoch": 17.79513888888889,
+      "grad_norm": 5.375,
+      "learning_rate": 0.00014406504740553837,
+      "loss": 2.8836,
+      "step": 10250
+    },
+    {
+      "epoch": 17.881944444444443,
+      "grad_norm": 5.125,
+      "learning_rate": 0.00014357393284158878,
+      "loss": 2.8723,
+      "step": 10300
+    },
+    {
+      "epoch": 17.96875,
+      "grad_norm": 5.3125,
+      "learning_rate": 0.00014308151753991658,
+      "loss": 2.881,
+      "step": 10350
+    },
+    {
+      "epoch": 18.0,
+      "eval_loss": 2.8605997562408447,
+      "eval_runtime": 40.45,
+      "eval_samples_per_second": 92.188,
+      "eval_steps_per_second": 5.785,
+      "step": 10368
+    },
+    {
+      "epoch": 18.055555555555557,
+      "grad_norm": 4.5625,
+      "learning_rate": 0.00014258781619974945,
+      "loss": 2.8781,
+      "step": 10400
+    },
+    {
+      "epoch": 18.14236111111111,
+      "grad_norm": 4.625,
+      "learning_rate": 0.00014209284355870492,
+      "loss": 2.8705,
+      "step": 10450
+    },
+    {
+      "epoch": 18.229166666666668,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.00014159661439235046,
+      "loss": 2.876,
+      "step": 10500
+    },
+    {
+      "epoch": 18.31597222222222,
+      "grad_norm": 4.875,
+      "learning_rate": 0.0001410991435137625,
+      "loss": 2.8918,
+      "step": 10550
+    },
+    {
+      "epoch": 18.40277777777778,
+      "grad_norm": 4.375,
+      "learning_rate": 0.00014060044577308408,
+      "loss": 2.8759,
+      "step": 10600
+    },
+    {
+      "epoch": 18.489583333333332,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.00014010053605708174,
+      "loss": 2.8958,
+      "step": 10650
+    },
+    {
+      "epoch": 18.57638888888889,
+      "grad_norm": 5.125,
+      "learning_rate": 0.000139599429288701,
+      "loss": 2.8763,
+      "step": 10700
+    },
+    {
+      "epoch": 18.663194444444443,
+      "grad_norm": 4.96875,
+      "learning_rate": 0.00013909714042662085,
+      "loss": 2.8905,
+      "step": 10750
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 5.625,
+      "learning_rate": 0.00013859368446480743,
+      "loss": 2.8782,
+      "step": 10800
+    },
+    {
+      "epoch": 18.836805555555557,
+      "grad_norm": 5.375,
+      "learning_rate": 0.0001380890764320662,
+      "loss": 2.8834,
+      "step": 10850
+    },
+    {
+      "epoch": 18.92361111111111,
+      "grad_norm": 3.984375,
+      "learning_rate": 0.00013758333139159343,
+      "loss": 2.8705,
+      "step": 10900
+    },
+    {
+      "epoch": 19.0,
+      "eval_loss": 2.8602795600891113,
+      "eval_runtime": 40.4167,
+      "eval_samples_per_second": 92.264,
+      "eval_steps_per_second": 5.79,
+      "step": 10944
+    },
+    {
+      "epoch": 19.010416666666668,
+      "grad_norm": 5.71875,
+      "learning_rate": 0.00013707646444052656,
+      "loss": 2.8757,
+      "step": 10950
+    },
+    {
+      "epoch": 19.09722222222222,
+      "grad_norm": 7.3125,
+      "learning_rate": 0.0001365684907094935,
+      "loss": 2.8753,
+      "step": 11000
+    },
+    {
+      "epoch": 19.18402777777778,
+      "grad_norm": 5.78125,
+      "learning_rate": 0.0001360594253621609,
+      "loss": 2.8632,
+      "step": 11050
+    },
+    {
+      "epoch": 19.270833333333332,
+      "grad_norm": 5.28125,
+      "learning_rate": 0.0001355492835947816,
+      "loss": 2.8771,
+      "step": 11100
+    },
+    {
+      "epoch": 19.35763888888889,
+      "grad_norm": 4.53125,
+      "learning_rate": 0.000135038080635741,
+      "loss": 2.8945,
+      "step": 11150
+    },
+    {
+      "epoch": 19.444444444444443,
+      "grad_norm": 4.25,
+      "learning_rate": 0.00013452583174510237,
+      "loss": 2.87,
+      "step": 11200
+    },
+    {
+      "epoch": 19.53125,
+      "grad_norm": 5.1875,
+      "learning_rate": 0.0001340125522141514,
+      "loss": 2.8948,
+      "step": 11250
+    },
+    {
+      "epoch": 19.618055555555557,
+      "grad_norm": 5.9375,
+      "learning_rate": 0.00013349825736493965,
+      "loss": 2.882,
+      "step": 11300
+    },
+    {
+      "epoch": 19.70486111111111,
+      "grad_norm": 7.0,
+      "learning_rate": 0.00013298296254982733,
+      "loss": 2.8753,
+      "step": 11350
+    },
+    {
+      "epoch": 19.791666666666668,
+      "grad_norm": 4.78125,
+      "learning_rate": 0.00013246668315102487,
+      "loss": 2.8823,
+      "step": 11400
+    },
+    {
+      "epoch": 19.87847222222222,
+      "grad_norm": 4.59375,
+      "learning_rate": 0.00013194943458013375,
+      "loss": 2.8675,
+      "step": 11450
+    },
+    {
+      "epoch": 19.96527777777778,
+      "grad_norm": 6.0,
+      "learning_rate": 0.00013143123227768658,
+      "loss": 2.8765,
+      "step": 11500
+    },
+    {
+      "epoch": 20.0,
+      "eval_loss": 2.8591601848602295,
+      "eval_runtime": 41.4885,
+      "eval_samples_per_second": 89.88,
+      "eval_steps_per_second": 5.64,
+      "step": 11520
+    },
+    {
+      "epoch": 20.052083333333332,
+      "grad_norm": 4.9375,
+      "learning_rate": 0.00013091209171268599,
+      "loss": 2.8735,
+      "step": 11550
+    },
+    {
+      "epoch": 20.13888888888889,
+      "grad_norm": 5.40625,
+      "learning_rate": 0.00013039202838214294,
+      "loss": 2.8698,
+      "step": 11600
+    },
+    {
+      "epoch": 20.225694444444443,
+      "grad_norm": 5.34375,
+      "learning_rate": 0.0001298710578106142,
+      "loss": 2.8702,
+      "step": 11650
+    },
+    {
+      "epoch": 20.3125,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.00012934919554973874,
+      "loss": 2.8871,
+      "step": 11700
+    },
+    {
+      "epoch": 20.399305555555557,
+      "grad_norm": 4.375,
+      "learning_rate": 0.00012882645717777376,
+      "loss": 2.8752,
+      "step": 11750
+    },
+    {
+      "epoch": 20.48611111111111,
+      "grad_norm": 6.78125,
+      "learning_rate": 0.00012830285829912926,
+      "loss": 2.8896,
+      "step": 11800
+    },
+    {
+      "epoch": 20.572916666666668,
+      "grad_norm": 5.21875,
+      "learning_rate": 0.00012777841454390275,
+      "loss": 2.8768,
+      "step": 11850
+    },
+    {
+      "epoch": 20.65972222222222,
+      "grad_norm": 5.59375,
+      "learning_rate": 0.00012725314156741214,
+      "loss": 2.8846,
+      "step": 11900
+    },
+    {
+      "epoch": 20.74652777777778,
+      "grad_norm": 5.25,
+      "learning_rate": 0.00012672705504972884,
+      "loss": 2.873,
+      "step": 11950
+    },
+    {
+      "epoch": 20.833333333333332,
+      "grad_norm": 5.6875,
+      "learning_rate": 0.00012620017069520936,
+      "loss": 2.8809,
+      "step": 12000
+    },
+    {
+      "epoch": 20.92013888888889,
+      "grad_norm": 4.40625,
+      "learning_rate": 0.00012567250423202675,
+      "loss": 2.8656,
+      "step": 12050
+    },
+    {
+      "epoch": 21.0,
+      "eval_loss": 2.857980489730835,
+      "eval_runtime": 41.0572,
+      "eval_samples_per_second": 90.825,
+      "eval_steps_per_second": 5.699,
+      "step": 12096
+    },
+    {
+      "epoch": 21.006944444444443,
+      "grad_norm": 5.5,
+      "learning_rate": 0.00012514407141170104,
+      "loss": 2.8738,
+      "step": 12100
+    },
+    {
+      "epoch": 21.09375,
+      "grad_norm": 4.78125,
+      "learning_rate": 0.00012461488800862887,
+      "loss": 2.8725,
+      "step": 12150
+    },
+    {
+      "epoch": 21.180555555555557,
+      "grad_norm": 8.375,
+      "learning_rate": 0.00012408496981961288,
+      "loss": 2.8628,
+      "step": 12200
+    },
+    {
+      "epoch": 21.26736111111111,
+      "grad_norm": 7.375,
+      "learning_rate": 0.00012355433266338992,
+      "loss": 2.8733,
+      "step": 12250
+    },
+    {
+      "epoch": 21.354166666666668,
+      "grad_norm": 4.65625,
+      "learning_rate": 0.00012302299238015895,
+      "loss": 2.8901,
+      "step": 12300
+    },
+    {
+      "epoch": 21.44097222222222,
+      "grad_norm": 5.0625,
+      "learning_rate": 0.0001224909648311082,
+      "loss": 2.8696,
+      "step": 12350
+    },
+    {
+      "epoch": 21.52777777777778,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.00012195826589794162,
+      "loss": 2.8925,
+      "step": 12400
+    },
+    {
+      "epoch": 21.614583333333332,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.00012142491148240491,
+      "loss": 2.8764,
+      "step": 12450
+    },
+    {
+      "epoch": 21.70138888888889,
+      "grad_norm": 4.90625,
+      "learning_rate": 0.00012089091750581067,
+      "loss": 2.8716,
+      "step": 12500
+    },
+    {
+      "epoch": 21.788194444444443,
+      "grad_norm": 5.9375,
+      "learning_rate": 0.0001203562999085633,
+      "loss": 2.8816,
+      "step": 12550
+    },
+    {
+      "epoch": 21.875,
+      "grad_norm": 5.59375,
+      "learning_rate": 0.00011982107464968298,
+      "loss": 2.8677,
+      "step": 12600
+    },
+    {
+      "epoch": 21.961805555555557,
+      "grad_norm": 5.40625,
+      "learning_rate": 0.00011928525770632946,
+      "loss": 2.8729,
+      "step": 12650
+    },
+    {
+      "epoch": 22.0,
+      "eval_loss": 2.857877016067505,
+      "eval_runtime": 42.4855,
+      "eval_samples_per_second": 87.771,
+      "eval_steps_per_second": 5.508,
+      "step": 12672
+    },
+    {
+      "epoch": 22.04861111111111,
+      "grad_norm": 7.5,
+      "learning_rate": 0.000118748865073325,
+      "loss": 2.8712,
+      "step": 12700
+    },
+    {
+      "epoch": 22.135416666666668,
+      "grad_norm": 5.65625,
+      "learning_rate": 0.00011821191276267684,
+      "loss": 2.867,
+      "step": 12750
+    },
+    {
+      "epoch": 22.22222222222222,
+      "grad_norm": 6.1875,
+      "learning_rate": 0.00011767441680309955,
+      "loss": 2.8635,
+      "step": 12800
+    },
+    {
+      "epoch": 22.30902777777778,
+      "grad_norm": 8.625,
+      "learning_rate": 0.00011713639323953602,
+      "loss": 2.886,
+      "step": 12850
+    },
+    {
+      "epoch": 22.395833333333332,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.00011659785813267905,
+      "loss": 2.872,
+      "step": 12900
+    },
+    {
+      "epoch": 22.48263888888889,
+      "grad_norm": 5.625,
+      "learning_rate": 0.0001160588275584915,
+      "loss": 2.8891,
+      "step": 12950
+    },
+    {
+      "epoch": 22.569444444444443,
+      "grad_norm": 4.59375,
+      "learning_rate": 0.00011551931760772661,
+      "loss": 2.8741,
+      "step": 13000
+    },
+    {
+      "epoch": 22.65625,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.00011497934438544769,
+      "loss": 2.8815,
+      "step": 13050
+    },
+    {
+      "epoch": 22.743055555555557,
+      "grad_norm": 5.3125,
+      "learning_rate": 0.00011443892401054719,
+      "loss": 2.8705,
+      "step": 13100
+    },
+    {
+      "epoch": 22.82986111111111,
+      "grad_norm": 4.96875,
+      "learning_rate": 0.00011389807261526573,
+      "loss": 2.8823,
+      "step": 13150
+    },
+    {
+      "epoch": 22.916666666666668,
+      "grad_norm": 5.5,
+      "learning_rate": 0.00011335680634471035,
+      "loss": 2.8596,
+      "step": 13200
+    },
+    {
+      "epoch": 23.0,
+      "eval_loss": 2.856687545776367,
+      "eval_runtime": 42.2388,
+      "eval_samples_per_second": 88.284,
+      "eval_steps_per_second": 5.54,
+      "step": 13248
+    },
+    {
+      "epoch": 23.00347222222222,
+      "grad_norm": 7.3125,
+      "learning_rate": 0.00011281514135637278,
+      "loss": 2.8712,
+      "step": 13250
+    },
+    {
+      "epoch": 23.09027777777778,
+      "grad_norm": 5.46875,
+      "learning_rate": 0.00011227309381964684,
+      "loss": 2.8741,
+      "step": 13300
+    },
+    {
+      "epoch": 23.177083333333332,
+      "grad_norm": 4.875,
+      "learning_rate": 0.00011173067991534598,
+      "loss": 2.8567,
+      "step": 13350
+    },
+    {
+      "epoch": 23.26388888888889,
+      "grad_norm": 4.59375,
+      "learning_rate": 0.00011118791583522023,
+      "loss": 2.8739,
+      "step": 13400
+    },
+    {
+      "epoch": 23.350694444444443,
+      "grad_norm": 5.96875,
+      "learning_rate": 0.00011064481778147275,
+      "loss": 2.8865,
+      "step": 13450
+    },
+    {
+      "epoch": 23.4375,
+      "grad_norm": 5.6875,
+      "learning_rate": 0.00011010140196627627,
+      "loss": 2.8657,
+      "step": 13500
+    },
+    {
+      "epoch": 23.524305555555557,
+      "grad_norm": 5.15625,
+      "learning_rate": 0.00010955768461128911,
+      "loss": 2.8911,
+      "step": 13550
+    },
+    {
+      "epoch": 23.61111111111111,
+      "grad_norm": 5.0625,
+      "learning_rate": 0.00010901368194717091,
+      "loss": 2.8727,
+      "step": 13600
+    },
+    {
+      "epoch": 23.697916666666668,
+      "grad_norm": 5.34375,
+      "learning_rate": 0.00010846941021309817,
+      "loss": 2.8729,
+      "step": 13650
+    },
+    {
+      "epoch": 23.78472222222222,
+      "grad_norm": 5.53125,
+      "learning_rate": 0.00010792488565627953,
+      "loss": 2.8749,
+      "step": 13700
+    },
+    {
+      "epoch": 23.87152777777778,
+      "grad_norm": 4.84375,
+      "learning_rate": 0.00010738012453147062,
+      "loss": 2.87,
+      "step": 13750
+    },
+    {
+      "epoch": 23.958333333333332,
+      "grad_norm": 6.875,
+      "learning_rate": 0.00010683514310048894,
+      "loss": 2.8713,
+      "step": 13800
+    },
+    {
+      "epoch": 24.0,
+      "eval_loss": 2.856473922729492,
+      "eval_runtime": 40.551,
+      "eval_samples_per_second": 91.958,
+      "eval_steps_per_second": 5.771,
+      "step": 13824
+    },
+    {
+      "epoch": 24.04513888888889,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.00010628995763172851,
+      "loss": 2.8675,
+      "step": 13850
+    },
+    {
+      "epoch": 24.131944444444443,
+      "grad_norm": 5.875,
+      "learning_rate": 0.00010574458439967401,
+      "loss": 2.8666,
+      "step": 13900
+    },
+    {
+      "epoch": 24.21875,
+      "grad_norm": 5.40625,
+      "learning_rate": 0.00010519903968441516,
+      "loss": 2.8586,
+      "step": 13950
+    },
+    {
+      "epoch": 24.305555555555557,
+      "grad_norm": 4.8125,
+      "learning_rate": 0.0001046533397711607,
+      "loss": 2.8836,
+      "step": 14000
+    },
+    {
+      "epoch": 24.39236111111111,
+      "grad_norm": 4.46875,
+      "learning_rate": 0.00010410750094975215,
+      "loss": 2.8711,
+      "step": 14050
+    },
+    {
+      "epoch": 24.479166666666668,
+      "grad_norm": 5.59375,
+      "learning_rate": 0.00010356153951417771,
+      "loss": 2.8866,
+      "step": 14100
+    },
+    {
+      "epoch": 24.56597222222222,
+      "grad_norm": 5.3125,
+      "learning_rate": 0.00010301547176208568,
+      "loss": 2.8723,
+      "step": 14150
+    },
+    {
+      "epoch": 24.65277777777778,
+      "grad_norm": 5.71875,
+      "learning_rate": 0.00010246931399429812,
+      "loss": 2.8754,
+      "step": 14200
+    },
+    {
+      "epoch": 24.739583333333332,
+      "grad_norm": 5.46875,
+      "learning_rate": 0.00010192308251432412,
+      "loss": 2.8733,
+      "step": 14250
+    },
+    {
+      "epoch": 24.82638888888889,
+      "grad_norm": 5.8125,
+      "learning_rate": 0.0001013767936278732,
+      "loss": 2.8821,
+      "step": 14300
+    },
+    {
+      "epoch": 24.913194444444443,
+      "grad_norm": 4.96875,
+      "learning_rate": 0.00010083046364236854,
+      "loss": 2.8564,
+      "step": 14350
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.00010028410886646014,
+      "loss": 2.8708,
+      "step": 14400
+    },
+    {
+      "epoch": 25.0,
+      "eval_loss": 2.8556883335113525,
+      "eval_runtime": 39.6238,
+      "eval_samples_per_second": 94.11,
+      "eval_steps_per_second": 5.906,
+      "step": 14400
+    },
+    {
+      "epoch": 25.086805555555557,
+      "grad_norm": 4.46875,
+      "learning_rate": 9.97377456095381e-05,
+      "loss": 2.8732,
+      "step": 14450
+    },
+    {
+      "epoch": 25.17361111111111,
+      "grad_norm": 4.1875,
+      "learning_rate": 9.91913901812456e-05,
+      "loss": 2.8537,
+      "step": 14500
+    },
+    {
+      "epoch": 25.260416666666668,
+      "grad_norm": 3.640625,
+      "learning_rate": 9.864505889099217e-05,
+      "loss": 2.8704,
+      "step": 14550
+    },
+    {
+      "epoch": 25.34722222222222,
+      "grad_norm": 5.5,
+      "learning_rate": 9.809876804746683e-05,
+      "loss": 2.8865,
+      "step": 14600
+    },
+    {
+      "epoch": 25.43402777777778,
+      "grad_norm": 6.6875,
+      "learning_rate": 9.755253395815116e-05,
+      "loss": 2.8648,
+      "step": 14650
+    },
+    {
+      "epoch": 25.520833333333332,
+      "grad_norm": 5.46875,
+      "learning_rate": 9.700637292883252e-05,
+      "loss": 2.8886,
+      "step": 14700
+    },
+    {
+      "epoch": 25.60763888888889,
+      "grad_norm": 4.96875,
+      "learning_rate": 9.646030126311743e-05,
+      "loss": 2.872,
+      "step": 14750
+    },
+    {
+      "epoch": 25.694444444444443,
+      "grad_norm": 7.9375,
+      "learning_rate": 9.591433526194474e-05,
+      "loss": 2.8698,
+      "step": 14800
+    },
+    {
+      "epoch": 25.78125,
+      "grad_norm": 5.0,
+      "learning_rate": 9.536849122309901e-05,
+      "loss": 2.8718,
+      "step": 14850
+    },
+    {
+      "epoch": 25.868055555555557,
+      "grad_norm": 6.1875,
+      "learning_rate": 9.482278544072425e-05,
+      "loss": 2.8712,
+      "step": 14900
+    },
+    {
+      "epoch": 25.95486111111111,
+      "grad_norm": 5.34375,
+      "learning_rate": 9.427723420483717e-05,
+      "loss": 2.8674,
+      "step": 14950
+    },
+    {
+      "epoch": 26.0,
+      "eval_loss": 2.855642080307007,
+      "eval_runtime": 41.248,
+      "eval_samples_per_second": 90.404,
+      "eval_steps_per_second": 5.673,
+      "step": 14976
+    },
+    {
+      "epoch": 26.041666666666668,
+      "grad_norm": 6.46875,
+      "learning_rate": 9.373185380084113e-05,
+      "loss": 2.8681,
+      "step": 15000
+    },
+    {
+      "epoch": 26.12847222222222,
+      "grad_norm": 6.21875,
+      "learning_rate": 9.318666050903988e-05,
+      "loss": 2.8659,
+      "step": 15050
+    },
+    {
+      "epoch": 26.21527777777778,
+      "grad_norm": 5.65625,
+      "learning_rate": 9.264167060415178e-05,
+      "loss": 2.857,
+      "step": 15100
+    },
+    {
+      "epoch": 26.302083333333332,
+      "grad_norm": 4.4375,
+      "learning_rate": 9.209690035482372e-05,
+      "loss": 2.8821,
+      "step": 15150
+    },
+    {
+      "epoch": 26.38888888888889,
+      "grad_norm": 6.0,
+      "learning_rate": 9.155236602314552e-05,
+      "loss": 2.8707,
+      "step": 15200
+    },
+    {
+      "epoch": 26.475694444444443,
+      "grad_norm": 4.53125,
+      "learning_rate": 9.100808386416475e-05,
+      "loss": 2.8819,
+      "step": 15250
+    },
+    {
+      "epoch": 26.5625,
+      "grad_norm": 4.125,
+      "learning_rate": 9.046407012540115e-05,
+      "loss": 2.8716,
+      "step": 15300
+    },
+    {
+      "epoch": 26.649305555555557,
+      "grad_norm": 5.8125,
+      "learning_rate": 8.992034104636183e-05,
+      "loss": 2.8758,
+      "step": 15350
+    },
+    {
+      "epoch": 26.73611111111111,
+      "grad_norm": 4.84375,
+      "learning_rate": 8.937691285805634e-05,
+      "loss": 2.8716,
+      "step": 15400
+    },
+    {
+      "epoch": 26.822916666666668,
+      "grad_norm": 4.8125,
+      "learning_rate": 8.883380178251249e-05,
+      "loss": 2.8792,
+      "step": 15450
+    },
+    {
+      "epoch": 26.90972222222222,
+      "grad_norm": 4.59375,
+      "learning_rate": 8.829102403229163e-05,
+      "loss": 2.8585,
+      "step": 15500
+    },
+    {
+      "epoch": 26.99652777777778,
+      "grad_norm": 7.09375,
+      "learning_rate": 8.774859581000504e-05,
+      "loss": 2.8683,
+      "step": 15550
+    },
+    {
+      "epoch": 27.0,
+      "eval_loss": 2.8553037643432617,
+      "eval_runtime": 41.718,
+      "eval_samples_per_second": 89.386,
+      "eval_steps_per_second": 5.609,
+      "step": 15552
+    },
+    {
+      "epoch": 27.083333333333332,
+      "grad_norm": 4.78125,
+      "learning_rate": 8.720653330783013e-05,
+      "loss": 2.8705,
+      "step": 15600
+    },
+    {
+      "epoch": 27.17013888888889,
+      "grad_norm": 4.5,
+      "learning_rate": 8.666485270702704e-05,
+      "loss": 2.8559,
+      "step": 15650
+    },
+    {
+      "epoch": 27.256944444444443,
+      "grad_norm": 4.03125,
+      "learning_rate": 8.612357017745578e-05,
+      "loss": 2.87,
+      "step": 15700
+    },
+    {
+      "epoch": 27.34375,
+      "grad_norm": 5.65625,
+      "learning_rate": 8.558270187709328e-05,
+      "loss": 2.8804,
+      "step": 15750
+    },
+    {
+      "epoch": 27.430555555555557,
+      "grad_norm": 5.15625,
+      "learning_rate": 8.504226395155132e-05,
+      "loss": 2.8634,
+      "step": 15800
+    },
+    {
+      "epoch": 27.51736111111111,
+      "grad_norm": 5.40625,
+      "learning_rate": 8.450227253359439e-05,
+      "loss": 2.8878,
+      "step": 15850
+    },
+    {
+      "epoch": 27.604166666666668,
+      "grad_norm": 4.15625,
+      "learning_rate": 8.39627437426581e-05,
+      "loss": 2.8713,
+      "step": 15900
+    },
+    {
+      "epoch": 27.69097222222222,
+      "grad_norm": 4.84375,
+      "learning_rate": 8.34236936843682e-05,
+      "loss": 2.8689,
+      "step": 15950
+    },
+    {
+      "epoch": 27.77777777777778,
+      "grad_norm": 5.34375,
+      "learning_rate": 8.28851384500595e-05,
+      "loss": 2.8706,
+      "step": 16000
+    },
+    {
+      "epoch": 27.864583333333332,
+      "grad_norm": 4.3125,
+      "learning_rate": 8.234709411629572e-05,
+      "loss": 2.8689,
+      "step": 16050
+    },
+    {
+      "epoch": 27.95138888888889,
+      "grad_norm": 8.125,
+      "learning_rate": 8.180957674438966e-05,
+      "loss": 2.8677,
+      "step": 16100
+    },
+    {
+      "epoch": 28.0,
+      "eval_loss": 2.8553411960601807,
+      "eval_runtime": 41.9128,
+      "eval_samples_per_second": 88.97,
+      "eval_steps_per_second": 5.583,
+      "step": 16128
+    },
+    {
+      "epoch": 28.038194444444443,
+      "grad_norm": 4.34375,
+      "learning_rate": 8.12726023799235e-05,
+      "loss": 2.8652,
+      "step": 16150
+    },
+    {
+      "epoch": 28.125,
+      "grad_norm": 5.125,
+      "learning_rate": 8.073618705226998e-05,
+      "loss": 2.8667,
+      "step": 16200
+    },
+    {
+      "epoch": 28.211805555555557,
+      "grad_norm": 4.34375,
+      "learning_rate": 8.020034677411386e-05,
+      "loss": 2.8591,
+      "step": 16250
+    },
+    {
+      "epoch": 28.29861111111111,
+      "grad_norm": 4.9375,
+      "learning_rate": 7.966509754097404e-05,
+      "loss": 2.8778,
+      "step": 16300
+    },
+    {
+      "epoch": 28.385416666666668,
+      "grad_norm": 4.0625,
+      "learning_rate": 7.913045533072587e-05,
+      "loss": 2.8716,
+      "step": 16350
+    },
+    {
+      "epoch": 28.47222222222222,
+      "grad_norm": 5.125,
+      "learning_rate": 7.859643610312424e-05,
+      "loss": 2.8786,
+      "step": 16400
+    },
+    {
+      "epoch": 28.55902777777778,
+      "grad_norm": 5.375,
+      "learning_rate": 7.80630557993274e-05,
+      "loss": 2.8746,
+      "step": 16450
+    },
+    {
+      "epoch": 28.645833333333332,
+      "grad_norm": 4.75,
+      "learning_rate": 7.753033034142075e-05,
+      "loss": 2.871,
+      "step": 16500
+    },
+    {
+      "epoch": 28.73263888888889,
+      "grad_norm": 5.09375,
+      "learning_rate": 7.69982756319417e-05,
+      "loss": 2.8704,
+      "step": 16550
+    },
+    {
+      "epoch": 28.819444444444443,
+      "grad_norm": 5.03125,
+      "learning_rate": 7.646690755340504e-05,
+      "loss": 2.8813,
+      "step": 16600
+    },
+    {
+      "epoch": 28.90625,
+      "grad_norm": 4.53125,
+      "learning_rate": 7.59362419678287e-05,
+      "loss": 2.8563,
+      "step": 16650
+    },
+    {
+      "epoch": 28.993055555555557,
+      "grad_norm": 4.53125,
+      "learning_rate": 7.540629471626026e-05,
+      "loss": 2.868,
+      "step": 16700
+    },
+    {
+      "epoch": 29.0,
+      "eval_loss": 2.8549838066101074,
+      "eval_runtime": 40.2288,
+      "eval_samples_per_second": 92.695,
+      "eval_steps_per_second": 5.817,
+      "step": 16704
+    },
+    {
+      "epoch": 29.07986111111111,
+      "grad_norm": 4.90625,
+      "learning_rate": 7.48770816183042e-05,
+      "loss": 2.869,
+      "step": 16750
+    },
+    {
+      "epoch": 29.166666666666668,
+      "grad_norm": 4.1875,
+      "learning_rate": 7.434861847164955e-05,
+      "loss": 2.8525,
+      "step": 16800
+    },
+    {
+      "epoch": 29.25347222222222,
+      "grad_norm": 4.125,
+      "learning_rate": 7.382092105159825e-05,
+      "loss": 2.868,
+      "step": 16850
+    },
+    {
+      "epoch": 29.34027777777778,
+      "grad_norm": 6.125,
+      "learning_rate": 7.329400511059442e-05,
+      "loss": 2.8797,
+      "step": 16900
+    },
+    {
+      "epoch": 29.427083333333332,
+      "grad_norm": 4.71875,
+      "learning_rate": 7.276788637775393e-05,
+      "loss": 2.8629,
+      "step": 16950
+    },
+    {
+      "epoch": 29.51388888888889,
+      "grad_norm": 3.90625,
+      "learning_rate": 7.224258055839509e-05,
+      "loss": 2.8888,
+      "step": 17000
+    },
+    {
+      "epoch": 29.600694444444443,
+      "grad_norm": 4.8125,
+      "learning_rate": 7.171810333356961e-05,
+      "loss": 2.869,
+      "step": 17050
+    },
+    {
+      "epoch": 29.6875,
+      "grad_norm": 5.0625,
+      "learning_rate": 7.119447035959457e-05,
+      "loss": 2.8709,
+      "step": 17100
+    },
+    {
+      "epoch": 29.774305555555557,
+      "grad_norm": 3.828125,
+      "learning_rate": 7.067169726758522e-05,
+      "loss": 2.8669,
+      "step": 17150
+    },
+    {
+      "epoch": 29.86111111111111,
+      "grad_norm": 5.5625,
+      "learning_rate": 7.014979966298808e-05,
+      "loss": 2.8698,
+      "step": 17200
+    },
+    {
+      "epoch": 29.947916666666668,
+      "grad_norm": 3.546875,
+      "learning_rate": 6.962879312511531e-05,
+      "loss": 2.8669,
+      "step": 17250
+    },
+    {
+      "epoch": 30.0,
+      "eval_loss": 2.854860544204712,
+      "eval_runtime": 41.7924,
+      "eval_samples_per_second": 89.227,
+      "eval_steps_per_second": 5.599,
+      "step": 17280
+    },
+    {
+      "epoch": 30.03472222222222,
+      "grad_norm": 5.53125,
+      "learning_rate": 6.910869320667955e-05,
+      "loss": 2.8649,
+      "step": 17300
+    },
+    {
+      "epoch": 30.12152777777778,
+      "grad_norm": 6.15625,
+      "learning_rate": 6.858951543332978e-05,
+      "loss": 2.8648,
+      "step": 17350
+    },
+    {
+      "epoch": 30.208333333333332,
+      "grad_norm": 5.3125,
+      "learning_rate": 6.807127530318771e-05,
+      "loss": 2.8618,
+      "step": 17400
+    },
+    {
+      "epoch": 30.29513888888889,
+      "grad_norm": 4.625,
+      "learning_rate": 6.755398828638512e-05,
+      "loss": 2.8748,
+      "step": 17450
+    },
+    {
+      "epoch": 30.381944444444443,
+      "grad_norm": 4.6875,
+      "learning_rate": 6.703766982460231e-05,
+      "loss": 2.8702,
+      "step": 17500
+    },
+    {
+      "epoch": 30.46875,
+      "grad_norm": 4.5,
+      "learning_rate": 6.652233533060683e-05,
+      "loss": 2.8766,
+      "step": 17550
+    },
+    {
+      "epoch": 30.555555555555557,
+      "grad_norm": 4.65625,
+      "learning_rate": 6.600800018779356e-05,
+      "loss": 2.8766,
+      "step": 17600
+    },
+    {
+      "epoch": 30.64236111111111,
+      "grad_norm": 5.53125,
+      "learning_rate": 6.549467974972552e-05,
+      "loss": 2.8674,
+      "step": 17650
+    },
+    {
+      "epoch": 30.729166666666668,
+      "grad_norm": 5.28125,
+      "learning_rate": 6.498238933967544e-05,
+      "loss": 2.868,
+      "step": 17700
+    },
+    {
+      "epoch": 30.81597222222222,
+      "grad_norm": 4.09375,
+      "learning_rate": 6.44711442501684e-05,
+      "loss": 2.8798,
+      "step": 17750
+    },
+    {
+      "epoch": 30.90277777777778,
+      "grad_norm": 6.03125,
+      "learning_rate": 6.396095974252534e-05,
+      "loss": 2.8578,
+      "step": 17800
+    },
+    {
+      "epoch": 30.989583333333332,
+      "grad_norm": 5.59375,
+      "learning_rate": 6.345185104640747e-05,
+      "loss": 2.8672,
+      "step": 17850
+    },
+    {
+      "epoch": 31.0,
+      "eval_loss": 2.8543925285339355,
+      "eval_runtime": 41.2327,
+      "eval_samples_per_second": 90.438,
+      "eval_steps_per_second": 5.675,
+      "step": 17856
+    },
+    {
+      "epoch": 31.07638888888889,
+      "grad_norm": 4.625,
+      "learning_rate": 6.294383335936167e-05,
+      "loss": 2.87,
+      "step": 17900
+    },
+    {
+      "epoch": 31.163194444444443,
+      "grad_norm": 3.78125,
+      "learning_rate": 6.24369218463667e-05,
+      "loss": 2.8516,
+      "step": 17950
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 5.4375,
+      "learning_rate": 6.193113163938075e-05,
+      "loss": 2.8673,
+      "step": 18000
+    },
+    {
+      "epoch": 31.336805555555557,
+      "grad_norm": 4.4375,
+      "learning_rate": 6.14264778368895e-05,
+      "loss": 2.8794,
+      "step": 18050
+    },
+    {
+      "epoch": 31.42361111111111,
+      "grad_norm": 5.125,
+      "learning_rate": 6.092297550345554e-05,
+      "loss": 2.8634,
+      "step": 18100
+    },
+    {
+      "epoch": 31.510416666666668,
+      "grad_norm": 5.46875,
+      "learning_rate": 6.0420639669268544e-05,
+      "loss": 2.8904,
+      "step": 18150
+    },
+    {
+      "epoch": 31.59722222222222,
+      "grad_norm": 4.21875,
+      "learning_rate": 5.991948532969685e-05,
+      "loss": 2.8651,
+      "step": 18200
+    },
+    {
+      "epoch": 31.68402777777778,
+      "grad_norm": 4.6875,
+      "learning_rate": 5.9419527444839515e-05,
+      "loss": 2.8727,
+      "step": 18250
+    },
+    {
+      "epoch": 31.770833333333332,
+      "grad_norm": 3.765625,
+      "learning_rate": 5.8920780939079955e-05,
+      "loss": 2.8645,
+      "step": 18300
+    },
+    {
+      "epoch": 31.85763888888889,
+      "grad_norm": 6.28125,
+      "learning_rate": 5.8423260700640417e-05,
+      "loss": 2.8713,
+      "step": 18350
+    },
+    {
+      "epoch": 31.944444444444443,
+      "grad_norm": 6.9375,
+      "learning_rate": 5.792698158113742e-05,
+      "loss": 2.8634,
+      "step": 18400
+    },
+    {
+      "epoch": 32.0,
+      "eval_loss": 2.8544044494628906,
+      "eval_runtime": 40.4905,
+      "eval_samples_per_second": 92.096,
+      "eval_steps_per_second": 5.779,
+      "step": 18432
+    },
+    {
+      "epoch": 32.03125,
+      "grad_norm": 3.84375,
+      "learning_rate": 5.743195839513852e-05,
+      "loss": 2.8657,
+      "step": 18450
+    },
+    {
+      "epoch": 32.11805555555556,
+      "grad_norm": 4.65625,
+      "learning_rate": 5.693820591971996e-05,
+      "loss": 2.8633,
+      "step": 18500
+    },
+    {
+      "epoch": 32.204861111111114,
+      "grad_norm": 5.25,
+      "learning_rate": 5.644573889402589e-05,
+      "loss": 2.8595,
+      "step": 18550
+    },
+    {
+      "epoch": 32.291666666666664,
+      "grad_norm": 4.8125,
+      "learning_rate": 5.5954572018827846e-05,
+      "loss": 2.8737,
+      "step": 18600
+    },
+    {
+      "epoch": 32.37847222222222,
+      "grad_norm": 5.46875,
+      "learning_rate": 5.5464719956086396e-05,
+      "loss": 2.8722,
+      "step": 18650
+    },
+    {
+      "epoch": 32.46527777777778,
+      "grad_norm": 5.15625,
+      "learning_rate": 5.49761973285132e-05,
+      "loss": 2.871,
+      "step": 18700
+    },
+    {
+      "epoch": 32.552083333333336,
+      "grad_norm": 4.1875,
+      "learning_rate": 5.4489018719134654e-05,
+      "loss": 2.8801,
+      "step": 18750
+    },
+    {
+      "epoch": 32.638888888888886,
+      "grad_norm": 4.875,
+      "learning_rate": 5.400319867085633e-05,
+      "loss": 2.8668,
+      "step": 18800
+    },
+    {
+      "epoch": 32.72569444444444,
+      "grad_norm": 4.8125,
+      "learning_rate": 5.3518751686029134e-05,
+      "loss": 2.8673,
+      "step": 18850
+    },
+    {
+      "epoch": 32.8125,
+      "grad_norm": 3.828125,
+      "learning_rate": 5.303569222601626e-05,
+      "loss": 2.875,
+      "step": 18900
+    },
+    {
+      "epoch": 32.89930555555556,
+      "grad_norm": 4.0625,
+      "learning_rate": 5.25540347107615e-05,
+      "loss": 2.8596,
+      "step": 18950
+    },
+    {
+      "epoch": 32.986111111111114,
+      "grad_norm": 4.25,
+      "learning_rate": 5.207379351835875e-05,
+      "loss": 2.8683,
+      "step": 19000
+    },
+    {
+      "epoch": 33.0,
+      "eval_loss": 2.854464054107666,
+      "eval_runtime": 40.2584,
+      "eval_samples_per_second": 92.627,
+      "eval_steps_per_second": 5.812,
+      "step": 19008
+    },
+    {
+      "epoch": 33.072916666666664,
+      "grad_norm": 4.5,
+      "learning_rate": 5.1594982984622906e-05,
+      "loss": 2.8657,
+      "step": 19050
+    },
+    {
+      "epoch": 33.15972222222222,
+      "grad_norm": 5.78125,
+      "learning_rate": 5.1117617402661865e-05,
+      "loss": 2.8538,
+      "step": 19100
+    },
+    {
+      "epoch": 33.24652777777778,
+      "grad_norm": 4.25,
+      "learning_rate": 5.064171102244985e-05,
+      "loss": 2.8671,
+      "step": 19150
+    },
+    {
+      "epoch": 33.333333333333336,
+      "grad_norm": 5.125,
+      "learning_rate": 5.0167278050402075e-05,
+      "loss": 2.879,
+      "step": 19200
+    },
+    {
+      "epoch": 33.420138888888886,
+      "grad_norm": 3.890625,
+      "learning_rate": 4.9694332648950536e-05,
+      "loss": 2.8637,
+      "step": 19250
+    },
+    {
+      "epoch": 33.50694444444444,
+      "grad_norm": 3.71875,
+      "learning_rate": 4.9222888936121494e-05,
+      "loss": 2.8891,
+      "step": 19300
+    },
+    {
+      "epoch": 33.59375,
+      "grad_norm": 3.578125,
+      "learning_rate": 4.875296098511365e-05,
+      "loss": 2.864,
+      "step": 19350
+    },
+    {
+      "epoch": 33.68055555555556,
+      "grad_norm": 4.9375,
+      "learning_rate": 4.828456282387859e-05,
+      "loss": 2.8731,
+      "step": 19400
+    },
+    {
+      "epoch": 33.767361111111114,
+      "grad_norm": 4.3125,
+      "learning_rate": 4.781770843470144e-05,
+      "loss": 2.8677,
+      "step": 19450
+    },
+    {
+      "epoch": 33.854166666666664,
+      "grad_norm": 5.15625,
+      "learning_rate": 4.735241175378386e-05,
+      "loss": 2.8649,
+      "step": 19500
+    },
+    {
+      "epoch": 33.94097222222222,
+      "grad_norm": 4.15625,
+      "learning_rate": 4.688868667082794e-05,
+      "loss": 2.8629,
+      "step": 19550
+    },
+    {
+      "epoch": 34.0,
+      "eval_loss": 2.8541414737701416,
+      "eval_runtime": 39.773,
+      "eval_samples_per_second": 93.757,
+      "eval_steps_per_second": 5.883,
+      "step": 19584
+    },
+    {
+      "epoch": 34.02777777777778,
+      "grad_norm": 4.09375,
+      "learning_rate": 4.642654702862157e-05,
+      "loss": 2.8661,
+      "step": 19600
+    },
+    {
+      "epoch": 34.114583333333336,
+      "grad_norm": 3.53125,
+      "learning_rate": 4.596600662262508e-05,
+      "loss": 2.8641,
+      "step": 19650
+    },
+    {
+      "epoch": 34.201388888888886,
+      "grad_norm": 4.46875,
+      "learning_rate": 4.55070792005597e-05,
+      "loss": 2.8574,
+      "step": 19700
+    },
+    {
+      "epoch": 34.28819444444444,
+      "grad_norm": 3.828125,
+      "learning_rate": 4.5049778461996926e-05,
+      "loss": 2.8735,
+      "step": 19750
+    },
+    {
+      "epoch": 34.375,
+      "grad_norm": 6.65625,
+      "learning_rate": 4.459411805794976e-05,
+      "loss": 2.8731,
+      "step": 19800
+    },
+    {
+      "epoch": 34.46180555555556,
+      "grad_norm": 4.25,
+      "learning_rate": 4.414011159046495e-05,
+      "loss": 2.8719,
+      "step": 19850
+    },
+    {
+      "epoch": 34.548611111111114,
+      "grad_norm": 3.90625,
+      "learning_rate": 4.368777261221737e-05,
+      "loss": 2.8769,
+      "step": 19900
+    },
+    {
+      "epoch": 34.635416666666664,
+      "grad_norm": 4.0625,
+      "learning_rate": 4.323711462610495e-05,
+      "loss": 2.8679,
+      "step": 19950
+    },
+    {
+      "epoch": 34.72222222222222,
+      "grad_norm": 3.953125,
+      "learning_rate": 4.278815108484602e-05,
+      "loss": 2.8681,
+      "step": 20000
+    },
+    {
+      "epoch": 34.80902777777778,
+      "grad_norm": 4.09375,
+      "learning_rate": 4.234089539057745e-05,
+      "loss": 2.8744,
+      "step": 20050
+    },
+    {
+      "epoch": 34.895833333333336,
+      "grad_norm": 4.03125,
+      "learning_rate": 4.1895360894454774e-05,
+      "loss": 2.8615,
+      "step": 20100
+    },
+    {
+      "epoch": 34.982638888888886,
+      "grad_norm": 4.21875,
+      "learning_rate": 4.1451560896253515e-05,
+      "loss": 2.8641,
+      "step": 20150
+    },
+    {
+      "epoch": 35.0,
+      "eval_loss": 2.8540618419647217,
+      "eval_runtime": 41.6189,
+      "eval_samples_per_second": 89.599,
+      "eval_steps_per_second": 5.622,
+      "step": 20160
+    },
+    {
+      "epoch": 35.06944444444444,
+      "grad_norm": 4.1875,
+      "learning_rate": 4.100950864397223e-05,
+      "loss": 2.8629,
+      "step": 20200
+    },
+    {
+      "epoch": 35.15625,
+      "grad_norm": 3.484375,
+      "learning_rate": 4.056921733343704e-05,
+      "loss": 2.8579,
+      "step": 20250
+    },
+    {
+      "epoch": 35.24305555555556,
+      "grad_norm": 5.28125,
+      "learning_rate": 4.013070010790759e-05,
+      "loss": 2.8641,
+      "step": 20300
+    },
+    {
+      "epoch": 35.329861111111114,
+      "grad_norm": 4.25,
+      "learning_rate": 3.9693970057684984e-05,
+      "loss": 2.8801,
+      "step": 20350
+    },
+    {
+      "epoch": 35.416666666666664,
+      "grad_norm": 4.84375,
+      "learning_rate": 3.9259040219720645e-05,
+      "loss": 2.8614,
+      "step": 20400
+    },
+    {
+      "epoch": 35.50347222222222,
+      "grad_norm": 5.21875,
+      "learning_rate": 3.882592357722761e-05,
+      "loss": 2.8902,
+      "step": 20450
+    },
+    {
+      "epoch": 35.59027777777778,
+      "grad_norm": 4.71875,
+      "learning_rate": 3.839463305929247e-05,
+      "loss": 2.8626,
+      "step": 20500
+    },
+    {
+      "epoch": 35.677083333333336,
+      "grad_norm": 4.3125,
+      "learning_rate": 3.7965181540489794e-05,
+      "loss": 2.8741,
+      "step": 20550
+    },
+    {
+      "epoch": 35.763888888888886,
+      "grad_norm": 3.46875,
+      "learning_rate": 3.753758184049764e-05,
+      "loss": 2.8656,
+      "step": 20600
+    },
+    {
+      "epoch": 35.85069444444444,
+      "grad_norm": 5.03125,
+      "learning_rate": 3.7111846723714916e-05,
+      "loss": 2.8661,
+      "step": 20650
+    },
+    {
+      "epoch": 35.9375,
+      "grad_norm": 3.25,
+      "learning_rate": 3.668798889888022e-05,
+      "loss": 2.8597,
+      "step": 20700
+    },
+    {
+      "epoch": 36.0,
+      "eval_loss": 2.853997230529785,
+      "eval_runtime": 41.7365,
+      "eval_samples_per_second": 89.346,
+      "eval_steps_per_second": 5.607,
+      "step": 20736
+    },
+    {
+      "epoch": 36.02430555555556,
+      "grad_norm": 4.5,
+      "learning_rate": 3.626602101869281e-05,
+      "loss": 2.8674,
+      "step": 20750
+    },
+    {
+      "epoch": 36.111111111111114,
+      "grad_norm": 5.15625,
+      "learning_rate": 3.5845955679434426e-05,
+      "loss": 2.8631,
+      "step": 20800
+    },
+    {
+      "epoch": 36.197916666666664,
+      "grad_norm": 3.6875,
+      "learning_rate": 3.542780542059373e-05,
+      "loss": 2.8576,
+      "step": 20850
+    },
+    {
+      "epoch": 36.28472222222222,
+      "grad_norm": 3.515625,
+      "learning_rate": 3.501158272449155e-05,
+      "loss": 2.8715,
+      "step": 20900
+    },
+    {
+      "epoch": 36.37152777777778,
+      "grad_norm": 3.4375,
+      "learning_rate": 3.45973000159088e-05,
+      "loss": 2.8754,
+      "step": 20950
+    },
+    {
+      "epoch": 36.458333333333336,
+      "grad_norm": 3.671875,
+      "learning_rate": 3.418496966171498e-05,
+      "loss": 2.8721,
+      "step": 21000
+    },
+    {
+      "epoch": 36.545138888888886,
+      "grad_norm": 3.84375,
+      "learning_rate": 3.377460397049951e-05,
+      "loss": 2.8741,
+      "step": 21050
+    },
+    {
+      "epoch": 36.63194444444444,
+      "grad_norm": 4.3125,
+      "learning_rate": 3.336621519220404e-05,
+      "loss": 2.8717,
+      "step": 21100
+    },
+    {
+      "epoch": 36.71875,
+      "grad_norm": 3.625,
+      "learning_rate": 3.295981551775679e-05,
+      "loss": 2.8655,
+      "step": 21150
+    },
+    {
+      "epoch": 36.80555555555556,
+      "grad_norm": 3.46875,
+      "learning_rate": 3.255541707870874e-05,
+      "loss": 2.8748,
+      "step": 21200
+    },
+    {
+      "epoch": 36.892361111111114,
+      "grad_norm": 3.640625,
+      "learning_rate": 3.2153031946871427e-05,
+      "loss": 2.8598,
+      "step": 21250
+    },
+    {
+      "epoch": 36.979166666666664,
+      "grad_norm": 3.875,
+      "learning_rate": 3.1752672133956596e-05,
+      "loss": 2.8632,
+      "step": 21300
+    },
+    {
+      "epoch": 37.0,
+      "eval_loss": 2.854156017303467,
+      "eval_runtime": 40.6449,
+      "eval_samples_per_second": 91.746,
+      "eval_steps_per_second": 5.757,
+      "step": 21312
+    },
+    {
+      "epoch": 37.06597222222222,
+      "grad_norm": 4.125,
+      "learning_rate": 3.135434959121756e-05,
+      "loss": 2.8613,
+      "step": 21350
+    },
+    {
+      "epoch": 37.15277777777778,
+      "grad_norm": 5.53125,
+      "learning_rate": 3.095807620909257e-05,
+      "loss": 2.859,
+      "step": 21400
+    },
+    {
+      "epoch": 37.239583333333336,
+      "grad_norm": 4.15625,
+      "learning_rate": 3.0563863816849795e-05,
+      "loss": 2.8618,
+      "step": 21450
+    },
+    {
+      "epoch": 37.326388888888886,
+      "grad_norm": 4.0625,
+      "learning_rate": 3.017172418223424e-05,
+      "loss": 2.8817,
+      "step": 21500
+    },
+    {
+      "epoch": 37.41319444444444,
+      "grad_norm": 3.953125,
+      "learning_rate": 2.9781669011116364e-05,
+      "loss": 2.8609,
+      "step": 21550
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 3.96875,
+      "learning_rate": 2.939370994714278e-05,
+      "loss": 2.8872,
+      "step": 21600
+    },
+    {
+      "epoch": 37.58680555555556,
+      "grad_norm": 4.09375,
+      "learning_rate": 2.90078585713886e-05,
+      "loss": 2.864,
+      "step": 21650
+    },
+    {
+      "epoch": 37.673611111111114,
+      "grad_norm": 3.421875,
+      "learning_rate": 2.8624126402011798e-05,
+      "loss": 2.8757,
+      "step": 21700
+    },
+    {
+      "epoch": 37.760416666666664,
+      "grad_norm": 3.40625,
+      "learning_rate": 2.8242524893909162e-05,
+      "loss": 2.8623,
+      "step": 21750
+    },
+    {
+      "epoch": 37.84722222222222,
+      "grad_norm": 3.28125,
+      "learning_rate": 2.7863065438374748e-05,
+      "loss": 2.8695,
+      "step": 21800
+    },
+    {
+      "epoch": 37.93402777777778,
+      "grad_norm": 4.4375,
+      "learning_rate": 2.7485759362759378e-05,
+      "loss": 2.8596,
+      "step": 21850
+    },
+    {
+      "epoch": 38.0,
+      "eval_loss": 2.8540520668029785,
+      "eval_runtime": 41.5194,
+      "eval_samples_per_second": 89.813,
+      "eval_steps_per_second": 5.636,
+      "step": 21888
+    },
+    {
+      "epoch": 38.020833333333336,
+      "grad_norm": 3.25,
+      "learning_rate": 2.7110617930132877e-05,
+      "loss": 2.8658,
+      "step": 21900
+    },
+    {
+      "epoch": 38.107638888888886,
+      "grad_norm": 3.1875,
+      "learning_rate": 2.673765233894755e-05,
+      "loss": 2.8632,
+      "step": 21950
+    },
+    {
+      "epoch": 38.19444444444444,
+      "grad_norm": 3.765625,
+      "learning_rate": 2.6366873722704265e-05,
+      "loss": 2.8583,
+      "step": 22000
+    },
+    {
+      "epoch": 38.28125,
+      "grad_norm": 3.640625,
+      "learning_rate": 2.599829314961967e-05,
+      "loss": 2.8678,
+      "step": 22050
+    },
+    {
+      "epoch": 38.36805555555556,
+      "grad_norm": 3.609375,
+      "learning_rate": 2.5631921622296128e-05,
+      "loss": 2.8777,
+      "step": 22100
+    },
+    {
+      "epoch": 38.454861111111114,
+      "grad_norm": 3.15625,
+      "learning_rate": 2.526777007739316e-05,
+      "loss": 2.8671,
+      "step": 22150
+    },
+    {
+      "epoch": 38.541666666666664,
+      "grad_norm": 3.53125,
+      "learning_rate": 2.4905849385300883e-05,
+      "loss": 2.8782,
+      "step": 22200
+    },
+    {
+      "epoch": 38.62847222222222,
+      "grad_norm": 3.46875,
+      "learning_rate": 2.4546170349815666e-05,
+      "loss": 2.8699,
+      "step": 22250
+    },
+    {
+      "epoch": 38.71527777777778,
+      "grad_norm": 3.8125,
+      "learning_rate": 2.418874370781754e-05,
+      "loss": 2.8658,
+      "step": 22300
+    },
+    {
+      "epoch": 38.802083333333336,
+      "grad_norm": 3.9375,
+      "learning_rate": 2.3833580128949762e-05,
+      "loss": 2.8749,
+      "step": 22350
+    },
+    {
+      "epoch": 38.888888888888886,
+      "grad_norm": 3.75,
+      "learning_rate": 2.3480690215300105e-05,
+      "loss": 2.8573,
+      "step": 22400
+    },
+    {
+      "epoch": 38.97569444444444,
+      "grad_norm": 4.75,
+      "learning_rate": 2.313008450108468e-05,
+      "loss": 2.8656,
+      "step": 22450
+    },
+    {
+      "epoch": 39.0,
+      "eval_loss": 2.854092597961426,
+      "eval_runtime": 41.0282,
+      "eval_samples_per_second": 90.889,
+      "eval_steps_per_second": 5.703,
+      "step": 22464
+    },
+    {
+      "epoch": 39.0625,
+      "grad_norm": 3.40625,
+      "learning_rate": 2.278177345233323e-05,
+      "loss": 2.8622,
+      "step": 22500
+    },
+    {
+      "epoch": 39.14930555555556,
+      "grad_norm": 3.625,
+      "learning_rate": 2.2435767466576863e-05,
+      "loss": 2.8578,
+      "step": 22550
+    },
+    {
+      "epoch": 39.236111111111114,
+      "grad_norm": 4.15625,
+      "learning_rate": 2.209207687253746e-05,
+      "loss": 2.8602,
+      "step": 22600
+    },
+    {
+      "epoch": 39.322916666666664,
+      "grad_norm": 2.921875,
+      "learning_rate": 2.1750711929819723e-05,
+      "loss": 2.8825,
+      "step": 22650
+    },
+    {
+      "epoch": 39.40972222222222,
+      "grad_norm": 3.421875,
+      "learning_rate": 2.1411682828604452e-05,
+      "loss": 2.8618,
+      "step": 22700
+    },
+    {
+      "epoch": 39.49652777777778,
+      "grad_norm": 3.6875,
+      "learning_rate": 2.1074999689344755e-05,
+      "loss": 2.8834,
+      "step": 22750
+    },
+    {
+      "epoch": 39.583333333333336,
+      "grad_norm": 4.46875,
+      "learning_rate": 2.0740672562463602e-05,
+      "loss": 2.8664,
+      "step": 22800
+    },
+    {
+      "epoch": 39.670138888888886,
+      "grad_norm": 3.5625,
+      "learning_rate": 2.0408711428054195e-05,
+      "loss": 2.8771,
+      "step": 22850
+    },
+    {
+      "epoch": 39.75694444444444,
+      "grad_norm": 3.390625,
+      "learning_rate": 2.0079126195581612e-05,
+      "loss": 2.8629,
+      "step": 22900
+    },
+    {
+      "epoch": 39.84375,
+      "grad_norm": 4.28125,
+      "learning_rate": 1.9751926703587353e-05,
+      "loss": 2.867,
+      "step": 22950
+    },
+    {
+      "epoch": 39.93055555555556,
+      "grad_norm": 3.390625,
+      "learning_rate": 1.9427122719395452e-05,
+      "loss": 2.8591,
+      "step": 23000
+    },
+    {
+      "epoch": 40.0,
+      "eval_loss": 2.85404634475708,
+      "eval_runtime": 42.4648,
+      "eval_samples_per_second": 87.814,
+      "eval_steps_per_second": 5.51,
+      "step": 23040
+    },
+    {
+      "epoch": 40.017361111111114,
+      "grad_norm": 4.71875,
+      "learning_rate": 1.9104723938821012e-05,
+      "loss": 2.8661,
+      "step": 23050
+    },
+    {
+      "epoch": 40.104166666666664,
+      "grad_norm": 3.375,
+      "learning_rate": 1.8784739985880628e-05,
+      "loss": 2.8613,
+      "step": 23100
+    },
+    {
+      "epoch": 40.19097222222222,
+      "grad_norm": 3.078125,
+      "learning_rate": 1.8467180412505313e-05,
+      "loss": 2.8565,
+      "step": 23150
+    },
+    {
+      "epoch": 40.27777777777778,
+      "grad_norm": 3.171875,
+      "learning_rate": 1.8152054698255194e-05,
+      "loss": 2.8671,
+      "step": 23200
+    },
+    {
+      "epoch": 40.364583333333336,
+      "grad_norm": 3.609375,
+      "learning_rate": 1.7839372250036534e-05,
+      "loss": 2.8812,
+      "step": 23250
+    },
+    {
+      "epoch": 40.451388888888886,
+      "grad_norm": 3.9375,
+      "learning_rate": 1.7529142401821062e-05,
+      "loss": 2.8657,
+      "step": 23300
+    },
+    {
+      "epoch": 40.53819444444444,
+      "grad_norm": 3.875,
+      "learning_rate": 1.722137441436721e-05,
+      "loss": 2.8782,
+      "step": 23350
+    },
+    {
+      "epoch": 40.625,
+      "grad_norm": 3.984375,
+      "learning_rate": 1.6916077474943736e-05,
+      "loss": 2.8685,
+      "step": 23400
+    },
+    {
+      "epoch": 40.71180555555556,
+      "grad_norm": 2.90625,
+      "learning_rate": 1.66132606970554e-05,
+      "loss": 2.8671,
+      "step": 23450
+    },
+    {
+      "epoch": 40.798611111111114,
+      "grad_norm": 3.328125,
+      "learning_rate": 1.631293312017099e-05,
+      "loss": 2.8723,
+      "step": 23500
+    },
+    {
+      "epoch": 40.885416666666664,
+      "grad_norm": 3.171875,
+      "learning_rate": 1.6015103709453482e-05,
+      "loss": 2.8591,
+      "step": 23550
+    },
+    {
+      "epoch": 40.97222222222222,
+      "grad_norm": 3.375,
+      "learning_rate": 1.571978135549238e-05,
+      "loss": 2.8635,
+      "step": 23600
+    },
+    {
+      "epoch": 41.0,
+      "eval_loss": 2.8541696071624756,
+      "eval_runtime": 40.7796,
+      "eval_samples_per_second": 91.443,
+      "eval_steps_per_second": 5.738,
+      "step": 23616
+    },
+    {
+      "epoch": 41.05902777777778,
+      "grad_norm": 3.5625,
+      "learning_rate": 1.5426974874038247e-05,
+      "loss": 2.8627,
+      "step": 23650
+    },
+    {
+      "epoch": 41.145833333333336,
+      "grad_norm": 2.96875,
+      "learning_rate": 1.51366930057398e-05,
+      "loss": 2.8606,
+      "step": 23700
+    },
+    {
+      "epoch": 41.232638888888886,
+      "grad_norm": 3.25,
+      "learning_rate": 1.4848944415882648e-05,
+      "loss": 2.8608,
+      "step": 23750
+    },
+    {
+      "epoch": 41.31944444444444,
+      "grad_norm": 2.9375,
+      "learning_rate": 1.4563737694130885e-05,
+      "loss": 2.8802,
+      "step": 23800
+    },
+    {
+      "epoch": 41.40625,
+      "grad_norm": 3.21875,
+      "learning_rate": 1.4281081354270564e-05,
+      "loss": 2.8615,
+      "step": 23850
+    },
+    {
+      "epoch": 41.49305555555556,
+      "grad_norm": 3.59375,
+      "learning_rate": 1.4000983833955594e-05,
+      "loss": 2.8829,
+      "step": 23900
+    },
+    {
+      "epoch": 41.579861111111114,
+      "grad_norm": 2.828125,
+      "learning_rate": 1.3723453494455784e-05,
+      "loss": 2.8665,
+      "step": 23950
+    },
+    {
+      "epoch": 41.666666666666664,
+      "grad_norm": 3.015625,
+      "learning_rate": 1.3448498620407345e-05,
+      "loss": 2.8761,
+      "step": 24000
+    },
+    {
+      "epoch": 41.75347222222222,
+      "grad_norm": 4.03125,
+      "learning_rate": 1.3176127419565564e-05,
+      "loss": 2.8624,
+      "step": 24050
+    },
+    {
+      "epoch": 41.84027777777778,
+      "grad_norm": 3.09375,
+      "learning_rate": 1.2906348022559755e-05,
+      "loss": 2.8687,
+      "step": 24100
+    },
+    {
+      "epoch": 41.927083333333336,
+      "grad_norm": 3.078125,
+      "learning_rate": 1.2639168482650532e-05,
+      "loss": 2.8575,
+      "step": 24150
+    },
+    {
+      "epoch": 42.0,
+      "eval_loss": 2.854001045227051,
+      "eval_runtime": 40.3709,
+      "eval_samples_per_second": 92.368,
+      "eval_steps_per_second": 5.796,
+      "step": 24192
+    },
+    {
+      "epoch": 42.013888888888886,
+      "grad_norm": 3.96875,
+      "learning_rate": 1.2374596775489477e-05,
+      "loss": 2.8656,
+      "step": 24200
+    },
+    {
+      "epoch": 42.10069444444444,
+      "grad_norm": 3.4375,
+      "learning_rate": 1.2112640798881058e-05,
+      "loss": 2.8625,
+      "step": 24250
+    },
+    {
+      "epoch": 42.1875,
+      "grad_norm": 3.53125,
+      "learning_rate": 1.1853308372546756e-05,
+      "loss": 2.8571,
+      "step": 24300
+    },
+    {
+      "epoch": 42.27430555555556,
+      "grad_norm": 3.21875,
+      "learning_rate": 1.1596607237891766e-05,
+      "loss": 2.8664,
+      "step": 24350
+    },
+    {
+      "epoch": 42.361111111111114,
+      "grad_norm": 3.125,
+      "learning_rate": 1.1342545057773846e-05,
+      "loss": 2.881,
+      "step": 24400
+    },
+    {
+      "epoch": 42.447916666666664,
+      "grad_norm": 3.453125,
+      "learning_rate": 1.1091129416274603e-05,
+      "loss": 2.8614,
+      "step": 24450
+    },
+    {
+      "epoch": 42.53472222222222,
+      "grad_norm": 3.28125,
+      "learning_rate": 1.0842367818472988e-05,
+      "loss": 2.8773,
+      "step": 24500
+    },
+    {
+      "epoch": 42.62152777777778,
+      "grad_norm": 2.640625,
+      "learning_rate": 1.0596267690221496e-05,
+      "loss": 2.874,
+      "step": 24550
+    },
+    {
+      "epoch": 42.708333333333336,
+      "grad_norm": 4.0625,
+      "learning_rate": 1.0352836377924202e-05,
+      "loss": 2.8666,
+      "step": 24600
+    },
+    {
+      "epoch": 42.795138888888886,
+      "grad_norm": 3.203125,
+      "learning_rate": 1.0112081148317687e-05,
+      "loss": 2.8681,
+      "step": 24650
+    },
+    {
+      "epoch": 42.88194444444444,
+      "grad_norm": 3.15625,
+      "learning_rate": 9.874009188253974e-06,
+      "loss": 2.8575,
+      "step": 24700
+    },
+    {
+      "epoch": 42.96875,
+      "grad_norm": 3.046875,
+      "learning_rate": 9.63862760448616e-06,
+      "loss": 2.8666,
+      "step": 24750
+    },
+    {
+      "epoch": 43.0,
+      "eval_loss": 2.8540420532226562,
+      "eval_runtime": 42.3273,
+      "eval_samples_per_second": 88.099,
+      "eval_steps_per_second": 5.528,
+      "step": 24768
+    },
+    {
+      "epoch": 43.05555555555556,
+      "grad_norm": 3.265625,
+      "learning_rate": 9.405943423456043e-06,
+      "loss": 2.8636,
+      "step": 24800
+    },
+    {
+      "epoch": 43.142361111111114,
+      "grad_norm": 3.25,
+      "learning_rate": 9.175963591084546e-06,
+      "loss": 2.858,
+      "step": 24850
+    },
+    {
+      "epoch": 43.229166666666664,
+      "grad_norm": 4.59375,
+      "learning_rate": 8.948694972564343e-06,
+      "loss": 2.8629,
+      "step": 24900
+    },
+    {
+      "epoch": 43.31597222222222,
+      "grad_norm": 3.078125,
+      "learning_rate": 8.724144352154861e-06,
+      "loss": 2.8783,
+      "step": 24950
+    },
+    {
+      "epoch": 43.40277777777778,
+      "grad_norm": 3.515625,
+      "learning_rate": 8.502318432979806e-06,
+      "loss": 2.8623,
+      "step": 25000
+    },
+    {
+      "epoch": 43.489583333333336,
+      "grad_norm": 2.65625,
+      "learning_rate": 8.28322383682707e-06,
+      "loss": 2.8827,
+      "step": 25050
+    },
+    {
+      "epoch": 43.576388888888886,
+      "grad_norm": 3.203125,
+      "learning_rate": 8.066867103951082e-06,
+      "loss": 2.8631,
+      "step": 25100
+    },
+    {
+      "epoch": 43.66319444444444,
+      "grad_norm": 2.890625,
+      "learning_rate": 7.853254692877476e-06,
+      "loss": 2.8769,
+      "step": 25150
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 3.078125,
+      "learning_rate": 7.642392980210423e-06,
+      "loss": 2.8654,
+      "step": 25200
+    },
+    {
+      "epoch": 43.83680555555556,
+      "grad_norm": 3.21875,
+      "learning_rate": 7.4342882604422125e-06,
+      "loss": 2.87,
+      "step": 25250
+    },
+    {
+      "epoch": 43.923611111111114,
+      "grad_norm": 3.34375,
+      "learning_rate": 7.228946745765364e-06,
+      "loss": 2.8584,
+      "step": 25300
+    },
+    {
+      "epoch": 44.0,
+      "eval_loss": 2.8539493083953857,
+      "eval_runtime": 42.0373,
+      "eval_samples_per_second": 88.707,
+      "eval_steps_per_second": 5.566,
+      "step": 25344
+    },
+    {
+      "epoch": 44.010416666666664,
+      "grad_norm": 2.90625,
+      "learning_rate": 7.026374565887117e-06,
+      "loss": 2.8638,
+      "step": 25350
+    },
+    {
+      "epoch": 44.09722222222222,
+      "grad_norm": 2.46875,
+      "learning_rate": 6.826577767846665e-06,
+      "loss": 2.8638,
+      "step": 25400
+    },
+    {
+      "epoch": 44.18402777777778,
+      "grad_norm": 3.078125,
+      "learning_rate": 6.629562315834348e-06,
+      "loss": 2.8536,
+      "step": 25450
+    },
+    {
+      "epoch": 44.270833333333336,
+      "grad_norm": 3.09375,
+      "learning_rate": 6.435334091013856e-06,
+      "loss": 2.8646,
+      "step": 25500
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 28800,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.177294293290189e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}