train_copa_42_1760623604 / trainer_state.json
rbelanec's picture
End of training
3e8efcb verified
{
"best_global_step": 180,
"best_metric": 0.22924575209617615,
"best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_copa_42_1760623604/checkpoint-180",
"epoch": 20.0,
"eval_steps": 90,
"global_step": 1800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05555555555555555,
"grad_norm": 1.1328125,
"learning_rate": 0.0006666666666666666,
"loss": 0.1508,
"num_input_tokens_seen": 1600,
"step": 5,
"train_runtime": 3.1038,
"train_tokens_per_second": 515.503
},
{
"epoch": 0.1111111111111111,
"grad_norm": 12.25,
"learning_rate": 0.0015,
"loss": 0.1225,
"num_input_tokens_seen": 3200,
"step": 10,
"train_runtime": 3.9613,
"train_tokens_per_second": 807.808
},
{
"epoch": 0.16666666666666666,
"grad_norm": 11.625,
"learning_rate": 0.002333333333333333,
"loss": 0.2189,
"num_input_tokens_seen": 4768,
"step": 15,
"train_runtime": 4.8299,
"train_tokens_per_second": 987.182
},
{
"epoch": 0.2222222222222222,
"grad_norm": 41.25,
"learning_rate": 0.0031666666666666666,
"loss": 0.2288,
"num_input_tokens_seen": 6336,
"step": 20,
"train_runtime": 5.6824,
"train_tokens_per_second": 1115.019
},
{
"epoch": 0.2777777777777778,
"grad_norm": 100.0,
"learning_rate": 0.004,
"loss": 4.5867,
"num_input_tokens_seen": 7904,
"step": 25,
"train_runtime": 6.501,
"train_tokens_per_second": 1215.811
},
{
"epoch": 0.3333333333333333,
"grad_norm": 34.25,
"learning_rate": 0.004833333333333334,
"loss": 3.3424,
"num_input_tokens_seen": 9504,
"step": 30,
"train_runtime": 7.296,
"train_tokens_per_second": 1302.633
},
{
"epoch": 0.3888888888888889,
"grad_norm": 10.0,
"learning_rate": 0.005666666666666666,
"loss": 1.1922,
"num_input_tokens_seen": 11072,
"step": 35,
"train_runtime": 8.0915,
"train_tokens_per_second": 1368.343
},
{
"epoch": 0.4444444444444444,
"grad_norm": 31.875,
"learning_rate": 0.0065,
"loss": 0.5121,
"num_input_tokens_seen": 12672,
"step": 40,
"train_runtime": 8.8889,
"train_tokens_per_second": 1425.604
},
{
"epoch": 0.5,
"grad_norm": 0.953125,
"learning_rate": 0.007333333333333333,
"loss": 0.2904,
"num_input_tokens_seen": 14176,
"step": 45,
"train_runtime": 9.678,
"train_tokens_per_second": 1464.762
},
{
"epoch": 0.5555555555555556,
"grad_norm": 4.21875,
"learning_rate": 0.008166666666666666,
"loss": 0.3048,
"num_input_tokens_seen": 15776,
"step": 50,
"train_runtime": 10.472,
"train_tokens_per_second": 1506.489
},
{
"epoch": 0.6111111111111112,
"grad_norm": 4.21875,
"learning_rate": 0.009,
"loss": 0.3677,
"num_input_tokens_seen": 17312,
"step": 55,
"train_runtime": 11.2639,
"train_tokens_per_second": 1536.94
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.58203125,
"learning_rate": 0.009833333333333333,
"loss": 0.2604,
"num_input_tokens_seen": 18848,
"step": 60,
"train_runtime": 12.0538,
"train_tokens_per_second": 1563.659
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.50390625,
"learning_rate": 0.010666666666666666,
"loss": 0.256,
"num_input_tokens_seen": 20448,
"step": 65,
"train_runtime": 12.8468,
"train_tokens_per_second": 1591.68
},
{
"epoch": 0.7777777777777778,
"grad_norm": 1.484375,
"learning_rate": 0.0115,
"loss": 0.5609,
"num_input_tokens_seen": 22016,
"step": 70,
"train_runtime": 13.6383,
"train_tokens_per_second": 1614.278
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.333984375,
"learning_rate": 0.012333333333333332,
"loss": 1.0653,
"num_input_tokens_seen": 23616,
"step": 75,
"train_runtime": 14.4335,
"train_tokens_per_second": 1636.199
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.54296875,
"learning_rate": 0.013166666666666667,
"loss": 0.2765,
"num_input_tokens_seen": 25152,
"step": 80,
"train_runtime": 15.2249,
"train_tokens_per_second": 1652.025
},
{
"epoch": 0.9444444444444444,
"grad_norm": 22.875,
"learning_rate": 0.014,
"loss": 1.0491,
"num_input_tokens_seen": 26688,
"step": 85,
"train_runtime": 16.0181,
"train_tokens_per_second": 1666.117
},
{
"epoch": 1.0,
"grad_norm": 0.2060546875,
"learning_rate": 0.014833333333333334,
"loss": 0.26,
"num_input_tokens_seen": 28256,
"step": 90,
"train_runtime": 16.9498,
"train_tokens_per_second": 1667.037
},
{
"epoch": 1.0,
"eval_loss": 0.23620998859405518,
"eval_runtime": 1.2704,
"eval_samples_per_second": 31.486,
"eval_steps_per_second": 7.872,
"num_input_tokens_seen": 28256,
"step": 90
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.91796875,
"learning_rate": 0.015666666666666666,
"loss": 0.29,
"num_input_tokens_seen": 29824,
"step": 95,
"train_runtime": 19.874,
"train_tokens_per_second": 1500.656
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.4140625,
"learning_rate": 0.0165,
"loss": 0.3073,
"num_input_tokens_seen": 31360,
"step": 100,
"train_runtime": 20.6816,
"train_tokens_per_second": 1516.326
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.251953125,
"learning_rate": 0.017333333333333333,
"loss": 0.231,
"num_input_tokens_seen": 32960,
"step": 105,
"train_runtime": 21.4794,
"train_tokens_per_second": 1534.493
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.1591796875,
"learning_rate": 0.018166666666666664,
"loss": 0.2256,
"num_input_tokens_seen": 34464,
"step": 110,
"train_runtime": 22.2709,
"train_tokens_per_second": 1547.491
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.375,
"learning_rate": 0.019,
"loss": 0.7092,
"num_input_tokens_seen": 36032,
"step": 115,
"train_runtime": 23.0649,
"train_tokens_per_second": 1562.201
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.171875,
"learning_rate": 0.01983333333333333,
"loss": 0.2771,
"num_input_tokens_seen": 37600,
"step": 120,
"train_runtime": 23.8592,
"train_tokens_per_second": 1575.909
},
{
"epoch": 1.3888888888888888,
"grad_norm": 2.15625,
"learning_rate": 0.020666666666666667,
"loss": 0.2618,
"num_input_tokens_seen": 39168,
"step": 125,
"train_runtime": 24.6509,
"train_tokens_per_second": 1588.905
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.07275390625,
"learning_rate": 0.0215,
"loss": 0.2334,
"num_input_tokens_seen": 40736,
"step": 130,
"train_runtime": 25.4426,
"train_tokens_per_second": 1601.092
},
{
"epoch": 1.5,
"grad_norm": 0.0279541015625,
"learning_rate": 0.022333333333333334,
"loss": 0.2422,
"num_input_tokens_seen": 42240,
"step": 135,
"train_runtime": 26.2319,
"train_tokens_per_second": 1610.255
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.103515625,
"learning_rate": 0.023166666666666665,
"loss": 0.2405,
"num_input_tokens_seen": 43840,
"step": 140,
"train_runtime": 27.0285,
"train_tokens_per_second": 1621.993
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.04541015625,
"learning_rate": 0.024,
"loss": 0.2416,
"num_input_tokens_seen": 45408,
"step": 145,
"train_runtime": 27.823,
"train_tokens_per_second": 1632.028
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.060791015625,
"learning_rate": 0.024833333333333332,
"loss": 0.2476,
"num_input_tokens_seen": 46976,
"step": 150,
"train_runtime": 28.6144,
"train_tokens_per_second": 1641.69
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.08740234375,
"learning_rate": 0.025666666666666664,
"loss": 0.2165,
"num_input_tokens_seen": 48512,
"step": 155,
"train_runtime": 29.4067,
"train_tokens_per_second": 1649.691
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.07373046875,
"learning_rate": 0.0265,
"loss": 0.2679,
"num_input_tokens_seen": 50112,
"step": 160,
"train_runtime": 30.2016,
"train_tokens_per_second": 1659.249
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.0458984375,
"learning_rate": 0.02733333333333333,
"loss": 0.2291,
"num_input_tokens_seen": 51712,
"step": 165,
"train_runtime": 30.9983,
"train_tokens_per_second": 1668.222
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.07373046875,
"learning_rate": 0.028166666666666666,
"loss": 0.235,
"num_input_tokens_seen": 53280,
"step": 170,
"train_runtime": 31.7939,
"train_tokens_per_second": 1675.793
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.032470703125,
"learning_rate": 0.028999999999999998,
"loss": 0.241,
"num_input_tokens_seen": 54880,
"step": 175,
"train_runtime": 32.5915,
"train_tokens_per_second": 1683.876
},
{
"epoch": 2.0,
"grad_norm": 0.01507568359375,
"learning_rate": 0.029833333333333333,
"loss": 0.2246,
"num_input_tokens_seen": 56480,
"step": 180,
"train_runtime": 33.4262,
"train_tokens_per_second": 1689.693
},
{
"epoch": 2.0,
"eval_loss": 0.22924575209617615,
"eval_runtime": 0.8256,
"eval_samples_per_second": 48.45,
"eval_steps_per_second": 12.112,
"num_input_tokens_seen": 56480,
"step": 180
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.0224609375,
"learning_rate": 0.02999954871719651,
"loss": 0.2275,
"num_input_tokens_seen": 58048,
"step": 185,
"train_runtime": 36.0921,
"train_tokens_per_second": 1608.328
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.01361083984375,
"learning_rate": 0.029997715427345868,
"loss": 0.2277,
"num_input_tokens_seen": 59584,
"step": 190,
"train_runtime": 36.8907,
"train_tokens_per_second": 1615.149
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.052734375,
"learning_rate": 0.02999447209750064,
"loss": 0.2313,
"num_input_tokens_seen": 61216,
"step": 195,
"train_runtime": 37.6876,
"train_tokens_per_second": 1624.3
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.033203125,
"learning_rate": 0.02998981903258893,
"loss": 0.2358,
"num_input_tokens_seen": 62784,
"step": 200,
"train_runtime": 38.4845,
"train_tokens_per_second": 1631.408
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.044921875,
"learning_rate": 0.02998375667007787,
"loss": 0.2412,
"num_input_tokens_seen": 64352,
"step": 205,
"train_runtime": 39.2774,
"train_tokens_per_second": 1638.396
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.0220947265625,
"learning_rate": 0.029976285579932503,
"loss": 0.2008,
"num_input_tokens_seen": 65952,
"step": 210,
"train_runtime": 40.0729,
"train_tokens_per_second": 1645.799
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.12255859375,
"learning_rate": 0.029967406464562214,
"loss": 0.2465,
"num_input_tokens_seen": 67552,
"step": 215,
"train_runtime": 40.866,
"train_tokens_per_second": 1653.013
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.0245361328125,
"learning_rate": 0.02995712015875466,
"loss": 0.2297,
"num_input_tokens_seen": 69120,
"step": 220,
"train_runtime": 41.6581,
"train_tokens_per_second": 1659.219
},
{
"epoch": 2.5,
"grad_norm": 0.04248046875,
"learning_rate": 0.029945427629597305,
"loss": 0.2409,
"num_input_tokens_seen": 70688,
"step": 225,
"train_runtime": 42.453,
"train_tokens_per_second": 1665.087
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.07763671875,
"learning_rate": 0.029932329976386493,
"loss": 0.2373,
"num_input_tokens_seen": 72288,
"step": 230,
"train_runtime": 43.2489,
"train_tokens_per_second": 1671.441
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.10400390625,
"learning_rate": 0.0299178284305241,
"loss": 0.2425,
"num_input_tokens_seen": 73856,
"step": 235,
"train_runtime": 44.0446,
"train_tokens_per_second": 1676.845
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.04052734375,
"learning_rate": 0.02990192435540175,
"loss": 0.2331,
"num_input_tokens_seen": 75392,
"step": 240,
"train_runtime": 44.8352,
"train_tokens_per_second": 1681.537
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.02197265625,
"learning_rate": 0.029884619246272646,
"loss": 0.2384,
"num_input_tokens_seen": 76960,
"step": 245,
"train_runtime": 45.6329,
"train_tokens_per_second": 1686.502
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.03369140625,
"learning_rate": 0.02986591473011098,
"loss": 0.2255,
"num_input_tokens_seen": 78496,
"step": 250,
"train_runtime": 46.4329,
"train_tokens_per_second": 1690.526
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.0205078125,
"learning_rate": 0.02984581256545898,
"loss": 0.2376,
"num_input_tokens_seen": 80000,
"step": 255,
"train_runtime": 47.2277,
"train_tokens_per_second": 1693.922
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.030029296875,
"learning_rate": 0.02982431464226157,
"loss": 0.2274,
"num_input_tokens_seen": 81568,
"step": 260,
"train_runtime": 48.0201,
"train_tokens_per_second": 1698.622
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.03125,
"learning_rate": 0.02980142298168869,
"loss": 0.2306,
"num_input_tokens_seen": 83168,
"step": 265,
"train_runtime": 48.8131,
"train_tokens_per_second": 1703.803
},
{
"epoch": 3.0,
"grad_norm": 0.02099609375,
"learning_rate": 0.029777139735945243,
"loss": 0.2336,
"num_input_tokens_seen": 84736,
"step": 270,
"train_runtime": 49.7464,
"train_tokens_per_second": 1703.361
},
{
"epoch": 3.0,
"eval_loss": 0.23087672889232635,
"eval_runtime": 0.8189,
"eval_samples_per_second": 48.846,
"eval_steps_per_second": 12.212,
"num_input_tokens_seen": 84736,
"step": 270
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.03759765625,
"learning_rate": 0.029751467188068818,
"loss": 0.2376,
"num_input_tokens_seen": 86304,
"step": 275,
"train_runtime": 52.2896,
"train_tokens_per_second": 1650.5
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.029541015625,
"learning_rate": 0.02972440775171496,
"loss": 0.2289,
"num_input_tokens_seen": 87904,
"step": 280,
"train_runtime": 53.1215,
"train_tokens_per_second": 1654.773
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.041259765625,
"learning_rate": 0.029695963970930307,
"loss": 0.2255,
"num_input_tokens_seen": 89408,
"step": 285,
"train_runtime": 53.9107,
"train_tokens_per_second": 1658.447
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.01312255859375,
"learning_rate": 0.029666138519913395,
"loss": 0.2251,
"num_input_tokens_seen": 91008,
"step": 290,
"train_runtime": 54.7037,
"train_tokens_per_second": 1663.654
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.0269775390625,
"learning_rate": 0.029634934202763214,
"loss": 0.2566,
"num_input_tokens_seen": 92512,
"step": 295,
"train_runtime": 55.4911,
"train_tokens_per_second": 1667.149
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.02490234375,
"learning_rate": 0.0296023539532156,
"loss": 0.2391,
"num_input_tokens_seen": 94080,
"step": 300,
"train_runtime": 56.2846,
"train_tokens_per_second": 1671.505
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.042236328125,
"learning_rate": 0.029568400834367403,
"loss": 0.2269,
"num_input_tokens_seen": 95680,
"step": 305,
"train_runtime": 57.0809,
"train_tokens_per_second": 1676.218
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.033935546875,
"learning_rate": 0.02953307803838851,
"loss": 0.2325,
"num_input_tokens_seen": 97248,
"step": 310,
"train_runtime": 57.8741,
"train_tokens_per_second": 1680.338
},
{
"epoch": 3.5,
"grad_norm": 0.00994873046875,
"learning_rate": 0.02949638888622172,
"loss": 0.2369,
"num_input_tokens_seen": 98784,
"step": 315,
"train_runtime": 58.6661,
"train_tokens_per_second": 1683.835
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.01507568359375,
"learning_rate": 0.029458336827270518,
"loss": 0.2209,
"num_input_tokens_seen": 100384,
"step": 320,
"train_runtime": 59.4642,
"train_tokens_per_second": 1688.142
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.0086669921875,
"learning_rate": 0.029418925439074782,
"loss": 0.2318,
"num_input_tokens_seen": 101952,
"step": 325,
"train_runtime": 60.2574,
"train_tokens_per_second": 1691.941
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.006805419921875,
"learning_rate": 0.029378158426974426,
"loss": 0.2321,
"num_input_tokens_seen": 103520,
"step": 330,
"train_runtime": 61.0556,
"train_tokens_per_second": 1695.503
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.003814697265625,
"learning_rate": 0.029336039623761044,
"loss": 0.2406,
"num_input_tokens_seen": 105120,
"step": 335,
"train_runtime": 61.8554,
"train_tokens_per_second": 1699.448
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.003692626953125,
"learning_rate": 0.02929257298931754,
"loss": 0.2309,
"num_input_tokens_seen": 106720,
"step": 340,
"train_runtime": 62.653,
"train_tokens_per_second": 1703.349
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.0189208984375,
"learning_rate": 0.02924776261024586,
"loss": 0.2327,
"num_input_tokens_seen": 108320,
"step": 345,
"train_runtime": 63.447,
"train_tokens_per_second": 1707.252
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.0030975341796875,
"learning_rate": 0.02920161269948277,
"loss": 0.2304,
"num_input_tokens_seen": 109888,
"step": 350,
"train_runtime": 64.242,
"train_tokens_per_second": 1710.531
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.00421142578125,
"learning_rate": 0.029154127595903752,
"loss": 0.2293,
"num_input_tokens_seen": 111424,
"step": 355,
"train_runtime": 65.0354,
"train_tokens_per_second": 1713.281
},
{
"epoch": 4.0,
"grad_norm": 0.00347900390625,
"learning_rate": 0.029105311763915113,
"loss": 0.2347,
"num_input_tokens_seen": 113024,
"step": 360,
"train_runtime": 65.871,
"train_tokens_per_second": 1715.839
},
{
"epoch": 4.0,
"eval_loss": 0.23181450366973877,
"eval_runtime": 0.8261,
"eval_samples_per_second": 48.422,
"eval_steps_per_second": 12.105,
"num_input_tokens_seen": 113024,
"step": 360
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.004241943359375,
"learning_rate": 0.029055169793034224,
"loss": 0.2306,
"num_input_tokens_seen": 114624,
"step": 365,
"train_runtime": 68.3415,
"train_tokens_per_second": 1677.223
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.00628662109375,
"learning_rate": 0.029003706397458022,
"loss": 0.2349,
"num_input_tokens_seen": 116224,
"step": 370,
"train_runtime": 69.1577,
"train_tokens_per_second": 1680.564
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.00592041015625,
"learning_rate": 0.028950926415619846,
"loss": 0.2471,
"num_input_tokens_seen": 117760,
"step": 375,
"train_runtime": 69.9474,
"train_tokens_per_second": 1683.55
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.0169677734375,
"learning_rate": 0.028896834809734474,
"loss": 0.2298,
"num_input_tokens_seen": 119360,
"step": 380,
"train_runtime": 70.7435,
"train_tokens_per_second": 1687.223
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.0302734375,
"learning_rate": 0.028841436665331635,
"loss": 0.2254,
"num_input_tokens_seen": 120960,
"step": 385,
"train_runtime": 71.5406,
"train_tokens_per_second": 1690.787
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.003662109375,
"learning_rate": 0.02878473719077787,
"loss": 0.2393,
"num_input_tokens_seen": 122528,
"step": 390,
"train_runtime": 72.3313,
"train_tokens_per_second": 1693.983
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.0174560546875,
"learning_rate": 0.028726741716786866,
"loss": 0.2317,
"num_input_tokens_seen": 124096,
"step": 395,
"train_runtime": 73.1276,
"train_tokens_per_second": 1696.979
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.0186767578125,
"learning_rate": 0.02866745569591825,
"loss": 0.2351,
"num_input_tokens_seen": 125696,
"step": 400,
"train_runtime": 73.9263,
"train_tokens_per_second": 1700.287
},
{
"epoch": 4.5,
"grad_norm": 0.017578125,
"learning_rate": 0.028606884702065006,
"loss": 0.2317,
"num_input_tokens_seen": 127264,
"step": 405,
"train_runtime": 74.7224,
"train_tokens_per_second": 1703.157
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.005462646484375,
"learning_rate": 0.028545034429929377,
"loss": 0.2264,
"num_input_tokens_seen": 128832,
"step": 410,
"train_runtime": 75.5264,
"train_tokens_per_second": 1705.788
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.0185546875,
"learning_rate": 0.028481910694487505,
"loss": 0.2396,
"num_input_tokens_seen": 130464,
"step": 415,
"train_runtime": 76.325,
"train_tokens_per_second": 1709.321
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.00421142578125,
"learning_rate": 0.02841751943044271,
"loss": 0.2336,
"num_input_tokens_seen": 132032,
"step": 420,
"train_runtime": 77.126,
"train_tokens_per_second": 1711.901
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.01361083984375,
"learning_rate": 0.028351866691667543,
"loss": 0.2314,
"num_input_tokens_seen": 133632,
"step": 425,
"train_runtime": 77.9247,
"train_tokens_per_second": 1714.886
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.01361083984375,
"learning_rate": 0.02828495865063459,
"loss": 0.2325,
"num_input_tokens_seen": 135232,
"step": 430,
"train_runtime": 78.7259,
"train_tokens_per_second": 1717.757
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.0042724609375,
"learning_rate": 0.028216801597836176,
"loss": 0.2216,
"num_input_tokens_seen": 136768,
"step": 435,
"train_runtime": 79.5217,
"train_tokens_per_second": 1719.883
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.005462646484375,
"learning_rate": 0.028147401941192952,
"loss": 0.2297,
"num_input_tokens_seen": 138368,
"step": 440,
"train_runtime": 80.3232,
"train_tokens_per_second": 1722.64
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.02197265625,
"learning_rate": 0.028076766205451433,
"loss": 0.2443,
"num_input_tokens_seen": 139904,
"step": 445,
"train_runtime": 81.1198,
"train_tokens_per_second": 1724.659
},
{
"epoch": 5.0,
"grad_norm": 0.005706787109375,
"learning_rate": 0.028004901031570568,
"loss": 0.2277,
"num_input_tokens_seen": 141440,
"step": 450,
"train_runtime": 81.9601,
"train_tokens_per_second": 1725.718
},
{
"epoch": 5.0,
"eval_loss": 0.2365764081478119,
"eval_runtime": 0.8277,
"eval_samples_per_second": 48.329,
"eval_steps_per_second": 12.082,
"num_input_tokens_seen": 141440,
"step": 450
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.0167236328125,
"learning_rate": 0.027931813176097366,
"loss": 0.2361,
"num_input_tokens_seen": 142976,
"step": 455,
"train_runtime": 84.4881,
"train_tokens_per_second": 1692.262
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.00494384765625,
"learning_rate": 0.027857509510531685,
"loss": 0.2293,
"num_input_tokens_seen": 144576,
"step": 460,
"train_runtime": 85.2893,
"train_tokens_per_second": 1695.125
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.0245361328125,
"learning_rate": 0.02778199702068017,
"loss": 0.2307,
"num_input_tokens_seen": 146144,
"step": 465,
"train_runtime": 86.0895,
"train_tokens_per_second": 1697.581
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.012451171875,
"learning_rate": 0.02770528280599949,
"loss": 0.2336,
"num_input_tokens_seen": 147712,
"step": 470,
"train_runtime": 86.8889,
"train_tokens_per_second": 1700.009
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.006683349609375,
"learning_rate": 0.02762737407892886,
"loss": 0.2294,
"num_input_tokens_seen": 149248,
"step": 475,
"train_runtime": 87.6835,
"train_tokens_per_second": 1702.121
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.013916015625,
"learning_rate": 0.02754827816421195,
"loss": 0.2357,
"num_input_tokens_seen": 150816,
"step": 480,
"train_runtime": 88.4782,
"train_tokens_per_second": 1704.555
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.01202392578125,
"learning_rate": 0.02746800249820822,
"loss": 0.2212,
"num_input_tokens_seen": 152352,
"step": 485,
"train_runtime": 89.2722,
"train_tokens_per_second": 1706.6
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.0166015625,
"learning_rate": 0.027386554628193813,
"loss": 0.2362,
"num_input_tokens_seen": 153888,
"step": 490,
"train_runtime": 90.0625,
"train_tokens_per_second": 1708.681
},
{
"epoch": 5.5,
"grad_norm": 0.00469970703125,
"learning_rate": 0.027303942211651937,
"loss": 0.2391,
"num_input_tokens_seen": 155488,
"step": 495,
"train_runtime": 90.8589,
"train_tokens_per_second": 1711.313
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.0223388671875,
"learning_rate": 0.02722017301555297,
"loss": 0.2305,
"num_input_tokens_seen": 157024,
"step": 500,
"train_runtime": 91.6521,
"train_tokens_per_second": 1713.261
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.023193359375,
"learning_rate": 0.02713525491562421,
"loss": 0.2316,
"num_input_tokens_seen": 158528,
"step": 505,
"train_runtime": 92.4397,
"train_tokens_per_second": 1714.934
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.0115966796875,
"learning_rate": 0.027049195895609432,
"loss": 0.2305,
"num_input_tokens_seen": 160064,
"step": 510,
"train_runtime": 93.229,
"train_tokens_per_second": 1716.89
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.009765625,
"learning_rate": 0.026962004046518273,
"loss": 0.2286,
"num_input_tokens_seen": 161664,
"step": 515,
"train_runtime": 94.0222,
"train_tokens_per_second": 1719.424
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.02099609375,
"learning_rate": 0.02687368756586555,
"loss": 0.2297,
"num_input_tokens_seen": 163264,
"step": 520,
"train_runtime": 94.8159,
"train_tokens_per_second": 1721.905
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.01226806640625,
"learning_rate": 0.02678425475690055,
"loss": 0.2348,
"num_input_tokens_seen": 164864,
"step": 525,
"train_runtime": 95.6085,
"train_tokens_per_second": 1724.365
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.0126953125,
"learning_rate": 0.02669371402782638,
"loss": 0.2286,
"num_input_tokens_seen": 166432,
"step": 530,
"train_runtime": 96.4007,
"train_tokens_per_second": 1726.461
},
{
"epoch": 5.944444444444445,
"grad_norm": 0.01153564453125,
"learning_rate": 0.026602073891009458,
"loss": 0.2308,
"num_input_tokens_seen": 168032,
"step": 535,
"train_runtime": 97.1955,
"train_tokens_per_second": 1728.804
},
{
"epoch": 6.0,
"grad_norm": 0.0030517578125,
"learning_rate": 0.0265093429621792,
"loss": 0.2348,
"num_input_tokens_seen": 169600,
"step": 540,
"train_runtime": 98.0299,
"train_tokens_per_second": 1730.084
},
{
"epoch": 6.0,
"eval_loss": 0.2344598025083542,
"eval_runtime": 0.8191,
"eval_samples_per_second": 48.837,
"eval_steps_per_second": 12.209,
"num_input_tokens_seen": 169600,
"step": 540
},
{
"epoch": 6.055555555555555,
"grad_norm": 0.00494384765625,
"learning_rate": 0.026415529959618007,
"loss": 0.2308,
"num_input_tokens_seen": 171168,
"step": 545,
"train_runtime": 100.5102,
"train_tokens_per_second": 1702.992
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.0037078857421875,
"learning_rate": 0.02632064370334158,
"loss": 0.2311,
"num_input_tokens_seen": 172672,
"step": 550,
"train_runtime": 101.3026,
"train_tokens_per_second": 1704.517
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.0038299560546875,
"learning_rate": 0.026224693114269705,
"loss": 0.233,
"num_input_tokens_seen": 174240,
"step": 555,
"train_runtime": 102.1187,
"train_tokens_per_second": 1706.249
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.011962890625,
"learning_rate": 0.02612768721338753,
"loss": 0.2279,
"num_input_tokens_seen": 175776,
"step": 560,
"train_runtime": 102.9125,
"train_tokens_per_second": 1708.014
},
{
"epoch": 6.277777777777778,
"grad_norm": 0.0228271484375,
"learning_rate": 0.02602963512089743,
"loss": 0.232,
"num_input_tokens_seen": 177376,
"step": 565,
"train_runtime": 103.7126,
"train_tokens_per_second": 1710.265
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.0133056640625,
"learning_rate": 0.025930546055361575,
"loss": 0.231,
"num_input_tokens_seen": 178912,
"step": 570,
"train_runtime": 104.5028,
"train_tokens_per_second": 1712.031
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.01507568359375,
"learning_rate": 0.025830429332835202,
"loss": 0.2286,
"num_input_tokens_seen": 180480,
"step": 575,
"train_runtime": 105.2944,
"train_tokens_per_second": 1714.052
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.015380859375,
"learning_rate": 0.025729294365990772,
"loss": 0.231,
"num_input_tokens_seen": 182048,
"step": 580,
"train_runtime": 106.0853,
"train_tokens_per_second": 1716.052
},
{
"epoch": 6.5,
"grad_norm": 0.007720947265625,
"learning_rate": 0.025627150663232998,
"loss": 0.2408,
"num_input_tokens_seen": 183648,
"step": 585,
"train_runtime": 106.8794,
"train_tokens_per_second": 1718.273
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.00848388671875,
"learning_rate": 0.025524007827804902,
"loss": 0.2358,
"num_input_tokens_seen": 185248,
"step": 590,
"train_runtime": 107.6729,
"train_tokens_per_second": 1720.47
},
{
"epoch": 6.611111111111111,
"grad_norm": 0.0162353515625,
"learning_rate": 0.025419875556884956,
"loss": 0.2302,
"num_input_tokens_seen": 186720,
"step": 595,
"train_runtime": 108.46,
"train_tokens_per_second": 1721.556
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.0142822265625,
"learning_rate": 0.025314763640675374,
"loss": 0.2313,
"num_input_tokens_seen": 188288,
"step": 600,
"train_runtime": 109.2516,
"train_tokens_per_second": 1723.435
},
{
"epoch": 6.722222222222222,
"grad_norm": 0.006683349609375,
"learning_rate": 0.025208681961481655,
"loss": 0.2359,
"num_input_tokens_seen": 189888,
"step": 605,
"train_runtime": 110.0491,
"train_tokens_per_second": 1725.485
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.006134033203125,
"learning_rate": 0.025101640492783503,
"loss": 0.238,
"num_input_tokens_seen": 191424,
"step": 610,
"train_runtime": 110.8418,
"train_tokens_per_second": 1727.002
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.02490234375,
"learning_rate": 0.024993649298297137,
"loss": 0.2306,
"num_input_tokens_seen": 193056,
"step": 615,
"train_runtime": 111.6358,
"train_tokens_per_second": 1729.338
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.01251220703125,
"learning_rate": 0.02488471853102912,
"loss": 0.2314,
"num_input_tokens_seen": 194592,
"step": 620,
"train_runtime": 112.4269,
"train_tokens_per_second": 1730.831
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.0252685546875,
"learning_rate": 0.024774858432321828,
"loss": 0.2347,
"num_input_tokens_seen": 196192,
"step": 625,
"train_runtime": 113.2223,
"train_tokens_per_second": 1732.803
},
{
"epoch": 7.0,
"grad_norm": 0.01275634765625,
"learning_rate": 0.024664079330890574,
"loss": 0.2294,
"num_input_tokens_seen": 197792,
"step": 630,
"train_runtime": 114.0579,
"train_tokens_per_second": 1734.137
},
{
"epoch": 7.0,
"eval_loss": 0.23141007125377655,
"eval_runtime": 0.8186,
"eval_samples_per_second": 48.864,
"eval_steps_per_second": 12.216,
"num_input_tokens_seen": 197792,
"step": 630
},
{
"epoch": 7.055555555555555,
"grad_norm": 0.003936767578125,
"learning_rate": 0.02455239164185254,
"loss": 0.2314,
"num_input_tokens_seen": 199392,
"step": 635,
"train_runtime": 116.6196,
"train_tokens_per_second": 1709.764
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.005767822265625,
"learning_rate": 0.024439805865747562,
"loss": 0.2286,
"num_input_tokens_seen": 200992,
"step": 640,
"train_runtime": 117.4275,
"train_tokens_per_second": 1711.626
},
{
"epoch": 7.166666666666667,
"grad_norm": 0.0135498046875,
"learning_rate": 0.02432633258755093,
"loss": 0.236,
"num_input_tokens_seen": 202592,
"step": 645,
"train_runtime": 118.2277,
"train_tokens_per_second": 1713.574
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.01141357421875,
"learning_rate": 0.024211982475678205,
"loss": 0.2237,
"num_input_tokens_seen": 204064,
"step": 650,
"train_runtime": 119.016,
"train_tokens_per_second": 1714.593
},
{
"epoch": 7.277777777777778,
"grad_norm": 0.0172119140625,
"learning_rate": 0.024096766280982205,
"loss": 0.2322,
"num_input_tokens_seen": 205664,
"step": 655,
"train_runtime": 119.8138,
"train_tokens_per_second": 1716.53
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.0172119140625,
"learning_rate": 0.023980694835742226,
"loss": 0.2317,
"num_input_tokens_seen": 207264,
"step": 660,
"train_runtime": 120.6083,
"train_tokens_per_second": 1718.488
},
{
"epoch": 7.388888888888889,
"grad_norm": 0.0106201171875,
"learning_rate": 0.023863779052645667,
"loss": 0.2301,
"num_input_tokens_seen": 208832,
"step": 665,
"train_runtime": 121.401,
"train_tokens_per_second": 1720.184
},
{
"epoch": 7.444444444444445,
"grad_norm": 0.02587890625,
"learning_rate": 0.02374602992376202,
"loss": 0.227,
"num_input_tokens_seen": 210368,
"step": 670,
"train_runtime": 122.192,
"train_tokens_per_second": 1721.618
},
{
"epoch": 7.5,
"grad_norm": 0.020751953125,
"learning_rate": 0.023627458519509432,
"loss": 0.228,
"num_input_tokens_seen": 211936,
"step": 675,
"train_runtime": 122.9844,
"train_tokens_per_second": 1723.275
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.0233154296875,
"learning_rate": 0.023508075987613904,
"loss": 0.2143,
"num_input_tokens_seen": 213536,
"step": 680,
"train_runtime": 123.7781,
"train_tokens_per_second": 1725.151
},
{
"epoch": 7.611111111111111,
"grad_norm": 0.05078125,
"learning_rate": 0.023387893552061202,
"loss": 0.2273,
"num_input_tokens_seen": 215136,
"step": 685,
"train_runtime": 124.613,
"train_tokens_per_second": 1726.433
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.02294921875,
"learning_rate": 0.023266922512041644,
"loss": 0.2513,
"num_input_tokens_seen": 216736,
"step": 690,
"train_runtime": 125.4907,
"train_tokens_per_second": 1727.107
},
{
"epoch": 7.722222222222222,
"grad_norm": 0.029052734375,
"learning_rate": 0.023145174240887748,
"loss": 0.2378,
"num_input_tokens_seen": 218272,
"step": 695,
"train_runtime": 126.2833,
"train_tokens_per_second": 1728.432
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.031005859375,
"learning_rate": 0.023022660185004967,
"loss": 0.2316,
"num_input_tokens_seen": 219808,
"step": 700,
"train_runtime": 127.0771,
"train_tokens_per_second": 1729.722
},
{
"epoch": 7.833333333333333,
"grad_norm": 0.02587890625,
"learning_rate": 0.02289939186279551,
"loss": 0.2331,
"num_input_tokens_seen": 221312,
"step": 705,
"train_runtime": 127.8643,
"train_tokens_per_second": 1730.834
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.023193359375,
"learning_rate": 0.022775380863575456,
"loss": 0.2339,
"num_input_tokens_seen": 222880,
"step": 710,
"train_runtime": 128.6548,
"train_tokens_per_second": 1732.388
},
{
"epoch": 7.944444444444445,
"grad_norm": 0.03369140625,
"learning_rate": 0.02265063884648513,
"loss": 0.2344,
"num_input_tokens_seen": 224416,
"step": 715,
"train_runtime": 129.4473,
"train_tokens_per_second": 1733.648
},
{
"epoch": 8.0,
"grad_norm": 0.049072265625,
"learning_rate": 0.022525177539392937,
"loss": 0.218,
"num_input_tokens_seen": 225984,
"step": 720,
"train_runtime": 130.2822,
"train_tokens_per_second": 1734.573
},
{
"epoch": 8.0,
"eval_loss": 0.2308429777622223,
"eval_runtime": 0.8222,
"eval_samples_per_second": 48.649,
"eval_steps_per_second": 12.162,
"num_input_tokens_seen": 225984,
"step": 720
},
{
"epoch": 8.055555555555555,
"grad_norm": 0.052978515625,
"learning_rate": 0.02239900873779278,
"loss": 0.2506,
"num_input_tokens_seen": 227552,
"step": 725,
"train_runtime": 132.8103,
"train_tokens_per_second": 1713.361
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.03564453125,
"learning_rate": 0.022272144303695056,
"loss": 0.2338,
"num_input_tokens_seen": 229088,
"step": 730,
"train_runtime": 133.6036,
"train_tokens_per_second": 1714.684
},
{
"epoch": 8.166666666666666,
"grad_norm": 0.1044921875,
"learning_rate": 0.02214459616451143,
"loss": 0.2381,
"num_input_tokens_seen": 230656,
"step": 735,
"train_runtime": 134.4027,
"train_tokens_per_second": 1716.156
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.047119140625,
"learning_rate": 0.02201637631193346,
"loss": 0.2288,
"num_input_tokens_seen": 232224,
"step": 740,
"train_runtime": 135.195,
"train_tokens_per_second": 1717.697
},
{
"epoch": 8.277777777777779,
"grad_norm": 0.2265625,
"learning_rate": 0.021887496800805175,
"loss": 0.2157,
"num_input_tokens_seen": 233792,
"step": 745,
"train_runtime": 135.9862,
"train_tokens_per_second": 1719.233
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.061279296875,
"learning_rate": 0.021757969747989707,
"loss": 0.2441,
"num_input_tokens_seen": 235328,
"step": 750,
"train_runtime": 136.775,
"train_tokens_per_second": 1720.549
},
{
"epoch": 8.38888888888889,
"grad_norm": 0.0341796875,
"learning_rate": 0.02162780733123012,
"loss": 0.2362,
"num_input_tokens_seen": 236864,
"step": 755,
"train_runtime": 137.5647,
"train_tokens_per_second": 1721.838
},
{
"epoch": 8.444444444444445,
"grad_norm": 2.125,
"learning_rate": 0.021497021788004445,
"loss": 0.9504,
"num_input_tokens_seen": 238368,
"step": 760,
"train_runtime": 138.3522,
"train_tokens_per_second": 1722.907
},
{
"epoch": 8.5,
"grad_norm": 0.1669921875,
"learning_rate": 0.021365625414375228,
"loss": 0.2414,
"num_input_tokens_seen": 239936,
"step": 765,
"train_runtime": 139.1437,
"train_tokens_per_second": 1724.375
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.10400390625,
"learning_rate": 0.021233630563833435,
"loss": 0.2626,
"num_input_tokens_seen": 241536,
"step": 770,
"train_runtime": 139.9389,
"train_tokens_per_second": 1726.01
},
{
"epoch": 8.61111111111111,
"grad_norm": 0.038330078125,
"learning_rate": 0.021101049646137005,
"loss": 0.2398,
"num_input_tokens_seen": 243136,
"step": 775,
"train_runtime": 140.7323,
"train_tokens_per_second": 1727.648
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.051025390625,
"learning_rate": 0.02096789512614417,
"loss": 0.2382,
"num_input_tokens_seen": 244704,
"step": 780,
"train_runtime": 141.5236,
"train_tokens_per_second": 1729.069
},
{
"epoch": 8.722222222222221,
"grad_norm": 0.048095703125,
"learning_rate": 0.020834179522641504,
"loss": 0.2276,
"num_input_tokens_seen": 246272,
"step": 785,
"train_runtime": 142.3196,
"train_tokens_per_second": 1730.416
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.01165771484375,
"learning_rate": 0.020699915407166987,
"loss": 0.2446,
"num_input_tokens_seen": 247808,
"step": 790,
"train_runtime": 143.1092,
"train_tokens_per_second": 1731.6
},
{
"epoch": 8.833333333333334,
"grad_norm": 0.0169677734375,
"learning_rate": 0.020565115402828002,
"loss": 0.2376,
"num_input_tokens_seen": 249376,
"step": 795,
"train_runtime": 143.9049,
"train_tokens_per_second": 1732.922
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.034912109375,
"learning_rate": 0.02042979218311462,
"loss": 0.2325,
"num_input_tokens_seen": 250944,
"step": 800,
"train_runtime": 144.6957,
"train_tokens_per_second": 1734.288
},
{
"epoch": 8.944444444444445,
"grad_norm": 0.0235595703125,
"learning_rate": 0.02029395847070803,
"loss": 0.226,
"num_input_tokens_seen": 252512,
"step": 805,
"train_runtime": 145.4907,
"train_tokens_per_second": 1735.588
},
{
"epoch": 9.0,
"grad_norm": 0.01190185546875,
"learning_rate": 0.020157627036284417,
"loss": 0.238,
"num_input_tokens_seen": 254112,
"step": 810,
"train_runtime": 146.3281,
"train_tokens_per_second": 1736.591
},
{
"epoch": 9.0,
"eval_loss": 0.23274096846580505,
"eval_runtime": 0.8192,
"eval_samples_per_second": 48.831,
"eval_steps_per_second": 12.208,
"num_input_tokens_seen": 254112,
"step": 810
},
{
"epoch": 9.055555555555555,
"grad_norm": 0.02197265625,
"learning_rate": 0.02002081069731427,
"loss": 0.2334,
"num_input_tokens_seen": 255680,
"step": 815,
"train_runtime": 148.7956,
"train_tokens_per_second": 1718.33
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.0185546875,
"learning_rate": 0.01988352231685735,
"loss": 0.2236,
"num_input_tokens_seen": 257216,
"step": 820,
"train_runtime": 149.5891,
"train_tokens_per_second": 1719.483
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.049560546875,
"learning_rate": 0.019745774802353344,
"loss": 0.2579,
"num_input_tokens_seen": 258816,
"step": 825,
"train_runtime": 150.3889,
"train_tokens_per_second": 1720.978
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.00970458984375,
"learning_rate": 0.019607581104408342,
"loss": 0.2457,
"num_input_tokens_seen": 260384,
"step": 830,
"train_runtime": 151.1849,
"train_tokens_per_second": 1722.289
},
{
"epoch": 9.277777777777779,
"grad_norm": 0.0211181640625,
"learning_rate": 0.019468954215577226,
"loss": 0.2301,
"num_input_tokens_seen": 262048,
"step": 835,
"train_runtime": 151.9844,
"train_tokens_per_second": 1724.177
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.02734375,
"learning_rate": 0.01932990716914222,
"loss": 0.244,
"num_input_tokens_seen": 263616,
"step": 840,
"train_runtime": 152.7753,
"train_tokens_per_second": 1725.514
},
{
"epoch": 9.38888888888889,
"grad_norm": 0.0093994140625,
"learning_rate": 0.019190453037887464,
"loss": 0.2323,
"num_input_tokens_seen": 265152,
"step": 845,
"train_runtime": 153.5949,
"train_tokens_per_second": 1726.308
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.01708984375,
"learning_rate": 0.019050604932870013,
"loss": 0.2314,
"num_input_tokens_seen": 266688,
"step": 850,
"train_runtime": 154.3846,
"train_tokens_per_second": 1727.427
},
{
"epoch": 9.5,
"grad_norm": 0.0322265625,
"learning_rate": 0.01891037600218712,
"loss": 0.2338,
"num_input_tokens_seen": 268256,
"step": 855,
"train_runtime": 155.1779,
"train_tokens_per_second": 1728.7
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.022705078125,
"learning_rate": 0.018769779429740154,
"loss": 0.2379,
"num_input_tokens_seen": 269824,
"step": 860,
"train_runtime": 155.971,
"train_tokens_per_second": 1729.963
},
{
"epoch": 9.61111111111111,
"grad_norm": 0.0140380859375,
"learning_rate": 0.018628828433995014,
"loss": 0.2388,
"num_input_tokens_seen": 271424,
"step": 865,
"train_runtime": 156.7643,
"train_tokens_per_second": 1731.414
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.007415771484375,
"learning_rate": 0.018487536266739445,
"loss": 0.2359,
"num_input_tokens_seen": 272960,
"step": 870,
"train_runtime": 157.5546,
"train_tokens_per_second": 1732.478
},
{
"epoch": 9.722222222222221,
"grad_norm": 0.01263427734375,
"learning_rate": 0.01834591621183709,
"loss": 0.229,
"num_input_tokens_seen": 274528,
"step": 875,
"train_runtime": 158.3508,
"train_tokens_per_second": 1733.67
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.00537109375,
"learning_rate": 0.018203981583978603,
"loss": 0.235,
"num_input_tokens_seen": 276128,
"step": 880,
"train_runtime": 159.1469,
"train_tokens_per_second": 1735.051
},
{
"epoch": 9.833333333333334,
"grad_norm": 0.006561279296875,
"learning_rate": 0.018061745727429836,
"loss": 0.2284,
"num_input_tokens_seen": 277664,
"step": 885,
"train_runtime": 159.9388,
"train_tokens_per_second": 1736.064
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.0159912109375,
"learning_rate": 0.017919222014777265,
"loss": 0.2371,
"num_input_tokens_seen": 279232,
"step": 890,
"train_runtime": 160.7334,
"train_tokens_per_second": 1737.237
},
{
"epoch": 9.944444444444445,
"grad_norm": 0.003265380859375,
"learning_rate": 0.017776423845670717,
"loss": 0.228,
"num_input_tokens_seen": 280768,
"step": 895,
"train_runtime": 161.5242,
"train_tokens_per_second": 1738.241
},
{
"epoch": 10.0,
"grad_norm": 0.01251220703125,
"learning_rate": 0.0176333646455636,
"loss": 0.2218,
"num_input_tokens_seen": 282368,
"step": 900,
"train_runtime": 162.3604,
"train_tokens_per_second": 1739.143
},
{
"epoch": 10.0,
"eval_loss": 0.23551960289478302,
"eval_runtime": 0.817,
"eval_samples_per_second": 48.961,
"eval_steps_per_second": 12.24,
"num_input_tokens_seen": 282368,
"step": 900
},
{
"epoch": 10.055555555555555,
"grad_norm": 0.0185546875,
"learning_rate": 0.017490057864450664,
"loss": 0.2283,
"num_input_tokens_seen": 283936,
"step": 905,
"train_runtime": 165.2451,
"train_tokens_per_second": 1718.272
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.00946044921875,
"learning_rate": 0.017346516975603462,
"loss": 0.2199,
"num_input_tokens_seen": 285504,
"step": 910,
"train_runtime": 166.0581,
"train_tokens_per_second": 1719.302
},
{
"epoch": 10.166666666666666,
"grad_norm": 0.01251220703125,
"learning_rate": 0.017202755474303683,
"loss": 0.2405,
"num_input_tokens_seen": 287072,
"step": 915,
"train_runtime": 166.8565,
"train_tokens_per_second": 1720.472
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.011474609375,
"learning_rate": 0.017058786876574313,
"loss": 0.2363,
"num_input_tokens_seen": 288576,
"step": 920,
"train_runtime": 167.6452,
"train_tokens_per_second": 1721.35
},
{
"epoch": 10.277777777777779,
"grad_norm": 0.00555419921875,
"learning_rate": 0.016914624717908923,
"loss": 0.2355,
"num_input_tokens_seen": 290144,
"step": 925,
"train_runtime": 168.4381,
"train_tokens_per_second": 1722.555
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.0235595703125,
"learning_rate": 0.016770282551999093,
"loss": 0.2278,
"num_input_tokens_seen": 291744,
"step": 930,
"train_runtime": 169.2344,
"train_tokens_per_second": 1723.905
},
{
"epoch": 10.38888888888889,
"grad_norm": 0.01385498046875,
"learning_rate": 0.01662577394946016,
"loss": 0.2358,
"num_input_tokens_seen": 293344,
"step": 935,
"train_runtime": 170.0296,
"train_tokens_per_second": 1725.253
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.00482177734375,
"learning_rate": 0.016481112496555317,
"loss": 0.2315,
"num_input_tokens_seen": 294912,
"step": 940,
"train_runtime": 170.8203,
"train_tokens_per_second": 1726.446
},
{
"epoch": 10.5,
"grad_norm": 0.01275634765625,
"learning_rate": 0.016336311793918295,
"loss": 0.2304,
"num_input_tokens_seen": 296480,
"step": 945,
"train_runtime": 171.6139,
"train_tokens_per_second": 1727.599
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.00665283203125,
"learning_rate": 0.016191385455274654,
"loss": 0.2347,
"num_input_tokens_seen": 298048,
"step": 950,
"train_runtime": 172.4051,
"train_tokens_per_second": 1728.766
},
{
"epoch": 10.61111111111111,
"grad_norm": 0.0230712890625,
"learning_rate": 0.016046347106161877,
"loss": 0.2326,
"num_input_tokens_seen": 299648,
"step": 955,
"train_runtime": 173.1986,
"train_tokens_per_second": 1730.083
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.011474609375,
"learning_rate": 0.01590121038264835,
"loss": 0.2264,
"num_input_tokens_seen": 301216,
"step": 960,
"train_runtime": 173.995,
"train_tokens_per_second": 1731.176
},
{
"epoch": 10.722222222222221,
"grad_norm": 0.022705078125,
"learning_rate": 0.015755988930051302,
"loss": 0.2329,
"num_input_tokens_seen": 302784,
"step": 965,
"train_runtime": 174.7881,
"train_tokens_per_second": 1732.292
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.01312255859375,
"learning_rate": 0.01561069640165394,
"loss": 0.2371,
"num_input_tokens_seen": 304320,
"step": 970,
"train_runtime": 175.5852,
"train_tokens_per_second": 1733.175
},
{
"epoch": 10.833333333333334,
"grad_norm": 0.01214599609375,
"learning_rate": 0.015465346457421807,
"loss": 0.239,
"num_input_tokens_seen": 305856,
"step": 975,
"train_runtime": 176.3792,
"train_tokens_per_second": 1734.082
},
{
"epoch": 10.88888888888889,
"grad_norm": 0.0140380859375,
"learning_rate": 0.015319952762718515,
"loss": 0.2338,
"num_input_tokens_seen": 307424,
"step": 980,
"train_runtime": 177.1761,
"train_tokens_per_second": 1735.132
},
{
"epoch": 10.944444444444445,
"grad_norm": 0.01434326171875,
"learning_rate": 0.015174528987020958,
"loss": 0.234,
"num_input_tokens_seen": 308992,
"step": 985,
"train_runtime": 177.9704,
"train_tokens_per_second": 1736.198
},
{
"epoch": 11.0,
"grad_norm": 0.01129150390625,
"learning_rate": 0.015029088802634146,
"loss": 0.2349,
"num_input_tokens_seen": 310560,
"step": 990,
"train_runtime": 178.804,
"train_tokens_per_second": 1736.874
},
{
"epoch": 11.0,
"eval_loss": 0.23004861176013947,
"eval_runtime": 0.8164,
"eval_samples_per_second": 48.995,
"eval_steps_per_second": 12.249,
"num_input_tokens_seen": 310560,
"step": 990
},
{
"epoch": 11.055555555555555,
"grad_norm": 0.0113525390625,
"learning_rate": 0.014883645883405797,
"loss": 0.2328,
"num_input_tokens_seen": 312160,
"step": 995,
"train_runtime": 181.2905,
"train_tokens_per_second": 1721.877
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.01300048828125,
"learning_rate": 0.014738213903440746,
"loss": 0.2319,
"num_input_tokens_seen": 313728,
"step": 1000,
"train_runtime": 182.1312,
"train_tokens_per_second": 1722.538
},
{
"epoch": 11.166666666666666,
"grad_norm": 0.01287841796875,
"learning_rate": 0.014592806535815357,
"loss": 0.2386,
"num_input_tokens_seen": 315264,
"step": 1005,
"train_runtime": 182.9305,
"train_tokens_per_second": 1723.409
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.02099609375,
"learning_rate": 0.014447437451291999,
"loss": 0.2291,
"num_input_tokens_seen": 316864,
"step": 1010,
"train_runtime": 183.7272,
"train_tokens_per_second": 1724.644
},
{
"epoch": 11.277777777777779,
"grad_norm": 0.01251220703125,
"learning_rate": 0.014302120317033798,
"loss": 0.2201,
"num_input_tokens_seen": 318432,
"step": 1015,
"train_runtime": 184.5231,
"train_tokens_per_second": 1725.703
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.004974365234375,
"learning_rate": 0.014156868795319669,
"loss": 0.2403,
"num_input_tokens_seen": 320032,
"step": 1020,
"train_runtime": 185.3161,
"train_tokens_per_second": 1726.952
},
{
"epoch": 11.38888888888889,
"grad_norm": 0.01190185546875,
"learning_rate": 0.014011696542259821,
"loss": 0.2356,
"num_input_tokens_seen": 321536,
"step": 1025,
"train_runtime": 186.1035,
"train_tokens_per_second": 1727.727
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.01007080078125,
"learning_rate": 0.013866617206511882,
"loss": 0.235,
"num_input_tokens_seen": 323040,
"step": 1030,
"train_runtime": 186.8909,
"train_tokens_per_second": 1728.495
},
{
"epoch": 11.5,
"grad_norm": 0.00927734375,
"learning_rate": 0.013721644427997651,
"loss": 0.2268,
"num_input_tokens_seen": 324608,
"step": 1035,
"train_runtime": 187.6849,
"train_tokens_per_second": 1729.537
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.0125732421875,
"learning_rate": 0.01357679183662076,
"loss": 0.2333,
"num_input_tokens_seen": 326144,
"step": 1040,
"train_runtime": 188.4763,
"train_tokens_per_second": 1730.425
},
{
"epoch": 11.61111111111111,
"grad_norm": 0.0048828125,
"learning_rate": 0.0134320730509852,
"loss": 0.2322,
"num_input_tokens_seen": 327712,
"step": 1045,
"train_runtime": 189.2669,
"train_tokens_per_second": 1731.481
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.0029296875,
"learning_rate": 0.01328750167711494,
"loss": 0.2322,
"num_input_tokens_seen": 329248,
"step": 1050,
"train_runtime": 190.0636,
"train_tokens_per_second": 1732.304
},
{
"epoch": 11.722222222222221,
"grad_norm": 0.004974365234375,
"learning_rate": 0.013143091307174755,
"loss": 0.2413,
"num_input_tokens_seen": 330816,
"step": 1055,
"train_runtime": 190.8551,
"train_tokens_per_second": 1733.336
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.0208740234375,
"learning_rate": 0.012998855518192309,
"loss": 0.2275,
"num_input_tokens_seen": 332416,
"step": 1060,
"train_runtime": 191.6505,
"train_tokens_per_second": 1734.491
},
{
"epoch": 11.833333333333334,
"grad_norm": 0.010009765625,
"learning_rate": 0.012854807870781686,
"loss": 0.2338,
"num_input_tokens_seen": 334016,
"step": 1065,
"train_runtime": 192.4488,
"train_tokens_per_second": 1735.61
},
{
"epoch": 11.88888888888889,
"grad_norm": 0.01092529296875,
"learning_rate": 0.012710961907868478,
"loss": 0.2338,
"num_input_tokens_seen": 335616,
"step": 1070,
"train_runtime": 193.2439,
"train_tokens_per_second": 1736.748
},
{
"epoch": 11.944444444444445,
"grad_norm": 0.0027923583984375,
"learning_rate": 0.012567331153416489,
"loss": 0.2359,
"num_input_tokens_seen": 337152,
"step": 1075,
"train_runtime": 194.0342,
"train_tokens_per_second": 1737.59
},
{
"epoch": 12.0,
"grad_norm": 0.004852294921875,
"learning_rate": 0.012423929111156296,
"loss": 0.2315,
"num_input_tokens_seen": 338784,
"step": 1080,
"train_runtime": 194.8731,
"train_tokens_per_second": 1738.486
},
{
"epoch": 12.0,
"eval_loss": 0.23689353466033936,
"eval_runtime": 0.8185,
"eval_samples_per_second": 48.871,
"eval_steps_per_second": 12.218,
"num_input_tokens_seen": 338784,
"step": 1080
},
{
"epoch": 12.055555555555555,
"grad_norm": 0.01953125,
"learning_rate": 0.012280769263315627,
"loss": 0.2296,
"num_input_tokens_seen": 340288,
"step": 1085,
"train_runtime": 197.3733,
"train_tokens_per_second": 1724.083
},
{
"epoch": 12.11111111111111,
"grad_norm": 0.01123046875,
"learning_rate": 0.012137865069351828,
"loss": 0.2306,
"num_input_tokens_seen": 341888,
"step": 1090,
"train_runtime": 198.1719,
"train_tokens_per_second": 1725.209
},
{
"epoch": 12.166666666666666,
"grad_norm": 0.0205078125,
"learning_rate": 0.01199522996468644,
"loss": 0.2317,
"num_input_tokens_seen": 343488,
"step": 1095,
"train_runtime": 198.9687,
"train_tokens_per_second": 1726.342
},
{
"epoch": 12.222222222222221,
"grad_norm": 0.0130615234375,
"learning_rate": 0.01185287735944204,
"loss": 0.2309,
"num_input_tokens_seen": 344992,
"step": 1100,
"train_runtime": 199.7613,
"train_tokens_per_second": 1727.021
},
{
"epoch": 12.277777777777779,
"grad_norm": 0.0029296875,
"learning_rate": 0.011710820637181448,
"loss": 0.2392,
"num_input_tokens_seen": 346560,
"step": 1105,
"train_runtime": 200.5543,
"train_tokens_per_second": 1728.011
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.00457763671875,
"learning_rate": 0.011569073153649483,
"loss": 0.2339,
"num_input_tokens_seen": 348160,
"step": 1110,
"train_runtime": 201.3491,
"train_tokens_per_second": 1729.136
},
{
"epoch": 12.38888888888889,
"grad_norm": 0.0118408203125,
"learning_rate": 0.01142764823551724,
"loss": 0.234,
"num_input_tokens_seen": 349760,
"step": 1115,
"train_runtime": 202.1428,
"train_tokens_per_second": 1730.262
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.01214599609375,
"learning_rate": 0.011286559179129213,
"loss": 0.2319,
"num_input_tokens_seen": 351328,
"step": 1120,
"train_runtime": 202.9386,
"train_tokens_per_second": 1731.204
},
{
"epoch": 12.5,
"grad_norm": 0.020263671875,
"learning_rate": 0.01114581924925317,
"loss": 0.2318,
"num_input_tokens_seen": 352896,
"step": 1125,
"train_runtime": 203.734,
"train_tokens_per_second": 1732.141
},
{
"epoch": 12.555555555555555,
"grad_norm": 0.01019287109375,
"learning_rate": 0.011005441677833067,
"loss": 0.2295,
"num_input_tokens_seen": 354464,
"step": 1130,
"train_runtime": 204.5288,
"train_tokens_per_second": 1733.076
},
{
"epoch": 12.61111111111111,
"grad_norm": 0.00994873046875,
"learning_rate": 0.010865439662745013,
"loss": 0.2339,
"num_input_tokens_seen": 356032,
"step": 1135,
"train_runtime": 205.321,
"train_tokens_per_second": 1734.026
},
{
"epoch": 12.666666666666666,
"grad_norm": 0.01141357421875,
"learning_rate": 0.01072582636655643,
"loss": 0.2263,
"num_input_tokens_seen": 357632,
"step": 1140,
"train_runtime": 206.1151,
"train_tokens_per_second": 1735.108
},
{
"epoch": 12.722222222222221,
"grad_norm": 0.0113525390625,
"learning_rate": 0.010586614915288572,
"loss": 0.2327,
"num_input_tokens_seen": 359168,
"step": 1145,
"train_runtime": 206.9071,
"train_tokens_per_second": 1735.89
},
{
"epoch": 12.777777777777779,
"grad_norm": 0.005645751953125,
"learning_rate": 0.010447818397182444,
"loss": 0.2337,
"num_input_tokens_seen": 360736,
"step": 1150,
"train_runtime": 207.6979,
"train_tokens_per_second": 1736.83
},
{
"epoch": 12.833333333333334,
"grad_norm": 0.0033721923828125,
"learning_rate": 0.010309449861468272,
"loss": 0.2317,
"num_input_tokens_seen": 362304,
"step": 1155,
"train_runtime": 208.4895,
"train_tokens_per_second": 1737.757
},
{
"epoch": 12.88888888888889,
"grad_norm": 0.004791259765625,
"learning_rate": 0.010171522317138689,
"loss": 0.2318,
"num_input_tokens_seen": 363872,
"step": 1160,
"train_runtime": 209.2816,
"train_tokens_per_second": 1738.671
},
{
"epoch": 12.944444444444445,
"grad_norm": 0.00982666015625,
"learning_rate": 0.01003404873172563,
"loss": 0.2339,
"num_input_tokens_seen": 365376,
"step": 1165,
"train_runtime": 210.0732,
"train_tokens_per_second": 1739.28
},
{
"epoch": 13.0,
"grad_norm": 0.0029296875,
"learning_rate": 0.009897042030081191,
"loss": 0.2297,
"num_input_tokens_seen": 366944,
"step": 1170,
"train_runtime": 210.9074,
"train_tokens_per_second": 1739.834
},
{
"epoch": 13.0,
"eval_loss": 0.2312408983707428,
"eval_runtime": 0.8181,
"eval_samples_per_second": 48.893,
"eval_steps_per_second": 12.223,
"num_input_tokens_seen": 366944,
"step": 1170
},
{
"epoch": 13.055555555555555,
"grad_norm": 0.0034332275390625,
"learning_rate": 0.009760515093162463,
"loss": 0.2329,
"num_input_tokens_seen": 368384,
"step": 1175,
"train_runtime": 213.385,
"train_tokens_per_second": 1726.382
},
{
"epoch": 13.11111111111111,
"grad_norm": 0.01220703125,
"learning_rate": 0.009624480756820496,
"loss": 0.2307,
"num_input_tokens_seen": 369984,
"step": 1180,
"train_runtime": 214.2014,
"train_tokens_per_second": 1727.272
},
{
"epoch": 13.166666666666666,
"grad_norm": 0.011474609375,
"learning_rate": 0.009488951810593525,
"loss": 0.2327,
"num_input_tokens_seen": 371520,
"step": 1185,
"train_runtime": 214.9913,
"train_tokens_per_second": 1728.07
},
{
"epoch": 13.222222222222221,
"grad_norm": 0.01251220703125,
"learning_rate": 0.009353940996504537,
"loss": 0.2391,
"num_input_tokens_seen": 373120,
"step": 1190,
"train_runtime": 215.7896,
"train_tokens_per_second": 1729.092
},
{
"epoch": 13.277777777777779,
"grad_norm": 0.01239013671875,
"learning_rate": 0.009219461007863278,
"loss": 0.2317,
"num_input_tokens_seen": 374688,
"step": 1195,
"train_runtime": 216.5862,
"train_tokens_per_second": 1729.972
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.0036163330078125,
"learning_rate": 0.009085524488072901,
"loss": 0.2347,
"num_input_tokens_seen": 376288,
"step": 1200,
"train_runtime": 217.3817,
"train_tokens_per_second": 1731.001
},
{
"epoch": 13.38888888888889,
"grad_norm": 0.00543212890625,
"learning_rate": 0.008952144029441248,
"loss": 0.2304,
"num_input_tokens_seen": 377888,
"step": 1205,
"train_runtime": 218.1804,
"train_tokens_per_second": 1731.998
},
{
"epoch": 13.444444444444445,
"grad_norm": 0.0064697265625,
"learning_rate": 0.008819332171996975,
"loss": 0.2325,
"num_input_tokens_seen": 379424,
"step": 1210,
"train_runtime": 218.971,
"train_tokens_per_second": 1732.759
},
{
"epoch": 13.5,
"grad_norm": 0.01080322265625,
"learning_rate": 0.008687101402310564,
"loss": 0.2336,
"num_input_tokens_seen": 380992,
"step": 1215,
"train_runtime": 219.765,
"train_tokens_per_second": 1733.634
},
{
"epoch": 13.555555555555555,
"grad_norm": 0.003936767578125,
"learning_rate": 0.008555464152320372,
"loss": 0.2295,
"num_input_tokens_seen": 382592,
"step": 1220,
"train_runtime": 220.5584,
"train_tokens_per_second": 1734.652
},
{
"epoch": 13.61111111111111,
"grad_norm": 0.01177978515625,
"learning_rate": 0.008424432798163836,
"loss": 0.2284,
"num_input_tokens_seen": 384192,
"step": 1225,
"train_runtime": 221.3532,
"train_tokens_per_second": 1735.651
},
{
"epoch": 13.666666666666666,
"grad_norm": 0.01123046875,
"learning_rate": 0.008294019659013892,
"loss": 0.2325,
"num_input_tokens_seen": 385760,
"step": 1230,
"train_runtime": 222.1454,
"train_tokens_per_second": 1736.521
},
{
"epoch": 13.722222222222221,
"grad_norm": 0.0120849609375,
"learning_rate": 0.008164236995920735,
"loss": 0.2358,
"num_input_tokens_seen": 387328,
"step": 1235,
"train_runtime": 222.9361,
"train_tokens_per_second": 1737.395
},
{
"epoch": 13.777777777777779,
"grad_norm": 0.00482177734375,
"learning_rate": 0.008035097010659147,
"loss": 0.2295,
"num_input_tokens_seen": 388896,
"step": 1240,
"train_runtime": 223.7293,
"train_tokens_per_second": 1738.244
},
{
"epoch": 13.833333333333334,
"grad_norm": 0.0101318359375,
"learning_rate": 0.00790661184458125,
"loss": 0.2346,
"num_input_tokens_seen": 390496,
"step": 1245,
"train_runtime": 224.5255,
"train_tokens_per_second": 1739.206
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.0030670166015625,
"learning_rate": 0.007778793577475039,
"loss": 0.2284,
"num_input_tokens_seen": 392064,
"step": 1250,
"train_runtime": 225.3179,
"train_tokens_per_second": 1740.048
},
{
"epoch": 13.944444444444445,
"grad_norm": 0.0093994140625,
"learning_rate": 0.007651654226428696,
"loss": 0.2265,
"num_input_tokens_seen": 393632,
"step": 1255,
"train_runtime": 226.1132,
"train_tokens_per_second": 1740.862
},
{
"epoch": 14.0,
"grad_norm": 0.0101318359375,
"learning_rate": 0.0075252057447007465,
"loss": 0.2276,
"num_input_tokens_seen": 395104,
"step": 1260,
"train_runtime": 226.95,
"train_tokens_per_second": 1740.93
},
{
"epoch": 14.0,
"eval_loss": 0.2316901683807373,
"eval_runtime": 0.8178,
"eval_samples_per_second": 48.909,
"eval_steps_per_second": 12.227,
"num_input_tokens_seen": 395104,
"step": 1260
},
{
"epoch": 14.055555555555555,
"grad_norm": 0.00628662109375,
"learning_rate": 0.007399460020596265,
"loss": 0.2307,
"num_input_tokens_seen": 396672,
"step": 1265,
"train_runtime": 229.4732,
"train_tokens_per_second": 1728.62
},
{
"epoch": 14.11111111111111,
"grad_norm": 0.005584716796875,
"learning_rate": 0.007274428876349185,
"loss": 0.2348,
"num_input_tokens_seen": 398304,
"step": 1270,
"train_runtime": 230.292,
"train_tokens_per_second": 1729.561
},
{
"epoch": 14.166666666666666,
"grad_norm": 0.01055908203125,
"learning_rate": 0.007150124067010788,
"loss": 0.2317,
"num_input_tokens_seen": 399840,
"step": 1275,
"train_runtime": 231.085,
"train_tokens_per_second": 1730.272
},
{
"epoch": 14.222222222222221,
"grad_norm": 0.0037689208984375,
"learning_rate": 0.007026557279344533,
"loss": 0.2286,
"num_input_tokens_seen": 401440,
"step": 1280,
"train_runtime": 231.8835,
"train_tokens_per_second": 1731.214
},
{
"epoch": 14.277777777777779,
"grad_norm": 0.0035400390625,
"learning_rate": 0.006903740130727311,
"loss": 0.2264,
"num_input_tokens_seen": 403040,
"step": 1285,
"train_runtime": 232.6814,
"train_tokens_per_second": 1732.154
},
{
"epoch": 14.333333333333334,
"grad_norm": 0.01104736328125,
"learning_rate": 0.0067816841680572015,
"loss": 0.2337,
"num_input_tokens_seen": 404640,
"step": 1290,
"train_runtime": 233.4758,
"train_tokens_per_second": 1733.113
},
{
"epoch": 14.38888888888889,
"grad_norm": 0.004364013671875,
"learning_rate": 0.006660400866667899,
"loss": 0.2246,
"num_input_tokens_seen": 406208,
"step": 1295,
"train_runtime": 234.2675,
"train_tokens_per_second": 1733.95
},
{
"epoch": 14.444444444444445,
"grad_norm": 0.005584716796875,
"learning_rate": 0.006539901629249787,
"loss": 0.2322,
"num_input_tokens_seen": 407776,
"step": 1300,
"train_runtime": 235.0597,
"train_tokens_per_second": 1734.776
},
{
"epoch": 14.5,
"grad_norm": 0.004791259765625,
"learning_rate": 0.006420197784777924,
"loss": 0.2268,
"num_input_tokens_seen": 409312,
"step": 1305,
"train_runtime": 235.8489,
"train_tokens_per_second": 1735.484
},
{
"epoch": 14.555555555555555,
"grad_norm": 0.02587890625,
"learning_rate": 0.006301300587446937,
"loss": 0.2314,
"num_input_tokens_seen": 410816,
"step": 1310,
"train_runtime": 236.6364,
"train_tokens_per_second": 1736.064
},
{
"epoch": 14.61111111111111,
"grad_norm": 0.0244140625,
"learning_rate": 0.006183221215612904,
"loss": 0.2415,
"num_input_tokens_seen": 412416,
"step": 1315,
"train_runtime": 237.4299,
"train_tokens_per_second": 1737.001
},
{
"epoch": 14.666666666666666,
"grad_norm": 0.0108642578125,
"learning_rate": 0.00606597077074242,
"loss": 0.2288,
"num_input_tokens_seen": 414016,
"step": 1320,
"train_runtime": 238.223,
"train_tokens_per_second": 1737.935
},
{
"epoch": 14.722222222222221,
"grad_norm": 0.003570556640625,
"learning_rate": 0.005949560276368865,
"loss": 0.2402,
"num_input_tokens_seen": 415552,
"step": 1325,
"train_runtime": 239.012,
"train_tokens_per_second": 1738.624
},
{
"epoch": 14.777777777777779,
"grad_norm": 0.005096435546875,
"learning_rate": 0.005834000677056003,
"loss": 0.2289,
"num_input_tokens_seen": 417088,
"step": 1330,
"train_runtime": 239.8035,
"train_tokens_per_second": 1739.291
},
{
"epoch": 14.833333333333334,
"grad_norm": 0.010009765625,
"learning_rate": 0.005719302837369021,
"loss": 0.2317,
"num_input_tokens_seen": 418656,
"step": 1335,
"train_runtime": 240.5946,
"train_tokens_per_second": 1740.089
},
{
"epoch": 14.88888888888889,
"grad_norm": 0.01055908203125,
"learning_rate": 0.00560547754085305,
"loss": 0.2265,
"num_input_tokens_seen": 420256,
"step": 1340,
"train_runtime": 241.3879,
"train_tokens_per_second": 1740.999
},
{
"epoch": 14.944444444444445,
"grad_norm": 0.0203857421875,
"learning_rate": 0.005492535489019344,
"loss": 0.2245,
"num_input_tokens_seen": 421792,
"step": 1345,
"train_runtime": 242.1774,
"train_tokens_per_second": 1741.665
},
{
"epoch": 15.0,
"grad_norm": 0.0223388671875,
"learning_rate": 0.005380487300339167,
"loss": 0.2402,
"num_input_tokens_seen": 423360,
"step": 1350,
"train_runtime": 243.0095,
"train_tokens_per_second": 1742.154
},
{
"epoch": 15.0,
"eval_loss": 0.23129186034202576,
"eval_runtime": 0.8149,
"eval_samples_per_second": 49.088,
"eval_steps_per_second": 12.272,
"num_input_tokens_seen": 423360,
"step": 1350
},
{
"epoch": 15.055555555555555,
"grad_norm": 0.01123046875,
"learning_rate": 0.005269343509245449,
"loss": 0.2339,
"num_input_tokens_seen": 424992,
"step": 1355,
"train_runtime": 245.5308,
"train_tokens_per_second": 1730.911
},
{
"epoch": 15.11111111111111,
"grad_norm": 0.01226806640625,
"learning_rate": 0.005159114565142392,
"loss": 0.2307,
"num_input_tokens_seen": 426528,
"step": 1360,
"train_runtime": 246.3315,
"train_tokens_per_second": 1731.52
},
{
"epoch": 15.166666666666666,
"grad_norm": 0.0106201171875,
"learning_rate": 0.0050498108314230425,
"loss": 0.2318,
"num_input_tokens_seen": 428096,
"step": 1365,
"train_runtime": 247.1262,
"train_tokens_per_second": 1732.297
},
{
"epoch": 15.222222222222221,
"grad_norm": 0.00732421875,
"learning_rate": 0.0049414425844949445,
"loss": 0.2307,
"num_input_tokens_seen": 429600,
"step": 1370,
"train_runtime": 247.9142,
"train_tokens_per_second": 1732.858
},
{
"epoch": 15.277777777777779,
"grad_norm": 0.00335693359375,
"learning_rate": 0.004834020012814016,
"loss": 0.2337,
"num_input_tokens_seen": 431200,
"step": 1375,
"train_runtime": 248.7142,
"train_tokens_per_second": 1733.717
},
{
"epoch": 15.333333333333334,
"grad_norm": 0.01177978515625,
"learning_rate": 0.004727553215926623,
"loss": 0.2305,
"num_input_tokens_seen": 432736,
"step": 1380,
"train_runtime": 249.5378,
"train_tokens_per_second": 1734.15
},
{
"epoch": 15.38888888888889,
"grad_norm": 0.010498046875,
"learning_rate": 0.004622052203520061,
"loss": 0.2276,
"num_input_tokens_seen": 434336,
"step": 1385,
"train_runtime": 250.3618,
"train_tokens_per_second": 1734.834
},
{
"epoch": 15.444444444444445,
"grad_norm": 0.0115966796875,
"learning_rate": 0.004517526894481498,
"loss": 0.2348,
"num_input_tokens_seen": 435904,
"step": 1390,
"train_runtime": 251.1536,
"train_tokens_per_second": 1735.607
},
{
"epoch": 15.5,
"grad_norm": 0.0107421875,
"learning_rate": 0.004413987115965404,
"loss": 0.2286,
"num_input_tokens_seen": 437440,
"step": 1395,
"train_runtime": 251.9431,
"train_tokens_per_second": 1736.265
},
{
"epoch": 15.555555555555555,
"grad_norm": 0.01171875,
"learning_rate": 0.004311442602469636,
"loss": 0.2347,
"num_input_tokens_seen": 438976,
"step": 1400,
"train_runtime": 252.733,
"train_tokens_per_second": 1736.916
},
{
"epoch": 15.61111111111111,
"grad_norm": 0.005950927734375,
"learning_rate": 0.004209902994920235,
"loss": 0.2255,
"num_input_tokens_seen": 440512,
"step": 1405,
"train_runtime": 253.5249,
"train_tokens_per_second": 1737.549
},
{
"epoch": 15.666666666666666,
"grad_norm": 0.01190185546875,
"learning_rate": 0.004109377839765016,
"loss": 0.2295,
"num_input_tokens_seen": 442112,
"step": 1410,
"train_runtime": 254.3181,
"train_tokens_per_second": 1738.421
},
{
"epoch": 15.722222222222221,
"grad_norm": 0.012451171875,
"learning_rate": 0.004009876588076046,
"loss": 0.2339,
"num_input_tokens_seen": 443616,
"step": 1415,
"train_runtime": 255.1075,
"train_tokens_per_second": 1738.938
},
{
"epoch": 15.777777777777779,
"grad_norm": 0.022705078125,
"learning_rate": 0.003911408594661061,
"loss": 0.2316,
"num_input_tokens_seen": 445184,
"step": 1420,
"train_runtime": 255.8999,
"train_tokens_per_second": 1739.68
},
{
"epoch": 15.833333333333334,
"grad_norm": 0.012451171875,
"learning_rate": 0.0038139831171839726,
"loss": 0.2308,
"num_input_tokens_seen": 446752,
"step": 1425,
"train_runtime": 256.6958,
"train_tokens_per_second": 1740.394
},
{
"epoch": 15.88888888888889,
"grad_norm": 0.01324462890625,
"learning_rate": 0.0037176093152944947,
"loss": 0.2318,
"num_input_tokens_seen": 448352,
"step": 1430,
"train_runtime": 257.491,
"train_tokens_per_second": 1741.234
},
{
"epoch": 15.944444444444445,
"grad_norm": 0.0101318359375,
"learning_rate": 0.0036222962497669668,
"loss": 0.2276,
"num_input_tokens_seen": 449888,
"step": 1435,
"train_runtime": 258.2827,
"train_tokens_per_second": 1741.843
},
{
"epoch": 16.0,
"grad_norm": 0.00531005859375,
"learning_rate": 0.003528052881648488,
"loss": 0.2338,
"num_input_tokens_seen": 451424,
"step": 1440,
"train_runtime": 259.1151,
"train_tokens_per_second": 1742.176
},
{
"epoch": 16.0,
"eval_loss": 0.23337697982788086,
"eval_runtime": 0.8216,
"eval_samples_per_second": 48.684,
"eval_steps_per_second": 12.171,
"num_input_tokens_seen": 451424,
"step": 1440
},
{
"epoch": 16.055555555555557,
"grad_norm": 0.0111083984375,
"learning_rate": 0.0034348880714164414,
"loss": 0.2306,
"num_input_tokens_seen": 452992,
"step": 1445,
"train_runtime": 261.5987,
"train_tokens_per_second": 1731.629
},
{
"epoch": 16.11111111111111,
"grad_norm": 0.00958251953125,
"learning_rate": 0.0033428105781454364,
"loss": 0.2266,
"num_input_tokens_seen": 454496,
"step": 1450,
"train_runtime": 262.4288,
"train_tokens_per_second": 1731.883
},
{
"epoch": 16.166666666666668,
"grad_norm": 0.005096435546875,
"learning_rate": 0.0032518290586838377,
"loss": 0.2359,
"num_input_tokens_seen": 456096,
"step": 1455,
"train_runtime": 263.2247,
"train_tokens_per_second": 1732.725
},
{
"epoch": 16.22222222222222,
"grad_norm": 0.0118408203125,
"learning_rate": 0.0031619520668398388,
"loss": 0.2308,
"num_input_tokens_seen": 457696,
"step": 1460,
"train_runtime": 264.0196,
"train_tokens_per_second": 1733.568
},
{
"epoch": 16.27777777777778,
"grad_norm": 0.00433349609375,
"learning_rate": 0.003073188052577281,
"loss": 0.2318,
"num_input_tokens_seen": 459232,
"step": 1465,
"train_runtime": 264.8136,
"train_tokens_per_second": 1734.171
},
{
"epoch": 16.333333333333332,
"grad_norm": 0.0062255859375,
"learning_rate": 0.00298554536122122,
"loss": 0.2337,
"num_input_tokens_seen": 460832,
"step": 1470,
"train_runtime": 265.6148,
"train_tokens_per_second": 1734.964
},
{
"epoch": 16.38888888888889,
"grad_norm": 0.0040283203125,
"learning_rate": 0.0028990322326732957,
"loss": 0.2329,
"num_input_tokens_seen": 462432,
"step": 1475,
"train_runtime": 266.4104,
"train_tokens_per_second": 1735.788
},
{
"epoch": 16.444444444444443,
"grad_norm": 0.00653076171875,
"learning_rate": 0.0028136568006370643,
"loss": 0.2245,
"num_input_tokens_seen": 464000,
"step": 1480,
"train_runtime": 267.2022,
"train_tokens_per_second": 1736.513
},
{
"epoch": 16.5,
"grad_norm": 0.01025390625,
"learning_rate": 0.0027294270918532875,
"loss": 0.2256,
"num_input_tokens_seen": 465536,
"step": 1485,
"train_runtime": 267.991,
"train_tokens_per_second": 1737.133
},
{
"epoch": 16.555555555555557,
"grad_norm": 0.02001953125,
"learning_rate": 0.0026463510253452744,
"loss": 0.2255,
"num_input_tokens_seen": 467136,
"step": 1490,
"train_runtime": 268.784,
"train_tokens_per_second": 1737.96
},
{
"epoch": 16.61111111111111,
"grad_norm": 0.002838134765625,
"learning_rate": 0.0025644364116743754,
"loss": 0.2308,
"num_input_tokens_seen": 468672,
"step": 1495,
"train_runtime": 269.575,
"train_tokens_per_second": 1738.559
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.003387451171875,
"learning_rate": 0.002483690952205637,
"loss": 0.235,
"num_input_tokens_seen": 470272,
"step": 1500,
"train_runtime": 270.3683,
"train_tokens_per_second": 1739.376
},
{
"epoch": 16.72222222222222,
"grad_norm": 0.0108642578125,
"learning_rate": 0.0024041222383837536,
"loss": 0.2306,
"num_input_tokens_seen": 471872,
"step": 1505,
"train_runtime": 271.1607,
"train_tokens_per_second": 1740.193
},
{
"epoch": 16.77777777777778,
"grad_norm": 0.011962890625,
"learning_rate": 0.002325737751019347,
"loss": 0.2276,
"num_input_tokens_seen": 473440,
"step": 1510,
"train_runtime": 271.9576,
"train_tokens_per_second": 1740.859
},
{
"epoch": 16.833333333333332,
"grad_norm": 0.01055908203125,
"learning_rate": 0.00224854485958563,
"loss": 0.2308,
"num_input_tokens_seen": 475008,
"step": 1515,
"train_runtime": 272.7521,
"train_tokens_per_second": 1741.537
},
{
"epoch": 16.88888888888889,
"grad_norm": 0.0120849609375,
"learning_rate": 0.0021725508215255634,
"loss": 0.234,
"num_input_tokens_seen": 476608,
"step": 1520,
"train_runtime": 273.5491,
"train_tokens_per_second": 1742.312
},
{
"epoch": 16.944444444444443,
"grad_norm": 0.01226806640625,
"learning_rate": 0.0020977627815695213,
"loss": 0.2286,
"num_input_tokens_seen": 478176,
"step": 1525,
"train_runtime": 274.3439,
"train_tokens_per_second": 1742.98
},
{
"epoch": 17.0,
"grad_norm": 0.0130615234375,
"learning_rate": 0.0020241877710635747,
"loss": 0.2339,
"num_input_tokens_seen": 479744,
"step": 1530,
"train_runtime": 275.1758,
"train_tokens_per_second": 1743.409
},
{
"epoch": 17.0,
"eval_loss": 0.2328735888004303,
"eval_runtime": 0.8173,
"eval_samples_per_second": 48.943,
"eval_steps_per_second": 12.236,
"num_input_tokens_seen": 479744,
"step": 1530
},
{
"epoch": 17.055555555555557,
"grad_norm": 0.0213623046875,
"learning_rate": 0.0019518327073084285,
"loss": 0.2328,
"num_input_tokens_seen": 481344,
"step": 1535,
"train_runtime": 277.7336,
"train_tokens_per_second": 1733.114
},
{
"epoch": 17.11111111111111,
"grad_norm": 0.01092529296875,
"learning_rate": 0.0018807043929090638,
"loss": 0.2328,
"num_input_tokens_seen": 482944,
"step": 1540,
"train_runtime": 278.5559,
"train_tokens_per_second": 1733.742
},
{
"epoch": 17.166666666666668,
"grad_norm": 0.01220703125,
"learning_rate": 0.0018108095151351837,
"loss": 0.2275,
"num_input_tokens_seen": 484480,
"step": 1545,
"train_runtime": 279.3468,
"train_tokens_per_second": 1734.332
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.01348876953125,
"learning_rate": 0.001742154645292508,
"loss": 0.2381,
"num_input_tokens_seen": 486016,
"step": 1550,
"train_runtime": 280.1392,
"train_tokens_per_second": 1734.909
},
{
"epoch": 17.27777777777778,
"grad_norm": 0.0045166015625,
"learning_rate": 0.0016747462381049415,
"loss": 0.2307,
"num_input_tokens_seen": 487584,
"step": 1555,
"train_runtime": 280.9345,
"train_tokens_per_second": 1735.579
},
{
"epoch": 17.333333333333332,
"grad_norm": 0.006256103515625,
"learning_rate": 0.0016085906311077212,
"loss": 0.2339,
"num_input_tokens_seen": 489088,
"step": 1560,
"train_runtime": 281.7263,
"train_tokens_per_second": 1736.04
},
{
"epoch": 17.38888888888889,
"grad_norm": 0.01171875,
"learning_rate": 0.0015436940440516017,
"loss": 0.2306,
"num_input_tokens_seen": 490688,
"step": 1565,
"train_runtime": 282.5227,
"train_tokens_per_second": 1736.809
},
{
"epoch": 17.444444444444443,
"grad_norm": 0.022216796875,
"learning_rate": 0.0014800625783180658,
"loss": 0.237,
"num_input_tokens_seen": 492288,
"step": 1570,
"train_runtime": 283.316,
"train_tokens_per_second": 1737.593
},
{
"epoch": 17.5,
"grad_norm": 0.011474609375,
"learning_rate": 0.0014177022163457135,
"loss": 0.2308,
"num_input_tokens_seen": 493824,
"step": 1575,
"train_runtime": 284.1065,
"train_tokens_per_second": 1738.165
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.01409912109375,
"learning_rate": 0.0013566188210677903,
"loss": 0.2338,
"num_input_tokens_seen": 495456,
"step": 1580,
"train_runtime": 284.9046,
"train_tokens_per_second": 1739.024
},
{
"epoch": 17.61111111111111,
"grad_norm": 0.004791259765625,
"learning_rate": 0.0012968181353609854,
"loss": 0.2307,
"num_input_tokens_seen": 497024,
"step": 1585,
"train_runtime": 285.6964,
"train_tokens_per_second": 1739.693
},
{
"epoch": 17.666666666666668,
"grad_norm": 0.01092529296875,
"learning_rate": 0.0012383057815055082,
"loss": 0.2266,
"num_input_tokens_seen": 498592,
"step": 1590,
"train_runtime": 286.4968,
"train_tokens_per_second": 1740.306
},
{
"epoch": 17.72222222222222,
"grad_norm": 0.01214599609375,
"learning_rate": 0.001181087260656487,
"loss": 0.2308,
"num_input_tokens_seen": 500128,
"step": 1595,
"train_runtime": 287.2884,
"train_tokens_per_second": 1740.857
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.0223388671875,
"learning_rate": 0.0011251679523267587,
"loss": 0.2297,
"num_input_tokens_seen": 501696,
"step": 1600,
"train_runtime": 288.0865,
"train_tokens_per_second": 1741.477
},
{
"epoch": 17.833333333333332,
"grad_norm": 0.003814697265625,
"learning_rate": 0.0010705531138811369,
"loss": 0.2327,
"num_input_tokens_seen": 503232,
"step": 1605,
"train_runtime": 288.877,
"train_tokens_per_second": 1742.029
},
{
"epoch": 17.88888888888889,
"grad_norm": 0.01214599609375,
"learning_rate": 0.0010172478800420954,
"loss": 0.2296,
"num_input_tokens_seen": 504736,
"step": 1610,
"train_runtime": 289.6642,
"train_tokens_per_second": 1742.487
},
{
"epoch": 17.944444444444443,
"grad_norm": 0.021484375,
"learning_rate": 0.0009652572624070293,
"loss": 0.2256,
"num_input_tokens_seen": 506304,
"step": 1615,
"train_runtime": 290.4568,
"train_tokens_per_second": 1743.13
},
{
"epoch": 18.0,
"grad_norm": 0.00579833984375,
"learning_rate": 0.0009145861489770912,
"loss": 0.2307,
"num_input_tokens_seen": 507872,
"step": 1620,
"train_runtime": 291.2951,
"train_tokens_per_second": 1743.497
},
{
"epoch": 18.0,
"eval_loss": 0.23335394263267517,
"eval_runtime": 0.8176,
"eval_samples_per_second": 48.921,
"eval_steps_per_second": 12.23,
"num_input_tokens_seen": 507872,
"step": 1620
},
{
"epoch": 18.055555555555557,
"grad_norm": 0.00408935546875,
"learning_rate": 0.0008652393036976157,
"loss": 0.2286,
"num_input_tokens_seen": 509408,
"step": 1625,
"train_runtime": 293.8353,
"train_tokens_per_second": 1733.651
},
{
"epoch": 18.11111111111111,
"grad_norm": 0.020751953125,
"learning_rate": 0.0008172213660102473,
"loss": 0.2267,
"num_input_tokens_seen": 510912,
"step": 1630,
"train_runtime": 294.6455,
"train_tokens_per_second": 1733.989
},
{
"epoch": 18.166666666666668,
"grad_norm": 0.0113525390625,
"learning_rate": 0.0007705368504167398,
"loss": 0.2329,
"num_input_tokens_seen": 512384,
"step": 1635,
"train_runtime": 295.4336,
"train_tokens_per_second": 1734.346
},
{
"epoch": 18.22222222222222,
"grad_norm": 0.01153564453125,
"learning_rate": 0.0007251901460545118,
"loss": 0.2307,
"num_input_tokens_seen": 513952,
"step": 1640,
"train_runtime": 296.2304,
"train_tokens_per_second": 1734.974
},
{
"epoch": 18.27777777777778,
"grad_norm": 0.003753662109375,
"learning_rate": 0.0006811855162840213,
"loss": 0.238,
"num_input_tokens_seen": 515520,
"step": 1645,
"train_runtime": 297.0246,
"train_tokens_per_second": 1735.614
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.003143310546875,
"learning_rate": 0.0006385270982879065,
"loss": 0.236,
"num_input_tokens_seen": 517120,
"step": 1650,
"train_runtime": 297.8218,
"train_tokens_per_second": 1736.34
},
{
"epoch": 18.38888888888889,
"grad_norm": 0.0034332275390625,
"learning_rate": 0.0005972189026820351,
"loss": 0.2276,
"num_input_tokens_seen": 518688,
"step": 1655,
"train_runtime": 298.6166,
"train_tokens_per_second": 1736.97
},
{
"epoch": 18.444444444444443,
"grad_norm": 0.01287841796875,
"learning_rate": 0.0005572648131384361,
"loss": 0.2358,
"num_input_tokens_seen": 520224,
"step": 1660,
"train_runtime": 299.4059,
"train_tokens_per_second": 1737.521
},
{
"epoch": 18.5,
"grad_norm": 0.01214599609375,
"learning_rate": 0.0005186685860201717,
"loss": 0.2255,
"num_input_tokens_seen": 521824,
"step": 1665,
"train_runtime": 300.2061,
"train_tokens_per_second": 1738.219
},
{
"epoch": 18.555555555555557,
"grad_norm": 0.00579833984375,
"learning_rate": 0.0004814338500281634,
"loss": 0.2297,
"num_input_tokens_seen": 523424,
"step": 1670,
"train_runtime": 301.0084,
"train_tokens_per_second": 1738.902
},
{
"epoch": 18.61111111111111,
"grad_norm": 0.004425048828125,
"learning_rate": 0.0004455641058600529,
"loss": 0.2307,
"num_input_tokens_seen": 524960,
"step": 1675,
"train_runtime": 301.8068,
"train_tokens_per_second": 1739.391
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.020751953125,
"learning_rate": 0.00041106272588105564,
"loss": 0.2255,
"num_input_tokens_seen": 526496,
"step": 1680,
"train_runtime": 302.6028,
"train_tokens_per_second": 1739.891
},
{
"epoch": 18.72222222222222,
"grad_norm": 0.01141357421875,
"learning_rate": 0.0003779329538069159,
"loss": 0.2317,
"num_input_tokens_seen": 528064,
"step": 1685,
"train_runtime": 303.3978,
"train_tokens_per_second": 1740.5
},
{
"epoch": 18.77777777777778,
"grad_norm": 0.00982666015625,
"learning_rate": 0.00034617790439893603,
"loss": 0.2255,
"num_input_tokens_seen": 529632,
"step": 1690,
"train_runtime": 304.1914,
"train_tokens_per_second": 1741.114
},
{
"epoch": 18.833333333333332,
"grad_norm": 0.011474609375,
"learning_rate": 0.00031580056317113525,
"loss": 0.2327,
"num_input_tokens_seen": 531232,
"step": 1695,
"train_runtime": 304.9886,
"train_tokens_per_second": 1741.809
},
{
"epoch": 18.88888888888889,
"grad_norm": 0.0033721923828125,
"learning_rate": 0.00028680378610956793,
"loss": 0.2338,
"num_input_tokens_seen": 532800,
"step": 1700,
"train_runtime": 305.7795,
"train_tokens_per_second": 1742.432
},
{
"epoch": 18.944444444444443,
"grad_norm": 0.00628662109375,
"learning_rate": 0.00025919029940380146,
"loss": 0.2245,
"num_input_tokens_seen": 534400,
"step": 1705,
"train_runtime": 306.5763,
"train_tokens_per_second": 1743.123
},
{
"epoch": 19.0,
"grad_norm": 0.005096435546875,
"learning_rate": 0.0002329626991906164,
"loss": 0.2307,
"num_input_tokens_seen": 535968,
"step": 1710,
"train_runtime": 307.4089,
"train_tokens_per_second": 1743.502
},
{
"epoch": 19.0,
"eval_loss": 0.23493099212646484,
"eval_runtime": 0.8178,
"eval_samples_per_second": 48.914,
"eval_steps_per_second": 12.229,
"num_input_tokens_seen": 535968,
"step": 1710
},
{
"epoch": 19.055555555555557,
"grad_norm": 0.0224609375,
"learning_rate": 0.00020812345130992503,
"loss": 0.2327,
"num_input_tokens_seen": 537536,
"step": 1715,
"train_runtime": 309.9741,
"train_tokens_per_second": 1734.132
},
{
"epoch": 19.11111111111111,
"grad_norm": 0.006378173828125,
"learning_rate": 0.0001846748910729351,
"loss": 0.2297,
"num_input_tokens_seen": 539072,
"step": 1720,
"train_runtime": 310.7692,
"train_tokens_per_second": 1734.638
},
{
"epoch": 19.166666666666668,
"grad_norm": 0.0224609375,
"learning_rate": 0.0001626192230425938,
"loss": 0.2286,
"num_input_tokens_seen": 540608,
"step": 1725,
"train_runtime": 311.5604,
"train_tokens_per_second": 1735.163
},
{
"epoch": 19.22222222222222,
"grad_norm": 0.00372314453125,
"learning_rate": 0.00014195852082632686,
"loss": 0.2339,
"num_input_tokens_seen": 542208,
"step": 1730,
"train_runtime": 312.3593,
"train_tokens_per_second": 1735.847
},
{
"epoch": 19.27777777777778,
"grad_norm": 0.013671875,
"learning_rate": 0.00012269472688107463,
"loss": 0.2328,
"num_input_tokens_seen": 543776,
"step": 1735,
"train_runtime": 313.1529,
"train_tokens_per_second": 1736.455
},
{
"epoch": 19.333333333333332,
"grad_norm": 0.0050048828125,
"learning_rate": 0.00010482965233067298,
"loss": 0.2287,
"num_input_tokens_seen": 545280,
"step": 1740,
"train_runtime": 313.9486,
"train_tokens_per_second": 1736.845
},
{
"epoch": 19.38888888888889,
"grad_norm": 0.01202392578125,
"learning_rate": 8.836497679557964e-05,
"loss": 0.2422,
"num_input_tokens_seen": 546848,
"step": 1745,
"train_runtime": 314.7443,
"train_tokens_per_second": 1737.436
},
{
"epoch": 19.444444444444443,
"grad_norm": 0.00311279296875,
"learning_rate": 7.330224823495379e-05,
"loss": 0.2369,
"num_input_tokens_seen": 548416,
"step": 1750,
"train_runtime": 315.5381,
"train_tokens_per_second": 1738.034
},
{
"epoch": 19.5,
"grad_norm": 0.01019287109375,
"learning_rate": 5.96428828011325e-05,
"loss": 0.2306,
"num_input_tokens_seen": 550016,
"step": 1755,
"train_runtime": 316.3398,
"train_tokens_per_second": 1738.687
},
{
"epoch": 19.555555555555557,
"grad_norm": 0.0026092529296875,
"learning_rate": 4.738816470647389e-05,
"loss": 0.236,
"num_input_tokens_seen": 551584,
"step": 1760,
"train_runtime": 317.1375,
"train_tokens_per_second": 1739.258
},
{
"epoch": 19.61111111111111,
"grad_norm": 0.01263427734375,
"learning_rate": 3.653924610263703e-05,
"loss": 0.2297,
"num_input_tokens_seen": 553152,
"step": 1765,
"train_runtime": 317.9345,
"train_tokens_per_second": 1739.83
},
{
"epoch": 19.666666666666668,
"grad_norm": 0.0211181640625,
"learning_rate": 2.7097146972240305e-05,
"loss": 0.2276,
"num_input_tokens_seen": 554752,
"step": 1770,
"train_runtime": 318.7296,
"train_tokens_per_second": 1740.51
},
{
"epoch": 19.72222222222222,
"grad_norm": 0.004180908203125,
"learning_rate": 1.9062755032984713e-05,
"loss": 0.2235,
"num_input_tokens_seen": 556288,
"step": 1775,
"train_runtime": 319.5234,
"train_tokens_per_second": 1740.993
},
{
"epoch": 19.77777777777778,
"grad_norm": 0.00506591796875,
"learning_rate": 1.2436825654180693e-05,
"loss": 0.2308,
"num_input_tokens_seen": 557888,
"step": 1780,
"train_runtime": 320.3196,
"train_tokens_per_second": 1741.661
},
{
"epoch": 19.833333333333332,
"grad_norm": 0.0048828125,
"learning_rate": 7.219981785733242e-06,
"loss": 0.2307,
"num_input_tokens_seen": 559424,
"step": 1785,
"train_runtime": 321.1115,
"train_tokens_per_second": 1742.149
},
{
"epoch": 19.88888888888889,
"grad_norm": 0.007659912109375,
"learning_rate": 3.4127138995787565e-06,
"loss": 0.2297,
"num_input_tokens_seen": 560960,
"step": 1790,
"train_runtime": 321.902,
"train_tokens_per_second": 1742.642
},
{
"epoch": 19.944444444444443,
"grad_norm": 0.01220703125,
"learning_rate": 1.0153799435669298e-06,
"loss": 0.2234,
"num_input_tokens_seen": 562592,
"step": 1795,
"train_runtime": 322.6996,
"train_tokens_per_second": 1743.393
},
{
"epoch": 20.0,
"grad_norm": 0.00958251953125,
"learning_rate": 2.820530780767161e-08,
"loss": 0.2288,
"num_input_tokens_seen": 564096,
"step": 1800,
"train_runtime": 323.5286,
"train_tokens_per_second": 1743.574
},
{
"epoch": 20.0,
"eval_loss": 0.2323770523071289,
"eval_runtime": 0.8134,
"eval_samples_per_second": 49.178,
"eval_steps_per_second": 12.295,
"num_input_tokens_seen": 564096,
"step": 1800
},
{
"epoch": 20.0,
"num_input_tokens_seen": 564096,
"step": 1800,
"total_flos": 2.540098792665907e+16,
"train_loss": 0.2664620706770155,
"train_runtime": 325.1935,
"train_samples_per_second": 22.141,
"train_steps_per_second": 5.535
}
],
"logging_steps": 5,
"max_steps": 1800,
"num_input_tokens_seen": 564096,
"num_train_epochs": 20,
"save_steps": 90,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.540098792665907e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}