train_svamp_42_1760623621 / trainer_state.json
rbelanec's picture
End of training
390dee7 verified
{
"best_global_step": 1106,
"best_metric": 0.04775509238243103,
"best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_svamp_42_1760623621/checkpoint-1106",
"epoch": 20.0,
"eval_steps": 158,
"global_step": 3160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03164556962025317,
"grad_norm": 9.375,
"learning_rate": 0.00037974683544303797,
"loss": 2.2551,
"num_input_tokens_seen": 2336,
"step": 5,
"train_runtime": 2.9586,
"train_tokens_per_second": 789.551
},
{
"epoch": 0.06329113924050633,
"grad_norm": 11.75,
"learning_rate": 0.0008544303797468354,
"loss": 1.4629,
"num_input_tokens_seen": 4672,
"step": 10,
"train_runtime": 3.9467,
"train_tokens_per_second": 1183.768
},
{
"epoch": 0.0949367088607595,
"grad_norm": 35.5,
"learning_rate": 0.001329113924050633,
"loss": 0.621,
"num_input_tokens_seen": 6912,
"step": 15,
"train_runtime": 4.9324,
"train_tokens_per_second": 1401.334
},
{
"epoch": 0.12658227848101267,
"grad_norm": 3.71875,
"learning_rate": 0.0018037974683544303,
"loss": 0.1941,
"num_input_tokens_seen": 9152,
"step": 20,
"train_runtime": 5.9039,
"train_tokens_per_second": 1550.15
},
{
"epoch": 0.15822784810126583,
"grad_norm": 6.90625,
"learning_rate": 0.002278481012658228,
"loss": 0.2347,
"num_input_tokens_seen": 11456,
"step": 25,
"train_runtime": 6.8902,
"train_tokens_per_second": 1662.643
},
{
"epoch": 0.189873417721519,
"grad_norm": 4.6875,
"learning_rate": 0.0027531645569620253,
"loss": 0.1998,
"num_input_tokens_seen": 13792,
"step": 30,
"train_runtime": 7.8811,
"train_tokens_per_second": 1750.02
},
{
"epoch": 0.22151898734177214,
"grad_norm": 14.5625,
"learning_rate": 0.0032278481012658227,
"loss": 0.2293,
"num_input_tokens_seen": 16064,
"step": 35,
"train_runtime": 8.8602,
"train_tokens_per_second": 1813.052
},
{
"epoch": 0.25316455696202533,
"grad_norm": 30.625,
"learning_rate": 0.00370253164556962,
"loss": 0.3135,
"num_input_tokens_seen": 18336,
"step": 40,
"train_runtime": 9.8367,
"train_tokens_per_second": 1864.043
},
{
"epoch": 0.2848101265822785,
"grad_norm": 2.421875,
"learning_rate": 0.004177215189873417,
"loss": 1.6481,
"num_input_tokens_seen": 20640,
"step": 45,
"train_runtime": 10.817,
"train_tokens_per_second": 1908.101
},
{
"epoch": 0.31645569620253167,
"grad_norm": 35.25,
"learning_rate": 0.0046518987341772145,
"loss": 1.5985,
"num_input_tokens_seen": 22880,
"step": 50,
"train_runtime": 11.7934,
"train_tokens_per_second": 1940.068
},
{
"epoch": 0.34810126582278483,
"grad_norm": 7.59375,
"learning_rate": 0.005126582278481013,
"loss": 0.3414,
"num_input_tokens_seen": 25088,
"step": 55,
"train_runtime": 12.7651,
"train_tokens_per_second": 1965.361
},
{
"epoch": 0.379746835443038,
"grad_norm": 8.8125,
"learning_rate": 0.00560126582278481,
"loss": 0.4014,
"num_input_tokens_seen": 27392,
"step": 60,
"train_runtime": 13.7552,
"train_tokens_per_second": 1991.388
},
{
"epoch": 0.41139240506329117,
"grad_norm": 5.03125,
"learning_rate": 0.0060759493670886075,
"loss": 0.7651,
"num_input_tokens_seen": 29728,
"step": 65,
"train_runtime": 14.7552,
"train_tokens_per_second": 2014.75
},
{
"epoch": 0.4430379746835443,
"grad_norm": 2.109375,
"learning_rate": 0.006550632911392405,
"loss": 1.8753,
"num_input_tokens_seen": 32064,
"step": 70,
"train_runtime": 15.7473,
"train_tokens_per_second": 2036.159
},
{
"epoch": 0.47468354430379744,
"grad_norm": 3.640625,
"learning_rate": 0.007025316455696202,
"loss": 0.1454,
"num_input_tokens_seen": 34400,
"step": 75,
"train_runtime": 16.739,
"train_tokens_per_second": 2055.084
},
{
"epoch": 0.5063291139240507,
"grad_norm": 4.4375,
"learning_rate": 0.0075,
"loss": 0.2766,
"num_input_tokens_seen": 36704,
"step": 80,
"train_runtime": 17.7274,
"train_tokens_per_second": 2070.462
},
{
"epoch": 0.5379746835443038,
"grad_norm": 22.75,
"learning_rate": 0.007974683544303796,
"loss": 0.5439,
"num_input_tokens_seen": 38880,
"step": 85,
"train_runtime": 18.6936,
"train_tokens_per_second": 2079.855
},
{
"epoch": 0.569620253164557,
"grad_norm": 4.6875,
"learning_rate": 0.008449367088607595,
"loss": 1.2058,
"num_input_tokens_seen": 41120,
"step": 90,
"train_runtime": 19.6663,
"train_tokens_per_second": 2090.882
},
{
"epoch": 0.6012658227848101,
"grad_norm": 1.7734375,
"learning_rate": 0.008924050632911391,
"loss": 0.4753,
"num_input_tokens_seen": 43232,
"step": 95,
"train_runtime": 20.6294,
"train_tokens_per_second": 2095.655
},
{
"epoch": 0.6329113924050633,
"grad_norm": 11.8125,
"learning_rate": 0.00939873417721519,
"loss": 5.5594,
"num_input_tokens_seen": 45504,
"step": 100,
"train_runtime": 21.618,
"train_tokens_per_second": 2104.912
},
{
"epoch": 0.6645569620253164,
"grad_norm": 7.0,
"learning_rate": 0.009873417721518986,
"loss": 3.1616,
"num_input_tokens_seen": 47872,
"step": 105,
"train_runtime": 22.608,
"train_tokens_per_second": 2117.482
},
{
"epoch": 0.6962025316455697,
"grad_norm": 18.25,
"learning_rate": 0.010348101265822784,
"loss": 2.1289,
"num_input_tokens_seen": 49952,
"step": 110,
"train_runtime": 23.5679,
"train_tokens_per_second": 2119.497
},
{
"epoch": 0.7278481012658228,
"grad_norm": 2.6875,
"learning_rate": 0.01082278481012658,
"loss": 1.2601,
"num_input_tokens_seen": 52288,
"step": 115,
"train_runtime": 24.5454,
"train_tokens_per_second": 2130.257
},
{
"epoch": 0.759493670886076,
"grad_norm": 1.9296875,
"learning_rate": 0.011297468354430379,
"loss": 0.433,
"num_input_tokens_seen": 54592,
"step": 120,
"train_runtime": 25.5344,
"train_tokens_per_second": 2137.983
},
{
"epoch": 0.7911392405063291,
"grad_norm": 0.7578125,
"learning_rate": 0.011772151898734175,
"loss": 0.1428,
"num_input_tokens_seen": 56864,
"step": 125,
"train_runtime": 26.5128,
"train_tokens_per_second": 2144.779
},
{
"epoch": 0.8227848101265823,
"grad_norm": 1.875,
"learning_rate": 0.012246835443037974,
"loss": 0.263,
"num_input_tokens_seen": 59104,
"step": 130,
"train_runtime": 27.488,
"train_tokens_per_second": 2150.174
},
{
"epoch": 0.8544303797468354,
"grad_norm": 1.7265625,
"learning_rate": 0.012721518987341772,
"loss": 0.6412,
"num_input_tokens_seen": 61408,
"step": 135,
"train_runtime": 28.4783,
"train_tokens_per_second": 2156.311
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.859375,
"learning_rate": 0.01319620253164557,
"loss": 0.3362,
"num_input_tokens_seen": 63616,
"step": 140,
"train_runtime": 29.4521,
"train_tokens_per_second": 2159.982
},
{
"epoch": 0.9177215189873418,
"grad_norm": 0.58203125,
"learning_rate": 0.013670886075949367,
"loss": 0.3041,
"num_input_tokens_seen": 65824,
"step": 145,
"train_runtime": 30.4243,
"train_tokens_per_second": 2163.531
},
{
"epoch": 0.9493670886075949,
"grad_norm": 1.0078125,
"learning_rate": 0.014145569620253165,
"loss": 0.1369,
"num_input_tokens_seen": 68064,
"step": 150,
"train_runtime": 31.3981,
"train_tokens_per_second": 2167.774
},
{
"epoch": 0.9810126582278481,
"grad_norm": 0.8671875,
"learning_rate": 0.014620253164556962,
"loss": 0.1167,
"num_input_tokens_seen": 70432,
"step": 155,
"train_runtime": 32.3917,
"train_tokens_per_second": 2174.381
},
{
"epoch": 1.0,
"eval_loss": 0.09709509462118149,
"eval_runtime": 1.6655,
"eval_samples_per_second": 42.028,
"eval_steps_per_second": 10.807,
"num_input_tokens_seen": 71568,
"step": 158
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.185546875,
"learning_rate": 0.01509493670886076,
"loss": 0.168,
"num_input_tokens_seen": 72400,
"step": 160,
"train_runtime": 35.9024,
"train_tokens_per_second": 2016.576
},
{
"epoch": 1.0443037974683544,
"grad_norm": 1.65625,
"learning_rate": 0.015569620253164556,
"loss": 0.1485,
"num_input_tokens_seen": 74576,
"step": 165,
"train_runtime": 36.8752,
"train_tokens_per_second": 2022.387
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.353515625,
"learning_rate": 0.01604430379746835,
"loss": 0.1079,
"num_input_tokens_seen": 76944,
"step": 170,
"train_runtime": 37.8783,
"train_tokens_per_second": 2031.348
},
{
"epoch": 1.1075949367088607,
"grad_norm": 0.212890625,
"learning_rate": 0.01651898734177215,
"loss": 0.1488,
"num_input_tokens_seen": 79184,
"step": 175,
"train_runtime": 38.8505,
"train_tokens_per_second": 2038.175
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.2109375,
"learning_rate": 0.016993670886075948,
"loss": 0.0581,
"num_input_tokens_seen": 81424,
"step": 180,
"train_runtime": 39.8224,
"train_tokens_per_second": 2044.679
},
{
"epoch": 1.1708860759493671,
"grad_norm": 0.404296875,
"learning_rate": 0.017468354430379748,
"loss": 0.1631,
"num_input_tokens_seen": 83760,
"step": 185,
"train_runtime": 40.8036,
"train_tokens_per_second": 2052.762
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.30859375,
"learning_rate": 0.017943037974683544,
"loss": 0.0797,
"num_input_tokens_seen": 86032,
"step": 190,
"train_runtime": 41.7895,
"train_tokens_per_second": 2058.699
},
{
"epoch": 1.2341772151898733,
"grad_norm": 0.169921875,
"learning_rate": 0.018417721518987344,
"loss": 0.0969,
"num_input_tokens_seen": 88368,
"step": 195,
"train_runtime": 42.785,
"train_tokens_per_second": 2065.397
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.142578125,
"learning_rate": 0.01889240506329114,
"loss": 0.0895,
"num_input_tokens_seen": 90608,
"step": 200,
"train_runtime": 43.7627,
"train_tokens_per_second": 2070.439
},
{
"epoch": 1.2974683544303798,
"grad_norm": 0.19921875,
"learning_rate": 0.019367088607594937,
"loss": 0.1284,
"num_input_tokens_seen": 93040,
"step": 205,
"train_runtime": 44.7708,
"train_tokens_per_second": 2078.138
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.40625,
"learning_rate": 0.019841772151898734,
"loss": 0.1272,
"num_input_tokens_seen": 95376,
"step": 210,
"train_runtime": 45.7696,
"train_tokens_per_second": 2083.83
},
{
"epoch": 1.360759493670886,
"grad_norm": 0.107421875,
"learning_rate": 0.02031645569620253,
"loss": 0.0786,
"num_input_tokens_seen": 97616,
"step": 215,
"train_runtime": 46.7523,
"train_tokens_per_second": 2087.942
},
{
"epoch": 1.3924050632911391,
"grad_norm": 0.1748046875,
"learning_rate": 0.02079113924050633,
"loss": 0.0547,
"num_input_tokens_seen": 99760,
"step": 220,
"train_runtime": 47.7222,
"train_tokens_per_second": 2090.431
},
{
"epoch": 1.4240506329113924,
"grad_norm": 0.248046875,
"learning_rate": 0.021265822784810127,
"loss": 0.164,
"num_input_tokens_seen": 102064,
"step": 225,
"train_runtime": 48.705,
"train_tokens_per_second": 2095.553
},
{
"epoch": 1.4556962025316456,
"grad_norm": 0.28125,
"learning_rate": 0.021740506329113923,
"loss": 0.0752,
"num_input_tokens_seen": 104304,
"step": 230,
"train_runtime": 49.686,
"train_tokens_per_second": 2099.264
},
{
"epoch": 1.4873417721518987,
"grad_norm": 0.099609375,
"learning_rate": 0.02221518987341772,
"loss": 0.0558,
"num_input_tokens_seen": 106608,
"step": 235,
"train_runtime": 50.6626,
"train_tokens_per_second": 2104.274
},
{
"epoch": 1.518987341772152,
"grad_norm": 0.091796875,
"learning_rate": 0.02268987341772152,
"loss": 0.0738,
"num_input_tokens_seen": 108752,
"step": 240,
"train_runtime": 51.6278,
"train_tokens_per_second": 2106.46
},
{
"epoch": 1.5506329113924051,
"grad_norm": 0.1630859375,
"learning_rate": 0.023164556962025316,
"loss": 0.1385,
"num_input_tokens_seen": 111088,
"step": 245,
"train_runtime": 52.6192,
"train_tokens_per_second": 2111.17
},
{
"epoch": 1.5822784810126582,
"grad_norm": 0.0830078125,
"learning_rate": 0.023639240506329113,
"loss": 0.0286,
"num_input_tokens_seen": 113328,
"step": 250,
"train_runtime": 53.5922,
"train_tokens_per_second": 2114.635
},
{
"epoch": 1.6139240506329116,
"grad_norm": 0.0791015625,
"learning_rate": 0.02411392405063291,
"loss": 0.2212,
"num_input_tokens_seen": 115600,
"step": 255,
"train_runtime": 54.578,
"train_tokens_per_second": 2118.069
},
{
"epoch": 1.6455696202531644,
"grad_norm": 0.1640625,
"learning_rate": 0.02458860759493671,
"loss": 0.1101,
"num_input_tokens_seen": 117936,
"step": 260,
"train_runtime": 55.5669,
"train_tokens_per_second": 2122.413
},
{
"epoch": 1.6772151898734178,
"grad_norm": 0.07568359375,
"learning_rate": 0.025063291139240506,
"loss": 0.1231,
"num_input_tokens_seen": 120176,
"step": 265,
"train_runtime": 56.5444,
"train_tokens_per_second": 2125.338
},
{
"epoch": 1.7088607594936709,
"grad_norm": 0.054931640625,
"learning_rate": 0.025537974683544303,
"loss": 0.0565,
"num_input_tokens_seen": 122320,
"step": 270,
"train_runtime": 57.5142,
"train_tokens_per_second": 2126.778
},
{
"epoch": 1.740506329113924,
"grad_norm": 0.0966796875,
"learning_rate": 0.0260126582278481,
"loss": 0.0567,
"num_input_tokens_seen": 124752,
"step": 275,
"train_runtime": 58.5126,
"train_tokens_per_second": 2132.054
},
{
"epoch": 1.7721518987341773,
"grad_norm": 0.162109375,
"learning_rate": 0.0264873417721519,
"loss": 0.1192,
"num_input_tokens_seen": 127216,
"step": 280,
"train_runtime": 59.5352,
"train_tokens_per_second": 2136.82
},
{
"epoch": 1.8037974683544302,
"grad_norm": 0.11083984375,
"learning_rate": 0.026962025316455696,
"loss": 0.1226,
"num_input_tokens_seen": 129424,
"step": 285,
"train_runtime": 60.5092,
"train_tokens_per_second": 2138.915
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.15234375,
"learning_rate": 0.027436708860759492,
"loss": 0.0977,
"num_input_tokens_seen": 131632,
"step": 290,
"train_runtime": 61.4848,
"train_tokens_per_second": 2140.888
},
{
"epoch": 1.8670886075949367,
"grad_norm": 0.21484375,
"learning_rate": 0.02791139240506329,
"loss": 0.1471,
"num_input_tokens_seen": 133808,
"step": 295,
"train_runtime": 62.4572,
"train_tokens_per_second": 2142.395
},
{
"epoch": 1.8987341772151898,
"grad_norm": 0.0830078125,
"learning_rate": 0.02838607594936709,
"loss": 0.1052,
"num_input_tokens_seen": 136144,
"step": 300,
"train_runtime": 63.4497,
"train_tokens_per_second": 2145.7
},
{
"epoch": 1.9303797468354431,
"grad_norm": 0.0576171875,
"learning_rate": 0.028860759493670885,
"loss": 0.0865,
"num_input_tokens_seen": 138384,
"step": 305,
"train_runtime": 64.4245,
"train_tokens_per_second": 2148.004
},
{
"epoch": 1.9620253164556962,
"grad_norm": 0.310546875,
"learning_rate": 0.02933544303797468,
"loss": 0.1386,
"num_input_tokens_seen": 140656,
"step": 310,
"train_runtime": 65.4004,
"train_tokens_per_second": 2150.689
},
{
"epoch": 1.9936708860759493,
"grad_norm": 0.0966796875,
"learning_rate": 0.029810126582278478,
"loss": 0.0618,
"num_input_tokens_seen": 142992,
"step": 315,
"train_runtime": 66.4373,
"train_tokens_per_second": 2152.285
},
{
"epoch": 2.0,
"eval_loss": 0.07142843306064606,
"eval_runtime": 1.6714,
"eval_samples_per_second": 41.882,
"eval_steps_per_second": 10.77,
"num_input_tokens_seen": 143232,
"step": 316
},
{
"epoch": 2.0253164556962027,
"grad_norm": 0.036376953125,
"learning_rate": 0.02999991763476599,
"loss": 0.0334,
"num_input_tokens_seen": 145056,
"step": 320,
"train_runtime": 70.098,
"train_tokens_per_second": 2069.33
},
{
"epoch": 2.0569620253164556,
"grad_norm": 0.0439453125,
"learning_rate": 0.02999941429494495,
"loss": 0.0793,
"num_input_tokens_seen": 147360,
"step": 325,
"train_runtime": 71.0787,
"train_tokens_per_second": 2073.194
},
{
"epoch": 2.088607594936709,
"grad_norm": 0.04736328125,
"learning_rate": 0.02999845338910228,
"loss": 0.0319,
"num_input_tokens_seen": 149568,
"step": 330,
"train_runtime": 72.054,
"train_tokens_per_second": 2075.777
},
{
"epoch": 2.1202531645569622,
"grad_norm": 0.0634765625,
"learning_rate": 0.029997034946550982,
"loss": 0.1134,
"num_input_tokens_seen": 151872,
"step": 335,
"train_runtime": 73.033,
"train_tokens_per_second": 2079.498
},
{
"epoch": 2.151898734177215,
"grad_norm": 0.0732421875,
"learning_rate": 0.029995159010561483,
"loss": 0.044,
"num_input_tokens_seen": 154080,
"step": 340,
"train_runtime": 74.0082,
"train_tokens_per_second": 2081.933
},
{
"epoch": 2.1835443037974684,
"grad_norm": 0.04296875,
"learning_rate": 0.029992825638360327,
"loss": 0.047,
"num_input_tokens_seen": 156288,
"step": 345,
"train_runtime": 74.9789,
"train_tokens_per_second": 2084.427
},
{
"epoch": 2.2151898734177213,
"grad_norm": 0.032470703125,
"learning_rate": 0.02999003490112841,
"loss": 0.0458,
"num_input_tokens_seen": 158624,
"step": 350,
"train_runtime": 75.9723,
"train_tokens_per_second": 2087.92
},
{
"epoch": 2.2468354430379747,
"grad_norm": 0.087890625,
"learning_rate": 0.029986786883998827,
"loss": 0.019,
"num_input_tokens_seen": 160864,
"step": 355,
"train_runtime": 76.9879,
"train_tokens_per_second": 2089.471
},
{
"epoch": 2.278481012658228,
"grad_norm": 0.1416015625,
"learning_rate": 0.029983081686054267,
"loss": 0.1553,
"num_input_tokens_seen": 163136,
"step": 360,
"train_runtime": 78.0606,
"train_tokens_per_second": 2089.864
},
{
"epoch": 2.310126582278481,
"grad_norm": 0.031005859375,
"learning_rate": 0.02997891942032399,
"loss": 0.0626,
"num_input_tokens_seen": 165408,
"step": 365,
"train_runtime": 79.1252,
"train_tokens_per_second": 2090.46
},
{
"epoch": 2.3417721518987342,
"grad_norm": 0.16796875,
"learning_rate": 0.029974300213780378,
"loss": 0.0852,
"num_input_tokens_seen": 167712,
"step": 370,
"train_runtime": 80.115,
"train_tokens_per_second": 2093.39
},
{
"epoch": 2.3734177215189876,
"grad_norm": 0.1474609375,
"learning_rate": 0.02996922420733506,
"loss": 0.1164,
"num_input_tokens_seen": 169824,
"step": 375,
"train_runtime": 81.0779,
"train_tokens_per_second": 2094.579
},
{
"epoch": 2.4050632911392404,
"grad_norm": 1.1328125,
"learning_rate": 0.029963691555834625,
"loss": 0.153,
"num_input_tokens_seen": 171968,
"step": 380,
"train_runtime": 82.0451,
"train_tokens_per_second": 2096.018
},
{
"epoch": 2.4367088607594938,
"grad_norm": 0.1220703125,
"learning_rate": 0.02995770242805588,
"loss": 0.0872,
"num_input_tokens_seen": 174272,
"step": 385,
"train_runtime": 83.0327,
"train_tokens_per_second": 2098.836
},
{
"epoch": 2.4683544303797467,
"grad_norm": 0.173828125,
"learning_rate": 0.029951257006700725,
"loss": 0.1133,
"num_input_tokens_seen": 176576,
"step": 390,
"train_runtime": 84.014,
"train_tokens_per_second": 2101.745
},
{
"epoch": 2.5,
"grad_norm": 10.125,
"learning_rate": 0.029944355488390553,
"loss": 1.7205,
"num_input_tokens_seen": 178848,
"step": 395,
"train_runtime": 85.0012,
"train_tokens_per_second": 2104.066
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.408203125,
"learning_rate": 0.029936998083660273,
"loss": 1.1129,
"num_input_tokens_seen": 181120,
"step": 400,
"train_runtime": 85.989,
"train_tokens_per_second": 2106.315
},
{
"epoch": 2.5632911392405062,
"grad_norm": 0.1962890625,
"learning_rate": 0.029929185016951868,
"loss": 0.174,
"num_input_tokens_seen": 183232,
"step": 405,
"train_runtime": 86.955,
"train_tokens_per_second": 2107.205
},
{
"epoch": 2.5949367088607596,
"grad_norm": 0.1767578125,
"learning_rate": 0.02992091652660758,
"loss": 0.1194,
"num_input_tokens_seen": 185504,
"step": 410,
"train_runtime": 87.9423,
"train_tokens_per_second": 2109.383
},
{
"epoch": 2.6265822784810124,
"grad_norm": 0.1435546875,
"learning_rate": 0.029912192864862595,
"loss": 0.1515,
"num_input_tokens_seen": 187808,
"step": 415,
"train_runtime": 88.9212,
"train_tokens_per_second": 2112.072
},
{
"epoch": 2.6582278481012658,
"grad_norm": 0.07568359375,
"learning_rate": 0.029903014297837396,
"loss": 0.1169,
"num_input_tokens_seen": 190016,
"step": 420,
"train_runtime": 89.8976,
"train_tokens_per_second": 2113.693
},
{
"epoch": 2.689873417721519,
"grad_norm": 0.1357421875,
"learning_rate": 0.0298933811055296,
"loss": 0.1,
"num_input_tokens_seen": 192480,
"step": 425,
"train_runtime": 90.9192,
"train_tokens_per_second": 2117.044
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.37890625,
"learning_rate": 0.029883293581805453,
"loss": 0.1351,
"num_input_tokens_seen": 194880,
"step": 430,
"train_runtime": 91.9268,
"train_tokens_per_second": 2119.947
},
{
"epoch": 2.7531645569620253,
"grad_norm": 0.123046875,
"learning_rate": 0.029872752034390833,
"loss": 0.128,
"num_input_tokens_seen": 197152,
"step": 435,
"train_runtime": 92.9052,
"train_tokens_per_second": 2122.076
},
{
"epoch": 2.7848101265822782,
"grad_norm": 0.09814453125,
"learning_rate": 0.029861756784861908,
"loss": 0.0751,
"num_input_tokens_seen": 199328,
"step": 440,
"train_runtime": 93.878,
"train_tokens_per_second": 2123.267
},
{
"epoch": 2.8164556962025316,
"grad_norm": 0.033203125,
"learning_rate": 0.029850308168635264,
"loss": 0.1457,
"num_input_tokens_seen": 201664,
"step": 445,
"train_runtime": 94.8626,
"train_tokens_per_second": 2125.855
},
{
"epoch": 2.848101265822785,
"grad_norm": 0.08984375,
"learning_rate": 0.02983840653495774,
"loss": 0.0665,
"num_input_tokens_seen": 204000,
"step": 450,
"train_runtime": 95.8542,
"train_tokens_per_second": 2128.231
},
{
"epoch": 2.879746835443038,
"grad_norm": 0.068359375,
"learning_rate": 0.029826052246895707,
"loss": 0.0807,
"num_input_tokens_seen": 206432,
"step": 455,
"train_runtime": 96.8728,
"train_tokens_per_second": 2130.959
},
{
"epoch": 2.911392405063291,
"grad_norm": 0.016845703125,
"learning_rate": 0.029813245681324055,
"loss": 0.0481,
"num_input_tokens_seen": 208672,
"step": 460,
"train_runtime": 97.8462,
"train_tokens_per_second": 2132.653
},
{
"epoch": 2.9430379746835444,
"grad_norm": 0.1826171875,
"learning_rate": 0.02979998722891465,
"loss": 0.0907,
"num_input_tokens_seen": 210976,
"step": 465,
"train_runtime": 98.8269,
"train_tokens_per_second": 2134.804
},
{
"epoch": 2.9746835443037973,
"grad_norm": 0.01556396484375,
"learning_rate": 0.029786277294124443,
"loss": 0.0498,
"num_input_tokens_seen": 213248,
"step": 470,
"train_runtime": 99.8061,
"train_tokens_per_second": 2136.623
},
{
"epoch": 3.0,
"eval_loss": 0.07707643508911133,
"eval_runtime": 1.6778,
"eval_samples_per_second": 41.721,
"eval_steps_per_second": 10.728,
"num_input_tokens_seen": 214912,
"step": 474
},
{
"epoch": 3.0063291139240507,
"grad_norm": 0.02978515625,
"learning_rate": 0.029772116295183122,
"loss": 0.029,
"num_input_tokens_seen": 215360,
"step": 475,
"train_runtime": 103.3347,
"train_tokens_per_second": 2084.101
},
{
"epoch": 3.037974683544304,
"grad_norm": 0.019775390625,
"learning_rate": 0.02975750466408034,
"loss": 0.0487,
"num_input_tokens_seen": 217632,
"step": 480,
"train_runtime": 104.3119,
"train_tokens_per_second": 2086.359
},
{
"epoch": 3.069620253164557,
"grad_norm": 0.028076171875,
"learning_rate": 0.029742442846552575,
"loss": 0.0696,
"num_input_tokens_seen": 219936,
"step": 485,
"train_runtime": 105.3099,
"train_tokens_per_second": 2088.464
},
{
"epoch": 3.1012658227848102,
"grad_norm": 0.10009765625,
"learning_rate": 0.029726931302069493,
"loss": 0.0665,
"num_input_tokens_seen": 222304,
"step": 490,
"train_runtime": 106.3036,
"train_tokens_per_second": 2091.217
},
{
"epoch": 3.132911392405063,
"grad_norm": 0.0771484375,
"learning_rate": 0.029710970503819947,
"loss": 0.0542,
"num_input_tokens_seen": 224448,
"step": 495,
"train_runtime": 107.2731,
"train_tokens_per_second": 2092.304
},
{
"epoch": 3.1645569620253164,
"grad_norm": 0.0108642578125,
"learning_rate": 0.029694560938697545,
"loss": 0.0892,
"num_input_tokens_seen": 226752,
"step": 500,
"train_runtime": 108.2551,
"train_tokens_per_second": 2094.609
},
{
"epoch": 3.1962025316455698,
"grad_norm": 0.080078125,
"learning_rate": 0.029677703107285798,
"loss": 0.1341,
"num_input_tokens_seen": 229088,
"step": 505,
"train_runtime": 109.2387,
"train_tokens_per_second": 2097.132
},
{
"epoch": 3.2278481012658227,
"grad_norm": 0.02783203125,
"learning_rate": 0.029660397523842846,
"loss": 0.026,
"num_input_tokens_seen": 231232,
"step": 510,
"train_runtime": 110.218,
"train_tokens_per_second": 2097.951
},
{
"epoch": 3.259493670886076,
"grad_norm": 0.08642578125,
"learning_rate": 0.029642644716285763,
"loss": 0.0589,
"num_input_tokens_seen": 233568,
"step": 515,
"train_runtime": 111.2009,
"train_tokens_per_second": 2100.415
},
{
"epoch": 3.291139240506329,
"grad_norm": 0.0152587890625,
"learning_rate": 0.02962444522617446,
"loss": 0.0482,
"num_input_tokens_seen": 235808,
"step": 520,
"train_runtime": 112.1832,
"train_tokens_per_second": 2101.991
},
{
"epoch": 3.3227848101265822,
"grad_norm": 0.0213623046875,
"learning_rate": 0.02960579960869518,
"loss": 0.1296,
"num_input_tokens_seen": 238080,
"step": 525,
"train_runtime": 113.1698,
"train_tokens_per_second": 2103.741
},
{
"epoch": 3.3544303797468356,
"grad_norm": 0.05859375,
"learning_rate": 0.029586708432643525,
"loss": 0.0445,
"num_input_tokens_seen": 240288,
"step": 530,
"train_runtime": 114.1467,
"train_tokens_per_second": 2105.081
},
{
"epoch": 3.3860759493670884,
"grad_norm": 0.068359375,
"learning_rate": 0.029567172280407134,
"loss": 0.0441,
"num_input_tokens_seen": 242464,
"step": 535,
"train_runtime": 115.1213,
"train_tokens_per_second": 2106.16
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.05078125,
"learning_rate": 0.02954719174794791,
"loss": 0.0436,
"num_input_tokens_seen": 244736,
"step": 540,
"train_runtime": 116.108,
"train_tokens_per_second": 2107.83
},
{
"epoch": 3.449367088607595,
"grad_norm": 0.05517578125,
"learning_rate": 0.02952676744478383,
"loss": 0.0769,
"num_input_tokens_seen": 247072,
"step": 545,
"train_runtime": 117.0993,
"train_tokens_per_second": 2109.936
},
{
"epoch": 3.481012658227848,
"grad_norm": 0.046142578125,
"learning_rate": 0.029505899993970373,
"loss": 0.037,
"num_input_tokens_seen": 249376,
"step": 550,
"train_runtime": 118.0787,
"train_tokens_per_second": 2111.947
},
{
"epoch": 3.5126582278481013,
"grad_norm": 0.032470703125,
"learning_rate": 0.029484590032081488,
"loss": 0.0548,
"num_input_tokens_seen": 251680,
"step": 555,
"train_runtime": 119.0678,
"train_tokens_per_second": 2113.753
},
{
"epoch": 3.5443037974683547,
"grad_norm": 0.0771484375,
"learning_rate": 0.029462838209190195,
"loss": 0.0635,
"num_input_tokens_seen": 254048,
"step": 560,
"train_runtime": 120.0624,
"train_tokens_per_second": 2115.966
},
{
"epoch": 3.5759493670886076,
"grad_norm": 0.125,
"learning_rate": 0.029440645188848733,
"loss": 0.0936,
"num_input_tokens_seen": 256256,
"step": 565,
"train_runtime": 121.034,
"train_tokens_per_second": 2117.223
},
{
"epoch": 3.607594936708861,
"grad_norm": 0.03466796875,
"learning_rate": 0.029418011648068353,
"loss": 0.0574,
"num_input_tokens_seen": 258592,
"step": 570,
"train_runtime": 122.0267,
"train_tokens_per_second": 2119.144
},
{
"epoch": 3.6392405063291138,
"grad_norm": 0.05322265625,
"learning_rate": 0.029394938277298614,
"loss": 0.0827,
"num_input_tokens_seen": 260832,
"step": 575,
"train_runtime": 123.0027,
"train_tokens_per_second": 2120.539
},
{
"epoch": 3.670886075949367,
"grad_norm": 0.05810546875,
"learning_rate": 0.029371425780406368,
"loss": 0.0578,
"num_input_tokens_seen": 263168,
"step": 580,
"train_runtime": 123.9966,
"train_tokens_per_second": 2122.38
},
{
"epoch": 3.7025316455696204,
"grad_norm": 0.035888671875,
"learning_rate": 0.029347474874654274,
"loss": 0.0399,
"num_input_tokens_seen": 265344,
"step": 585,
"train_runtime": 124.9725,
"train_tokens_per_second": 2123.219
},
{
"epoch": 3.7341772151898733,
"grad_norm": 0.0849609375,
"learning_rate": 0.029323086290678896,
"loss": 0.0807,
"num_input_tokens_seen": 267552,
"step": 590,
"train_runtime": 125.9463,
"train_tokens_per_second": 2124.334
},
{
"epoch": 3.7658227848101267,
"grad_norm": 0.054443359375,
"learning_rate": 0.02929826077246845,
"loss": 0.0741,
"num_input_tokens_seen": 269760,
"step": 595,
"train_runtime": 126.921,
"train_tokens_per_second": 2125.416
},
{
"epoch": 3.7974683544303796,
"grad_norm": 0.0140380859375,
"learning_rate": 0.029272999077340066,
"loss": 0.0445,
"num_input_tokens_seen": 272000,
"step": 600,
"train_runtime": 127.8981,
"train_tokens_per_second": 2126.693
},
{
"epoch": 3.829113924050633,
"grad_norm": 0.0546875,
"learning_rate": 0.02924730197591674,
"loss": 0.05,
"num_input_tokens_seen": 274304,
"step": 605,
"train_runtime": 128.8903,
"train_tokens_per_second": 2128.197
},
{
"epoch": 3.8607594936708862,
"grad_norm": 0.050537109375,
"learning_rate": 0.029221170252103766,
"loss": 0.0443,
"num_input_tokens_seen": 276608,
"step": 610,
"train_runtime": 129.8816,
"train_tokens_per_second": 2129.693
},
{
"epoch": 3.892405063291139,
"grad_norm": 0.01470947265625,
"learning_rate": 0.029194604703064876,
"loss": 0.0408,
"num_input_tokens_seen": 278912,
"step": 615,
"train_runtime": 130.8625,
"train_tokens_per_second": 2131.337
},
{
"epoch": 3.9240506329113924,
"grad_norm": 0.0654296875,
"learning_rate": 0.029167606139197878,
"loss": 0.078,
"num_input_tokens_seen": 281152,
"step": 620,
"train_runtime": 131.8362,
"train_tokens_per_second": 2132.586
},
{
"epoch": 3.9556962025316453,
"grad_norm": 0.05908203125,
"learning_rate": 0.029140175384109963,
"loss": 0.0847,
"num_input_tokens_seen": 283424,
"step": 625,
"train_runtime": 132.8127,
"train_tokens_per_second": 2134.013
},
{
"epoch": 3.9873417721518987,
"grad_norm": 0.08349609375,
"learning_rate": 0.02911231327459257,
"loss": 0.0981,
"num_input_tokens_seen": 285792,
"step": 630,
"train_runtime": 133.8165,
"train_tokens_per_second": 2135.7
},
{
"epoch": 4.0,
"eval_loss": 0.057963330298662186,
"eval_runtime": 1.671,
"eval_samples_per_second": 41.892,
"eval_steps_per_second": 10.772,
"num_input_tokens_seen": 286448,
"step": 632
},
{
"epoch": 4.018987341772152,
"grad_norm": 0.0301513671875,
"learning_rate": 0.029084020660595865,
"loss": 0.0247,
"num_input_tokens_seen": 287824,
"step": 635,
"train_runtime": 137.3316,
"train_tokens_per_second": 2095.832
},
{
"epoch": 4.050632911392405,
"grad_norm": 0.051513671875,
"learning_rate": 0.0290552984052028,
"loss": 0.0395,
"num_input_tokens_seen": 290128,
"step": 640,
"train_runtime": 138.3441,
"train_tokens_per_second": 2097.148
},
{
"epoch": 4.082278481012658,
"grad_norm": 0.0001811981201171875,
"learning_rate": 0.029026147384602796,
"loss": 0.018,
"num_input_tokens_seen": 292336,
"step": 645,
"train_runtime": 139.3178,
"train_tokens_per_second": 2098.339
},
{
"epoch": 4.113924050632911,
"grad_norm": 0.00201416015625,
"learning_rate": 0.028996568488065012,
"loss": 0.0408,
"num_input_tokens_seen": 294544,
"step": 650,
"train_runtime": 140.2961,
"train_tokens_per_second": 2099.446
},
{
"epoch": 4.1455696202531644,
"grad_norm": 0.03515625,
"learning_rate": 0.02896656261791122,
"loss": 0.0666,
"num_input_tokens_seen": 296784,
"step": 655,
"train_runtime": 141.2725,
"train_tokens_per_second": 2100.79
},
{
"epoch": 4.177215189873418,
"grad_norm": 0.072265625,
"learning_rate": 0.028936130689488263,
"loss": 0.0343,
"num_input_tokens_seen": 299056,
"step": 660,
"train_runtime": 142.253,
"train_tokens_per_second": 2102.283
},
{
"epoch": 4.208860759493671,
"grad_norm": 0.06005859375,
"learning_rate": 0.028905273631140153,
"loss": 0.0286,
"num_input_tokens_seen": 301264,
"step": 665,
"train_runtime": 143.231,
"train_tokens_per_second": 2103.344
},
{
"epoch": 4.2405063291139244,
"grad_norm": 0.013671875,
"learning_rate": 0.02887399238417974,
"loss": 0.0207,
"num_input_tokens_seen": 303472,
"step": 670,
"train_runtime": 144.2059,
"train_tokens_per_second": 2104.435
},
{
"epoch": 4.272151898734177,
"grad_norm": 0.0274658203125,
"learning_rate": 0.02884228790286001,
"loss": 0.0509,
"num_input_tokens_seen": 305744,
"step": 675,
"train_runtime": 145.1926,
"train_tokens_per_second": 2105.782
},
{
"epoch": 4.30379746835443,
"grad_norm": 0.030029296875,
"learning_rate": 0.02881016115434494,
"loss": 0.0467,
"num_input_tokens_seen": 308016,
"step": 680,
"train_runtime": 146.1698,
"train_tokens_per_second": 2107.248
},
{
"epoch": 4.3354430379746836,
"grad_norm": 0.0020904541015625,
"learning_rate": 0.028777613118680035,
"loss": 0.0321,
"num_input_tokens_seen": 310320,
"step": 685,
"train_runtime": 147.1607,
"train_tokens_per_second": 2108.715
},
{
"epoch": 4.367088607594937,
"grad_norm": 0.059814453125,
"learning_rate": 0.028744644788762413,
"loss": 0.0559,
"num_input_tokens_seen": 312624,
"step": 690,
"train_runtime": 148.1407,
"train_tokens_per_second": 2110.318
},
{
"epoch": 4.39873417721519,
"grad_norm": 0.00469970703125,
"learning_rate": 0.02871125717031052,
"loss": 0.0241,
"num_input_tokens_seen": 314960,
"step": 695,
"train_runtime": 149.1324,
"train_tokens_per_second": 2111.949
},
{
"epoch": 4.430379746835443,
"grad_norm": 0.0166015625,
"learning_rate": 0.028677451281833435,
"loss": 0.0067,
"num_input_tokens_seen": 317232,
"step": 700,
"train_runtime": 150.1094,
"train_tokens_per_second": 2113.339
},
{
"epoch": 4.462025316455696,
"grad_norm": 0.041259765625,
"learning_rate": 0.028643228154599815,
"loss": 0.0212,
"num_input_tokens_seen": 319536,
"step": 705,
"train_runtime": 151.0915,
"train_tokens_per_second": 2114.851
},
{
"epoch": 4.493670886075949,
"grad_norm": 0.0166015625,
"learning_rate": 0.028608588832606446,
"loss": 0.0317,
"num_input_tokens_seen": 321936,
"step": 710,
"train_runtime": 152.0979,
"train_tokens_per_second": 2116.637
},
{
"epoch": 4.525316455696203,
"grad_norm": 0.0869140625,
"learning_rate": 0.02857353437254637,
"loss": 0.036,
"num_input_tokens_seen": 324208,
"step": 715,
"train_runtime": 153.0861,
"train_tokens_per_second": 2117.815
},
{
"epoch": 4.556962025316456,
"grad_norm": 0.05810546875,
"learning_rate": 0.028538065843776658,
"loss": 0.0478,
"num_input_tokens_seen": 326384,
"step": 720,
"train_runtime": 154.0569,
"train_tokens_per_second": 2118.594
},
{
"epoch": 4.588607594936709,
"grad_norm": 0.00151824951171875,
"learning_rate": 0.028502184328285808,
"loss": 0.0082,
"num_input_tokens_seen": 328592,
"step": 725,
"train_runtime": 155.0297,
"train_tokens_per_second": 2119.543
},
{
"epoch": 4.620253164556962,
"grad_norm": 0.046875,
"learning_rate": 0.02846589092066071,
"loss": 0.0182,
"num_input_tokens_seen": 330896,
"step": 730,
"train_runtime": 156.0139,
"train_tokens_per_second": 2120.939
},
{
"epoch": 4.651898734177215,
"grad_norm": 0.0047607421875,
"learning_rate": 0.02842918672805327,
"loss": 0.0399,
"num_input_tokens_seen": 333360,
"step": 735,
"train_runtime": 157.0419,
"train_tokens_per_second": 2122.745
},
{
"epoch": 4.6835443037974684,
"grad_norm": 0.0908203125,
"learning_rate": 0.028392072870146633,
"loss": 0.0917,
"num_input_tokens_seen": 335568,
"step": 740,
"train_runtime": 158.0163,
"train_tokens_per_second": 2123.629
},
{
"epoch": 4.715189873417722,
"grad_norm": 0.029296875,
"learning_rate": 0.028354550479121027,
"loss": 0.0462,
"num_input_tokens_seen": 337808,
"step": 745,
"train_runtime": 158.9933,
"train_tokens_per_second": 2124.668
},
{
"epoch": 4.746835443037975,
"grad_norm": 0.072265625,
"learning_rate": 0.028316620699619228,
"loss": 0.0275,
"num_input_tokens_seen": 340016,
"step": 750,
"train_runtime": 159.9715,
"train_tokens_per_second": 2125.479
},
{
"epoch": 4.7784810126582276,
"grad_norm": 0.049072265625,
"learning_rate": 0.028278284688711637,
"loss": 0.0641,
"num_input_tokens_seen": 342320,
"step": 755,
"train_runtime": 160.9537,
"train_tokens_per_second": 2126.823
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.08154296875,
"learning_rate": 0.028239543615860983,
"loss": 0.0479,
"num_input_tokens_seen": 344656,
"step": 760,
"train_runtime": 161.9536,
"train_tokens_per_second": 2128.115
},
{
"epoch": 4.841772151898734,
"grad_norm": 0.0576171875,
"learning_rate": 0.028200398662886653,
"loss": 0.0593,
"num_input_tokens_seen": 346928,
"step": 765,
"train_runtime": 162.9314,
"train_tokens_per_second": 2129.288
},
{
"epoch": 4.8734177215189876,
"grad_norm": 0.040283203125,
"learning_rate": 0.028160851023928634,
"loss": 0.1044,
"num_input_tokens_seen": 349168,
"step": 770,
"train_runtime": 163.9064,
"train_tokens_per_second": 2130.289
},
{
"epoch": 4.905063291139241,
"grad_norm": 0.01116943359375,
"learning_rate": 0.02812090190541108,
"loss": 0.0527,
"num_input_tokens_seen": 351536,
"step": 775,
"train_runtime": 164.9105,
"train_tokens_per_second": 2131.678
},
{
"epoch": 4.936708860759493,
"grad_norm": 0.0072021484375,
"learning_rate": 0.028080552526005543,
"loss": 0.03,
"num_input_tokens_seen": 353840,
"step": 780,
"train_runtime": 165.8904,
"train_tokens_per_second": 2132.974
},
{
"epoch": 4.968354430379747,
"grad_norm": 0.0205078125,
"learning_rate": 0.02803980411659374,
"loss": 0.0797,
"num_input_tokens_seen": 356112,
"step": 785,
"train_runtime": 166.8767,
"train_tokens_per_second": 2133.983
},
{
"epoch": 5.0,
"grad_norm": 0.0113525390625,
"learning_rate": 0.02799865792023004,
"loss": 0.021,
"num_input_tokens_seen": 358176,
"step": 790,
"train_runtime": 167.8315,
"train_tokens_per_second": 2134.14
},
{
"epoch": 5.0,
"eval_loss": 0.057497963309288025,
"eval_runtime": 1.6779,
"eval_samples_per_second": 41.718,
"eval_steps_per_second": 10.728,
"num_input_tokens_seen": 358176,
"step": 790
},
{
"epoch": 5.031645569620253,
"grad_norm": 0.00144195556640625,
"learning_rate": 0.027957115192103567,
"loss": 0.021,
"num_input_tokens_seen": 360512,
"step": 795,
"train_runtime": 171.5628,
"train_tokens_per_second": 2101.341
},
{
"epoch": 5.063291139240507,
"grad_norm": 0.00262451171875,
"learning_rate": 0.027915177199499843,
"loss": 0.0336,
"num_input_tokens_seen": 362752,
"step": 800,
"train_runtime": 172.5391,
"train_tokens_per_second": 2102.433
},
{
"epoch": 5.094936708860759,
"grad_norm": 0.03662109375,
"learning_rate": 0.027872845221762192,
"loss": 0.0355,
"num_input_tokens_seen": 365056,
"step": 805,
"train_runtime": 173.5332,
"train_tokens_per_second": 2103.666
},
{
"epoch": 5.1265822784810124,
"grad_norm": 0.1015625,
"learning_rate": 0.02783012055025268,
"loss": 0.0592,
"num_input_tokens_seen": 367264,
"step": 810,
"train_runtime": 174.5081,
"train_tokens_per_second": 2104.567
},
{
"epoch": 5.158227848101266,
"grad_norm": 0.0079345703125,
"learning_rate": 0.027787004488312724,
"loss": 0.0095,
"num_input_tokens_seen": 369536,
"step": 815,
"train_runtime": 175.4906,
"train_tokens_per_second": 2105.731
},
{
"epoch": 5.189873417721519,
"grad_norm": 0.0654296875,
"learning_rate": 0.027743498351223354,
"loss": 0.0159,
"num_input_tokens_seen": 371744,
"step": 820,
"train_runtime": 176.468,
"train_tokens_per_second": 2106.58
},
{
"epoch": 5.2215189873417724,
"grad_norm": 0.00726318359375,
"learning_rate": 0.027699603466165058,
"loss": 0.0128,
"num_input_tokens_seen": 373984,
"step": 825,
"train_runtime": 177.4488,
"train_tokens_per_second": 2107.56
},
{
"epoch": 5.253164556962025,
"grad_norm": 0.0001239776611328125,
"learning_rate": 0.027655321172177314,
"loss": 0.0304,
"num_input_tokens_seen": 376288,
"step": 830,
"train_runtime": 178.4286,
"train_tokens_per_second": 2108.9
},
{
"epoch": 5.284810126582278,
"grad_norm": 0.0010986328125,
"learning_rate": 0.027610652820117747,
"loss": 0.0402,
"num_input_tokens_seen": 378528,
"step": 835,
"train_runtime": 179.4119,
"train_tokens_per_second": 2109.827
},
{
"epoch": 5.3164556962025316,
"grad_norm": 0.038818359375,
"learning_rate": 0.0275655997726209,
"loss": 0.0245,
"num_input_tokens_seen": 380896,
"step": 840,
"train_runtime": 180.4061,
"train_tokens_per_second": 2111.325
},
{
"epoch": 5.348101265822785,
"grad_norm": 0.046875,
"learning_rate": 0.02752016340405669,
"loss": 0.0517,
"num_input_tokens_seen": 383168,
"step": 845,
"train_runtime": 181.3859,
"train_tokens_per_second": 2112.447
},
{
"epoch": 5.379746835443038,
"grad_norm": 0.056640625,
"learning_rate": 0.027474345100488465,
"loss": 0.0426,
"num_input_tokens_seen": 385568,
"step": 850,
"train_runtime": 182.3921,
"train_tokens_per_second": 2113.952
},
{
"epoch": 5.4113924050632916,
"grad_norm": 0.060546875,
"learning_rate": 0.027428146259630727,
"loss": 0.0426,
"num_input_tokens_seen": 387872,
"step": 855,
"train_runtime": 183.3925,
"train_tokens_per_second": 2114.983
},
{
"epoch": 5.443037974683544,
"grad_norm": 0.00335693359375,
"learning_rate": 0.027381568290806495,
"loss": 0.0082,
"num_input_tokens_seen": 390112,
"step": 860,
"train_runtime": 184.3713,
"train_tokens_per_second": 2115.905
},
{
"epoch": 5.474683544303797,
"grad_norm": 0.07666015625,
"learning_rate": 0.027334612614904306,
"loss": 0.032,
"num_input_tokens_seen": 392512,
"step": 865,
"train_runtime": 185.3821,
"train_tokens_per_second": 2117.314
},
{
"epoch": 5.506329113924051,
"grad_norm": 0.054443359375,
"learning_rate": 0.02728728066433488,
"loss": 0.0408,
"num_input_tokens_seen": 394752,
"step": 870,
"train_runtime": 186.3599,
"train_tokens_per_second": 2118.224
},
{
"epoch": 5.537974683544304,
"grad_norm": 0.0054931640625,
"learning_rate": 0.027239573882987415,
"loss": 0.0327,
"num_input_tokens_seen": 396992,
"step": 875,
"train_runtime": 187.3367,
"train_tokens_per_second": 2119.137
},
{
"epoch": 5.569620253164557,
"grad_norm": 0.02099609375,
"learning_rate": 0.02719149372618555,
"loss": 0.0248,
"num_input_tokens_seen": 399232,
"step": 880,
"train_runtime": 188.3252,
"train_tokens_per_second": 2119.908
},
{
"epoch": 5.60126582278481,
"grad_norm": 0.038818359375,
"learning_rate": 0.027143041660642967,
"loss": 0.0276,
"num_input_tokens_seen": 401440,
"step": 885,
"train_runtime": 189.3005,
"train_tokens_per_second": 2120.649
},
{
"epoch": 5.632911392405063,
"grad_norm": 0.016845703125,
"learning_rate": 0.027094219164418627,
"loss": 0.016,
"num_input_tokens_seen": 403616,
"step": 890,
"train_runtime": 190.2719,
"train_tokens_per_second": 2121.259
},
{
"epoch": 5.6645569620253164,
"grad_norm": 0.03662109375,
"learning_rate": 0.02704502772687172,
"loss": 0.0218,
"num_input_tokens_seen": 405760,
"step": 895,
"train_runtime": 191.2416,
"train_tokens_per_second": 2121.714
},
{
"epoch": 5.69620253164557,
"grad_norm": 0.02099609375,
"learning_rate": 0.026995468848616182,
"loss": 0.0419,
"num_input_tokens_seen": 408064,
"step": 900,
"train_runtime": 192.2375,
"train_tokens_per_second": 2122.707
},
{
"epoch": 5.727848101265823,
"grad_norm": 0.06103515625,
"learning_rate": 0.026945544041474978,
"loss": 0.0553,
"num_input_tokens_seen": 410400,
"step": 905,
"train_runtime": 193.2315,
"train_tokens_per_second": 2123.878
},
{
"epoch": 5.759493670886076,
"grad_norm": 0.0986328125,
"learning_rate": 0.02689525482843393,
"loss": 0.032,
"num_input_tokens_seen": 412736,
"step": 910,
"train_runtime": 194.2258,
"train_tokens_per_second": 2125.032
},
{
"epoch": 5.791139240506329,
"grad_norm": 0.026123046875,
"learning_rate": 0.02684460274359528,
"loss": 0.0392,
"num_input_tokens_seen": 415168,
"step": 915,
"train_runtime": 195.2258,
"train_tokens_per_second": 2126.604
},
{
"epoch": 5.822784810126582,
"grad_norm": 0.041259765625,
"learning_rate": 0.026793589332130902,
"loss": 0.0468,
"num_input_tokens_seen": 417344,
"step": 920,
"train_runtime": 196.1957,
"train_tokens_per_second": 2127.183
},
{
"epoch": 5.8544303797468356,
"grad_norm": 0.028564453125,
"learning_rate": 0.02674221615023513,
"loss": 0.0345,
"num_input_tokens_seen": 419488,
"step": 925,
"train_runtime": 197.1647,
"train_tokens_per_second": 2127.602
},
{
"epoch": 5.886075949367089,
"grad_norm": 0.034912109375,
"learning_rate": 0.026690484765077332,
"loss": 0.0553,
"num_input_tokens_seen": 421600,
"step": 930,
"train_runtime": 198.1377,
"train_tokens_per_second": 2127.814
},
{
"epoch": 5.917721518987342,
"grad_norm": 0.0157470703125,
"learning_rate": 0.026638396754754056,
"loss": 0.0148,
"num_input_tokens_seen": 423904,
"step": 935,
"train_runtime": 199.1211,
"train_tokens_per_second": 2128.875
},
{
"epoch": 5.949367088607595,
"grad_norm": 0.04296875,
"learning_rate": 0.026585953708240937,
"loss": 0.0247,
"num_input_tokens_seen": 426240,
"step": 940,
"train_runtime": 200.116,
"train_tokens_per_second": 2129.965
},
{
"epoch": 5.981012658227848,
"grad_norm": 0.00439453125,
"learning_rate": 0.02653315722534418,
"loss": 0.0334,
"num_input_tokens_seen": 428608,
"step": 945,
"train_runtime": 201.1018,
"train_tokens_per_second": 2131.299
},
{
"epoch": 6.0,
"eval_loss": 0.057629313319921494,
"eval_runtime": 1.682,
"eval_samples_per_second": 41.616,
"eval_steps_per_second": 10.701,
"num_input_tokens_seen": 429728,
"step": 948
},
{
"epoch": 6.012658227848101,
"grad_norm": 0.0556640625,
"learning_rate": 0.026480008916651778,
"loss": 0.0224,
"num_input_tokens_seen": 430624,
"step": 950,
"train_runtime": 204.5872,
"train_tokens_per_second": 2104.843
},
{
"epoch": 6.044303797468355,
"grad_norm": 0.004547119140625,
"learning_rate": 0.02642651040348439,
"loss": 0.0104,
"num_input_tokens_seen": 432960,
"step": 955,
"train_runtime": 205.6102,
"train_tokens_per_second": 2105.732
},
{
"epoch": 6.075949367088608,
"grad_norm": 0.0211181640625,
"learning_rate": 0.026372663317845862,
"loss": 0.0298,
"num_input_tokens_seen": 435104,
"step": 960,
"train_runtime": 206.5794,
"train_tokens_per_second": 2106.231
},
{
"epoch": 6.1075949367088604,
"grad_norm": 0.0008544921875,
"learning_rate": 0.02631846930237345,
"loss": 0.0218,
"num_input_tokens_seen": 437312,
"step": 965,
"train_runtime": 207.5581,
"train_tokens_per_second": 2106.938
},
{
"epoch": 6.139240506329114,
"grad_norm": 0.0272216796875,
"learning_rate": 0.026263930010287713,
"loss": 0.0266,
"num_input_tokens_seen": 439584,
"step": 970,
"train_runtime": 208.5403,
"train_tokens_per_second": 2107.909
},
{
"epoch": 6.170886075949367,
"grad_norm": 0.017578125,
"learning_rate": 0.02620904710534207,
"loss": 0.0454,
"num_input_tokens_seen": 441856,
"step": 975,
"train_runtime": 209.5207,
"train_tokens_per_second": 2108.889
},
{
"epoch": 6.2025316455696204,
"grad_norm": 0.0004482269287109375,
"learning_rate": 0.026153822261772066,
"loss": 0.0287,
"num_input_tokens_seen": 444128,
"step": 980,
"train_runtime": 210.4989,
"train_tokens_per_second": 2109.883
},
{
"epoch": 6.234177215189874,
"grad_norm": 0.0189208984375,
"learning_rate": 0.026098257164244274,
"loss": 0.0146,
"num_input_tokens_seen": 446368,
"step": 985,
"train_runtime": 211.4822,
"train_tokens_per_second": 2110.665
},
{
"epoch": 6.265822784810126,
"grad_norm": 0.021240234375,
"learning_rate": 0.02604235350780493,
"loss": 0.004,
"num_input_tokens_seen": 448640,
"step": 990,
"train_runtime": 212.4666,
"train_tokens_per_second": 2111.579
},
{
"epoch": 6.2974683544303796,
"grad_norm": 0.046142578125,
"learning_rate": 0.025986112997828197,
"loss": 0.0223,
"num_input_tokens_seen": 450848,
"step": 995,
"train_runtime": 213.4442,
"train_tokens_per_second": 2112.252
},
{
"epoch": 6.329113924050633,
"grad_norm": 0.0242919921875,
"learning_rate": 0.025929537349964157,
"loss": 0.0108,
"num_input_tokens_seen": 453056,
"step": 1000,
"train_runtime": 214.4214,
"train_tokens_per_second": 2112.923
},
{
"epoch": 6.360759493670886,
"grad_norm": 0.017333984375,
"learning_rate": 0.025872628290086477,
"loss": 0.0305,
"num_input_tokens_seen": 455424,
"step": 1005,
"train_runtime": 215.4176,
"train_tokens_per_second": 2114.145
},
{
"epoch": 6.3924050632911396,
"grad_norm": 0.01806640625,
"learning_rate": 0.025815387554239753,
"loss": 0.0084,
"num_input_tokens_seen": 457632,
"step": 1010,
"train_runtime": 216.3934,
"train_tokens_per_second": 2114.815
},
{
"epoch": 6.424050632911392,
"grad_norm": 0.0296630859375,
"learning_rate": 0.025757816888586547,
"loss": 0.0425,
"num_input_tokens_seen": 459968,
"step": 1015,
"train_runtime": 217.3863,
"train_tokens_per_second": 2115.902
},
{
"epoch": 6.455696202531645,
"grad_norm": 0.004058837890625,
"learning_rate": 0.025699918049354144,
"loss": 0.0221,
"num_input_tokens_seen": 462240,
"step": 1020,
"train_runtime": 218.3659,
"train_tokens_per_second": 2116.814
},
{
"epoch": 6.487341772151899,
"grad_norm": 0.000591278076171875,
"learning_rate": 0.025641692802780933,
"loss": 0.0021,
"num_input_tokens_seen": 464416,
"step": 1025,
"train_runtime": 219.3388,
"train_tokens_per_second": 2117.346
},
{
"epoch": 6.518987341772152,
"grad_norm": 0.005340576171875,
"learning_rate": 0.02558314292506257,
"loss": 0.0216,
"num_input_tokens_seen": 466752,
"step": 1030,
"train_runtime": 220.3457,
"train_tokens_per_second": 2118.272
},
{
"epoch": 6.550632911392405,
"grad_norm": 0.00909423828125,
"learning_rate": 0.025524270202297767,
"loss": 0.027,
"num_input_tokens_seen": 469088,
"step": 1035,
"train_runtime": 221.3497,
"train_tokens_per_second": 2119.217
},
{
"epoch": 6.582278481012658,
"grad_norm": 0.06494140625,
"learning_rate": 0.025465076430433827,
"loss": 0.0288,
"num_input_tokens_seen": 471328,
"step": 1040,
"train_runtime": 222.3263,
"train_tokens_per_second": 2119.983
},
{
"epoch": 6.613924050632911,
"grad_norm": 0.07275390625,
"learning_rate": 0.025405563415211833,
"loss": 0.0325,
"num_input_tokens_seen": 473728,
"step": 1045,
"train_runtime": 223.3355,
"train_tokens_per_second": 2121.15
},
{
"epoch": 6.6455696202531644,
"grad_norm": 0.023681640625,
"learning_rate": 0.025345732972111585,
"loss": 0.0163,
"num_input_tokens_seen": 475840,
"step": 1050,
"train_runtime": 224.3028,
"train_tokens_per_second": 2121.418
},
{
"epoch": 6.677215189873418,
"grad_norm": 0.025390625,
"learning_rate": 0.025285586926296195,
"loss": 0.0182,
"num_input_tokens_seen": 478048,
"step": 1055,
"train_runtime": 225.2808,
"train_tokens_per_second": 2122.01
},
{
"epoch": 6.708860759493671,
"grad_norm": 0.002716064453125,
"learning_rate": 0.025225127112556447,
"loss": 0.038,
"num_input_tokens_seen": 480352,
"step": 1060,
"train_runtime": 226.2736,
"train_tokens_per_second": 2122.881
},
{
"epoch": 6.740506329113924,
"grad_norm": 0.0576171875,
"learning_rate": 0.025164355375254775,
"loss": 0.0626,
"num_input_tokens_seen": 482752,
"step": 1065,
"train_runtime": 227.2847,
"train_tokens_per_second": 2123.997
},
{
"epoch": 6.772151898734177,
"grad_norm": 0.01177978515625,
"learning_rate": 0.02510327356826905,
"loss": 0.0365,
"num_input_tokens_seen": 485056,
"step": 1070,
"train_runtime": 228.2764,
"train_tokens_per_second": 2124.863
},
{
"epoch": 6.80379746835443,
"grad_norm": 0.00775146484375,
"learning_rate": 0.02504188355493598,
"loss": 0.0396,
"num_input_tokens_seen": 487360,
"step": 1075,
"train_runtime": 229.2704,
"train_tokens_per_second": 2125.699
},
{
"epoch": 6.8354430379746836,
"grad_norm": 0.01458740234375,
"learning_rate": 0.024980187207994307,
"loss": 0.0469,
"num_input_tokens_seen": 489728,
"step": 1080,
"train_runtime": 230.2664,
"train_tokens_per_second": 2126.789
},
{
"epoch": 6.867088607594937,
"grad_norm": 0.06591796875,
"learning_rate": 0.024918186409527657,
"loss": 0.0226,
"num_input_tokens_seen": 492064,
"step": 1085,
"train_runtime": 231.2505,
"train_tokens_per_second": 2127.84
},
{
"epoch": 6.89873417721519,
"grad_norm": 0.048828125,
"learning_rate": 0.024855883050907124,
"loss": 0.0439,
"num_input_tokens_seen": 494304,
"step": 1090,
"train_runtime": 232.2288,
"train_tokens_per_second": 2128.521
},
{
"epoch": 6.930379746835443,
"grad_norm": 0.06787109375,
"learning_rate": 0.024793279032733578,
"loss": 0.0403,
"num_input_tokens_seen": 496800,
"step": 1095,
"train_runtime": 233.2529,
"train_tokens_per_second": 2129.877
},
{
"epoch": 6.962025316455696,
"grad_norm": 0.059814453125,
"learning_rate": 0.024730376264779707,
"loss": 0.0124,
"num_input_tokens_seen": 499040,
"step": 1100,
"train_runtime": 234.2299,
"train_tokens_per_second": 2130.556
},
{
"epoch": 6.993670886075949,
"grad_norm": 0.00176239013671875,
"learning_rate": 0.02466717666593172,
"loss": 0.0169,
"num_input_tokens_seen": 501280,
"step": 1105,
"train_runtime": 235.2092,
"train_tokens_per_second": 2131.21
},
{
"epoch": 7.0,
"eval_loss": 0.04775509238243103,
"eval_runtime": 1.6963,
"eval_samples_per_second": 41.266,
"eval_steps_per_second": 10.611,
"num_input_tokens_seen": 501504,
"step": 1106
},
{
"epoch": 7.025316455696203,
"grad_norm": 0.00537109375,
"learning_rate": 0.02460368216413082,
"loss": 0.0175,
"num_input_tokens_seen": 503328,
"step": 1110,
"train_runtime": 238.7545,
"train_tokens_per_second": 2108.141
},
{
"epoch": 7.056962025316456,
"grad_norm": 0.01031494140625,
"learning_rate": 0.024539894696314412,
"loss": 0.0033,
"num_input_tokens_seen": 505568,
"step": 1115,
"train_runtime": 239.7409,
"train_tokens_per_second": 2108.81
},
{
"epoch": 7.0886075949367084,
"grad_norm": 0.05029296875,
"learning_rate": 0.024475816208357017,
"loss": 0.0229,
"num_input_tokens_seen": 507904,
"step": 1120,
"train_runtime": 240.7268,
"train_tokens_per_second": 2109.878
},
{
"epoch": 7.120253164556962,
"grad_norm": 0.0030975341796875,
"learning_rate": 0.024411448655010867,
"loss": 0.0058,
"num_input_tokens_seen": 510048,
"step": 1125,
"train_runtime": 241.698,
"train_tokens_per_second": 2110.27
},
{
"epoch": 7.151898734177215,
"grad_norm": 0.0247802734375,
"learning_rate": 0.024346793999846333,
"loss": 0.0131,
"num_input_tokens_seen": 512256,
"step": 1130,
"train_runtime": 242.6735,
"train_tokens_per_second": 2110.886
},
{
"epoch": 7.1835443037974684,
"grad_norm": 0.0181884765625,
"learning_rate": 0.02428185421519197,
"loss": 0.0039,
"num_input_tokens_seen": 514592,
"step": 1135,
"train_runtime": 243.675,
"train_tokens_per_second": 2111.797
},
{
"epoch": 7.215189873417722,
"grad_norm": 0.00543212890625,
"learning_rate": 0.02421663128207441,
"loss": 0.0084,
"num_input_tokens_seen": 517024,
"step": 1140,
"train_runtime": 244.6865,
"train_tokens_per_second": 2113.005
},
{
"epoch": 7.246835443037975,
"grad_norm": 0.0069580078125,
"learning_rate": 0.024151127190157863,
"loss": 0.0167,
"num_input_tokens_seen": 519360,
"step": 1145,
"train_runtime": 245.6905,
"train_tokens_per_second": 2113.879
},
{
"epoch": 7.2784810126582276,
"grad_norm": 0.028564453125,
"learning_rate": 0.02408534393768348,
"loss": 0.0082,
"num_input_tokens_seen": 521664,
"step": 1150,
"train_runtime": 246.6697,
"train_tokens_per_second": 2114.828
},
{
"epoch": 7.310126582278481,
"grad_norm": 0.009033203125,
"learning_rate": 0.024019283531408357,
"loss": 0.0026,
"num_input_tokens_seen": 523872,
"step": 1155,
"train_runtime": 247.6391,
"train_tokens_per_second": 2115.466
},
{
"epoch": 7.341772151898734,
"grad_norm": 0.0184326171875,
"learning_rate": 0.02395294798654433,
"loss": 0.0037,
"num_input_tokens_seen": 526144,
"step": 1160,
"train_runtime": 248.6207,
"train_tokens_per_second": 2116.252
},
{
"epoch": 7.3734177215189876,
"grad_norm": 0.006256103515625,
"learning_rate": 0.023886339326696513,
"loss": 0.0101,
"num_input_tokens_seen": 528416,
"step": 1165,
"train_runtime": 249.6153,
"train_tokens_per_second": 2116.922
},
{
"epoch": 7.405063291139241,
"grad_norm": 0.0279541015625,
"learning_rate": 0.023819459583801543,
"loss": 0.0379,
"num_input_tokens_seen": 530720,
"step": 1170,
"train_runtime": 250.6008,
"train_tokens_per_second": 2117.79
},
{
"epoch": 7.436708860759493,
"grad_norm": 0.005859375,
"learning_rate": 0.023752310798065612,
"loss": 0.0242,
"num_input_tokens_seen": 532896,
"step": 1175,
"train_runtime": 251.5746,
"train_tokens_per_second": 2118.243
},
{
"epoch": 7.468354430379747,
"grad_norm": 0.0032806396484375,
"learning_rate": 0.023684895017902212,
"loss": 0.0015,
"num_input_tokens_seen": 535040,
"step": 1180,
"train_runtime": 252.5512,
"train_tokens_per_second": 2118.541
},
{
"epoch": 7.5,
"grad_norm": 0.00013065338134765625,
"learning_rate": 0.02361721429986967,
"loss": 0.0016,
"num_input_tokens_seen": 537248,
"step": 1185,
"train_runtime": 253.527,
"train_tokens_per_second": 2119.095
},
{
"epoch": 7.531645569620253,
"grad_norm": 0.0016326904296875,
"learning_rate": 0.02354927070860841,
"loss": 0.0056,
"num_input_tokens_seen": 539616,
"step": 1190,
"train_runtime": 254.5237,
"train_tokens_per_second": 2120.102
},
{
"epoch": 7.563291139240507,
"grad_norm": 0.0203857421875,
"learning_rate": 0.023481066316777932,
"loss": 0.0166,
"num_input_tokens_seen": 541824,
"step": 1195,
"train_runtime": 255.5036,
"train_tokens_per_second": 2120.612
},
{
"epoch": 7.594936708860759,
"grad_norm": 0.0264892578125,
"learning_rate": 0.023412603204993634,
"loss": 0.007,
"num_input_tokens_seen": 544064,
"step": 1200,
"train_runtime": 256.4988,
"train_tokens_per_second": 2121.117
},
{
"epoch": 7.6265822784810124,
"grad_norm": 0.02685546875,
"learning_rate": 0.023343883461763304,
"loss": 0.0038,
"num_input_tokens_seen": 546336,
"step": 1205,
"train_runtime": 257.4801,
"train_tokens_per_second": 2121.857
},
{
"epoch": 7.658227848101266,
"grad_norm": 0.003265380859375,
"learning_rate": 0.023274909183423443,
"loss": 0.0311,
"num_input_tokens_seen": 548608,
"step": 1210,
"train_runtime": 258.4622,
"train_tokens_per_second": 2122.585
},
{
"epoch": 7.689873417721519,
"grad_norm": 0.055908203125,
"learning_rate": 0.023205682474075274,
"loss": 0.0167,
"num_input_tokens_seen": 550944,
"step": 1215,
"train_runtime": 259.4627,
"train_tokens_per_second": 2123.404
},
{
"epoch": 7.7215189873417724,
"grad_norm": 0.10791015625,
"learning_rate": 0.023136205445520596,
"loss": 0.0156,
"num_input_tokens_seen": 553344,
"step": 1220,
"train_runtime": 260.4697,
"train_tokens_per_second": 2124.409
},
{
"epoch": 7.753164556962025,
"grad_norm": 0.06591796875,
"learning_rate": 0.02306648021719733,
"loss": 0.0211,
"num_input_tokens_seen": 555552,
"step": 1225,
"train_runtime": 261.4463,
"train_tokens_per_second": 2124.918
},
{
"epoch": 7.784810126582278,
"grad_norm": 0.0108642578125,
"learning_rate": 0.022996508916114898,
"loss": 0.0031,
"num_input_tokens_seen": 557792,
"step": 1230,
"train_runtime": 262.4269,
"train_tokens_per_second": 2125.514
},
{
"epoch": 7.8164556962025316,
"grad_norm": 0.041015625,
"learning_rate": 0.02292629367678929,
"loss": 0.0079,
"num_input_tokens_seen": 559968,
"step": 1235,
"train_runtime": 263.401,
"train_tokens_per_second": 2125.915
},
{
"epoch": 7.848101265822785,
"grad_norm": 0.0198974609375,
"learning_rate": 0.022855836641178,
"loss": 0.0098,
"num_input_tokens_seen": 562272,
"step": 1240,
"train_runtime": 264.3946,
"train_tokens_per_second": 2126.639
},
{
"epoch": 7.879746835443038,
"grad_norm": 0.05224609375,
"learning_rate": 0.022785139958614652,
"loss": 0.0238,
"num_input_tokens_seen": 564608,
"step": 1245,
"train_runtime": 265.3951,
"train_tokens_per_second": 2127.424
},
{
"epoch": 7.911392405063291,
"grad_norm": 0.0242919921875,
"learning_rate": 0.02271420578574343,
"loss": 0.0284,
"num_input_tokens_seen": 566976,
"step": 1250,
"train_runtime": 266.4068,
"train_tokens_per_second": 2128.234
},
{
"epoch": 7.943037974683544,
"grad_norm": 0.05615234375,
"learning_rate": 0.022643036286453325,
"loss": 0.0153,
"num_input_tokens_seen": 569248,
"step": 1255,
"train_runtime": 267.3887,
"train_tokens_per_second": 2128.916
},
{
"epoch": 7.974683544303797,
"grad_norm": 0.083984375,
"learning_rate": 0.022571633631812082,
"loss": 0.0271,
"num_input_tokens_seen": 571488,
"step": 1260,
"train_runtime": 268.3681,
"train_tokens_per_second": 2129.493
},
{
"epoch": 8.0,
"eval_loss": 0.06543910503387451,
"eval_runtime": 1.6923,
"eval_samples_per_second": 41.364,
"eval_steps_per_second": 10.636,
"num_input_tokens_seen": 573120,
"step": 1264
},
{
"epoch": 8.00632911392405,
"grad_norm": 0.007598876953125,
"learning_rate": 0.0225,
"loss": 0.0486,
"num_input_tokens_seen": 573568,
"step": 1265,
"train_runtime": 271.9159,
"train_tokens_per_second": 2109.358
},
{
"epoch": 8.037974683544304,
"grad_norm": 0.030517578125,
"learning_rate": 0.022428137576243456,
"loss": 0.0053,
"num_input_tokens_seen": 575808,
"step": 1270,
"train_runtime": 272.9056,
"train_tokens_per_second": 2109.917
},
{
"epoch": 8.069620253164556,
"grad_norm": 0.035888671875,
"learning_rate": 0.022356048552748285,
"loss": 0.0142,
"num_input_tokens_seen": 578208,
"step": 1275,
"train_runtime": 273.9241,
"train_tokens_per_second": 2110.833
},
{
"epoch": 8.10126582278481,
"grad_norm": 0.04541015625,
"learning_rate": 0.02228373512863286,
"loss": 0.002,
"num_input_tokens_seen": 580416,
"step": 1280,
"train_runtime": 274.9034,
"train_tokens_per_second": 2111.345
},
{
"epoch": 8.132911392405063,
"grad_norm": 0.0087890625,
"learning_rate": 0.022211199509861033,
"loss": 0.0041,
"num_input_tokens_seen": 582720,
"step": 1285,
"train_runtime": 275.8966,
"train_tokens_per_second": 2112.096
},
{
"epoch": 8.164556962025316,
"grad_norm": 0.00089263916015625,
"learning_rate": 0.022138443909174844,
"loss": 0.0066,
"num_input_tokens_seen": 584928,
"step": 1290,
"train_runtime": 276.8743,
"train_tokens_per_second": 2112.612
},
{
"epoch": 8.19620253164557,
"grad_norm": 4.9591064453125e-05,
"learning_rate": 0.02206547054602701,
"loss": 0.0191,
"num_input_tokens_seen": 587232,
"step": 1295,
"train_runtime": 277.8742,
"train_tokens_per_second": 2113.302
},
{
"epoch": 8.227848101265822,
"grad_norm": 0.0023193359375,
"learning_rate": 0.021992281646513213,
"loss": 0.0067,
"num_input_tokens_seen": 589504,
"step": 1300,
"train_runtime": 278.8576,
"train_tokens_per_second": 2113.996
},
{
"epoch": 8.259493670886076,
"grad_norm": 0.0123291015625,
"learning_rate": 0.0219188794433042,
"loss": 0.0072,
"num_input_tokens_seen": 591680,
"step": 1305,
"train_runtime": 279.8324,
"train_tokens_per_second": 2114.408
},
{
"epoch": 8.291139240506329,
"grad_norm": 0.06982421875,
"learning_rate": 0.021845266175577683,
"loss": 0.0619,
"num_input_tokens_seen": 594016,
"step": 1310,
"train_runtime": 280.8306,
"train_tokens_per_second": 2115.211
},
{
"epoch": 8.322784810126583,
"grad_norm": 0.07763671875,
"learning_rate": 0.02177144408895002,
"loss": 0.0096,
"num_input_tokens_seen": 596416,
"step": 1315,
"train_runtime": 281.8514,
"train_tokens_per_second": 2116.066
},
{
"epoch": 8.354430379746836,
"grad_norm": 0.0791015625,
"learning_rate": 0.0216974154354077,
"loss": 0.0368,
"num_input_tokens_seen": 598752,
"step": 1320,
"train_runtime": 282.8349,
"train_tokens_per_second": 2116.966
},
{
"epoch": 8.386075949367088,
"grad_norm": 0.00101470947265625,
"learning_rate": 0.02162318247323868,
"loss": 0.0353,
"num_input_tokens_seen": 600960,
"step": 1325,
"train_runtime": 283.8103,
"train_tokens_per_second": 2117.471
},
{
"epoch": 8.417721518987342,
"grad_norm": 0.0106201171875,
"learning_rate": 0.021548747466963447,
"loss": 0.0221,
"num_input_tokens_seen": 603264,
"step": 1330,
"train_runtime": 284.8014,
"train_tokens_per_second": 2118.192
},
{
"epoch": 8.449367088607595,
"grad_norm": 0.0771484375,
"learning_rate": 0.02147411268726599,
"loss": 0.0107,
"num_input_tokens_seen": 605536,
"step": 1335,
"train_runtime": 285.7825,
"train_tokens_per_second": 2118.87
},
{
"epoch": 8.481012658227849,
"grad_norm": 0.0137939453125,
"learning_rate": 0.021399280410924492,
"loss": 0.0387,
"num_input_tokens_seen": 607840,
"step": 1340,
"train_runtime": 286.7715,
"train_tokens_per_second": 2119.597
},
{
"epoch": 8.512658227848101,
"grad_norm": 0.006011962890625,
"learning_rate": 0.021324252920741877,
"loss": 0.0014,
"num_input_tokens_seen": 610144,
"step": 1345,
"train_runtime": 287.7496,
"train_tokens_per_second": 2120.399
},
{
"epoch": 8.544303797468354,
"grad_norm": 0.044921875,
"learning_rate": 0.021249032505476193,
"loss": 0.0213,
"num_input_tokens_seen": 612448,
"step": 1350,
"train_runtime": 288.7422,
"train_tokens_per_second": 2121.089
},
{
"epoch": 8.575949367088608,
"grad_norm": 0.039306640625,
"learning_rate": 0.0211736214597708,
"loss": 0.0532,
"num_input_tokens_seen": 614816,
"step": 1355,
"train_runtime": 289.7373,
"train_tokens_per_second": 2121.977
},
{
"epoch": 8.60759493670886,
"grad_norm": 0.0595703125,
"learning_rate": 0.021098022084084324,
"loss": 0.0263,
"num_input_tokens_seen": 617088,
"step": 1360,
"train_runtime": 290.7256,
"train_tokens_per_second": 2122.579
},
{
"epoch": 8.639240506329115,
"grad_norm": 0.050537109375,
"learning_rate": 0.02102223668462052,
"loss": 0.0137,
"num_input_tokens_seen": 619328,
"step": 1365,
"train_runtime": 291.7026,
"train_tokens_per_second": 2123.149
},
{
"epoch": 8.670886075949367,
"grad_norm": 0.0517578125,
"learning_rate": 0.02094626757325791,
"loss": 0.024,
"num_input_tokens_seen": 621600,
"step": 1370,
"train_runtime": 292.6815,
"train_tokens_per_second": 2123.81
},
{
"epoch": 8.70253164556962,
"grad_norm": 0.01129150390625,
"learning_rate": 0.020870117067479252,
"loss": 0.0252,
"num_input_tokens_seen": 623840,
"step": 1375,
"train_runtime": 293.6647,
"train_tokens_per_second": 2124.328
},
{
"epoch": 8.734177215189874,
"grad_norm": 0.044921875,
"learning_rate": 0.02079378749030086,
"loss": 0.0102,
"num_input_tokens_seen": 626112,
"step": 1380,
"train_runtime": 294.6439,
"train_tokens_per_second": 2124.979
},
{
"epoch": 8.765822784810126,
"grad_norm": 0.007080078125,
"learning_rate": 0.020717281170201704,
"loss": 0.0083,
"num_input_tokens_seen": 628448,
"step": 1385,
"train_runtime": 295.6383,
"train_tokens_per_second": 2125.733
},
{
"epoch": 8.79746835443038,
"grad_norm": 0.0220947265625,
"learning_rate": 0.02064060044105243,
"loss": 0.0515,
"num_input_tokens_seen": 630752,
"step": 1390,
"train_runtime": 296.6244,
"train_tokens_per_second": 2126.433
},
{
"epoch": 8.829113924050633,
"grad_norm": 0.0281982421875,
"learning_rate": 0.02056374764204411,
"loss": 0.0056,
"num_input_tokens_seen": 633024,
"step": 1395,
"train_runtime": 297.601,
"train_tokens_per_second": 2127.09
},
{
"epoch": 8.860759493670885,
"grad_norm": 0.0634765625,
"learning_rate": 0.02048672511761693,
"loss": 0.0236,
"num_input_tokens_seen": 635264,
"step": 1400,
"train_runtime": 298.5788,
"train_tokens_per_second": 2127.626
},
{
"epoch": 8.89240506329114,
"grad_norm": 0.02294921875,
"learning_rate": 0.020409535217388638,
"loss": 0.0061,
"num_input_tokens_seen": 637472,
"step": 1405,
"train_runtime": 299.553,
"train_tokens_per_second": 2128.077
},
{
"epoch": 8.924050632911392,
"grad_norm": 0.008544921875,
"learning_rate": 0.020332180296082875,
"loss": 0.0166,
"num_input_tokens_seen": 639744,
"step": 1410,
"train_runtime": 300.5318,
"train_tokens_per_second": 2128.706
},
{
"epoch": 8.955696202531646,
"grad_norm": 0.04345703125,
"learning_rate": 0.020254662713457366,
"loss": 0.0118,
"num_input_tokens_seen": 642016,
"step": 1415,
"train_runtime": 301.5109,
"train_tokens_per_second": 2129.329
},
{
"epoch": 8.987341772151899,
"grad_norm": 0.031982421875,
"learning_rate": 0.020176984834231897,
"loss": 0.0045,
"num_input_tokens_seen": 644288,
"step": 1420,
"train_runtime": 302.4933,
"train_tokens_per_second": 2129.925
},
{
"epoch": 9.0,
"eval_loss": 0.06644842028617859,
"eval_runtime": 1.6869,
"eval_samples_per_second": 41.497,
"eval_steps_per_second": 10.671,
"num_input_tokens_seen": 644944,
"step": 1422
},
{
"epoch": 9.018987341772151,
"grad_norm": 0.0303955078125,
"learning_rate": 0.02009914902801621,
"loss": 0.0113,
"num_input_tokens_seen": 646320,
"step": 1425,
"train_runtime": 306.0813,
"train_tokens_per_second": 2111.596
},
{
"epoch": 9.050632911392405,
"grad_norm": 0.05419921875,
"learning_rate": 0.020021157669237698,
"loss": 0.0084,
"num_input_tokens_seen": 648560,
"step": 1430,
"train_runtime": 307.0899,
"train_tokens_per_second": 2111.955
},
{
"epoch": 9.082278481012658,
"grad_norm": 0.003936767578125,
"learning_rate": 0.01994301313706898,
"loss": 0.0014,
"num_input_tokens_seen": 650832,
"step": 1435,
"train_runtime": 308.0817,
"train_tokens_per_second": 2112.531
},
{
"epoch": 9.113924050632912,
"grad_norm": 0.00010967254638671875,
"learning_rate": 0.01986471781535531,
"loss": 0.0214,
"num_input_tokens_seen": 653072,
"step": 1440,
"train_runtime": 309.06,
"train_tokens_per_second": 2113.091
},
{
"epoch": 9.145569620253164,
"grad_norm": 0.0115966796875,
"learning_rate": 0.019786274092541887,
"loss": 0.0146,
"num_input_tokens_seen": 655344,
"step": 1445,
"train_runtime": 310.0523,
"train_tokens_per_second": 2113.656
},
{
"epoch": 9.177215189873417,
"grad_norm": 0.0264892578125,
"learning_rate": 0.01970768436160095,
"loss": 0.0028,
"num_input_tokens_seen": 657520,
"step": 1450,
"train_runtime": 311.0288,
"train_tokens_per_second": 2114.017
},
{
"epoch": 9.208860759493671,
"grad_norm": 0.0147705078125,
"learning_rate": 0.019628951019958815,
"loss": 0.0143,
"num_input_tokens_seen": 659856,
"step": 1455,
"train_runtime": 312.0235,
"train_tokens_per_second": 2114.764
},
{
"epoch": 9.240506329113924,
"grad_norm": 0.0089111328125,
"learning_rate": 0.01955007646942273,
"loss": 0.0047,
"num_input_tokens_seen": 662096,
"step": 1460,
"train_runtime": 313.0002,
"train_tokens_per_second": 2115.321
},
{
"epoch": 9.272151898734178,
"grad_norm": 0.000591278076171875,
"learning_rate": 0.019471063116107593,
"loss": 0.0064,
"num_input_tokens_seen": 664368,
"step": 1465,
"train_runtime": 313.9816,
"train_tokens_per_second": 2115.946
},
{
"epoch": 9.30379746835443,
"grad_norm": 0.0157470703125,
"learning_rate": 0.01939191337036257,
"loss": 0.0027,
"num_input_tokens_seen": 666608,
"step": 1470,
"train_runtime": 314.9615,
"train_tokens_per_second": 2116.475
},
{
"epoch": 9.335443037974684,
"grad_norm": 0.0751953125,
"learning_rate": 0.019312629646697572,
"loss": 0.0119,
"num_input_tokens_seen": 668976,
"step": 1475,
"train_runtime": 315.9602,
"train_tokens_per_second": 2117.279
},
{
"epoch": 9.367088607594937,
"grad_norm": 6.771087646484375e-05,
"learning_rate": 0.019233214363709557,
"loss": 0.0048,
"num_input_tokens_seen": 671312,
"step": 1480,
"train_runtime": 316.9511,
"train_tokens_per_second": 2118.03
},
{
"epoch": 9.39873417721519,
"grad_norm": 0.08056640625,
"learning_rate": 0.0191536699440088,
"loss": 0.0121,
"num_input_tokens_seen": 673648,
"step": 1485,
"train_runtime": 317.9431,
"train_tokens_per_second": 2118.769
},
{
"epoch": 9.430379746835444,
"grad_norm": 0.0057373046875,
"learning_rate": 0.019073998814144958,
"loss": 0.0044,
"num_input_tokens_seen": 675952,
"step": 1490,
"train_runtime": 318.9217,
"train_tokens_per_second": 2119.492
},
{
"epoch": 9.462025316455696,
"grad_norm": 0.0002536773681640625,
"learning_rate": 0.018994203404533068,
"loss": 0.0122,
"num_input_tokens_seen": 678128,
"step": 1495,
"train_runtime": 319.8919,
"train_tokens_per_second": 2119.866
},
{
"epoch": 9.49367088607595,
"grad_norm": 0.0004024505615234375,
"learning_rate": 0.01891428614937938,
"loss": 0.008,
"num_input_tokens_seen": 680432,
"step": 1500,
"train_runtime": 320.8826,
"train_tokens_per_second": 2120.501
},
{
"epoch": 9.525316455696203,
"grad_norm": 0.07177734375,
"learning_rate": 0.01883424948660712,
"loss": 0.0093,
"num_input_tokens_seen": 682608,
"step": 1505,
"train_runtime": 321.8483,
"train_tokens_per_second": 2120.9
},
{
"epoch": 9.556962025316455,
"grad_norm": 0.042236328125,
"learning_rate": 0.018754095857782118,
"loss": 0.0179,
"num_input_tokens_seen": 684912,
"step": 1510,
"train_runtime": 322.8354,
"train_tokens_per_second": 2121.552
},
{
"epoch": 9.58860759493671,
"grad_norm": 0.234375,
"learning_rate": 0.01867382770803832,
"loss": 0.0253,
"num_input_tokens_seen": 687216,
"step": 1515,
"train_runtime": 323.8209,
"train_tokens_per_second": 2122.21
},
{
"epoch": 9.620253164556962,
"grad_norm": 0.000713348388671875,
"learning_rate": 0.018593447486003202,
"loss": 0.0127,
"num_input_tokens_seen": 689488,
"step": 1520,
"train_runtime": 324.803,
"train_tokens_per_second": 2122.788
},
{
"epoch": 9.651898734177216,
"grad_norm": 0.0004444122314453125,
"learning_rate": 0.018512957643723064,
"loss": 0.0011,
"num_input_tokens_seen": 691760,
"step": 1525,
"train_runtime": 325.7751,
"train_tokens_per_second": 2123.428
},
{
"epoch": 9.683544303797468,
"grad_norm": 0.0284423828125,
"learning_rate": 0.01843236063658825,
"loss": 0.0106,
"num_input_tokens_seen": 694000,
"step": 1530,
"train_runtime": 326.7548,
"train_tokens_per_second": 2123.917
},
{
"epoch": 9.715189873417721,
"grad_norm": 0.10498046875,
"learning_rate": 0.018351658923258213,
"loss": 0.0134,
"num_input_tokens_seen": 696400,
"step": 1535,
"train_runtime": 327.7566,
"train_tokens_per_second": 2124.747
},
{
"epoch": 9.746835443037975,
"grad_norm": 0.00018310546875,
"learning_rate": 0.018270854965586555,
"loss": 0.0175,
"num_input_tokens_seen": 698640,
"step": 1540,
"train_runtime": 328.7367,
"train_tokens_per_second": 2125.226
},
{
"epoch": 9.778481012658228,
"grad_norm": 0.0115966796875,
"learning_rate": 0.018189951228545883,
"loss": 0.0058,
"num_input_tokens_seen": 700848,
"step": 1545,
"train_runtime": 329.7056,
"train_tokens_per_second": 2125.678
},
{
"epoch": 9.810126582278482,
"grad_norm": 0.02392578125,
"learning_rate": 0.018108950180152635,
"loss": 0.0138,
"num_input_tokens_seen": 703248,
"step": 1550,
"train_runtime": 330.7032,
"train_tokens_per_second": 2126.523
},
{
"epoch": 9.841772151898734,
"grad_norm": 0.048095703125,
"learning_rate": 0.018027854291391796,
"loss": 0.0099,
"num_input_tokens_seen": 705488,
"step": 1555,
"train_runtime": 331.675,
"train_tokens_per_second": 2127.046
},
{
"epoch": 9.873417721518987,
"grad_norm": 0.020751953125,
"learning_rate": 0.017946666036141513,
"loss": 0.0111,
"num_input_tokens_seen": 707728,
"step": 1560,
"train_runtime": 332.6455,
"train_tokens_per_second": 2127.575
},
{
"epoch": 9.905063291139241,
"grad_norm": 9.107589721679688e-05,
"learning_rate": 0.017865387891097616,
"loss": 0.002,
"num_input_tokens_seen": 709904,
"step": 1565,
"train_runtime": 333.6131,
"train_tokens_per_second": 2127.926
},
{
"epoch": 9.936708860759493,
"grad_norm": 0.0038604736328125,
"learning_rate": 0.017784022335698094,
"loss": 0.0029,
"num_input_tokens_seen": 712208,
"step": 1570,
"train_runtime": 334.5996,
"train_tokens_per_second": 2128.538
},
{
"epoch": 9.968354430379748,
"grad_norm": 0.03857421875,
"learning_rate": 0.01770257185204742,
"loss": 0.0091,
"num_input_tokens_seen": 714448,
"step": 1575,
"train_runtime": 335.5716,
"train_tokens_per_second": 2129.048
},
{
"epoch": 10.0,
"grad_norm": 0.1015625,
"learning_rate": 0.017621038924840873,
"loss": 0.013,
"num_input_tokens_seen": 716448,
"step": 1580,
"train_runtime": 336.5105,
"train_tokens_per_second": 2129.051
},
{
"epoch": 10.0,
"eval_loss": 0.06166619062423706,
"eval_runtime": 1.6665,
"eval_samples_per_second": 42.005,
"eval_steps_per_second": 10.801,
"num_input_tokens_seen": 716448,
"step": 1580
},
{
"epoch": 10.031645569620252,
"grad_norm": 0.0035858154296875,
"learning_rate": 0.017539426041288716,
"loss": 0.0014,
"num_input_tokens_seen": 718880,
"step": 1585,
"train_runtime": 340.3212,
"train_tokens_per_second": 2112.358
},
{
"epoch": 10.063291139240507,
"grad_norm": 0.0025177001953125,
"learning_rate": 0.017457735691040317,
"loss": 0.0051,
"num_input_tokens_seen": 721248,
"step": 1590,
"train_runtime": 341.3156,
"train_tokens_per_second": 2113.141
},
{
"epoch": 10.094936708860759,
"grad_norm": 0.0181884765625,
"learning_rate": 0.017375970366108225,
"loss": 0.0169,
"num_input_tokens_seen": 723520,
"step": 1595,
"train_runtime": 342.3044,
"train_tokens_per_second": 2113.674
},
{
"epoch": 10.126582278481013,
"grad_norm": 0.0257568359375,
"learning_rate": 0.017294132560792125,
"loss": 0.0039,
"num_input_tokens_seen": 725696,
"step": 1600,
"train_runtime": 343.2739,
"train_tokens_per_second": 2114.044
},
{
"epoch": 10.158227848101266,
"grad_norm": 0.00146484375,
"learning_rate": 0.017212224771602776,
"loss": 0.0018,
"num_input_tokens_seen": 727968,
"step": 1605,
"train_runtime": 344.2492,
"train_tokens_per_second": 2114.654
},
{
"epoch": 10.189873417721518,
"grad_norm": 0.003021240234375,
"learning_rate": 0.01713024949718581,
"loss": 0.0087,
"num_input_tokens_seen": 730304,
"step": 1610,
"train_runtime": 345.2405,
"train_tokens_per_second": 2115.349
},
{
"epoch": 10.221518987341772,
"grad_norm": 0.00019359588623046875,
"learning_rate": 0.01704820923824556,
"loss": 0.002,
"num_input_tokens_seen": 732672,
"step": 1615,
"train_runtime": 346.2421,
"train_tokens_per_second": 2116.068
},
{
"epoch": 10.253164556962025,
"grad_norm": 0.007354736328125,
"learning_rate": 0.01696610649746875,
"loss": 0.0026,
"num_input_tokens_seen": 734880,
"step": 1620,
"train_runtime": 347.2145,
"train_tokens_per_second": 2116.501
},
{
"epoch": 10.284810126582279,
"grad_norm": 0.000904083251953125,
"learning_rate": 0.016883943779448123,
"loss": 0.0036,
"num_input_tokens_seen": 737120,
"step": 1625,
"train_runtime": 348.1864,
"train_tokens_per_second": 2117.027
},
{
"epoch": 10.316455696202532,
"grad_norm": 0.00494384765625,
"learning_rate": 0.016801723590606086,
"loss": 0.0004,
"num_input_tokens_seen": 739392,
"step": 1630,
"train_runtime": 349.176,
"train_tokens_per_second": 2117.534
},
{
"epoch": 10.348101265822784,
"grad_norm": 0.0140380859375,
"learning_rate": 0.016719448439118236,
"loss": 0.0019,
"num_input_tokens_seen": 741632,
"step": 1635,
"train_runtime": 350.1489,
"train_tokens_per_second": 2118.048
},
{
"epoch": 10.379746835443038,
"grad_norm": 0.006439208984375,
"learning_rate": 0.016637120834836816,
"loss": 0.0012,
"num_input_tokens_seen": 744000,
"step": 1640,
"train_runtime": 351.1394,
"train_tokens_per_second": 2118.816
},
{
"epoch": 10.41139240506329,
"grad_norm": 0.01116943359375,
"learning_rate": 0.016554743289214174,
"loss": 0.0019,
"num_input_tokens_seen": 746272,
"step": 1645,
"train_runtime": 352.1151,
"train_tokens_per_second": 2119.398
},
{
"epoch": 10.443037974683545,
"grad_norm": 0.01123046875,
"learning_rate": 0.016472318315226164,
"loss": 0.0011,
"num_input_tokens_seen": 748576,
"step": 1650,
"train_runtime": 353.1023,
"train_tokens_per_second": 2119.998
},
{
"epoch": 10.474683544303797,
"grad_norm": 0.0002841949462890625,
"learning_rate": 0.016389848427295465,
"loss": 0.0022,
"num_input_tokens_seen": 750944,
"step": 1655,
"train_runtime": 354.0847,
"train_tokens_per_second": 2120.803
},
{
"epoch": 10.50632911392405,
"grad_norm": 0.00634765625,
"learning_rate": 0.016307336141214875,
"loss": 0.0007,
"num_input_tokens_seen": 753184,
"step": 1660,
"train_runtime": 355.1293,
"train_tokens_per_second": 2120.872
},
{
"epoch": 10.537974683544304,
"grad_norm": 4.4345855712890625e-05,
"learning_rate": 0.016224783974070574,
"loss": 0.0005,
"num_input_tokens_seen": 755360,
"step": 1665,
"train_runtime": 356.0959,
"train_tokens_per_second": 2121.226
},
{
"epoch": 10.569620253164556,
"grad_norm": 0.01251220703125,
"learning_rate": 0.016142194444165342,
"loss": 0.001,
"num_input_tokens_seen": 757536,
"step": 1670,
"train_runtime": 357.0656,
"train_tokens_per_second": 2121.56
},
{
"epoch": 10.60126582278481,
"grad_norm": 6.818771362304688e-05,
"learning_rate": 0.01605957007094174,
"loss": 0.0046,
"num_input_tokens_seen": 759808,
"step": 1675,
"train_runtime": 358.041,
"train_tokens_per_second": 2122.126
},
{
"epoch": 10.632911392405063,
"grad_norm": 8.678436279296875e-05,
"learning_rate": 0.015976913374905227,
"loss": 0.0007,
"num_input_tokens_seen": 762208,
"step": 1680,
"train_runtime": 359.0375,
"train_tokens_per_second": 2122.92
},
{
"epoch": 10.664556962025316,
"grad_norm": 5.245208740234375e-05,
"learning_rate": 0.015894226877547296,
"loss": 0.001,
"num_input_tokens_seen": 764480,
"step": 1685,
"train_runtime": 360.0214,
"train_tokens_per_second": 2123.43
},
{
"epoch": 10.69620253164557,
"grad_norm": 0.0003814697265625,
"learning_rate": 0.015811513101268555,
"loss": 0.0004,
"num_input_tokens_seen": 766720,
"step": 1690,
"train_runtime": 360.9952,
"train_tokens_per_second": 2123.906
},
{
"epoch": 10.727848101265822,
"grad_norm": 9.34600830078125e-05,
"learning_rate": 0.015728774569301763,
"loss": 0.0018,
"num_input_tokens_seen": 768896,
"step": 1695,
"train_runtime": 361.9638,
"train_tokens_per_second": 2124.234
},
{
"epoch": 10.759493670886076,
"grad_norm": 0.10400390625,
"learning_rate": 0.015646013805634868,
"loss": 0.002,
"num_input_tokens_seen": 771200,
"step": 1700,
"train_runtime": 362.9508,
"train_tokens_per_second": 2124.806
},
{
"epoch": 10.791139240506329,
"grad_norm": 0.033203125,
"learning_rate": 0.015563233334934002,
"loss": 0.0029,
"num_input_tokens_seen": 773440,
"step": 1705,
"train_runtime": 363.9338,
"train_tokens_per_second": 2125.222
},
{
"epoch": 10.822784810126583,
"grad_norm": 0.001800537109375,
"learning_rate": 0.01548043568246649,
"loss": 0.002,
"num_input_tokens_seen": 775584,
"step": 1710,
"train_runtime": 364.8998,
"train_tokens_per_second": 2125.471
},
{
"epoch": 10.854430379746836,
"grad_norm": 0.000614166259765625,
"learning_rate": 0.01539762337402378,
"loss": 0.0053,
"num_input_tokens_seen": 777824,
"step": 1715,
"train_runtime": 365.8738,
"train_tokens_per_second": 2125.935
},
{
"epoch": 10.886075949367088,
"grad_norm": 0.00077056884765625,
"learning_rate": 0.015314798935844417,
"loss": 0.0175,
"num_input_tokens_seen": 780160,
"step": 1720,
"train_runtime": 366.8549,
"train_tokens_per_second": 2126.617
},
{
"epoch": 10.917721518987342,
"grad_norm": 0.0498046875,
"learning_rate": 0.015231964894536964,
"loss": 0.0025,
"num_input_tokens_seen": 782528,
"step": 1725,
"train_runtime": 367.8465,
"train_tokens_per_second": 2127.322
},
{
"epoch": 10.949367088607595,
"grad_norm": 0.01153564453125,
"learning_rate": 0.015149123777002947,
"loss": 0.0008,
"num_input_tokens_seen": 784832,
"step": 1730,
"train_runtime": 368.8341,
"train_tokens_per_second": 2127.873
},
{
"epoch": 10.981012658227849,
"grad_norm": 0.0004596710205078125,
"learning_rate": 0.015066278110359738,
"loss": 0.0002,
"num_input_tokens_seen": 787136,
"step": 1735,
"train_runtime": 369.8218,
"train_tokens_per_second": 2128.42
},
{
"epoch": 11.0,
"eval_loss": 0.08956073224544525,
"eval_runtime": 1.7158,
"eval_samples_per_second": 40.797,
"eval_steps_per_second": 10.491,
"num_input_tokens_seen": 788256,
"step": 1738
},
{
"epoch": 11.012658227848101,
"grad_norm": 0.0030975341796875,
"learning_rate": 0.014983430421863501,
"loss": 0.0011,
"num_input_tokens_seen": 789184,
"step": 1740,
"train_runtime": 373.3617,
"train_tokens_per_second": 2113.725
},
{
"epoch": 11.044303797468354,
"grad_norm": 0.0029449462890625,
"learning_rate": 0.014900583238832062,
"loss": 0.0008,
"num_input_tokens_seen": 791328,
"step": 1745,
"train_runtime": 374.327,
"train_tokens_per_second": 2114.002
},
{
"epoch": 11.075949367088608,
"grad_norm": 6.246566772460938e-05,
"learning_rate": 0.014817739088567832,
"loss": 0.0026,
"num_input_tokens_seen": 793536,
"step": 1750,
"train_runtime": 375.2996,
"train_tokens_per_second": 2114.406
},
{
"epoch": 11.10759493670886,
"grad_norm": 0.0014801025390625,
"learning_rate": 0.014734900498280717,
"loss": 0.0001,
"num_input_tokens_seen": 795744,
"step": 1755,
"train_runtime": 376.274,
"train_tokens_per_second": 2114.799
},
{
"epoch": 11.139240506329115,
"grad_norm": 0.00112152099609375,
"learning_rate": 0.014652069995011003,
"loss": 0.0002,
"num_input_tokens_seen": 797984,
"step": 1760,
"train_runtime": 377.247,
"train_tokens_per_second": 2115.283
},
{
"epoch": 11.170886075949367,
"grad_norm": 0.005828857421875,
"learning_rate": 0.014569250105552262,
"loss": 0.0006,
"num_input_tokens_seen": 800352,
"step": 1765,
"train_runtime": 378.2415,
"train_tokens_per_second": 2115.982
},
{
"epoch": 11.20253164556962,
"grad_norm": 0.0012054443359375,
"learning_rate": 0.014486443356374317,
"loss": 0.0008,
"num_input_tokens_seen": 802592,
"step": 1770,
"train_runtime": 379.2202,
"train_tokens_per_second": 2116.427
},
{
"epoch": 11.234177215189874,
"grad_norm": 0.0218505859375,
"learning_rate": 0.014403652273546117,
"loss": 0.0013,
"num_input_tokens_seen": 804896,
"step": 1775,
"train_runtime": 380.1991,
"train_tokens_per_second": 2117.038
},
{
"epoch": 11.265822784810126,
"grad_norm": 0.0027313232421875,
"learning_rate": 0.014320879382658702,
"loss": 0.0004,
"num_input_tokens_seen": 807168,
"step": 1780,
"train_runtime": 381.1834,
"train_tokens_per_second": 2117.532
},
{
"epoch": 11.29746835443038,
"grad_norm": 0.00010204315185546875,
"learning_rate": 0.014238127208748164,
"loss": 0.0001,
"num_input_tokens_seen": 809408,
"step": 1785,
"train_runtime": 382.157,
"train_tokens_per_second": 2117.999
},
{
"epoch": 11.329113924050633,
"grad_norm": 0.00174713134765625,
"learning_rate": 0.014155398276218605,
"loss": 0.0007,
"num_input_tokens_seen": 811680,
"step": 1790,
"train_runtime": 383.1444,
"train_tokens_per_second": 2118.47
},
{
"epoch": 11.360759493670885,
"grad_norm": 0.00019073486328125,
"learning_rate": 0.014072695108765128,
"loss": 0.0003,
"num_input_tokens_seen": 813824,
"step": 1795,
"train_runtime": 384.1109,
"train_tokens_per_second": 2118.722
},
{
"epoch": 11.39240506329114,
"grad_norm": 0.001312255859375,
"learning_rate": 0.013990020229296886,
"loss": 0.0028,
"num_input_tokens_seen": 816096,
"step": 1800,
"train_runtime": 385.0871,
"train_tokens_per_second": 2119.25
},
{
"epoch": 11.424050632911392,
"grad_norm": 0.0213623046875,
"learning_rate": 0.013907376159860046,
"loss": 0.003,
"num_input_tokens_seen": 818496,
"step": 1805,
"train_runtime": 386.0926,
"train_tokens_per_second": 2119.947
},
{
"epoch": 11.455696202531646,
"grad_norm": 4.673004150390625e-05,
"learning_rate": 0.013824765421560938,
"loss": 0.0005,
"num_input_tokens_seen": 820704,
"step": 1810,
"train_runtime": 387.0636,
"train_tokens_per_second": 2120.334
},
{
"epoch": 11.487341772151899,
"grad_norm": 0.0029754638671875,
"learning_rate": 0.013742190534489085,
"loss": 0.0008,
"num_input_tokens_seen": 823040,
"step": 1815,
"train_runtime": 388.0546,
"train_tokens_per_second": 2120.938
},
{
"epoch": 11.518987341772151,
"grad_norm": 0.00070953369140625,
"learning_rate": 0.013659654017640343,
"loss": 0.0001,
"num_input_tokens_seen": 825248,
"step": 1820,
"train_runtime": 389.0274,
"train_tokens_per_second": 2121.311
},
{
"epoch": 11.550632911392405,
"grad_norm": 0.0123291015625,
"learning_rate": 0.013577158388840075,
"loss": 0.0007,
"num_input_tokens_seen": 827552,
"step": 1825,
"train_runtime": 390.0152,
"train_tokens_per_second": 2121.846
},
{
"epoch": 11.582278481012658,
"grad_norm": 0.00115203857421875,
"learning_rate": 0.013494706164666324,
"loss": 0.0013,
"num_input_tokens_seen": 829856,
"step": 1830,
"train_runtime": 391.0116,
"train_tokens_per_second": 2122.331
},
{
"epoch": 11.613924050632912,
"grad_norm": 0.00189971923828125,
"learning_rate": 0.013412299860373046,
"loss": 0.0006,
"num_input_tokens_seen": 832192,
"step": 1835,
"train_runtime": 392.014,
"train_tokens_per_second": 2122.863
},
{
"epoch": 11.645569620253164,
"grad_norm": 0.0007171630859375,
"learning_rate": 0.013329941989813392,
"loss": 0.0002,
"num_input_tokens_seen": 834368,
"step": 1840,
"train_runtime": 392.9826,
"train_tokens_per_second": 2123.168
},
{
"epoch": 11.677215189873417,
"grad_norm": 0.00390625,
"learning_rate": 0.013247635065363007,
"loss": 0.0007,
"num_input_tokens_seen": 836672,
"step": 1845,
"train_runtime": 393.9706,
"train_tokens_per_second": 2123.692
},
{
"epoch": 11.708860759493671,
"grad_norm": 5.316734313964844e-05,
"learning_rate": 0.013165381597843384,
"loss": 0.0004,
"num_input_tokens_seen": 838912,
"step": 1850,
"train_runtime": 394.9452,
"train_tokens_per_second": 2124.122
},
{
"epoch": 11.740506329113924,
"grad_norm": 0.001220703125,
"learning_rate": 0.013083184096445313,
"loss": 0.0003,
"num_input_tokens_seen": 841216,
"step": 1855,
"train_runtime": 395.9332,
"train_tokens_per_second": 2124.641
},
{
"epoch": 11.772151898734178,
"grad_norm": 0.0047607421875,
"learning_rate": 0.013001045068652269,
"loss": 0.0009,
"num_input_tokens_seen": 843520,
"step": 1860,
"train_runtime": 396.9205,
"train_tokens_per_second": 2125.161
},
{
"epoch": 11.80379746835443,
"grad_norm": 6.580352783203125e-05,
"learning_rate": 0.012918967020163976,
"loss": 0.0003,
"num_input_tokens_seen": 845824,
"step": 1865,
"train_runtime": 397.9019,
"train_tokens_per_second": 2125.71
},
{
"epoch": 11.835443037974684,
"grad_norm": 3.886222839355469e-05,
"learning_rate": 0.012836952454819943,
"loss": 0.0006,
"num_input_tokens_seen": 848160,
"step": 1870,
"train_runtime": 398.8925,
"train_tokens_per_second": 2126.287
},
{
"epoch": 11.867088607594937,
"grad_norm": 0.00124359130859375,
"learning_rate": 0.012755003874523082,
"loss": 0.0003,
"num_input_tokens_seen": 850400,
"step": 1875,
"train_runtime": 399.8677,
"train_tokens_per_second": 2126.704
},
{
"epoch": 11.89873417721519,
"grad_norm": 0.00102996826171875,
"learning_rate": 0.012673123779163402,
"loss": 0.0009,
"num_input_tokens_seen": 852736,
"step": 1880,
"train_runtime": 400.8519,
"train_tokens_per_second": 2127.309
},
{
"epoch": 11.930379746835444,
"grad_norm": 0.0012359619140625,
"learning_rate": 0.01259131466654173,
"loss": 0.0001,
"num_input_tokens_seen": 855072,
"step": 1885,
"train_runtime": 401.845,
"train_tokens_per_second": 2127.865
},
{
"epoch": 11.962025316455696,
"grad_norm": 0.007110595703125,
"learning_rate": 0.012509579032293525,
"loss": 0.0005,
"num_input_tokens_seen": 857312,
"step": 1890,
"train_runtime": 402.8194,
"train_tokens_per_second": 2128.279
},
{
"epoch": 11.99367088607595,
"grad_norm": 5.459785461425781e-05,
"learning_rate": 0.012427919369812754,
"loss": 0.0001,
"num_input_tokens_seen": 859584,
"step": 1895,
"train_runtime": 403.7997,
"train_tokens_per_second": 2128.739
},
{
"epoch": 12.0,
"eval_loss": 0.09028957784175873,
"eval_runtime": 1.6669,
"eval_samples_per_second": 41.995,
"eval_steps_per_second": 10.799,
"num_input_tokens_seen": 859808,
"step": 1896
},
{
"epoch": 12.025316455696203,
"grad_norm": 0.00299072265625,
"learning_rate": 0.012346338170175808,
"loss": 0.0001,
"num_input_tokens_seen": 861632,
"step": 1900,
"train_runtime": 407.3224,
"train_tokens_per_second": 2115.356
},
{
"epoch": 12.056962025316455,
"grad_norm": 0.001861572265625,
"learning_rate": 0.012264837922065518,
"loss": 0.0002,
"num_input_tokens_seen": 863808,
"step": 1905,
"train_runtime": 408.2983,
"train_tokens_per_second": 2115.63
},
{
"epoch": 12.08860759493671,
"grad_norm": 0.00023746490478515625,
"learning_rate": 0.012183421111695262,
"loss": 0.0004,
"num_input_tokens_seen": 866048,
"step": 1910,
"train_runtime": 409.2823,
"train_tokens_per_second": 2116.016
},
{
"epoch": 12.120253164556962,
"grad_norm": 0.000736236572265625,
"learning_rate": 0.012102090222733081,
"loss": 0.0006,
"num_input_tokens_seen": 868352,
"step": 1915,
"train_runtime": 410.2801,
"train_tokens_per_second": 2116.486
},
{
"epoch": 12.151898734177216,
"grad_norm": 0.00909423828125,
"learning_rate": 0.012020847736225939,
"loss": 0.0005,
"num_input_tokens_seen": 870560,
"step": 1920,
"train_runtime": 411.2539,
"train_tokens_per_second": 2116.843
},
{
"epoch": 12.183544303797468,
"grad_norm": 0.00604248046875,
"learning_rate": 0.011939696130524032,
"loss": 0.0005,
"num_input_tokens_seen": 872768,
"step": 1925,
"train_runtime": 412.226,
"train_tokens_per_second": 2117.208
},
{
"epoch": 12.215189873417721,
"grad_norm": 5.1021575927734375e-05,
"learning_rate": 0.011858637881205177,
"loss": 0.0,
"num_input_tokens_seen": 874976,
"step": 1930,
"train_runtime": 413.1976,
"train_tokens_per_second": 2117.573
},
{
"epoch": 12.246835443037975,
"grad_norm": 0.0023193359375,
"learning_rate": 0.011777675460999311,
"loss": 0.0003,
"num_input_tokens_seen": 877344,
"step": 1935,
"train_runtime": 414.1922,
"train_tokens_per_second": 2118.205
},
{
"epoch": 12.278481012658228,
"grad_norm": 0.00714111328125,
"learning_rate": 0.01169681133971304,
"loss": 0.0005,
"num_input_tokens_seen": 879584,
"step": 1940,
"train_runtime": 415.1661,
"train_tokens_per_second": 2118.632
},
{
"epoch": 12.310126582278482,
"grad_norm": 0.00015926361083984375,
"learning_rate": 0.011616047984154299,
"loss": 0.0002,
"num_input_tokens_seen": 881760,
"step": 1945,
"train_runtime": 416.1354,
"train_tokens_per_second": 2118.926
},
{
"epoch": 12.341772151898734,
"grad_norm": 0.0009613037109375,
"learning_rate": 0.011535387858057114,
"loss": 0.0004,
"num_input_tokens_seen": 883968,
"step": 1950,
"train_runtime": 417.1069,
"train_tokens_per_second": 2119.284
},
{
"epoch": 12.373417721518987,
"grad_norm": 0.000431060791015625,
"learning_rate": 0.011454833422006427,
"loss": 0.0005,
"num_input_tokens_seen": 886144,
"step": 1955,
"train_runtime": 418.0757,
"train_tokens_per_second": 2119.578
},
{
"epoch": 12.405063291139241,
"grad_norm": 5.8650970458984375e-05,
"learning_rate": 0.011374387133363046,
"loss": 0.0004,
"num_input_tokens_seen": 888448,
"step": 1960,
"train_runtime": 419.0588,
"train_tokens_per_second": 2120.103
},
{
"epoch": 12.436708860759493,
"grad_norm": 4.1484832763671875e-05,
"learning_rate": 0.01129405144618868,
"loss": 0.0,
"num_input_tokens_seen": 890720,
"step": 1965,
"train_runtime": 420.0388,
"train_tokens_per_second": 2120.566
},
{
"epoch": 12.468354430379748,
"grad_norm": 0.0004100799560546875,
"learning_rate": 0.01121382881117107,
"loss": 0.0012,
"num_input_tokens_seen": 892992,
"step": 1970,
"train_runtime": 421.0157,
"train_tokens_per_second": 2121.042
},
{
"epoch": 12.5,
"grad_norm": 0.000186920166015625,
"learning_rate": 0.011133721675549232,
"loss": 0.0009,
"num_input_tokens_seen": 895360,
"step": 1975,
"train_runtime": 422.0134,
"train_tokens_per_second": 2121.639
},
{
"epoch": 12.531645569620252,
"grad_norm": 6.079673767089844e-05,
"learning_rate": 0.011053732483038824,
"loss": 0.0003,
"num_input_tokens_seen": 897696,
"step": 1980,
"train_runtime": 423.0131,
"train_tokens_per_second": 2122.147
},
{
"epoch": 12.563291139240507,
"grad_norm": 0.000579833984375,
"learning_rate": 0.010973863673757548,
"loss": 0.0003,
"num_input_tokens_seen": 900032,
"step": 1985,
"train_runtime": 424.0043,
"train_tokens_per_second": 2122.695
},
{
"epoch": 12.594936708860759,
"grad_norm": 3.62396240234375e-05,
"learning_rate": 0.010894117684150773,
"loss": 0.0001,
"num_input_tokens_seen": 902336,
"step": 1990,
"train_runtime": 424.9831,
"train_tokens_per_second": 2123.228
},
{
"epoch": 12.626582278481013,
"grad_norm": 0.00020122528076171875,
"learning_rate": 0.010814496946917168,
"loss": 0.0002,
"num_input_tokens_seen": 904512,
"step": 1995,
"train_runtime": 425.953,
"train_tokens_per_second": 2123.502
},
{
"epoch": 12.658227848101266,
"grad_norm": 0.0019073486328125,
"learning_rate": 0.010735003890934494,
"loss": 0.0005,
"num_input_tokens_seen": 906944,
"step": 2000,
"train_runtime": 426.96,
"train_tokens_per_second": 2124.189
},
{
"epoch": 12.689873417721518,
"grad_norm": 0.00109100341796875,
"learning_rate": 0.010655640941185544,
"loss": 0.0001,
"num_input_tokens_seen": 909280,
"step": 2005,
"train_runtime": 427.9421,
"train_tokens_per_second": 2124.774
},
{
"epoch": 12.721518987341772,
"grad_norm": 0.00040435791015625,
"learning_rate": 0.010576410518684127,
"loss": 0.0005,
"num_input_tokens_seen": 911424,
"step": 2010,
"train_runtime": 428.9095,
"train_tokens_per_second": 2124.98
},
{
"epoch": 12.753164556962025,
"grad_norm": 0.0059814453125,
"learning_rate": 0.01049731504040122,
"loss": 0.0006,
"num_input_tokens_seen": 913760,
"step": 2015,
"train_runtime": 429.8934,
"train_tokens_per_second": 2125.55
},
{
"epoch": 12.784810126582279,
"grad_norm": 0.00019359588623046875,
"learning_rate": 0.010418356919191284,
"loss": 0.0012,
"num_input_tokens_seen": 916096,
"step": 2020,
"train_runtime": 430.8842,
"train_tokens_per_second": 2126.084
},
{
"epoch": 12.816455696202532,
"grad_norm": 0.0004558563232421875,
"learning_rate": 0.010339538563718576,
"loss": 0.0002,
"num_input_tokens_seen": 918336,
"step": 2025,
"train_runtime": 431.8594,
"train_tokens_per_second": 2126.47
},
{
"epoch": 12.848101265822784,
"grad_norm": 0.00095367431640625,
"learning_rate": 0.010260862378383738,
"loss": 0.0008,
"num_input_tokens_seen": 920672,
"step": 2030,
"train_runtime": 432.8498,
"train_tokens_per_second": 2127.001
},
{
"epoch": 12.879746835443038,
"grad_norm": 7.200241088867188e-05,
"learning_rate": 0.01018233076325044,
"loss": 0.0003,
"num_input_tokens_seen": 923072,
"step": 2035,
"train_runtime": 433.845,
"train_tokens_per_second": 2127.654
},
{
"epoch": 12.91139240506329,
"grad_norm": 0.00518798828125,
"learning_rate": 0.01010394611397213,
"loss": 0.0006,
"num_input_tokens_seen": 925440,
"step": 2040,
"train_runtime": 434.8489,
"train_tokens_per_second": 2128.188
},
{
"epoch": 12.943037974683545,
"grad_norm": 0.0045166015625,
"learning_rate": 0.010025710821718983,
"loss": 0.0002,
"num_input_tokens_seen": 927744,
"step": 2045,
"train_runtime": 435.8295,
"train_tokens_per_second": 2128.685
},
{
"epoch": 12.974683544303797,
"grad_norm": 5.7220458984375e-05,
"learning_rate": 0.009947627273104958,
"loss": 0.0005,
"num_input_tokens_seen": 929952,
"step": 2050,
"train_runtime": 436.8026,
"train_tokens_per_second": 2128.998
},
{
"epoch": 13.0,
"eval_loss": 0.09263655543327332,
"eval_runtime": 1.678,
"eval_samples_per_second": 41.718,
"eval_steps_per_second": 10.727,
"num_input_tokens_seen": 931472,
"step": 2054
},
{
"epoch": 13.00632911392405,
"grad_norm": 0.005462646484375,
"learning_rate": 0.00986969785011497,
"loss": 0.0004,
"num_input_tokens_seen": 931952,
"step": 2055,
"train_runtime": 440.2919,
"train_tokens_per_second": 2116.669
},
{
"epoch": 13.037974683544304,
"grad_norm": 0.002655029296875,
"learning_rate": 0.009791924930032251,
"loss": 0.0007,
"num_input_tokens_seen": 934288,
"step": 2060,
"train_runtime": 441.2826,
"train_tokens_per_second": 2117.21
},
{
"epoch": 13.069620253164556,
"grad_norm": 0.0004215240478515625,
"learning_rate": 0.00971431088536582,
"loss": 0.0002,
"num_input_tokens_seen": 936624,
"step": 2065,
"train_runtime": 442.2743,
"train_tokens_per_second": 2117.745
},
{
"epoch": 13.10126582278481,
"grad_norm": 0.0003299713134765625,
"learning_rate": 0.009636858083778092,
"loss": 0.0004,
"num_input_tokens_seen": 938928,
"step": 2070,
"train_runtime": 443.2555,
"train_tokens_per_second": 2118.254
},
{
"epoch": 13.132911392405063,
"grad_norm": 0.0010528564453125,
"learning_rate": 0.00955956888801269,
"loss": 0.0002,
"num_input_tokens_seen": 941232,
"step": 2075,
"train_runtime": 444.2469,
"train_tokens_per_second": 2118.714
},
{
"epoch": 13.164556962025316,
"grad_norm": 0.00115203857421875,
"learning_rate": 0.009482445655822326,
"loss": 0.0002,
"num_input_tokens_seen": 943440,
"step": 2080,
"train_runtime": 445.2222,
"train_tokens_per_second": 2119.032
},
{
"epoch": 13.19620253164557,
"grad_norm": 0.0001506805419921875,
"learning_rate": 0.009405490739896898,
"loss": 0.0001,
"num_input_tokens_seen": 945648,
"step": 2085,
"train_runtime": 446.1949,
"train_tokens_per_second": 2119.361
},
{
"epoch": 13.227848101265822,
"grad_norm": 3.814697265625e-05,
"learning_rate": 0.009328706487791726,
"loss": 0.0,
"num_input_tokens_seen": 947920,
"step": 2090,
"train_runtime": 447.1719,
"train_tokens_per_second": 2119.811
},
{
"epoch": 13.259493670886076,
"grad_norm": 4.601478576660156e-05,
"learning_rate": 0.009252095241855923,
"loss": 0.0001,
"num_input_tokens_seen": 950288,
"step": 2095,
"train_runtime": 448.1741,
"train_tokens_per_second": 2120.354
},
{
"epoch": 13.291139240506329,
"grad_norm": 0.00341796875,
"learning_rate": 0.009175659339160935,
"loss": 0.0002,
"num_input_tokens_seen": 952560,
"step": 2100,
"train_runtime": 449.1515,
"train_tokens_per_second": 2120.799
},
{
"epoch": 13.322784810126583,
"grad_norm": 3.504753112792969e-05,
"learning_rate": 0.009099401111429277,
"loss": 0.0004,
"num_input_tokens_seen": 954864,
"step": 2105,
"train_runtime": 450.1402,
"train_tokens_per_second": 2121.259
},
{
"epoch": 13.354430379746836,
"grad_norm": 0.004058837890625,
"learning_rate": 0.009023322884963372,
"loss": 0.0007,
"num_input_tokens_seen": 957168,
"step": 2110,
"train_runtime": 451.1194,
"train_tokens_per_second": 2121.762
},
{
"epoch": 13.386075949367088,
"grad_norm": 0.00213623046875,
"learning_rate": 0.008947426980574607,
"loss": 0.0002,
"num_input_tokens_seen": 959408,
"step": 2115,
"train_runtime": 452.094,
"train_tokens_per_second": 2122.143
},
{
"epoch": 13.417721518987342,
"grad_norm": 0.00049591064453125,
"learning_rate": 0.008871715713512522,
"loss": 0.0002,
"num_input_tokens_seen": 961648,
"step": 2120,
"train_runtime": 453.0774,
"train_tokens_per_second": 2122.481
},
{
"epoch": 13.449367088607595,
"grad_norm": 0.001617431640625,
"learning_rate": 0.008796191393394177,
"loss": 0.0003,
"num_input_tokens_seen": 964016,
"step": 2125,
"train_runtime": 454.0727,
"train_tokens_per_second": 2123.043
},
{
"epoch": 13.481012658227849,
"grad_norm": 0.002288818359375,
"learning_rate": 0.00872085632413372,
"loss": 0.0003,
"num_input_tokens_seen": 966256,
"step": 2130,
"train_runtime": 455.0563,
"train_tokens_per_second": 2123.377
},
{
"epoch": 13.512658227848101,
"grad_norm": 4.649162292480469e-05,
"learning_rate": 0.008645712803872083,
"loss": 0.0005,
"num_input_tokens_seen": 968432,
"step": 2135,
"train_runtime": 456.0259,
"train_tokens_per_second": 2123.634
},
{
"epoch": 13.544303797468354,
"grad_norm": 0.00433349609375,
"learning_rate": 0.008570763124906865,
"loss": 0.0004,
"num_input_tokens_seen": 970672,
"step": 2140,
"train_runtime": 457.0,
"train_tokens_per_second": 2124.009
},
{
"epoch": 13.575949367088608,
"grad_norm": 4.124641418457031e-05,
"learning_rate": 0.00849600957362246,
"loss": 0.0012,
"num_input_tokens_seen": 972912,
"step": 2145,
"train_runtime": 457.9749,
"train_tokens_per_second": 2124.378
},
{
"epoch": 13.60759493670886,
"grad_norm": 0.000522613525390625,
"learning_rate": 0.008421454430420234,
"loss": 0.0012,
"num_input_tokens_seen": 975152,
"step": 2150,
"train_runtime": 458.9584,
"train_tokens_per_second": 2124.707
},
{
"epoch": 13.639240506329115,
"grad_norm": 0.0002288818359375,
"learning_rate": 0.008347099969649014,
"loss": 0.0001,
"num_input_tokens_seen": 977584,
"step": 2155,
"train_runtime": 459.9591,
"train_tokens_per_second": 2125.372
},
{
"epoch": 13.670886075949367,
"grad_norm": 9.584426879882812e-05,
"learning_rate": 0.008272948459535695,
"loss": 0.0001,
"num_input_tokens_seen": 979856,
"step": 2160,
"train_runtime": 460.9457,
"train_tokens_per_second": 2125.751
},
{
"epoch": 13.70253164556962,
"grad_norm": 6.151199340820312e-05,
"learning_rate": 0.008199002162116022,
"loss": 0.0002,
"num_input_tokens_seen": 982256,
"step": 2165,
"train_runtime": 461.9527,
"train_tokens_per_second": 2126.313
},
{
"epoch": 13.734177215189874,
"grad_norm": 4.220008850097656e-05,
"learning_rate": 0.008125263333165628,
"loss": 0.0001,
"num_input_tokens_seen": 984592,
"step": 2170,
"train_runtime": 462.9443,
"train_tokens_per_second": 2126.805
},
{
"epoch": 13.765822784810126,
"grad_norm": 0.0002765655517578125,
"learning_rate": 0.008051734222131186,
"loss": 0.0001,
"num_input_tokens_seen": 986864,
"step": 2175,
"train_runtime": 463.9248,
"train_tokens_per_second": 2127.207
},
{
"epoch": 13.79746835443038,
"grad_norm": 0.006378173828125,
"learning_rate": 0.00797841707206179,
"loss": 0.0004,
"num_input_tokens_seen": 989136,
"step": 2180,
"train_runtime": 464.9127,
"train_tokens_per_second": 2127.574
},
{
"epoch": 13.829113924050633,
"grad_norm": 0.000225067138671875,
"learning_rate": 0.00790531411954057,
"loss": 0.0007,
"num_input_tokens_seen": 991440,
"step": 2185,
"train_runtime": 465.9052,
"train_tokens_per_second": 2127.986
},
{
"epoch": 13.860759493670885,
"grad_norm": 0.002593994140625,
"learning_rate": 0.007832427594616397,
"loss": 0.0003,
"num_input_tokens_seen": 993712,
"step": 2190,
"train_runtime": 466.8825,
"train_tokens_per_second": 2128.398
},
{
"epoch": 13.89240506329114,
"grad_norm": 0.0012664794921875,
"learning_rate": 0.0077597597207359125,
"loss": 0.0007,
"num_input_tokens_seen": 995920,
"step": 2195,
"train_runtime": 467.8545,
"train_tokens_per_second": 2128.696
},
{
"epoch": 13.924050632911392,
"grad_norm": 0.0002346038818359375,
"learning_rate": 0.007687312714675674,
"loss": 0.0004,
"num_input_tokens_seen": 998224,
"step": 2200,
"train_runtime": 468.8346,
"train_tokens_per_second": 2129.16
},
{
"epoch": 13.955696202531646,
"grad_norm": 0.0023193359375,
"learning_rate": 0.007615088786474526,
"loss": 0.0007,
"num_input_tokens_seen": 1000400,
"step": 2205,
"train_runtime": 469.8044,
"train_tokens_per_second": 2129.397
},
{
"epoch": 13.987341772151899,
"grad_norm": 0.000194549560546875,
"learning_rate": 0.0075430901393662,
"loss": 0.0003,
"num_input_tokens_seen": 1002640,
"step": 2210,
"train_runtime": 470.7794,
"train_tokens_per_second": 2129.745
},
{
"epoch": 14.0,
"eval_loss": 0.09213147312402725,
"eval_runtime": 1.6795,
"eval_samples_per_second": 41.679,
"eval_steps_per_second": 10.718,
"num_input_tokens_seen": 1003376,
"step": 2212
},
{
"epoch": 14.018987341772151,
"grad_norm": 0.0019989013671875,
"learning_rate": 0.007471318969712099,
"loss": 0.0002,
"num_input_tokens_seen": 1004752,
"step": 2215,
"train_runtime": 474.323,
"train_tokens_per_second": 2118.286
},
{
"epoch": 14.050632911392405,
"grad_norm": 0.0001735687255859375,
"learning_rate": 0.007399777466934275,
"loss": 0.0002,
"num_input_tokens_seen": 1007120,
"step": 2220,
"train_runtime": 475.3503,
"train_tokens_per_second": 2118.69
},
{
"epoch": 14.082278481012658,
"grad_norm": 0.0034332275390625,
"learning_rate": 0.007328467813448668,
"loss": 0.0006,
"num_input_tokens_seen": 1009520,
"step": 2225,
"train_runtime": 476.3596,
"train_tokens_per_second": 2119.24
},
{
"epoch": 14.113924050632912,
"grad_norm": 0.003326416015625,
"learning_rate": 0.007257392184598517,
"loss": 0.0003,
"num_input_tokens_seen": 1011792,
"step": 2230,
"train_runtime": 477.3371,
"train_tokens_per_second": 2119.659
},
{
"epoch": 14.145569620253164,
"grad_norm": 0.00012111663818359375,
"learning_rate": 0.007186552748587997,
"loss": 0.0002,
"num_input_tokens_seen": 1014032,
"step": 2235,
"train_runtime": 478.3216,
"train_tokens_per_second": 2119.98
},
{
"epoch": 14.177215189873417,
"grad_norm": 0.0028839111328125,
"learning_rate": 0.00711595166641609,
"loss": 0.0008,
"num_input_tokens_seen": 1016336,
"step": 2240,
"train_runtime": 479.3034,
"train_tokens_per_second": 2120.444
},
{
"epoch": 14.208860759493671,
"grad_norm": 0.001068115234375,
"learning_rate": 0.007045591091810634,
"loss": 0.0005,
"num_input_tokens_seen": 1018672,
"step": 2245,
"train_runtime": 480.2951,
"train_tokens_per_second": 2120.929
},
{
"epoch": 14.240506329113924,
"grad_norm": 0.0002155303955078125,
"learning_rate": 0.006975473171162659,
"loss": 0.0,
"num_input_tokens_seen": 1020976,
"step": 2250,
"train_runtime": 481.2844,
"train_tokens_per_second": 2121.357
},
{
"epoch": 14.272151898734178,
"grad_norm": 0.006011962890625,
"learning_rate": 0.006905600043460891,
"loss": 0.0005,
"num_input_tokens_seen": 1023120,
"step": 2255,
"train_runtime": 482.2561,
"train_tokens_per_second": 2121.529
},
{
"epoch": 14.30379746835443,
"grad_norm": 4.935264587402344e-05,
"learning_rate": 0.006835973840226484,
"loss": 0.0002,
"num_input_tokens_seen": 1025360,
"step": 2260,
"train_runtime": 483.2319,
"train_tokens_per_second": 2121.88
},
{
"epoch": 14.335443037974684,
"grad_norm": 0.002716064453125,
"learning_rate": 0.006766596685448035,
"loss": 0.0003,
"num_input_tokens_seen": 1027664,
"step": 2265,
"train_runtime": 484.2201,
"train_tokens_per_second": 2122.308
},
{
"epoch": 14.367088607594937,
"grad_norm": 6.437301635742188e-05,
"learning_rate": 0.006697470695516768,
"loss": 0.0002,
"num_input_tokens_seen": 1029936,
"step": 2270,
"train_runtime": 485.2006,
"train_tokens_per_second": 2122.701
},
{
"epoch": 14.39873417721519,
"grad_norm": 4.5299530029296875e-05,
"learning_rate": 0.006628597979161958,
"loss": 0.0002,
"num_input_tokens_seen": 1032208,
"step": 2275,
"train_runtime": 486.187,
"train_tokens_per_second": 2123.068
},
{
"epoch": 14.430379746835444,
"grad_norm": 4.3392181396484375e-05,
"learning_rate": 0.006559980637386639,
"loss": 0.0004,
"num_input_tokens_seen": 1034416,
"step": 2280,
"train_runtime": 487.1593,
"train_tokens_per_second": 2123.363
},
{
"epoch": 14.462025316455696,
"grad_norm": 4.124641418457031e-05,
"learning_rate": 0.00649162076340348,
"loss": 0.0005,
"num_input_tokens_seen": 1036688,
"step": 2285,
"train_runtime": 488.1363,
"train_tokens_per_second": 2123.767
},
{
"epoch": 14.49367088607595,
"grad_norm": 4.029273986816406e-05,
"learning_rate": 0.006423520442570956,
"loss": 0.0005,
"num_input_tokens_seen": 1038960,
"step": 2290,
"train_runtime": 489.1261,
"train_tokens_per_second": 2124.115
},
{
"epoch": 14.525316455696203,
"grad_norm": 0.00012111663818359375,
"learning_rate": 0.006355681752329696,
"loss": 0.0004,
"num_input_tokens_seen": 1041072,
"step": 2295,
"train_runtime": 490.0912,
"train_tokens_per_second": 2124.241
},
{
"epoch": 14.556962025316455,
"grad_norm": 0.00171661376953125,
"learning_rate": 0.006288106762139153,
"loss": 0.0002,
"num_input_tokens_seen": 1043312,
"step": 2300,
"train_runtime": 491.0663,
"train_tokens_per_second": 2124.585
},
{
"epoch": 14.58860759493671,
"grad_norm": 0.00927734375,
"learning_rate": 0.006220797533414447,
"loss": 0.0008,
"num_input_tokens_seen": 1045552,
"step": 2305,
"train_runtime": 492.0409,
"train_tokens_per_second": 2124.929
},
{
"epoch": 14.620253164556962,
"grad_norm": 0.001129150390625,
"learning_rate": 0.0061537561194634945,
"loss": 0.0003,
"num_input_tokens_seen": 1048048,
"step": 2310,
"train_runtime": 493.0522,
"train_tokens_per_second": 2125.633
},
{
"epoch": 14.651898734177216,
"grad_norm": 0.00201416015625,
"learning_rate": 0.006086984565424345,
"loss": 0.0003,
"num_input_tokens_seen": 1050384,
"step": 2315,
"train_runtime": 494.0443,
"train_tokens_per_second": 2126.093
},
{
"epoch": 14.683544303797468,
"grad_norm": 4.506111145019531e-05,
"learning_rate": 0.006020484908202826,
"loss": 0.0001,
"num_input_tokens_seen": 1052720,
"step": 2320,
"train_runtime": 495.0358,
"train_tokens_per_second": 2126.553
},
{
"epoch": 14.715189873417721,
"grad_norm": 0.00016689300537109375,
"learning_rate": 0.00595425917641039,
"loss": 0.0001,
"num_input_tokens_seen": 1054864,
"step": 2325,
"train_runtime": 496.0045,
"train_tokens_per_second": 2126.723
},
{
"epoch": 14.746835443037975,
"grad_norm": 0.00445556640625,
"learning_rate": 0.005888309390302235,
"loss": 0.0002,
"num_input_tokens_seen": 1057168,
"step": 2330,
"train_runtime": 496.9849,
"train_tokens_per_second": 2127.163
},
{
"epoch": 14.778481012658228,
"grad_norm": 0.00087738037109375,
"learning_rate": 0.005822637561715658,
"loss": 0.0003,
"num_input_tokens_seen": 1059376,
"step": 2335,
"train_runtime": 497.957,
"train_tokens_per_second": 2127.445
},
{
"epoch": 14.810126582278482,
"grad_norm": 0.00086212158203125,
"learning_rate": 0.005757245694008714,
"loss": 0.0004,
"num_input_tokens_seen": 1061680,
"step": 2340,
"train_runtime": 498.9369,
"train_tokens_per_second": 2127.884
},
{
"epoch": 14.841772151898734,
"grad_norm": 5.173683166503906e-05,
"learning_rate": 0.005692135781999078,
"loss": 0.0004,
"num_input_tokens_seen": 1063984,
"step": 2345,
"train_runtime": 499.9176,
"train_tokens_per_second": 2128.319
},
{
"epoch": 14.873417721518987,
"grad_norm": 6.103515625e-05,
"learning_rate": 0.005627309811903193,
"loss": 0.0001,
"num_input_tokens_seen": 1066352,
"step": 2350,
"train_runtime": 500.9022,
"train_tokens_per_second": 2128.863
},
{
"epoch": 14.905063291139241,
"grad_norm": 4.76837158203125e-05,
"learning_rate": 0.005562769761275697,
"loss": 0.0004,
"num_input_tokens_seen": 1068528,
"step": 2355,
"train_runtime": 501.8805,
"train_tokens_per_second": 2129.049
},
{
"epoch": 14.936708860759493,
"grad_norm": 0.0012054443359375,
"learning_rate": 0.005498517598949082,
"loss": 0.0004,
"num_input_tokens_seen": 1070864,
"step": 2360,
"train_runtime": 502.8728,
"train_tokens_per_second": 2129.493
},
{
"epoch": 14.968354430379748,
"grad_norm": 0.004119873046875,
"learning_rate": 0.005434555284973631,
"loss": 0.0006,
"num_input_tokens_seen": 1073072,
"step": 2365,
"train_runtime": 503.8466,
"train_tokens_per_second": 2129.759
},
{
"epoch": 15.0,
"grad_norm": 0.0003948211669921875,
"learning_rate": 0.005370884770557645,
"loss": 0.0011,
"num_input_tokens_seen": 1075088,
"step": 2370,
"train_runtime": 504.7907,
"train_tokens_per_second": 2129.77
},
{
"epoch": 15.0,
"eval_loss": 0.09199367463588715,
"eval_runtime": 1.6755,
"eval_samples_per_second": 41.779,
"eval_steps_per_second": 10.743,
"num_input_tokens_seen": 1075088,
"step": 2370
},
{
"epoch": 15.031645569620252,
"grad_norm": 0.0025787353515625,
"learning_rate": 0.0053075079980078824,
"loss": 0.0008,
"num_input_tokens_seen": 1077296,
"step": 2375,
"train_runtime": 508.5507,
"train_tokens_per_second": 2118.365
},
{
"epoch": 15.063291139240507,
"grad_norm": 0.0017547607421875,
"learning_rate": 0.005244426900670356,
"loss": 0.0001,
"num_input_tokens_seen": 1079440,
"step": 2380,
"train_runtime": 509.5263,
"train_tokens_per_second": 2118.517
},
{
"epoch": 15.094936708860759,
"grad_norm": 0.00128173828125,
"learning_rate": 0.0051816434028713245,
"loss": 0.0002,
"num_input_tokens_seen": 1081648,
"step": 2385,
"train_runtime": 510.5011,
"train_tokens_per_second": 2118.796
},
{
"epoch": 15.126582278481013,
"grad_norm": 0.0027618408203125,
"learning_rate": 0.005119159419858583,
"loss": 0.0003,
"num_input_tokens_seen": 1083952,
"step": 2390,
"train_runtime": 511.4794,
"train_tokens_per_second": 2119.248
},
{
"epoch": 15.158227848101266,
"grad_norm": 6.914138793945312e-05,
"learning_rate": 0.005056976857743068,
"loss": 0.0001,
"num_input_tokens_seen": 1086224,
"step": 2395,
"train_runtime": 512.4655,
"train_tokens_per_second": 2119.604
},
{
"epoch": 15.189873417721518,
"grad_norm": 0.006195068359375,
"learning_rate": 0.004995097613440688,
"loss": 0.0003,
"num_input_tokens_seen": 1088432,
"step": 2400,
"train_runtime": 513.4385,
"train_tokens_per_second": 2119.888
},
{
"epoch": 15.221518987341772,
"grad_norm": 0.00311279296875,
"learning_rate": 0.004933523574614447,
"loss": 0.0011,
"num_input_tokens_seen": 1090736,
"step": 2405,
"train_runtime": 514.427,
"train_tokens_per_second": 2120.293
},
{
"epoch": 15.253164556962025,
"grad_norm": 4.00543212890625e-05,
"learning_rate": 0.004872256619616906,
"loss": 0.0002,
"num_input_tokens_seen": 1092912,
"step": 2410,
"train_runtime": 515.3964,
"train_tokens_per_second": 2120.527
},
{
"epoch": 15.284810126582279,
"grad_norm": 8.726119995117188e-05,
"learning_rate": 0.004811298617432824,
"loss": 0.0002,
"num_input_tokens_seen": 1095280,
"step": 2415,
"train_runtime": 516.39,
"train_tokens_per_second": 2121.033
},
{
"epoch": 15.316455696202532,
"grad_norm": 0.0098876953125,
"learning_rate": 0.004750651427622173,
"loss": 0.001,
"num_input_tokens_seen": 1097552,
"step": 2420,
"train_runtime": 517.3784,
"train_tokens_per_second": 2121.372
},
{
"epoch": 15.348101265822784,
"grad_norm": 0.00017547607421875,
"learning_rate": 0.004690316900263435,
"loss": 0.0001,
"num_input_tokens_seen": 1099760,
"step": 2425,
"train_runtime": 518.351,
"train_tokens_per_second": 2121.651
},
{
"epoch": 15.379746835443038,
"grad_norm": 0.000774383544921875,
"learning_rate": 0.0046302968758971065,
"loss": 0.0005,
"num_input_tokens_seen": 1102096,
"step": 2430,
"train_runtime": 519.3418,
"train_tokens_per_second": 2122.102
},
{
"epoch": 15.41139240506329,
"grad_norm": 0.00051116943359375,
"learning_rate": 0.004570593185469605,
"loss": 0.0009,
"num_input_tokens_seen": 1104336,
"step": 2435,
"train_runtime": 520.3174,
"train_tokens_per_second": 2122.428
},
{
"epoch": 15.443037974683545,
"grad_norm": 0.0002498626708984375,
"learning_rate": 0.004511207650277389,
"loss": 0.0002,
"num_input_tokens_seen": 1106480,
"step": 2440,
"train_runtime": 521.284,
"train_tokens_per_second": 2122.605
},
{
"epoch": 15.474683544303797,
"grad_norm": 4.410743713378906e-05,
"learning_rate": 0.004452142081911388,
"loss": 0.0005,
"num_input_tokens_seen": 1108752,
"step": 2445,
"train_runtime": 522.2723,
"train_tokens_per_second": 2122.939
},
{
"epoch": 15.50632911392405,
"grad_norm": 0.0029754638671875,
"learning_rate": 0.004393398282201788,
"loss": 0.0009,
"num_input_tokens_seen": 1110960,
"step": 2450,
"train_runtime": 523.2445,
"train_tokens_per_second": 2123.214
},
{
"epoch": 15.537974683544304,
"grad_norm": 6.29425048828125e-05,
"learning_rate": 0.004334978043162998,
"loss": 0.0002,
"num_input_tokens_seen": 1113168,
"step": 2455,
"train_runtime": 524.222,
"train_tokens_per_second": 2123.467
},
{
"epoch": 15.569620253164556,
"grad_norm": 0.00194549560546875,
"learning_rate": 0.004276883146939021,
"loss": 0.0002,
"num_input_tokens_seen": 1115408,
"step": 2460,
"train_runtime": 525.1968,
"train_tokens_per_second": 2123.791
},
{
"epoch": 15.60126582278481,
"grad_norm": 0.003936767578125,
"learning_rate": 0.004219115365749112,
"loss": 0.0002,
"num_input_tokens_seen": 1117648,
"step": 2465,
"train_runtime": 526.1716,
"train_tokens_per_second": 2124.113
},
{
"epoch": 15.632911392405063,
"grad_norm": 0.00138092041015625,
"learning_rate": 0.004161676461833653,
"loss": 0.0005,
"num_input_tokens_seen": 1119984,
"step": 2470,
"train_runtime": 527.1626,
"train_tokens_per_second": 2124.551
},
{
"epoch": 15.664556962025316,
"grad_norm": 4.7206878662109375e-05,
"learning_rate": 0.004104568187400455,
"loss": 0.0,
"num_input_tokens_seen": 1122256,
"step": 2475,
"train_runtime": 528.1398,
"train_tokens_per_second": 2124.922
},
{
"epoch": 15.69620253164557,
"grad_norm": 0.00439453125,
"learning_rate": 0.004047792284571272,
"loss": 0.0002,
"num_input_tokens_seen": 1124560,
"step": 2480,
"train_runtime": 529.1252,
"train_tokens_per_second": 2125.319
},
{
"epoch": 15.727848101265822,
"grad_norm": 0.000858306884765625,
"learning_rate": 0.0039913504853286525,
"loss": 0.0005,
"num_input_tokens_seen": 1126960,
"step": 2485,
"train_runtime": 530.1311,
"train_tokens_per_second": 2125.814
},
{
"epoch": 15.759493670886076,
"grad_norm": 0.00023651123046875,
"learning_rate": 0.00393524451146315,
"loss": 0.0001,
"num_input_tokens_seen": 1129360,
"step": 2490,
"train_runtime": 531.1366,
"train_tokens_per_second": 2126.308
},
{
"epoch": 15.791139240506329,
"grad_norm": 5.435943603515625e-05,
"learning_rate": 0.0038794760745207314,
"loss": 0.0001,
"num_input_tokens_seen": 1131568,
"step": 2495,
"train_runtime": 532.1089,
"train_tokens_per_second": 2126.572
},
{
"epoch": 15.822784810126583,
"grad_norm": 0.000202178955078125,
"learning_rate": 0.0038240468757506077,
"loss": 0.0001,
"num_input_tokens_seen": 1133872,
"step": 2500,
"train_runtime": 533.0972,
"train_tokens_per_second": 2126.952
},
{
"epoch": 15.854430379746836,
"grad_norm": 0.00024318695068359375,
"learning_rate": 0.0037689586060533522,
"loss": 0.0002,
"num_input_tokens_seen": 1136240,
"step": 2505,
"train_runtime": 534.084,
"train_tokens_per_second": 2127.456
},
{
"epoch": 15.886075949367088,
"grad_norm": 0.0004425048828125,
"learning_rate": 0.003714212945929265,
"loss": 0.0002,
"num_input_tokens_seen": 1138448,
"step": 2510,
"train_runtime": 535.0569,
"train_tokens_per_second": 2127.714
},
{
"epoch": 15.917721518987342,
"grad_norm": 0.003021240234375,
"learning_rate": 0.003659811565427151,
"loss": 0.0006,
"num_input_tokens_seen": 1140752,
"step": 2515,
"train_runtime": 536.0486,
"train_tokens_per_second": 2128.076
},
{
"epoch": 15.949367088607595,
"grad_norm": 4.887580871582031e-05,
"learning_rate": 0.0036057561240933683,
"loss": 0.0001,
"num_input_tokens_seen": 1143056,
"step": 2520,
"train_runtime": 537.031,
"train_tokens_per_second": 2128.473
},
{
"epoch": 15.981012658227849,
"grad_norm": 0.0002994537353515625,
"learning_rate": 0.003552048270921177,
"loss": 0.0002,
"num_input_tokens_seen": 1145392,
"step": 2525,
"train_runtime": 538.0245,
"train_tokens_per_second": 2128.884
},
{
"epoch": 16.0,
"eval_loss": 0.09226036071777344,
"eval_runtime": 1.6756,
"eval_samples_per_second": 41.776,
"eval_steps_per_second": 10.742,
"num_input_tokens_seen": 1146608,
"step": 2528
},
{
"epoch": 16.0126582278481,
"grad_norm": 6.103515625e-05,
"learning_rate": 0.0034986896443004695,
"loss": 0.0003,
"num_input_tokens_seen": 1147536,
"step": 2530,
"train_runtime": 541.5633,
"train_tokens_per_second": 2118.932
},
{
"epoch": 16.044303797468356,
"grad_norm": 0.00018405914306640625,
"learning_rate": 0.003445681871967776,
"loss": 0.0002,
"num_input_tokens_seen": 1149776,
"step": 2535,
"train_runtime": 542.5573,
"train_tokens_per_second": 2119.179
},
{
"epoch": 16.075949367088608,
"grad_norm": 9.489059448242188e-05,
"learning_rate": 0.003393026570956594,
"loss": 0.0004,
"num_input_tokens_seen": 1151952,
"step": 2540,
"train_runtime": 543.5318,
"train_tokens_per_second": 2119.383
},
{
"epoch": 16.10759493670886,
"grad_norm": 0.0028228759765625,
"learning_rate": 0.0033407253475480903,
"loss": 0.0003,
"num_input_tokens_seen": 1154192,
"step": 2545,
"train_runtime": 544.51,
"train_tokens_per_second": 2119.689
},
{
"epoch": 16.139240506329113,
"grad_norm": 0.003692626953125,
"learning_rate": 0.0032887797972220756,
"loss": 0.0007,
"num_input_tokens_seen": 1156528,
"step": 2550,
"train_runtime": 545.5009,
"train_tokens_per_second": 2120.121
},
{
"epoch": 16.170886075949365,
"grad_norm": 8.821487426757812e-05,
"learning_rate": 0.003237191504608346,
"loss": 0.0002,
"num_input_tokens_seen": 1158768,
"step": 2555,
"train_runtime": 546.4749,
"train_tokens_per_second": 2120.442
},
{
"epoch": 16.20253164556962,
"grad_norm": 0.004425048828125,
"learning_rate": 0.003185962043438345,
"loss": 0.0002,
"num_input_tokens_seen": 1160912,
"step": 2560,
"train_runtime": 547.4408,
"train_tokens_per_second": 2120.617
},
{
"epoch": 16.234177215189874,
"grad_norm": 3.504753112792969e-05,
"learning_rate": 0.003135092976497134,
"loss": 0.0005,
"num_input_tokens_seen": 1163120,
"step": 2565,
"train_runtime": 548.4131,
"train_tokens_per_second": 2120.883
},
{
"epoch": 16.265822784810126,
"grad_norm": 3.743171691894531e-05,
"learning_rate": 0.003084585855575747,
"loss": 0.0002,
"num_input_tokens_seen": 1165264,
"step": 2570,
"train_runtime": 549.3818,
"train_tokens_per_second": 2121.046
},
{
"epoch": 16.29746835443038,
"grad_norm": 0.000690460205078125,
"learning_rate": 0.0030344422214238454,
"loss": 0.0003,
"num_input_tokens_seen": 1167536,
"step": 2575,
"train_runtime": 550.3587,
"train_tokens_per_second": 2121.409
},
{
"epoch": 16.32911392405063,
"grad_norm": 0.000873565673828125,
"learning_rate": 0.002984663603702693,
"loss": 0.0004,
"num_input_tokens_seen": 1169776,
"step": 2580,
"train_runtime": 551.3355,
"train_tokens_per_second": 2121.714
},
{
"epoch": 16.360759493670887,
"grad_norm": 0.00133514404296875,
"learning_rate": 0.0029352515209385283,
"loss": 0.0003,
"num_input_tokens_seen": 1172176,
"step": 2585,
"train_runtime": 552.3303,
"train_tokens_per_second": 2122.237
},
{
"epoch": 16.39240506329114,
"grad_norm": 0.0012359619140625,
"learning_rate": 0.002886207480476215,
"loss": 0.0004,
"num_input_tokens_seen": 1174384,
"step": 2590,
"train_runtime": 553.307,
"train_tokens_per_second": 2122.482
},
{
"epoch": 16.424050632911392,
"grad_norm": 0.00176239013671875,
"learning_rate": 0.0028375329784332765,
"loss": 0.0003,
"num_input_tokens_seen": 1176752,
"step": 2595,
"train_runtime": 554.3042,
"train_tokens_per_second": 2122.935
},
{
"epoch": 16.455696202531644,
"grad_norm": 0.000614166259765625,
"learning_rate": 0.002789229499654233,
"loss": 0.0003,
"num_input_tokens_seen": 1179024,
"step": 2600,
"train_runtime": 555.2831,
"train_tokens_per_second": 2123.285
},
{
"epoch": 16.4873417721519,
"grad_norm": 0.0028839111328125,
"learning_rate": 0.002741298517665333,
"loss": 0.0003,
"num_input_tokens_seen": 1181328,
"step": 2605,
"train_runtime": 556.2718,
"train_tokens_per_second": 2123.652
},
{
"epoch": 16.518987341772153,
"grad_norm": 0.002044677734375,
"learning_rate": 0.002693741494629585,
"loss": 0.0002,
"num_input_tokens_seen": 1183696,
"step": 2610,
"train_runtime": 557.2677,
"train_tokens_per_second": 2124.107
},
{
"epoch": 16.550632911392405,
"grad_norm": 9.298324584960938e-05,
"learning_rate": 0.002646559881302165,
"loss": 0.0004,
"num_input_tokens_seen": 1186000,
"step": 2615,
"train_runtime": 558.2479,
"train_tokens_per_second": 2124.504
},
{
"epoch": 16.582278481012658,
"grad_norm": 0.0028228759765625,
"learning_rate": 0.0025997551169861365,
"loss": 0.0004,
"num_input_tokens_seen": 1188400,
"step": 2620,
"train_runtime": 559.2532,
"train_tokens_per_second": 2124.977
},
{
"epoch": 16.61392405063291,
"grad_norm": 0.001708984375,
"learning_rate": 0.002553328629488577,
"loss": 0.0003,
"num_input_tokens_seen": 1190640,
"step": 2625,
"train_runtime": 560.2292,
"train_tokens_per_second": 2125.273
},
{
"epoch": 16.645569620253166,
"grad_norm": 0.001251220703125,
"learning_rate": 0.002507281835076998,
"loss": 0.0002,
"num_input_tokens_seen": 1192784,
"step": 2630,
"train_runtime": 561.1965,
"train_tokens_per_second": 2125.43
},
{
"epoch": 16.67721518987342,
"grad_norm": 0.000553131103515625,
"learning_rate": 0.002461616138436155,
"loss": 0.0005,
"num_input_tokens_seen": 1195024,
"step": 2635,
"train_runtime": 562.1739,
"train_tokens_per_second": 2125.719
},
{
"epoch": 16.70886075949367,
"grad_norm": 0.001373291015625,
"learning_rate": 0.0024163329326251774,
"loss": 0.0001,
"num_input_tokens_seen": 1197264,
"step": 2640,
"train_runtime": 563.1522,
"train_tokens_per_second": 2126.004
},
{
"epoch": 16.740506329113924,
"grad_norm": 0.00604248046875,
"learning_rate": 0.002371433599035097,
"loss": 0.0004,
"num_input_tokens_seen": 1199760,
"step": 2645,
"train_runtime": 564.179,
"train_tokens_per_second": 2126.559
},
{
"epoch": 16.772151898734176,
"grad_norm": 0.0001201629638671875,
"learning_rate": 0.0023269195073466957,
"loss": 0.0004,
"num_input_tokens_seen": 1202096,
"step": 2650,
"train_runtime": 565.1712,
"train_tokens_per_second": 2126.959
},
{
"epoch": 16.803797468354432,
"grad_norm": 0.00213623046875,
"learning_rate": 0.0022827920154887132,
"loss": 0.0002,
"num_input_tokens_seen": 1204400,
"step": 2655,
"train_runtime": 566.1635,
"train_tokens_per_second": 2127.301
},
{
"epoch": 16.835443037974684,
"grad_norm": 0.000514984130859375,
"learning_rate": 0.002239052469596439,
"loss": 0.0005,
"num_input_tokens_seen": 1206896,
"step": 2660,
"train_runtime": 567.1759,
"train_tokens_per_second": 2127.904
},
{
"epoch": 16.867088607594937,
"grad_norm": 0.005645751953125,
"learning_rate": 0.0021957022039706454,
"loss": 0.0006,
"num_input_tokens_seen": 1209264,
"step": 2665,
"train_runtime": 568.1622,
"train_tokens_per_second": 2128.378
},
{
"epoch": 16.89873417721519,
"grad_norm": 4.38690185546875e-05,
"learning_rate": 0.002152742541036869,
"loss": 0.0003,
"num_input_tokens_seen": 1211504,
"step": 2670,
"train_runtime": 569.1386,
"train_tokens_per_second": 2128.663
},
{
"epoch": 16.930379746835442,
"grad_norm": 0.004364013671875,
"learning_rate": 0.0021101747913050855,
"loss": 0.0011,
"num_input_tokens_seen": 1213808,
"step": 2675,
"train_runtime": 570.1268,
"train_tokens_per_second": 2129.014
},
{
"epoch": 16.962025316455698,
"grad_norm": 0.00066375732421875,
"learning_rate": 0.0020680002533297274,
"loss": 0.0004,
"num_input_tokens_seen": 1215952,
"step": 2680,
"train_runtime": 571.0938,
"train_tokens_per_second": 2129.163
},
{
"epoch": 16.99367088607595,
"grad_norm": 0.003936767578125,
"learning_rate": 0.002026220213670069,
"loss": 0.0005,
"num_input_tokens_seen": 1218160,
"step": 2685,
"train_runtime": 572.077,
"train_tokens_per_second": 2129.364
},
{
"epoch": 17.0,
"eval_loss": 0.09145190566778183,
"eval_runtime": 1.6789,
"eval_samples_per_second": 41.694,
"eval_steps_per_second": 10.721,
"num_input_tokens_seen": 1218368,
"step": 2686
},
{
"epoch": 17.025316455696203,
"grad_norm": 0.0010223388671875,
"learning_rate": 0.0019848359468509825,
"loss": 0.0001,
"num_input_tokens_seen": 1220256,
"step": 2690,
"train_runtime": 575.6869,
"train_tokens_per_second": 2119.652
},
{
"epoch": 17.056962025316455,
"grad_norm": 0.00274658203125,
"learning_rate": 0.0019438487153240424,
"loss": 0.001,
"num_input_tokens_seen": 1222528,
"step": 2695,
"train_runtime": 576.6737,
"train_tokens_per_second": 2119.965
},
{
"epoch": 17.088607594936708,
"grad_norm": 0.0019683837890625,
"learning_rate": 0.0019032597694290392,
"loss": 0.0001,
"num_input_tokens_seen": 1224768,
"step": 2700,
"train_runtime": 577.6518,
"train_tokens_per_second": 2120.253
},
{
"epoch": 17.120253164556964,
"grad_norm": 4.7206878662109375e-05,
"learning_rate": 0.0018630703473558234,
"loss": 0.0002,
"num_input_tokens_seen": 1227008,
"step": 2705,
"train_runtime": 578.6261,
"train_tokens_per_second": 2120.554
},
{
"epoch": 17.151898734177216,
"grad_norm": 0.003631591796875,
"learning_rate": 0.0018232816751065249,
"loss": 0.0006,
"num_input_tokens_seen": 1229344,
"step": 2710,
"train_runtime": 579.6303,
"train_tokens_per_second": 2120.911
},
{
"epoch": 17.18354430379747,
"grad_norm": 0.0023651123046875,
"learning_rate": 0.0017838949664581742,
"loss": 0.0007,
"num_input_tokens_seen": 1231712,
"step": 2715,
"train_runtime": 580.6264,
"train_tokens_per_second": 2121.35
},
{
"epoch": 17.21518987341772,
"grad_norm": 0.00032806396484375,
"learning_rate": 0.0017449114229256607,
"loss": 0.0005,
"num_input_tokens_seen": 1233984,
"step": 2720,
"train_runtime": 581.604,
"train_tokens_per_second": 2121.691
},
{
"epoch": 17.246835443037973,
"grad_norm": 0.000274658203125,
"learning_rate": 0.0017063322337250713,
"loss": 0.0002,
"num_input_tokens_seen": 1236224,
"step": 2725,
"train_runtime": 582.5946,
"train_tokens_per_second": 2121.928
},
{
"epoch": 17.27848101265823,
"grad_norm": 0.0009307861328125,
"learning_rate": 0.0016681585757374472,
"loss": 0.0001,
"num_input_tokens_seen": 1238528,
"step": 2730,
"train_runtime": 583.5946,
"train_tokens_per_second": 2122.24
},
{
"epoch": 17.310126582278482,
"grad_norm": 4.887580871582031e-05,
"learning_rate": 0.001630391613472837,
"loss": 0.0002,
"num_input_tokens_seen": 1240832,
"step": 2735,
"train_runtime": 584.5843,
"train_tokens_per_second": 2122.589
},
{
"epoch": 17.341772151898734,
"grad_norm": 0.00014400482177734375,
"learning_rate": 0.001593032499034811,
"loss": 0.0002,
"num_input_tokens_seen": 1243104,
"step": 2740,
"train_runtime": 585.562,
"train_tokens_per_second": 2122.925
},
{
"epoch": 17.373417721518987,
"grad_norm": 4.3392181396484375e-05,
"learning_rate": 0.0015560823720852928,
"loss": 0.0002,
"num_input_tokens_seen": 1245376,
"step": 2745,
"train_runtime": 586.5413,
"train_tokens_per_second": 2123.254
},
{
"epoch": 17.40506329113924,
"grad_norm": 4.410743713378906e-05,
"learning_rate": 0.0015195423598097972,
"loss": 0.0004,
"num_input_tokens_seen": 1247648,
"step": 2750,
"train_runtime": 587.5225,
"train_tokens_per_second": 2123.575
},
{
"epoch": 17.436708860759495,
"grad_norm": 4.482269287109375e-05,
"learning_rate": 0.001483413576883057,
"loss": 0.0006,
"num_input_tokens_seen": 1250048,
"step": 2755,
"train_runtime": 588.521,
"train_tokens_per_second": 2124.05
},
{
"epoch": 17.468354430379748,
"grad_norm": 4.839897155761719e-05,
"learning_rate": 0.001447697125435004,
"loss": 0.0006,
"num_input_tokens_seen": 1252352,
"step": 2760,
"train_runtime": 589.5024,
"train_tokens_per_second": 2124.422
},
{
"epoch": 17.5,
"grad_norm": 4.649162292480469e-05,
"learning_rate": 0.0014123940950171508,
"loss": 0.0002,
"num_input_tokens_seen": 1254624,
"step": 2765,
"train_runtime": 590.481,
"train_tokens_per_second": 2124.749
},
{
"epoch": 17.531645569620252,
"grad_norm": 0.0042724609375,
"learning_rate": 0.0013775055625693683,
"loss": 0.0003,
"num_input_tokens_seen": 1256864,
"step": 2770,
"train_runtime": 591.4594,
"train_tokens_per_second": 2125.022
},
{
"epoch": 17.563291139240505,
"grad_norm": 0.005096435546875,
"learning_rate": 0.0013430325923870095,
"loss": 0.0002,
"num_input_tokens_seen": 1259072,
"step": 2775,
"train_runtime": 592.4327,
"train_tokens_per_second": 2125.257
},
{
"epoch": 17.59493670886076,
"grad_norm": 0.0015411376953125,
"learning_rate": 0.0013089762360884538,
"loss": 0.0004,
"num_input_tokens_seen": 1261376,
"step": 2780,
"train_runtime": 593.423,
"train_tokens_per_second": 2125.593
},
{
"epoch": 17.626582278481013,
"grad_norm": 0.00115203857421875,
"learning_rate": 0.0012753375325830413,
"loss": 0.0001,
"num_input_tokens_seen": 1263488,
"step": 2785,
"train_runtime": 594.3873,
"train_tokens_per_second": 2125.698
},
{
"epoch": 17.658227848101266,
"grad_norm": 0.00010967254638671875,
"learning_rate": 0.001242117508039347,
"loss": 0.0008,
"num_input_tokens_seen": 1265824,
"step": 2790,
"train_runtime": 595.3802,
"train_tokens_per_second": 2126.077
},
{
"epoch": 17.689873417721518,
"grad_norm": 0.00023651123046875,
"learning_rate": 0.0012093171758539112,
"loss": 0.0001,
"num_input_tokens_seen": 1268032,
"step": 2795,
"train_runtime": 596.3537,
"train_tokens_per_second": 2126.309
},
{
"epoch": 17.72151898734177,
"grad_norm": 0.007232666015625,
"learning_rate": 0.0011769375366203066,
"loss": 0.0006,
"num_input_tokens_seen": 1270272,
"step": 2800,
"train_runtime": 597.3299,
"train_tokens_per_second": 2126.584
},
{
"epoch": 17.753164556962027,
"grad_norm": 0.00189208984375,
"learning_rate": 0.0011449795780986071,
"loss": 0.0002,
"num_input_tokens_seen": 1272544,
"step": 2805,
"train_runtime": 598.3089,
"train_tokens_per_second": 2126.901
},
{
"epoch": 17.78481012658228,
"grad_norm": 0.00019359588623046875,
"learning_rate": 0.0011134442751852846,
"loss": 0.0003,
"num_input_tokens_seen": 1274880,
"step": 2810,
"train_runtime": 599.2919,
"train_tokens_per_second": 2127.311
},
{
"epoch": 17.81645569620253,
"grad_norm": 0.0004482269287109375,
"learning_rate": 0.0010823325898834395,
"loss": 0.0,
"num_input_tokens_seen": 1277120,
"step": 2815,
"train_runtime": 600.2682,
"train_tokens_per_second": 2127.582
},
{
"epoch": 17.848101265822784,
"grad_norm": 0.003387451171875,
"learning_rate": 0.0010516454712734629,
"loss": 0.0003,
"num_input_tokens_seen": 1279424,
"step": 2820,
"train_runtime": 601.2667,
"train_tokens_per_second": 2127.881
},
{
"epoch": 17.879746835443036,
"grad_norm": 0.00148773193359375,
"learning_rate": 0.0010213838554841027,
"loss": 0.0003,
"num_input_tokens_seen": 1281728,
"step": 2825,
"train_runtime": 602.256,
"train_tokens_per_second": 2128.211
},
{
"epoch": 17.911392405063292,
"grad_norm": 5.698204040527344e-05,
"learning_rate": 0.0009915486656638728,
"loss": 0.0002,
"num_input_tokens_seen": 1283968,
"step": 2830,
"train_runtime": 603.2398,
"train_tokens_per_second": 2128.454
},
{
"epoch": 17.943037974683545,
"grad_norm": 0.00543212890625,
"learning_rate": 0.0009621408119529234,
"loss": 0.0007,
"num_input_tokens_seen": 1286272,
"step": 2835,
"train_runtime": 604.2235,
"train_tokens_per_second": 2128.802
},
{
"epoch": 17.974683544303797,
"grad_norm": 0.000476837158203125,
"learning_rate": 0.0009331611914552607,
"loss": 0.0007,
"num_input_tokens_seen": 1288544,
"step": 2840,
"train_runtime": 605.202,
"train_tokens_per_second": 2129.114
},
{
"epoch": 18.0,
"eval_loss": 0.0919828787446022,
"eval_runtime": 1.6803,
"eval_samples_per_second": 41.66,
"eval_steps_per_second": 10.713,
"num_input_tokens_seen": 1290144,
"step": 2844
},
{
"epoch": 18.00632911392405,
"grad_norm": 0.0001678466796875,
"learning_rate": 0.0009046106882113752,
"loss": 0.0004,
"num_input_tokens_seen": 1290624,
"step": 2845,
"train_runtime": 608.7518,
"train_tokens_per_second": 2120.115
},
{
"epoch": 18.037974683544302,
"grad_norm": 0.0013885498046875,
"learning_rate": 0.000876490173171291,
"loss": 0.0003,
"num_input_tokens_seen": 1292992,
"step": 2850,
"train_runtime": 609.7974,
"train_tokens_per_second": 2120.363
},
{
"epoch": 18.069620253164558,
"grad_norm": 0.003570556640625,
"learning_rate": 0.0008488005041679841,
"loss": 0.0003,
"num_input_tokens_seen": 1295328,
"step": 2855,
"train_runtime": 610.7944,
"train_tokens_per_second": 2120.727
},
{
"epoch": 18.10126582278481,
"grad_norm": 0.00093841552734375,
"learning_rate": 0.0008215425258912096,
"loss": 0.0008,
"num_input_tokens_seen": 1297568,
"step": 2860,
"train_runtime": 611.7703,
"train_tokens_per_second": 2121.005
},
{
"epoch": 18.132911392405063,
"grad_norm": 6.961822509765625e-05,
"learning_rate": 0.0007947170698617595,
"loss": 0.0002,
"num_input_tokens_seen": 1299840,
"step": 2865,
"train_runtime": 612.7479,
"train_tokens_per_second": 2121.329
},
{
"epoch": 18.164556962025316,
"grad_norm": 5.650520324707031e-05,
"learning_rate": 0.0007683249544060571,
"loss": 0.0001,
"num_input_tokens_seen": 1302080,
"step": 2870,
"train_runtime": 613.7222,
"train_tokens_per_second": 2121.611
},
{
"epoch": 18.196202531645568,
"grad_norm": 0.00182342529296875,
"learning_rate": 0.000742366984631227,
"loss": 0.0005,
"num_input_tokens_seen": 1304288,
"step": 2875,
"train_runtime": 614.6948,
"train_tokens_per_second": 2121.846
},
{
"epoch": 18.227848101265824,
"grad_norm": 0.000766754150390625,
"learning_rate": 0.000716843952400522,
"loss": 0.0003,
"num_input_tokens_seen": 1306784,
"step": 2880,
"train_runtime": 615.7196,
"train_tokens_per_second": 2122.369
},
{
"epoch": 18.259493670886076,
"grad_norm": 0.004425048828125,
"learning_rate": 0.0006917566363091609,
"loss": 0.0004,
"num_input_tokens_seen": 1309120,
"step": 2885,
"train_runtime": 616.7115,
"train_tokens_per_second": 2122.743
},
{
"epoch": 18.29113924050633,
"grad_norm": 3.170967102050781e-05,
"learning_rate": 0.000667105801660589,
"loss": 0.0004,
"num_input_tokens_seen": 1311296,
"step": 2890,
"train_runtime": 617.6819,
"train_tokens_per_second": 2122.931
},
{
"epoch": 18.32278481012658,
"grad_norm": 0.0004444122314453125,
"learning_rate": 0.0006428922004431298,
"loss": 0.0001,
"num_input_tokens_seen": 1313568,
"step": 2895,
"train_runtime": 618.6625,
"train_tokens_per_second": 2123.238
},
{
"epoch": 18.354430379746834,
"grad_norm": 0.0010528564453125,
"learning_rate": 0.000619116571307029,
"loss": 0.0001,
"num_input_tokens_seen": 1315776,
"step": 2900,
"train_runtime": 619.6407,
"train_tokens_per_second": 2123.45
},
{
"epoch": 18.38607594936709,
"grad_norm": 0.005950927734375,
"learning_rate": 0.0005957796395419484,
"loss": 0.0006,
"num_input_tokens_seen": 1318080,
"step": 2905,
"train_runtime": 620.6294,
"train_tokens_per_second": 2123.779
},
{
"epoch": 18.417721518987342,
"grad_norm": 0.00469970703125,
"learning_rate": 0.0005728821170548199,
"loss": 0.0002,
"num_input_tokens_seen": 1320320,
"step": 2910,
"train_runtime": 621.6135,
"train_tokens_per_second": 2124.021
},
{
"epoch": 18.449367088607595,
"grad_norm": 5.14984130859375e-05,
"learning_rate": 0.0005504247023481373,
"loss": 0.0002,
"num_input_tokens_seen": 1322528,
"step": 2915,
"train_runtime": 622.5944,
"train_tokens_per_second": 2124.221
},
{
"epoch": 18.481012658227847,
"grad_norm": 0.0037994384765625,
"learning_rate": 0.0005284080804986412,
"loss": 0.0012,
"num_input_tokens_seen": 1324928,
"step": 2920,
"train_runtime": 623.5899,
"train_tokens_per_second": 2124.678
},
{
"epoch": 18.5126582278481,
"grad_norm": 0.00122833251953125,
"learning_rate": 0.0005068329231364282,
"loss": 0.0001,
"num_input_tokens_seen": 1327136,
"step": 2925,
"train_runtime": 624.5635,
"train_tokens_per_second": 2124.902
},
{
"epoch": 18.544303797468356,
"grad_norm": 0.000865936279296875,
"learning_rate": 0.00048569988842446065,
"loss": 0.0006,
"num_input_tokens_seen": 1329408,
"step": 2930,
"train_runtime": 625.5408,
"train_tokens_per_second": 2125.214
},
{
"epoch": 18.575949367088608,
"grad_norm": 0.002044677734375,
"learning_rate": 0.00046500962103848795,
"loss": 0.0005,
"num_input_tokens_seen": 1331680,
"step": 2935,
"train_runtime": 626.5185,
"train_tokens_per_second": 2125.524
},
{
"epoch": 18.60759493670886,
"grad_norm": 8.249282836914062e-05,
"learning_rate": 0.00044476275214737235,
"loss": 0.0001,
"num_input_tokens_seen": 1333856,
"step": 2940,
"train_runtime": 627.4887,
"train_tokens_per_second": 2125.705
},
{
"epoch": 18.639240506329113,
"grad_norm": 0.0025177001953125,
"learning_rate": 0.00042495989939384915,
"loss": 0.0004,
"num_input_tokens_seen": 1336160,
"step": 2945,
"train_runtime": 628.4775,
"train_tokens_per_second": 2126.027
},
{
"epoch": 18.67088607594937,
"grad_norm": 0.003509521484375,
"learning_rate": 0.0004056016668756801,
"loss": 0.0002,
"num_input_tokens_seen": 1338496,
"step": 2950,
"train_runtime": 629.4608,
"train_tokens_per_second": 2126.417
},
{
"epoch": 18.70253164556962,
"grad_norm": 4.7206878662109375e-05,
"learning_rate": 0.00038668864512721667,
"loss": 0.0003,
"num_input_tokens_seen": 1340736,
"step": 2955,
"train_runtime": 630.4347,
"train_tokens_per_second": 2126.685
},
{
"epoch": 18.734177215189874,
"grad_norm": 0.000598907470703125,
"learning_rate": 0.00036822141110139594,
"loss": 0.0004,
"num_input_tokens_seen": 1342976,
"step": 2960,
"train_runtime": 631.4181,
"train_tokens_per_second": 2126.92
},
{
"epoch": 18.765822784810126,
"grad_norm": 0.0005340576171875,
"learning_rate": 0.00035020052815213477,
"loss": 0.0003,
"num_input_tokens_seen": 1345312,
"step": 2965,
"train_runtime": 632.4043,
"train_tokens_per_second": 2127.297
},
{
"epoch": 18.79746835443038,
"grad_norm": 6.437301635742188e-05,
"learning_rate": 0.0003326265460171468,
"loss": 0.0003,
"num_input_tokens_seen": 1347552,
"step": 2970,
"train_runtime": 633.3905,
"train_tokens_per_second": 2127.522
},
{
"epoch": 18.82911392405063,
"grad_norm": 0.0013427734375,
"learning_rate": 0.0003155000008011727,
"loss": 0.0003,
"num_input_tokens_seen": 1349792,
"step": 2975,
"train_runtime": 634.3732,
"train_tokens_per_second": 2127.757
},
{
"epoch": 18.860759493670887,
"grad_norm": 0.004302978515625,
"learning_rate": 0.0002988214149596197,
"loss": 0.0006,
"num_input_tokens_seen": 1352096,
"step": 2980,
"train_runtime": 635.3617,
"train_tokens_per_second": 2128.073
},
{
"epoch": 18.89240506329114,
"grad_norm": 0.00010442733764648438,
"learning_rate": 0.00028259129728263607,
"loss": 0.0006,
"num_input_tokens_seen": 1354400,
"step": 2985,
"train_runtime": 636.3409,
"train_tokens_per_second": 2128.419
},
{
"epoch": 18.924050632911392,
"grad_norm": 0.00167083740234375,
"learning_rate": 0.0002668101428795788,
"loss": 0.0005,
"num_input_tokens_seen": 1356672,
"step": 2990,
"train_runtime": 637.3271,
"train_tokens_per_second": 2128.69
},
{
"epoch": 18.955696202531644,
"grad_norm": 0.0003376007080078125,
"learning_rate": 0.00025147843316391524,
"loss": 0.0001,
"num_input_tokens_seen": 1358944,
"step": 2995,
"train_runtime": 638.3058,
"train_tokens_per_second": 2128.986
},
{
"epoch": 18.9873417721519,
"grad_norm": 8.296966552734375e-05,
"learning_rate": 0.0002365966358385335,
"loss": 0.0004,
"num_input_tokens_seen": 1361312,
"step": 3000,
"train_runtime": 639.3002,
"train_tokens_per_second": 2129.378
},
{
"epoch": 19.0,
"eval_loss": 0.0921000987291336,
"eval_runtime": 1.6825,
"eval_samples_per_second": 41.604,
"eval_steps_per_second": 10.698,
"num_input_tokens_seen": 1361984,
"step": 3002
},
{
"epoch": 19.018987341772153,
"grad_norm": 0.0005035400390625,
"learning_rate": 0.00022216520488148206,
"loss": 0.0001,
"num_input_tokens_seen": 1363328,
"step": 3005,
"train_runtime": 642.8252,
"train_tokens_per_second": 2120.838
},
{
"epoch": 19.050632911392405,
"grad_norm": 0.004241943359375,
"learning_rate": 0.00020818458053211252,
"loss": 0.0008,
"num_input_tokens_seen": 1365600,
"step": 3010,
"train_runtime": 643.8422,
"train_tokens_per_second": 2121.017
},
{
"epoch": 19.082278481012658,
"grad_norm": 0.0032196044921875,
"learning_rate": 0.00019465518927765712,
"loss": 0.0008,
"num_input_tokens_seen": 1368032,
"step": 3015,
"train_runtime": 644.8884,
"train_tokens_per_second": 2121.347
},
{
"epoch": 19.11392405063291,
"grad_norm": 0.0007476806640625,
"learning_rate": 0.00018157744384021234,
"loss": 0.0001,
"num_input_tokens_seen": 1370272,
"step": 3020,
"train_runtime": 645.8738,
"train_tokens_per_second": 2121.579
},
{
"epoch": 19.145569620253166,
"grad_norm": 0.005950927734375,
"learning_rate": 0.00016895174316415405,
"loss": 0.0004,
"num_input_tokens_seen": 1372480,
"step": 3025,
"train_runtime": 646.8474,
"train_tokens_per_second": 2121.799
},
{
"epoch": 19.17721518987342,
"grad_norm": 5.14984130859375e-05,
"learning_rate": 0.0001567784724039589,
"loss": 0.0002,
"num_input_tokens_seen": 1374688,
"step": 3030,
"train_runtime": 647.8217,
"train_tokens_per_second": 2122.016
},
{
"epoch": 19.20886075949367,
"grad_norm": 0.0020294189453125,
"learning_rate": 0.00014505800291247207,
"loss": 0.0005,
"num_input_tokens_seen": 1376960,
"step": 3035,
"train_runtime": 648.7999,
"train_tokens_per_second": 2122.319
},
{
"epoch": 19.240506329113924,
"grad_norm": 0.000232696533203125,
"learning_rate": 0.00013379069222955618,
"loss": 0.0008,
"num_input_tokens_seen": 1379232,
"step": 3040,
"train_runtime": 649.7776,
"train_tokens_per_second": 2122.622
},
{
"epoch": 19.272151898734176,
"grad_norm": 0.01190185546875,
"learning_rate": 0.00012297688407120032,
"loss": 0.0011,
"num_input_tokens_seen": 1381632,
"step": 3045,
"train_runtime": 650.7744,
"train_tokens_per_second": 2123.058
},
{
"epoch": 19.303797468354432,
"grad_norm": 0.000354766845703125,
"learning_rate": 0.00011261690831903481,
"loss": 0.0003,
"num_input_tokens_seen": 1383808,
"step": 3050,
"train_runtime": 651.7447,
"train_tokens_per_second": 2123.236
},
{
"epoch": 19.335443037974684,
"grad_norm": 0.00099945068359375,
"learning_rate": 0.00010271108101025439,
"loss": 0.0001,
"num_input_tokens_seen": 1386080,
"step": 3055,
"train_runtime": 652.7319,
"train_tokens_per_second": 2123.506
},
{
"epoch": 19.367088607594937,
"grad_norm": 4.172325134277344e-05,
"learning_rate": 9.325970432799424e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1388288,
"step": 3060,
"train_runtime": 653.705,
"train_tokens_per_second": 2123.722
},
{
"epoch": 19.39873417721519,
"grad_norm": 0.006439208984375,
"learning_rate": 8.426306659209903e-05,
"loss": 0.0009,
"num_input_tokens_seen": 1390560,
"step": 3065,
"train_runtime": 654.6916,
"train_tokens_per_second": 2123.993
},
{
"epoch": 19.430379746835442,
"grad_norm": 0.003448486328125,
"learning_rate": 7.572144225033495e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1392864,
"step": 3070,
"train_runtime": 655.6816,
"train_tokens_per_second": 2124.299
},
{
"epoch": 19.462025316455698,
"grad_norm": 0.0002994537353515625,
"learning_rate": 6.76350918700147e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1395040,
"step": 3075,
"train_runtime": 656.6509,
"train_tokens_per_second": 2124.477
},
{
"epoch": 19.49367088607595,
"grad_norm": 0.0013580322265625,
"learning_rate": 6.0004262130048946e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1397344,
"step": 3080,
"train_runtime": 657.6314,
"train_tokens_per_second": 2124.813
},
{
"epoch": 19.525316455696203,
"grad_norm": 3.743171691894531e-05,
"learning_rate": 5.282918581341889e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1399520,
"step": 3085,
"train_runtime": 658.6027,
"train_tokens_per_second": 2124.984
},
{
"epoch": 19.556962025316455,
"grad_norm": 0.00010538101196289062,
"learning_rate": 4.6110081800082025e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1401728,
"step": 3090,
"train_runtime": 659.5798,
"train_tokens_per_second": 2125.183
},
{
"epoch": 19.588607594936708,
"grad_norm": 0.0032501220703125,
"learning_rate": 3.98471550602858e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1404032,
"step": 3095,
"train_runtime": 660.5686,
"train_tokens_per_second": 2125.49
},
{
"epoch": 19.620253164556964,
"grad_norm": 9.441375732421875e-05,
"learning_rate": 3.404059664832259e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1406240,
"step": 3100,
"train_runtime": 661.5418,
"train_tokens_per_second": 2125.701
},
{
"epoch": 19.651898734177216,
"grad_norm": 0.007568359375,
"learning_rate": 2.869058369669941e-05,
"loss": 0.0005,
"num_input_tokens_seen": 1408640,
"step": 3105,
"train_runtime": 662.548,
"train_tokens_per_second": 2126.095
},
{
"epoch": 19.68354430379747,
"grad_norm": 0.0057373046875,
"learning_rate": 2.3797279410728844e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1410944,
"step": 3110,
"train_runtime": 663.5279,
"train_tokens_per_second": 2126.427
},
{
"epoch": 19.71518987341772,
"grad_norm": 0.0002593994140625,
"learning_rate": 1.9360833063559732e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1413056,
"step": 3115,
"train_runtime": 664.4925,
"train_tokens_per_second": 2126.519
},
{
"epoch": 19.746835443037973,
"grad_norm": 0.00299072265625,
"learning_rate": 1.5381379991615817e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1415360,
"step": 3120,
"train_runtime": 665.4727,
"train_tokens_per_second": 2126.849
},
{
"epoch": 19.77848101265823,
"grad_norm": 0.000598907470703125,
"learning_rate": 1.1859041590472351e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1417600,
"step": 3125,
"train_runtime": 666.4479,
"train_tokens_per_second": 2127.098
},
{
"epoch": 19.810126582278482,
"grad_norm": 0.00103759765625,
"learning_rate": 8.793925311149087e-06,
"loss": 0.0005,
"num_input_tokens_seen": 1419904,
"step": 3130,
"train_runtime": 667.4289,
"train_tokens_per_second": 2127.424
},
{
"epoch": 19.841772151898734,
"grad_norm": 6.866455078125e-05,
"learning_rate": 6.18612465683288e-06,
"loss": 0.0001,
"num_input_tokens_seen": 1422176,
"step": 3135,
"train_runtime": 668.4162,
"train_tokens_per_second": 2127.68
},
{
"epoch": 19.873417721518987,
"grad_norm": 0.00011920928955078125,
"learning_rate": 4.035719180031649e-06,
"loss": 0.0003,
"num_input_tokens_seen": 1424608,
"step": 3140,
"train_runtime": 669.4247,
"train_tokens_per_second": 2128.108
},
{
"epoch": 19.90506329113924,
"grad_norm": 0.0012969970703125,
"learning_rate": 2.3427744801363113e-06,
"loss": 0.0001,
"num_input_tokens_seen": 1426912,
"step": 3145,
"train_runtime": 670.4043,
"train_tokens_per_second": 2128.435
},
{
"epoch": 19.936708860759495,
"grad_norm": 0.000553131103515625,
"learning_rate": 1.107342201427386e-06,
"loss": 0.0003,
"num_input_tokens_seen": 1429248,
"step": 3150,
"train_runtime": 671.3956,
"train_tokens_per_second": 2128.772
},
{
"epoch": 19.968354430379748,
"grad_norm": 3.0517578125e-05,
"learning_rate": 3.294600315012497e-07,
"loss": 0.0004,
"num_input_tokens_seen": 1431552,
"step": 3155,
"train_runtime": 672.3857,
"train_tokens_per_second": 2129.064
},
{
"epoch": 20.0,
"grad_norm": 0.00592041015625,
"learning_rate": 9.151700112730588e-09,
"loss": 0.0003,
"num_input_tokens_seen": 1433520,
"step": 3160,
"train_runtime": 673.523,
"train_tokens_per_second": 2128.391
},
{
"epoch": 20.0,
"eval_loss": 0.09257736802101135,
"eval_runtime": 1.6798,
"eval_samples_per_second": 41.672,
"eval_steps_per_second": 10.716,
"num_input_tokens_seen": 1433520,
"step": 3160
},
{
"epoch": 20.0,
"num_input_tokens_seen": 1433520,
"step": 3160,
"total_flos": 6.455075769483264e+16,
"train_loss": 0.07119393949640976,
"train_runtime": 676.0485,
"train_samples_per_second": 18.638,
"train_steps_per_second": 4.674
}
],
"logging_steps": 5,
"max_steps": 3160,
"num_input_tokens_seen": 1433520,
"num_train_epochs": 20,
"save_steps": 158,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.455075769483264e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}