{
"best_global_step": 720,
"best_metric": 0.23617739975452423,
"best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_copa_1757340251/checkpoint-720",
"epoch": 20.0,
"eval_steps": 180,
"global_step": 3600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.027777777777777776,
"grad_norm": 127.00370788574219,
"learning_rate": 5.555555555555556e-07,
"loss": 8.3103,
"num_input_tokens_seen": 784,
"step": 5
},
{
"epoch": 0.05555555555555555,
"grad_norm": 122.15016174316406,
"learning_rate": 1.25e-06,
"loss": 7.8191,
"num_input_tokens_seen": 1552,
"step": 10
},
{
"epoch": 0.08333333333333333,
"grad_norm": 112.4898452758789,
"learning_rate": 1.9444444444444444e-06,
"loss": 6.8534,
"num_input_tokens_seen": 2320,
"step": 15
},
{
"epoch": 0.1111111111111111,
"grad_norm": 148.39366149902344,
"learning_rate": 2.638888888888889e-06,
"loss": 5.5534,
"num_input_tokens_seen": 3088,
"step": 20
},
{
"epoch": 0.1388888888888889,
"grad_norm": 87.2863998413086,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.9203,
"num_input_tokens_seen": 3856,
"step": 25
},
{
"epoch": 0.16666666666666666,
"grad_norm": 81.75582885742188,
"learning_rate": 4.027777777777779e-06,
"loss": 2.4315,
"num_input_tokens_seen": 4608,
"step": 30
},
{
"epoch": 0.19444444444444445,
"grad_norm": 42.17738723754883,
"learning_rate": 4.722222222222222e-06,
"loss": 1.3174,
"num_input_tokens_seen": 5360,
"step": 35
},
{
"epoch": 0.2222222222222222,
"grad_norm": 41.45945739746094,
"learning_rate": 5.416666666666667e-06,
"loss": 0.6433,
"num_input_tokens_seen": 6112,
"step": 40
},
{
"epoch": 0.25,
"grad_norm": 117.64752197265625,
"learning_rate": 6.111111111111111e-06,
"loss": 0.4838,
"num_input_tokens_seen": 6864,
"step": 45
},
{
"epoch": 0.2777777777777778,
"grad_norm": 78.57173156738281,
"learning_rate": 6.805555555555556e-06,
"loss": 0.4224,
"num_input_tokens_seen": 7616,
"step": 50
},
{
"epoch": 0.3055555555555556,
"grad_norm": 54.80699157714844,
"learning_rate": 7.5e-06,
"loss": 0.3504,
"num_input_tokens_seen": 8400,
"step": 55
},
{
"epoch": 0.3333333333333333,
"grad_norm": 18.32509422302246,
"learning_rate": 8.194444444444445e-06,
"loss": 0.3481,
"num_input_tokens_seen": 9152,
"step": 60
},
{
"epoch": 0.3611111111111111,
"grad_norm": 57.70254135131836,
"learning_rate": 8.88888888888889e-06,
"loss": 0.3422,
"num_input_tokens_seen": 9952,
"step": 65
},
{
"epoch": 0.3888888888888889,
"grad_norm": 17.409032821655273,
"learning_rate": 9.583333333333334e-06,
"loss": 0.2373,
"num_input_tokens_seen": 10720,
"step": 70
},
{
"epoch": 0.4166666666666667,
"grad_norm": 27.374752044677734,
"learning_rate": 1.0277777777777777e-05,
"loss": 0.311,
"num_input_tokens_seen": 11488,
"step": 75
},
{
"epoch": 0.4444444444444444,
"grad_norm": 7.99110746383667,
"learning_rate": 1.0972222222222223e-05,
"loss": 0.2879,
"num_input_tokens_seen": 12272,
"step": 80
},
{
"epoch": 0.4722222222222222,
"grad_norm": 32.56344985961914,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.2769,
"num_input_tokens_seen": 13040,
"step": 85
},
{
"epoch": 0.5,
"grad_norm": 24.26778221130371,
"learning_rate": 1.2361111111111112e-05,
"loss": 0.4031,
"num_input_tokens_seen": 13792,
"step": 90
},
{
"epoch": 0.5277777777777778,
"grad_norm": 13.13208293914795,
"learning_rate": 1.3055555555555557e-05,
"loss": 0.3413,
"num_input_tokens_seen": 14544,
"step": 95
},
{
"epoch": 0.5555555555555556,
"grad_norm": 14.814593315124512,
"learning_rate": 1.3750000000000002e-05,
"loss": 0.2835,
"num_input_tokens_seen": 15328,
"step": 100
},
{
"epoch": 0.5833333333333334,
"grad_norm": 2.4372589588165283,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.2891,
"num_input_tokens_seen": 16064,
"step": 105
},
{
"epoch": 0.6111111111111112,
"grad_norm": 3.005253314971924,
"learning_rate": 1.5138888888888888e-05,
"loss": 0.2217,
"num_input_tokens_seen": 16832,
"step": 110
},
{
"epoch": 0.6388888888888888,
"grad_norm": 8.54447078704834,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.2413,
"num_input_tokens_seen": 17616,
"step": 115
},
{
"epoch": 0.6666666666666666,
"grad_norm": 17.670551300048828,
"learning_rate": 1.6527777777777777e-05,
"loss": 0.2773,
"num_input_tokens_seen": 18368,
"step": 120
},
{
"epoch": 0.6944444444444444,
"grad_norm": 3.102203607559204,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.2652,
"num_input_tokens_seen": 19120,
"step": 125
},
{
"epoch": 0.7222222222222222,
"grad_norm": 7.354095935821533,
"learning_rate": 1.7916666666666667e-05,
"loss": 0.1998,
"num_input_tokens_seen": 19872,
"step": 130
},
{
"epoch": 0.75,
"grad_norm": 9.806069374084473,
"learning_rate": 1.861111111111111e-05,
"loss": 0.3866,
"num_input_tokens_seen": 20608,
"step": 135
},
{
"epoch": 0.7777777777777778,
"grad_norm": 5.3900299072265625,
"learning_rate": 1.9305555555555558e-05,
"loss": 0.3152,
"num_input_tokens_seen": 21376,
"step": 140
},
{
"epoch": 0.8055555555555556,
"grad_norm": 7.56992769241333,
"learning_rate": 2e-05,
"loss": 0.2778,
"num_input_tokens_seen": 22080,
"step": 145
},
{
"epoch": 0.8333333333333334,
"grad_norm": 2.2404680252075195,
"learning_rate": 2.0694444444444445e-05,
"loss": 0.2645,
"num_input_tokens_seen": 22880,
"step": 150
},
{
"epoch": 0.8611111111111112,
"grad_norm": 12.300857543945312,
"learning_rate": 2.138888888888889e-05,
"loss": 0.2734,
"num_input_tokens_seen": 23664,
"step": 155
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.1452852487564087,
"learning_rate": 2.2083333333333333e-05,
"loss": 0.2669,
"num_input_tokens_seen": 24416,
"step": 160
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.9725009799003601,
"learning_rate": 2.277777777777778e-05,
"loss": 0.2419,
"num_input_tokens_seen": 25200,
"step": 165
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.5348834991455078,
"learning_rate": 2.3472222222222223e-05,
"loss": 0.2372,
"num_input_tokens_seen": 25936,
"step": 170
},
{
"epoch": 0.9722222222222222,
"grad_norm": 3.6036555767059326,
"learning_rate": 2.4166666666666667e-05,
"loss": 0.1935,
"num_input_tokens_seen": 26672,
"step": 175
},
{
"epoch": 1.0,
"grad_norm": 6.211343288421631,
"learning_rate": 2.4861111111111114e-05,
"loss": 0.5261,
"num_input_tokens_seen": 27424,
"step": 180
},
{
"epoch": 1.0,
"eval_loss": 0.26661795377731323,
"eval_runtime": 0.86,
"eval_samples_per_second": 46.512,
"eval_steps_per_second": 23.256,
"num_input_tokens_seen": 27424,
"step": 180
},
{
"epoch": 1.0277777777777777,
"grad_norm": 2.1466715335845947,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.2197,
"num_input_tokens_seen": 28240,
"step": 185
},
{
"epoch": 1.0555555555555556,
"grad_norm": 1.111944317817688,
"learning_rate": 2.625e-05,
"loss": 0.264,
"num_input_tokens_seen": 28976,
"step": 190
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.7908123135566711,
"learning_rate": 2.6944444444444445e-05,
"loss": 0.235,
"num_input_tokens_seen": 29776,
"step": 195
},
{
"epoch": 1.1111111111111112,
"grad_norm": 8.314372062683105,
"learning_rate": 2.7638888888888892e-05,
"loss": 0.2339,
"num_input_tokens_seen": 30496,
"step": 200
},
{
"epoch": 1.1388888888888888,
"grad_norm": 2.806135416030884,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.1747,
"num_input_tokens_seen": 31216,
"step": 205
},
{
"epoch": 1.1666666666666667,
"grad_norm": 8.267914772033691,
"learning_rate": 2.9027777777777782e-05,
"loss": 0.4313,
"num_input_tokens_seen": 31984,
"step": 210
},
{
"epoch": 1.1944444444444444,
"grad_norm": 2.079704999923706,
"learning_rate": 2.9722222222222223e-05,
"loss": 0.1819,
"num_input_tokens_seen": 32704,
"step": 215
},
{
"epoch": 1.2222222222222223,
"grad_norm": 2.2734806537628174,
"learning_rate": 3.0416666666666666e-05,
"loss": 0.3006,
"num_input_tokens_seen": 33440,
"step": 220
},
{
"epoch": 1.25,
"grad_norm": 0.48634809255599976,
"learning_rate": 3.111111111111111e-05,
"loss": 0.2362,
"num_input_tokens_seen": 34208,
"step": 225
},
{
"epoch": 1.2777777777777777,
"grad_norm": 1.6364030838012695,
"learning_rate": 3.180555555555556e-05,
"loss": 0.2127,
"num_input_tokens_seen": 34960,
"step": 230
},
{
"epoch": 1.3055555555555556,
"grad_norm": 1.138373851776123,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.2249,
"num_input_tokens_seen": 35728,
"step": 235
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.3072881698608398,
"learning_rate": 3.3194444444444444e-05,
"loss": 0.248,
"num_input_tokens_seen": 36464,
"step": 240
},
{
"epoch": 1.3611111111111112,
"grad_norm": 2.485381603240967,
"learning_rate": 3.388888888888889e-05,
"loss": 0.3014,
"num_input_tokens_seen": 37264,
"step": 245
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.2830923795700073,
"learning_rate": 3.458333333333333e-05,
"loss": 0.2377,
"num_input_tokens_seen": 38000,
"step": 250
},
{
"epoch": 1.4166666666666667,
"grad_norm": 1.9528510570526123,
"learning_rate": 3.527777777777778e-05,
"loss": 0.2443,
"num_input_tokens_seen": 38752,
"step": 255
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.8921493887901306,
"learning_rate": 3.5972222222222225e-05,
"loss": 0.2766,
"num_input_tokens_seen": 39520,
"step": 260
},
{
"epoch": 1.4722222222222223,
"grad_norm": 0.5744603872299194,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.2485,
"num_input_tokens_seen": 40288,
"step": 265
},
{
"epoch": 1.5,
"grad_norm": 4.022632122039795,
"learning_rate": 3.736111111111111e-05,
"loss": 0.2624,
"num_input_tokens_seen": 41056,
"step": 270
},
{
"epoch": 1.5277777777777777,
"grad_norm": 2.419853687286377,
"learning_rate": 3.805555555555555e-05,
"loss": 0.2496,
"num_input_tokens_seen": 41808,
"step": 275
},
{
"epoch": 1.5555555555555556,
"grad_norm": 1.003553867340088,
"learning_rate": 3.875e-05,
"loss": 0.2213,
"num_input_tokens_seen": 42608,
"step": 280
},
{
"epoch": 1.5833333333333335,
"grad_norm": 1.9896985292434692,
"learning_rate": 3.944444444444445e-05,
"loss": 0.2944,
"num_input_tokens_seen": 43344,
"step": 285
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.9683464169502258,
"learning_rate": 4.0138888888888894e-05,
"loss": 0.2175,
"num_input_tokens_seen": 44096,
"step": 290
},
{
"epoch": 1.6388888888888888,
"grad_norm": 4.231605529785156,
"learning_rate": 4.0833333333333334e-05,
"loss": 0.2541,
"num_input_tokens_seen": 44832,
"step": 295
},
{
"epoch": 1.6666666666666665,
"grad_norm": 3.3105416297912598,
"learning_rate": 4.152777777777778e-05,
"loss": 0.3017,
"num_input_tokens_seen": 45568,
"step": 300
},
{
"epoch": 1.6944444444444444,
"grad_norm": 5.141078472137451,
"learning_rate": 4.222222222222222e-05,
"loss": 0.3072,
"num_input_tokens_seen": 46336,
"step": 305
},
{
"epoch": 1.7222222222222223,
"grad_norm": 6.590628147125244,
"learning_rate": 4.291666666666667e-05,
"loss": 0.2321,
"num_input_tokens_seen": 47104,
"step": 310
},
{
"epoch": 1.75,
"grad_norm": 1.4629476070404053,
"learning_rate": 4.3611111111111116e-05,
"loss": 0.3229,
"num_input_tokens_seen": 47888,
"step": 315
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.34415674209594727,
"learning_rate": 4.4305555555555556e-05,
"loss": 0.2341,
"num_input_tokens_seen": 48672,
"step": 320
},
{
"epoch": 1.8055555555555556,
"grad_norm": 1.390702724456787,
"learning_rate": 4.5e-05,
"loss": 0.241,
"num_input_tokens_seen": 49440,
"step": 325
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.3445907533168793,
"learning_rate": 4.569444444444444e-05,
"loss": 0.2414,
"num_input_tokens_seen": 50192,
"step": 330
},
{
"epoch": 1.8611111111111112,
"grad_norm": 0.7393403649330139,
"learning_rate": 4.638888888888889e-05,
"loss": 0.232,
"num_input_tokens_seen": 50992,
"step": 335
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.7771939635276794,
"learning_rate": 4.708333333333334e-05,
"loss": 0.2534,
"num_input_tokens_seen": 51744,
"step": 340
},
{
"epoch": 1.9166666666666665,
"grad_norm": 0.13477805256843567,
"learning_rate": 4.7777777777777784e-05,
"loss": 0.2324,
"num_input_tokens_seen": 52512,
"step": 345
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.6720524430274963,
"learning_rate": 4.8472222222222224e-05,
"loss": 0.2067,
"num_input_tokens_seen": 53296,
"step": 350
},
{
"epoch": 1.9722222222222223,
"grad_norm": 0.5370499491691589,
"learning_rate": 4.9166666666666665e-05,
"loss": 0.173,
"num_input_tokens_seen": 54080,
"step": 355
},
{
"epoch": 2.0,
"grad_norm": 2.276262044906616,
"learning_rate": 4.986111111111111e-05,
"loss": 0.4265,
"num_input_tokens_seen": 54832,
"step": 360
},
{
"epoch": 2.0,
"eval_loss": 0.25172701478004456,
"eval_runtime": 0.8641,
"eval_samples_per_second": 46.29,
"eval_steps_per_second": 23.145,
"num_input_tokens_seen": 54832,
"step": 360
},
{
"epoch": 2.0277777777777777,
"grad_norm": 0.8999596238136292,
"learning_rate": 4.99998119647914e-05,
"loss": 0.2602,
"num_input_tokens_seen": 55600,
"step": 365
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.34948647022247314,
"learning_rate": 4.999904807660428e-05,
"loss": 0.2329,
"num_input_tokens_seen": 56352,
"step": 370
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.5982630252838135,
"learning_rate": 4.999769660117901e-05,
"loss": 0.264,
"num_input_tokens_seen": 57136,
"step": 375
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.5527498722076416,
"learning_rate": 4.999575757028119e-05,
"loss": 0.227,
"num_input_tokens_seen": 57856,
"step": 380
},
{
"epoch": 2.138888888888889,
"grad_norm": 0.48654255270957947,
"learning_rate": 4.9993231029486544e-05,
"loss": 0.2217,
"num_input_tokens_seen": 58592,
"step": 385
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.8106479048728943,
"learning_rate": 4.999011703817986e-05,
"loss": 0.2329,
"num_input_tokens_seen": 59328,
"step": 390
},
{
"epoch": 2.1944444444444446,
"grad_norm": 0.26983633637428284,
"learning_rate": 4.9986415669553586e-05,
"loss": 0.2497,
"num_input_tokens_seen": 60064,
"step": 395
},
{
"epoch": 2.2222222222222223,
"grad_norm": 3.985640287399292,
"learning_rate": 4.998212701060612e-05,
"loss": 0.2642,
"num_input_tokens_seen": 60800,
"step": 400
},
{
"epoch": 2.25,
"grad_norm": 0.9668533205986023,
"learning_rate": 4.997725116213973e-05,
"loss": 0.2568,
"num_input_tokens_seen": 61536,
"step": 405
},
{
"epoch": 2.2777777777777777,
"grad_norm": 1.4461299180984497,
"learning_rate": 4.997178823875826e-05,
"loss": 0.2494,
"num_input_tokens_seen": 62304,
"step": 410
},
{
"epoch": 2.3055555555555554,
"grad_norm": 0.20660564303398132,
"learning_rate": 4.996573836886435e-05,
"loss": 0.2262,
"num_input_tokens_seen": 63088,
"step": 415
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.8261621594429016,
"learning_rate": 4.995910169465646e-05,
"loss": 0.2546,
"num_input_tokens_seen": 63824,
"step": 420
},
{
"epoch": 2.361111111111111,
"grad_norm": 0.1395564079284668,
"learning_rate": 4.9951878372125547e-05,
"loss": 0.2324,
"num_input_tokens_seen": 64608,
"step": 425
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.7593284249305725,
"learning_rate": 4.994406857105136e-05,
"loss": 0.2425,
"num_input_tokens_seen": 65360,
"step": 430
},
{
"epoch": 2.4166666666666665,
"grad_norm": 0.5143575072288513,
"learning_rate": 4.993567247499845e-05,
"loss": 0.243,
"num_input_tokens_seen": 66144,
"step": 435
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.6070240139961243,
"learning_rate": 4.9926690281311904e-05,
"loss": 0.2422,
"num_input_tokens_seen": 66896,
"step": 440
},
{
"epoch": 2.4722222222222223,
"grad_norm": 0.119643434882164,
"learning_rate": 4.9917122201112656e-05,
"loss": 0.2333,
"num_input_tokens_seen": 67648,
"step": 445
},
{
"epoch": 2.5,
"grad_norm": 0.589148223400116,
"learning_rate": 4.9906968459292524e-05,
"loss": 0.2297,
"num_input_tokens_seen": 68416,
"step": 450
},
{
"epoch": 2.5277777777777777,
"grad_norm": 0.4487152695655823,
"learning_rate": 4.9896229294508976e-05,
"loss": 0.2168,
"num_input_tokens_seen": 69184,
"step": 455
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.43815767765045166,
"learning_rate": 4.988490495917947e-05,
"loss": 0.2522,
"num_input_tokens_seen": 69984,
"step": 460
},
{
"epoch": 2.5833333333333335,
"grad_norm": 0.4412115216255188,
"learning_rate": 4.987299571947553e-05,
"loss": 0.2473,
"num_input_tokens_seen": 70752,
"step": 465
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.0880437046289444,
"learning_rate": 4.9860501855316514e-05,
"loss": 0.2256,
"num_input_tokens_seen": 71504,
"step": 470
},
{
"epoch": 2.638888888888889,
"grad_norm": 0.30497997999191284,
"learning_rate": 4.9847423660363e-05,
"loss": 0.1775,
"num_input_tokens_seen": 72224,
"step": 475
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.5846831798553467,
"learning_rate": 4.983376144200992e-05,
"loss": 0.2647,
"num_input_tokens_seen": 72976,
"step": 480
},
{
"epoch": 2.6944444444444446,
"grad_norm": 0.2081085443496704,
"learning_rate": 4.981951552137929e-05,
"loss": 0.3079,
"num_input_tokens_seen": 73728,
"step": 485
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.12143510580062866,
"learning_rate": 4.980468623331273e-05,
"loss": 0.237,
"num_input_tokens_seen": 74496,
"step": 490
},
{
"epoch": 2.75,
"grad_norm": 0.0849996879696846,
"learning_rate": 4.978927392636351e-05,
"loss": 0.2329,
"num_input_tokens_seen": 75280,
"step": 495
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.11367756128311157,
"learning_rate": 4.9773278962788436e-05,
"loss": 0.2298,
"num_input_tokens_seen": 76048,
"step": 500
},
{
"epoch": 2.8055555555555554,
"grad_norm": 0.08262645453214645,
"learning_rate": 4.975670171853926e-05,
"loss": 0.24,
"num_input_tokens_seen": 76832,
"step": 505
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.09589759260416031,
"learning_rate": 4.973954258325392e-05,
"loss": 0.2383,
"num_input_tokens_seen": 77568,
"step": 510
},
{
"epoch": 2.861111111111111,
"grad_norm": 0.12209156900644302,
"learning_rate": 4.972180196024733e-05,
"loss": 0.2298,
"num_input_tokens_seen": 78352,
"step": 515
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.27318504452705383,
"learning_rate": 4.97034802665019e-05,
"loss": 0.2255,
"num_input_tokens_seen": 79152,
"step": 520
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.5788432955741882,
"learning_rate": 4.9684577932657786e-05,
"loss": 0.2326,
"num_input_tokens_seen": 79904,
"step": 525
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.4293730556964874,
"learning_rate": 4.966509540300269e-05,
"loss": 0.269,
"num_input_tokens_seen": 80624,
"step": 530
},
{
"epoch": 2.9722222222222223,
"grad_norm": 0.09981006383895874,
"learning_rate": 4.9645033135461494e-05,
"loss": 0.2429,
"num_input_tokens_seen": 81376,
"step": 535
},
{
"epoch": 3.0,
"grad_norm": 0.46701985597610474,
"learning_rate": 4.962439160158544e-05,
"loss": 0.2294,
"num_input_tokens_seen": 82160,
"step": 540
},
{
"epoch": 3.0,
"eval_loss": 0.23996683955192566,
"eval_runtime": 0.8775,
"eval_samples_per_second": 45.586,
"eval_steps_per_second": 22.793,
"num_input_tokens_seen": 82160,
"step": 540
},
{
"epoch": 3.0277777777777777,
"grad_norm": 0.09257233142852783,
"learning_rate": 4.960317128654108e-05,
"loss": 0.2281,
"num_input_tokens_seen": 82880,
"step": 545
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.19769765436649323,
"learning_rate": 4.958137268909887e-05,
"loss": 0.1958,
"num_input_tokens_seen": 83632,
"step": 550
},
{
"epoch": 3.0833333333333335,
"grad_norm": 0.11292538791894913,
"learning_rate": 4.9558996321621405e-05,
"loss": 0.2951,
"num_input_tokens_seen": 84416,
"step": 555
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.30781084299087524,
"learning_rate": 4.953604271005144e-05,
"loss": 0.2366,
"num_input_tokens_seen": 85184,
"step": 560
},
{
"epoch": 3.138888888888889,
"grad_norm": 0.363540917634964,
"learning_rate": 4.951251239389948e-05,
"loss": 0.2303,
"num_input_tokens_seen": 85936,
"step": 565
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.053299520164728165,
"learning_rate": 4.9488405926231144e-05,
"loss": 0.2306,
"num_input_tokens_seen": 86688,
"step": 570
},
{
"epoch": 3.1944444444444446,
"grad_norm": 0.030355053022503853,
"learning_rate": 4.946372387365409e-05,
"loss": 0.2238,
"num_input_tokens_seen": 87424,
"step": 575
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.057436492294073105,
"learning_rate": 4.943846681630479e-05,
"loss": 0.214,
"num_input_tokens_seen": 88192,
"step": 580
},
{
"epoch": 3.25,
"grad_norm": 0.23630383610725403,
"learning_rate": 4.941263534783482e-05,
"loss": 0.2115,
"num_input_tokens_seen": 88976,
"step": 585
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.092474564909935,
"learning_rate": 4.9386230075396964e-05,
"loss": 0.2541,
"num_input_tokens_seen": 89744,
"step": 590
},
{
"epoch": 3.3055555555555554,
"grad_norm": 0.06676716357469559,
"learning_rate": 4.9359251619630886e-05,
"loss": 0.2639,
"num_input_tokens_seen": 90496,
"step": 595
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.2717723250389099,
"learning_rate": 4.933170061464858e-05,
"loss": 0.2006,
"num_input_tokens_seen": 91280,
"step": 600
},
{
"epoch": 3.361111111111111,
"grad_norm": 0.05707673728466034,
"learning_rate": 4.930357770801947e-05,
"loss": 0.225,
"num_input_tokens_seen": 92048,
"step": 605
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.0571831539273262,
"learning_rate": 4.9274883560755156e-05,
"loss": 0.2418,
"num_input_tokens_seen": 92800,
"step": 610
},
{
"epoch": 3.4166666666666665,
"grad_norm": 0.26747772097587585,
"learning_rate": 4.924561884729391e-05,
"loss": 0.24,
"num_input_tokens_seen": 93568,
"step": 615
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.026865195482969284,
"learning_rate": 4.921578425548482e-05,
"loss": 0.234,
"num_input_tokens_seen": 94304,
"step": 620
},
{
"epoch": 3.4722222222222223,
"grad_norm": 0.3754294216632843,
"learning_rate": 4.9185380486571595e-05,
"loss": 0.2314,
"num_input_tokens_seen": 95056,
"step": 625
},
{
"epoch": 3.5,
"grad_norm": 0.11914645880460739,
"learning_rate": 4.915440825517612e-05,
"loss": 0.2524,
"num_input_tokens_seen": 95856,
"step": 630
},
{
"epoch": 3.5277777777777777,
"grad_norm": 0.361465185880661,
"learning_rate": 4.912286828928162e-05,
"loss": 0.254,
"num_input_tokens_seen": 96608,
"step": 635
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.2721046507358551,
"learning_rate": 4.909076133021557e-05,
"loss": 0.2339,
"num_input_tokens_seen": 97344,
"step": 640
},
{
"epoch": 3.5833333333333335,
"grad_norm": 0.2542001008987427,
"learning_rate": 4.9058088132632306e-05,
"loss": 0.224,
"num_input_tokens_seen": 98096,
"step": 645
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.2439340353012085,
"learning_rate": 4.9024849464495215e-05,
"loss": 0.2386,
"num_input_tokens_seen": 98864,
"step": 650
},
{
"epoch": 3.638888888888889,
"grad_norm": 0.02642286755144596,
"learning_rate": 4.8991046107058735e-05,
"loss": 0.2301,
"num_input_tokens_seen": 99632,
"step": 655
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.04620293900370598,
"learning_rate": 4.895667885484997e-05,
"loss": 0.2337,
"num_input_tokens_seen": 100400,
"step": 660
},
{
"epoch": 3.6944444444444446,
"grad_norm": 0.042576540261507034,
"learning_rate": 4.892174851565004e-05,
"loss": 0.2402,
"num_input_tokens_seen": 101168,
"step": 665
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.2519048750400543,
"learning_rate": 4.8886255910475054e-05,
"loss": 0.2318,
"num_input_tokens_seen": 101936,
"step": 670
},
{
"epoch": 3.75,
"grad_norm": 0.01728709600865841,
"learning_rate": 4.885020187355687e-05,
"loss": 0.2338,
"num_input_tokens_seen": 102656,
"step": 675
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.23266607522964478,
"learning_rate": 4.881358725232342e-05,
"loss": 0.2338,
"num_input_tokens_seen": 103424,
"step": 680
},
{
"epoch": 3.8055555555555554,
"grad_norm": 0.040413811802864075,
"learning_rate": 4.877641290737884e-05,
"loss": 0.2254,
"num_input_tokens_seen": 104208,
"step": 685
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.050276659429073334,
"learning_rate": 4.873867971248324e-05,
"loss": 0.2327,
"num_input_tokens_seen": 104960,
"step": 690
},
{
"epoch": 3.861111111111111,
"grad_norm": 0.03541669622063637,
"learning_rate": 4.870038855453213e-05,
"loss": 0.2327,
"num_input_tokens_seen": 105744,
"step": 695
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.05416063964366913,
"learning_rate": 4.866154033353561e-05,
"loss": 0.2289,
"num_input_tokens_seen": 106512,
"step": 700
},
{
"epoch": 3.9166666666666665,
"grad_norm": 0.015437111258506775,
"learning_rate": 4.86221359625972e-05,
"loss": 0.2532,
"num_input_tokens_seen": 107280,
"step": 705
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.2228126972913742,
"learning_rate": 4.858217636789241e-05,
"loss": 0.2275,
"num_input_tokens_seen": 108080,
"step": 710
},
{
"epoch": 3.9722222222222223,
"grad_norm": 0.2948314845561981,
"learning_rate": 4.854166248864689e-05,
"loss": 0.248,
"num_input_tokens_seen": 108848,
"step": 715
},
{
"epoch": 4.0,
"grad_norm": 0.0426083505153656,
"learning_rate": 4.850059527711444e-05,
"loss": 0.2376,
"num_input_tokens_seen": 109632,
"step": 720
},
{
"epoch": 4.0,
"eval_loss": 0.23617739975452423,
"eval_runtime": 0.8738,
"eval_samples_per_second": 45.779,
"eval_steps_per_second": 22.89,
"num_input_tokens_seen": 109632,
"step": 720
},
{
"epoch": 4.027777777777778,
"grad_norm": 0.037765853106975555,
"learning_rate": 4.84589756985546e-05,
"loss": 0.2697,
"num_input_tokens_seen": 110416,
"step": 725
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.02199905924499035,
"learning_rate": 4.8416804731209945e-05,
"loss": 0.23,
"num_input_tokens_seen": 111200,
"step": 730
},
{
"epoch": 4.083333333333333,
"grad_norm": 0.2124335765838623,
"learning_rate": 4.8374083366283096e-05,
"loss": 0.2238,
"num_input_tokens_seen": 111936,
"step": 735
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.22587046027183533,
"learning_rate": 4.833081260791345e-05,
"loss": 0.2309,
"num_input_tokens_seen": 112704,
"step": 740
},
{
"epoch": 4.138888888888889,
"grad_norm": 0.03960296884179115,
"learning_rate": 4.828699347315356e-05,
"loss": 0.2446,
"num_input_tokens_seen": 113456,
"step": 745
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.056854937225580215,
"learning_rate": 4.82426269919452e-05,
"loss": 0.2357,
"num_input_tokens_seen": 114224,
"step": 750
},
{
"epoch": 4.194444444444445,
"grad_norm": 0.19557850062847137,
"learning_rate": 4.8197714207095205e-05,
"loss": 0.207,
"num_input_tokens_seen": 114976,
"step": 755
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.20211917161941528,
"learning_rate": 4.815225617425095e-05,
"loss": 0.226,
"num_input_tokens_seen": 115728,
"step": 760
},
{
"epoch": 4.25,
"grad_norm": 0.057014793157577515,
"learning_rate": 4.8106253961875506e-05,
"loss": 0.2172,
"num_input_tokens_seen": 116496,
"step": 765
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.0766286551952362,
"learning_rate": 4.805970865122257e-05,
"loss": 0.2233,
"num_input_tokens_seen": 117248,
"step": 770
},
{
"epoch": 4.305555555555555,
"grad_norm": 0.3298178017139435,
"learning_rate": 4.8012621336311016e-05,
"loss": 0.2762,
"num_input_tokens_seen": 118000,
"step": 775
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.04625969007611275,
"learning_rate": 4.7964993123899195e-05,
"loss": 0.2508,
"num_input_tokens_seen": 118768,
"step": 780
},
{
"epoch": 4.361111111111111,
"grad_norm": 0.037253882735967636,
"learning_rate": 4.791682513345892e-05,
"loss": 0.205,
"num_input_tokens_seen": 119520,
"step": 785
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.24698075652122498,
"learning_rate": 4.786811849714918e-05,
"loss": 0.2444,
"num_input_tokens_seen": 120288,
"step": 790
},
{
"epoch": 4.416666666666667,
"grad_norm": 0.02930288575589657,
"learning_rate": 4.781887435978947e-05,
"loss": 0.24,
"num_input_tokens_seen": 121040,
"step": 795
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.2211817353963852,
"learning_rate": 4.776909387883292e-05,
"loss": 0.232,
"num_input_tokens_seen": 121824,
"step": 800
},
{
"epoch": 4.472222222222222,
"grad_norm": 0.04251676797866821,
"learning_rate": 4.771877822433911e-05,
"loss": 0.2336,
"num_input_tokens_seen": 122592,
"step": 805
},
{
"epoch": 4.5,
"grad_norm": 0.2373417764902115,
"learning_rate": 4.766792857894652e-05,
"loss": 0.2232,
"num_input_tokens_seen": 123360,
"step": 810
},
{
"epoch": 4.527777777777778,
"grad_norm": 0.09388674795627594,
"learning_rate": 4.761654613784477e-05,
"loss": 0.2362,
"num_input_tokens_seen": 124128,
"step": 815
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.26708221435546875,
"learning_rate": 4.756463210874652e-05,
"loss": 0.2136,
"num_input_tokens_seen": 124864,
"step": 820
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.7205080986022949,
"learning_rate": 4.751218771185906e-05,
"loss": 0.2201,
"num_input_tokens_seen": 125632,
"step": 825
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.19915883243083954,
"learning_rate": 4.745921417985566e-05,
"loss": 0.2112,
"num_input_tokens_seen": 126384,
"step": 830
},
{
"epoch": 4.638888888888889,
"grad_norm": 0.1531069576740265,
"learning_rate": 4.740571275784659e-05,
"loss": 0.2945,
"num_input_tokens_seen": 127152,
"step": 835
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.2744501829147339,
"learning_rate": 4.735168470334984e-05,
"loss": 0.2561,
"num_input_tokens_seen": 127920,
"step": 840
},
{
"epoch": 4.694444444444445,
"grad_norm": 0.21660907566547394,
"learning_rate": 4.729713128626158e-05,
"loss": 0.248,
"num_input_tokens_seen": 128688,
"step": 845
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.06808862090110779,
"learning_rate": 4.72420537888263e-05,
"loss": 0.2383,
"num_input_tokens_seen": 129456,
"step": 850
},
{
"epoch": 4.75,
"grad_norm": 0.04663322493433952,
"learning_rate": 4.7186453505606676e-05,
"loss": 0.2379,
"num_input_tokens_seen": 130208,
"step": 855
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.22233685851097107,
"learning_rate": 4.713033174345314e-05,
"loss": 0.2327,
"num_input_tokens_seen": 130960,
"step": 860
},
{
"epoch": 4.805555555555555,
"grad_norm": 0.234144389629364,
"learning_rate": 4.707368982147318e-05,
"loss": 0.2366,
"num_input_tokens_seen": 131712,
"step": 865
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.22509251534938812,
"learning_rate": 4.701652907100029e-05,
"loss": 0.2256,
"num_input_tokens_seen": 132496,
"step": 870
},
{
"epoch": 4.861111111111111,
"grad_norm": 0.26232126355171204,
"learning_rate": 4.695885083556275e-05,
"loss": 0.2321,
"num_input_tokens_seen": 133280,
"step": 875
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.036304328590631485,
"learning_rate": 4.6900656470851964e-05,
"loss": 0.2288,
"num_input_tokens_seen": 134064,
"step": 880
},
{
"epoch": 4.916666666666667,
"grad_norm": 0.23388998210430145,
"learning_rate": 4.684194734469067e-05,
"loss": 0.2301,
"num_input_tokens_seen": 134816,
"step": 885
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.28110790252685547,
"learning_rate": 4.678272483700074e-05,
"loss": 0.2361,
"num_input_tokens_seen": 135616,
"step": 890
},
{
"epoch": 4.972222222222222,
"grad_norm": 0.03570658713579178,
"learning_rate": 4.672299033977076e-05,
"loss": 0.2297,
"num_input_tokens_seen": 136368,
"step": 895
},
{
"epoch": 5.0,
"grad_norm": 0.22953547537326813,
"learning_rate": 4.6662745257023325e-05,
"loss": 0.2273,
"num_input_tokens_seen": 137120,
"step": 900
},
{
"epoch": 5.0,
"eval_loss": 0.2374284714460373,
"eval_runtime": 0.8812,
"eval_samples_per_second": 45.392,
"eval_steps_per_second": 22.696,
"num_input_tokens_seen": 137120,
"step": 900
},
{
"epoch": 5.027777777777778,
"grad_norm": 0.05738260596990585,
"learning_rate": 4.660199100478202e-05,
"loss": 0.2263,
"num_input_tokens_seen": 137888,
"step": 905
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.3403209447860718,
"learning_rate": 4.6540729011038146e-05,
"loss": 0.2385,
"num_input_tokens_seen": 138672,
"step": 910
},
{
"epoch": 5.083333333333333,
"grad_norm": 0.082719586789608,
"learning_rate": 4.6478960715717176e-05,
"loss": 0.2097,
"num_input_tokens_seen": 139440,
"step": 915
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.078125961124897,
"learning_rate": 4.641668757064486e-05,
"loss": 0.2426,
"num_input_tokens_seen": 140176,
"step": 920
},
{
"epoch": 5.138888888888889,
"grad_norm": 0.23293739557266235,
"learning_rate": 4.6353911039513145e-05,
"loss": 0.2398,
"num_input_tokens_seen": 140912,
"step": 925
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.27510783076286316,
"learning_rate": 4.6290632597845755e-05,
"loss": 0.249,
"num_input_tokens_seen": 141680,
"step": 930
},
{
"epoch": 5.194444444444445,
"grad_norm": 0.029512202367186546,
"learning_rate": 4.622685373296353e-05,
"loss": 0.2285,
"num_input_tokens_seen": 142432,
"step": 935
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.02141384780406952,
"learning_rate": 4.61625759439494e-05,
"loss": 0.2402,
"num_input_tokens_seen": 143216,
"step": 940
},
{
"epoch": 5.25,
"grad_norm": 0.2061186283826828,
"learning_rate": 4.609780074161327e-05,
"loss": 0.2217,
"num_input_tokens_seen": 143968,
"step": 945
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.2300575077533722,
"learning_rate": 4.603252964845638e-05,
"loss": 0.2381,
"num_input_tokens_seen": 144768,
"step": 950
},
{
"epoch": 5.305555555555555,
"grad_norm": 0.044719330966472626,
"learning_rate": 4.5966764198635606e-05,
"loss": 0.2356,
"num_input_tokens_seen": 145552,
"step": 955
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.02483515627682209,
"learning_rate": 4.590050593792736e-05,
"loss": 0.2379,
"num_input_tokens_seen": 146288,
"step": 960
},
{
"epoch": 5.361111111111111,
"grad_norm": 0.2347053736448288,
"learning_rate": 4.583375642369129e-05,
"loss": 0.2357,
"num_input_tokens_seen": 147040,
"step": 965
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.05795735865831375,
"learning_rate": 4.5766517224833637e-05,
"loss": 0.2252,
"num_input_tokens_seen": 147824,
"step": 970
},
{
"epoch": 5.416666666666667,
"grad_norm": 0.06415160000324249,
"learning_rate": 4.569878992177039e-05,
"loss": 0.218,
"num_input_tokens_seen": 148560,
"step": 975
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.06607173383235931,
"learning_rate": 4.5630576106390114e-05,
"loss": 0.2503,
"num_input_tokens_seen": 149344,
"step": 980
},
{
"epoch": 5.472222222222222,
"grad_norm": 0.05142849683761597,
"learning_rate": 4.556187738201656e-05,
"loss": 0.2226,
"num_input_tokens_seen": 150128,
"step": 985
},
{
"epoch": 5.5,
"grad_norm": 0.2409844547510147,
"learning_rate": 4.549269536337095e-05,
"loss": 0.2328,
"num_input_tokens_seen": 150880,
"step": 990
},
{
"epoch": 5.527777777777778,
"grad_norm": 0.22324684262275696,
"learning_rate": 4.5423031676534065e-05,
"loss": 0.2514,
"num_input_tokens_seen": 151648,
"step": 995
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.21208706498146057,
"learning_rate": 4.535288795890798e-05,
"loss": 0.238,
"num_input_tokens_seen": 152416,
"step": 1000
},
{
"epoch": 5.583333333333333,
"grad_norm": 0.011771623976528645,
"learning_rate": 4.528226585917761e-05,
"loss": 0.236,
"num_input_tokens_seen": 153184,
"step": 1005
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.19553421437740326,
"learning_rate": 4.521116703727193e-05,
"loss": 0.2422,
"num_input_tokens_seen": 153936,
"step": 1010
},
{
"epoch": 5.638888888888889,
"grad_norm": 0.061913229525089264,
"learning_rate": 4.5139593164324986e-05,
"loss": 0.2277,
"num_input_tokens_seen": 154720,
"step": 1015
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.18774497509002686,
"learning_rate": 4.506754592263662e-05,
"loss": 0.2296,
"num_input_tokens_seen": 155488,
"step": 1020
},
{
"epoch": 5.694444444444445,
"grad_norm": 0.06410039216279984,
"learning_rate": 4.49950270056329e-05,
"loss": 0.2196,
"num_input_tokens_seen": 156272,
"step": 1025
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.21270763874053955,
"learning_rate": 4.4922038117826334e-05,
"loss": 0.2323,
"num_input_tokens_seen": 157040,
"step": 1030
},
{
"epoch": 5.75,
"grad_norm": 0.01843346282839775,
"learning_rate": 4.48485809747758e-05,
"loss": 0.2386,
"num_input_tokens_seen": 157792,
"step": 1035
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.20497548580169678,
"learning_rate": 4.477465730304624e-05,
"loss": 0.2322,
"num_input_tokens_seen": 158544,
"step": 1040
},
{
"epoch": 5.805555555555555,
"grad_norm": 0.1923503577709198,
"learning_rate": 4.4700268840168045e-05,
"loss": 0.238,
"num_input_tokens_seen": 159280,
"step": 1045
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.021967420354485512,
"learning_rate": 4.462541733459628e-05,
"loss": 0.2255,
"num_input_tokens_seen": 160016,
"step": 1050
},
{
"epoch": 5.861111111111111,
"grad_norm": 0.19093094766139984,
"learning_rate": 4.455010454566947e-05,
"loss": 0.2254,
"num_input_tokens_seen": 160768,
"step": 1055
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.19332638382911682,
"learning_rate": 4.447433224356839e-05,
"loss": 0.2382,
"num_input_tokens_seen": 161520,
"step": 1060
},
{
"epoch": 5.916666666666667,
"grad_norm": 0.023951267823576927,
"learning_rate": 4.439810220927436e-05,
"loss": 0.2359,
"num_input_tokens_seen": 162304,
"step": 1065
},
{
"epoch": 5.944444444444445,
"grad_norm": 0.04824138060212135,
"learning_rate": 4.432141623452743e-05,
"loss": 0.2315,
"num_input_tokens_seen": 163088,
"step": 1070
},
{
"epoch": 5.972222222222222,
"grad_norm": 0.18069137632846832,
"learning_rate": 4.4244276121784195e-05,
"loss": 0.2296,
"num_input_tokens_seen": 163856,
"step": 1075
},
{
"epoch": 6.0,
"grad_norm": 0.204057514667511,
"learning_rate": 4.416668368417556e-05,
"loss": 0.2282,
"num_input_tokens_seen": 164592,
"step": 1080
},
{
"epoch": 6.0,
"eval_loss": 0.2412218302488327,
"eval_runtime": 0.8694,
"eval_samples_per_second": 46.006,
"eval_steps_per_second": 23.003,
"num_input_tokens_seen": 164592,
"step": 1080
},
{
"epoch": 6.027777777777778,
"grad_norm": 0.03645794838666916,
"learning_rate": 4.408864074546401e-05,
"loss": 0.2158,
"num_input_tokens_seen": 165344,
"step": 1085
},
{
"epoch": 6.055555555555555,
"grad_norm": 0.030243530869483948,
"learning_rate": 4.401014914000078e-05,
"loss": 0.2433,
"num_input_tokens_seen": 166112,
"step": 1090
},
{
"epoch": 6.083333333333333,
"grad_norm": 0.21185517311096191,
"learning_rate": 4.393121071268274e-05,
"loss": 0.2346,
"num_input_tokens_seen": 166880,
"step": 1095
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.1881146878004074,
"learning_rate": 4.3851827318909036e-05,
"loss": 0.232,
"num_input_tokens_seen": 167648,
"step": 1100
},
{
"epoch": 6.138888888888889,
"grad_norm": 0.2051820009946823,
"learning_rate": 4.377200082453749e-05,
"loss": 0.2136,
"num_input_tokens_seen": 168432,
"step": 1105
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.061713505536317825,
"learning_rate": 4.36917331058407e-05,
"loss": 0.2197,
"num_input_tokens_seen": 169168,
"step": 1110
},
{
"epoch": 6.194444444444445,
"grad_norm": 0.23004846274852753,
"learning_rate": 4.361102604946201e-05,
"loss": 0.2412,
"num_input_tokens_seen": 169936,
"step": 1115
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.1857406497001648,
"learning_rate": 4.3529881552371096e-05,
"loss": 0.2218,
"num_input_tokens_seen": 170688,
"step": 1120
},
{
"epoch": 6.25,
"grad_norm": 0.04798297584056854,
"learning_rate": 4.344830152181941e-05,
"loss": 0.2451,
"num_input_tokens_seen": 171440,
"step": 1125
},
{
"epoch": 6.277777777777778,
"grad_norm": 0.22263100743293762,
"learning_rate": 4.336628787529538e-05,
"loss": 0.2434,
"num_input_tokens_seen": 172192,
"step": 1130
},
{
"epoch": 6.305555555555555,
"grad_norm": 0.03338240459561348,
"learning_rate": 4.3283842540479264e-05,
"loss": 0.2266,
"num_input_tokens_seen": 172960,
"step": 1135
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.05144157633185387,
"learning_rate": 4.320096745519793e-05,
"loss": 0.2276,
"num_input_tokens_seen": 173744,
"step": 1140
},
{
"epoch": 6.361111111111111,
"grad_norm": 0.2107207328081131,
"learning_rate": 4.3117664567379237e-05,
"loss": 0.2437,
"num_input_tokens_seen": 174464,
"step": 1145
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.20211313664913177,
"learning_rate": 4.303393583500628e-05,
"loss": 0.2342,
"num_input_tokens_seen": 175200,
"step": 1150
},
{
"epoch": 6.416666666666667,
"grad_norm": 0.053128089755773544,
"learning_rate": 4.2949783226071406e-05,
"loss": 0.2257,
"num_input_tokens_seen": 175936,
"step": 1155
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.17988252639770508,
"learning_rate": 4.286520871852987e-05,
"loss": 0.2135,
"num_input_tokens_seen": 176672,
"step": 1160
},
{
"epoch": 6.472222222222222,
"grad_norm": 0.027884148061275482,
"learning_rate": 4.278021430025343e-05,
"loss": 0.2257,
"num_input_tokens_seen": 177440,
"step": 1165
},
{
"epoch": 6.5,
"grad_norm": 0.17274907231330872,
"learning_rate": 4.2694801968983566e-05,
"loss": 0.2188,
"num_input_tokens_seen": 178240,
"step": 1170
},
{
"epoch": 6.527777777777778,
"grad_norm": 0.25237998366355896,
"learning_rate": 4.260897373228456e-05,
"loss": 0.2585,
"num_input_tokens_seen": 178976,
"step": 1175
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.038305509835481644,
"learning_rate": 4.2522731607496275e-05,
"loss": 0.2052,
"num_input_tokens_seen": 179728,
"step": 1180
},
{
"epoch": 6.583333333333333,
"grad_norm": 0.17462146282196045,
"learning_rate": 4.2436077621686786e-05,
"loss": 0.2104,
"num_input_tokens_seen": 180448,
"step": 1185
},
{
"epoch": 6.611111111111111,
"grad_norm": 0.056279465556144714,
"learning_rate": 4.234901381160469e-05,
"loss": 0.2497,
"num_input_tokens_seen": 181168,
"step": 1190
},
{
"epoch": 6.638888888888889,
"grad_norm": 0.0892854705452919,
"learning_rate": 4.226154222363124e-05,
"loss": 0.2343,
"num_input_tokens_seen": 181968,
"step": 1195
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.0540282167494297,
"learning_rate": 4.21736649137323e-05,
"loss": 0.2443,
"num_input_tokens_seen": 182704,
"step": 1200
},
{
"epoch": 6.694444444444445,
"grad_norm": 0.17779448628425598,
"learning_rate": 4.208538394740993e-05,
"loss": 0.2134,
"num_input_tokens_seen": 183456,
"step": 1205
},
{
"epoch": 6.722222222222222,
"grad_norm": 0.2300095558166504,
"learning_rate": 4.199670139965393e-05,
"loss": 0.2263,
"num_input_tokens_seen": 184224,
"step": 1210
},
{
"epoch": 6.75,
"grad_norm": 0.06911563873291016,
"learning_rate": 4.1907619354892965e-05,
"loss": 0.2349,
"num_input_tokens_seen": 184992,
"step": 1215
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.05566706135869026,
"learning_rate": 4.1818139906945694e-05,
"loss": 0.2334,
"num_input_tokens_seen": 185728,
"step": 1220
},
{
"epoch": 6.805555555555555,
"grad_norm": 0.22851161658763885,
"learning_rate": 4.172826515897146e-05,
"loss": 0.2342,
"num_input_tokens_seen": 186544,
"step": 1225
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.026552407070994377,
"learning_rate": 4.163799722342089e-05,
"loss": 0.2314,
"num_input_tokens_seen": 187296,
"step": 1230
},
{
"epoch": 6.861111111111111,
"grad_norm": 0.22173333168029785,
"learning_rate": 4.1547338221986266e-05,
"loss": 0.2378,
"num_input_tokens_seen": 188080,
"step": 1235
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.2135874181985855,
"learning_rate": 4.1456290285551596e-05,
"loss": 0.2156,
"num_input_tokens_seen": 188864,
"step": 1240
},
{
"epoch": 6.916666666666667,
"grad_norm": 0.2219124585390091,
"learning_rate": 4.13648555541426e-05,
"loss": 0.2045,
"num_input_tokens_seen": 189648,
"step": 1245
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.08830613642930984,
"learning_rate": 4.127303617687636e-05,
"loss": 0.2812,
"num_input_tokens_seen": 190416,
"step": 1250
},
{
"epoch": 6.972222222222222,
"grad_norm": 0.25774773955345154,
"learning_rate": 4.118083431191081e-05,
"loss": 0.2417,
"num_input_tokens_seen": 191152,
"step": 1255
},
{
"epoch": 7.0,
"grad_norm": 0.07823996245861053,
"learning_rate": 4.108825212639405e-05,
"loss": 0.2299,
"num_input_tokens_seen": 191920,
"step": 1260
},
{
"epoch": 7.0,
"eval_loss": 0.2371632307767868,
"eval_runtime": 0.8666,
"eval_samples_per_second": 46.155,
"eval_steps_per_second": 23.077,
"num_input_tokens_seen": 191920,
"step": 1260
},
{
"epoch": 7.027777777777778,
"grad_norm": 0.1936909556388855,
"learning_rate": 4.099529179641337e-05,
"loss": 0.2537,
"num_input_tokens_seen": 192688,
"step": 1265
},
{
"epoch": 7.055555555555555,
"grad_norm": 0.01822832226753235,
"learning_rate": 4.09019555069441e-05,
"loss": 0.2315,
"num_input_tokens_seen": 193424,
"step": 1270
},
{
"epoch": 7.083333333333333,
"grad_norm": 0.19782674312591553,
"learning_rate": 4.080824545179828e-05,
"loss": 0.2183,
"num_input_tokens_seen": 194192,
"step": 1275
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.2261374592781067,
"learning_rate": 4.071416383357307e-05,
"loss": 0.2412,
"num_input_tokens_seen": 194928,
"step": 1280
},
{
"epoch": 7.138888888888889,
"grad_norm": 0.07102471590042114,
"learning_rate": 4.0619712863599e-05,
"loss": 0.2203,
"num_input_tokens_seen": 195696,
"step": 1285
},
{
"epoch": 7.166666666666667,
"grad_norm": 0.04602295532822609,
"learning_rate": 4.0524894761888e-05,
"loss": 0.2566,
"num_input_tokens_seen": 196416,
"step": 1290
},
{
"epoch": 7.194444444444445,
"grad_norm": 0.07700731605291367,
"learning_rate": 4.042971175708118e-05,
"loss": 0.2114,
"num_input_tokens_seen": 197184,
"step": 1295
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.14833351969718933,
"learning_rate": 4.0334166086396484e-05,
"loss": 0.2186,
"num_input_tokens_seen": 197952,
"step": 1300
},
{
"epoch": 7.25,
"grad_norm": 0.06778162717819214,
"learning_rate": 4.0238259995576084e-05,
"loss": 0.2327,
"num_input_tokens_seen": 198704,
"step": 1305
},
{
"epoch": 7.277777777777778,
"grad_norm": 0.03469262644648552,
"learning_rate": 4.0141995738833625e-05,
"loss": 0.2221,
"num_input_tokens_seen": 199488,
"step": 1310
},
{
"epoch": 7.305555555555555,
"grad_norm": 0.06742622703313828,
"learning_rate": 4.0045375578801214e-05,
"loss": 0.2141,
"num_input_tokens_seen": 200224,
"step": 1315
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.16787341237068176,
"learning_rate": 3.994840178647623e-05,
"loss": 0.2069,
"num_input_tokens_seen": 201024,
"step": 1320
},
{
"epoch": 7.361111111111111,
"grad_norm": 0.07787422835826874,
"learning_rate": 3.985107664116798e-05,
"loss": 0.2079,
"num_input_tokens_seen": 201792,
"step": 1325
},
{
"epoch": 7.388888888888889,
"grad_norm": 0.08745193481445312,
"learning_rate": 3.9753402430444116e-05,
"loss": 0.2555,
"num_input_tokens_seen": 202576,
"step": 1330
},
{
"epoch": 7.416666666666667,
"grad_norm": 0.15857690572738647,
"learning_rate": 3.9655381450076826e-05,
"loss": 0.2059,
"num_input_tokens_seen": 203312,
"step": 1335
},
{
"epoch": 7.444444444444445,
"grad_norm": 0.0623801089823246,
"learning_rate": 3.955701600398892e-05,
"loss": 0.2344,
"num_input_tokens_seen": 204096,
"step": 1340
},
{
"epoch": 7.472222222222222,
"grad_norm": 0.058826789259910583,
"learning_rate": 3.945830840419966e-05,
"loss": 0.2411,
"num_input_tokens_seen": 204880,
"step": 1345
},
{
"epoch": 7.5,
"grad_norm": 0.23193086683750153,
"learning_rate": 3.935926097077045e-05,
"loss": 0.285,
"num_input_tokens_seen": 205632,
"step": 1350
},
{
"epoch": 7.527777777777778,
"grad_norm": 0.05572715774178505,
"learning_rate": 3.925987603175023e-05,
"loss": 0.2509,
"num_input_tokens_seen": 206384,
"step": 1355
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.16730906069278717,
"learning_rate": 3.916015592312082e-05,
"loss": 0.2345,
"num_input_tokens_seen": 207104,
"step": 1360
},
{
"epoch": 7.583333333333333,
"grad_norm": 0.08506715297698975,
"learning_rate": 3.9060102988742e-05,
"loss": 0.2202,
"num_input_tokens_seen": 207888,
"step": 1365
},
{
"epoch": 7.611111111111111,
"grad_norm": 0.0463300496339798,
"learning_rate": 3.8959719580296415e-05,
"loss": 0.2435,
"num_input_tokens_seen": 208656,
"step": 1370
},
{
"epoch": 7.638888888888889,
"grad_norm": 0.17269517481327057,
"learning_rate": 3.885900805723429e-05,
"loss": 0.2319,
"num_input_tokens_seen": 209424,
"step": 1375
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.16922199726104736,
"learning_rate": 3.875797078671798e-05,
"loss": 0.2319,
"num_input_tokens_seen": 210176,
"step": 1380
},
{
"epoch": 7.694444444444445,
"grad_norm": 0.03644829988479614,
"learning_rate": 3.865661014356635e-05,
"loss": 0.2259,
"num_input_tokens_seen": 210944,
"step": 1385
},
{
"epoch": 7.722222222222222,
"grad_norm": 0.18136528134346008,
"learning_rate": 3.855492851019893e-05,
"loss": 0.2324,
"num_input_tokens_seen": 211680,
"step": 1390
},
{
"epoch": 7.75,
"grad_norm": 0.21425503492355347,
"learning_rate": 3.8452928276579916e-05,
"loss": 0.2323,
"num_input_tokens_seen": 212432,
"step": 1395
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.045595910400152206,
"learning_rate": 3.835061184016203e-05,
"loss": 0.2255,
"num_input_tokens_seen": 213184,
"step": 1400
},
{
"epoch": 7.805555555555555,
"grad_norm": 0.17766867578029633,
"learning_rate": 3.824798160583012e-05,
"loss": 0.2389,
"num_input_tokens_seen": 213952,
"step": 1405
},
{
"epoch": 7.833333333333333,
"grad_norm": 0.18407747149467468,
"learning_rate": 3.814503998584471e-05,
"loss": 0.22,
"num_input_tokens_seen": 214720,
"step": 1410
},
{
"epoch": 7.861111111111111,
"grad_norm": 0.04432946443557739,
"learning_rate": 3.804178939978517e-05,
"loss": 0.2157,
"num_input_tokens_seen": 215488,
"step": 1415
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.08724919706583023,
"learning_rate": 3.7938232274493e-05,
"loss": 0.2435,
"num_input_tokens_seen": 216256,
"step": 1420
},
{
"epoch": 7.916666666666667,
"grad_norm": 0.1724819540977478,
"learning_rate": 3.783437104401469e-05,
"loss": 0.2059,
"num_input_tokens_seen": 217040,
"step": 1425
},
{
"epoch": 7.944444444444445,
"grad_norm": 0.11546836793422699,
"learning_rate": 3.773020814954453e-05,
"loss": 0.2073,
"num_input_tokens_seen": 217824,
"step": 1430
},
{
"epoch": 7.972222222222222,
"grad_norm": 0.1858564019203186,
"learning_rate": 3.762574603936725e-05,
"loss": 0.2652,
"num_input_tokens_seen": 218592,
"step": 1435
},
{
"epoch": 8.0,
"grad_norm": 0.09873568266630173,
"learning_rate": 3.752098716880045e-05,
"loss": 0.2302,
"num_input_tokens_seen": 219344,
"step": 1440
},
{
"epoch": 8.0,
"eval_loss": 0.24155128002166748,
"eval_runtime": 0.8836,
"eval_samples_per_second": 45.27,
"eval_steps_per_second": 22.635,
"num_input_tokens_seen": 219344,
"step": 1440
},
{
"epoch": 8.027777777777779,
"grad_norm": 0.22358053922653198,
"learning_rate": 3.74159340001369e-05,
"loss": 0.2488,
"num_input_tokens_seen": 220064,
"step": 1445
},
{
"epoch": 8.055555555555555,
"grad_norm": 0.03358878195285797,
"learning_rate": 3.731058900258668e-05,
"loss": 0.2119,
"num_input_tokens_seen": 220800,
"step": 1450
},
{
"epoch": 8.083333333333334,
"grad_norm": 0.04093853384256363,
"learning_rate": 3.7204954652219104e-05,
"loss": 0.2448,
"num_input_tokens_seen": 221568,
"step": 1455
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.10109119117259979,
"learning_rate": 3.7099033431904575e-05,
"loss": 0.2349,
"num_input_tokens_seen": 222352,
"step": 1460
},
{
"epoch": 8.13888888888889,
"grad_norm": 0.048178769648075104,
"learning_rate": 3.699282783125616e-05,
"loss": 0.2407,
"num_input_tokens_seen": 223136,
"step": 1465
},
{
"epoch": 8.166666666666666,
"grad_norm": 0.07279063761234283,
"learning_rate": 3.688634034657115e-05,
"loss": 0.2239,
"num_input_tokens_seen": 223888,
"step": 1470
},
{
"epoch": 8.194444444444445,
"grad_norm": 0.10608810186386108,
"learning_rate": 3.6779573480772325e-05,
"loss": 0.2312,
"num_input_tokens_seen": 224656,
"step": 1475
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.18366475403308868,
"learning_rate": 3.6672529743349146e-05,
"loss": 0.2328,
"num_input_tokens_seen": 225392,
"step": 1480
},
{
"epoch": 8.25,
"grad_norm": 0.0931817889213562,
"learning_rate": 3.656521165029879e-05,
"loss": 0.2184,
"num_input_tokens_seen": 226192,
"step": 1485
},
{
"epoch": 8.277777777777779,
"grad_norm": 0.06541720777750015,
"learning_rate": 3.6457621724066964e-05,
"loss": 0.2425,
"num_input_tokens_seen": 226944,
"step": 1490
},
{
"epoch": 8.305555555555555,
"grad_norm": 0.05357493460178375,
"learning_rate": 3.634976249348867e-05,
"loss": 0.2362,
"num_input_tokens_seen": 227680,
"step": 1495
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.18608205020427704,
"learning_rate": 3.6241636493728736e-05,
"loss": 0.2254,
"num_input_tokens_seen": 228400,
"step": 1500
},
{
"epoch": 8.36111111111111,
"grad_norm": 0.1812172532081604,
"learning_rate": 3.613324626622224e-05,
"loss": 0.233,
"num_input_tokens_seen": 229152,
"step": 1505
},
{
"epoch": 8.38888888888889,
"grad_norm": 0.0836367979645729,
"learning_rate": 3.602459435861475e-05,
"loss": 0.2158,
"num_input_tokens_seen": 229920,
"step": 1510
},
{
"epoch": 8.416666666666666,
"grad_norm": 0.18750832974910736,
"learning_rate": 3.591568332470249e-05,
"loss": 0.2053,
"num_input_tokens_seen": 230704,
"step": 1515
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.032188136130571365,
"learning_rate": 3.5806515724372274e-05,
"loss": 0.2385,
"num_input_tokens_seen": 231456,
"step": 1520
},
{
"epoch": 8.472222222222221,
"grad_norm": 0.18904954195022583,
"learning_rate": 3.569709412354136e-05,
"loss": 0.229,
"num_input_tokens_seen": 232224,
"step": 1525
},
{
"epoch": 8.5,
"grad_norm": 0.20962204039096832,
"learning_rate": 3.5587421094097115e-05,
"loss": 0.2309,
"num_input_tokens_seen": 232976,
"step": 1530
},
{
"epoch": 8.527777777777779,
"grad_norm": 0.12048184871673584,
"learning_rate": 3.5477499213836616e-05,
"loss": 0.2276,
"num_input_tokens_seen": 233744,
"step": 1535
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.20690464973449707,
"learning_rate": 3.536733106640598e-05,
"loss": 0.2264,
"num_input_tokens_seen": 234480,
"step": 1540
},
{
"epoch": 8.583333333333334,
"grad_norm": 0.24027937650680542,
"learning_rate": 3.525691924123971e-05,
"loss": 0.2105,
"num_input_tokens_seen": 235248,
"step": 1545
},
{
"epoch": 8.61111111111111,
"grad_norm": 0.07281269878149033,
"learning_rate": 3.5146266333499795e-05,
"loss": 0.252,
"num_input_tokens_seen": 235984,
"step": 1550
},
{
"epoch": 8.63888888888889,
"grad_norm": 0.06904011219739914,
"learning_rate": 3.503537494401473e-05,
"loss": 0.2535,
"num_input_tokens_seen": 236720,
"step": 1555
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.18287572264671326,
"learning_rate": 3.4924247679218375e-05,
"loss": 0.2373,
"num_input_tokens_seen": 237520,
"step": 1560
},
{
"epoch": 8.694444444444445,
"grad_norm": 0.21255025267601013,
"learning_rate": 3.481288715108868e-05,
"loss": 0.2362,
"num_input_tokens_seen": 238288,
"step": 1565
},
{
"epoch": 8.722222222222221,
"grad_norm": 0.1927708387374878,
"learning_rate": 3.4701295977086324e-05,
"loss": 0.2418,
"num_input_tokens_seen": 239072,
"step": 1570
},
{
"epoch": 8.75,
"grad_norm": 0.11111165583133698,
"learning_rate": 3.4589476780093166e-05,
"loss": 0.2354,
"num_input_tokens_seen": 239840,
"step": 1575
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.16939318180084229,
"learning_rate": 3.44774321883506e-05,
"loss": 0.211,
"num_input_tokens_seen": 240624,
"step": 1580
},
{
"epoch": 8.805555555555555,
"grad_norm": 0.20855894684791565,
"learning_rate": 3.436516483539781e-05,
"loss": 0.2498,
"num_input_tokens_seen": 241392,
"step": 1585
},
{
"epoch": 8.833333333333334,
"grad_norm": 0.03823915123939514,
"learning_rate": 3.42526773600098e-05,
"loss": 0.2384,
"num_input_tokens_seen": 242144,
"step": 1590
},
{
"epoch": 8.86111111111111,
"grad_norm": 0.09917465597391129,
"learning_rate": 3.4139972406135464e-05,
"loss": 0.2292,
"num_input_tokens_seen": 242928,
"step": 1595
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.1009097546339035,
"learning_rate": 3.402705262283537e-05,
"loss": 0.2242,
"num_input_tokens_seen": 243680,
"step": 1600
},
{
"epoch": 8.916666666666666,
"grad_norm": 0.06256987899541855,
"learning_rate": 3.39139206642195e-05,
"loss": 0.2125,
"num_input_tokens_seen": 244464,
"step": 1605
},
{
"epoch": 8.944444444444445,
"grad_norm": 0.04281611740589142,
"learning_rate": 3.3800579189384944e-05,
"loss": 0.2132,
"num_input_tokens_seen": 245232,
"step": 1610
},
{
"epoch": 8.972222222222221,
"grad_norm": 0.2398250699043274,
"learning_rate": 3.3687030862353286e-05,
"loss": 0.2451,
"num_input_tokens_seen": 245968,
"step": 1615
},
{
"epoch": 9.0,
"grad_norm": 0.07234970480203629,
"learning_rate": 3.357327835200807e-05,
"loss": 0.264,
"num_input_tokens_seen": 246736,
"step": 1620
},
{
"epoch": 9.0,
"eval_loss": 0.24827322363853455,
"eval_runtime": 0.8692,
"eval_samples_per_second": 46.018,
"eval_steps_per_second": 23.009,
"num_input_tokens_seen": 246736,
"step": 1620
},
{
"epoch": 9.027777777777779,
"grad_norm": 0.05116798356175423,
"learning_rate": 3.3459324332032035e-05,
"loss": 0.2321,
"num_input_tokens_seen": 247520,
"step": 1625
},
{
"epoch": 9.055555555555555,
"grad_norm": 0.07150240242481232,
"learning_rate": 3.3345171480844275e-05,
"loss": 0.2154,
"num_input_tokens_seen": 248304,
"step": 1630
},
{
"epoch": 9.083333333333334,
"grad_norm": 0.0657280832529068,
"learning_rate": 3.32308224815373e-05,
"loss": 0.2268,
"num_input_tokens_seen": 249056,
"step": 1635
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.17856957018375397,
"learning_rate": 3.311628002181398e-05,
"loss": 0.2205,
"num_input_tokens_seen": 249824,
"step": 1640
},
{
"epoch": 9.13888888888889,
"grad_norm": 0.20213398337364197,
"learning_rate": 3.3001546793924285e-05,
"loss": 0.2317,
"num_input_tokens_seen": 250576,
"step": 1645
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.0875818207859993,
"learning_rate": 3.288662549460216e-05,
"loss": 0.2283,
"num_input_tokens_seen": 251344,
"step": 1650
},
{
"epoch": 9.194444444444445,
"grad_norm": 0.05436839535832405,
"learning_rate": 3.277151882500199e-05,
"loss": 0.2213,
"num_input_tokens_seen": 252064,
"step": 1655
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.1270546168088913,
"learning_rate": 3.26562294906352e-05,
"loss": 0.2261,
"num_input_tokens_seen": 252816,
"step": 1660
},
{
"epoch": 9.25,
"grad_norm": 0.2513698637485504,
"learning_rate": 3.254076020130664e-05,
"loss": 0.2196,
"num_input_tokens_seen": 253536,
"step": 1665
},
{
"epoch": 9.277777777777779,
"grad_norm": 0.42506569623947144,
"learning_rate": 3.242511367105087e-05,
"loss": 0.2385,
"num_input_tokens_seen": 254288,
"step": 1670
},
{
"epoch": 9.305555555555555,
"grad_norm": 0.32191821932792664,
"learning_rate": 3.230929261806842e-05,
"loss": 0.2056,
"num_input_tokens_seen": 255024,
"step": 1675
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.1631459891796112,
"learning_rate": 3.2193299764661845e-05,
"loss": 0.2768,
"num_input_tokens_seen": 255792,
"step": 1680
},
{
"epoch": 9.36111111111111,
"grad_norm": 0.11005749553442001,
"learning_rate": 3.207713783717176e-05,
"loss": 0.2532,
"num_input_tokens_seen": 256560,
"step": 1685
},
{
"epoch": 9.38888888888889,
"grad_norm": 0.19168297946453094,
"learning_rate": 3.1960809565912794e-05,
"loss": 0.2341,
"num_input_tokens_seen": 257312,
"step": 1690
},
{
"epoch": 9.416666666666666,
"grad_norm": 0.11040922999382019,
"learning_rate": 3.1844317685109354e-05,
"loss": 0.2256,
"num_input_tokens_seen": 258048,
"step": 1695
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.07201294600963593,
"learning_rate": 3.1727664932831394e-05,
"loss": 0.222,
"num_input_tokens_seen": 258816,
"step": 1700
},
{
"epoch": 9.472222222222221,
"grad_norm": 0.062040749937295914,
"learning_rate": 3.161085405093006e-05,
"loss": 0.2195,
"num_input_tokens_seen": 259600,
"step": 1705
},
{
"epoch": 9.5,
"grad_norm": 0.20329245924949646,
"learning_rate": 3.149388778497323e-05,
"loss": 0.2417,
"num_input_tokens_seen": 260384,
"step": 1710
},
{
"epoch": 9.527777777777779,
"grad_norm": 0.13889265060424805,
"learning_rate": 3.137676888418099e-05,
"loss": 0.2158,
"num_input_tokens_seen": 261152,
"step": 1715
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.25366413593292236,
"learning_rate": 3.125950010136104e-05,
"loss": 0.2201,
"num_input_tokens_seen": 261920,
"step": 1720
},
{
"epoch": 9.583333333333334,
"grad_norm": 0.2369338572025299,
"learning_rate": 3.114208419284391e-05,
"loss": 0.2369,
"num_input_tokens_seen": 262720,
"step": 1725
},
{
"epoch": 9.61111111111111,
"grad_norm": 0.1263570785522461,
"learning_rate": 3.102452391841828e-05,
"loss": 0.2208,
"num_input_tokens_seen": 263488,
"step": 1730
},
{
"epoch": 9.63888888888889,
"grad_norm": 0.18681450188159943,
"learning_rate": 3.090682204126604e-05,
"loss": 0.2471,
"num_input_tokens_seen": 264256,
"step": 1735
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.09681422263383865,
"learning_rate": 3.078898132789735e-05,
"loss": 0.2201,
"num_input_tokens_seen": 265024,
"step": 1740
},
{
"epoch": 9.694444444444445,
"grad_norm": 0.23831580579280853,
"learning_rate": 3.0671004548085675e-05,
"loss": 0.2455,
"num_input_tokens_seen": 265776,
"step": 1745
},
{
"epoch": 9.722222222222221,
"grad_norm": 0.057987093925476074,
"learning_rate": 3.0552894474802584e-05,
"loss": 0.2518,
"num_input_tokens_seen": 266528,
"step": 1750
},
{
"epoch": 9.75,
"grad_norm": 0.11228296905755997,
"learning_rate": 3.043465388415267e-05,
"loss": 0.2213,
"num_input_tokens_seen": 267312,
"step": 1755
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.24755965173244476,
"learning_rate": 3.0316285555308233e-05,
"loss": 0.2194,
"num_input_tokens_seen": 268080,
"step": 1760
},
{
"epoch": 9.805555555555555,
"grad_norm": 0.12314503639936447,
"learning_rate": 3.0197792270443982e-05,
"loss": 0.243,
"num_input_tokens_seen": 268880,
"step": 1765
},
{
"epoch": 9.833333333333334,
"grad_norm": 0.0753416046500206,
"learning_rate": 3.0079176814671656e-05,
"loss": 0.2476,
"num_input_tokens_seen": 269648,
"step": 1770
},
{
"epoch": 9.86111111111111,
"grad_norm": 0.2044837325811386,
"learning_rate": 2.9960441975974534e-05,
"loss": 0.2283,
"num_input_tokens_seen": 270400,
"step": 1775
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.17016810178756714,
"learning_rate": 2.9841590545141906e-05,
"loss": 0.2242,
"num_input_tokens_seen": 271152,
"step": 1780
},
{
"epoch": 9.916666666666666,
"grad_norm": 0.06336583197116852,
"learning_rate": 2.9722625315703512e-05,
"loss": 0.2243,
"num_input_tokens_seen": 271920,
"step": 1785
},
{
"epoch": 9.944444444444445,
"grad_norm": 0.16640453040599823,
"learning_rate": 2.9603549083863847e-05,
"loss": 0.231,
"num_input_tokens_seen": 272672,
"step": 1790
},
{
"epoch": 9.972222222222221,
"grad_norm": 0.1795589029788971,
"learning_rate": 2.9484364648436437e-05,
"loss": 0.2261,
"num_input_tokens_seen": 273440,
"step": 1795
},
{
"epoch": 10.0,
"grad_norm": 0.16841265559196472,
"learning_rate": 2.9365074810778094e-05,
"loss": 0.2165,
"num_input_tokens_seen": 274208,
"step": 1800
},
{
"epoch": 10.0,
"eval_loss": 0.2445867955684662,
"eval_runtime": 0.8599,
"eval_samples_per_second": 46.519,
"eval_steps_per_second": 23.26,
"num_input_tokens_seen": 274208,
"step": 1800
},
{
"epoch": 10.027777777777779,
"grad_norm": 0.2173292487859726,
"learning_rate": 2.9245682374723016e-05,
"loss": 0.2469,
"num_input_tokens_seen": 274976,
"step": 1805
},
{
"epoch": 10.055555555555555,
"grad_norm": 0.19157367944717407,
"learning_rate": 2.9126190146516942e-05,
"loss": 0.2426,
"num_input_tokens_seen": 275760,
"step": 1810
},
{
"epoch": 10.083333333333334,
"grad_norm": 0.21250499784946442,
"learning_rate": 2.9006600934751145e-05,
"loss": 0.2244,
"num_input_tokens_seen": 276544,
"step": 1815
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.1177288368344307,
"learning_rate": 2.888691755029642e-05,
"loss": 0.2185,
"num_input_tokens_seen": 277296,
"step": 1820
},
{
"epoch": 10.13888888888889,
"grad_norm": 0.11683861166238785,
"learning_rate": 2.876714280623708e-05,
"loss": 0.2103,
"num_input_tokens_seen": 278048,
"step": 1825
},
{
"epoch": 10.166666666666666,
"grad_norm": 0.2500019073486328,
"learning_rate": 2.8647279517804754e-05,
"loss": 0.2485,
"num_input_tokens_seen": 278832,
"step": 1830
},
{
"epoch": 10.194444444444445,
"grad_norm": 0.2172568142414093,
"learning_rate": 2.8527330502312248e-05,
"loss": 0.2321,
"num_input_tokens_seen": 279584,
"step": 1835
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.12808550894260406,
"learning_rate": 2.8407298579087365e-05,
"loss": 0.2312,
"num_input_tokens_seen": 280368,
"step": 1840
},
{
"epoch": 10.25,
"grad_norm": 0.12107470631599426,
"learning_rate": 2.8287186569406566e-05,
"loss": 0.2119,
"num_input_tokens_seen": 281136,
"step": 1845
},
{
"epoch": 10.277777777777779,
"grad_norm": 0.23338744044303894,
"learning_rate": 2.816699729642871e-05,
"loss": 0.2219,
"num_input_tokens_seen": 281872,
"step": 1850
},
{
"epoch": 10.305555555555555,
"grad_norm": 0.2628016173839569,
"learning_rate": 2.8046733585128687e-05,
"loss": 0.2355,
"num_input_tokens_seen": 282640,
"step": 1855
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.24145862460136414,
"learning_rate": 2.792639826223101e-05,
"loss": 0.2409,
"num_input_tokens_seen": 283376,
"step": 1860
},
{
"epoch": 10.36111111111111,
"grad_norm": 0.1407560408115387,
"learning_rate": 2.7805994156143376e-05,
"loss": 0.2361,
"num_input_tokens_seen": 284112,
"step": 1865
},
{
"epoch": 10.38888888888889,
"grad_norm": 0.22101657092571259,
"learning_rate": 2.7685524096890185e-05,
"loss": 0.2294,
"num_input_tokens_seen": 284816,
"step": 1870
},
{
"epoch": 10.416666666666666,
"grad_norm": 0.11544955521821976,
"learning_rate": 2.756499091604603e-05,
"loss": 0.2308,
"num_input_tokens_seen": 285600,
"step": 1875
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.14251400530338287,
"learning_rate": 2.744439744666915e-05,
"loss": 0.2352,
"num_input_tokens_seen": 286368,
"step": 1880
},
{
"epoch": 10.472222222222221,
"grad_norm": 0.21146194636821747,
"learning_rate": 2.732374652323481e-05,
"loss": 0.2143,
"num_input_tokens_seen": 287136,
"step": 1885
},
{
"epoch": 10.5,
"grad_norm": 0.1281888633966446,
"learning_rate": 2.72030409815687e-05,
"loss": 0.2166,
"num_input_tokens_seen": 287920,
"step": 1890
},
{
"epoch": 10.527777777777779,
"grad_norm": 0.2592181861400604,
"learning_rate": 2.7082283658780288e-05,
"loss": 0.2402,
"num_input_tokens_seen": 288672,
"step": 1895
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.31524962186813354,
"learning_rate": 2.6961477393196126e-05,
"loss": 0.2449,
"num_input_tokens_seen": 289424,
"step": 1900
},
{
"epoch": 10.583333333333334,
"grad_norm": 0.17323945462703705,
"learning_rate": 2.684062502429312e-05,
"loss": 0.2517,
"num_input_tokens_seen": 290192,
"step": 1905
},
{
"epoch": 10.61111111111111,
"grad_norm": 0.223087340593338,
"learning_rate": 2.6719729392631826e-05,
"loss": 0.2138,
"num_input_tokens_seen": 290928,
"step": 1910
},
{
"epoch": 10.63888888888889,
"grad_norm": 0.16650070250034332,
"learning_rate": 2.659879333978964e-05,
"loss": 0.232,
"num_input_tokens_seen": 291696,
"step": 1915
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.22007060050964355,
"learning_rate": 2.6477819708294064e-05,
"loss": 0.2265,
"num_input_tokens_seen": 292432,
"step": 1920
},
{
"epoch": 10.694444444444445,
"grad_norm": 0.32412680983543396,
"learning_rate": 2.635681134155585e-05,
"loss": 0.2418,
"num_input_tokens_seen": 293200,
"step": 1925
},
{
"epoch": 10.722222222222221,
"grad_norm": 0.22895711660385132,
"learning_rate": 2.623577108380215e-05,
"loss": 0.2208,
"num_input_tokens_seen": 293952,
"step": 1930
},
{
"epoch": 10.75,
"grad_norm": 0.29938873648643494,
"learning_rate": 2.6114701780009753e-05,
"loss": 0.2194,
"num_input_tokens_seen": 294736,
"step": 1935
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.2544272243976593,
"learning_rate": 2.5993606275838117e-05,
"loss": 0.2111,
"num_input_tokens_seen": 295504,
"step": 1940
},
{
"epoch": 10.805555555555555,
"grad_norm": 0.22686506807804108,
"learning_rate": 2.587248741756253e-05,
"loss": 0.2199,
"num_input_tokens_seen": 296288,
"step": 1945
},
{
"epoch": 10.833333333333334,
"grad_norm": 0.2908959686756134,
"learning_rate": 2.5751348052007206e-05,
"loss": 0.2162,
"num_input_tokens_seen": 297040,
"step": 1950
},
{
"epoch": 10.86111111111111,
"grad_norm": 0.25148704648017883,
"learning_rate": 2.5630191026478368e-05,
"loss": 0.2349,
"num_input_tokens_seen": 297776,
"step": 1955
},
{
"epoch": 10.88888888888889,
"grad_norm": 0.32410892844200134,
"learning_rate": 2.5509019188697343e-05,
"loss": 0.215,
"num_input_tokens_seen": 298528,
"step": 1960
},
{
"epoch": 10.916666666666666,
"grad_norm": 0.40616366267204285,
"learning_rate": 2.5387835386733584e-05,
"loss": 0.2247,
"num_input_tokens_seen": 299296,
"step": 1965
},
{
"epoch": 10.944444444444445,
"grad_norm": 0.3647058606147766,
"learning_rate": 2.5266642468937766e-05,
"loss": 0.2378,
"num_input_tokens_seen": 300080,
"step": 1970
},
{
"epoch": 10.972222222222221,
"grad_norm": 0.29364150762557983,
"learning_rate": 2.5145443283874848e-05,
"loss": 0.2442,
"num_input_tokens_seen": 300848,
"step": 1975
},
{
"epoch": 11.0,
"grad_norm": 0.4087861478328705,
"learning_rate": 2.5024240680257055e-05,
"loss": 0.254,
"num_input_tokens_seen": 301600,
"step": 1980
},
{
"epoch": 11.0,
"eval_loss": 0.2517322301864624,
"eval_runtime": 0.8833,
"eval_samples_per_second": 45.287,
"eval_steps_per_second": 22.643,
"num_input_tokens_seen": 301600,
"step": 1980
},
{
"epoch": 11.027777777777779,
"grad_norm": 0.415340781211853,
"learning_rate": 2.4903037506876997e-05,
"loss": 0.2302,
"num_input_tokens_seen": 302320,
"step": 1985
},
{
"epoch": 11.055555555555555,
"grad_norm": 0.4484613537788391,
"learning_rate": 2.4781836612540657e-05,
"loss": 0.2435,
"num_input_tokens_seen": 303072,
"step": 1990
},
{
"epoch": 11.083333333333334,
"grad_norm": 0.26110631227493286,
"learning_rate": 2.4660640846000453e-05,
"loss": 0.2242,
"num_input_tokens_seen": 303840,
"step": 1995
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.14913207292556763,
"learning_rate": 2.4539453055888297e-05,
"loss": 0.2396,
"num_input_tokens_seen": 304576,
"step": 2000
},
{
"epoch": 11.13888888888889,
"grad_norm": 0.20138798654079437,
"learning_rate": 2.4418276090648596e-05,
"loss": 0.218,
"num_input_tokens_seen": 305344,
"step": 2005
},
{
"epoch": 11.166666666666666,
"grad_norm": 0.3144391179084778,
"learning_rate": 2.4297112798471326e-05,
"loss": 0.2182,
"num_input_tokens_seen": 306080,
"step": 2010
},
{
"epoch": 11.194444444444445,
"grad_norm": 0.21388863027095795,
"learning_rate": 2.4175966027225107e-05,
"loss": 0.2423,
"num_input_tokens_seen": 306832,
"step": 2015
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.25984007120132446,
"learning_rate": 2.405483862439023e-05,
"loss": 0.2111,
"num_input_tokens_seen": 307632,
"step": 2020
},
{
"epoch": 11.25,
"grad_norm": 0.3656483292579651,
"learning_rate": 2.3933733436991732e-05,
"loss": 0.2297,
"num_input_tokens_seen": 308384,
"step": 2025
},
{
"epoch": 11.277777777777779,
"grad_norm": 0.3722660541534424,
"learning_rate": 2.381265331153252e-05,
"loss": 0.2238,
"num_input_tokens_seen": 309152,
"step": 2030
},
{
"epoch": 11.305555555555555,
"grad_norm": 0.5844541788101196,
"learning_rate": 2.3691601093926404e-05,
"loss": 0.225,
"num_input_tokens_seen": 309904,
"step": 2035
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.5075754523277283,
"learning_rate": 2.3570579629431267e-05,
"loss": 0.2381,
"num_input_tokens_seen": 310672,
"step": 2040
},
{
"epoch": 11.36111111111111,
"grad_norm": 0.3695644736289978,
"learning_rate": 2.344959176258212e-05,
"loss": 0.2563,
"num_input_tokens_seen": 311440,
"step": 2045
},
{
"epoch": 11.38888888888889,
"grad_norm": 0.5124101638793945,
"learning_rate": 2.3328640337124326e-05,
"loss": 0.1948,
"num_input_tokens_seen": 312240,
"step": 2050
},
{
"epoch": 11.416666666666666,
"grad_norm": 0.31323179602622986,
"learning_rate": 2.3207728195946688e-05,
"loss": 0.2268,
"num_input_tokens_seen": 312944,
"step": 2055
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.2486533373594284,
"learning_rate": 2.3086858181014653e-05,
"loss": 0.2032,
"num_input_tokens_seen": 313712,
"step": 2060
},
{
"epoch": 11.472222222222221,
"grad_norm": 0.5734115242958069,
"learning_rate": 2.2966033133303545e-05,
"loss": 0.2351,
"num_input_tokens_seen": 314448,
"step": 2065
},
{
"epoch": 11.5,
"grad_norm": 0.42716243863105774,
"learning_rate": 2.2845255892731733e-05,
"loss": 0.2103,
"num_input_tokens_seen": 315200,
"step": 2070
},
{
"epoch": 11.527777777777779,
"grad_norm": 0.4750475287437439,
"learning_rate": 2.2724529298093915e-05,
"loss": 0.2522,
"num_input_tokens_seen": 315968,
"step": 2075
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.2699427604675293,
"learning_rate": 2.26038561869944e-05,
"loss": 0.2137,
"num_input_tokens_seen": 316720,
"step": 2080
},
{
"epoch": 11.583333333333334,
"grad_norm": 0.3729088306427002,
"learning_rate": 2.248323939578039e-05,
"loss": 0.2354,
"num_input_tokens_seen": 317488,
"step": 2085
},
{
"epoch": 11.61111111111111,
"grad_norm": 0.44618797302246094,
"learning_rate": 2.2362681759475307e-05,
"loss": 0.2655,
"num_input_tokens_seen": 318208,
"step": 2090
},
{
"epoch": 11.63888888888889,
"grad_norm": 0.3804112374782562,
"learning_rate": 2.2242186111712208e-05,
"loss": 0.2315,
"num_input_tokens_seen": 318944,
"step": 2095
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.18952308595180511,
"learning_rate": 2.212175528466712e-05,
"loss": 0.2254,
"num_input_tokens_seen": 319712,
"step": 2100
},
{
"epoch": 11.694444444444445,
"grad_norm": 0.16058814525604248,
"learning_rate": 2.2001392108992504e-05,
"loss": 0.2436,
"num_input_tokens_seen": 320480,
"step": 2105
},
{
"epoch": 11.722222222222221,
"grad_norm": 0.2892865836620331,
"learning_rate": 2.1881099413750733e-05,
"loss": 0.2182,
"num_input_tokens_seen": 321296,
"step": 2110
},
{
"epoch": 11.75,
"grad_norm": 0.22166113555431366,
"learning_rate": 2.1760880026347562e-05,
"loss": 0.2188,
"num_input_tokens_seen": 322064,
"step": 2115
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.19897451996803284,
"learning_rate": 2.16407367724657e-05,
"loss": 0.2411,
"num_input_tokens_seen": 322832,
"step": 2120
},
{
"epoch": 11.805555555555555,
"grad_norm": 0.1611793488264084,
"learning_rate": 2.1520672475998373e-05,
"loss": 0.2344,
"num_input_tokens_seen": 323600,
"step": 2125
},
{
"epoch": 11.833333333333334,
"grad_norm": 0.155122309923172,
"learning_rate": 2.140068995898297e-05,
"loss": 0.2243,
"num_input_tokens_seen": 324384,
"step": 2130
},
{
"epoch": 11.86111111111111,
"grad_norm": 0.23747070133686066,
"learning_rate": 2.1280792041534714e-05,
"loss": 0.2198,
"num_input_tokens_seen": 325152,
"step": 2135
},
{
"epoch": 11.88888888888889,
"grad_norm": 0.24674496054649353,
"learning_rate": 2.116098154178035e-05,
"loss": 0.2318,
"num_input_tokens_seen": 325904,
"step": 2140
},
{
"epoch": 11.916666666666666,
"grad_norm": 0.18954938650131226,
"learning_rate": 2.1041261275791933e-05,
"loss": 0.2125,
"num_input_tokens_seen": 326672,
"step": 2145
},
{
"epoch": 11.944444444444445,
"grad_norm": 0.4533662497997284,
"learning_rate": 2.092163405752063e-05,
"loss": 0.2239,
"num_input_tokens_seen": 327440,
"step": 2150
},
{
"epoch": 11.972222222222221,
"grad_norm": 0.438764750957489,
"learning_rate": 2.0802102698730574e-05,
"loss": 0.2081,
"num_input_tokens_seen": 328224,
"step": 2155
},
{
"epoch": 12.0,
"grad_norm": 0.2310199737548828,
"learning_rate": 2.0682670008932785e-05,
"loss": 0.2522,
"num_input_tokens_seen": 328976,
"step": 2160
},
{
"epoch": 12.0,
"eval_loss": 0.24894733726978302,
"eval_runtime": 0.8684,
"eval_samples_per_second": 46.062,
"eval_steps_per_second": 23.031,
"num_input_tokens_seen": 328976,
"step": 2160
},
{
"epoch": 12.027777777777779,
"grad_norm": 0.3067687749862671,
"learning_rate": 2.0563338795319123e-05,
"loss": 0.2306,
"num_input_tokens_seen": 329760,
"step": 2165
},
{
"epoch": 12.055555555555555,
"grad_norm": 0.20359157025814056,
"learning_rate": 2.0444111862696314e-05,
"loss": 0.2389,
"num_input_tokens_seen": 330528,
"step": 2170
},
{
"epoch": 12.083333333333334,
"grad_norm": 0.3316972255706787,
"learning_rate": 2.032499201342003e-05,
"loss": 0.2192,
"num_input_tokens_seen": 331312,
"step": 2175
},
{
"epoch": 12.11111111111111,
"grad_norm": 0.34185460209846497,
"learning_rate": 2.020598204732901e-05,
"loss": 0.2054,
"num_input_tokens_seen": 332064,
"step": 2180
},
{
"epoch": 12.13888888888889,
"grad_norm": 0.4968796968460083,
"learning_rate": 2.0087084761679245e-05,
"loss": 0.2019,
"num_input_tokens_seen": 332816,
"step": 2185
},
{
"epoch": 12.166666666666666,
"grad_norm": 0.5224512219429016,
"learning_rate": 1.996830295107827e-05,
"loss": 0.2533,
"num_input_tokens_seen": 333584,
"step": 2190
},
{
"epoch": 12.194444444444445,
"grad_norm": 0.6830374002456665,
"learning_rate": 1.9849639407419423e-05,
"loss": 0.2338,
"num_input_tokens_seen": 334336,
"step": 2195
},
{
"epoch": 12.222222222222221,
"grad_norm": 0.5021493434906006,
"learning_rate": 1.973109691981627e-05,
"loss": 0.1989,
"num_input_tokens_seen": 335104,
"step": 2200
},
{
"epoch": 12.25,
"grad_norm": 0.4624042510986328,
"learning_rate": 1.9612678274537005e-05,
"loss": 0.204,
"num_input_tokens_seen": 335904,
"step": 2205
},
{
"epoch": 12.277777777777779,
"grad_norm": 0.9172624945640564,
"learning_rate": 1.9494386254939e-05,
"loss": 0.2095,
"num_input_tokens_seen": 336704,
"step": 2210
},
{
"epoch": 12.305555555555555,
"grad_norm": 0.578337550163269,
"learning_rate": 1.937622364140338e-05,
"loss": 0.2207,
"num_input_tokens_seen": 337456,
"step": 2215
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.48846668004989624,
"learning_rate": 1.925819321126964e-05,
"loss": 0.2118,
"num_input_tokens_seen": 338208,
"step": 2220
},
{
"epoch": 12.36111111111111,
"grad_norm": 0.5584977865219116,
"learning_rate": 1.9140297738770385e-05,
"loss": 0.2486,
"num_input_tokens_seen": 338976,
"step": 2225
},
{
"epoch": 12.38888888888889,
"grad_norm": 0.42385029792785645,
"learning_rate": 1.9022539994966147e-05,
"loss": 0.2217,
"num_input_tokens_seen": 339712,
"step": 2230
},
{
"epoch": 12.416666666666666,
"grad_norm": 0.3373255431652069,
"learning_rate": 1.8904922747680204e-05,
"loss": 0.2387,
"num_input_tokens_seen": 340464,
"step": 2235
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.5355724692344666,
"learning_rate": 1.8787448761433556e-05,
"loss": 0.228,
"num_input_tokens_seen": 341232,
"step": 2240
},
{
"epoch": 12.472222222222221,
"grad_norm": 0.6969017386436462,
"learning_rate": 1.8670120797379958e-05,
"loss": 0.2385,
"num_input_tokens_seen": 342000,
"step": 2245
},
{
"epoch": 12.5,
"grad_norm": 0.4944399297237396,
"learning_rate": 1.8552941613240983e-05,
"loss": 0.2027,
"num_input_tokens_seen": 342752,
"step": 2250
},
{
"epoch": 12.527777777777779,
"grad_norm": 0.5233724117279053,
"learning_rate": 1.8435913963241226e-05,
"loss": 0.219,
"num_input_tokens_seen": 343504,
"step": 2255
},
{
"epoch": 12.555555555555555,
"grad_norm": 0.5676577687263489,
"learning_rate": 1.831904059804358e-05,
"loss": 0.226,
"num_input_tokens_seen": 344240,
"step": 2260
},
{
"epoch": 12.583333333333334,
"grad_norm": 0.4965709447860718,
"learning_rate": 1.8202324264684544e-05,
"loss": 0.196,
"num_input_tokens_seen": 345008,
"step": 2265
},
{
"epoch": 12.61111111111111,
"grad_norm": 0.5146750211715698,
"learning_rate": 1.8085767706509712e-05,
"loss": 0.1933,
"num_input_tokens_seen": 345776,
"step": 2270
},
{
"epoch": 12.63888888888889,
"grad_norm": 0.6066842675209045,
"learning_rate": 1.7969373663109234e-05,
"loss": 0.2546,
"num_input_tokens_seen": 346544,
"step": 2275
},
{
"epoch": 12.666666666666666,
"grad_norm": 1.1421144008636475,
"learning_rate": 1.7853144870253458e-05,
"loss": 0.2309,
"num_input_tokens_seen": 347312,
"step": 2280
},
{
"epoch": 12.694444444444445,
"grad_norm": 0.7533566355705261,
"learning_rate": 1.7737084059828637e-05,
"loss": 0.257,
"num_input_tokens_seen": 348064,
"step": 2285
},
{
"epoch": 12.722222222222221,
"grad_norm": 0.49484172463417053,
"learning_rate": 1.7621193959772657e-05,
"loss": 0.201,
"num_input_tokens_seen": 348800,
"step": 2290
},
{
"epoch": 12.75,
"grad_norm": 0.5660438537597656,
"learning_rate": 1.750547729401101e-05,
"loss": 0.2252,
"num_input_tokens_seen": 349552,
"step": 2295
},
{
"epoch": 12.777777777777779,
"grad_norm": 0.6149576902389526,
"learning_rate": 1.7389936782392695e-05,
"loss": 0.2122,
"num_input_tokens_seen": 350304,
"step": 2300
},
{
"epoch": 12.805555555555555,
"grad_norm": 0.9000388979911804,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.2483,
"num_input_tokens_seen": 351072,
"step": 2305
},
{
"epoch": 12.833333333333334,
"grad_norm": 0.9429511427879333,
"learning_rate": 1.7159395080216273e-05,
"loss": 0.2592,
"num_input_tokens_seen": 351856,
"step": 2310
},
{
"epoch": 12.86111111111111,
"grad_norm": 0.7645384669303894,
"learning_rate": 1.7044399308398983e-05,
"loss": 0.216,
"num_input_tokens_seen": 352624,
"step": 2315
},
{
"epoch": 12.88888888888889,
"grad_norm": 1.0084342956542969,
"learning_rate": 1.692959052807928e-05,
"loss": 0.2183,
"num_input_tokens_seen": 353376,
"step": 2320
},
{
"epoch": 12.916666666666666,
"grad_norm": 0.5593501329421997,
"learning_rate": 1.681497143776689e-05,
"loss": 0.2226,
"num_input_tokens_seen": 354112,
"step": 2325
},
{
"epoch": 12.944444444444445,
"grad_norm": 0.6755273938179016,
"learning_rate": 1.670054473151298e-05,
"loss": 0.2377,
"num_input_tokens_seen": 354880,
"step": 2330
},
{
"epoch": 12.972222222222221,
"grad_norm": 0.7064863443374634,
"learning_rate": 1.658631309884684e-05,
"loss": 0.2067,
"num_input_tokens_seen": 355632,
"step": 2335
},
{
"epoch": 13.0,
"grad_norm": 0.5242209434509277,
"learning_rate": 1.6472279224712702e-05,
"loss": 0.2228,
"num_input_tokens_seen": 356400,
"step": 2340
},
{
"epoch": 13.0,
"eval_loss": 0.2544581890106201,
"eval_runtime": 0.8689,
"eval_samples_per_second": 46.034,
"eval_steps_per_second": 23.017,
"num_input_tokens_seen": 356400,
"step": 2340
},
{
"epoch": 13.027777777777779,
"grad_norm": 0.9090105891227722,
"learning_rate": 1.6358445789406584e-05,
"loss": 0.2227,
"num_input_tokens_seen": 357136,
"step": 2345
},
{
"epoch": 13.055555555555555,
"grad_norm": 0.6356475949287415,
"learning_rate": 1.6244815468513315e-05,
"loss": 0.2572,
"num_input_tokens_seen": 357872,
"step": 2350
},
{
"epoch": 13.083333333333334,
"grad_norm": 0.7765060067176819,
"learning_rate": 1.6131390932843648e-05,
"loss": 0.1878,
"num_input_tokens_seen": 358608,
"step": 2355
},
{
"epoch": 13.11111111111111,
"grad_norm": 0.7711853384971619,
"learning_rate": 1.6018174848371494e-05,
"loss": 0.2009,
"num_input_tokens_seen": 359408,
"step": 2360
},
{
"epoch": 13.13888888888889,
"grad_norm": 0.5515719652175903,
"learning_rate": 1.5905169876171223e-05,
"loss": 0.2151,
"num_input_tokens_seen": 360176,
"step": 2365
},
{
"epoch": 13.166666666666666,
"grad_norm": 0.7772662043571472,
"learning_rate": 1.579237867235514e-05,
"loss": 0.1831,
"num_input_tokens_seen": 360928,
"step": 2370
},
{
"epoch": 13.194444444444445,
"grad_norm": 0.9589880704879761,
"learning_rate": 1.567980388801109e-05,
"loss": 0.2382,
"num_input_tokens_seen": 361680,
"step": 2375
},
{
"epoch": 13.222222222222221,
"grad_norm": 0.742833137512207,
"learning_rate": 1.556744816914008e-05,
"loss": 0.2503,
"num_input_tokens_seen": 362448,
"step": 2380
},
{
"epoch": 13.25,
"grad_norm": 0.5612350702285767,
"learning_rate": 1.5455314156594124e-05,
"loss": 0.23,
"num_input_tokens_seen": 363216,
"step": 2385
},
{
"epoch": 13.277777777777779,
"grad_norm": 0.9716349244117737,
"learning_rate": 1.534340448601418e-05,
"loss": 0.2322,
"num_input_tokens_seen": 363984,
"step": 2390
},
{
"epoch": 13.305555555555555,
"grad_norm": 0.6680596470832825,
"learning_rate": 1.523172178776816e-05,
"loss": 0.2185,
"num_input_tokens_seen": 364736,
"step": 2395
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.775281548500061,
"learning_rate": 1.512026868688915e-05,
"loss": 0.2205,
"num_input_tokens_seen": 365456,
"step": 2400
},
{
"epoch": 13.36111111111111,
"grad_norm": 0.6668827533721924,
"learning_rate": 1.5009047803013699e-05,
"loss": 0.2149,
"num_input_tokens_seen": 366240,
"step": 2405
},
{
"epoch": 13.38888888888889,
"grad_norm": 1.0453615188598633,
"learning_rate": 1.4898061750320212e-05,
"loss": 0.2087,
"num_input_tokens_seen": 367024,
"step": 2410
},
{
"epoch": 13.416666666666666,
"grad_norm": 1.2332998514175415,
"learning_rate": 1.4787313137467546e-05,
"loss": 0.2069,
"num_input_tokens_seen": 367792,
"step": 2415
},
{
"epoch": 13.444444444444445,
"grad_norm": 0.6259872913360596,
"learning_rate": 1.4676804567533687e-05,
"loss": 0.222,
"num_input_tokens_seen": 368592,
"step": 2420
},
{
"epoch": 13.472222222222221,
"grad_norm": 1.6646252870559692,
"learning_rate": 1.4566538637954554e-05,
"loss": 0.2413,
"num_input_tokens_seen": 369328,
"step": 2425
},
{
"epoch": 13.5,
"grad_norm": 1.4683263301849365,
"learning_rate": 1.4456517940462949e-05,
"loss": 0.1956,
"num_input_tokens_seen": 370064,
"step": 2430
},
{
"epoch": 13.527777777777779,
"grad_norm": 0.7791321873664856,
"learning_rate": 1.4346745061027644e-05,
"loss": 0.1904,
"num_input_tokens_seen": 370832,
"step": 2435
},
{
"epoch": 13.555555555555555,
"grad_norm": 1.082039713859558,
"learning_rate": 1.4237222579792618e-05,
"loss": 0.2108,
"num_input_tokens_seen": 371568,
"step": 2440
},
{
"epoch": 13.583333333333334,
"grad_norm": 0.8041364550590515,
"learning_rate": 1.4127953071016383e-05,
"loss": 0.2039,
"num_input_tokens_seen": 372320,
"step": 2445
},
{
"epoch": 13.61111111111111,
"grad_norm": 0.7636337280273438,
"learning_rate": 1.4018939103011472e-05,
"loss": 0.1884,
"num_input_tokens_seen": 373072,
"step": 2450
},
{
"epoch": 13.63888888888889,
"grad_norm": 1.480279564857483,
"learning_rate": 1.3910183238084112e-05,
"loss": 0.2047,
"num_input_tokens_seen": 373840,
"step": 2455
},
{
"epoch": 13.666666666666666,
"grad_norm": 1.4362431764602661,
"learning_rate": 1.3801688032473958e-05,
"loss": 0.2926,
"num_input_tokens_seen": 374624,
"step": 2460
},
{
"epoch": 13.694444444444445,
"grad_norm": 0.8362478613853455,
"learning_rate": 1.369345603629406e-05,
"loss": 0.1942,
"num_input_tokens_seen": 375392,
"step": 2465
},
{
"epoch": 13.722222222222221,
"grad_norm": 0.9436230063438416,
"learning_rate": 1.3585489793470862e-05,
"loss": 0.2216,
"num_input_tokens_seen": 376144,
"step": 2470
},
{
"epoch": 13.75,
"grad_norm": 1.2598055601119995,
"learning_rate": 1.3477791841684451e-05,
"loss": 0.1824,
"num_input_tokens_seen": 376880,
"step": 2475
},
{
"epoch": 13.777777777777779,
"grad_norm": 1.0539710521697998,
"learning_rate": 1.337036471230889e-05,
"loss": 0.1747,
"num_input_tokens_seen": 377664,
"step": 2480
},
{
"epoch": 13.805555555555555,
"grad_norm": 1.3532763719558716,
"learning_rate": 1.3263210930352737e-05,
"loss": 0.1954,
"num_input_tokens_seen": 378464,
"step": 2485
},
{
"epoch": 13.833333333333334,
"grad_norm": 1.3453116416931152,
"learning_rate": 1.3156333014399674e-05,
"loss": 0.2218,
"num_input_tokens_seen": 379216,
"step": 2490
},
{
"epoch": 13.86111111111111,
"grad_norm": 1.9826656579971313,
"learning_rate": 1.3049733476549352e-05,
"loss": 0.2116,
"num_input_tokens_seen": 379952,
"step": 2495
},
{
"epoch": 13.88888888888889,
"grad_norm": 1.7242130041122437,
"learning_rate": 1.2943414822358285e-05,
"loss": 0.1947,
"num_input_tokens_seen": 380720,
"step": 2500
},
{
"epoch": 13.916666666666666,
"grad_norm": 1.9099639654159546,
"learning_rate": 1.2837379550781003e-05,
"loss": 0.2132,
"num_input_tokens_seen": 381504,
"step": 2505
},
{
"epoch": 13.944444444444445,
"grad_norm": 2.161792516708374,
"learning_rate": 1.2731630154111296e-05,
"loss": 0.2268,
"num_input_tokens_seen": 382272,
"step": 2510
},
{
"epoch": 13.972222222222221,
"grad_norm": 2.0257785320281982,
"learning_rate": 1.262616911792365e-05,
"loss": 0.2125,
"num_input_tokens_seen": 383056,
"step": 2515
},
{
"epoch": 14.0,
"grad_norm": 1.314621090888977,
"learning_rate": 1.2520998921014792e-05,
"loss": 0.1836,
"num_input_tokens_seen": 383808,
"step": 2520
},
{
"epoch": 14.0,
"eval_loss": 0.26544028520584106,
"eval_runtime": 0.8657,
"eval_samples_per_second": 46.206,
"eval_steps_per_second": 23.103,
"num_input_tokens_seen": 383808,
"step": 2520
},
{
"epoch": 14.027777777777779,
"grad_norm": 1.272783875465393,
"learning_rate": 1.2416122035345507e-05,
"loss": 0.1602,
"num_input_tokens_seen": 384576,
"step": 2525
},
{
"epoch": 14.055555555555555,
"grad_norm": 1.6169074773788452,
"learning_rate": 1.2311540925982403e-05,
"loss": 0.1931,
"num_input_tokens_seen": 385344,
"step": 2530
},
{
"epoch": 14.083333333333334,
"grad_norm": 1.8734256029129028,
"learning_rate": 1.2207258051040099e-05,
"loss": 0.1959,
"num_input_tokens_seen": 386096,
"step": 2535
},
{
"epoch": 14.11111111111111,
"grad_norm": 1.183911919593811,
"learning_rate": 1.2103275861623378e-05,
"loss": 0.161,
"num_input_tokens_seen": 386880,
"step": 2540
},
{
"epoch": 14.13888888888889,
"grad_norm": 1.1330690383911133,
"learning_rate": 1.1999596801769616e-05,
"loss": 0.1154,
"num_input_tokens_seen": 387696,
"step": 2545
},
{
"epoch": 14.166666666666666,
"grad_norm": 1.8644895553588867,
"learning_rate": 1.189622330839129e-05,
"loss": 0.2176,
"num_input_tokens_seen": 388480,
"step": 2550
},
{
"epoch": 14.194444444444445,
"grad_norm": 2.0677993297576904,
"learning_rate": 1.179315781121874e-05,
"loss": 0.1774,
"num_input_tokens_seen": 389248,
"step": 2555
},
{
"epoch": 14.222222222222221,
"grad_norm": 2.278595209121704,
"learning_rate": 1.1690402732743042e-05,
"loss": 0.1644,
"num_input_tokens_seen": 390016,
"step": 2560
},
{
"epoch": 14.25,
"grad_norm": 2.7160439491271973,
"learning_rate": 1.158796048815906e-05,
"loss": 0.1784,
"num_input_tokens_seen": 390832,
"step": 2565
},
{
"epoch": 14.277777777777779,
"grad_norm": 2.1027543544769287,
"learning_rate": 1.1485833485308702e-05,
"loss": 0.1768,
"num_input_tokens_seen": 391568,
"step": 2570
},
{
"epoch": 14.305555555555555,
"grad_norm": 2.1385934352874756,
"learning_rate": 1.1384024124624324e-05,
"loss": 0.2043,
"num_input_tokens_seen": 392352,
"step": 2575
},
{
"epoch": 14.333333333333334,
"grad_norm": 2.0394463539123535,
"learning_rate": 1.1282534799072272e-05,
"loss": 0.1632,
"num_input_tokens_seen": 393104,
"step": 2580
},
{
"epoch": 14.36111111111111,
"grad_norm": 3.263662815093994,
"learning_rate": 1.1181367894096684e-05,
"loss": 0.1768,
"num_input_tokens_seen": 393840,
"step": 2585
},
{
"epoch": 14.38888888888889,
"grad_norm": 2.817535400390625,
"learning_rate": 1.1080525787563393e-05,
"loss": 0.1862,
"num_input_tokens_seen": 394576,
"step": 2590
},
{
"epoch": 14.416666666666666,
"grad_norm": 3.808518648147583,
"learning_rate": 1.0980010849704036e-05,
"loss": 0.23,
"num_input_tokens_seen": 395312,
"step": 2595
},
{
"epoch": 14.444444444444445,
"grad_norm": 4.187227249145508,
"learning_rate": 1.0879825443060362e-05,
"loss": 0.1612,
"num_input_tokens_seen": 396048,
"step": 2600
},
{
"epoch": 14.472222222222221,
"grad_norm": 1.87286376953125,
"learning_rate": 1.0779971922428711e-05,
"loss": 0.1516,
"num_input_tokens_seen": 396800,
"step": 2605
},
{
"epoch": 14.5,
"grad_norm": 3.5704548358917236,
"learning_rate": 1.0680452634804603e-05,
"loss": 0.1282,
"num_input_tokens_seen": 397568,
"step": 2610
},
{
"epoch": 14.527777777777779,
"grad_norm": 2.150921583175659,
"learning_rate": 1.0581269919327643e-05,
"loss": 0.1617,
"num_input_tokens_seen": 398352,
"step": 2615
},
{
"epoch": 14.555555555555555,
"grad_norm": 3.5931949615478516,
"learning_rate": 1.0482426107226507e-05,
"loss": 0.2883,
"num_input_tokens_seen": 399104,
"step": 2620
},
{
"epoch": 14.583333333333334,
"grad_norm": 1.6676691770553589,
"learning_rate": 1.0383923521764174e-05,
"loss": 0.1869,
"num_input_tokens_seen": 399872,
"step": 2625
},
{
"epoch": 14.61111111111111,
"grad_norm": 4.933192253112793,
"learning_rate": 1.0285764478183284e-05,
"loss": 0.1692,
"num_input_tokens_seen": 400608,
"step": 2630
},
{
"epoch": 14.63888888888889,
"grad_norm": 3.1867835521698,
"learning_rate": 1.0187951283651736e-05,
"loss": 0.22,
"num_input_tokens_seen": 401344,
"step": 2635
},
{
"epoch": 14.666666666666666,
"grad_norm": 3.876699209213257,
"learning_rate": 1.0090486237208463e-05,
"loss": 0.1891,
"num_input_tokens_seen": 402080,
"step": 2640
},
{
"epoch": 14.694444444444445,
"grad_norm": 5.679113388061523,
"learning_rate": 9.993371629709391e-06,
"loss": 0.1553,
"num_input_tokens_seen": 402848,
"step": 2645
},
{
"epoch": 14.722222222222221,
"grad_norm": 1.1258200407028198,
"learning_rate": 9.89660974377359e-06,
"loss": 0.1653,
"num_input_tokens_seen": 403632,
"step": 2650
},
{
"epoch": 14.75,
"grad_norm": 3.4495646953582764,
"learning_rate": 9.800202853729651e-06,
"loss": 0.1575,
"num_input_tokens_seen": 404416,
"step": 2655
},
{
"epoch": 14.777777777777779,
"grad_norm": 2.117347240447998,
"learning_rate": 9.704153225562171e-06,
"loss": 0.1467,
"num_input_tokens_seen": 405184,
"step": 2660
},
{
"epoch": 14.805555555555555,
"grad_norm": 2.9187674522399902,
"learning_rate": 9.608463116858542e-06,
"loss": 0.2817,
"num_input_tokens_seen": 405952,
"step": 2665
},
{
"epoch": 14.833333333333334,
"grad_norm": 3.9938457012176514,
"learning_rate": 9.51313477675588e-06,
"loss": 0.1923,
"num_input_tokens_seen": 406688,
"step": 2670
},
{
"epoch": 14.86111111111111,
"grad_norm": 5.589292049407959,
"learning_rate": 9.418170445888139e-06,
"loss": 0.1637,
"num_input_tokens_seen": 407440,
"step": 2675
},
{
"epoch": 14.88888888888889,
"grad_norm": 1.3871015310287476,
"learning_rate": 9.323572356333454e-06,
"loss": 0.1199,
"num_input_tokens_seen": 408192,
"step": 2680
},
{
"epoch": 14.916666666666666,
"grad_norm": 1.9564611911773682,
"learning_rate": 9.22934273156172e-06,
"loss": 0.1095,
"num_input_tokens_seen": 408944,
"step": 2685
},
{
"epoch": 14.944444444444445,
"grad_norm": 4.645902633666992,
"learning_rate": 9.135483786382262e-06,
"loss": 0.1473,
"num_input_tokens_seen": 409680,
"step": 2690
},
{
"epoch": 14.972222222222221,
"grad_norm": 4.054070949554443,
"learning_rate": 9.0419977268918e-06,
"loss": 0.2314,
"num_input_tokens_seen": 410464,
"step": 2695
},
{
"epoch": 15.0,
"grad_norm": 3.3633830547332764,
"learning_rate": 8.948886750422636e-06,
"loss": 0.1791,
"num_input_tokens_seen": 411216,
"step": 2700
},
{
"epoch": 15.0,
"eval_loss": 0.2790451645851135,
"eval_runtime": 0.87,
"eval_samples_per_second": 45.976,
"eval_steps_per_second": 22.988,
"num_input_tokens_seen": 411216,
"step": 2700
},
{
"epoch": 15.027777777777779,
"grad_norm": 1.7929856777191162,
"learning_rate": 8.856153045490948e-06,
"loss": 0.1193,
"num_input_tokens_seen": 411984,
"step": 2705
},
{
"epoch": 15.055555555555555,
"grad_norm": 2.6607961654663086,
"learning_rate": 8.763798791745411e-06,
"loss": 0.1421,
"num_input_tokens_seen": 412768,
"step": 2710
},
{
"epoch": 15.083333333333334,
"grad_norm": 2.3993401527404785,
"learning_rate": 8.671826159915907e-06,
"loss": 0.1301,
"num_input_tokens_seen": 413552,
"step": 2715
},
{
"epoch": 15.11111111111111,
"grad_norm": 4.58685302734375,
"learning_rate": 8.58023731176254e-06,
"loss": 0.1253,
"num_input_tokens_seen": 414288,
"step": 2720
},
{
"epoch": 15.13888888888889,
"grad_norm": 3.337277889251709,
"learning_rate": 8.489034400024812e-06,
"loss": 0.0808,
"num_input_tokens_seen": 415056,
"step": 2725
},
{
"epoch": 15.166666666666666,
"grad_norm": 1.4871830940246582,
"learning_rate": 8.39821956837102e-06,
"loss": 0.2168,
"num_input_tokens_seen": 415824,
"step": 2730
},
{
"epoch": 15.194444444444445,
"grad_norm": 4.174405574798584,
"learning_rate": 8.3077949513479e-06,
"loss": 0.1412,
"num_input_tokens_seen": 416592,
"step": 2735
},
{
"epoch": 15.222222222222221,
"grad_norm": 5.589694499969482,
"learning_rate": 8.217762674330413e-06,
"loss": 0.1869,
"num_input_tokens_seen": 417344,
"step": 2740
},
{
"epoch": 15.25,
"grad_norm": 3.3008785247802734,
"learning_rate": 8.128124853471814e-06,
"loss": 0.0887,
"num_input_tokens_seen": 418096,
"step": 2745
},
{
"epoch": 15.277777777777779,
"grad_norm": 5.699565410614014,
"learning_rate": 8.03888359565391e-06,
"loss": 0.1541,
"num_input_tokens_seen": 418816,
"step": 2750
},
{
"epoch": 15.305555555555555,
"grad_norm": 2.3659095764160156,
"learning_rate": 7.950040998437542e-06,
"loss": 0.218,
"num_input_tokens_seen": 419568,
"step": 2755
},
{
"epoch": 15.333333333333334,
"grad_norm": 1.412232518196106,
"learning_rate": 7.86159915001326e-06,
"loss": 0.0468,
"num_input_tokens_seen": 420336,
"step": 2760
},
{
"epoch": 15.36111111111111,
"grad_norm": 2.961951971054077,
"learning_rate": 7.7735601291523e-06,
"loss": 0.077,
"num_input_tokens_seen": 421088,
"step": 2765
},
{
"epoch": 15.38888888888889,
"grad_norm": 1.9995497465133667,
"learning_rate": 7.685926005157651e-06,
"loss": 0.0845,
"num_input_tokens_seen": 421872,
"step": 2770
},
{
"epoch": 15.416666666666666,
"grad_norm": 7.176272869110107,
"learning_rate": 7.598698837815449e-06,
"loss": 0.0801,
"num_input_tokens_seen": 422640,
"step": 2775
},
{
"epoch": 15.444444444444445,
"grad_norm": 11.84694766998291,
"learning_rate": 7.511880677346578e-06,
"loss": 0.1608,
"num_input_tokens_seen": 423392,
"step": 2780
},
{
"epoch": 15.472222222222221,
"grad_norm": 7.272697925567627,
"learning_rate": 7.4254735643584564e-06,
"loss": 0.1564,
"num_input_tokens_seen": 424160,
"step": 2785
},
{
"epoch": 15.5,
"grad_norm": 8.075713157653809,
"learning_rate": 7.339479529797111e-06,
"loss": 0.0772,
"num_input_tokens_seen": 424928,
"step": 2790
},
{
"epoch": 15.527777777777779,
"grad_norm": 1.1608163118362427,
"learning_rate": 7.2539005948993825e-06,
"loss": 0.211,
"num_input_tokens_seen": 425664,
"step": 2795
},
{
"epoch": 15.555555555555555,
"grad_norm": 11.11883544921875,
"learning_rate": 7.168738771145464e-06,
"loss": 0.242,
"num_input_tokens_seen": 426416,
"step": 2800
},
{
"epoch": 15.583333333333334,
"grad_norm": 14.707919120788574,
"learning_rate": 7.083996060211607e-06,
"loss": 0.2303,
"num_input_tokens_seen": 427184,
"step": 2805
},
{
"epoch": 15.61111111111111,
"grad_norm": 6.5854973793029785,
"learning_rate": 6.9996744539230665e-06,
"loss": 0.054,
"num_input_tokens_seen": 427952,
"step": 2810
},
{
"epoch": 15.63888888888889,
"grad_norm": 2.7222070693969727,
"learning_rate": 6.9157759342072995e-06,
"loss": 0.0612,
"num_input_tokens_seen": 428720,
"step": 2815
},
{
"epoch": 15.666666666666666,
"grad_norm": 5.319427967071533,
"learning_rate": 6.832302473047384e-06,
"loss": 0.112,
"num_input_tokens_seen": 429472,
"step": 2820
},
{
"epoch": 15.694444444444445,
"grad_norm": 3.4745731353759766,
"learning_rate": 6.7492560324356355e-06,
"loss": 0.1822,
"num_input_tokens_seen": 430224,
"step": 2825
},
{
"epoch": 15.722222222222221,
"grad_norm": 1.6407183408737183,
"learning_rate": 6.666638564327532e-06,
"loss": 0.1439,
"num_input_tokens_seen": 430960,
"step": 2830
},
{
"epoch": 15.75,
"grad_norm": 9.03652572631836,
"learning_rate": 6.584452010595807e-06,
"loss": 0.1255,
"num_input_tokens_seen": 431776,
"step": 2835
},
{
"epoch": 15.777777777777779,
"grad_norm": 4.16492223739624,
"learning_rate": 6.502698302984811e-06,
"loss": 0.1593,
"num_input_tokens_seen": 432512,
"step": 2840
},
{
"epoch": 15.805555555555555,
"grad_norm": 1.4153860807418823,
"learning_rate": 6.421379363065142e-06,
"loss": 0.0567,
"num_input_tokens_seen": 433296,
"step": 2845
},
{
"epoch": 15.833333333333334,
"grad_norm": 0.6895349621772766,
"learning_rate": 6.340497102188425e-06,
"loss": 0.0864,
"num_input_tokens_seen": 434080,
"step": 2850
},
{
"epoch": 15.86111111111111,
"grad_norm": 3.851112127304077,
"learning_rate": 6.26005342144241e-06,
"loss": 0.0645,
"num_input_tokens_seen": 434832,
"step": 2855
},
{
"epoch": 15.88888888888889,
"grad_norm": 4.547049045562744,
"learning_rate": 6.180050211606303e-06,
"loss": 0.1454,
"num_input_tokens_seen": 435600,
"step": 2860
},
{
"epoch": 15.916666666666666,
"grad_norm": 0.9072476625442505,
"learning_rate": 6.100489353106304e-06,
"loss": 0.0997,
"num_input_tokens_seen": 436320,
"step": 2865
},
{
"epoch": 15.944444444444445,
"grad_norm": 2.3731799125671387,
"learning_rate": 6.021372715971437e-06,
"loss": 0.1351,
"num_input_tokens_seen": 437072,
"step": 2870
},
{
"epoch": 15.972222222222221,
"grad_norm": 3.2382946014404297,
"learning_rate": 5.942702159789554e-06,
"loss": 0.1,
"num_input_tokens_seen": 437824,
"step": 2875
},
{
"epoch": 16.0,
"grad_norm": 6.139621257781982,
"learning_rate": 5.864479533663655e-06,
"loss": 0.1126,
"num_input_tokens_seen": 438592,
"step": 2880
},
{
"epoch": 16.0,
"eval_loss": 0.3588094711303711,
"eval_runtime": 0.8659,
"eval_samples_per_second": 46.195,
"eval_steps_per_second": 23.098,
"num_input_tokens_seen": 438592,
"step": 2880
},
{
"epoch": 16.02777777777778,
"grad_norm": 1.339462161064148,
"learning_rate": 5.786706676168424e-06,
"loss": 0.0558,
"num_input_tokens_seen": 439360,
"step": 2885
},
{
"epoch": 16.055555555555557,
"grad_norm": 1.4482659101486206,
"learning_rate": 5.709385415307006e-06,
"loss": 0.0721,
"num_input_tokens_seen": 440144,
"step": 2890
},
{
"epoch": 16.083333333333332,
"grad_norm": 1.2527096271514893,
"learning_rate": 5.6325175684680374e-06,
"loss": 0.0536,
"num_input_tokens_seen": 440880,
"step": 2895
},
{
"epoch": 16.11111111111111,
"grad_norm": 0.44911694526672363,
"learning_rate": 5.556104942382964e-06,
"loss": 0.0441,
"num_input_tokens_seen": 441632,
"step": 2900
},
{
"epoch": 16.13888888888889,
"grad_norm": 2.401498556137085,
"learning_rate": 5.48014933308352e-06,
"loss": 0.1571,
"num_input_tokens_seen": 442384,
"step": 2905
},
{
"epoch": 16.166666666666668,
"grad_norm": 3.040595531463623,
"learning_rate": 5.404652525859552e-06,
"loss": 0.0203,
"num_input_tokens_seen": 443184,
"step": 2910
},
{
"epoch": 16.194444444444443,
"grad_norm": 0.29359209537506104,
"learning_rate": 5.329616295217046e-06,
"loss": 0.0661,
"num_input_tokens_seen": 443952,
"step": 2915
},
{
"epoch": 16.22222222222222,
"grad_norm": 2.6835484504699707,
"learning_rate": 5.2550424048364185e-06,
"loss": 0.0856,
"num_input_tokens_seen": 444720,
"step": 2920
},
{
"epoch": 16.25,
"grad_norm": 2.9343628883361816,
"learning_rate": 5.180932607531056e-06,
"loss": 0.053,
"num_input_tokens_seen": 445504,
"step": 2925
},
{
"epoch": 16.27777777777778,
"grad_norm": 1.0284254550933838,
"learning_rate": 5.107288645206149e-06,
"loss": 0.0551,
"num_input_tokens_seen": 446240,
"step": 2930
},
{
"epoch": 16.305555555555557,
"grad_norm": 3.6044394969940186,
"learning_rate": 5.034112248817685e-06,
"loss": 0.0884,
"num_input_tokens_seen": 446992,
"step": 2935
},
{
"epoch": 16.333333333333332,
"grad_norm": 0.7247748374938965,
"learning_rate": 4.961405138331826e-06,
"loss": 0.057,
"num_input_tokens_seen": 447760,
"step": 2940
},
{
"epoch": 16.36111111111111,
"grad_norm": 2.632530927658081,
"learning_rate": 4.88916902268445e-06,
"loss": 0.0696,
"num_input_tokens_seen": 448528,
"step": 2945
},
{
"epoch": 16.38888888888889,
"grad_norm": 0.6444573402404785,
"learning_rate": 4.817405599741004e-06,
"loss": 0.0385,
"num_input_tokens_seen": 449312,
"step": 2950
},
{
"epoch": 16.416666666666668,
"grad_norm": 17.892915725708008,
"learning_rate": 4.746116556256569e-06,
"loss": 0.1573,
"num_input_tokens_seen": 450064,
"step": 2955
},
{
"epoch": 16.444444444444443,
"grad_norm": 1.6689417362213135,
"learning_rate": 4.6753035678362314e-06,
"loss": 0.1832,
"num_input_tokens_seen": 450800,
"step": 2960
},
{
"epoch": 16.47222222222222,
"grad_norm": 0.3151831328868866,
"learning_rate": 4.604968298895703e-06,
"loss": 0.0241,
"num_input_tokens_seen": 451536,
"step": 2965
},
{
"epoch": 16.5,
"grad_norm": 0.10785870254039764,
"learning_rate": 4.535112402622185e-06,
"loss": 0.0329,
"num_input_tokens_seen": 452320,
"step": 2970
},
{
"epoch": 16.52777777777778,
"grad_norm": 13.819798469543457,
"learning_rate": 4.465737520935517e-06,
"loss": 0.0773,
"num_input_tokens_seen": 453072,
"step": 2975
},
{
"epoch": 16.555555555555557,
"grad_norm": 11.992451667785645,
"learning_rate": 4.396845284449608e-06,
"loss": 0.095,
"num_input_tokens_seen": 453856,
"step": 2980
},
{
"epoch": 16.583333333333332,
"grad_norm": 1.3498435020446777,
"learning_rate": 4.328437312434067e-06,
"loss": 0.0995,
"num_input_tokens_seen": 454624,
"step": 2985
},
{
"epoch": 16.61111111111111,
"grad_norm": 1.9748262166976929,
"learning_rate": 4.2605152127761675e-06,
"loss": 0.0575,
"num_input_tokens_seen": 455360,
"step": 2990
},
{
"epoch": 16.63888888888889,
"grad_norm": 0.5040590763092041,
"learning_rate": 4.19308058194306e-06,
"loss": 0.0243,
"num_input_tokens_seen": 456096,
"step": 2995
},
{
"epoch": 16.666666666666668,
"grad_norm": 15.647078514099121,
"learning_rate": 4.126135004944231e-06,
"loss": 0.0482,
"num_input_tokens_seen": 456848,
"step": 3000
},
{
"epoch": 16.694444444444443,
"grad_norm": 13.681388854980469,
"learning_rate": 4.059680055294266e-06,
"loss": 0.1086,
"num_input_tokens_seen": 457616,
"step": 3005
},
{
"epoch": 16.72222222222222,
"grad_norm": 1.3664271831512451,
"learning_rate": 3.993717294975863e-06,
"loss": 0.0971,
"num_input_tokens_seen": 458384,
"step": 3010
},
{
"epoch": 16.75,
"grad_norm": 0.569031834602356,
"learning_rate": 3.92824827440309e-06,
"loss": 0.0227,
"num_input_tokens_seen": 459152,
"step": 3015
},
{
"epoch": 16.77777777777778,
"grad_norm": 0.708084762096405,
"learning_rate": 3.863274532384981e-06,
"loss": 0.0432,
"num_input_tokens_seen": 459920,
"step": 3020
},
{
"epoch": 16.805555555555557,
"grad_norm": 0.2141667604446411,
"learning_rate": 3.798797596089351e-06,
"loss": 0.127,
"num_input_tokens_seen": 460672,
"step": 3025
},
{
"epoch": 16.833333333333332,
"grad_norm": 0.05808110535144806,
"learning_rate": 3.73481898100691e-06,
"loss": 0.0333,
"num_input_tokens_seen": 461472,
"step": 3030
},
{
"epoch": 16.86111111111111,
"grad_norm": 34.014705657958984,
"learning_rate": 3.6713401909156204e-06,
"loss": 0.1195,
"num_input_tokens_seen": 462224,
"step": 3035
},
{
"epoch": 16.88888888888889,
"grad_norm": 6.969618320465088,
"learning_rate": 3.608362717845376e-06,
"loss": 0.1465,
"num_input_tokens_seen": 462976,
"step": 3040
},
{
"epoch": 16.916666666666668,
"grad_norm": 27.208003997802734,
"learning_rate": 3.5458880420429135e-06,
"loss": 0.3196,
"num_input_tokens_seen": 463712,
"step": 3045
},
{
"epoch": 16.944444444444443,
"grad_norm": 23.051929473876953,
"learning_rate": 3.4839176319370394e-06,
"loss": 0.0664,
"num_input_tokens_seen": 464480,
"step": 3050
},
{
"epoch": 16.97222222222222,
"grad_norm": 23.996009826660156,
"learning_rate": 3.4224529441040904e-06,
"loss": 0.1817,
"num_input_tokens_seen": 465248,
"step": 3055
},
{
"epoch": 17.0,
"grad_norm": 7.2588791847229,
"learning_rate": 3.3614954232337374e-06,
"loss": 0.021,
"num_input_tokens_seen": 465984,
"step": 3060
},
{
"epoch": 17.0,
"eval_loss": 0.48010167479515076,
"eval_runtime": 0.8596,
"eval_samples_per_second": 46.535,
"eval_steps_per_second": 23.267,
"num_input_tokens_seen": 465984,
"step": 3060
},
{
"epoch": 17.02777777777778,
"grad_norm": 0.5085855722427368,
"learning_rate": 3.3010465020949818e-06,
"loss": 0.0262,
"num_input_tokens_seen": 466768,
"step": 3065
},
{
"epoch": 17.055555555555557,
"grad_norm": 0.024025170132517815,
"learning_rate": 3.2411076015025075e-06,
"loss": 0.0019,
"num_input_tokens_seen": 467568,
"step": 3070
},
{
"epoch": 17.083333333333332,
"grad_norm": 0.26111242175102234,
"learning_rate": 3.1816801302832848e-06,
"loss": 0.0135,
"num_input_tokens_seen": 468352,
"step": 3075
},
{
"epoch": 17.11111111111111,
"grad_norm": 2.5764732360839844,
"learning_rate": 3.1227654852434454e-06,
"loss": 0.0125,
"num_input_tokens_seen": 469120,
"step": 3080
},
{
"epoch": 17.13888888888889,
"grad_norm": 29.602319717407227,
"learning_rate": 3.0643650511354484e-06,
"loss": 0.1227,
"num_input_tokens_seen": 469888,
"step": 3085
},
{
"epoch": 17.166666666666668,
"grad_norm": 0.05500046908855438,
"learning_rate": 3.006480200625572e-06,
"loss": 0.0542,
"num_input_tokens_seen": 470688,
"step": 3090
},
{
"epoch": 17.194444444444443,
"grad_norm": 1.0007386207580566,
"learning_rate": 2.949112294261591e-06,
"loss": 0.0108,
"num_input_tokens_seen": 471472,
"step": 3095
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.1278330683708191,
"learning_rate": 2.89226268044083e-06,
"loss": 0.0025,
"num_input_tokens_seen": 472208,
"step": 3100
},
{
"epoch": 17.25,
"grad_norm": 1.1112349033355713,
"learning_rate": 2.8359326953784737e-06,
"loss": 0.015,
"num_input_tokens_seen": 472960,
"step": 3105
},
{
"epoch": 17.27777777777778,
"grad_norm": 0.07920917123556137,
"learning_rate": 2.780123663076142e-06,
"loss": 0.0053,
"num_input_tokens_seen": 473744,
"step": 3110
},
{
"epoch": 17.305555555555557,
"grad_norm": 2.448739767074585,
"learning_rate": 2.7248368952908053e-06,
"loss": 0.0203,
"num_input_tokens_seen": 474544,
"step": 3115
},
{
"epoch": 17.333333333333332,
"grad_norm": 0.1579727977514267,
"learning_rate": 2.670073691503902e-06,
"loss": 0.0181,
"num_input_tokens_seen": 475280,
"step": 3120
},
{
"epoch": 17.36111111111111,
"grad_norm": 0.3010871112346649,
"learning_rate": 2.6158353388908293e-06,
"loss": 0.0453,
"num_input_tokens_seen": 476048,
"step": 3125
},
{
"epoch": 17.38888888888889,
"grad_norm": 31.98306655883789,
"learning_rate": 2.5621231122906873e-06,
"loss": 0.0406,
"num_input_tokens_seen": 476800,
"step": 3130
},
{
"epoch": 17.416666666666668,
"grad_norm": 2.9089503288269043,
"learning_rate": 2.5089382741762925e-06,
"loss": 0.0167,
"num_input_tokens_seen": 477568,
"step": 3135
},
{
"epoch": 17.444444444444443,
"grad_norm": 4.048463821411133,
"learning_rate": 2.4562820746245386e-06,
"loss": 0.0783,
"num_input_tokens_seen": 478288,
"step": 3140
},
{
"epoch": 17.47222222222222,
"grad_norm": 6.972721576690674,
"learning_rate": 2.4041557512869878e-06,
"loss": 0.0294,
"num_input_tokens_seen": 479056,
"step": 3145
},
{
"epoch": 17.5,
"grad_norm": 14.101398468017578,
"learning_rate": 2.3525605293607784e-06,
"loss": 0.0499,
"num_input_tokens_seen": 479824,
"step": 3150
},
{
"epoch": 17.52777777777778,
"grad_norm": 0.31221017241477966,
"learning_rate": 2.3014976215598503e-06,
"loss": 0.1516,
"num_input_tokens_seen": 480560,
"step": 3155
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.8303629159927368,
"learning_rate": 2.2509682280864224e-06,
"loss": 0.0756,
"num_input_tokens_seen": 481360,
"step": 3160
},
{
"epoch": 17.583333333333332,
"grad_norm": 0.35569193959236145,
"learning_rate": 2.2009735366027795e-06,
"loss": 0.0031,
"num_input_tokens_seen": 482112,
"step": 3165
},
{
"epoch": 17.61111111111111,
"grad_norm": 0.22250580787658691,
"learning_rate": 2.151514722203385e-06,
"loss": 0.0528,
"num_input_tokens_seen": 482880,
"step": 3170
},
{
"epoch": 17.63888888888889,
"grad_norm": 0.6709349155426025,
"learning_rate": 2.1025929473872274e-06,
"loss": 0.0307,
"num_input_tokens_seen": 483632,
"step": 3175
},
{
"epoch": 17.666666666666668,
"grad_norm": 1.0654515027999878,
"learning_rate": 2.0542093620305042e-06,
"loss": 0.0148,
"num_input_tokens_seen": 484400,
"step": 3180
},
{
"epoch": 17.694444444444443,
"grad_norm": 10.747461318969727,
"learning_rate": 2.0063651033596143e-06,
"loss": 0.0472,
"num_input_tokens_seen": 485152,
"step": 3185
},
{
"epoch": 17.72222222222222,
"grad_norm": 0.02796984277665615,
"learning_rate": 1.9590612959244055e-06,
"loss": 0.0115,
"num_input_tokens_seen": 485904,
"step": 3190
},
{
"epoch": 17.75,
"grad_norm": 0.09192727506160736,
"learning_rate": 1.912299051571764e-06,
"loss": 0.1136,
"num_input_tokens_seen": 486672,
"step": 3195
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.13285285234451294,
"learning_rate": 1.8660794694194573e-06,
"loss": 0.2426,
"num_input_tokens_seen": 487440,
"step": 3200
},
{
"epoch": 17.805555555555557,
"grad_norm": 0.1345347911119461,
"learning_rate": 1.8204036358303173e-06,
"loss": 0.0049,
"num_input_tokens_seen": 488176,
"step": 3205
},
{
"epoch": 17.833333333333332,
"grad_norm": 5.462226390838623,
"learning_rate": 1.775272624386695e-06,
"loss": 0.0107,
"num_input_tokens_seen": 488944,
"step": 3210
},
{
"epoch": 17.86111111111111,
"grad_norm": 15.519821166992188,
"learning_rate": 1.7306874958652408e-06,
"loss": 0.038,
"num_input_tokens_seen": 489680,
"step": 3215
},
{
"epoch": 17.88888888888889,
"grad_norm": 3.9195749759674072,
"learning_rate": 1.686649298211951e-06,
"loss": 0.0462,
"num_input_tokens_seen": 490432,
"step": 3220
},
{
"epoch": 17.916666666666668,
"grad_norm": 24.604074478149414,
"learning_rate": 1.643159066517566e-06,
"loss": 0.0269,
"num_input_tokens_seen": 491184,
"step": 3225
},
{
"epoch": 17.944444444444443,
"grad_norm": 21.248931884765625,
"learning_rate": 1.6002178229932107e-06,
"loss": 0.0796,
"num_input_tokens_seen": 491952,
"step": 3230
},
{
"epoch": 17.97222222222222,
"grad_norm": 0.939923107624054,
"learning_rate": 1.5578265769463806e-06,
"loss": 0.0934,
"num_input_tokens_seen": 492720,
"step": 3235
},
{
"epoch": 18.0,
"grad_norm": 4.183718204498291,
"learning_rate": 1.5159863247572236e-06,
"loss": 0.0091,
"num_input_tokens_seen": 493488,
"step": 3240
},
{
"epoch": 18.0,
"eval_loss": 0.5633367300033569,
"eval_runtime": 0.8943,
"eval_samples_per_second": 44.726,
"eval_steps_per_second": 22.363,
"num_input_tokens_seen": 493488,
"step": 3240
},
{
"epoch": 18.02777777777778,
"grad_norm": 0.040560852736234665,
"learning_rate": 1.4746980498551112e-06,
"loss": 0.0059,
"num_input_tokens_seen": 494224,
"step": 3245
},
{
"epoch": 18.055555555555557,
"grad_norm": 7.204439640045166,
"learning_rate": 1.4339627226955392e-06,
"loss": 0.0083,
"num_input_tokens_seen": 494960,
"step": 3250
},
{
"epoch": 18.083333333333332,
"grad_norm": 0.05959023907780647,
"learning_rate": 1.3937813007373013e-06,
"loss": 0.0044,
"num_input_tokens_seen": 495712,
"step": 3255
},
{
"epoch": 18.11111111111111,
"grad_norm": 2.2561721801757812,
"learning_rate": 1.354154728419979e-06,
"loss": 0.0092,
"num_input_tokens_seen": 496480,
"step": 3260
},
{
"epoch": 18.13888888888889,
"grad_norm": 11.598490715026855,
"learning_rate": 1.31508393714177e-06,
"loss": 0.0193,
"num_input_tokens_seen": 497216,
"step": 3265
},
{
"epoch": 18.166666666666668,
"grad_norm": 2.7422096729278564,
"learning_rate": 1.276569845237574e-06,
"loss": 0.0046,
"num_input_tokens_seen": 497968,
"step": 3270
},
{
"epoch": 18.194444444444443,
"grad_norm": 0.027248982340097427,
"learning_rate": 1.2386133579574189e-06,
"loss": 0.0107,
"num_input_tokens_seen": 498704,
"step": 3275
},
{
"epoch": 18.22222222222222,
"grad_norm": 2.369640350341797,
"learning_rate": 1.2012153674451715e-06,
"loss": 0.1142,
"num_input_tokens_seen": 499440,
"step": 3280
},
{
"epoch": 18.25,
"grad_norm": 0.20769014954566956,
"learning_rate": 1.1643767527175857e-06,
"loss": 0.0073,
"num_input_tokens_seen": 500176,
"step": 3285
},
{
"epoch": 18.27777777777778,
"grad_norm": 0.37721478939056396,
"learning_rate": 1.1280983796436245e-06,
"loss": 0.1195,
"num_input_tokens_seen": 500928,
"step": 3290
},
{
"epoch": 18.305555555555557,
"grad_norm": 0.7713701128959656,
"learning_rate": 1.0923811009241142e-06,
"loss": 0.019,
"num_input_tokens_seen": 501696,
"step": 3295
},
{
"epoch": 18.333333333333332,
"grad_norm": 8.40517807006836,
"learning_rate": 1.0572257560717086e-06,
"loss": 0.0174,
"num_input_tokens_seen": 502480,
"step": 3300
},
{
"epoch": 18.36111111111111,
"grad_norm": 0.6811085939407349,
"learning_rate": 1.0226331713911546e-06,
"loss": 0.0214,
"num_input_tokens_seen": 503264,
"step": 3305
},
{
"epoch": 18.38888888888889,
"grad_norm": 37.946937561035156,
"learning_rate": 9.886041599598606e-07,
"loss": 0.1155,
"num_input_tokens_seen": 504000,
"step": 3310
},
{
"epoch": 18.416666666666668,
"grad_norm": 1.640047311782837,
"learning_rate": 9.551395216087944e-07,
"loss": 0.0446,
"num_input_tokens_seen": 504736,
"step": 3315
},
{
"epoch": 18.444444444444443,
"grad_norm": 0.6278148889541626,
"learning_rate": 9.222400429036854e-07,
"loss": 0.006,
"num_input_tokens_seen": 505504,
"step": 3320
},
{
"epoch": 18.47222222222222,
"grad_norm": 1.218808650970459,
"learning_rate": 8.899064971265276e-07,
"loss": 0.0058,
"num_input_tokens_seen": 506256,
"step": 3325
},
{
"epoch": 18.5,
"grad_norm": 0.3586307764053345,
"learning_rate": 8.581396442574135e-07,
"loss": 0.0033,
"num_input_tokens_seen": 507024,
"step": 3330
},
{
"epoch": 18.52777777777778,
"grad_norm": 0.11738912761211395,
"learning_rate": 8.269402309566743e-07,
"loss": 0.0016,
"num_input_tokens_seen": 507808,
"step": 3335
},
{
"epoch": 18.555555555555557,
"grad_norm": 0.5308789610862732,
"learning_rate": 7.963089905473092e-07,
"loss": 0.0013,
"num_input_tokens_seen": 508560,
"step": 3340
},
{
"epoch": 18.583333333333332,
"grad_norm": 34.2119140625,
"learning_rate": 7.662466429977699e-07,
"loss": 0.0493,
"num_input_tokens_seen": 509312,
"step": 3345
},
{
"epoch": 18.61111111111111,
"grad_norm": 0.14879710972309113,
"learning_rate": 7.367538949050345e-07,
"loss": 0.0152,
"num_input_tokens_seen": 510064,
"step": 3350
},
{
"epoch": 18.63888888888889,
"grad_norm": 12.18697452545166,
"learning_rate": 7.078314394779961e-07,
"loss": 0.0157,
"num_input_tokens_seen": 510832,
"step": 3355
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.2048877626657486,
"learning_rate": 6.794799565211646e-07,
"loss": 0.0163,
"num_input_tokens_seen": 511616,
"step": 3360
},
{
"epoch": 18.694444444444443,
"grad_norm": 0.5249136686325073,
"learning_rate": 6.517001124186989e-07,
"loss": 0.0062,
"num_input_tokens_seen": 512400,
"step": 3365
},
{
"epoch": 18.72222222222222,
"grad_norm": 0.056645072996616364,
"learning_rate": 6.244925601187363e-07,
"loss": 0.0011,
"num_input_tokens_seen": 513152,
"step": 3370
},
{
"epoch": 18.75,
"grad_norm": 0.7716118693351746,
"learning_rate": 5.978579391180461e-07,
"loss": 0.0056,
"num_input_tokens_seen": 513888,
"step": 3375
},
{
"epoch": 18.77777777777778,
"grad_norm": 0.3327917754650116,
"learning_rate": 5.717968754469977e-07,
"loss": 0.0016,
"num_input_tokens_seen": 514688,
"step": 3380
},
{
"epoch": 18.805555555555557,
"grad_norm": 2.8160390853881836,
"learning_rate": 5.463099816548579e-07,
"loss": 0.0102,
"num_input_tokens_seen": 515440,
"step": 3385
},
{
"epoch": 18.833333333333332,
"grad_norm": 0.10132934898138046,
"learning_rate": 5.213978567953775e-07,
"loss": 0.0067,
"num_input_tokens_seen": 516224,
"step": 3390
},
{
"epoch": 18.86111111111111,
"grad_norm": 1.5919185876846313,
"learning_rate": 4.970610864127173e-07,
"loss": 0.0061,
"num_input_tokens_seen": 516992,
"step": 3395
},
{
"epoch": 18.88888888888889,
"grad_norm": 1.101412057876587,
"learning_rate": 4.7330024252768555e-07,
"loss": 0.0037,
"num_input_tokens_seen": 517792,
"step": 3400
},
{
"epoch": 18.916666666666668,
"grad_norm": 12.524344444274902,
"learning_rate": 4.5011588362429134e-07,
"loss": 0.1769,
"num_input_tokens_seen": 518528,
"step": 3405
},
{
"epoch": 18.944444444444443,
"grad_norm": 0.13883286714553833,
"learning_rate": 4.2750855463662143e-07,
"loss": 0.0008,
"num_input_tokens_seen": 519296,
"step": 3410
},
{
"epoch": 18.97222222222222,
"grad_norm": 0.5751574635505676,
"learning_rate": 4.05478786936031e-07,
"loss": 0.0077,
"num_input_tokens_seen": 520064,
"step": 3415
},
{
"epoch": 19.0,
"grad_norm": 0.2156560719013214,
"learning_rate": 3.8402709831865113e-07,
"loss": 0.0818,
"num_input_tokens_seen": 520816,
"step": 3420
},
{
"epoch": 19.0,
"eval_loss": 0.5927966833114624,
"eval_runtime": 0.8667,
"eval_samples_per_second": 46.152,
"eval_steps_per_second": 23.076,
"num_input_tokens_seen": 520816,
"step": 3420
},
{
"epoch": 19.02777777777778,
"grad_norm": 0.09093061834573746,
"learning_rate": 3.6315399299321484e-07,
"loss": 0.0027,
"num_input_tokens_seen": 521584,
"step": 3425
},
{
"epoch": 19.055555555555557,
"grad_norm": 0.22736388444900513,
"learning_rate": 3.428599615692141e-07,
"loss": 0.1626,
"num_input_tokens_seen": 522368,
"step": 3430
},
{
"epoch": 19.083333333333332,
"grad_norm": 0.39018547534942627,
"learning_rate": 3.2314548104537545e-07,
"loss": 0.0023,
"num_input_tokens_seen": 523168,
"step": 3435
},
{
"epoch": 19.11111111111111,
"grad_norm": 18.854568481445312,
"learning_rate": 3.040110147984221e-07,
"loss": 0.0252,
"num_input_tokens_seen": 523936,
"step": 3440
},
{
"epoch": 19.13888888888889,
"grad_norm": 1.008324384689331,
"learning_rate": 2.8545701257221e-07,
"loss": 0.0513,
"num_input_tokens_seen": 524704,
"step": 3445
},
{
"epoch": 19.166666666666668,
"grad_norm": 1.5237793922424316,
"learning_rate": 2.674839104671367e-07,
"loss": 0.0035,
"num_input_tokens_seen": 525440,
"step": 3450
},
{
"epoch": 19.194444444444443,
"grad_norm": 7.896567344665527,
"learning_rate": 2.5009213092991034e-07,
"loss": 0.009,
"num_input_tokens_seen": 526192,
"step": 3455
},
{
"epoch": 19.22222222222222,
"grad_norm": 1.473685383796692,
"learning_rate": 2.3328208274359942e-07,
"loss": 0.0075,
"num_input_tokens_seen": 526944,
"step": 3460
},
{
"epoch": 19.25,
"grad_norm": 0.9116981029510498,
"learning_rate": 2.170541610180432e-07,
"loss": 0.0014,
"num_input_tokens_seen": 527696,
"step": 3465
},
{
"epoch": 19.27777777777778,
"grad_norm": 0.41270574927330017,
"learning_rate": 2.014087471805509e-07,
"loss": 0.0029,
"num_input_tokens_seen": 528480,
"step": 3470
},
{
"epoch": 19.305555555555557,
"grad_norm": 0.1433163583278656,
"learning_rate": 1.8634620896695043e-07,
"loss": 0.0072,
"num_input_tokens_seen": 529248,
"step": 3475
},
{
"epoch": 19.333333333333332,
"grad_norm": 0.15605846047401428,
"learning_rate": 1.7186690041292586e-07,
"loss": 0.0018,
"num_input_tokens_seen": 530016,
"step": 3480
},
{
"epoch": 19.36111111111111,
"grad_norm": 0.1406908482313156,
"learning_rate": 1.5797116184571304e-07,
"loss": 0.0104,
"num_input_tokens_seen": 530768,
"step": 3485
},
{
"epoch": 19.38888888888889,
"grad_norm": 1.887284278869629,
"learning_rate": 1.4465931987609482e-07,
"loss": 0.004,
"num_input_tokens_seen": 531520,
"step": 3490
},
{
"epoch": 19.416666666666668,
"grad_norm": 0.1923336386680603,
"learning_rate": 1.319316873907267e-07,
"loss": 0.0393,
"num_input_tokens_seen": 532256,
"step": 3495
},
{
"epoch": 19.444444444444443,
"grad_norm": 5.755624294281006,
"learning_rate": 1.1978856354477595e-07,
"loss": 0.0097,
"num_input_tokens_seen": 533008,
"step": 3500
},
{
"epoch": 19.47222222222222,
"grad_norm": 1.6134008169174194,
"learning_rate": 1.0823023375489127e-07,
"loss": 0.0223,
"num_input_tokens_seen": 533760,
"step": 3505
},
{
"epoch": 19.5,
"grad_norm": 0.05306548252701759,
"learning_rate": 9.725696969249965e-08,
"loss": 0.0183,
"num_input_tokens_seen": 534512,
"step": 3510
},
{
"epoch": 19.52777777777778,
"grad_norm": 34.357643127441406,
"learning_rate": 8.686902927741991e-08,
"loss": 0.0611,
"num_input_tokens_seen": 535296,
"step": 3515
},
{
"epoch": 19.555555555555557,
"grad_norm": 0.7180253267288208,
"learning_rate": 7.706665667180091e-08,
"loss": 0.0374,
"num_input_tokens_seen": 536080,
"step": 3520
},
{
"epoch": 19.583333333333332,
"grad_norm": 0.3030802011489868,
"learning_rate": 6.785008227437329e-08,
"loss": 0.001,
"num_input_tokens_seen": 536832,
"step": 3525
},
{
"epoch": 19.61111111111111,
"grad_norm": 0.009636443108320236,
"learning_rate": 5.921952271504827e-08,
"loss": 0.0014,
"num_input_tokens_seen": 537584,
"step": 3530
},
{
"epoch": 19.63888888888889,
"grad_norm": 0.7338316440582275,
"learning_rate": 5.117518084981621e-08,
"loss": 0.0076,
"num_input_tokens_seen": 538320,
"step": 3535
},
{
"epoch": 19.666666666666668,
"grad_norm": 7.76223611831665,
"learning_rate": 4.371724575597535e-08,
"loss": 0.0131,
"num_input_tokens_seen": 539072,
"step": 3540
},
{
"epoch": 19.694444444444443,
"grad_norm": 6.488776206970215,
"learning_rate": 3.684589272771044e-08,
"loss": 0.0141,
"num_input_tokens_seen": 539824,
"step": 3545
},
{
"epoch": 19.72222222222222,
"grad_norm": 0.19823488593101501,
"learning_rate": 3.056128327193486e-08,
"loss": 0.0029,
"num_input_tokens_seen": 540592,
"step": 3550
},
{
"epoch": 19.75,
"grad_norm": 23.785985946655273,
"learning_rate": 2.486356510453258e-08,
"loss": 0.0548,
"num_input_tokens_seen": 541392,
"step": 3555
},
{
"epoch": 19.77777777777778,
"grad_norm": 0.09434127807617188,
"learning_rate": 1.975287214685817e-08,
"loss": 0.0028,
"num_input_tokens_seen": 542160,
"step": 3560
},
{
"epoch": 19.805555555555557,
"grad_norm": 0.27645689249038696,
"learning_rate": 1.522932452260595e-08,
"loss": 0.1098,
"num_input_tokens_seen": 542912,
"step": 3565
},
{
"epoch": 19.833333333333332,
"grad_norm": 0.12851421535015106,
"learning_rate": 1.1293028554978935e-08,
"loss": 0.0012,
"num_input_tokens_seen": 543648,
"step": 3570
},
{
"epoch": 19.86111111111111,
"grad_norm": 0.3019014000892639,
"learning_rate": 7.944076764190845e-09,
"loss": 0.0038,
"num_input_tokens_seen": 544432,
"step": 3575
},
{
"epoch": 19.88888888888889,
"grad_norm": 2.531794786453247,
"learning_rate": 5.182547865290044e-09,
"loss": 0.0207,
"num_input_tokens_seen": 545216,
"step": 3580
},
{
"epoch": 19.916666666666668,
"grad_norm": 0.03003126010298729,
"learning_rate": 3.008506766313812e-09,
"loss": 0.0033,
"num_input_tokens_seen": 545952,
"step": 3585
},
{
"epoch": 19.944444444444443,
"grad_norm": 0.3342243432998657,
"learning_rate": 1.4220045667645566e-09,
"loss": 0.0011,
"num_input_tokens_seen": 546720,
"step": 3590
},
{
"epoch": 19.97222222222222,
"grad_norm": 0.189296156167984,
"learning_rate": 4.2307855639411865e-10,
"loss": 0.0009,
"num_input_tokens_seen": 547488,
"step": 3595
},
{
"epoch": 20.0,
"grad_norm": 0.32087770104408264,
"learning_rate": 1.1752214348903501e-11,
"loss": 0.0025,
"num_input_tokens_seen": 548240,
"step": 3600
},
{
"epoch": 20.0,
"eval_loss": 0.6011862754821777,
"eval_runtime": 0.8651,
"eval_samples_per_second": 46.239,
"eval_steps_per_second": 23.119,
"num_input_tokens_seen": 548240,
"step": 3600
},
{
"epoch": 20.0,
"num_input_tokens_seen": 548240,
"step": 3600,
"total_flos": 2.468699941306368e+16,
"train_loss": 0.2379362821903649,
"train_runtime": 353.13,
"train_samples_per_second": 20.389,
"train_steps_per_second": 10.195
}
],
"logging_steps": 5,
"max_steps": 3600,
"num_input_tokens_seen": 548240,
"num_train_epochs": 20,
"save_steps": 180,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.468699941306368e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}