{
"best_global_step": 948,
"best_metric": 0.07719651609659195,
"best_model_checkpoint": "saves/p-tuning/llama-3-8b-instruct/train_svamp_1754652179/checkpoint-948",
"epoch": 10.0,
"eval_steps": 79,
"global_step": 1580,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03164556962025317,
"grad_norm": 57.304134368896484,
"learning_rate": 1.2658227848101265e-06,
"loss": 2.5423,
"num_input_tokens_seen": 2272,
"step": 5
},
{
"epoch": 0.06329113924050633,
"grad_norm": 31.999879837036133,
"learning_rate": 2.848101265822785e-06,
"loss": 2.316,
"num_input_tokens_seen": 4640,
"step": 10
},
{
"epoch": 0.0949367088607595,
"grad_norm": 29.859228134155273,
"learning_rate": 4.430379746835443e-06,
"loss": 2.0954,
"num_input_tokens_seen": 6848,
"step": 15
},
{
"epoch": 0.12658227848101267,
"grad_norm": 30.689695358276367,
"learning_rate": 6.012658227848101e-06,
"loss": 1.8232,
"num_input_tokens_seen": 9056,
"step": 20
},
{
"epoch": 0.15822784810126583,
"grad_norm": 16.21649169921875,
"learning_rate": 7.5949367088607605e-06,
"loss": 1.5523,
"num_input_tokens_seen": 11296,
"step": 25
},
{
"epoch": 0.189873417721519,
"grad_norm": 31.955402374267578,
"learning_rate": 9.177215189873418e-06,
"loss": 1.2851,
"num_input_tokens_seen": 13568,
"step": 30
},
{
"epoch": 0.22151898734177214,
"grad_norm": 43.333091735839844,
"learning_rate": 1.0759493670886076e-05,
"loss": 1.0306,
"num_input_tokens_seen": 15840,
"step": 35
},
{
"epoch": 0.25316455696202533,
"grad_norm": 32.48151397705078,
"learning_rate": 1.2341772151898735e-05,
"loss": 0.7859,
"num_input_tokens_seen": 18240,
"step": 40
},
{
"epoch": 0.2848101265822785,
"grad_norm": 30.37523651123047,
"learning_rate": 1.3924050632911393e-05,
"loss": 0.4745,
"num_input_tokens_seen": 20480,
"step": 45
},
{
"epoch": 0.31645569620253167,
"grad_norm": 57.03865051269531,
"learning_rate": 1.550632911392405e-05,
"loss": 0.3407,
"num_input_tokens_seen": 22624,
"step": 50
},
{
"epoch": 0.34810126582278483,
"grad_norm": 36.00380325317383,
"learning_rate": 1.7088607594936708e-05,
"loss": 0.2401,
"num_input_tokens_seen": 24928,
"step": 55
},
{
"epoch": 0.379746835443038,
"grad_norm": 21.761754989624023,
"learning_rate": 1.8670886075949368e-05,
"loss": 0.279,
"num_input_tokens_seen": 27264,
"step": 60
},
{
"epoch": 0.41139240506329117,
"grad_norm": 15.638097763061523,
"learning_rate": 2.0253164556962025e-05,
"loss": 0.232,
"num_input_tokens_seen": 29504,
"step": 65
},
{
"epoch": 0.4430379746835443,
"grad_norm": 16.34490966796875,
"learning_rate": 2.1835443037974685e-05,
"loss": 0.4193,
"num_input_tokens_seen": 31776,
"step": 70
},
{
"epoch": 0.47468354430379744,
"grad_norm": 38.84067153930664,
"learning_rate": 2.341772151898734e-05,
"loss": 0.2917,
"num_input_tokens_seen": 34016,
"step": 75
},
{
"epoch": 0.5,
"eval_loss": 0.26484471559524536,
"eval_runtime": 1.7772,
"eval_samples_per_second": 39.387,
"eval_steps_per_second": 10.128,
"num_input_tokens_seen": 35776,
"step": 79
},
{
"epoch": 0.5063291139240507,
"grad_norm": 29.9153995513916,
"learning_rate": 2.5e-05,
"loss": 0.2394,
"num_input_tokens_seen": 36224,
"step": 80
},
{
"epoch": 0.5379746835443038,
"grad_norm": 45.06348419189453,
"learning_rate": 2.6582278481012658e-05,
"loss": 0.1805,
"num_input_tokens_seen": 38400,
"step": 85
},
{
"epoch": 0.569620253164557,
"grad_norm": 223.73489379882812,
"learning_rate": 2.8164556962025318e-05,
"loss": 0.3283,
"num_input_tokens_seen": 40704,
"step": 90
},
{
"epoch": 0.6012658227848101,
"grad_norm": 12.492780685424805,
"learning_rate": 2.9746835443037974e-05,
"loss": 0.1008,
"num_input_tokens_seen": 42848,
"step": 95
},
{
"epoch": 0.6329113924050633,
"grad_norm": 156.92747497558594,
"learning_rate": 3.132911392405064e-05,
"loss": 0.4571,
"num_input_tokens_seen": 45056,
"step": 100
},
{
"epoch": 0.6645569620253164,
"grad_norm": 14.881486892700195,
"learning_rate": 3.291139240506329e-05,
"loss": 0.3661,
"num_input_tokens_seen": 47296,
"step": 105
},
{
"epoch": 0.6962025316455697,
"grad_norm": 9.662066459655762,
"learning_rate": 3.449367088607595e-05,
"loss": 0.1226,
"num_input_tokens_seen": 49536,
"step": 110
},
{
"epoch": 0.7278481012658228,
"grad_norm": 17.59064483642578,
"learning_rate": 3.607594936708861e-05,
"loss": 0.221,
"num_input_tokens_seen": 51744,
"step": 115
},
{
"epoch": 0.759493670886076,
"grad_norm": 14.502843856811523,
"learning_rate": 3.765822784810127e-05,
"loss": 0.1247,
"num_input_tokens_seen": 53888,
"step": 120
},
{
"epoch": 0.7911392405063291,
"grad_norm": 94.63813018798828,
"learning_rate": 3.924050632911392e-05,
"loss": 0.2585,
"num_input_tokens_seen": 56128,
"step": 125
},
{
"epoch": 0.8227848101265823,
"grad_norm": 12.14871597290039,
"learning_rate": 4.0822784810126584e-05,
"loss": 0.1239,
"num_input_tokens_seen": 58400,
"step": 130
},
{
"epoch": 0.8544303797468354,
"grad_norm": 19.46076202392578,
"learning_rate": 4.240506329113924e-05,
"loss": 0.2201,
"num_input_tokens_seen": 60640,
"step": 135
},
{
"epoch": 0.8860759493670886,
"grad_norm": 35.38789749145508,
"learning_rate": 4.3987341772151904e-05,
"loss": 0.2173,
"num_input_tokens_seen": 62880,
"step": 140
},
{
"epoch": 0.9177215189873418,
"grad_norm": 24.81549835205078,
"learning_rate": 4.556962025316456e-05,
"loss": 0.2943,
"num_input_tokens_seen": 65088,
"step": 145
},
{
"epoch": 0.9493670886075949,
"grad_norm": 25.282819747924805,
"learning_rate": 4.715189873417722e-05,
"loss": 0.2326,
"num_input_tokens_seen": 67296,
"step": 150
},
{
"epoch": 0.9810126582278481,
"grad_norm": 62.82623291015625,
"learning_rate": 4.8734177215189874e-05,
"loss": 0.1311,
"num_input_tokens_seen": 69632,
"step": 155
},
{
"epoch": 1.0,
"eval_loss": 0.6598067879676819,
"eval_runtime": 1.7725,
"eval_samples_per_second": 39.491,
"eval_steps_per_second": 10.155,
"num_input_tokens_seen": 70672,
"step": 158
},
{
"epoch": 1.0126582278481013,
"grad_norm": 59.08744430541992,
"learning_rate": 4.999993898868453e-05,
"loss": 0.3077,
"num_input_tokens_seen": 71504,
"step": 160
},
{
"epoch": 1.0443037974683544,
"grad_norm": 12.225666046142578,
"learning_rate": 4.999780362391087e-05,
"loss": 0.2211,
"num_input_tokens_seen": 73744,
"step": 165
},
{
"epoch": 1.0759493670886076,
"grad_norm": 19.726652145385742,
"learning_rate": 4.999261799114754e-05,
"loss": 0.2402,
"num_input_tokens_seen": 75888,
"step": 170
},
{
"epoch": 1.1075949367088607,
"grad_norm": 9.605252265930176,
"learning_rate": 4.9984382723152935e-05,
"loss": 0.4507,
"num_input_tokens_seen": 78288,
"step": 175
},
{
"epoch": 1.139240506329114,
"grad_norm": 11.414168357849121,
"learning_rate": 4.99730988248063e-05,
"loss": 0.3113,
"num_input_tokens_seen": 80496,
"step": 180
},
{
"epoch": 1.1708860759493671,
"grad_norm": 2.0183305740356445,
"learning_rate": 4.9958767672985176e-05,
"loss": 0.1182,
"num_input_tokens_seen": 82800,
"step": 185
},
{
"epoch": 1.2025316455696202,
"grad_norm": 3.184451103210449,
"learning_rate": 4.994139101639732e-05,
"loss": 0.0623,
"num_input_tokens_seen": 84976,
"step": 190
},
{
"epoch": 1.2341772151898733,
"grad_norm": 3.47723388671875,
"learning_rate": 4.99209709753674e-05,
"loss": 0.1357,
"num_input_tokens_seen": 87280,
"step": 195
},
{
"epoch": 1.2658227848101267,
"grad_norm": 3.638543128967285,
"learning_rate": 4.989751004157822e-05,
"loss": 0.12,
"num_input_tokens_seen": 89584,
"step": 200
},
{
"epoch": 1.2974683544303798,
"grad_norm": 2.2222177982330322,
"learning_rate": 4.98710110777667e-05,
"loss": 0.0584,
"num_input_tokens_seen": 91856,
"step": 205
},
{
"epoch": 1.3291139240506329,
"grad_norm": 26.8077392578125,
"learning_rate": 4.984147731737455e-05,
"loss": 0.2019,
"num_input_tokens_seen": 94064,
"step": 210
},
{
"epoch": 1.360759493670886,
"grad_norm": 7.961758136749268,
"learning_rate": 4.980891236415376e-05,
"loss": 0.2216,
"num_input_tokens_seen": 96304,
"step": 215
},
{
"epoch": 1.3924050632911391,
"grad_norm": 45.82145690917969,
"learning_rate": 4.9773320191726776e-05,
"loss": 0.1724,
"num_input_tokens_seen": 98576,
"step": 220
},
{
"epoch": 1.4240506329113924,
"grad_norm": 13.437238693237305,
"learning_rate": 4.973470514310175e-05,
"loss": 0.2237,
"num_input_tokens_seen": 100688,
"step": 225
},
{
"epoch": 1.4556962025316456,
"grad_norm": 4.555171489715576,
"learning_rate": 4.969307193014248e-05,
"loss": 0.2799,
"num_input_tokens_seen": 102864,
"step": 230
},
{
"epoch": 1.4873417721518987,
"grad_norm": 0.40021178126335144,
"learning_rate": 4.964842563299359e-05,
"loss": 0.0572,
"num_input_tokens_seen": 104944,
"step": 235
},
{
"epoch": 1.5,
"eval_loss": 0.14671920239925385,
"eval_runtime": 1.7757,
"eval_samples_per_second": 39.421,
"eval_steps_per_second": 10.137,
"num_input_tokens_seen": 105904,
"step": 237
},
{
"epoch": 1.518987341772152,
"grad_norm": 2.4955692291259766,
"learning_rate": 4.960077169946052e-05,
"loss": 0.1011,
"num_input_tokens_seen": 107248,
"step": 240
},
{
"epoch": 1.5506329113924051,
"grad_norm": 0.9815319180488586,
"learning_rate": 4.9550115944344855e-05,
"loss": 0.0578,
"num_input_tokens_seen": 109360,
"step": 245
},
{
"epoch": 1.5822784810126582,
"grad_norm": 46.03337860107422,
"learning_rate": 4.949646454873477e-05,
"loss": 0.1606,
"num_input_tokens_seen": 111568,
"step": 250
},
{
"epoch": 1.6139240506329116,
"grad_norm": 9.345287322998047,
"learning_rate": 4.9439824059250794e-05,
"loss": 0.2063,
"num_input_tokens_seen": 113936,
"step": 255
},
{
"epoch": 1.6455696202531644,
"grad_norm": 27.278141021728516,
"learning_rate": 4.938020138724702e-05,
"loss": 0.2002,
"num_input_tokens_seen": 116304,
"step": 260
},
{
"epoch": 1.6772151898734178,
"grad_norm": 3.0424633026123047,
"learning_rate": 4.9317603807967705e-05,
"loss": 0.1304,
"num_input_tokens_seen": 118608,
"step": 265
},
{
"epoch": 1.7088607594936709,
"grad_norm": 3.5982022285461426,
"learning_rate": 4.925203895965963e-05,
"loss": 0.128,
"num_input_tokens_seen": 120784,
"step": 270
},
{
"epoch": 1.740506329113924,
"grad_norm": 2.8051130771636963,
"learning_rate": 4.918351484263997e-05,
"loss": 0.12,
"num_input_tokens_seen": 122960,
"step": 275
},
{
"epoch": 1.7721518987341773,
"grad_norm": 5.42772102355957,
"learning_rate": 4.911203981832013e-05,
"loss": 0.0565,
"num_input_tokens_seen": 125232,
"step": 280
},
{
"epoch": 1.8037974683544302,
"grad_norm": 5.5666937828063965,
"learning_rate": 4.903762260818551e-05,
"loss": 0.2118,
"num_input_tokens_seen": 127504,
"step": 285
},
{
"epoch": 1.8354430379746836,
"grad_norm": 2.1527256965637207,
"learning_rate": 4.896027229273123e-05,
"loss": 0.1114,
"num_input_tokens_seen": 129776,
"step": 290
},
{
"epoch": 1.8670886075949367,
"grad_norm": 2.752041816711426,
"learning_rate": 4.887999831035414e-05,
"loss": 0.1067,
"num_input_tokens_seen": 131984,
"step": 295
},
{
"epoch": 1.8987341772151898,
"grad_norm": 5.814232349395752,
"learning_rate": 4.879681045620115e-05,
"loss": 0.2103,
"num_input_tokens_seen": 134288,
"step": 300
},
{
"epoch": 1.9303797468354431,
"grad_norm": 6.459428310394287,
"learning_rate": 4.8710718880974e-05,
"loss": 0.1299,
"num_input_tokens_seen": 136496,
"step": 305
},
{
"epoch": 1.9620253164556962,
"grad_norm": 11.473212242126465,
"learning_rate": 4.862173408969066e-05,
"loss": 0.0624,
"num_input_tokens_seen": 138896,
"step": 310
},
{
"epoch": 1.9936708860759493,
"grad_norm": 25.566421508789062,
"learning_rate": 4.852986694040347e-05,
"loss": 0.1331,
"num_input_tokens_seen": 141136,
"step": 315
},
{
"epoch": 2.0,
"eval_loss": 0.15266959369182587,
"eval_runtime": 1.7734,
"eval_samples_per_second": 39.471,
"eval_steps_per_second": 10.15,
"num_input_tokens_seen": 141328,
"step": 316
},
{
"epoch": 2.0253164556962027,
"grad_norm": 2.049661159515381,
"learning_rate": 4.843512864287425e-05,
"loss": 0.1069,
"num_input_tokens_seen": 143152,
"step": 320
},
{
"epoch": 2.0569620253164556,
"grad_norm": 1.3843505382537842,
"learning_rate": 4.833753075720647e-05,
"loss": 0.0599,
"num_input_tokens_seen": 145328,
"step": 325
},
{
"epoch": 2.088607594936709,
"grad_norm": 1.9298903942108154,
"learning_rate": 4.8237085192434676e-05,
"loss": 0.0544,
"num_input_tokens_seen": 147632,
"step": 330
},
{
"epoch": 2.1202531645569622,
"grad_norm": 1.791174054145813,
"learning_rate": 4.8133804205071285e-05,
"loss": 0.0435,
"num_input_tokens_seen": 149680,
"step": 335
},
{
"epoch": 2.151898734177215,
"grad_norm": 14.78237533569336,
"learning_rate": 4.802770039761108e-05,
"loss": 0.216,
"num_input_tokens_seen": 151952,
"step": 340
},
{
"epoch": 2.1835443037974684,
"grad_norm": 0.16718049347400665,
"learning_rate": 4.791878671699342e-05,
"loss": 0.1604,
"num_input_tokens_seen": 154224,
"step": 345
},
{
"epoch": 2.2151898734177213,
"grad_norm": 0.8592212796211243,
"learning_rate": 4.7807076453022425e-05,
"loss": 0.0732,
"num_input_tokens_seen": 156400,
"step": 350
},
{
"epoch": 2.2468354430379747,
"grad_norm": 1.9021673202514648,
"learning_rate": 4.7692583236745334e-05,
"loss": 0.2025,
"num_input_tokens_seen": 158576,
"step": 355
},
{
"epoch": 2.278481012658228,
"grad_norm": 3.9820752143859863,
"learning_rate": 4.757532103878925e-05,
"loss": 0.1819,
"num_input_tokens_seen": 160944,
"step": 360
},
{
"epoch": 2.310126582278481,
"grad_norm": 0.6237077116966248,
"learning_rate": 4.745530416765641e-05,
"loss": 0.1606,
"num_input_tokens_seen": 163248,
"step": 365
},
{
"epoch": 2.3417721518987342,
"grad_norm": 5.74051570892334,
"learning_rate": 4.733254726797821e-05,
"loss": 0.1306,
"num_input_tokens_seen": 165584,
"step": 370
},
{
"epoch": 2.3734177215189876,
"grad_norm": 0.508920431137085,
"learning_rate": 4.72070653187283e-05,
"loss": 0.0429,
"num_input_tokens_seen": 167824,
"step": 375
},
{
"epoch": 2.4050632911392404,
"grad_norm": 2.0854220390319824,
"learning_rate": 4.707887363139479e-05,
"loss": 0.1268,
"num_input_tokens_seen": 170000,
"step": 380
},
{
"epoch": 2.4367088607594938,
"grad_norm": 0.6447995901107788,
"learning_rate": 4.6947987848111955e-05,
"loss": 0.0904,
"num_input_tokens_seen": 172208,
"step": 385
},
{
"epoch": 2.4683544303797467,
"grad_norm": 2.10355806350708,
"learning_rate": 4.68144239397515e-05,
"loss": 0.0635,
"num_input_tokens_seen": 174480,
"step": 390
},
{
"epoch": 2.5,
"grad_norm": 2.1555063724517822,
"learning_rate": 4.667819820397382e-05,
"loss": 0.1217,
"num_input_tokens_seen": 176752,
"step": 395
},
{
"epoch": 2.5,
"eval_loss": 0.09895941615104675,
"eval_runtime": 1.778,
"eval_samples_per_second": 39.371,
"eval_steps_per_second": 10.124,
"num_input_tokens_seen": 176752,
"step": 395
},
{
"epoch": 2.5316455696202533,
"grad_norm": 3.2891459465026855,
"learning_rate": 4.653932726323935e-05,
"loss": 0.1123,
"num_input_tokens_seen": 178864,
"step": 400
},
{
"epoch": 2.5632911392405062,
"grad_norm": 0.8346641063690186,
"learning_rate": 4.639782806278021e-05,
"loss": 0.0482,
"num_input_tokens_seen": 181104,
"step": 405
},
{
"epoch": 2.5949367088607596,
"grad_norm": 16.31033706665039,
"learning_rate": 4.625371786853258e-05,
"loss": 0.2108,
"num_input_tokens_seen": 183472,
"step": 410
},
{
"epoch": 2.6265822784810124,
"grad_norm": 1.3053120374679565,
"learning_rate": 4.610701426502984e-05,
"loss": 0.1201,
"num_input_tokens_seen": 185776,
"step": 415
},
{
"epoch": 2.6582278481012658,
"grad_norm": 1.6082241535186768,
"learning_rate": 4.5957735153256915e-05,
"loss": 0.1283,
"num_input_tokens_seen": 188080,
"step": 420
},
{
"epoch": 2.689873417721519,
"grad_norm": 1.7989009618759155,
"learning_rate": 4.5805898748465955e-05,
"loss": 0.0637,
"num_input_tokens_seen": 190320,
"step": 425
},
{
"epoch": 2.721518987341772,
"grad_norm": 2.346414804458618,
"learning_rate": 4.565152357795368e-05,
"loss": 0.1731,
"num_input_tokens_seen": 192496,
"step": 430
},
{
"epoch": 2.7531645569620253,
"grad_norm": 0.5701020359992981,
"learning_rate": 4.549462847880066e-05,
"loss": 0.0451,
"num_input_tokens_seen": 194704,
"step": 435
},
{
"epoch": 2.7848101265822782,
"grad_norm": 2.4175257682800293,
"learning_rate": 4.53352325955728e-05,
"loss": 0.0796,
"num_input_tokens_seen": 196976,
"step": 440
},
{
"epoch": 2.8164556962025316,
"grad_norm": 1.1178758144378662,
"learning_rate": 4.517335537798526e-05,
"loss": 0.1103,
"num_input_tokens_seen": 199312,
"step": 445
},
{
"epoch": 2.848101265822785,
"grad_norm": 1.3870896100997925,
"learning_rate": 4.5009016578529194e-05,
"loss": 0.072,
"num_input_tokens_seen": 201584,
"step": 450
},
{
"epoch": 2.879746835443038,
"grad_norm": 1.876051664352417,
"learning_rate": 4.4842236250061535e-05,
"loss": 0.13,
"num_input_tokens_seen": 203760,
"step": 455
},
{
"epoch": 2.911392405063291,
"grad_norm": 2.2049477100372314,
"learning_rate": 4.467303474335809e-05,
"loss": 0.083,
"num_input_tokens_seen": 205968,
"step": 460
},
{
"epoch": 2.9430379746835444,
"grad_norm": 1.2528291940689087,
"learning_rate": 4.4501432704630305e-05,
"loss": 0.1533,
"num_input_tokens_seen": 208240,
"step": 465
},
{
"epoch": 2.9746835443037973,
"grad_norm": 1.5981637239456177,
"learning_rate": 4.432745107300603e-05,
"loss": 0.0748,
"num_input_tokens_seen": 210416,
"step": 470
},
{
"epoch": 3.0,
"eval_loss": 0.10051123797893524,
"eval_runtime": 1.7672,
"eval_samples_per_second": 39.611,
"eval_steps_per_second": 10.186,
"num_input_tokens_seen": 211808,
"step": 474
},
{
"epoch": 3.0063291139240507,
"grad_norm": 2.3567254543304443,
"learning_rate": 4.415111107797445e-05,
"loss": 0.0729,
"num_input_tokens_seen": 212288,
"step": 475
},
{
"epoch": 3.037974683544304,
"grad_norm": 2.7346839904785156,
"learning_rate": 4.3972434236795656e-05,
"loss": 0.1298,
"num_input_tokens_seen": 214400,
"step": 480
},
{
"epoch": 3.069620253164557,
"grad_norm": 0.6904764771461487,
"learning_rate": 4.379144235187505e-05,
"loss": 0.0523,
"num_input_tokens_seen": 216736,
"step": 485
},
{
"epoch": 3.1012658227848102,
"grad_norm": 1.1353236436843872,
"learning_rate": 4.360815750810302e-05,
"loss": 0.0871,
"num_input_tokens_seen": 218976,
"step": 490
},
{
"epoch": 3.132911392405063,
"grad_norm": 1.1411477327346802,
"learning_rate": 4.342260207016012e-05,
"loss": 0.0731,
"num_input_tokens_seen": 221216,
"step": 495
},
{
"epoch": 3.1645569620253164,
"grad_norm": 0.23975726962089539,
"learning_rate": 4.3234798679788016e-05,
"loss": 0.076,
"num_input_tokens_seen": 223520,
"step": 500
},
{
"epoch": 3.1962025316455698,
"grad_norm": 3.357827663421631,
"learning_rate": 4.304477025302681e-05,
"loss": 0.0982,
"num_input_tokens_seen": 225824,
"step": 505
},
{
"epoch": 3.2278481012658227,
"grad_norm": 1.5692389011383057,
"learning_rate": 4.285253997741875e-05,
"loss": 0.0788,
"num_input_tokens_seen": 228000,
"step": 510
},
{
"epoch": 3.259493670886076,
"grad_norm": 0.5077994465827942,
"learning_rate": 4.2658131309178805e-05,
"loss": 0.0141,
"num_input_tokens_seen": 230144,
"step": 515
},
{
"epoch": 3.291139240506329,
"grad_norm": 1.5440895557403564,
"learning_rate": 4.246156797033261e-05,
"loss": 0.0495,
"num_input_tokens_seen": 232512,
"step": 520
},
{
"epoch": 3.3227848101265822,
"grad_norm": 0.6975259780883789,
"learning_rate": 4.2262873945821754e-05,
"loss": 0.0446,
"num_input_tokens_seen": 234752,
"step": 525
},
{
"epoch": 3.3544303797468356,
"grad_norm": 1.3985381126403809,
"learning_rate": 4.206207348057721e-05,
"loss": 0.0229,
"num_input_tokens_seen": 236960,
"step": 530
},
{
"epoch": 3.3860759493670884,
"grad_norm": 0.12185825407505035,
"learning_rate": 4.1859191076560844e-05,
"loss": 0.0489,
"num_input_tokens_seen": 239200,
"step": 535
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.8711693286895752,
"learning_rate": 4.165425148977571e-05,
"loss": 0.0636,
"num_input_tokens_seen": 241344,
"step": 540
},
{
"epoch": 3.449367088607595,
"grad_norm": 1.7671948671340942,
"learning_rate": 4.144727972724524e-05,
"loss": 0.1113,
"num_input_tokens_seen": 243648,
"step": 545
},
{
"epoch": 3.481012658227848,
"grad_norm": 3.0171236991882324,
"learning_rate": 4.123830104396189e-05,
"loss": 0.0843,
"num_input_tokens_seen": 245792,
"step": 550
},
{
"epoch": 3.5,
"eval_loss": 0.08688981086015701,
"eval_runtime": 1.7725,
"eval_samples_per_second": 39.492,
"eval_steps_per_second": 10.155,
"num_input_tokens_seen": 247104,
"step": 553
},
{
"epoch": 3.5126582278481013,
"grad_norm": 0.3095618486404419,
"learning_rate": 4.10273409398055e-05,
"loss": 0.1233,
"num_input_tokens_seen": 248000,
"step": 555
},
{
"epoch": 3.5443037974683547,
"grad_norm": 1.4407626390457153,
"learning_rate": 4.0814425156431713e-05,
"loss": 0.0679,
"num_input_tokens_seen": 250144,
"step": 560
},
{
"epoch": 3.5759493670886076,
"grad_norm": 0.7778800129890442,
"learning_rate": 4.0599579674130964e-05,
"loss": 0.0232,
"num_input_tokens_seen": 252288,
"step": 565
},
{
"epoch": 3.607594936708861,
"grad_norm": 1.3945868015289307,
"learning_rate": 4.0382830708658316e-05,
"loss": 0.0948,
"num_input_tokens_seen": 254496,
"step": 570
},
{
"epoch": 3.6392405063291138,
"grad_norm": 0.7456892132759094,
"learning_rate": 4.016420470803464e-05,
"loss": 0.0113,
"num_input_tokens_seen": 256672,
"step": 575
},
{
"epoch": 3.670886075949367,
"grad_norm": 1.2941418886184692,
"learning_rate": 3.9943728349319286e-05,
"loss": 0.0971,
"num_input_tokens_seen": 258944,
"step": 580
},
{
"epoch": 3.7025316455696204,
"grad_norm": 0.5528960824012756,
"learning_rate": 3.972142853535499e-05,
"loss": 0.0909,
"num_input_tokens_seen": 261216,
"step": 585
},
{
"epoch": 3.7341772151898733,
"grad_norm": 1.220745325088501,
"learning_rate": 3.949733239148512e-05,
"loss": 0.1046,
"num_input_tokens_seen": 263552,
"step": 590
},
{
"epoch": 3.7658227848101267,
"grad_norm": 1.580295443534851,
"learning_rate": 3.927146726224379e-05,
"loss": 0.0776,
"num_input_tokens_seen": 265792,
"step": 595
},
{
"epoch": 3.7974683544303796,
"grad_norm": 1.8577332496643066,
"learning_rate": 3.9043860708019273e-05,
"loss": 0.0815,
"num_input_tokens_seen": 267968,
"step": 600
},
{
"epoch": 3.829113924050633,
"grad_norm": 2.401762008666992,
"learning_rate": 3.881454050169102e-05,
"loss": 0.0598,
"num_input_tokens_seen": 270112,
"step": 605
},
{
"epoch": 3.8607594936708862,
"grad_norm": 3.1154701709747314,
"learning_rate": 3.8583534625240786e-05,
"loss": 0.1975,
"num_input_tokens_seen": 272352,
"step": 610
},
{
"epoch": 3.892405063291139,
"grad_norm": 1.4128812551498413,
"learning_rate": 3.835087126633821e-05,
"loss": 0.1122,
"num_input_tokens_seen": 274688,
"step": 615
},
{
"epoch": 3.9240506329113924,
"grad_norm": 1.2004035711288452,
"learning_rate": 3.8116578814901315e-05,
"loss": 0.1368,
"num_input_tokens_seen": 276864,
"step": 620
},
{
"epoch": 3.9556962025316453,
"grad_norm": 0.5005134344100952,
"learning_rate": 3.788068585963238e-05,
"loss": 0.0475,
"num_input_tokens_seen": 279104,
"step": 625
},
{
"epoch": 3.9873417721518987,
"grad_norm": 0.2701457440853119,
"learning_rate": 3.764322118452943e-05,
"loss": 0.1127,
"num_input_tokens_seen": 281344,
"step": 630
},
{
"epoch": 4.0,
"eval_loss": 0.08055847138166428,
"eval_runtime": 1.77,
"eval_samples_per_second": 39.549,
"eval_steps_per_second": 10.17,
"num_input_tokens_seen": 282048,
"step": 632
},
{
"epoch": 4.018987341772152,
"grad_norm": 0.7883828282356262,
"learning_rate": 3.740421376537402e-05,
"loss": 0.0492,
"num_input_tokens_seen": 283456,
"step": 635
},
{
"epoch": 4.050632911392405,
"grad_norm": 2.189941883087158,
"learning_rate": 3.7163692766195554e-05,
"loss": 0.0655,
"num_input_tokens_seen": 285600,
"step": 640
},
{
"epoch": 4.082278481012658,
"grad_norm": 0.2087947428226471,
"learning_rate": 3.6921687535712656e-05,
"loss": 0.0519,
"num_input_tokens_seen": 287808,
"step": 645
},
{
"epoch": 4.113924050632911,
"grad_norm": 1.8172799348831177,
"learning_rate": 3.667822760375198e-05,
"loss": 0.0551,
"num_input_tokens_seen": 290048,
"step": 650
},
{
"epoch": 4.1455696202531644,
"grad_norm": 0.24938087165355682,
"learning_rate": 3.643334267764495e-05,
"loss": 0.011,
"num_input_tokens_seen": 292320,
"step": 655
},
{
"epoch": 4.177215189873418,
"grad_norm": 1.2110445499420166,
"learning_rate": 3.618706263860284e-05,
"loss": 0.0557,
"num_input_tokens_seen": 294560,
"step": 660
},
{
"epoch": 4.208860759493671,
"grad_norm": 0.8580341339111328,
"learning_rate": 3.5939417538070595e-05,
"loss": 0.0788,
"num_input_tokens_seen": 296832,
"step": 665
},
{
"epoch": 4.2405063291139244,
"grad_norm": 0.9276352524757385,
"learning_rate": 3.569043759405995e-05,
"loss": 0.0306,
"num_input_tokens_seen": 299168,
"step": 670
},
{
"epoch": 4.272151898734177,
"grad_norm": 1.2783339023590088,
"learning_rate": 3.5440153187462145e-05,
"loss": 0.0951,
"num_input_tokens_seen": 301312,
"step": 675
},
{
"epoch": 4.30379746835443,
"grad_norm": 1.2173526287078857,
"learning_rate": 3.518859485834082e-05,
"loss": 0.0305,
"num_input_tokens_seen": 303520,
"step": 680
},
{
"epoch": 4.3354430379746836,
"grad_norm": 8.077330589294434,
"learning_rate": 3.49357933022055e-05,
"loss": 0.1058,
"num_input_tokens_seen": 305792,
"step": 685
},
{
"epoch": 4.367088607594937,
"grad_norm": 0.9942968487739563,
"learning_rate": 3.468177936626603e-05,
"loss": 0.0481,
"num_input_tokens_seen": 307968,
"step": 690
},
{
"epoch": 4.39873417721519,
"grad_norm": 0.5724031329154968,
"learning_rate": 3.442658404566861e-05,
"loss": 0.056,
"num_input_tokens_seen": 310240,
"step": 695
},
{
"epoch": 4.430379746835443,
"grad_norm": 0.6137570738792419,
"learning_rate": 3.417023847971368e-05,
"loss": 0.0666,
"num_input_tokens_seen": 312416,
"step": 700
},
{
"epoch": 4.462025316455696,
"grad_norm": 0.5140513181686401,
"learning_rate": 3.3912773948056274e-05,
"loss": 0.1092,
"num_input_tokens_seen": 314656,
"step": 705
},
{
"epoch": 4.493670886075949,
"grad_norm": 1.4817256927490234,
"learning_rate": 3.3654221866889254e-05,
"loss": 0.0631,
"num_input_tokens_seen": 316768,
"step": 710
},
{
"epoch": 4.5,
"eval_loss": 0.08643601834774017,
"eval_runtime": 1.784,
"eval_samples_per_second": 39.237,
"eval_steps_per_second": 10.09,
"num_input_tokens_seen": 317248,
"step": 711
},
{
"epoch": 4.525316455696203,
"grad_norm": 0.3269573748111725,
"learning_rate": 3.339461378510982e-05,
"loss": 0.0729,
"num_input_tokens_seen": 318912,
"step": 715
},
{
"epoch": 4.556962025316456,
"grad_norm": 1.8290832042694092,
"learning_rate": 3.313398138046987e-05,
"loss": 0.1263,
"num_input_tokens_seen": 321312,
"step": 720
},
{
"epoch": 4.588607594936709,
"grad_norm": 1.4354349374771118,
"learning_rate": 3.287235645571071e-05,
"loss": 0.1048,
"num_input_tokens_seen": 323488,
"step": 725
},
{
"epoch": 4.620253164556962,
"grad_norm": 1.9681675434112549,
"learning_rate": 3.26097709346823e-05,
"loss": 0.0817,
"num_input_tokens_seen": 325824,
"step": 730
},
{
"epoch": 4.651898734177215,
"grad_norm": 1.6938735246658325,
"learning_rate": 3.234625685844803e-05,
"loss": 0.0464,
"num_input_tokens_seen": 328064,
"step": 735
},
{
"epoch": 4.6835443037974684,
"grad_norm": 2.0596401691436768,
"learning_rate": 3.208184638137484e-05,
"loss": 0.1184,
"num_input_tokens_seen": 330208,
"step": 740
},
{
"epoch": 4.715189873417722,
"grad_norm": 0.32809868454933167,
"learning_rate": 3.181657176720987e-05,
"loss": 0.0494,
"num_input_tokens_seen": 332512,
"step": 745
},
{
"epoch": 4.746835443037975,
"grad_norm": 1.6838154792785645,
"learning_rate": 3.1550465385143455e-05,
"loss": 0.1421,
"num_input_tokens_seen": 334752,
"step": 750
},
{
"epoch": 4.7784810126582276,
"grad_norm": 0.052991144359111786,
"learning_rate": 3.128355970585949e-05,
"loss": 0.0342,
"num_input_tokens_seen": 337024,
"step": 755
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.853191077709198,
"learning_rate": 3.101588729757323e-05,
"loss": 0.0806,
"num_input_tokens_seen": 339264,
"step": 760
},
{
"epoch": 4.841772151898734,
"grad_norm": 0.7384766340255737,
"learning_rate": 3.074748082205734e-05,
"loss": 0.0258,
"num_input_tokens_seen": 341536,
"step": 765
},
{
"epoch": 4.8734177215189876,
"grad_norm": 1.546053171157837,
"learning_rate": 3.0478373030656405e-05,
"loss": 0.0363,
"num_input_tokens_seen": 343872,
"step": 770
},
{
"epoch": 4.905063291139241,
"grad_norm": 2.8107569217681885,
"learning_rate": 3.0208596760290587e-05,
"loss": 0.0848,
"num_input_tokens_seen": 346176,
"step": 775
},
{
"epoch": 4.936708860759493,
"grad_norm": 0.4796382486820221,
"learning_rate": 2.993818492944882e-05,
"loss": 0.0604,
"num_input_tokens_seen": 348384,
"step": 780
},
{
"epoch": 4.968354430379747,
"grad_norm": 0.3333726227283478,
"learning_rate": 2.9667170534172035e-05,
"loss": 0.0462,
"num_input_tokens_seen": 350560,
"step": 785
},
{
"epoch": 5.0,
"grad_norm": 0.2007966786623001,
"learning_rate": 2.9395586644026945e-05,
"loss": 0.0826,
"num_input_tokens_seen": 352592,
"step": 790
},
{
"epoch": 5.0,
"eval_loss": 0.08093634247779846,
"eval_runtime": 1.7738,
"eval_samples_per_second": 39.464,
"eval_steps_per_second": 10.148,
"num_input_tokens_seen": 352592,
"step": 790
},
{
"epoch": 5.031645569620253,
"grad_norm": 0.4830271005630493,
"learning_rate": 2.9123466398070852e-05,
"loss": 0.0523,
"num_input_tokens_seen": 354736,
"step": 795
},
{
"epoch": 5.063291139240507,
"grad_norm": 0.9163768291473389,
"learning_rate": 2.8850843000807964e-05,
"loss": 0.0531,
"num_input_tokens_seen": 357072,
"step": 800
},
{
"epoch": 5.094936708860759,
"grad_norm": 1.5045676231384277,
"learning_rate": 2.8577749718137724e-05,
"loss": 0.0262,
"num_input_tokens_seen": 359376,
"step": 805
},
{
"epoch": 5.1265822784810124,
"grad_norm": 0.9244067668914795,
"learning_rate": 2.830421987329569e-05,
"loss": 0.0479,
"num_input_tokens_seen": 361744,
"step": 810
},
{
"epoch": 5.158227848101266,
"grad_norm": 0.04344862326979637,
"learning_rate": 2.803028684278735e-05,
"loss": 0.1163,
"num_input_tokens_seen": 363952,
"step": 815
},
{
"epoch": 5.189873417721519,
"grad_norm": 5.920812129974365,
"learning_rate": 2.775598405231549e-05,
"loss": 0.0831,
"num_input_tokens_seen": 366224,
"step": 820
},
{
"epoch": 5.2215189873417724,
"grad_norm": 3.040343761444092,
"learning_rate": 2.7481344972701545e-05,
"loss": 0.077,
"num_input_tokens_seen": 368496,
"step": 825
},
{
"epoch": 5.253164556962025,
"grad_norm": 0.3769160509109497,
"learning_rate": 2.7206403115801427e-05,
"loss": 0.039,
"num_input_tokens_seen": 370800,
"step": 830
},
{
"epoch": 5.284810126582278,
"grad_norm": 0.4965035021305084,
"learning_rate": 2.6931192030416362e-05,
"loss": 0.0347,
"num_input_tokens_seen": 373040,
"step": 835
},
{
"epoch": 5.3164556962025316,
"grad_norm": 1.6629559993743896,
"learning_rate": 2.6655745298199253e-05,
"loss": 0.067,
"num_input_tokens_seen": 375280,
"step": 840
},
{
"epoch": 5.348101265822785,
"grad_norm": 0.16424624621868134,
"learning_rate": 2.6380096529556936e-05,
"loss": 0.0659,
"num_input_tokens_seen": 377520,
"step": 845
},
{
"epoch": 5.379746835443038,
"grad_norm": 1.512812852859497,
"learning_rate": 2.6104279359549043e-05,
"loss": 0.0597,
"num_input_tokens_seen": 379696,
"step": 850
},
{
"epoch": 5.4113924050632916,
"grad_norm": 27.909957885742188,
"learning_rate": 2.582832744378377e-05,
"loss": 0.1124,
"num_input_tokens_seen": 381904,
"step": 855
},
{
"epoch": 5.443037974683544,
"grad_norm": 0.5168792605400085,
"learning_rate": 2.555227445431119e-05,
"loss": 0.0281,
"num_input_tokens_seen": 384112,
"step": 860
},
{
"epoch": 5.474683544303797,
"grad_norm": 0.2397191971540451,
"learning_rate": 2.5276154075514547e-05,
"loss": 0.0446,
"num_input_tokens_seen": 386416,
"step": 865
},
{
"epoch": 5.5,
"eval_loss": 0.09230098128318787,
"eval_runtime": 1.7721,
"eval_samples_per_second": 39.501,
"eval_steps_per_second": 10.157,
"num_input_tokens_seen": 388176,
"step": 869
},
{
"epoch": 5.506329113924051,
"grad_norm": 0.22594130039215088,
"learning_rate": 2.5e-05,
"loss": 0.0487,
"num_input_tokens_seen": 388624,
"step": 870
},
{
"epoch": 5.537974683544304,
"grad_norm": 1.0183571577072144,
"learning_rate": 2.472384592448546e-05,
"loss": 0.0566,
"num_input_tokens_seen": 390864,
"step": 875
},
{
"epoch": 5.569620253164557,
"grad_norm": 0.8194377422332764,
"learning_rate": 2.4447725545688814e-05,
"loss": 0.0534,
"num_input_tokens_seen": 393104,
"step": 880
},
{
"epoch": 5.60126582278481,
"grad_norm": 0.9605931639671326,
"learning_rate": 2.4171672556216234e-05,
"loss": 0.0834,
"num_input_tokens_seen": 395440,
"step": 885
},
{
"epoch": 5.632911392405063,
"grad_norm": 0.7708854675292969,
"learning_rate": 2.3895720640450963e-05,
"loss": 0.0516,
"num_input_tokens_seen": 397776,
"step": 890
},
{
"epoch": 5.6645569620253164,
"grad_norm": 0.2176842838525772,
"learning_rate": 2.3619903470443067e-05,
"loss": 0.0432,
"num_input_tokens_seen": 399984,
"step": 895
},
{
"epoch": 5.69620253164557,
"grad_norm": 2.65143084526062,
"learning_rate": 2.3344254701800756e-05,
"loss": 0.0658,
"num_input_tokens_seen": 402256,
"step": 900
},
{
"epoch": 5.727848101265823,
"grad_norm": 0.07710819691419601,
"learning_rate": 2.306880796958364e-05,
"loss": 0.0064,
"num_input_tokens_seen": 404400,
"step": 905
},
{
"epoch": 5.759493670886076,
"grad_norm": 0.05985208973288536,
"learning_rate": 2.279359688419858e-05,
"loss": 0.0102,
"num_input_tokens_seen": 406512,
"step": 910
},
{
"epoch": 5.791139240506329,
"grad_norm": 2.6067276000976562,
"learning_rate": 2.2518655027298464e-05,
"loss": 0.0914,
"num_input_tokens_seen": 408752,
"step": 915
},
{
"epoch": 5.822784810126582,
"grad_norm": 0.6947869062423706,
"learning_rate": 2.2244015947684514e-05,
"loss": 0.0731,
"num_input_tokens_seen": 411088,
"step": 920
},
{
"epoch": 5.8544303797468356,
"grad_norm": 1.428839087486267,
"learning_rate": 2.1969713157212655e-05,
"loss": 0.061,
"num_input_tokens_seen": 413200,
"step": 925
},
{
"epoch": 5.886075949367089,
"grad_norm": 2.066138982772827,
"learning_rate": 2.169578012670431e-05,
"loss": 0.0674,
"num_input_tokens_seen": 415440,
"step": 930
},
{
"epoch": 5.917721518987342,
"grad_norm": 0.2917608618736267,
"learning_rate": 2.142225028186228e-05,
"loss": 0.0796,
"num_input_tokens_seen": 417744,
"step": 935
},
{
"epoch": 5.949367088607595,
"grad_norm": 1.0031896829605103,
"learning_rate": 2.1149156999192042e-05,
"loss": 0.0833,
"num_input_tokens_seen": 419952,
"step": 940
},
{
"epoch": 5.981012658227848,
"grad_norm": 1.1027328968048096,
"learning_rate": 2.087653360192915e-05,
"loss": 0.0774,
"num_input_tokens_seen": 422160,
"step": 945
},
{
"epoch": 6.0,
"eval_loss": 0.07719651609659195,
"eval_runtime": 1.7754,
"eval_samples_per_second": 39.428,
"eval_steps_per_second": 10.139,
"num_input_tokens_seen": 423184,
"step": 948
},
{
"epoch": 6.012658227848101,
"grad_norm": 0.5562558174133301,
"learning_rate": 2.0604413355973054e-05,
"loss": 0.0359,
"num_input_tokens_seen": 424176,
"step": 950
},
{
"epoch": 6.044303797468355,
"grad_norm": 0.7119388580322266,
"learning_rate": 2.033282946582797e-05,
"loss": 0.0186,
"num_input_tokens_seen": 426352,
"step": 955
},
{
"epoch": 6.075949367088608,
"grad_norm": 1.0250580310821533,
"learning_rate": 2.0061815070551186e-05,
"loss": 0.045,
"num_input_tokens_seen": 428560,
"step": 960
},
{
"epoch": 6.1075949367088604,
"grad_norm": 1.0718591213226318,
"learning_rate": 1.9791403239709416e-05,
"loss": 0.016,
"num_input_tokens_seen": 430832,
"step": 965
},
{
"epoch": 6.139240506329114,
"grad_norm": 3.242114543914795,
"learning_rate": 1.9521626969343608e-05,
"loss": 0.038,
"num_input_tokens_seen": 432976,
"step": 970
},
{
"epoch": 6.170886075949367,
"grad_norm": 2.3887522220611572,
"learning_rate": 1.9252519177942667e-05,
"loss": 0.0514,
"num_input_tokens_seen": 435216,
"step": 975
},
{
"epoch": 6.2025316455696204,
"grad_norm": 2.0255088806152344,
"learning_rate": 1.8984112702426774e-05,
"loss": 0.0561,
"num_input_tokens_seen": 437584,
"step": 980
},
{
"epoch": 6.234177215189874,
"grad_norm": 1.4888111352920532,
"learning_rate": 1.8716440294140518e-05,
"loss": 0.0444,
"num_input_tokens_seen": 439760,
"step": 985
},
{
"epoch": 6.265822784810126,
"grad_norm": 0.281730592250824,
"learning_rate": 1.8449534614856558e-05,
"loss": 0.0424,
"num_input_tokens_seen": 442000,
"step": 990
},
{
"epoch": 6.2974683544303796,
"grad_norm": 0.588544487953186,
"learning_rate": 1.818342823279014e-05,
"loss": 0.0307,
"num_input_tokens_seen": 444208,
"step": 995
},
{
"epoch": 6.329113924050633,
"grad_norm": 0.21011468768119812,
"learning_rate": 1.7918153618625162e-05,
"loss": 0.0459,
"num_input_tokens_seen": 446480,
"step": 1000
},
{
"epoch": 6.360759493670886,
"grad_norm": 1.0189317464828491,
"learning_rate": 1.7653743141551983e-05,
"loss": 0.1302,
"num_input_tokens_seen": 448624,
"step": 1005
},
{
"epoch": 6.3924050632911396,
"grad_norm": 0.2481241524219513,
"learning_rate": 1.7390229065317696e-05,
"loss": 0.0243,
"num_input_tokens_seen": 450960,
"step": 1010
},
{
"epoch": 6.424050632911392,
"grad_norm": 0.10512924939393997,
"learning_rate": 1.7127643544289297e-05,
"loss": 0.053,
"num_input_tokens_seen": 453264,
"step": 1015
},
{
"epoch": 6.455696202531645,
"grad_norm": 0.8155413866043091,
"learning_rate": 1.6866018619530123e-05,
"loss": 0.0309,
"num_input_tokens_seen": 455472,
"step": 1020
},
{
"epoch": 6.487341772151899,
"grad_norm": 1.3210691213607788,
"learning_rate": 1.660538621489019e-05,
"loss": 0.0554,
"num_input_tokens_seen": 457744,
"step": 1025
},
{
"epoch": 6.5,
"eval_loss": 0.10978475958108902,
"eval_runtime": 1.7716,
"eval_samples_per_second": 39.512,
"eval_steps_per_second": 10.16,
"num_input_tokens_seen": 458640,
"step": 1027
},
{
"epoch": 6.518987341772152,
"grad_norm": 0.3908012807369232,
"learning_rate": 1.6345778133110752e-05,
"loss": 0.0129,
"num_input_tokens_seen": 459984,
"step": 1030
},
{
"epoch": 6.550632911392405,
"grad_norm": 2.6782333850860596,
"learning_rate": 1.6087226051943728e-05,
"loss": 0.0748,
"num_input_tokens_seen": 462224,
"step": 1035
},
{
"epoch": 6.582278481012658,
"grad_norm": 0.06257260590791702,
"learning_rate": 1.5829761520286334e-05,
"loss": 0.0517,
"num_input_tokens_seen": 464304,
"step": 1040
},
{
"epoch": 6.613924050632911,
"grad_norm": 0.9672399759292603,
"learning_rate": 1.5573415954331398e-05,
"loss": 0.0334,
"num_input_tokens_seen": 466576,
"step": 1045
},
{
"epoch": 6.6455696202531644,
"grad_norm": 1.3417441844940186,
"learning_rate": 1.5318220633733978e-05,
"loss": 0.0753,
"num_input_tokens_seen": 468720,
"step": 1050
},
{
"epoch": 6.677215189873418,
"grad_norm": 0.6716601252555847,
"learning_rate": 1.5064206697794509e-05,
"loss": 0.0394,
"num_input_tokens_seen": 470992,
"step": 1055
},
{
"epoch": 6.708860759493671,
"grad_norm": 0.5984333157539368,
"learning_rate": 1.481140514165919e-05,
"loss": 0.0369,
"num_input_tokens_seen": 473264,
"step": 1060
},
{
"epoch": 6.740506329113924,
"grad_norm": 1.1648204326629639,
"learning_rate": 1.455984681253787e-05,
"loss": 0.0522,
"num_input_tokens_seen": 475504,
"step": 1065
},
{
"epoch": 6.772151898734177,
"grad_norm": 0.014528759755194187,
"learning_rate": 1.4309562405940052e-05,
"loss": 0.0625,
"num_input_tokens_seen": 477776,
"step": 1070
},
{
"epoch": 6.80379746835443,
"grad_norm": 0.037693943828344345,
"learning_rate": 1.40605824619294e-05,
"loss": 0.0443,
"num_input_tokens_seen": 480016,
"step": 1075
},
{
"epoch": 6.8354430379746836,
"grad_norm": 0.6183596253395081,
"learning_rate": 1.3812937361397165e-05,
"loss": 0.0506,
"num_input_tokens_seen": 482256,
"step": 1080
},
{
"epoch": 6.867088607594937,
"grad_norm": 0.3752747178077698,
"learning_rate": 1.3566657322355058e-05,
"loss": 0.0775,
"num_input_tokens_seen": 484432,
"step": 1085
},
{
"epoch": 6.89873417721519,
"grad_norm": 0.7681966423988342,
"learning_rate": 1.3321772396248029e-05,
"loss": 0.0336,
"num_input_tokens_seen": 486608,
"step": 1090
},
{
"epoch": 6.930379746835443,
"grad_norm": 0.9100373387336731,
"learning_rate": 1.3078312464287353e-05,
"loss": 0.0285,
"num_input_tokens_seen": 488784,
"step": 1095
},
{
"epoch": 6.962025316455696,
"grad_norm": 3.5124988555908203,
"learning_rate": 1.2836307233804448e-05,
"loss": 0.0453,
"num_input_tokens_seen": 491024,
"step": 1100
},
{
"epoch": 6.993670886075949,
"grad_norm": 1.1614471673965454,
"learning_rate": 1.2595786234625989e-05,
"loss": 0.046,
"num_input_tokens_seen": 493232,
"step": 1105
},
{
"epoch": 7.0,
"eval_loss": 0.09714015573263168,
"eval_runtime": 1.7702,
"eval_samples_per_second": 39.543,
"eval_steps_per_second": 10.168,
"num_input_tokens_seen": 493440,
"step": 1106
},
{
"epoch": 7.025316455696203,
"grad_norm": 0.7261226773262024,
"learning_rate": 1.2356778815470573e-05,
"loss": 0.0066,
"num_input_tokens_seen": 495200,
"step": 1110
},
{
"epoch": 7.056962025316456,
"grad_norm": 1.433316946029663,
"learning_rate": 1.2119314140367628e-05,
"loss": 0.0224,
"num_input_tokens_seen": 497504,
"step": 1115
},
{
"epoch": 7.0886075949367084,
"grad_norm": 0.3160828948020935,
"learning_rate": 1.1883421185098684e-05,
"loss": 0.0242,
"num_input_tokens_seen": 499744,
"step": 1120
},
{
"epoch": 7.120253164556962,
"grad_norm": 5.301418304443359,
"learning_rate": 1.1649128733661802e-05,
"loss": 0.0706,
"num_input_tokens_seen": 501952,
"step": 1125
},
{
"epoch": 7.151898734177215,
"grad_norm": 1.4883524179458618,
"learning_rate": 1.1416465374759222e-05,
"loss": 0.0311,
"num_input_tokens_seen": 504128,
"step": 1130
},
{
"epoch": 7.1835443037974684,
"grad_norm": 0.2804189920425415,
"learning_rate": 1.118545949830898e-05,
"loss": 0.0176,
"num_input_tokens_seen": 506368,
"step": 1135
},
{
"epoch": 7.215189873417722,
"grad_norm": 1.2443137168884277,
"learning_rate": 1.0956139291980727e-05,
"loss": 0.0342,
"num_input_tokens_seen": 508576,
"step": 1140
},
{
"epoch": 7.246835443037975,
"grad_norm": 0.038201794028282166,
"learning_rate": 1.0728532737756217e-05,
"loss": 0.038,
"num_input_tokens_seen": 510848,
"step": 1145
},
{
"epoch": 7.2784810126582276,
"grad_norm": 0.007585880812257528,
"learning_rate": 1.050266760851489e-05,
"loss": 0.0022,
"num_input_tokens_seen": 513056,
"step": 1150
},
{
"epoch": 7.310126582278481,
"grad_norm": 2.4454050064086914,
"learning_rate": 1.0278571464645011e-05,
"loss": 0.0579,
"num_input_tokens_seen": 515360,
"step": 1155
},
{
"epoch": 7.341772151898734,
"grad_norm": 0.8718327283859253,
"learning_rate": 1.0056271650680713e-05,
"loss": 0.0696,
"num_input_tokens_seen": 517632,
"step": 1160
},
{
"epoch": 7.3734177215189876,
"grad_norm": 1.197663426399231,
"learning_rate": 9.835795291965372e-06,
"loss": 0.0513,
"num_input_tokens_seen": 519872,
"step": 1165
},
{
"epoch": 7.405063291139241,
"grad_norm": 2.151942253112793,
"learning_rate": 9.617169291341689e-06,
"loss": 0.0362,
"num_input_tokens_seen": 522048,
"step": 1170
},
{
"epoch": 7.436708860759493,
"grad_norm": 0.21822425723075867,
"learning_rate": 9.400420325869047e-06,
"loss": 0.0386,
"num_input_tokens_seen": 524256,
"step": 1175
},
{
"epoch": 7.468354430379747,
"grad_norm": 0.42767906188964844,
"learning_rate": 9.185574843568292e-06,
"loss": 0.0254,
"num_input_tokens_seen": 526528,
"step": 1180
},
{
"epoch": 7.5,
"grad_norm": 0.6816165447235107,
"learning_rate": 8.972659060194506e-06,
"loss": 0.1462,
"num_input_tokens_seen": 528768,
"step": 1185
},
{
"epoch": 7.5,
"eval_loss": 0.10821857303380966,
"eval_runtime": 1.7807,
"eval_samples_per_second": 39.311,
"eval_steps_per_second": 10.109,
"num_input_tokens_seen": 528768,
"step": 1185
},
{
"epoch": 7.531645569620253,
"grad_norm": 0.052816856652498245,
"learning_rate": 8.761698956038111e-06,
"loss": 0.0325,
"num_input_tokens_seen": 531040,
"step": 1190
},
{
"epoch": 7.563291139240507,
"grad_norm": 1.8085730075836182,
"learning_rate": 8.552720272754767e-06,
"loss": 0.0397,
"num_input_tokens_seen": 533312,
"step": 1195
},
{
"epoch": 7.594936708860759,
"grad_norm": 4.758277893066406,
"learning_rate": 8.345748510224299e-06,
"loss": 0.0334,
"num_input_tokens_seen": 535520,
"step": 1200
},
{
"epoch": 7.6265822784810124,
"grad_norm": 2.2314627170562744,
"learning_rate": 8.140808923439162e-06,
"loss": 0.0192,
"num_input_tokens_seen": 537792,
"step": 1205
},
{
"epoch": 7.658227848101266,
"grad_norm": 0.8286902904510498,
"learning_rate": 7.937926519422789e-06,
"loss": 0.0445,
"num_input_tokens_seen": 539968,
"step": 1210
},
{
"epoch": 7.689873417721519,
"grad_norm": 2.621687650680542,
"learning_rate": 7.737126054178237e-06,
"loss": 0.0544,
"num_input_tokens_seen": 542208,
"step": 1215
},
{
"epoch": 7.7215189873417724,
"grad_norm": 0.02005881257355213,
"learning_rate": 7.538432029667395e-06,
"loss": 0.0199,
"num_input_tokens_seen": 544416,
"step": 1220
},
{
"epoch": 7.753164556962025,
"grad_norm": 0.35898730158805847,
"learning_rate": 7.341868690821199e-06,
"loss": 0.0619,
"num_input_tokens_seen": 546816,
"step": 1225
},
{
"epoch": 7.784810126582278,
"grad_norm": 0.4341179430484772,
"learning_rate": 7.147460022581257e-06,
"loss": 0.0158,
"num_input_tokens_seen": 549120,
"step": 1230
},
{
"epoch": 7.8164556962025316,
"grad_norm": 1.9629868268966675,
"learning_rate": 6.955229746973185e-06,
"loss": 0.0194,
"num_input_tokens_seen": 551328,
"step": 1235
},
{
"epoch": 7.848101265822785,
"grad_norm": 0.34731948375701904,
"learning_rate": 6.765201320211987e-06,
"loss": 0.0091,
"num_input_tokens_seen": 553504,
"step": 1240
},
{
"epoch": 7.879746835443038,
"grad_norm": 5.475011348724365,
"learning_rate": 6.57739792983989e-06,
"loss": 0.1217,
"num_input_tokens_seen": 555680,
"step": 1245
},
{
"epoch": 7.911392405063291,
"grad_norm": 0.41866156458854675,
"learning_rate": 6.391842491896977e-06,
"loss": 0.0086,
"num_input_tokens_seen": 557824,
"step": 1250
},
{
"epoch": 7.943037974683544,
"grad_norm": 0.4846521317958832,
"learning_rate": 6.2085576481249575e-06,
"loss": 0.0197,
"num_input_tokens_seen": 560032,
"step": 1255
},
{
"epoch": 7.974683544303797,
"grad_norm": 0.3908102810382843,
"learning_rate": 6.0275657632043485e-06,
"loss": 0.0111,
"num_input_tokens_seen": 562304,
"step": 1260
},
{
"epoch": 8.0,
"eval_loss": 0.12035088986158371,
"eval_runtime": 1.7665,
"eval_samples_per_second": 39.626,
"eval_steps_per_second": 10.19,
"num_input_tokens_seen": 563872,
"step": 1264
},
{
"epoch": 8.00632911392405,
"grad_norm": 0.4609631896018982,
"learning_rate": 5.848888922025553e-06,
"loss": 0.0333,
"num_input_tokens_seen": 564320,
"step": 1265
},
{
"epoch": 8.037974683544304,
"grad_norm": 0.19440504908561707,
"learning_rate": 5.6725489269939704e-06,
"loss": 0.003,
"num_input_tokens_seen": 566432,
"step": 1270
},
{
"epoch": 8.069620253164556,
"grad_norm": 1.0085276365280151,
"learning_rate": 5.4985672953697e-06,
"loss": 0.0435,
"num_input_tokens_seen": 568704,
"step": 1275
},
{
"epoch": 8.10126582278481,
"grad_norm": 0.4334363639354706,
"learning_rate": 5.3269652566419165e-06,
"loss": 0.0496,
"num_input_tokens_seen": 570912,
"step": 1280
},
{
"epoch": 8.132911392405063,
"grad_norm": 1.7338502407073975,
"learning_rate": 5.157763749938468e-06,
"loss": 0.0107,
"num_input_tokens_seen": 573184,
"step": 1285
},
{
"epoch": 8.164556962025316,
"grad_norm": 2.3482325077056885,
"learning_rate": 4.990983421470813e-06,
"loss": 0.0322,
"num_input_tokens_seen": 575424,
"step": 1290
},
{
"epoch": 8.19620253164557,
"grad_norm": 0.08308505266904831,
"learning_rate": 4.826644622014748e-06,
"loss": 0.0251,
"num_input_tokens_seen": 577664,
"step": 1295
},
{
"epoch": 8.227848101265822,
"grad_norm": 0.07593591511249542,
"learning_rate": 4.664767404427201e-06,
"loss": 0.0081,
"num_input_tokens_seen": 579968,
"step": 1300
},
{
"epoch": 8.259493670886076,
"grad_norm": 4.225083351135254,
"learning_rate": 4.505371521199342e-06,
"loss": 0.0225,
"num_input_tokens_seen": 582208,
"step": 1305
},
{
"epoch": 8.291139240506329,
"grad_norm": 1.984985589981079,
"learning_rate": 4.3484764220463295e-06,
"loss": 0.0402,
"num_input_tokens_seen": 584512,
"step": 1310
},
{
"epoch": 8.322784810126583,
"grad_norm": 1.2428722381591797,
"learning_rate": 4.19410125153405e-06,
"loss": 0.0378,
"num_input_tokens_seen": 586848,
"step": 1315
},
{
"epoch": 8.354430379746836,
"grad_norm": 0.7120981216430664,
"learning_rate": 4.042264846743085e-06,
"loss": 0.051,
"num_input_tokens_seen": 589120,
"step": 1320
},
{
"epoch": 8.386075949367088,
"grad_norm": 0.08994811773300171,
"learning_rate": 3.892985734970162e-06,
"loss": 0.0074,
"num_input_tokens_seen": 591232,
"step": 1325
},
{
"epoch": 8.417721518987342,
"grad_norm": 0.10305431485176086,
"learning_rate": 3.746282131467424e-06,
"loss": 0.0883,
"num_input_tokens_seen": 593408,
"step": 1330
},
{
"epoch": 8.449367088607595,
"grad_norm": 0.4406466782093048,
"learning_rate": 3.602171937219789e-06,
"loss": 0.0104,
"num_input_tokens_seen": 595648,
"step": 1335
},
{
"epoch": 8.481012658227849,
"grad_norm": 0.7376933693885803,
"learning_rate": 3.4606727367606508e-06,
"loss": 0.0084,
"num_input_tokens_seen": 597920,
"step": 1340
},
{
"epoch": 8.5,
"eval_loss": 0.12124493718147278,
"eval_runtime": 1.7759,
"eval_samples_per_second": 39.417,
"eval_steps_per_second": 10.136,
"num_input_tokens_seen": 599232,
"step": 1343
},
{
"epoch": 8.512658227848101,
"grad_norm": 1.2201383113861084,
"learning_rate": 3.321801796026189e-06,
"loss": 0.014,
"num_input_tokens_seen": 600224,
"step": 1345
},
{
"epoch": 8.544303797468354,
"grad_norm": 1.9123611450195312,
"learning_rate": 3.185576060248513e-06,
"loss": 0.0314,
"num_input_tokens_seen": 602432,
"step": 1350
},
{
"epoch": 8.575949367088608,
"grad_norm": 4.122493743896484,
"learning_rate": 3.0520121518880534e-06,
"loss": 0.0547,
"num_input_tokens_seen": 604736,
"step": 1355
},
{
"epoch": 8.60759493670886,
"grad_norm": 0.40070468187332153,
"learning_rate": 2.9211263686052065e-06,
"loss": 0.043,
"num_input_tokens_seen": 606976,
"step": 1360
},
{
"epoch": 8.639240506329115,
"grad_norm": 6.602436065673828,
"learning_rate": 2.792934681271708e-06,
"loss": 0.0443,
"num_input_tokens_seen": 609120,
"step": 1365
},
{
"epoch": 8.670886075949367,
"grad_norm": 0.46621787548065186,
"learning_rate": 2.667452732021794e-06,
"loss": 0.0092,
"num_input_tokens_seen": 611296,
"step": 1370
},
{
"epoch": 8.70253164556962,
"grad_norm": 0.17130400240421295,
"learning_rate": 2.544695832343591e-06,
"loss": 0.0349,
"num_input_tokens_seen": 613568,
"step": 1375
},
{
"epoch": 8.734177215189874,
"grad_norm": 0.0715862512588501,
"learning_rate": 2.424678961210747e-06,
"loss": 0.0204,
"num_input_tokens_seen": 615712,
"step": 1380
},
{
"epoch": 8.765822784810126,
"grad_norm": 2.6348278522491455,
"learning_rate": 2.307416763254666e-06,
"loss": 0.0323,
"num_input_tokens_seen": 618048,
"step": 1385
},
{
"epoch": 8.79746835443038,
"grad_norm": 1.5881404876708984,
"learning_rate": 2.1929235469775767e-06,
"loss": 0.0159,
"num_input_tokens_seen": 620384,
"step": 1390
},
{
"epoch": 8.829113924050633,
"grad_norm": 0.0131320646032691,
"learning_rate": 2.081213283006575e-06,
"loss": 0.0413,
"num_input_tokens_seen": 622656,
"step": 1395
},
{
"epoch": 8.860759493670885,
"grad_norm": 2.6171875,
"learning_rate": 1.9722996023889162e-06,
"loss": 0.0179,
"num_input_tokens_seen": 624928,
"step": 1400
},
{
"epoch": 8.89240506329114,
"grad_norm": 0.9136054515838623,
"learning_rate": 1.8661957949287157e-06,
"loss": 0.0439,
"num_input_tokens_seen": 627200,
"step": 1405
},
{
"epoch": 8.924050632911392,
"grad_norm": 1.1005070209503174,
"learning_rate": 1.7629148075653245e-06,
"loss": 0.0207,
"num_input_tokens_seen": 629376,
"step": 1410
},
{
"epoch": 8.955696202531646,
"grad_norm": 1.8107990026474,
"learning_rate": 1.6624692427935295e-06,
"loss": 0.0289,
"num_input_tokens_seen": 631552,
"step": 1415
},
{
"epoch": 8.987341772151899,
"grad_norm": 1.783125400543213,
"learning_rate": 1.564871357125755e-06,
"loss": 0.0727,
"num_input_tokens_seen": 633856,
"step": 1420
},
{
"epoch": 9.0,
"eval_loss": 0.12704677879810333,
"eval_runtime": 1.7742,
"eval_samples_per_second": 39.454,
"eval_steps_per_second": 10.145,
"num_input_tokens_seen": 634544,
"step": 1422
},
{
"epoch": 9.018987341772151,
"grad_norm": 0.2849749028682709,
"learning_rate": 1.4701330595965402e-06,
"loss": 0.0146,
"num_input_tokens_seen": 635952,
"step": 1425
},
{
"epoch": 9.050632911392405,
"grad_norm": 0.537175178527832,
"learning_rate": 1.3782659103093426e-06,
"loss": 0.0831,
"num_input_tokens_seen": 638224,
"step": 1430
},
{
"epoch": 9.082278481012658,
"grad_norm": 2.8358240127563477,
"learning_rate": 1.2892811190259979e-06,
"loss": 0.011,
"num_input_tokens_seen": 640496,
"step": 1435
},
{
"epoch": 9.113924050632912,
"grad_norm": 2.4374454021453857,
"learning_rate": 1.2031895437988493e-06,
"loss": 0.0455,
"num_input_tokens_seen": 642800,
"step": 1440
},
{
"epoch": 9.145569620253164,
"grad_norm": 0.006563578732311726,
"learning_rate": 1.1200016896458636e-06,
"loss": 0.0241,
"num_input_tokens_seen": 645040,
"step": 1445
},
{
"epoch": 9.177215189873417,
"grad_norm": 0.4594639539718628,
"learning_rate": 1.0397277072687729e-06,
"loss": 0.0562,
"num_input_tokens_seen": 647184,
"step": 1450
},
{
"epoch": 9.208860759493671,
"grad_norm": 1.900719404220581,
"learning_rate": 9.623773918144897e-07,
"loss": 0.0395,
"num_input_tokens_seen": 649360,
"step": 1455
},
{
"epoch": 9.240506329113924,
"grad_norm": 0.4248761832714081,
"learning_rate": 8.879601816798699e-07,
"loss": 0.0207,
"num_input_tokens_seen": 651600,
"step": 1460
},
{
"epoch": 9.272151898734178,
"grad_norm": 1.5442092418670654,
"learning_rate": 8.164851573600391e-07,
"loss": 0.0167,
"num_input_tokens_seen": 653936,
"step": 1465
},
{
"epoch": 9.30379746835443,
"grad_norm": 0.35519126057624817,
"learning_rate": 7.479610403403781e-07,
"loss": 0.0198,
"num_input_tokens_seen": 656176,
"step": 1470
},
{
"epoch": 9.335443037974684,
"grad_norm": 0.621478796005249,
"learning_rate": 6.823961920322974e-07,
"loss": 0.0648,
"num_input_tokens_seen": 658256,
"step": 1475
},
{
"epoch": 9.367088607594937,
"grad_norm": 0.49386000633239746,
"learning_rate": 6.197986127529892e-07,
"loss": 0.0054,
"num_input_tokens_seen": 660496,
"step": 1480
},
{
"epoch": 9.39873417721519,
"grad_norm": 0.17025251686573029,
"learning_rate": 5.601759407492107e-07,
"loss": 0.0254,
"num_input_tokens_seen": 662800,
"step": 1485
},
{
"epoch": 9.430379746835444,
"grad_norm": 2.2807698249816895,
"learning_rate": 5.035354512652385e-07,
"loss": 0.0188,
"num_input_tokens_seen": 665008,
"step": 1490
},
{
"epoch": 9.462025316455696,
"grad_norm": 0.8504540920257568,
"learning_rate": 4.4988405565515033e-07,
"loss": 0.0476,
"num_input_tokens_seen": 667280,
"step": 1495
},
{
"epoch": 9.49367088607595,
"grad_norm": 1.3653701543807983,
"learning_rate": 3.992283005394837e-07,
"loss": 0.0517,
"num_input_tokens_seen": 669584,
"step": 1500
},
{
"epoch": 9.5,
"eval_loss": 0.1276809424161911,
"eval_runtime": 1.7754,
"eval_samples_per_second": 39.427,
"eval_steps_per_second": 10.138,
"num_input_tokens_seen": 670064,
"step": 1501
},
{
"epoch": 9.525316455696203,
"grad_norm": 0.9614121913909912,
"learning_rate": 3.515743670064159e-07,
"loss": 0.0264,
"num_input_tokens_seen": 671952,
"step": 1505
},
{
"epoch": 9.556962025316455,
"grad_norm": 0.009983611293137074,
"learning_rate": 3.0692806985752053e-07,
"loss": 0.0289,
"num_input_tokens_seen": 674192,
"step": 1510
},
{
"epoch": 9.58860759493671,
"grad_norm": 0.43508023023605347,
"learning_rate": 2.6529485689825997e-07,
"loss": 0.0332,
"num_input_tokens_seen": 676400,
"step": 1515
},
{
"epoch": 9.620253164556962,
"grad_norm": 0.21629108488559723,
"learning_rate": 2.266798082732252e-07,
"loss": 0.0642,
"num_input_tokens_seen": 678640,
"step": 1520
},
{
"epoch": 9.651898734177216,
"grad_norm": 0.09474813938140869,
"learning_rate": 1.9108763584624878e-07,
"loss": 0.0021,
"num_input_tokens_seen": 680848,
"step": 1525
},
{
"epoch": 9.683544303797468,
"grad_norm": 0.7714481949806213,
"learning_rate": 1.5852268262544768e-07,
"loss": 0.0158,
"num_input_tokens_seen": 683152,
"step": 1530
},
{
"epoch": 9.715189873417721,
"grad_norm": 0.5944305062294006,
"learning_rate": 1.2898892223330294e-07,
"loss": 0.0108,
"num_input_tokens_seen": 685328,
"step": 1535
},
{
"epoch": 9.746835443037975,
"grad_norm": 0.011387128382921219,
"learning_rate": 1.0248995842178366e-07,
"loss": 0.0088,
"num_input_tokens_seen": 687632,
"step": 1540
},
{
"epoch": 9.778481012658228,
"grad_norm": 0.8135620951652527,
"learning_rate": 7.90290246326042e-08,
"loss": 0.0077,
"num_input_tokens_seen": 689776,
"step": 1545
},
{
"epoch": 9.810126582278482,
"grad_norm": 0.3980692923069,
"learning_rate": 5.86089836026843e-08,
"loss": 0.0152,
"num_input_tokens_seen": 691984,
"step": 1550
},
{
"epoch": 9.841772151898734,
"grad_norm": 0.014442904852330685,
"learning_rate": 4.1232327014831266e-08,
"loss": 0.018,
"num_input_tokens_seen": 694128,
"step": 1555
},
{
"epoch": 9.873417721518987,
"grad_norm": 0.6166678071022034,
"learning_rate": 2.6901175193699835e-08,
"loss": 0.0147,
"num_input_tokens_seen": 696304,
"step": 1560
},
{
"epoch": 9.905063291139241,
"grad_norm": 0.7606804966926575,
"learning_rate": 1.5617276847068598e-08,
"loss": 0.0087,
"num_input_tokens_seen": 698608,
"step": 1565
},
{
"epoch": 9.936708860759493,
"grad_norm": 0.15441037714481354,
"learning_rate": 7.382008852463429e-09,
"loss": 0.032,
"num_input_tokens_seen": 700816,
"step": 1570
},
{
"epoch": 9.968354430379748,
"grad_norm": 0.7360374927520752,
"learning_rate": 2.1963760891391407e-09,
"loss": 0.0257,
"num_input_tokens_seen": 703184,
"step": 1575
},
{
"epoch": 10.0,
"grad_norm": 0.013925778679549694,
"learning_rate": 6.101131547198157e-11,
"loss": 0.0049,
"num_input_tokens_seen": 705184,
"step": 1580
},
{
"epoch": 10.0,
"eval_loss": 0.12569361925125122,
"eval_runtime": 1.7778,
"eval_samples_per_second": 39.374,
"eval_steps_per_second": 10.125,
"num_input_tokens_seen": 705184,
"step": 1580
},
{
"epoch": 10.0,
"num_input_tokens_seen": 705184,
"step": 1580,
"total_flos": 3.175411679939789e+16,
"train_loss": 0.12395850797714311,
"train_runtime": 397.268,
"train_samples_per_second": 15.858,
"train_steps_per_second": 3.977
}
],
"logging_steps": 5,
"max_steps": 1580,
"num_input_tokens_seen": 705184,
"num_train_epochs": 10,
"save_steps": 79,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.175411679939789e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}