n0w0f's picture
Upload folder using huggingface_hub
c82b56f verified
{
"best_global_step": 25600,
"best_metric": 0.5076445937156677,
"best_model_checkpoint": "/data/alamparan/mattext_ckpt_2/results/2026-02-13/00-23-20/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-14000",
"epoch": 50.0,
"eval_steps": 50,
"global_step": 25800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.09689922480620156,
"grad_norm": 4.480398654937744,
"learning_rate": 0.0001996201550387597,
"loss": 35.8864501953125,
"step": 50
},
{
"epoch": 0.09689922480620156,
"eval_loss": 24.066509246826172,
"eval_runtime": 197.2924,
"eval_samples_per_second": 96.324,
"eval_steps_per_second": 2.007,
"step": 50
},
{
"epoch": 0.1937984496124031,
"grad_norm": 3.3033525943756104,
"learning_rate": 0.0001992325581395349,
"loss": 22.6541064453125,
"step": 100
},
{
"epoch": 0.1937984496124031,
"eval_loss": 21.18689727783203,
"eval_runtime": 195.7213,
"eval_samples_per_second": 97.097,
"eval_steps_per_second": 2.023,
"step": 100
},
{
"epoch": 0.29069767441860467,
"grad_norm": 6.851633071899414,
"learning_rate": 0.00019884496124031008,
"loss": 20.8401513671875,
"step": 150
},
{
"epoch": 0.29069767441860467,
"eval_loss": 19.933605194091797,
"eval_runtime": 198.4052,
"eval_samples_per_second": 95.784,
"eval_steps_per_second": 1.996,
"step": 150
},
{
"epoch": 0.3875968992248062,
"grad_norm": 5.096057415008545,
"learning_rate": 0.00019845736434108527,
"loss": 19.824486083984375,
"step": 200
},
{
"epoch": 0.3875968992248062,
"eval_loss": 19.05492401123047,
"eval_runtime": 192.534,
"eval_samples_per_second": 98.705,
"eval_steps_per_second": 2.057,
"step": 200
},
{
"epoch": 0.4844961240310077,
"grad_norm": 6.72298526763916,
"learning_rate": 0.00019806976744186049,
"loss": 19.06490234375,
"step": 250
},
{
"epoch": 0.4844961240310077,
"eval_loss": 18.332548141479492,
"eval_runtime": 196.956,
"eval_samples_per_second": 96.489,
"eval_steps_per_second": 2.011,
"step": 250
},
{
"epoch": 0.5813953488372093,
"grad_norm": 7.238275527954102,
"learning_rate": 0.00019768217054263567,
"loss": 18.413529052734376,
"step": 300
},
{
"epoch": 0.5813953488372093,
"eval_loss": 17.510297775268555,
"eval_runtime": 192.236,
"eval_samples_per_second": 98.858,
"eval_steps_per_second": 2.06,
"step": 300
},
{
"epoch": 0.6782945736434108,
"grad_norm": 6.704541206359863,
"learning_rate": 0.00019729457364341086,
"loss": 17.664228515625,
"step": 350
},
{
"epoch": 0.6782945736434108,
"eval_loss": 16.745044708251953,
"eval_runtime": 192.5744,
"eval_samples_per_second": 98.684,
"eval_steps_per_second": 2.056,
"step": 350
},
{
"epoch": 0.7751937984496124,
"grad_norm": 8.426316261291504,
"learning_rate": 0.00019690697674418605,
"loss": 16.83594970703125,
"step": 400
},
{
"epoch": 0.7751937984496124,
"eval_loss": 15.575336456298828,
"eval_runtime": 195.6385,
"eval_samples_per_second": 97.138,
"eval_steps_per_second": 2.024,
"step": 400
},
{
"epoch": 0.872093023255814,
"grad_norm": 8.754973411560059,
"learning_rate": 0.00019651937984496124,
"loss": 15.635130615234376,
"step": 450
},
{
"epoch": 0.872093023255814,
"eval_loss": 13.746257781982422,
"eval_runtime": 196.8027,
"eval_samples_per_second": 96.564,
"eval_steps_per_second": 2.012,
"step": 450
},
{
"epoch": 0.9689922480620154,
"grad_norm": 10.155797004699707,
"learning_rate": 0.00019613178294573645,
"loss": 13.158505859375,
"step": 500
},
{
"epoch": 0.9689922480620154,
"eval_loss": 9.434493064880371,
"eval_runtime": 215.6111,
"eval_samples_per_second": 88.14,
"eval_steps_per_second": 1.837,
"step": 500
},
{
"epoch": 1.0658914728682172,
"grad_norm": 7.4197540283203125,
"learning_rate": 0.00019574418604651164,
"loss": 8.888753051757812,
"step": 550
},
{
"epoch": 1.0658914728682172,
"eval_loss": 5.876725673675537,
"eval_runtime": 195.3015,
"eval_samples_per_second": 97.306,
"eval_steps_per_second": 2.028,
"step": 550
},
{
"epoch": 1.1627906976744187,
"grad_norm": 5.196544647216797,
"learning_rate": 0.00019535658914728683,
"loss": 6.424893188476562,
"step": 600
},
{
"epoch": 1.1627906976744187,
"eval_loss": 4.754482269287109,
"eval_runtime": 197.8093,
"eval_samples_per_second": 96.072,
"eval_steps_per_second": 2.002,
"step": 600
},
{
"epoch": 1.2596899224806202,
"grad_norm": 4.070798397064209,
"learning_rate": 0.00019496899224806202,
"loss": 5.379293823242188,
"step": 650
},
{
"epoch": 1.2596899224806202,
"eval_loss": 4.241163730621338,
"eval_runtime": 195.9777,
"eval_samples_per_second": 96.97,
"eval_steps_per_second": 2.021,
"step": 650
},
{
"epoch": 1.3565891472868217,
"grad_norm": 4.9117655754089355,
"learning_rate": 0.0001945813953488372,
"loss": 4.92329345703125,
"step": 700
},
{
"epoch": 1.3565891472868217,
"eval_loss": 3.873270034790039,
"eval_runtime": 194.9014,
"eval_samples_per_second": 97.506,
"eval_steps_per_second": 2.032,
"step": 700
},
{
"epoch": 1.4534883720930232,
"grad_norm": 4.443520545959473,
"learning_rate": 0.00019419379844961242,
"loss": 4.538243713378907,
"step": 750
},
{
"epoch": 1.4534883720930232,
"eval_loss": 3.622462749481201,
"eval_runtime": 190.9707,
"eval_samples_per_second": 99.513,
"eval_steps_per_second": 2.074,
"step": 750
},
{
"epoch": 1.550387596899225,
"grad_norm": 4.078836441040039,
"learning_rate": 0.0001938062015503876,
"loss": 4.198520812988281,
"step": 800
},
{
"epoch": 1.550387596899225,
"eval_loss": 3.3803393840789795,
"eval_runtime": 195.0941,
"eval_samples_per_second": 97.409,
"eval_steps_per_second": 2.03,
"step": 800
},
{
"epoch": 1.6472868217054264,
"grad_norm": 4.003866195678711,
"learning_rate": 0.0001934186046511628,
"loss": 4.023372497558594,
"step": 850
},
{
"epoch": 1.6472868217054264,
"eval_loss": 3.1622314453125,
"eval_runtime": 197.7245,
"eval_samples_per_second": 96.114,
"eval_steps_per_second": 2.003,
"step": 850
},
{
"epoch": 1.744186046511628,
"grad_norm": 3.277304172515869,
"learning_rate": 0.00019303100775193798,
"loss": 3.669682922363281,
"step": 900
},
{
"epoch": 1.744186046511628,
"eval_loss": 2.9944067001342773,
"eval_runtime": 196.1326,
"eval_samples_per_second": 96.894,
"eval_steps_per_second": 2.019,
"step": 900
},
{
"epoch": 1.8410852713178296,
"grad_norm": 3.905837059020996,
"learning_rate": 0.00019264341085271317,
"loss": 3.536895751953125,
"step": 950
},
{
"epoch": 1.8410852713178296,
"eval_loss": 2.87215518951416,
"eval_runtime": 194.2696,
"eval_samples_per_second": 97.823,
"eval_steps_per_second": 2.038,
"step": 950
},
{
"epoch": 1.937984496124031,
"grad_norm": 2.900581121444702,
"learning_rate": 0.00019225581395348839,
"loss": 3.3392208862304686,
"step": 1000
},
{
"epoch": 1.937984496124031,
"eval_loss": 2.717120885848999,
"eval_runtime": 195.0025,
"eval_samples_per_second": 97.455,
"eval_steps_per_second": 2.031,
"step": 1000
},
{
"epoch": 2.0348837209302326,
"grad_norm": 2.5737144947052,
"learning_rate": 0.00019186821705426357,
"loss": 3.0850595092773436,
"step": 1050
},
{
"epoch": 2.0348837209302326,
"eval_loss": 2.564358711242676,
"eval_runtime": 193.706,
"eval_samples_per_second": 98.107,
"eval_steps_per_second": 2.044,
"step": 1050
},
{
"epoch": 2.1317829457364343,
"grad_norm": 3.0205888748168945,
"learning_rate": 0.00019148062015503876,
"loss": 3.0150396728515627,
"step": 1100
},
{
"epoch": 2.1317829457364343,
"eval_loss": 2.4571454524993896,
"eval_runtime": 194.1336,
"eval_samples_per_second": 97.891,
"eval_steps_per_second": 2.04,
"step": 1100
},
{
"epoch": 2.2286821705426356,
"grad_norm": 2.9379639625549316,
"learning_rate": 0.00019109302325581395,
"loss": 2.7774249267578126,
"step": 1150
},
{
"epoch": 2.2286821705426356,
"eval_loss": 2.341874122619629,
"eval_runtime": 196.8183,
"eval_samples_per_second": 96.556,
"eval_steps_per_second": 2.012,
"step": 1150
},
{
"epoch": 2.3255813953488373,
"grad_norm": 2.8902111053466797,
"learning_rate": 0.00019070542635658916,
"loss": 2.78979736328125,
"step": 1200
},
{
"epoch": 2.3255813953488373,
"eval_loss": 2.1844258308410645,
"eval_runtime": 196.3212,
"eval_samples_per_second": 96.801,
"eval_steps_per_second": 2.017,
"step": 1200
},
{
"epoch": 2.4224806201550386,
"grad_norm": 3.2294564247131348,
"learning_rate": 0.00019031782945736435,
"loss": 2.461886444091797,
"step": 1250
},
{
"epoch": 2.4224806201550386,
"eval_loss": 1.9487553834915161,
"eval_runtime": 194.5758,
"eval_samples_per_second": 97.669,
"eval_steps_per_second": 2.035,
"step": 1250
},
{
"epoch": 2.5193798449612403,
"grad_norm": 2.7845847606658936,
"learning_rate": 0.00018993023255813954,
"loss": 2.2011062622070314,
"step": 1300
},
{
"epoch": 2.5193798449612403,
"eval_loss": 1.752272129058838,
"eval_runtime": 194.8447,
"eval_samples_per_second": 97.534,
"eval_steps_per_second": 2.032,
"step": 1300
},
{
"epoch": 2.616279069767442,
"grad_norm": 2.7175710201263428,
"learning_rate": 0.00018954263565891476,
"loss": 1.9771908569335936,
"step": 1350
},
{
"epoch": 2.616279069767442,
"eval_loss": 1.6105732917785645,
"eval_runtime": 194.2477,
"eval_samples_per_second": 97.834,
"eval_steps_per_second": 2.039,
"step": 1350
},
{
"epoch": 2.7131782945736433,
"grad_norm": 2.9664433002471924,
"learning_rate": 0.00018915503875968994,
"loss": 1.8786553955078125,
"step": 1400
},
{
"epoch": 2.7131782945736433,
"eval_loss": 1.5160634517669678,
"eval_runtime": 193.1833,
"eval_samples_per_second": 98.373,
"eval_steps_per_second": 2.05,
"step": 1400
},
{
"epoch": 2.810077519379845,
"grad_norm": 2.208284616470337,
"learning_rate": 0.00018876744186046513,
"loss": 1.7303669738769532,
"step": 1450
},
{
"epoch": 2.810077519379845,
"eval_loss": 1.454694151878357,
"eval_runtime": 193.997,
"eval_samples_per_second": 97.96,
"eval_steps_per_second": 2.041,
"step": 1450
},
{
"epoch": 2.9069767441860463,
"grad_norm": 2.5443525314331055,
"learning_rate": 0.00018837984496124032,
"loss": 1.6828465270996094,
"step": 1500
},
{
"epoch": 2.9069767441860463,
"eval_loss": 1.3904365301132202,
"eval_runtime": 195.2492,
"eval_samples_per_second": 97.332,
"eval_steps_per_second": 2.028,
"step": 1500
},
{
"epoch": 3.003875968992248,
"grad_norm": 2.5396857261657715,
"learning_rate": 0.0001879922480620155,
"loss": 1.595220489501953,
"step": 1550
},
{
"epoch": 3.003875968992248,
"eval_loss": 1.366494059562683,
"eval_runtime": 195.4358,
"eval_samples_per_second": 97.239,
"eval_steps_per_second": 2.026,
"step": 1550
},
{
"epoch": 3.10077519379845,
"grad_norm": 2.160076379776001,
"learning_rate": 0.00018760465116279072,
"loss": 1.5685009765625,
"step": 1600
},
{
"epoch": 3.10077519379845,
"eval_loss": 1.3070204257965088,
"eval_runtime": 196.739,
"eval_samples_per_second": 96.595,
"eval_steps_per_second": 2.013,
"step": 1600
},
{
"epoch": 3.197674418604651,
"grad_norm": 2.343022346496582,
"learning_rate": 0.0001872170542635659,
"loss": 1.4889743041992187,
"step": 1650
},
{
"epoch": 3.197674418604651,
"eval_loss": 1.2801202535629272,
"eval_runtime": 198.8275,
"eval_samples_per_second": 95.58,
"eval_steps_per_second": 1.992,
"step": 1650
},
{
"epoch": 3.294573643410853,
"grad_norm": 2.2497730255126953,
"learning_rate": 0.0001868294573643411,
"loss": 1.4677432250976563,
"step": 1700
},
{
"epoch": 3.294573643410853,
"eval_loss": 1.2478386163711548,
"eval_runtime": 200.1485,
"eval_samples_per_second": 94.949,
"eval_steps_per_second": 1.979,
"step": 1700
},
{
"epoch": 3.391472868217054,
"grad_norm": 2.0219998359680176,
"learning_rate": 0.00018644186046511629,
"loss": 1.4308297729492188,
"step": 1750
},
{
"epoch": 3.391472868217054,
"eval_loss": 1.2120215892791748,
"eval_runtime": 199.2468,
"eval_samples_per_second": 95.379,
"eval_steps_per_second": 1.987,
"step": 1750
},
{
"epoch": 3.488372093023256,
"grad_norm": 2.2400925159454346,
"learning_rate": 0.00018605426356589147,
"loss": 1.3934197998046876,
"step": 1800
},
{
"epoch": 3.488372093023256,
"eval_loss": 1.1917102336883545,
"eval_runtime": 209.7445,
"eval_samples_per_second": 90.605,
"eval_steps_per_second": 1.888,
"step": 1800
},
{
"epoch": 3.5852713178294575,
"grad_norm": 2.072272539138794,
"learning_rate": 0.0001856666666666667,
"loss": 1.3590359497070312,
"step": 1850
},
{
"epoch": 3.5852713178294575,
"eval_loss": 1.1820892095565796,
"eval_runtime": 194.7483,
"eval_samples_per_second": 97.582,
"eval_steps_per_second": 2.033,
"step": 1850
},
{
"epoch": 3.682170542635659,
"grad_norm": 1.9475743770599365,
"learning_rate": 0.00018527906976744188,
"loss": 1.3555595397949218,
"step": 1900
},
{
"epoch": 3.682170542635659,
"eval_loss": 1.1485044956207275,
"eval_runtime": 198.2014,
"eval_samples_per_second": 95.882,
"eval_steps_per_second": 1.998,
"step": 1900
},
{
"epoch": 3.7790697674418605,
"grad_norm": 2.3375675678253174,
"learning_rate": 0.00018489147286821707,
"loss": 1.3280506896972657,
"step": 1950
},
{
"epoch": 3.7790697674418605,
"eval_loss": 1.1500495672225952,
"eval_runtime": 195.9158,
"eval_samples_per_second": 97.001,
"eval_steps_per_second": 2.021,
"step": 1950
},
{
"epoch": 3.875968992248062,
"grad_norm": 2.0993704795837402,
"learning_rate": 0.00018450387596899225,
"loss": 1.3038815307617186,
"step": 2000
},
{
"epoch": 3.875968992248062,
"eval_loss": 1.1234172582626343,
"eval_runtime": 202.1206,
"eval_samples_per_second": 94.023,
"eval_steps_per_second": 1.959,
"step": 2000
},
{
"epoch": 3.9728682170542635,
"grad_norm": 1.8678816556930542,
"learning_rate": 0.00018411627906976744,
"loss": 1.270460968017578,
"step": 2050
},
{
"epoch": 3.9728682170542635,
"eval_loss": 1.0995925664901733,
"eval_runtime": 201.1179,
"eval_samples_per_second": 94.492,
"eval_steps_per_second": 1.969,
"step": 2050
},
{
"epoch": 4.069767441860465,
"grad_norm": 1.9179855585098267,
"learning_rate": 0.00018372868217054266,
"loss": 1.2543260955810547,
"step": 2100
},
{
"epoch": 4.069767441860465,
"eval_loss": 1.0878671407699585,
"eval_runtime": 195.9153,
"eval_samples_per_second": 97.001,
"eval_steps_per_second": 2.021,
"step": 2100
},
{
"epoch": 4.166666666666667,
"grad_norm": 1.8298465013504028,
"learning_rate": 0.00018334108527131784,
"loss": 1.2222324371337892,
"step": 2150
},
{
"epoch": 4.166666666666667,
"eval_loss": 1.0717836618423462,
"eval_runtime": 197.4785,
"eval_samples_per_second": 96.233,
"eval_steps_per_second": 2.005,
"step": 2150
},
{
"epoch": 4.263565891472869,
"grad_norm": 1.885225534439087,
"learning_rate": 0.00018295348837209303,
"loss": 1.1911175537109375,
"step": 2200
},
{
"epoch": 4.263565891472869,
"eval_loss": 1.0390491485595703,
"eval_runtime": 200.4332,
"eval_samples_per_second": 94.815,
"eval_steps_per_second": 1.976,
"step": 2200
},
{
"epoch": 4.3604651162790695,
"grad_norm": 1.9083372354507446,
"learning_rate": 0.00018256589147286822,
"loss": 1.1913112640380858,
"step": 2250
},
{
"epoch": 4.3604651162790695,
"eval_loss": 1.0383073091506958,
"eval_runtime": 192.9555,
"eval_samples_per_second": 98.489,
"eval_steps_per_second": 2.052,
"step": 2250
},
{
"epoch": 4.457364341085271,
"grad_norm": 2.0930843353271484,
"learning_rate": 0.0001821782945736434,
"loss": 1.1640525817871095,
"step": 2300
},
{
"epoch": 4.457364341085271,
"eval_loss": 1.0316708087921143,
"eval_runtime": 198.9764,
"eval_samples_per_second": 95.509,
"eval_steps_per_second": 1.99,
"step": 2300
},
{
"epoch": 4.554263565891473,
"grad_norm": 1.8796041011810303,
"learning_rate": 0.0001817906976744186,
"loss": 1.1836517333984375,
"step": 2350
},
{
"epoch": 4.554263565891473,
"eval_loss": 1.021742343902588,
"eval_runtime": 199.5658,
"eval_samples_per_second": 95.227,
"eval_steps_per_second": 1.984,
"step": 2350
},
{
"epoch": 4.651162790697675,
"grad_norm": 1.826181411743164,
"learning_rate": 0.0001814031007751938,
"loss": 1.1762975311279298,
"step": 2400
},
{
"epoch": 4.651162790697675,
"eval_loss": 1.011062502861023,
"eval_runtime": 198.4529,
"eval_samples_per_second": 95.761,
"eval_steps_per_second": 1.995,
"step": 2400
},
{
"epoch": 4.748062015503876,
"grad_norm": 1.852156400680542,
"learning_rate": 0.000181015503875969,
"loss": 1.1690707397460938,
"step": 2450
},
{
"epoch": 4.748062015503876,
"eval_loss": 0.9975070953369141,
"eval_runtime": 197.7981,
"eval_samples_per_second": 96.078,
"eval_steps_per_second": 2.002,
"step": 2450
},
{
"epoch": 4.844961240310077,
"grad_norm": 2.0098986625671387,
"learning_rate": 0.0001806279069767442,
"loss": 1.1409013366699219,
"step": 2500
},
{
"epoch": 4.844961240310077,
"eval_loss": 0.9758601188659668,
"eval_runtime": 199.2887,
"eval_samples_per_second": 95.359,
"eval_steps_per_second": 1.987,
"step": 2500
},
{
"epoch": 4.941860465116279,
"grad_norm": 1.8541340827941895,
"learning_rate": 0.00018024031007751937,
"loss": 1.1015814971923827,
"step": 2550
},
{
"epoch": 4.941860465116279,
"eval_loss": 0.9661723971366882,
"eval_runtime": 200.2897,
"eval_samples_per_second": 94.883,
"eval_steps_per_second": 1.977,
"step": 2550
},
{
"epoch": 5.038759689922481,
"grad_norm": 1.6043637990951538,
"learning_rate": 0.00017985271317829456,
"loss": 1.1134808349609375,
"step": 2600
},
{
"epoch": 5.038759689922481,
"eval_loss": 0.967205286026001,
"eval_runtime": 198.2009,
"eval_samples_per_second": 95.883,
"eval_steps_per_second": 1.998,
"step": 2600
},
{
"epoch": 5.135658914728682,
"grad_norm": 1.7434065341949463,
"learning_rate": 0.00017946511627906978,
"loss": 1.0963155364990234,
"step": 2650
},
{
"epoch": 5.135658914728682,
"eval_loss": 0.9636672735214233,
"eval_runtime": 195.4024,
"eval_samples_per_second": 97.256,
"eval_steps_per_second": 2.027,
"step": 2650
},
{
"epoch": 5.232558139534884,
"grad_norm": 2.160961866378784,
"learning_rate": 0.00017907751937984497,
"loss": 1.0813286590576172,
"step": 2700
},
{
"epoch": 5.232558139534884,
"eval_loss": 0.9522321820259094,
"eval_runtime": 195.1264,
"eval_samples_per_second": 97.393,
"eval_steps_per_second": 2.029,
"step": 2700
},
{
"epoch": 5.329457364341085,
"grad_norm": 1.9293419122695923,
"learning_rate": 0.00017868992248062015,
"loss": 1.0441783142089844,
"step": 2750
},
{
"epoch": 5.329457364341085,
"eval_loss": 0.9408562183380127,
"eval_runtime": 193.0048,
"eval_samples_per_second": 98.464,
"eval_steps_per_second": 2.052,
"step": 2750
},
{
"epoch": 5.426356589147287,
"grad_norm": 1.6187139749526978,
"learning_rate": 0.00017830232558139534,
"loss": 1.0531800079345703,
"step": 2800
},
{
"epoch": 5.426356589147287,
"eval_loss": 0.9365593791007996,
"eval_runtime": 199.8497,
"eval_samples_per_second": 95.091,
"eval_steps_per_second": 1.981,
"step": 2800
},
{
"epoch": 5.523255813953488,
"grad_norm": 1.7480401992797852,
"learning_rate": 0.00017791472868217056,
"loss": 1.0467662048339843,
"step": 2850
},
{
"epoch": 5.523255813953488,
"eval_loss": 0.9324782490730286,
"eval_runtime": 194.6714,
"eval_samples_per_second": 97.621,
"eval_steps_per_second": 2.034,
"step": 2850
},
{
"epoch": 5.62015503875969,
"grad_norm": 1.6348450183868408,
"learning_rate": 0.00017752713178294574,
"loss": 1.0491456604003906,
"step": 2900
},
{
"epoch": 5.62015503875969,
"eval_loss": 0.9339238405227661,
"eval_runtime": 193.6506,
"eval_samples_per_second": 98.135,
"eval_steps_per_second": 2.045,
"step": 2900
},
{
"epoch": 5.717054263565892,
"grad_norm": 1.7508739233016968,
"learning_rate": 0.00017713953488372096,
"loss": 1.0431405639648437,
"step": 2950
},
{
"epoch": 5.717054263565892,
"eval_loss": 0.9144666194915771,
"eval_runtime": 200.9709,
"eval_samples_per_second": 94.561,
"eval_steps_per_second": 1.97,
"step": 2950
},
{
"epoch": 5.813953488372093,
"grad_norm": 1.8117504119873047,
"learning_rate": 0.00017675193798449615,
"loss": 1.029040298461914,
"step": 3000
},
{
"epoch": 5.813953488372093,
"eval_loss": 0.910137951374054,
"eval_runtime": 198.6885,
"eval_samples_per_second": 95.647,
"eval_steps_per_second": 1.993,
"step": 3000
},
{
"epoch": 5.910852713178294,
"grad_norm": 1.9361231327056885,
"learning_rate": 0.00017636434108527134,
"loss": 1.0382290649414063,
"step": 3050
},
{
"epoch": 5.910852713178294,
"eval_loss": 0.8883566856384277,
"eval_runtime": 199.026,
"eval_samples_per_second": 95.485,
"eval_steps_per_second": 1.99,
"step": 3050
},
{
"epoch": 6.007751937984496,
"grad_norm": 1.7819427251815796,
"learning_rate": 0.00017597674418604652,
"loss": 1.0060308837890626,
"step": 3100
},
{
"epoch": 6.007751937984496,
"eval_loss": 0.9043192863464355,
"eval_runtime": 193.7551,
"eval_samples_per_second": 98.083,
"eval_steps_per_second": 2.044,
"step": 3100
},
{
"epoch": 6.104651162790698,
"grad_norm": 1.7325843572616577,
"learning_rate": 0.0001755891472868217,
"loss": 1.010067138671875,
"step": 3150
},
{
"epoch": 6.104651162790698,
"eval_loss": 0.8911006450653076,
"eval_runtime": 193.7454,
"eval_samples_per_second": 98.087,
"eval_steps_per_second": 2.044,
"step": 3150
},
{
"epoch": 6.2015503875969,
"grad_norm": 1.707995057106018,
"learning_rate": 0.00017520155038759693,
"loss": 0.9815747833251953,
"step": 3200
},
{
"epoch": 6.2015503875969,
"eval_loss": 0.8801227807998657,
"eval_runtime": 197.3882,
"eval_samples_per_second": 96.277,
"eval_steps_per_second": 2.006,
"step": 3200
},
{
"epoch": 6.2984496124031,
"grad_norm": 1.4234212636947632,
"learning_rate": 0.00017481395348837211,
"loss": 0.9903421020507812,
"step": 3250
},
{
"epoch": 6.2984496124031,
"eval_loss": 0.8660780787467957,
"eval_runtime": 199.7721,
"eval_samples_per_second": 95.128,
"eval_steps_per_second": 1.982,
"step": 3250
},
{
"epoch": 6.395348837209302,
"grad_norm": 1.743849277496338,
"learning_rate": 0.0001744263565891473,
"loss": 0.9762661743164063,
"step": 3300
},
{
"epoch": 6.395348837209302,
"eval_loss": 0.8824377655982971,
"eval_runtime": 192.4288,
"eval_samples_per_second": 98.759,
"eval_steps_per_second": 2.058,
"step": 3300
},
{
"epoch": 6.492248062015504,
"grad_norm": 1.758987545967102,
"learning_rate": 0.0001740387596899225,
"loss": 0.9555432891845703,
"step": 3350
},
{
"epoch": 6.492248062015504,
"eval_loss": 0.868446409702301,
"eval_runtime": 201.784,
"eval_samples_per_second": 94.18,
"eval_steps_per_second": 1.962,
"step": 3350
},
{
"epoch": 6.589147286821706,
"grad_norm": 1.546152949333191,
"learning_rate": 0.00017365116279069768,
"loss": 0.9780608367919922,
"step": 3400
},
{
"epoch": 6.589147286821706,
"eval_loss": 0.8624841570854187,
"eval_runtime": 200.0021,
"eval_samples_per_second": 95.019,
"eval_steps_per_second": 1.98,
"step": 3400
},
{
"epoch": 6.686046511627907,
"grad_norm": 1.8314995765686035,
"learning_rate": 0.00017326356589147287,
"loss": 0.9537903594970704,
"step": 3450
},
{
"epoch": 6.686046511627907,
"eval_loss": 0.8573926091194153,
"eval_runtime": 202.7677,
"eval_samples_per_second": 93.723,
"eval_steps_per_second": 1.953,
"step": 3450
},
{
"epoch": 6.782945736434108,
"grad_norm": 1.645276427268982,
"learning_rate": 0.00017287596899224808,
"loss": 0.9461723327636719,
"step": 3500
},
{
"epoch": 6.782945736434108,
"eval_loss": 0.8483996391296387,
"eval_runtime": 201.379,
"eval_samples_per_second": 94.369,
"eval_steps_per_second": 1.966,
"step": 3500
},
{
"epoch": 6.87984496124031,
"grad_norm": 1.6280810832977295,
"learning_rate": 0.00017248837209302327,
"loss": 0.951067886352539,
"step": 3550
},
{
"epoch": 6.87984496124031,
"eval_loss": 0.8427923321723938,
"eval_runtime": 212.6273,
"eval_samples_per_second": 89.377,
"eval_steps_per_second": 1.862,
"step": 3550
},
{
"epoch": 6.976744186046512,
"grad_norm": 1.6208834648132324,
"learning_rate": 0.00017210077519379846,
"loss": 0.9458073425292969,
"step": 3600
},
{
"epoch": 6.976744186046512,
"eval_loss": 0.8510859608650208,
"eval_runtime": 206.0969,
"eval_samples_per_second": 92.209,
"eval_steps_per_second": 1.921,
"step": 3600
},
{
"epoch": 7.073643410852713,
"grad_norm": 1.702273964881897,
"learning_rate": 0.00017171317829457365,
"loss": 0.9501612854003906,
"step": 3650
},
{
"epoch": 7.073643410852713,
"eval_loss": 0.8405491709709167,
"eval_runtime": 182.1875,
"eval_samples_per_second": 104.31,
"eval_steps_per_second": 2.174,
"step": 3650
},
{
"epoch": 7.170542635658915,
"grad_norm": 1.7668451070785522,
"learning_rate": 0.00017132558139534883,
"loss": 0.9099185180664062,
"step": 3700
},
{
"epoch": 7.170542635658915,
"eval_loss": 0.8351926803588867,
"eval_runtime": 208.1044,
"eval_samples_per_second": 91.32,
"eval_steps_per_second": 1.903,
"step": 3700
},
{
"epoch": 7.267441860465116,
"grad_norm": 1.3699573278427124,
"learning_rate": 0.00017093798449612405,
"loss": 0.8952218627929688,
"step": 3750
},
{
"epoch": 7.267441860465116,
"eval_loss": 0.8353874683380127,
"eval_runtime": 200.4099,
"eval_samples_per_second": 94.826,
"eval_steps_per_second": 1.976,
"step": 3750
},
{
"epoch": 7.364341085271318,
"grad_norm": 1.6185437440872192,
"learning_rate": 0.00017055038759689924,
"loss": 0.9168299865722657,
"step": 3800
},
{
"epoch": 7.364341085271318,
"eval_loss": 0.8311659693717957,
"eval_runtime": 203.7994,
"eval_samples_per_second": 93.249,
"eval_steps_per_second": 1.943,
"step": 3800
},
{
"epoch": 7.461240310077519,
"grad_norm": 1.5149507522583008,
"learning_rate": 0.00017016279069767442,
"loss": 0.9037581634521484,
"step": 3850
},
{
"epoch": 7.461240310077519,
"eval_loss": 0.8201795220375061,
"eval_runtime": 201.3115,
"eval_samples_per_second": 94.401,
"eval_steps_per_second": 1.967,
"step": 3850
},
{
"epoch": 7.558139534883721,
"grad_norm": 1.7122201919555664,
"learning_rate": 0.0001697751937984496,
"loss": 0.9189765930175782,
"step": 3900
},
{
"epoch": 7.558139534883721,
"eval_loss": 0.8164864182472229,
"eval_runtime": 206.2635,
"eval_samples_per_second": 92.135,
"eval_steps_per_second": 1.92,
"step": 3900
},
{
"epoch": 7.655038759689923,
"grad_norm": 1.7001962661743164,
"learning_rate": 0.0001693875968992248,
"loss": 0.8901979827880859,
"step": 3950
},
{
"epoch": 7.655038759689923,
"eval_loss": 0.8141375184059143,
"eval_runtime": 203.6025,
"eval_samples_per_second": 93.339,
"eval_steps_per_second": 1.945,
"step": 3950
},
{
"epoch": 7.751937984496124,
"grad_norm": 1.6305474042892456,
"learning_rate": 0.00016900000000000002,
"loss": 0.9295430755615235,
"step": 4000
},
{
"epoch": 7.751937984496124,
"eval_loss": 0.8144168257713318,
"eval_runtime": 202.7837,
"eval_samples_per_second": 93.716,
"eval_steps_per_second": 1.953,
"step": 4000
},
{
"epoch": 7.848837209302325,
"grad_norm": 1.7076892852783203,
"learning_rate": 0.0001686124031007752,
"loss": 0.9095289611816406,
"step": 4050
},
{
"epoch": 7.848837209302325,
"eval_loss": 0.8173167109489441,
"eval_runtime": 205.0403,
"eval_samples_per_second": 92.684,
"eval_steps_per_second": 1.931,
"step": 4050
},
{
"epoch": 7.945736434108527,
"grad_norm": 1.469099521636963,
"learning_rate": 0.0001682248062015504,
"loss": 0.8886381530761719,
"step": 4100
},
{
"epoch": 7.945736434108527,
"eval_loss": 0.8015902042388916,
"eval_runtime": 206.0488,
"eval_samples_per_second": 92.231,
"eval_steps_per_second": 1.922,
"step": 4100
},
{
"epoch": 8.042635658914728,
"grad_norm": 1.439172387123108,
"learning_rate": 0.00016783720930232558,
"loss": 0.8689357757568359,
"step": 4150
},
{
"epoch": 8.042635658914728,
"eval_loss": 0.8171545267105103,
"eval_runtime": 202.334,
"eval_samples_per_second": 93.924,
"eval_steps_per_second": 1.957,
"step": 4150
},
{
"epoch": 8.13953488372093,
"grad_norm": 1.7564213275909424,
"learning_rate": 0.00016744961240310077,
"loss": 0.8746170043945313,
"step": 4200
},
{
"epoch": 8.13953488372093,
"eval_loss": 0.8083029389381409,
"eval_runtime": 202.7276,
"eval_samples_per_second": 93.742,
"eval_steps_per_second": 1.953,
"step": 4200
},
{
"epoch": 8.236434108527131,
"grad_norm": 1.6364485025405884,
"learning_rate": 0.00016706201550387595,
"loss": 0.8728688812255859,
"step": 4250
},
{
"epoch": 8.236434108527131,
"eval_loss": 0.7914299368858337,
"eval_runtime": 205.9727,
"eval_samples_per_second": 92.265,
"eval_steps_per_second": 1.923,
"step": 4250
},
{
"epoch": 8.333333333333334,
"grad_norm": 1.4534393548965454,
"learning_rate": 0.00016667441860465117,
"loss": 0.8740718841552735,
"step": 4300
},
{
"epoch": 8.333333333333334,
"eval_loss": 0.7857058048248291,
"eval_runtime": 205.9583,
"eval_samples_per_second": 92.271,
"eval_steps_per_second": 1.923,
"step": 4300
},
{
"epoch": 8.430232558139535,
"grad_norm": 1.2946027517318726,
"learning_rate": 0.00016628682170542636,
"loss": 0.8651500701904297,
"step": 4350
},
{
"epoch": 8.430232558139535,
"eval_loss": 0.7967393398284912,
"eval_runtime": 202.1907,
"eval_samples_per_second": 93.99,
"eval_steps_per_second": 1.959,
"step": 4350
},
{
"epoch": 8.527131782945737,
"grad_norm": 1.2329447269439697,
"learning_rate": 0.00016589922480620155,
"loss": 0.8730928802490234,
"step": 4400
},
{
"epoch": 8.527131782945737,
"eval_loss": 0.7840523719787598,
"eval_runtime": 200.6499,
"eval_samples_per_second": 94.712,
"eval_steps_per_second": 1.974,
"step": 4400
},
{
"epoch": 8.624031007751938,
"grad_norm": 1.441260814666748,
"learning_rate": 0.00016551162790697676,
"loss": 0.8639598083496094,
"step": 4450
},
{
"epoch": 8.624031007751938,
"eval_loss": 0.7866470217704773,
"eval_runtime": 201.2114,
"eval_samples_per_second": 94.448,
"eval_steps_per_second": 1.968,
"step": 4450
},
{
"epoch": 8.720930232558139,
"grad_norm": 1.4688923358917236,
"learning_rate": 0.00016512403100775195,
"loss": 0.8695674896240234,
"step": 4500
},
{
"epoch": 8.720930232558139,
"eval_loss": 0.7772266864776611,
"eval_runtime": 203.6283,
"eval_samples_per_second": 93.327,
"eval_steps_per_second": 1.945,
"step": 4500
},
{
"epoch": 8.817829457364342,
"grad_norm": 1.7634507417678833,
"learning_rate": 0.00016473643410852714,
"loss": 0.867518310546875,
"step": 4550
},
{
"epoch": 8.817829457364342,
"eval_loss": 0.7764750123023987,
"eval_runtime": 197.3536,
"eval_samples_per_second": 96.294,
"eval_steps_per_second": 2.007,
"step": 4550
},
{
"epoch": 8.914728682170542,
"grad_norm": 1.3822124004364014,
"learning_rate": 0.00016434883720930235,
"loss": 0.8379183959960937,
"step": 4600
},
{
"epoch": 8.914728682170542,
"eval_loss": 0.7751489877700806,
"eval_runtime": 204.9848,
"eval_samples_per_second": 92.709,
"eval_steps_per_second": 1.932,
"step": 4600
},
{
"epoch": 9.011627906976743,
"grad_norm": 1.485487461090088,
"learning_rate": 0.00016396124031007754,
"loss": 0.8471622467041016,
"step": 4650
},
{
"epoch": 9.011627906976743,
"eval_loss": 0.7833512425422668,
"eval_runtime": 204.7415,
"eval_samples_per_second": 92.819,
"eval_steps_per_second": 1.934,
"step": 4650
},
{
"epoch": 9.108527131782946,
"grad_norm": 1.704746961593628,
"learning_rate": 0.00016357364341085273,
"loss": 0.8221251678466797,
"step": 4700
},
{
"epoch": 9.108527131782946,
"eval_loss": 0.782015860080719,
"eval_runtime": 205.5077,
"eval_samples_per_second": 92.473,
"eval_steps_per_second": 1.927,
"step": 4700
},
{
"epoch": 9.205426356589147,
"grad_norm": 1.6045122146606445,
"learning_rate": 0.00016318604651162792,
"loss": 0.8225302124023437,
"step": 4750
},
{
"epoch": 9.205426356589147,
"eval_loss": 0.7847553491592407,
"eval_runtime": 203.1694,
"eval_samples_per_second": 93.538,
"eval_steps_per_second": 1.949,
"step": 4750
},
{
"epoch": 9.30232558139535,
"grad_norm": 1.482059121131897,
"learning_rate": 0.0001627984496124031,
"loss": 0.8403683471679687,
"step": 4800
},
{
"epoch": 9.30232558139535,
"eval_loss": 0.7648666501045227,
"eval_runtime": 207.7484,
"eval_samples_per_second": 91.476,
"eval_steps_per_second": 1.906,
"step": 4800
},
{
"epoch": 9.39922480620155,
"grad_norm": 1.5278195142745972,
"learning_rate": 0.00016241085271317832,
"loss": 0.8287559509277344,
"step": 4850
},
{
"epoch": 9.39922480620155,
"eval_loss": 0.7762807607650757,
"eval_runtime": 207.8748,
"eval_samples_per_second": 91.42,
"eval_steps_per_second": 1.905,
"step": 4850
},
{
"epoch": 9.496124031007753,
"grad_norm": 1.3437010049819946,
"learning_rate": 0.0001620232558139535,
"loss": 0.8490474700927735,
"step": 4900
},
{
"epoch": 9.496124031007753,
"eval_loss": 0.7723644375801086,
"eval_runtime": 202.5921,
"eval_samples_per_second": 93.804,
"eval_steps_per_second": 1.955,
"step": 4900
},
{
"epoch": 9.593023255813954,
"grad_norm": 1.41952645778656,
"learning_rate": 0.0001616356589147287,
"loss": 0.8415257263183594,
"step": 4950
},
{
"epoch": 9.593023255813954,
"eval_loss": 0.7634032964706421,
"eval_runtime": 204.2501,
"eval_samples_per_second": 93.043,
"eval_steps_per_second": 1.939,
"step": 4950
},
{
"epoch": 9.689922480620154,
"grad_norm": 1.5748244524002075,
"learning_rate": 0.00016124806201550388,
"loss": 0.8278125,
"step": 5000
},
{
"epoch": 9.689922480620154,
"eval_loss": 0.7638477087020874,
"eval_runtime": 206.8167,
"eval_samples_per_second": 91.888,
"eval_steps_per_second": 1.915,
"step": 5000
},
{
"epoch": 9.786821705426357,
"grad_norm": 1.1772520542144775,
"learning_rate": 0.00016086046511627907,
"loss": 0.8290666961669921,
"step": 5050
},
{
"epoch": 9.786821705426357,
"eval_loss": 0.7504697442054749,
"eval_runtime": 204.2184,
"eval_samples_per_second": 93.057,
"eval_steps_per_second": 1.939,
"step": 5050
},
{
"epoch": 9.883720930232558,
"grad_norm": 1.5144110918045044,
"learning_rate": 0.00016047286821705429,
"loss": 0.8253756713867187,
"step": 5100
},
{
"epoch": 9.883720930232558,
"eval_loss": 0.7516148090362549,
"eval_runtime": 203.6086,
"eval_samples_per_second": 93.336,
"eval_steps_per_second": 1.945,
"step": 5100
},
{
"epoch": 9.98062015503876,
"grad_norm": 1.3659805059432983,
"learning_rate": 0.00016008527131782947,
"loss": 0.8156401062011719,
"step": 5150
},
{
"epoch": 9.98062015503876,
"eval_loss": 0.7507323622703552,
"eval_runtime": 206.1146,
"eval_samples_per_second": 92.201,
"eval_steps_per_second": 1.921,
"step": 5150
},
{
"epoch": 10.077519379844961,
"grad_norm": 1.507645606994629,
"learning_rate": 0.00015969767441860466,
"loss": 0.8164395904541015,
"step": 5200
},
{
"epoch": 10.077519379844961,
"eval_loss": 0.7499141097068787,
"eval_runtime": 208.0903,
"eval_samples_per_second": 91.326,
"eval_steps_per_second": 1.903,
"step": 5200
},
{
"epoch": 10.174418604651162,
"grad_norm": 1.1920627355575562,
"learning_rate": 0.00015931007751937985,
"loss": 0.8137637329101562,
"step": 5250
},
{
"epoch": 10.174418604651162,
"eval_loss": 0.7570334672927856,
"eval_runtime": 210.8201,
"eval_samples_per_second": 90.143,
"eval_steps_per_second": 1.878,
"step": 5250
},
{
"epoch": 10.271317829457365,
"grad_norm": 1.5026272535324097,
"learning_rate": 0.00015892248062015504,
"loss": 0.8010871887207032,
"step": 5300
},
{
"epoch": 10.271317829457365,
"eval_loss": 0.7520400881767273,
"eval_runtime": 204.6587,
"eval_samples_per_second": 92.857,
"eval_steps_per_second": 1.935,
"step": 5300
},
{
"epoch": 10.368217054263566,
"grad_norm": 1.386483907699585,
"learning_rate": 0.00015853488372093023,
"loss": 0.808378677368164,
"step": 5350
},
{
"epoch": 10.368217054263566,
"eval_loss": 0.7365431785583496,
"eval_runtime": 208.4559,
"eval_samples_per_second": 91.166,
"eval_steps_per_second": 1.9,
"step": 5350
},
{
"epoch": 10.465116279069768,
"grad_norm": 1.3923448324203491,
"learning_rate": 0.00015814728682170544,
"loss": 0.7882123565673829,
"step": 5400
},
{
"epoch": 10.465116279069768,
"eval_loss": 0.7501969933509827,
"eval_runtime": 205.9772,
"eval_samples_per_second": 92.263,
"eval_steps_per_second": 1.923,
"step": 5400
},
{
"epoch": 10.562015503875969,
"grad_norm": 1.2909716367721558,
"learning_rate": 0.00015775968992248063,
"loss": 0.7820880126953125,
"step": 5450
},
{
"epoch": 10.562015503875969,
"eval_loss": 0.7447800636291504,
"eval_runtime": 209.0079,
"eval_samples_per_second": 90.925,
"eval_steps_per_second": 1.895,
"step": 5450
},
{
"epoch": 10.65891472868217,
"grad_norm": 1.2773196697235107,
"learning_rate": 0.00015737209302325582,
"loss": 0.788450698852539,
"step": 5500
},
{
"epoch": 10.65891472868217,
"eval_loss": 0.7404767274856567,
"eval_runtime": 213.2254,
"eval_samples_per_second": 89.126,
"eval_steps_per_second": 1.857,
"step": 5500
},
{
"epoch": 10.755813953488373,
"grad_norm": 1.4315084218978882,
"learning_rate": 0.000156984496124031,
"loss": 0.7946225738525391,
"step": 5550
},
{
"epoch": 10.755813953488373,
"eval_loss": 0.7304003238677979,
"eval_runtime": 205.986,
"eval_samples_per_second": 92.259,
"eval_steps_per_second": 1.922,
"step": 5550
},
{
"epoch": 10.852713178294573,
"grad_norm": 1.2794160842895508,
"learning_rate": 0.0001565968992248062,
"loss": 0.7854644012451172,
"step": 5600
},
{
"epoch": 10.852713178294573,
"eval_loss": 0.7165542244911194,
"eval_runtime": 211.2507,
"eval_samples_per_second": 89.959,
"eval_steps_per_second": 1.875,
"step": 5600
},
{
"epoch": 10.949612403100776,
"grad_norm": 1.4190521240234375,
"learning_rate": 0.0001562093023255814,
"loss": 0.7866236114501953,
"step": 5650
},
{
"epoch": 10.949612403100776,
"eval_loss": 0.7293540239334106,
"eval_runtime": 206.6809,
"eval_samples_per_second": 91.949,
"eval_steps_per_second": 1.916,
"step": 5650
},
{
"epoch": 11.046511627906977,
"grad_norm": 1.298693299293518,
"learning_rate": 0.0001558217054263566,
"loss": 0.7865670013427735,
"step": 5700
},
{
"epoch": 11.046511627906977,
"eval_loss": 0.7284151911735535,
"eval_runtime": 204.4335,
"eval_samples_per_second": 92.959,
"eval_steps_per_second": 1.937,
"step": 5700
},
{
"epoch": 11.143410852713178,
"grad_norm": 1.1174720525741577,
"learning_rate": 0.00015543410852713178,
"loss": 0.773541488647461,
"step": 5750
},
{
"epoch": 11.143410852713178,
"eval_loss": 0.7278522253036499,
"eval_runtime": 146.5948,
"eval_samples_per_second": 129.636,
"eval_steps_per_second": 2.701,
"step": 5750
},
{
"epoch": 11.24031007751938,
"grad_norm": 1.2272348403930664,
"learning_rate": 0.00015504651162790697,
"loss": 0.7666770935058593,
"step": 5800
},
{
"epoch": 11.24031007751938,
"eval_loss": 0.7202744483947754,
"eval_runtime": 193.7949,
"eval_samples_per_second": 98.062,
"eval_steps_per_second": 2.043,
"step": 5800
},
{
"epoch": 11.337209302325581,
"grad_norm": 1.373920202255249,
"learning_rate": 0.00015465891472868216,
"loss": 0.7730393981933594,
"step": 5850
},
{
"epoch": 11.337209302325581,
"eval_loss": 0.7175942063331604,
"eval_runtime": 197.7471,
"eval_samples_per_second": 96.103,
"eval_steps_per_second": 2.003,
"step": 5850
},
{
"epoch": 11.434108527131784,
"grad_norm": 1.4184571504592896,
"learning_rate": 0.00015427131782945737,
"loss": 0.7573123168945313,
"step": 5900
},
{
"epoch": 11.434108527131784,
"eval_loss": 0.7180309891700745,
"eval_runtime": 198.5097,
"eval_samples_per_second": 95.733,
"eval_steps_per_second": 1.995,
"step": 5900
},
{
"epoch": 11.531007751937985,
"grad_norm": 1.357619047164917,
"learning_rate": 0.00015388372093023256,
"loss": 0.7792025756835937,
"step": 5950
},
{
"epoch": 11.531007751937985,
"eval_loss": 0.7118659615516663,
"eval_runtime": 199.1368,
"eval_samples_per_second": 95.432,
"eval_steps_per_second": 1.989,
"step": 5950
},
{
"epoch": 11.627906976744185,
"grad_norm": 1.3384771347045898,
"learning_rate": 0.00015349612403100775,
"loss": 0.77959228515625,
"step": 6000
},
{
"epoch": 11.627906976744185,
"eval_loss": 0.7145297527313232,
"eval_runtime": 191.3254,
"eval_samples_per_second": 99.328,
"eval_steps_per_second": 2.07,
"step": 6000
},
{
"epoch": 11.724806201550388,
"grad_norm": 1.346449375152588,
"learning_rate": 0.00015310852713178296,
"loss": 0.760606918334961,
"step": 6050
},
{
"epoch": 11.724806201550388,
"eval_loss": 0.7076368927955627,
"eval_runtime": 195.1439,
"eval_samples_per_second": 97.385,
"eval_steps_per_second": 2.029,
"step": 6050
},
{
"epoch": 11.821705426356589,
"grad_norm": 1.3643659353256226,
"learning_rate": 0.00015272093023255815,
"loss": 0.7598040008544922,
"step": 6100
},
{
"epoch": 11.821705426356589,
"eval_loss": 0.7118851542472839,
"eval_runtime": 197.8301,
"eval_samples_per_second": 96.062,
"eval_steps_per_second": 2.002,
"step": 6100
},
{
"epoch": 11.918604651162791,
"grad_norm": 1.3271793127059937,
"learning_rate": 0.00015233333333333334,
"loss": 0.7641516876220703,
"step": 6150
},
{
"epoch": 11.918604651162791,
"eval_loss": 0.7121263146400452,
"eval_runtime": 193.0891,
"eval_samples_per_second": 98.421,
"eval_steps_per_second": 2.051,
"step": 6150
},
{
"epoch": 12.015503875968992,
"grad_norm": 1.4539422988891602,
"learning_rate": 0.00015194573643410856,
"loss": 0.7536121368408203,
"step": 6200
},
{
"epoch": 12.015503875968992,
"eval_loss": 0.6982870101928711,
"eval_runtime": 193.5336,
"eval_samples_per_second": 98.195,
"eval_steps_per_second": 2.046,
"step": 6200
},
{
"epoch": 12.112403100775193,
"grad_norm": 1.204176902770996,
"learning_rate": 0.00015155813953488374,
"loss": 0.7676261138916015,
"step": 6250
},
{
"epoch": 12.112403100775193,
"eval_loss": 0.7079237699508667,
"eval_runtime": 200.2318,
"eval_samples_per_second": 94.91,
"eval_steps_per_second": 1.978,
"step": 6250
},
{
"epoch": 12.209302325581396,
"grad_norm": 1.0595972537994385,
"learning_rate": 0.00015117054263565893,
"loss": 0.7566854858398437,
"step": 6300
},
{
"epoch": 12.209302325581396,
"eval_loss": 0.7162359356880188,
"eval_runtime": 194.1508,
"eval_samples_per_second": 97.883,
"eval_steps_per_second": 2.04,
"step": 6300
},
{
"epoch": 12.306201550387597,
"grad_norm": 1.2803763151168823,
"learning_rate": 0.00015078294573643412,
"loss": 0.7496888732910156,
"step": 6350
},
{
"epoch": 12.306201550387597,
"eval_loss": 0.7112685441970825,
"eval_runtime": 190.486,
"eval_samples_per_second": 99.766,
"eval_steps_per_second": 2.079,
"step": 6350
},
{
"epoch": 12.4031007751938,
"grad_norm": 1.1954985857009888,
"learning_rate": 0.0001503953488372093,
"loss": 0.7482534790039063,
"step": 6400
},
{
"epoch": 12.4031007751938,
"eval_loss": 0.702034056186676,
"eval_runtime": 196.6133,
"eval_samples_per_second": 96.657,
"eval_steps_per_second": 2.014,
"step": 6400
},
{
"epoch": 12.5,
"grad_norm": 1.217117428779602,
"learning_rate": 0.0001500077519379845,
"loss": 0.7486822509765625,
"step": 6450
},
{
"epoch": 12.5,
"eval_loss": 0.7068222165107727,
"eval_runtime": 200.776,
"eval_samples_per_second": 94.653,
"eval_steps_per_second": 1.972,
"step": 6450
},
{
"epoch": 12.5968992248062,
"grad_norm": 1.3999775648117065,
"learning_rate": 0.0001496201550387597,
"loss": 0.7376171112060547,
"step": 6500
},
{
"epoch": 12.5968992248062,
"eval_loss": 0.7056812644004822,
"eval_runtime": 196.5,
"eval_samples_per_second": 96.712,
"eval_steps_per_second": 2.015,
"step": 6500
},
{
"epoch": 12.693798449612403,
"grad_norm": 1.5863757133483887,
"learning_rate": 0.0001492325581395349,
"loss": 0.7303102111816406,
"step": 6550
},
{
"epoch": 12.693798449612403,
"eval_loss": 0.7024106383323669,
"eval_runtime": 192.9575,
"eval_samples_per_second": 98.488,
"eval_steps_per_second": 2.052,
"step": 6550
},
{
"epoch": 12.790697674418604,
"grad_norm": 1.221718192100525,
"learning_rate": 0.0001488449612403101,
"loss": 0.7352449798583984,
"step": 6600
},
{
"epoch": 12.790697674418604,
"eval_loss": 0.6982800960540771,
"eval_runtime": 178.9122,
"eval_samples_per_second": 106.22,
"eval_steps_per_second": 2.213,
"step": 6600
},
{
"epoch": 12.887596899224807,
"grad_norm": 1.2759432792663574,
"learning_rate": 0.00014845736434108527,
"loss": 0.7509404754638672,
"step": 6650
},
{
"epoch": 12.887596899224807,
"eval_loss": 0.7055057883262634,
"eval_runtime": 180.4112,
"eval_samples_per_second": 105.337,
"eval_steps_per_second": 2.195,
"step": 6650
},
{
"epoch": 12.984496124031008,
"grad_norm": 1.3235024213790894,
"learning_rate": 0.00014806976744186046,
"loss": 0.7331123352050781,
"step": 6700
},
{
"epoch": 12.984496124031008,
"eval_loss": 0.6977774500846863,
"eval_runtime": 177.5576,
"eval_samples_per_second": 107.03,
"eval_steps_per_second": 2.23,
"step": 6700
},
{
"epoch": 13.081395348837209,
"grad_norm": 1.3682844638824463,
"learning_rate": 0.00014768217054263568,
"loss": 0.7287850189208984,
"step": 6750
},
{
"epoch": 13.081395348837209,
"eval_loss": 0.6928258538246155,
"eval_runtime": 168.0062,
"eval_samples_per_second": 113.115,
"eval_steps_per_second": 2.357,
"step": 6750
},
{
"epoch": 13.178294573643411,
"grad_norm": 1.3493455648422241,
"learning_rate": 0.00014729457364341087,
"loss": 0.7296273040771485,
"step": 6800
},
{
"epoch": 13.178294573643411,
"eval_loss": 0.6921527981758118,
"eval_runtime": 183.7563,
"eval_samples_per_second": 103.42,
"eval_steps_per_second": 2.155,
"step": 6800
},
{
"epoch": 13.275193798449612,
"grad_norm": 1.1569421291351318,
"learning_rate": 0.00014690697674418605,
"loss": 0.7132796478271485,
"step": 6850
},
{
"epoch": 13.275193798449612,
"eval_loss": 0.6823224425315857,
"eval_runtime": 184.3854,
"eval_samples_per_second": 103.067,
"eval_steps_per_second": 2.148,
"step": 6850
},
{
"epoch": 13.372093023255815,
"grad_norm": 1.392767310142517,
"learning_rate": 0.00014651937984496124,
"loss": 0.7273464965820312,
"step": 6900
},
{
"epoch": 13.372093023255815,
"eval_loss": 0.6904884576797485,
"eval_runtime": 167.1685,
"eval_samples_per_second": 113.682,
"eval_steps_per_second": 2.369,
"step": 6900
},
{
"epoch": 13.468992248062015,
"grad_norm": 1.1925963163375854,
"learning_rate": 0.00014613178294573643,
"loss": 0.7213536834716797,
"step": 6950
},
{
"epoch": 13.468992248062015,
"eval_loss": 0.6783022284507751,
"eval_runtime": 162.2209,
"eval_samples_per_second": 117.149,
"eval_steps_per_second": 2.441,
"step": 6950
},
{
"epoch": 13.565891472868216,
"grad_norm": 1.2016693353652954,
"learning_rate": 0.00014574418604651164,
"loss": 0.7257649230957032,
"step": 7000
},
{
"epoch": 13.565891472868216,
"eval_loss": 0.6885735988616943,
"eval_runtime": 154.1542,
"eval_samples_per_second": 123.279,
"eval_steps_per_second": 2.569,
"step": 7000
},
{
"epoch": 13.662790697674419,
"grad_norm": 1.0946542024612427,
"learning_rate": 0.00014535658914728683,
"loss": 0.7063812255859375,
"step": 7050
},
{
"epoch": 13.662790697674419,
"eval_loss": 0.6905943155288696,
"eval_runtime": 159.4294,
"eval_samples_per_second": 119.2,
"eval_steps_per_second": 2.484,
"step": 7050
},
{
"epoch": 13.75968992248062,
"grad_norm": 1.445551872253418,
"learning_rate": 0.00014496899224806202,
"loss": 0.7338412475585937,
"step": 7100
},
{
"epoch": 13.75968992248062,
"eval_loss": 0.6792259216308594,
"eval_runtime": 168.9572,
"eval_samples_per_second": 112.478,
"eval_steps_per_second": 2.344,
"step": 7100
},
{
"epoch": 13.856589147286822,
"grad_norm": 1.1020028591156006,
"learning_rate": 0.0001445813953488372,
"loss": 0.7323764801025391,
"step": 7150
},
{
"epoch": 13.856589147286822,
"eval_loss": 0.6802482604980469,
"eval_runtime": 177.1213,
"eval_samples_per_second": 107.294,
"eval_steps_per_second": 2.236,
"step": 7150
},
{
"epoch": 13.953488372093023,
"grad_norm": 1.3807705640792847,
"learning_rate": 0.0001441937984496124,
"loss": 0.711009521484375,
"step": 7200
},
{
"epoch": 13.953488372093023,
"eval_loss": 0.6847464442253113,
"eval_runtime": 186.7934,
"eval_samples_per_second": 101.738,
"eval_steps_per_second": 2.12,
"step": 7200
},
{
"epoch": 14.050387596899224,
"grad_norm": 1.2024147510528564,
"learning_rate": 0.0001438062015503876,
"loss": 0.7219676971435547,
"step": 7250
},
{
"epoch": 14.050387596899224,
"eval_loss": 0.6811977028846741,
"eval_runtime": 192.2788,
"eval_samples_per_second": 98.836,
"eval_steps_per_second": 2.06,
"step": 7250
},
{
"epoch": 14.147286821705427,
"grad_norm": 1.3249707221984863,
"learning_rate": 0.0001434186046511628,
"loss": 0.7023135375976562,
"step": 7300
},
{
"epoch": 14.147286821705427,
"eval_loss": 0.6774859428405762,
"eval_runtime": 188.9814,
"eval_samples_per_second": 100.56,
"eval_steps_per_second": 2.095,
"step": 7300
},
{
"epoch": 14.244186046511627,
"grad_norm": 1.185625433921814,
"learning_rate": 0.000143031007751938,
"loss": 0.7096491241455078,
"step": 7350
},
{
"epoch": 14.244186046511627,
"eval_loss": 0.6745359301567078,
"eval_runtime": 182.8908,
"eval_samples_per_second": 103.909,
"eval_steps_per_second": 2.165,
"step": 7350
},
{
"epoch": 14.34108527131783,
"grad_norm": 1.3041390180587769,
"learning_rate": 0.00014264341085271318,
"loss": 0.7081405639648437,
"step": 7400
},
{
"epoch": 14.34108527131783,
"eval_loss": 0.6794308423995972,
"eval_runtime": 179.4287,
"eval_samples_per_second": 105.914,
"eval_steps_per_second": 2.207,
"step": 7400
},
{
"epoch": 14.437984496124031,
"grad_norm": 1.2018780708312988,
"learning_rate": 0.00014225581395348836,
"loss": 0.7056974029541015,
"step": 7450
},
{
"epoch": 14.437984496124031,
"eval_loss": 0.6793537735939026,
"eval_runtime": 186.1901,
"eval_samples_per_second": 102.068,
"eval_steps_per_second": 2.127,
"step": 7450
},
{
"epoch": 14.534883720930232,
"grad_norm": 1.1330044269561768,
"learning_rate": 0.00014186821705426355,
"loss": 0.7149968719482422,
"step": 7500
},
{
"epoch": 14.534883720930232,
"eval_loss": 0.6830456256866455,
"eval_runtime": 183.7445,
"eval_samples_per_second": 103.426,
"eval_steps_per_second": 2.155,
"step": 7500
},
{
"epoch": 14.631782945736434,
"grad_norm": 1.2338323593139648,
"learning_rate": 0.00014148062015503877,
"loss": 0.7174818420410156,
"step": 7550
},
{
"epoch": 14.631782945736434,
"eval_loss": 0.6693219542503357,
"eval_runtime": 187.3627,
"eval_samples_per_second": 101.429,
"eval_steps_per_second": 2.114,
"step": 7550
},
{
"epoch": 14.728682170542635,
"grad_norm": 1.3604202270507812,
"learning_rate": 0.00014109302325581395,
"loss": 0.6990853118896484,
"step": 7600
},
{
"epoch": 14.728682170542635,
"eval_loss": 0.6684260368347168,
"eval_runtime": 188.918,
"eval_samples_per_second": 100.594,
"eval_steps_per_second": 2.096,
"step": 7600
},
{
"epoch": 14.825581395348838,
"grad_norm": 1.1873857975006104,
"learning_rate": 0.00014070542635658917,
"loss": 0.7052565765380859,
"step": 7650
},
{
"epoch": 14.825581395348838,
"eval_loss": 0.6708822846412659,
"eval_runtime": 215.4533,
"eval_samples_per_second": 88.205,
"eval_steps_per_second": 1.838,
"step": 7650
},
{
"epoch": 14.922480620155039,
"grad_norm": 1.3028804063796997,
"learning_rate": 0.00014031782945736436,
"loss": 0.7046041870117188,
"step": 7700
},
{
"epoch": 14.922480620155039,
"eval_loss": 0.6715724468231201,
"eval_runtime": 207.9457,
"eval_samples_per_second": 91.389,
"eval_steps_per_second": 1.904,
"step": 7700
},
{
"epoch": 15.01937984496124,
"grad_norm": 1.2554810047149658,
"learning_rate": 0.00013993023255813954,
"loss": 0.6970509338378906,
"step": 7750
},
{
"epoch": 15.01937984496124,
"eval_loss": 0.6706631779670715,
"eval_runtime": 208.1519,
"eval_samples_per_second": 91.299,
"eval_steps_per_second": 1.902,
"step": 7750
},
{
"epoch": 15.116279069767442,
"grad_norm": 1.1705738306045532,
"learning_rate": 0.00013954263565891473,
"loss": 0.7088002014160156,
"step": 7800
},
{
"epoch": 15.116279069767442,
"eval_loss": 0.6595144271850586,
"eval_runtime": 214.6398,
"eval_samples_per_second": 88.539,
"eval_steps_per_second": 1.845,
"step": 7800
},
{
"epoch": 15.213178294573643,
"grad_norm": 1.4359619617462158,
"learning_rate": 0.00013915503875968995,
"loss": 0.6887350463867188,
"step": 7850
},
{
"epoch": 15.213178294573643,
"eval_loss": 0.666128396987915,
"eval_runtime": 210.8256,
"eval_samples_per_second": 90.141,
"eval_steps_per_second": 1.878,
"step": 7850
},
{
"epoch": 15.310077519379846,
"grad_norm": 1.2694238424301147,
"learning_rate": 0.00013876744186046514,
"loss": 0.6924005889892578,
"step": 7900
},
{
"epoch": 15.310077519379846,
"eval_loss": 0.6729814410209656,
"eval_runtime": 198.8675,
"eval_samples_per_second": 95.561,
"eval_steps_per_second": 1.991,
"step": 7900
},
{
"epoch": 15.406976744186046,
"grad_norm": 1.285810947418213,
"learning_rate": 0.00013837984496124032,
"loss": 0.6942383575439454,
"step": 7950
},
{
"epoch": 15.406976744186046,
"eval_loss": 0.6664624214172363,
"eval_runtime": 212.0427,
"eval_samples_per_second": 89.623,
"eval_steps_per_second": 1.868,
"step": 7950
},
{
"epoch": 15.503875968992247,
"grad_norm": 1.1837239265441895,
"learning_rate": 0.0001379922480620155,
"loss": 0.6832563018798828,
"step": 8000
},
{
"epoch": 15.503875968992247,
"eval_loss": 0.6609703898429871,
"eval_runtime": 214.853,
"eval_samples_per_second": 88.451,
"eval_steps_per_second": 1.843,
"step": 8000
},
{
"epoch": 15.60077519379845,
"grad_norm": 1.1095117330551147,
"learning_rate": 0.0001376046511627907,
"loss": 0.6847735595703125,
"step": 8050
},
{
"epoch": 15.60077519379845,
"eval_loss": 0.6504544615745544,
"eval_runtime": 217.8637,
"eval_samples_per_second": 87.229,
"eval_steps_per_second": 1.818,
"step": 8050
},
{
"epoch": 15.69767441860465,
"grad_norm": 1.2389260530471802,
"learning_rate": 0.00013721705426356591,
"loss": 0.6890559387207031,
"step": 8100
},
{
"epoch": 15.69767441860465,
"eval_loss": 0.6517437696456909,
"eval_runtime": 210.9914,
"eval_samples_per_second": 90.07,
"eval_steps_per_second": 1.877,
"step": 8100
},
{
"epoch": 15.794573643410853,
"grad_norm": 1.157686471939087,
"learning_rate": 0.0001368294573643411,
"loss": 0.6964131164550781,
"step": 8150
},
{
"epoch": 15.794573643410853,
"eval_loss": 0.6610472202301025,
"eval_runtime": 201.0817,
"eval_samples_per_second": 94.509,
"eval_steps_per_second": 1.969,
"step": 8150
},
{
"epoch": 15.891472868217054,
"grad_norm": 1.1839828491210938,
"learning_rate": 0.0001364418604651163,
"loss": 0.683568115234375,
"step": 8200
},
{
"epoch": 15.891472868217054,
"eval_loss": 0.6657268404960632,
"eval_runtime": 209.1114,
"eval_samples_per_second": 90.88,
"eval_steps_per_second": 1.894,
"step": 8200
},
{
"epoch": 15.988372093023255,
"grad_norm": 1.1578187942504883,
"learning_rate": 0.00013605426356589148,
"loss": 0.6690034484863281,
"step": 8250
},
{
"epoch": 15.988372093023255,
"eval_loss": 0.6609968543052673,
"eval_runtime": 206.0582,
"eval_samples_per_second": 92.226,
"eval_steps_per_second": 1.922,
"step": 8250
},
{
"epoch": 16.085271317829456,
"grad_norm": 1.0880696773529053,
"learning_rate": 0.00013566666666666667,
"loss": 0.6861241912841797,
"step": 8300
},
{
"epoch": 16.085271317829456,
"eval_loss": 0.6592596173286438,
"eval_runtime": 210.5186,
"eval_samples_per_second": 90.272,
"eval_steps_per_second": 1.881,
"step": 8300
},
{
"epoch": 16.18217054263566,
"grad_norm": 1.3988213539123535,
"learning_rate": 0.00013527906976744188,
"loss": 0.6735115814208984,
"step": 8350
},
{
"epoch": 16.18217054263566,
"eval_loss": 0.6536418795585632,
"eval_runtime": 214.3127,
"eval_samples_per_second": 88.674,
"eval_steps_per_second": 1.848,
"step": 8350
},
{
"epoch": 16.27906976744186,
"grad_norm": 1.1758220195770264,
"learning_rate": 0.00013489147286821707,
"loss": 0.6864335632324219,
"step": 8400
},
{
"epoch": 16.27906976744186,
"eval_loss": 0.6612951159477234,
"eval_runtime": 207.0226,
"eval_samples_per_second": 91.797,
"eval_steps_per_second": 1.913,
"step": 8400
},
{
"epoch": 16.375968992248062,
"grad_norm": 0.9774469137191772,
"learning_rate": 0.00013450387596899226,
"loss": 0.6769390869140625,
"step": 8450
},
{
"epoch": 16.375968992248062,
"eval_loss": 0.6553090810775757,
"eval_runtime": 215.4301,
"eval_samples_per_second": 88.214,
"eval_steps_per_second": 1.838,
"step": 8450
},
{
"epoch": 16.472868217054263,
"grad_norm": 1.1165791749954224,
"learning_rate": 0.00013411627906976745,
"loss": 0.6800428009033204,
"step": 8500
},
{
"epoch": 16.472868217054263,
"eval_loss": 0.6524401307106018,
"eval_runtime": 213.4954,
"eval_samples_per_second": 89.014,
"eval_steps_per_second": 1.855,
"step": 8500
},
{
"epoch": 16.569767441860463,
"grad_norm": 1.130295753479004,
"learning_rate": 0.00013372868217054263,
"loss": 0.6797599792480469,
"step": 8550
},
{
"epoch": 16.569767441860463,
"eval_loss": 0.6471645832061768,
"eval_runtime": 212.8448,
"eval_samples_per_second": 89.286,
"eval_steps_per_second": 1.861,
"step": 8550
},
{
"epoch": 16.666666666666668,
"grad_norm": 1.1940258741378784,
"learning_rate": 0.00013334108527131782,
"loss": 0.6560598754882813,
"step": 8600
},
{
"epoch": 16.666666666666668,
"eval_loss": 0.6543801426887512,
"eval_runtime": 215.4431,
"eval_samples_per_second": 88.209,
"eval_steps_per_second": 1.838,
"step": 8600
},
{
"epoch": 16.76356589147287,
"grad_norm": 1.2403243780136108,
"learning_rate": 0.00013295348837209304,
"loss": 0.6820440673828125,
"step": 8650
},
{
"epoch": 16.76356589147287,
"eval_loss": 0.657560408115387,
"eval_runtime": 222.4968,
"eval_samples_per_second": 85.412,
"eval_steps_per_second": 1.78,
"step": 8650
},
{
"epoch": 16.86046511627907,
"grad_norm": 1.1667098999023438,
"learning_rate": 0.00013256589147286822,
"loss": 0.661100082397461,
"step": 8700
},
{
"epoch": 16.86046511627907,
"eval_loss": 0.6547607183456421,
"eval_runtime": 225.7306,
"eval_samples_per_second": 84.189,
"eval_steps_per_second": 1.754,
"step": 8700
},
{
"epoch": 16.95736434108527,
"grad_norm": 1.4980967044830322,
"learning_rate": 0.0001321782945736434,
"loss": 0.6610430908203125,
"step": 8750
},
{
"epoch": 16.95736434108527,
"eval_loss": 0.6534817814826965,
"eval_runtime": 216.3877,
"eval_samples_per_second": 87.824,
"eval_steps_per_second": 1.83,
"step": 8750
},
{
"epoch": 17.05426356589147,
"grad_norm": 1.3227039575576782,
"learning_rate": 0.0001317906976744186,
"loss": 0.6713478088378906,
"step": 8800
},
{
"epoch": 17.05426356589147,
"eval_loss": 0.6427262425422668,
"eval_runtime": 218.6666,
"eval_samples_per_second": 86.909,
"eval_steps_per_second": 1.811,
"step": 8800
},
{
"epoch": 17.151162790697676,
"grad_norm": 1.0835665464401245,
"learning_rate": 0.0001314031007751938,
"loss": 0.6705303955078125,
"step": 8850
},
{
"epoch": 17.151162790697676,
"eval_loss": 0.6579347848892212,
"eval_runtime": 195.3941,
"eval_samples_per_second": 97.26,
"eval_steps_per_second": 2.027,
"step": 8850
},
{
"epoch": 17.248062015503876,
"grad_norm": 1.3096404075622559,
"learning_rate": 0.000131015503875969,
"loss": 0.6573382568359375,
"step": 8900
},
{
"epoch": 17.248062015503876,
"eval_loss": 0.6422920823097229,
"eval_runtime": 190.6185,
"eval_samples_per_second": 99.697,
"eval_steps_per_second": 2.077,
"step": 8900
},
{
"epoch": 17.344961240310077,
"grad_norm": 1.360499620437622,
"learning_rate": 0.0001306279069767442,
"loss": 0.6811130523681641,
"step": 8950
},
{
"epoch": 17.344961240310077,
"eval_loss": 0.6441925168037415,
"eval_runtime": 190.9697,
"eval_samples_per_second": 99.513,
"eval_steps_per_second": 2.074,
"step": 8950
},
{
"epoch": 17.441860465116278,
"grad_norm": 1.1536288261413574,
"learning_rate": 0.00013024031007751938,
"loss": 0.6666915893554688,
"step": 9000
},
{
"epoch": 17.441860465116278,
"eval_loss": 0.6471693515777588,
"eval_runtime": 200.6831,
"eval_samples_per_second": 94.697,
"eval_steps_per_second": 1.973,
"step": 9000
},
{
"epoch": 17.53875968992248,
"grad_norm": 1.135809302330017,
"learning_rate": 0.00012985271317829457,
"loss": 0.6658331298828125,
"step": 9050
},
{
"epoch": 17.53875968992248,
"eval_loss": 0.6478157043457031,
"eval_runtime": 195.5434,
"eval_samples_per_second": 97.186,
"eval_steps_per_second": 2.025,
"step": 9050
},
{
"epoch": 17.635658914728683,
"grad_norm": 1.2341054677963257,
"learning_rate": 0.00012946511627906976,
"loss": 0.6616094970703125,
"step": 9100
},
{
"epoch": 17.635658914728683,
"eval_loss": 0.6445872783660889,
"eval_runtime": 195.3659,
"eval_samples_per_second": 97.274,
"eval_steps_per_second": 2.027,
"step": 9100
},
{
"epoch": 17.732558139534884,
"grad_norm": 1.0323907136917114,
"learning_rate": 0.00012907751937984497,
"loss": 0.6661641693115234,
"step": 9150
},
{
"epoch": 17.732558139534884,
"eval_loss": 0.647544264793396,
"eval_runtime": 193.9072,
"eval_samples_per_second": 98.006,
"eval_steps_per_second": 2.042,
"step": 9150
},
{
"epoch": 17.829457364341085,
"grad_norm": 1.0194849967956543,
"learning_rate": 0.00012868992248062016,
"loss": 0.6610712432861328,
"step": 9200
},
{
"epoch": 17.829457364341085,
"eval_loss": 0.635890543460846,
"eval_runtime": 195.1292,
"eval_samples_per_second": 97.392,
"eval_steps_per_second": 2.029,
"step": 9200
},
{
"epoch": 17.926356589147286,
"grad_norm": 1.091950535774231,
"learning_rate": 0.00012830232558139535,
"loss": 0.6431932067871093,
"step": 9250
},
{
"epoch": 17.926356589147286,
"eval_loss": 0.6398835182189941,
"eval_runtime": 195.3898,
"eval_samples_per_second": 97.262,
"eval_steps_per_second": 2.027,
"step": 9250
},
{
"epoch": 18.023255813953487,
"grad_norm": 1.4130749702453613,
"learning_rate": 0.00012791472868217056,
"loss": 0.648166275024414,
"step": 9300
},
{
"epoch": 18.023255813953487,
"eval_loss": 0.6456313729286194,
"eval_runtime": 193.6438,
"eval_samples_per_second": 98.139,
"eval_steps_per_second": 2.045,
"step": 9300
},
{
"epoch": 18.12015503875969,
"grad_norm": 1.1974269151687622,
"learning_rate": 0.00012752713178294575,
"loss": 0.6472698211669922,
"step": 9350
},
{
"epoch": 18.12015503875969,
"eval_loss": 0.6428890824317932,
"eval_runtime": 197.0847,
"eval_samples_per_second": 96.426,
"eval_steps_per_second": 2.009,
"step": 9350
},
{
"epoch": 18.217054263565892,
"grad_norm": 1.071179747581482,
"learning_rate": 0.00012713953488372094,
"loss": 0.6578852844238281,
"step": 9400
},
{
"epoch": 18.217054263565892,
"eval_loss": 0.6396385431289673,
"eval_runtime": 226.4716,
"eval_samples_per_second": 83.913,
"eval_steps_per_second": 1.749,
"step": 9400
},
{
"epoch": 18.313953488372093,
"grad_norm": 1.1610106229782104,
"learning_rate": 0.00012675193798449615,
"loss": 0.645213851928711,
"step": 9450
},
{
"epoch": 18.313953488372093,
"eval_loss": 0.6382104158401489,
"eval_runtime": 196.6143,
"eval_samples_per_second": 96.656,
"eval_steps_per_second": 2.014,
"step": 9450
},
{
"epoch": 18.410852713178294,
"grad_norm": 0.9724190831184387,
"learning_rate": 0.00012636434108527134,
"loss": 0.637158203125,
"step": 9500
},
{
"epoch": 18.410852713178294,
"eval_loss": 0.6358678936958313,
"eval_runtime": 194.2795,
"eval_samples_per_second": 97.818,
"eval_steps_per_second": 2.038,
"step": 9500
},
{
"epoch": 18.507751937984494,
"grad_norm": 0.9816511273384094,
"learning_rate": 0.00012597674418604653,
"loss": 0.6615879821777344,
"step": 9550
},
{
"epoch": 18.507751937984494,
"eval_loss": 0.6309703588485718,
"eval_runtime": 197.0762,
"eval_samples_per_second": 96.43,
"eval_steps_per_second": 2.009,
"step": 9550
},
{
"epoch": 18.6046511627907,
"grad_norm": 1.0494227409362793,
"learning_rate": 0.00012558914728682172,
"loss": 0.6506745910644531,
"step": 9600
},
{
"epoch": 18.6046511627907,
"eval_loss": 0.636694610118866,
"eval_runtime": 201.6065,
"eval_samples_per_second": 94.263,
"eval_steps_per_second": 1.964,
"step": 9600
},
{
"epoch": 18.7015503875969,
"grad_norm": 1.328189730644226,
"learning_rate": 0.0001252015503875969,
"loss": 0.6442088317871094,
"step": 9650
},
{
"epoch": 18.7015503875969,
"eval_loss": 0.6276881098747253,
"eval_runtime": 186.5785,
"eval_samples_per_second": 101.855,
"eval_steps_per_second": 2.122,
"step": 9650
},
{
"epoch": 18.7984496124031,
"grad_norm": 1.1828275918960571,
"learning_rate": 0.0001248139534883721,
"loss": 0.6382374954223633,
"step": 9700
},
{
"epoch": 18.7984496124031,
"eval_loss": 0.636686384677887,
"eval_runtime": 180.6464,
"eval_samples_per_second": 105.2,
"eval_steps_per_second": 2.192,
"step": 9700
},
{
"epoch": 18.8953488372093,
"grad_norm": 1.2978507280349731,
"learning_rate": 0.0001244263565891473,
"loss": 0.6523281860351563,
"step": 9750
},
{
"epoch": 18.8953488372093,
"eval_loss": 0.6327991485595703,
"eval_runtime": 177.6563,
"eval_samples_per_second": 106.971,
"eval_steps_per_second": 2.229,
"step": 9750
},
{
"epoch": 18.992248062015506,
"grad_norm": 1.143589735031128,
"learning_rate": 0.0001240387596899225,
"loss": 0.6488991546630859,
"step": 9800
},
{
"epoch": 18.992248062015506,
"eval_loss": 0.6304866075515747,
"eval_runtime": 211.5352,
"eval_samples_per_second": 89.838,
"eval_steps_per_second": 1.872,
"step": 9800
},
{
"epoch": 19.089147286821706,
"grad_norm": 1.0623714923858643,
"learning_rate": 0.00012365116279069768,
"loss": 0.6333879852294921,
"step": 9850
},
{
"epoch": 19.089147286821706,
"eval_loss": 0.6215238571166992,
"eval_runtime": 202.8482,
"eval_samples_per_second": 93.686,
"eval_steps_per_second": 1.952,
"step": 9850
},
{
"epoch": 19.186046511627907,
"grad_norm": 1.1245452165603638,
"learning_rate": 0.00012326356589147287,
"loss": 0.642474136352539,
"step": 9900
},
{
"epoch": 19.186046511627907,
"eval_loss": 0.637874960899353,
"eval_runtime": 215.8143,
"eval_samples_per_second": 88.057,
"eval_steps_per_second": 1.835,
"step": 9900
},
{
"epoch": 19.282945736434108,
"grad_norm": 1.5039018392562866,
"learning_rate": 0.00012287596899224806,
"loss": 0.6468492126464844,
"step": 9950
},
{
"epoch": 19.282945736434108,
"eval_loss": 0.6239569187164307,
"eval_runtime": 212.329,
"eval_samples_per_second": 89.503,
"eval_steps_per_second": 1.865,
"step": 9950
},
{
"epoch": 19.37984496124031,
"grad_norm": 1.0403733253479004,
"learning_rate": 0.00012248837209302327,
"loss": 0.6217416381835937,
"step": 10000
},
{
"epoch": 19.37984496124031,
"eval_loss": 0.6286182999610901,
"eval_runtime": 205.5809,
"eval_samples_per_second": 92.44,
"eval_steps_per_second": 1.926,
"step": 10000
},
{
"epoch": 19.476744186046513,
"grad_norm": 1.1081093549728394,
"learning_rate": 0.00012210077519379846,
"loss": 0.6342117691040039,
"step": 10050
},
{
"epoch": 19.476744186046513,
"eval_loss": 0.6270238161087036,
"eval_runtime": 208.2038,
"eval_samples_per_second": 91.276,
"eval_steps_per_second": 1.902,
"step": 10050
},
{
"epoch": 19.573643410852714,
"grad_norm": 1.0839483737945557,
"learning_rate": 0.00012171317829457365,
"loss": 0.6427702331542968,
"step": 10100
},
{
"epoch": 19.573643410852714,
"eval_loss": 0.6364043354988098,
"eval_runtime": 196.6629,
"eval_samples_per_second": 96.632,
"eval_steps_per_second": 2.014,
"step": 10100
},
{
"epoch": 19.670542635658915,
"grad_norm": 0.998694658279419,
"learning_rate": 0.00012132558139534884,
"loss": 0.6340975570678711,
"step": 10150
},
{
"epoch": 19.670542635658915,
"eval_loss": 0.6303887963294983,
"eval_runtime": 204.4391,
"eval_samples_per_second": 92.957,
"eval_steps_per_second": 1.937,
"step": 10150
},
{
"epoch": 19.767441860465116,
"grad_norm": 1.228639006614685,
"learning_rate": 0.00012093798449612404,
"loss": 0.6313095474243164,
"step": 10200
},
{
"epoch": 19.767441860465116,
"eval_loss": 0.6240493655204773,
"eval_runtime": 204.6714,
"eval_samples_per_second": 92.851,
"eval_steps_per_second": 1.935,
"step": 10200
},
{
"epoch": 19.864341085271317,
"grad_norm": 1.3371620178222656,
"learning_rate": 0.00012055038759689923,
"loss": 0.6461412048339844,
"step": 10250
},
{
"epoch": 19.864341085271317,
"eval_loss": 0.6157258152961731,
"eval_runtime": 198.6411,
"eval_samples_per_second": 95.67,
"eval_steps_per_second": 1.994,
"step": 10250
},
{
"epoch": 19.96124031007752,
"grad_norm": 1.006132960319519,
"learning_rate": 0.00012016279069767441,
"loss": 0.6349713897705078,
"step": 10300
},
{
"epoch": 19.96124031007752,
"eval_loss": 0.6245470643043518,
"eval_runtime": 195.4527,
"eval_samples_per_second": 97.231,
"eval_steps_per_second": 2.026,
"step": 10300
},
{
"epoch": 20.058139534883722,
"grad_norm": 1.0197911262512207,
"learning_rate": 0.00011977519379844962,
"loss": 0.6510966491699218,
"step": 10350
},
{
"epoch": 20.058139534883722,
"eval_loss": 0.6246243715286255,
"eval_runtime": 191.3549,
"eval_samples_per_second": 99.313,
"eval_steps_per_second": 2.069,
"step": 10350
},
{
"epoch": 20.155038759689923,
"grad_norm": 1.1822741031646729,
"learning_rate": 0.0001193875968992248,
"loss": 0.6449718475341797,
"step": 10400
},
{
"epoch": 20.155038759689923,
"eval_loss": 0.6225494742393494,
"eval_runtime": 183.6561,
"eval_samples_per_second": 103.476,
"eval_steps_per_second": 2.156,
"step": 10400
},
{
"epoch": 20.251937984496124,
"grad_norm": 1.1490117311477661,
"learning_rate": 0.000119,
"loss": 0.6246649551391602,
"step": 10450
},
{
"epoch": 20.251937984496124,
"eval_loss": 0.6180397868156433,
"eval_runtime": 178.3831,
"eval_samples_per_second": 106.535,
"eval_steps_per_second": 2.22,
"step": 10450
},
{
"epoch": 20.348837209302324,
"grad_norm": 1.1532789468765259,
"learning_rate": 0.0001186124031007752,
"loss": 0.6352950668334961,
"step": 10500
},
{
"epoch": 20.348837209302324,
"eval_loss": 0.6122626662254333,
"eval_runtime": 191.7685,
"eval_samples_per_second": 99.099,
"eval_steps_per_second": 2.065,
"step": 10500
},
{
"epoch": 20.44573643410853,
"grad_norm": 1.1865143775939941,
"learning_rate": 0.00011822480620155038,
"loss": 0.6208504486083984,
"step": 10550
},
{
"epoch": 20.44573643410853,
"eval_loss": 0.6198189854621887,
"eval_runtime": 187.1718,
"eval_samples_per_second": 101.532,
"eval_steps_per_second": 2.116,
"step": 10550
},
{
"epoch": 20.54263565891473,
"grad_norm": 1.3015577793121338,
"learning_rate": 0.00011783720930232558,
"loss": 0.6392872619628907,
"step": 10600
},
{
"epoch": 20.54263565891473,
"eval_loss": 0.6212337017059326,
"eval_runtime": 189.8147,
"eval_samples_per_second": 100.119,
"eval_steps_per_second": 2.086,
"step": 10600
},
{
"epoch": 20.63953488372093,
"grad_norm": 1.0097442865371704,
"learning_rate": 0.00011744961240310077,
"loss": 0.630739974975586,
"step": 10650
},
{
"epoch": 20.63953488372093,
"eval_loss": 0.618180513381958,
"eval_runtime": 186.5814,
"eval_samples_per_second": 101.854,
"eval_steps_per_second": 2.122,
"step": 10650
},
{
"epoch": 20.73643410852713,
"grad_norm": 1.3883415460586548,
"learning_rate": 0.00011706201550387596,
"loss": 0.6176488494873047,
"step": 10700
},
{
"epoch": 20.73643410852713,
"eval_loss": 0.6110183596611023,
"eval_runtime": 184.6587,
"eval_samples_per_second": 102.914,
"eval_steps_per_second": 2.144,
"step": 10700
},
{
"epoch": 20.833333333333332,
"grad_norm": 1.1491754055023193,
"learning_rate": 0.00011667441860465116,
"loss": 0.6281963729858399,
"step": 10750
},
{
"epoch": 20.833333333333332,
"eval_loss": 0.6063302159309387,
"eval_runtime": 190.669,
"eval_samples_per_second": 99.67,
"eval_steps_per_second": 2.077,
"step": 10750
},
{
"epoch": 20.930232558139537,
"grad_norm": 1.07293701171875,
"learning_rate": 0.00011628682170542635,
"loss": 0.6193827438354492,
"step": 10800
},
{
"epoch": 20.930232558139537,
"eval_loss": 0.6157019734382629,
"eval_runtime": 181.2458,
"eval_samples_per_second": 104.852,
"eval_steps_per_second": 2.185,
"step": 10800
},
{
"epoch": 21.027131782945737,
"grad_norm": 1.0467058420181274,
"learning_rate": 0.00011589922480620155,
"loss": 0.6196688079833984,
"step": 10850
},
{
"epoch": 21.027131782945737,
"eval_loss": 0.6192191243171692,
"eval_runtime": 180.8537,
"eval_samples_per_second": 105.079,
"eval_steps_per_second": 2.19,
"step": 10850
},
{
"epoch": 21.124031007751938,
"grad_norm": 1.2847251892089844,
"learning_rate": 0.00011551162790697677,
"loss": 0.6209733581542969,
"step": 10900
},
{
"epoch": 21.124031007751938,
"eval_loss": 0.6143134832382202,
"eval_runtime": 186.2509,
"eval_samples_per_second": 102.034,
"eval_steps_per_second": 2.126,
"step": 10900
},
{
"epoch": 21.22093023255814,
"grad_norm": 1.1140937805175781,
"learning_rate": 0.00011512403100775195,
"loss": 0.6021680450439453,
"step": 10950
},
{
"epoch": 21.22093023255814,
"eval_loss": 0.6277331709861755,
"eval_runtime": 173.1683,
"eval_samples_per_second": 109.743,
"eval_steps_per_second": 2.287,
"step": 10950
},
{
"epoch": 21.31782945736434,
"grad_norm": 1.1287832260131836,
"learning_rate": 0.00011473643410852714,
"loss": 0.6102452087402344,
"step": 11000
},
{
"epoch": 21.31782945736434,
"eval_loss": 0.6136771440505981,
"eval_runtime": 176.7361,
"eval_samples_per_second": 107.528,
"eval_steps_per_second": 2.241,
"step": 11000
},
{
"epoch": 21.414728682170544,
"grad_norm": 1.0684071779251099,
"learning_rate": 0.00011434883720930234,
"loss": 0.6249099731445312,
"step": 11050
},
{
"epoch": 21.414728682170544,
"eval_loss": 0.6049215793609619,
"eval_runtime": 182.8852,
"eval_samples_per_second": 103.912,
"eval_steps_per_second": 2.165,
"step": 11050
},
{
"epoch": 21.511627906976745,
"grad_norm": 1.1844762563705444,
"learning_rate": 0.00011396124031007753,
"loss": 0.6158002853393555,
"step": 11100
},
{
"epoch": 21.511627906976745,
"eval_loss": 0.6154988408088684,
"eval_runtime": 181.5179,
"eval_samples_per_second": 104.695,
"eval_steps_per_second": 2.182,
"step": 11100
},
{
"epoch": 21.608527131782946,
"grad_norm": 0.994705855846405,
"learning_rate": 0.00011357364341085273,
"loss": 0.6291318130493164,
"step": 11150
},
{
"epoch": 21.608527131782946,
"eval_loss": 0.6036517024040222,
"eval_runtime": 183.5149,
"eval_samples_per_second": 103.556,
"eval_steps_per_second": 2.158,
"step": 11150
},
{
"epoch": 21.705426356589147,
"grad_norm": 1.1102783679962158,
"learning_rate": 0.00011318604651162792,
"loss": 0.6337633514404297,
"step": 11200
},
{
"epoch": 21.705426356589147,
"eval_loss": 0.6135731935501099,
"eval_runtime": 187.0592,
"eval_samples_per_second": 101.594,
"eval_steps_per_second": 2.117,
"step": 11200
},
{
"epoch": 21.802325581395348,
"grad_norm": 1.244672179222107,
"learning_rate": 0.00011279844961240311,
"loss": 0.6188518524169921,
"step": 11250
},
{
"epoch": 21.802325581395348,
"eval_loss": 0.6139986515045166,
"eval_runtime": 183.6873,
"eval_samples_per_second": 103.458,
"eval_steps_per_second": 2.156,
"step": 11250
},
{
"epoch": 21.899224806201552,
"grad_norm": 1.0099879503250122,
"learning_rate": 0.00011241085271317831,
"loss": 0.6150000381469727,
"step": 11300
},
{
"epoch": 21.899224806201552,
"eval_loss": 0.6092088222503662,
"eval_runtime": 183.1899,
"eval_samples_per_second": 103.739,
"eval_steps_per_second": 2.162,
"step": 11300
},
{
"epoch": 21.996124031007753,
"grad_norm": 1.1326417922973633,
"learning_rate": 0.0001120232558139535,
"loss": 0.6261586380004883,
"step": 11350
},
{
"epoch": 21.996124031007753,
"eval_loss": 0.6016091704368591,
"eval_runtime": 188.0838,
"eval_samples_per_second": 101.04,
"eval_steps_per_second": 2.105,
"step": 11350
},
{
"epoch": 22.093023255813954,
"grad_norm": 1.046021819114685,
"learning_rate": 0.00011163565891472869,
"loss": 0.6060951614379883,
"step": 11400
},
{
"epoch": 22.093023255813954,
"eval_loss": 0.6057671308517456,
"eval_runtime": 196.3362,
"eval_samples_per_second": 96.793,
"eval_steps_per_second": 2.017,
"step": 11400
},
{
"epoch": 22.189922480620154,
"grad_norm": 1.077669382095337,
"learning_rate": 0.00011124806201550389,
"loss": 0.6060348510742187,
"step": 11450
},
{
"epoch": 22.189922480620154,
"eval_loss": 0.6033645272254944,
"eval_runtime": 197.9875,
"eval_samples_per_second": 95.986,
"eval_steps_per_second": 2.0,
"step": 11450
},
{
"epoch": 22.286821705426355,
"grad_norm": 1.2075508832931519,
"learning_rate": 0.00011086046511627907,
"loss": 0.6069083786010743,
"step": 11500
},
{
"epoch": 22.286821705426355,
"eval_loss": 0.6047888994216919,
"eval_runtime": 199.6553,
"eval_samples_per_second": 95.184,
"eval_steps_per_second": 1.983,
"step": 11500
},
{
"epoch": 22.38372093023256,
"grad_norm": 1.1000014543533325,
"learning_rate": 0.00011047286821705428,
"loss": 0.6034501266479492,
"step": 11550
},
{
"epoch": 22.38372093023256,
"eval_loss": 0.6033228635787964,
"eval_runtime": 196.9171,
"eval_samples_per_second": 96.508,
"eval_steps_per_second": 2.011,
"step": 11550
},
{
"epoch": 22.48062015503876,
"grad_norm": 1.1755788326263428,
"learning_rate": 0.00011008527131782946,
"loss": 0.6222854232788086,
"step": 11600
},
{
"epoch": 22.48062015503876,
"eval_loss": 0.6062292456626892,
"eval_runtime": 198.4476,
"eval_samples_per_second": 95.763,
"eval_steps_per_second": 1.995,
"step": 11600
},
{
"epoch": 22.57751937984496,
"grad_norm": 1.0309172868728638,
"learning_rate": 0.00010969767441860465,
"loss": 0.6081157302856446,
"step": 11650
},
{
"epoch": 22.57751937984496,
"eval_loss": 0.6113541126251221,
"eval_runtime": 202.1539,
"eval_samples_per_second": 94.008,
"eval_steps_per_second": 1.959,
"step": 11650
},
{
"epoch": 22.674418604651162,
"grad_norm": 1.137413501739502,
"learning_rate": 0.00010931007751937985,
"loss": 0.6076561737060547,
"step": 11700
},
{
"epoch": 22.674418604651162,
"eval_loss": 0.6047736406326294,
"eval_runtime": 192.1483,
"eval_samples_per_second": 98.903,
"eval_steps_per_second": 2.061,
"step": 11700
},
{
"epoch": 22.771317829457363,
"grad_norm": 0.9180145263671875,
"learning_rate": 0.00010892248062015504,
"loss": 0.5997943878173828,
"step": 11750
},
{
"epoch": 22.771317829457363,
"eval_loss": 0.6039275527000427,
"eval_runtime": 193.6762,
"eval_samples_per_second": 98.123,
"eval_steps_per_second": 2.045,
"step": 11750
},
{
"epoch": 22.868217054263567,
"grad_norm": 0.9683696031570435,
"learning_rate": 0.00010853488372093023,
"loss": 0.6150859451293945,
"step": 11800
},
{
"epoch": 22.868217054263567,
"eval_loss": 0.5961532592773438,
"eval_runtime": 200.108,
"eval_samples_per_second": 94.969,
"eval_steps_per_second": 1.979,
"step": 11800
},
{
"epoch": 22.96511627906977,
"grad_norm": 1.114216685295105,
"learning_rate": 0.00010814728682170543,
"loss": 0.6122099304199219,
"step": 11850
},
{
"epoch": 22.96511627906977,
"eval_loss": 0.6024142503738403,
"eval_runtime": 206.7493,
"eval_samples_per_second": 91.918,
"eval_steps_per_second": 1.915,
"step": 11850
},
{
"epoch": 23.06201550387597,
"grad_norm": 1.209195613861084,
"learning_rate": 0.00010775968992248062,
"loss": 0.6041343688964844,
"step": 11900
},
{
"epoch": 23.06201550387597,
"eval_loss": 0.602457582950592,
"eval_runtime": 209.0607,
"eval_samples_per_second": 90.902,
"eval_steps_per_second": 1.894,
"step": 11900
},
{
"epoch": 23.15891472868217,
"grad_norm": 1.2101048231124878,
"learning_rate": 0.00010737209302325582,
"loss": 0.6141452026367188,
"step": 11950
},
{
"epoch": 23.15891472868217,
"eval_loss": 0.5966194868087769,
"eval_runtime": 218.8529,
"eval_samples_per_second": 86.835,
"eval_steps_per_second": 1.809,
"step": 11950
},
{
"epoch": 23.25581395348837,
"grad_norm": 1.0980631113052368,
"learning_rate": 0.00010698449612403101,
"loss": 0.6167293548583984,
"step": 12000
},
{
"epoch": 23.25581395348837,
"eval_loss": 0.5952345728874207,
"eval_runtime": 217.6315,
"eval_samples_per_second": 87.322,
"eval_steps_per_second": 1.82,
"step": 12000
},
{
"epoch": 23.352713178294575,
"grad_norm": 1.0111398696899414,
"learning_rate": 0.0001065968992248062,
"loss": 0.6044289779663086,
"step": 12050
},
{
"epoch": 23.352713178294575,
"eval_loss": 0.5976375341415405,
"eval_runtime": 213.6111,
"eval_samples_per_second": 88.965,
"eval_steps_per_second": 1.854,
"step": 12050
},
{
"epoch": 23.449612403100776,
"grad_norm": 0.9988157749176025,
"learning_rate": 0.0001062093023255814,
"loss": 0.5936355972290039,
"step": 12100
},
{
"epoch": 23.449612403100776,
"eval_loss": 0.5917235612869263,
"eval_runtime": 212.9936,
"eval_samples_per_second": 89.223,
"eval_steps_per_second": 1.859,
"step": 12100
},
{
"epoch": 23.546511627906977,
"grad_norm": 1.4022223949432373,
"learning_rate": 0.00010582170542635659,
"loss": 0.6073113250732421,
"step": 12150
},
{
"epoch": 23.546511627906977,
"eval_loss": 0.5900039672851562,
"eval_runtime": 213.2849,
"eval_samples_per_second": 89.101,
"eval_steps_per_second": 1.857,
"step": 12150
},
{
"epoch": 23.643410852713178,
"grad_norm": 1.0619962215423584,
"learning_rate": 0.00010543410852713179,
"loss": 0.5950005722045898,
"step": 12200
},
{
"epoch": 23.643410852713178,
"eval_loss": 0.6011573076248169,
"eval_runtime": 212.3832,
"eval_samples_per_second": 89.48,
"eval_steps_per_second": 1.865,
"step": 12200
},
{
"epoch": 23.74031007751938,
"grad_norm": 1.211785912513733,
"learning_rate": 0.00010504651162790698,
"loss": 0.5973508834838868,
"step": 12250
},
{
"epoch": 23.74031007751938,
"eval_loss": 0.5984842777252197,
"eval_runtime": 212.6764,
"eval_samples_per_second": 89.356,
"eval_steps_per_second": 1.862,
"step": 12250
},
{
"epoch": 23.837209302325583,
"grad_norm": 1.1594852209091187,
"learning_rate": 0.00010465891472868216,
"loss": 0.5971969223022461,
"step": 12300
},
{
"epoch": 23.837209302325583,
"eval_loss": 0.5884484052658081,
"eval_runtime": 211.7184,
"eval_samples_per_second": 89.761,
"eval_steps_per_second": 1.87,
"step": 12300
},
{
"epoch": 23.934108527131784,
"grad_norm": 0.8441356420516968,
"learning_rate": 0.00010427131782945736,
"loss": 0.5886388397216797,
"step": 12350
},
{
"epoch": 23.934108527131784,
"eval_loss": 0.595369815826416,
"eval_runtime": 202.9896,
"eval_samples_per_second": 93.621,
"eval_steps_per_second": 1.951,
"step": 12350
},
{
"epoch": 24.031007751937985,
"grad_norm": 0.9786841869354248,
"learning_rate": 0.00010388372093023255,
"loss": 0.6087466812133789,
"step": 12400
},
{
"epoch": 24.031007751937985,
"eval_loss": 0.5952097177505493,
"eval_runtime": 211.1496,
"eval_samples_per_second": 90.003,
"eval_steps_per_second": 1.875,
"step": 12400
},
{
"epoch": 24.127906976744185,
"grad_norm": 1.0507079362869263,
"learning_rate": 0.00010349612403100774,
"loss": 0.5952305221557617,
"step": 12450
},
{
"epoch": 24.127906976744185,
"eval_loss": 0.5925743579864502,
"eval_runtime": 211.9219,
"eval_samples_per_second": 89.675,
"eval_steps_per_second": 1.869,
"step": 12450
},
{
"epoch": 24.224806201550386,
"grad_norm": 1.152092456817627,
"learning_rate": 0.00010310852713178296,
"loss": 0.6006411743164063,
"step": 12500
},
{
"epoch": 24.224806201550386,
"eval_loss": 0.591440737247467,
"eval_runtime": 210.0005,
"eval_samples_per_second": 90.495,
"eval_steps_per_second": 1.886,
"step": 12500
},
{
"epoch": 24.32170542635659,
"grad_norm": 1.2130184173583984,
"learning_rate": 0.00010272093023255816,
"loss": 0.5954698944091796,
"step": 12550
},
{
"epoch": 24.32170542635659,
"eval_loss": 0.5851362943649292,
"eval_runtime": 214.348,
"eval_samples_per_second": 88.66,
"eval_steps_per_second": 1.847,
"step": 12550
},
{
"epoch": 24.41860465116279,
"grad_norm": 1.0356969833374023,
"learning_rate": 0.00010233333333333335,
"loss": 0.5853539657592773,
"step": 12600
},
{
"epoch": 24.41860465116279,
"eval_loss": 0.5944467186927795,
"eval_runtime": 215.7672,
"eval_samples_per_second": 88.076,
"eval_steps_per_second": 1.835,
"step": 12600
},
{
"epoch": 24.515503875968992,
"grad_norm": 0.9699698686599731,
"learning_rate": 0.00010194573643410855,
"loss": 0.5965142440795899,
"step": 12650
},
{
"epoch": 24.515503875968992,
"eval_loss": 0.5891871452331543,
"eval_runtime": 208.592,
"eval_samples_per_second": 91.106,
"eval_steps_per_second": 1.898,
"step": 12650
},
{
"epoch": 24.612403100775193,
"grad_norm": 1.301386833190918,
"learning_rate": 0.00010155813953488373,
"loss": 0.6103283309936524,
"step": 12700
},
{
"epoch": 24.612403100775193,
"eval_loss": 0.5880603194236755,
"eval_runtime": 211.0333,
"eval_samples_per_second": 90.052,
"eval_steps_per_second": 1.876,
"step": 12700
},
{
"epoch": 24.709302325581394,
"grad_norm": 1.274060606956482,
"learning_rate": 0.00010117054263565892,
"loss": 0.5870831298828125,
"step": 12750
},
{
"epoch": 24.709302325581394,
"eval_loss": 0.5992385149002075,
"eval_runtime": 209.409,
"eval_samples_per_second": 90.751,
"eval_steps_per_second": 1.891,
"step": 12750
},
{
"epoch": 24.8062015503876,
"grad_norm": 1.1430360078811646,
"learning_rate": 0.00010078294573643412,
"loss": 0.5938863754272461,
"step": 12800
},
{
"epoch": 24.8062015503876,
"eval_loss": 0.5917322039604187,
"eval_runtime": 219.3042,
"eval_samples_per_second": 86.656,
"eval_steps_per_second": 1.806,
"step": 12800
},
{
"epoch": 24.9031007751938,
"grad_norm": 0.9088590741157532,
"learning_rate": 0.00010039534883720931,
"loss": 0.5874351119995117,
"step": 12850
},
{
"epoch": 24.9031007751938,
"eval_loss": 0.5859392881393433,
"eval_runtime": 220.2194,
"eval_samples_per_second": 86.296,
"eval_steps_per_second": 1.798,
"step": 12850
},
{
"epoch": 25.0,
"grad_norm": 1.2513511180877686,
"learning_rate": 0.0001000077519379845,
"loss": 0.5945474243164063,
"step": 12900
},
{
"epoch": 25.0,
"eval_loss": 0.5825695395469666,
"eval_runtime": 216.9535,
"eval_samples_per_second": 87.595,
"eval_steps_per_second": 1.825,
"step": 12900
},
{
"epoch": 25.0968992248062,
"grad_norm": 1.0160269737243652,
"learning_rate": 9.96201550387597e-05,
"loss": 0.5680616760253906,
"step": 12950
},
{
"epoch": 25.0968992248062,
"eval_loss": 0.5853219032287598,
"eval_runtime": 222.4015,
"eval_samples_per_second": 85.449,
"eval_steps_per_second": 1.781,
"step": 12950
},
{
"epoch": 25.1937984496124,
"grad_norm": 1.0439985990524292,
"learning_rate": 9.923255813953489e-05,
"loss": 0.6003565979003906,
"step": 13000
},
{
"epoch": 25.1937984496124,
"eval_loss": 0.5902218818664551,
"eval_runtime": 226.1714,
"eval_samples_per_second": 84.025,
"eval_steps_per_second": 1.751,
"step": 13000
},
{
"epoch": 25.290697674418606,
"grad_norm": 1.161298394203186,
"learning_rate": 9.884496124031009e-05,
"loss": 0.59041748046875,
"step": 13050
},
{
"epoch": 25.290697674418606,
"eval_loss": 0.5912677645683289,
"eval_runtime": 227.5252,
"eval_samples_per_second": 83.525,
"eval_steps_per_second": 1.74,
"step": 13050
},
{
"epoch": 25.387596899224807,
"grad_norm": 1.0510302782058716,
"learning_rate": 9.845736434108528e-05,
"loss": 0.5883547973632812,
"step": 13100
},
{
"epoch": 25.387596899224807,
"eval_loss": 0.5912774801254272,
"eval_runtime": 230.7864,
"eval_samples_per_second": 82.345,
"eval_steps_per_second": 1.716,
"step": 13100
},
{
"epoch": 25.484496124031008,
"grad_norm": 1.0604901313781738,
"learning_rate": 9.806976744186047e-05,
"loss": 0.5883957290649414,
"step": 13150
},
{
"epoch": 25.484496124031008,
"eval_loss": 0.5866997838020325,
"eval_runtime": 215.6859,
"eval_samples_per_second": 88.11,
"eval_steps_per_second": 1.836,
"step": 13150
},
{
"epoch": 25.58139534883721,
"grad_norm": 0.931657075881958,
"learning_rate": 9.768217054263567e-05,
"loss": 0.5975632858276367,
"step": 13200
},
{
"epoch": 25.58139534883721,
"eval_loss": 0.5827761888504028,
"eval_runtime": 222.4474,
"eval_samples_per_second": 85.431,
"eval_steps_per_second": 1.78,
"step": 13200
},
{
"epoch": 25.67829457364341,
"grad_norm": 0.9771053194999695,
"learning_rate": 9.729457364341086e-05,
"loss": 0.5864889907836914,
"step": 13250
},
{
"epoch": 25.67829457364341,
"eval_loss": 0.5907730460166931,
"eval_runtime": 226.3262,
"eval_samples_per_second": 83.967,
"eval_steps_per_second": 1.75,
"step": 13250
},
{
"epoch": 25.775193798449614,
"grad_norm": 1.1834367513656616,
"learning_rate": 9.690697674418606e-05,
"loss": 0.5737375259399414,
"step": 13300
},
{
"epoch": 25.775193798449614,
"eval_loss": 0.5830610394477844,
"eval_runtime": 234.6307,
"eval_samples_per_second": 80.995,
"eval_steps_per_second": 1.688,
"step": 13300
},
{
"epoch": 25.872093023255815,
"grad_norm": 1.618025302886963,
"learning_rate": 9.651937984496125e-05,
"loss": 0.582273292541504,
"step": 13350
},
{
"epoch": 25.872093023255815,
"eval_loss": 0.5806219577789307,
"eval_runtime": 196.4825,
"eval_samples_per_second": 96.721,
"eval_steps_per_second": 2.015,
"step": 13350
},
{
"epoch": 25.968992248062015,
"grad_norm": 1.1636735200881958,
"learning_rate": 9.613178294573643e-05,
"loss": 0.5907667541503906,
"step": 13400
},
{
"epoch": 25.968992248062015,
"eval_loss": 0.5851462483406067,
"eval_runtime": 203.9634,
"eval_samples_per_second": 93.174,
"eval_steps_per_second": 1.942,
"step": 13400
},
{
"epoch": 26.065891472868216,
"grad_norm": 1.0781770944595337,
"learning_rate": 9.574418604651164e-05,
"loss": 0.5699609756469727,
"step": 13450
},
{
"epoch": 26.065891472868216,
"eval_loss": 0.586871862411499,
"eval_runtime": 233.0709,
"eval_samples_per_second": 81.537,
"eval_steps_per_second": 1.699,
"step": 13450
},
{
"epoch": 26.162790697674417,
"grad_norm": 1.005245327949524,
"learning_rate": 9.535658914728682e-05,
"loss": 0.5884146499633789,
"step": 13500
},
{
"epoch": 26.162790697674417,
"eval_loss": 0.5832611322402954,
"eval_runtime": 210.0296,
"eval_samples_per_second": 90.482,
"eval_steps_per_second": 1.885,
"step": 13500
},
{
"epoch": 26.25968992248062,
"grad_norm": 0.9500683546066284,
"learning_rate": 9.496899224806201e-05,
"loss": 0.5716347122192382,
"step": 13550
},
{
"epoch": 26.25968992248062,
"eval_loss": 0.5889795422554016,
"eval_runtime": 218.3363,
"eval_samples_per_second": 87.04,
"eval_steps_per_second": 1.814,
"step": 13550
},
{
"epoch": 26.356589147286822,
"grad_norm": 0.9411081075668335,
"learning_rate": 9.458139534883721e-05,
"loss": 0.574790267944336,
"step": 13600
},
{
"epoch": 26.356589147286822,
"eval_loss": 0.5878632068634033,
"eval_runtime": 203.8131,
"eval_samples_per_second": 93.242,
"eval_steps_per_second": 1.943,
"step": 13600
},
{
"epoch": 26.453488372093023,
"grad_norm": 1.0316822528839111,
"learning_rate": 9.41937984496124e-05,
"loss": 0.5760750198364257,
"step": 13650
},
{
"epoch": 26.453488372093023,
"eval_loss": 0.5763137340545654,
"eval_runtime": 179.5252,
"eval_samples_per_second": 105.857,
"eval_steps_per_second": 2.206,
"step": 13650
},
{
"epoch": 26.550387596899224,
"grad_norm": 1.0404475927352905,
"learning_rate": 9.38062015503876e-05,
"loss": 0.5735264587402343,
"step": 13700
},
{
"epoch": 26.550387596899224,
"eval_loss": 0.5846853852272034,
"eval_runtime": 189.6159,
"eval_samples_per_second": 100.224,
"eval_steps_per_second": 2.088,
"step": 13700
},
{
"epoch": 26.647286821705425,
"grad_norm": 0.9884878993034363,
"learning_rate": 9.34186046511628e-05,
"loss": 0.5820446014404297,
"step": 13750
},
{
"epoch": 26.647286821705425,
"eval_loss": 0.5772427916526794,
"eval_runtime": 202.492,
"eval_samples_per_second": 93.851,
"eval_steps_per_second": 1.956,
"step": 13750
},
{
"epoch": 26.74418604651163,
"grad_norm": 1.220083475112915,
"learning_rate": 9.303100775193799e-05,
"loss": 0.5717597579956055,
"step": 13800
},
{
"epoch": 26.74418604651163,
"eval_loss": 0.5792003273963928,
"eval_runtime": 233.5216,
"eval_samples_per_second": 81.38,
"eval_steps_per_second": 1.696,
"step": 13800
},
{
"epoch": 26.84108527131783,
"grad_norm": 1.0216156244277954,
"learning_rate": 9.264341085271318e-05,
"loss": 0.5711225509643555,
"step": 13850
},
{
"epoch": 26.84108527131783,
"eval_loss": 0.5813571810722351,
"eval_runtime": 216.8193,
"eval_samples_per_second": 87.649,
"eval_steps_per_second": 1.826,
"step": 13850
},
{
"epoch": 26.93798449612403,
"grad_norm": 0.9908558130264282,
"learning_rate": 9.225581395348838e-05,
"loss": 0.5861880493164062,
"step": 13900
},
{
"epoch": 26.93798449612403,
"eval_loss": 0.5844830274581909,
"eval_runtime": 206.9257,
"eval_samples_per_second": 91.84,
"eval_steps_per_second": 1.914,
"step": 13900
},
{
"epoch": 27.03488372093023,
"grad_norm": 0.9165576696395874,
"learning_rate": 9.186821705426357e-05,
"loss": 0.5591924285888672,
"step": 13950
},
{
"epoch": 27.03488372093023,
"eval_loss": 0.5829132199287415,
"eval_runtime": 211.5927,
"eval_samples_per_second": 89.814,
"eval_steps_per_second": 1.872,
"step": 13950
},
{
"epoch": 27.131782945736433,
"grad_norm": 1.1759886741638184,
"learning_rate": 9.148062015503877e-05,
"loss": 0.5728300476074218,
"step": 14000
},
{
"epoch": 27.131782945736433,
"eval_loss": 0.5667384266853333,
"eval_runtime": 190.6612,
"eval_samples_per_second": 99.674,
"eval_steps_per_second": 2.077,
"step": 14000
},
{
"epoch": 27.228682170542637,
"grad_norm": 1.1444036960601807,
"learning_rate": 9.109302325581396e-05,
"loss": 0.5587886810302735,
"step": 14050
},
{
"epoch": 27.228682170542637,
"eval_loss": 0.5758188962936401,
"eval_runtime": 190.6437,
"eval_samples_per_second": 99.683,
"eval_steps_per_second": 2.077,
"step": 14050
},
{
"epoch": 27.325581395348838,
"grad_norm": 1.031416893005371,
"learning_rate": 9.070542635658915e-05,
"loss": 0.5681009674072266,
"step": 14100
},
{
"epoch": 27.325581395348838,
"eval_loss": 0.5733669996261597,
"eval_runtime": 188.284,
"eval_samples_per_second": 100.933,
"eval_steps_per_second": 2.103,
"step": 14100
},
{
"epoch": 27.42248062015504,
"grad_norm": 1.0156781673431396,
"learning_rate": 9.031782945736435e-05,
"loss": 0.5718238830566407,
"step": 14150
},
{
"epoch": 27.42248062015504,
"eval_loss": 0.5743637681007385,
"eval_runtime": 188.8285,
"eval_samples_per_second": 100.642,
"eval_steps_per_second": 2.097,
"step": 14150
},
{
"epoch": 27.51937984496124,
"grad_norm": 1.0280048847198486,
"learning_rate": 8.993023255813954e-05,
"loss": 0.5754650497436523,
"step": 14200
},
{
"epoch": 27.51937984496124,
"eval_loss": 0.5855010151863098,
"eval_runtime": 182.9128,
"eval_samples_per_second": 103.897,
"eval_steps_per_second": 2.165,
"step": 14200
},
{
"epoch": 27.61627906976744,
"grad_norm": 1.0390145778656006,
"learning_rate": 8.954263565891474e-05,
"loss": 0.5721881103515625,
"step": 14250
},
{
"epoch": 27.61627906976744,
"eval_loss": 0.5711118578910828,
"eval_runtime": 184.1208,
"eval_samples_per_second": 103.215,
"eval_steps_per_second": 2.151,
"step": 14250
},
{
"epoch": 27.713178294573645,
"grad_norm": 1.0992286205291748,
"learning_rate": 8.915503875968993e-05,
"loss": 0.5777717208862305,
"step": 14300
},
{
"epoch": 27.713178294573645,
"eval_loss": 0.5845617651939392,
"eval_runtime": 183.5085,
"eval_samples_per_second": 103.559,
"eval_steps_per_second": 2.158,
"step": 14300
},
{
"epoch": 27.810077519379846,
"grad_norm": 1.0041630268096924,
"learning_rate": 8.876744186046511e-05,
"loss": 0.5623219680786132,
"step": 14350
},
{
"epoch": 27.810077519379846,
"eval_loss": 0.5710008144378662,
"eval_runtime": 184.7548,
"eval_samples_per_second": 102.861,
"eval_steps_per_second": 2.143,
"step": 14350
},
{
"epoch": 27.906976744186046,
"grad_norm": 0.8692350387573242,
"learning_rate": 8.837984496124031e-05,
"loss": 0.555379638671875,
"step": 14400
},
{
"epoch": 27.906976744186046,
"eval_loss": 0.5671586990356445,
"eval_runtime": 188.3383,
"eval_samples_per_second": 100.904,
"eval_steps_per_second": 2.103,
"step": 14400
},
{
"epoch": 28.003875968992247,
"grad_norm": 1.0534723997116089,
"learning_rate": 8.79922480620155e-05,
"loss": 0.5748031997680664,
"step": 14450
},
{
"epoch": 28.003875968992247,
"eval_loss": 0.5694092512130737,
"eval_runtime": 200.2532,
"eval_samples_per_second": 94.9,
"eval_steps_per_second": 1.977,
"step": 14450
},
{
"epoch": 28.100775193798448,
"grad_norm": 0.8484736680984497,
"learning_rate": 8.76046511627907e-05,
"loss": 0.5692879486083985,
"step": 14500
},
{
"epoch": 28.100775193798448,
"eval_loss": 0.5734642744064331,
"eval_runtime": 198.9989,
"eval_samples_per_second": 95.498,
"eval_steps_per_second": 1.99,
"step": 14500
},
{
"epoch": 28.197674418604652,
"grad_norm": 1.0161453485488892,
"learning_rate": 8.72170542635659e-05,
"loss": 0.5690762710571289,
"step": 14550
},
{
"epoch": 28.197674418604652,
"eval_loss": 0.5682703852653503,
"eval_runtime": 192.8935,
"eval_samples_per_second": 98.521,
"eval_steps_per_second": 2.053,
"step": 14550
},
{
"epoch": 28.294573643410853,
"grad_norm": 1.1991984844207764,
"learning_rate": 8.68294573643411e-05,
"loss": 0.571408805847168,
"step": 14600
},
{
"epoch": 28.294573643410853,
"eval_loss": 0.580035388469696,
"eval_runtime": 192.3228,
"eval_samples_per_second": 98.813,
"eval_steps_per_second": 2.059,
"step": 14600
},
{
"epoch": 28.391472868217054,
"grad_norm": 1.0290623903274536,
"learning_rate": 8.644186046511628e-05,
"loss": 0.5644237136840821,
"step": 14650
},
{
"epoch": 28.391472868217054,
"eval_loss": 0.5756661295890808,
"eval_runtime": 201.1784,
"eval_samples_per_second": 94.463,
"eval_steps_per_second": 1.968,
"step": 14650
},
{
"epoch": 28.488372093023255,
"grad_norm": 1.0187848806381226,
"learning_rate": 8.605426356589148e-05,
"loss": 0.5739951324462891,
"step": 14700
},
{
"epoch": 28.488372093023255,
"eval_loss": 0.5649608373641968,
"eval_runtime": 204.5004,
"eval_samples_per_second": 92.929,
"eval_steps_per_second": 1.936,
"step": 14700
},
{
"epoch": 28.585271317829456,
"grad_norm": 1.0154606103897095,
"learning_rate": 8.566666666666667e-05,
"loss": 0.5649435806274414,
"step": 14750
},
{
"epoch": 28.585271317829456,
"eval_loss": 0.574077308177948,
"eval_runtime": 199.9375,
"eval_samples_per_second": 95.05,
"eval_steps_per_second": 1.981,
"step": 14750
},
{
"epoch": 28.68217054263566,
"grad_norm": 0.9040335416793823,
"learning_rate": 8.527906976744187e-05,
"loss": 0.5609535980224609,
"step": 14800
},
{
"epoch": 28.68217054263566,
"eval_loss": 0.5767226219177246,
"eval_runtime": 197.5815,
"eval_samples_per_second": 96.183,
"eval_steps_per_second": 2.004,
"step": 14800
},
{
"epoch": 28.77906976744186,
"grad_norm": 1.085545301437378,
"learning_rate": 8.489147286821706e-05,
"loss": 0.5725581359863281,
"step": 14850
},
{
"epoch": 28.77906976744186,
"eval_loss": 0.5696209073066711,
"eval_runtime": 199.5241,
"eval_samples_per_second": 95.247,
"eval_steps_per_second": 1.985,
"step": 14850
},
{
"epoch": 28.875968992248062,
"grad_norm": 0.8943235278129578,
"learning_rate": 8.450387596899225e-05,
"loss": 0.555779151916504,
"step": 14900
},
{
"epoch": 28.875968992248062,
"eval_loss": 0.5655143857002258,
"eval_runtime": 196.7618,
"eval_samples_per_second": 96.584,
"eval_steps_per_second": 2.013,
"step": 14900
},
{
"epoch": 28.972868217054263,
"grad_norm": 1.4611607789993286,
"learning_rate": 8.411627906976745e-05,
"loss": 0.553552131652832,
"step": 14950
},
{
"epoch": 28.972868217054263,
"eval_loss": 0.5691696405410767,
"eval_runtime": 199.1398,
"eval_samples_per_second": 95.43,
"eval_steps_per_second": 1.989,
"step": 14950
},
{
"epoch": 29.069767441860463,
"grad_norm": 1.0959761142730713,
"learning_rate": 8.372868217054264e-05,
"loss": 0.5541238021850586,
"step": 15000
},
{
"epoch": 29.069767441860463,
"eval_loss": 0.5719203352928162,
"eval_runtime": 199.5507,
"eval_samples_per_second": 95.234,
"eval_steps_per_second": 1.984,
"step": 15000
},
{
"epoch": 29.166666666666668,
"grad_norm": 1.0363779067993164,
"learning_rate": 8.334108527131783e-05,
"loss": 0.5359441375732422,
"step": 15050
},
{
"epoch": 29.166666666666668,
"eval_loss": 0.5621792674064636,
"eval_runtime": 199.7624,
"eval_samples_per_second": 95.133,
"eval_steps_per_second": 1.982,
"step": 15050
},
{
"epoch": 29.26356589147287,
"grad_norm": 1.0751618146896362,
"learning_rate": 8.295348837209303e-05,
"loss": 0.5572662734985352,
"step": 15100
},
{
"epoch": 29.26356589147287,
"eval_loss": 0.5694544315338135,
"eval_runtime": 201.8006,
"eval_samples_per_second": 94.172,
"eval_steps_per_second": 1.962,
"step": 15100
},
{
"epoch": 29.36046511627907,
"grad_norm": 0.8606336116790771,
"learning_rate": 8.256589147286822e-05,
"loss": 0.565627555847168,
"step": 15150
},
{
"epoch": 29.36046511627907,
"eval_loss": 0.5655621886253357,
"eval_runtime": 202.5083,
"eval_samples_per_second": 93.843,
"eval_steps_per_second": 1.955,
"step": 15150
},
{
"epoch": 29.45736434108527,
"grad_norm": 1.1224424839019775,
"learning_rate": 8.217829457364342e-05,
"loss": 0.556108627319336,
"step": 15200
},
{
"epoch": 29.45736434108527,
"eval_loss": 0.5636035799980164,
"eval_runtime": 197.0694,
"eval_samples_per_second": 96.433,
"eval_steps_per_second": 2.009,
"step": 15200
},
{
"epoch": 29.55426356589147,
"grad_norm": 0.8912738561630249,
"learning_rate": 8.17906976744186e-05,
"loss": 0.5589240264892578,
"step": 15250
},
{
"epoch": 29.55426356589147,
"eval_loss": 0.5628049969673157,
"eval_runtime": 193.2666,
"eval_samples_per_second": 98.33,
"eval_steps_per_second": 2.049,
"step": 15250
},
{
"epoch": 29.651162790697676,
"grad_norm": 1.063519835472107,
"learning_rate": 8.140310077519379e-05,
"loss": 0.5502983474731445,
"step": 15300
},
{
"epoch": 29.651162790697676,
"eval_loss": 0.5709092617034912,
"eval_runtime": 188.55,
"eval_samples_per_second": 100.79,
"eval_steps_per_second": 2.1,
"step": 15300
},
{
"epoch": 29.748062015503876,
"grad_norm": 1.1637099981307983,
"learning_rate": 8.101550387596901e-05,
"loss": 0.5621489715576172,
"step": 15350
},
{
"epoch": 29.748062015503876,
"eval_loss": 0.5655869245529175,
"eval_runtime": 194.2628,
"eval_samples_per_second": 97.826,
"eval_steps_per_second": 2.038,
"step": 15350
},
{
"epoch": 29.844961240310077,
"grad_norm": 1.0051424503326416,
"learning_rate": 8.06279069767442e-05,
"loss": 0.5704508972167969,
"step": 15400
},
{
"epoch": 29.844961240310077,
"eval_loss": 0.5648623704910278,
"eval_runtime": 195.0452,
"eval_samples_per_second": 97.434,
"eval_steps_per_second": 2.03,
"step": 15400
},
{
"epoch": 29.941860465116278,
"grad_norm": 0.9984987378120422,
"learning_rate": 8.024031007751938e-05,
"loss": 0.5505734634399414,
"step": 15450
},
{
"epoch": 29.941860465116278,
"eval_loss": 0.5613234043121338,
"eval_runtime": 188.1798,
"eval_samples_per_second": 100.989,
"eval_steps_per_second": 2.104,
"step": 15450
},
{
"epoch": 30.03875968992248,
"grad_norm": 1.1080302000045776,
"learning_rate": 7.985271317829459e-05,
"loss": 0.5553411102294922,
"step": 15500
},
{
"epoch": 30.03875968992248,
"eval_loss": 0.5688788890838623,
"eval_runtime": 190.9499,
"eval_samples_per_second": 99.524,
"eval_steps_per_second": 2.074,
"step": 15500
},
{
"epoch": 30.135658914728683,
"grad_norm": 1.1303520202636719,
"learning_rate": 7.946511627906977e-05,
"loss": 0.5628437042236328,
"step": 15550
},
{
"epoch": 30.135658914728683,
"eval_loss": 0.569452166557312,
"eval_runtime": 194.8232,
"eval_samples_per_second": 97.545,
"eval_steps_per_second": 2.033,
"step": 15550
},
{
"epoch": 30.232558139534884,
"grad_norm": 0.8545628190040588,
"learning_rate": 7.907751937984496e-05,
"loss": 0.5434987258911133,
"step": 15600
},
{
"epoch": 30.232558139534884,
"eval_loss": 0.5680402517318726,
"eval_runtime": 192.2391,
"eval_samples_per_second": 98.856,
"eval_steps_per_second": 2.06,
"step": 15600
},
{
"epoch": 30.329457364341085,
"grad_norm": 0.9640957713127136,
"learning_rate": 7.868992248062016e-05,
"loss": 0.5405253219604492,
"step": 15650
},
{
"epoch": 30.329457364341085,
"eval_loss": 0.5530641674995422,
"eval_runtime": 199.6418,
"eval_samples_per_second": 95.191,
"eval_steps_per_second": 1.984,
"step": 15650
},
{
"epoch": 30.426356589147286,
"grad_norm": 1.1416120529174805,
"learning_rate": 7.830232558139535e-05,
"loss": 0.5548144912719727,
"step": 15700
},
{
"epoch": 30.426356589147286,
"eval_loss": 0.5669378638267517,
"eval_runtime": 198.882,
"eval_samples_per_second": 95.554,
"eval_steps_per_second": 1.991,
"step": 15700
},
{
"epoch": 30.52325581395349,
"grad_norm": 1.0198075771331787,
"learning_rate": 7.791472868217055e-05,
"loss": 0.5592168045043945,
"step": 15750
},
{
"epoch": 30.52325581395349,
"eval_loss": 0.5626012086868286,
"eval_runtime": 197.3119,
"eval_samples_per_second": 96.315,
"eval_steps_per_second": 2.007,
"step": 15750
},
{
"epoch": 30.62015503875969,
"grad_norm": 1.0227652788162231,
"learning_rate": 7.752713178294574e-05,
"loss": 0.5549029922485351,
"step": 15800
},
{
"epoch": 30.62015503875969,
"eval_loss": 0.5600801706314087,
"eval_runtime": 192.5035,
"eval_samples_per_second": 98.72,
"eval_steps_per_second": 2.057,
"step": 15800
},
{
"epoch": 30.717054263565892,
"grad_norm": 1.0571966171264648,
"learning_rate": 7.713953488372093e-05,
"loss": 0.5535205841064453,
"step": 15850
},
{
"epoch": 30.717054263565892,
"eval_loss": 0.554684579372406,
"eval_runtime": 201.6318,
"eval_samples_per_second": 94.251,
"eval_steps_per_second": 1.964,
"step": 15850
},
{
"epoch": 30.813953488372093,
"grad_norm": 1.0247442722320557,
"learning_rate": 7.675193798449613e-05,
"loss": 0.5452067184448243,
"step": 15900
},
{
"epoch": 30.813953488372093,
"eval_loss": 0.5608499646186829,
"eval_runtime": 203.6665,
"eval_samples_per_second": 93.309,
"eval_steps_per_second": 1.944,
"step": 15900
},
{
"epoch": 30.910852713178294,
"grad_norm": 1.0259183645248413,
"learning_rate": 7.636434108527132e-05,
"loss": 0.5513217544555664,
"step": 15950
},
{
"epoch": 30.910852713178294,
"eval_loss": 0.5694592595100403,
"eval_runtime": 204.0706,
"eval_samples_per_second": 93.125,
"eval_steps_per_second": 1.941,
"step": 15950
},
{
"epoch": 31.007751937984494,
"grad_norm": 0.9598883390426636,
"learning_rate": 7.59767441860465e-05,
"loss": 0.5515378952026367,
"step": 16000
},
{
"epoch": 31.007751937984494,
"eval_loss": 0.5637879967689514,
"eval_runtime": 206.5294,
"eval_samples_per_second": 92.016,
"eval_steps_per_second": 1.917,
"step": 16000
},
{
"epoch": 31.1046511627907,
"grad_norm": 1.090155005455017,
"learning_rate": 7.55891472868217e-05,
"loss": 0.5438331985473632,
"step": 16050
},
{
"epoch": 31.1046511627907,
"eval_loss": 0.56569504737854,
"eval_runtime": 204.4685,
"eval_samples_per_second": 92.943,
"eval_steps_per_second": 1.937,
"step": 16050
},
{
"epoch": 31.2015503875969,
"grad_norm": 1.0309280157089233,
"learning_rate": 7.52015503875969e-05,
"loss": 0.5518190383911132,
"step": 16100
},
{
"epoch": 31.2015503875969,
"eval_loss": 0.5640277862548828,
"eval_runtime": 207.3379,
"eval_samples_per_second": 91.657,
"eval_steps_per_second": 1.91,
"step": 16100
},
{
"epoch": 31.2984496124031,
"grad_norm": 1.0093274116516113,
"learning_rate": 7.48139534883721e-05,
"loss": 0.5497930145263672,
"step": 16150
},
{
"epoch": 31.2984496124031,
"eval_loss": 0.5667787790298462,
"eval_runtime": 205.5829,
"eval_samples_per_second": 92.44,
"eval_steps_per_second": 1.926,
"step": 16150
},
{
"epoch": 31.3953488372093,
"grad_norm": 0.9880069494247437,
"learning_rate": 7.44263565891473e-05,
"loss": 0.54766357421875,
"step": 16200
},
{
"epoch": 31.3953488372093,
"eval_loss": 0.5575990676879883,
"eval_runtime": 206.3994,
"eval_samples_per_second": 92.074,
"eval_steps_per_second": 1.919,
"step": 16200
},
{
"epoch": 31.492248062015506,
"grad_norm": 1.0106197595596313,
"learning_rate": 7.403875968992249e-05,
"loss": 0.5469408416748047,
"step": 16250
},
{
"epoch": 31.492248062015506,
"eval_loss": 0.562467098236084,
"eval_runtime": 204.9034,
"eval_samples_per_second": 92.746,
"eval_steps_per_second": 1.933,
"step": 16250
},
{
"epoch": 31.589147286821706,
"grad_norm": 1.035379409790039,
"learning_rate": 7.365116279069769e-05,
"loss": 0.562320556640625,
"step": 16300
},
{
"epoch": 31.589147286821706,
"eval_loss": 0.5606642365455627,
"eval_runtime": 202.8477,
"eval_samples_per_second": 93.686,
"eval_steps_per_second": 1.952,
"step": 16300
},
{
"epoch": 31.686046511627907,
"grad_norm": 1.2975516319274902,
"learning_rate": 7.326356589147288e-05,
"loss": 0.5471981430053711,
"step": 16350
},
{
"epoch": 31.686046511627907,
"eval_loss": 0.5522753000259399,
"eval_runtime": 203.8634,
"eval_samples_per_second": 93.219,
"eval_steps_per_second": 1.942,
"step": 16350
},
{
"epoch": 31.782945736434108,
"grad_norm": 0.9750486016273499,
"learning_rate": 7.287596899224806e-05,
"loss": 0.5367609024047851,
"step": 16400
},
{
"epoch": 31.782945736434108,
"eval_loss": 0.5535518527030945,
"eval_runtime": 196.989,
"eval_samples_per_second": 96.472,
"eval_steps_per_second": 2.01,
"step": 16400
},
{
"epoch": 31.87984496124031,
"grad_norm": 1.2616281509399414,
"learning_rate": 7.248837209302326e-05,
"loss": 0.5479264068603515,
"step": 16450
},
{
"epoch": 31.87984496124031,
"eval_loss": 0.5599262118339539,
"eval_runtime": 195.7368,
"eval_samples_per_second": 97.09,
"eval_steps_per_second": 2.023,
"step": 16450
},
{
"epoch": 31.97674418604651,
"grad_norm": 1.0327950716018677,
"learning_rate": 7.210077519379845e-05,
"loss": 0.5482368469238281,
"step": 16500
},
{
"epoch": 31.97674418604651,
"eval_loss": 0.5533180832862854,
"eval_runtime": 195.5573,
"eval_samples_per_second": 97.179,
"eval_steps_per_second": 2.025,
"step": 16500
},
{
"epoch": 32.07364341085271,
"grad_norm": 1.1047241687774658,
"learning_rate": 7.171317829457364e-05,
"loss": 0.5321396636962891,
"step": 16550
},
{
"epoch": 32.07364341085271,
"eval_loss": 0.5518382787704468,
"eval_runtime": 189.5095,
"eval_samples_per_second": 100.28,
"eval_steps_per_second": 2.09,
"step": 16550
},
{
"epoch": 32.17054263565891,
"grad_norm": 1.079679250717163,
"learning_rate": 7.132558139534884e-05,
"loss": 0.5443602752685547,
"step": 16600
},
{
"epoch": 32.17054263565891,
"eval_loss": 0.5624197125434875,
"eval_runtime": 184.8407,
"eval_samples_per_second": 102.813,
"eval_steps_per_second": 2.142,
"step": 16600
},
{
"epoch": 32.26744186046512,
"grad_norm": 1.32503080368042,
"learning_rate": 7.093798449612403e-05,
"loss": 0.5390607070922852,
"step": 16650
},
{
"epoch": 32.26744186046512,
"eval_loss": 0.5524035692214966,
"eval_runtime": 190.1126,
"eval_samples_per_second": 99.962,
"eval_steps_per_second": 2.083,
"step": 16650
},
{
"epoch": 32.36434108527132,
"grad_norm": 0.9187719821929932,
"learning_rate": 7.055038759689923e-05,
"loss": 0.5373795700073242,
"step": 16700
},
{
"epoch": 32.36434108527132,
"eval_loss": 0.557758629322052,
"eval_runtime": 189.2424,
"eval_samples_per_second": 100.421,
"eval_steps_per_second": 2.093,
"step": 16700
},
{
"epoch": 32.46124031007752,
"grad_norm": 0.8754042387008667,
"learning_rate": 7.016279069767442e-05,
"loss": 0.5403145217895508,
"step": 16750
},
{
"epoch": 32.46124031007752,
"eval_loss": 0.5573844313621521,
"eval_runtime": 185.4261,
"eval_samples_per_second": 102.488,
"eval_steps_per_second": 2.136,
"step": 16750
},
{
"epoch": 32.55813953488372,
"grad_norm": 0.896300196647644,
"learning_rate": 6.977519379844961e-05,
"loss": 0.5385972595214844,
"step": 16800
},
{
"epoch": 32.55813953488372,
"eval_loss": 0.5460941791534424,
"eval_runtime": 191.881,
"eval_samples_per_second": 99.041,
"eval_steps_per_second": 2.064,
"step": 16800
},
{
"epoch": 32.65503875968992,
"grad_norm": 0.8643621802330017,
"learning_rate": 6.938759689922481e-05,
"loss": 0.5400046157836914,
"step": 16850
},
{
"epoch": 32.65503875968992,
"eval_loss": 0.550368070602417,
"eval_runtime": 193.1616,
"eval_samples_per_second": 98.384,
"eval_steps_per_second": 2.05,
"step": 16850
},
{
"epoch": 32.751937984496124,
"grad_norm": 0.9831710457801819,
"learning_rate": 6.9e-05,
"loss": 0.5370166778564454,
"step": 16900
},
{
"epoch": 32.751937984496124,
"eval_loss": 0.5594847798347473,
"eval_runtime": 200.5934,
"eval_samples_per_second": 94.739,
"eval_steps_per_second": 1.974,
"step": 16900
},
{
"epoch": 32.848837209302324,
"grad_norm": 1.0108966827392578,
"learning_rate": 6.86124031007752e-05,
"loss": 0.5464648056030273,
"step": 16950
},
{
"epoch": 32.848837209302324,
"eval_loss": 0.5564253330230713,
"eval_runtime": 194.6839,
"eval_samples_per_second": 97.615,
"eval_steps_per_second": 2.034,
"step": 16950
},
{
"epoch": 32.945736434108525,
"grad_norm": 1.013802170753479,
"learning_rate": 6.82248062015504e-05,
"loss": 0.528292465209961,
"step": 17000
},
{
"epoch": 32.945736434108525,
"eval_loss": 0.554287314414978,
"eval_runtime": 196.927,
"eval_samples_per_second": 96.503,
"eval_steps_per_second": 2.011,
"step": 17000
},
{
"epoch": 33.042635658914726,
"grad_norm": 0.9317029118537903,
"learning_rate": 6.783720930232559e-05,
"loss": 0.5383699035644531,
"step": 17050
},
{
"epoch": 33.042635658914726,
"eval_loss": 0.5600277185440063,
"eval_runtime": 189.1325,
"eval_samples_per_second": 100.48,
"eval_steps_per_second": 2.094,
"step": 17050
},
{
"epoch": 33.13953488372093,
"grad_norm": 0.9573765397071838,
"learning_rate": 6.744961240310078e-05,
"loss": 0.525397605895996,
"step": 17100
},
{
"epoch": 33.13953488372093,
"eval_loss": 0.5511126518249512,
"eval_runtime": 190.6,
"eval_samples_per_second": 99.706,
"eval_steps_per_second": 2.078,
"step": 17100
},
{
"epoch": 33.236434108527135,
"grad_norm": 0.988450825214386,
"learning_rate": 6.706201550387598e-05,
"loss": 0.5410281753540039,
"step": 17150
},
{
"epoch": 33.236434108527135,
"eval_loss": 0.5519395470619202,
"eval_runtime": 177.0128,
"eval_samples_per_second": 107.359,
"eval_steps_per_second": 2.237,
"step": 17150
},
{
"epoch": 33.333333333333336,
"grad_norm": 1.0500494241714478,
"learning_rate": 6.667441860465117e-05,
"loss": 0.5410085678100586,
"step": 17200
},
{
"epoch": 33.333333333333336,
"eval_loss": 0.5577670335769653,
"eval_runtime": 191.3272,
"eval_samples_per_second": 99.327,
"eval_steps_per_second": 2.07,
"step": 17200
},
{
"epoch": 33.43023255813954,
"grad_norm": 1.2153853178024292,
"learning_rate": 6.628682170542637e-05,
"loss": 0.5333014678955078,
"step": 17250
},
{
"epoch": 33.43023255813954,
"eval_loss": 0.5504041314125061,
"eval_runtime": 192.5064,
"eval_samples_per_second": 98.719,
"eval_steps_per_second": 2.057,
"step": 17250
},
{
"epoch": 33.52713178294574,
"grad_norm": 1.2611991167068481,
"learning_rate": 6.589922480620155e-05,
"loss": 0.5398865509033203,
"step": 17300
},
{
"epoch": 33.52713178294574,
"eval_loss": 0.5494994521141052,
"eval_runtime": 194.486,
"eval_samples_per_second": 97.714,
"eval_steps_per_second": 2.036,
"step": 17300
},
{
"epoch": 33.62403100775194,
"grad_norm": 1.0226876735687256,
"learning_rate": 6.551162790697674e-05,
"loss": 0.5396590423583985,
"step": 17350
},
{
"epoch": 33.62403100775194,
"eval_loss": 0.5536758899688721,
"eval_runtime": 193.7265,
"eval_samples_per_second": 98.097,
"eval_steps_per_second": 2.044,
"step": 17350
},
{
"epoch": 33.72093023255814,
"grad_norm": 0.9030176401138306,
"learning_rate": 6.512403100775194e-05,
"loss": 0.5370613861083985,
"step": 17400
},
{
"epoch": 33.72093023255814,
"eval_loss": 0.549870491027832,
"eval_runtime": 195.2981,
"eval_samples_per_second": 97.308,
"eval_steps_per_second": 2.028,
"step": 17400
},
{
"epoch": 33.81782945736434,
"grad_norm": 0.8909381628036499,
"learning_rate": 6.473643410852713e-05,
"loss": 0.5347919464111328,
"step": 17450
},
{
"epoch": 33.81782945736434,
"eval_loss": 0.541422426700592,
"eval_runtime": 189.4035,
"eval_samples_per_second": 100.336,
"eval_steps_per_second": 2.091,
"step": 17450
},
{
"epoch": 33.91472868217054,
"grad_norm": 0.8646638989448547,
"learning_rate": 6.434883720930232e-05,
"loss": 0.5484010696411132,
"step": 17500
},
{
"epoch": 33.91472868217054,
"eval_loss": 0.5494884252548218,
"eval_runtime": 193.7113,
"eval_samples_per_second": 98.105,
"eval_steps_per_second": 2.044,
"step": 17500
},
{
"epoch": 34.01162790697674,
"grad_norm": 0.9828886985778809,
"learning_rate": 6.396124031007752e-05,
"loss": 0.5419563674926757,
"step": 17550
},
{
"epoch": 34.01162790697674,
"eval_loss": 0.5491079688072205,
"eval_runtime": 193.1117,
"eval_samples_per_second": 98.409,
"eval_steps_per_second": 2.051,
"step": 17550
},
{
"epoch": 34.10852713178294,
"grad_norm": 1.0629746913909912,
"learning_rate": 6.357364341085271e-05,
"loss": 0.5175531768798828,
"step": 17600
},
{
"epoch": 34.10852713178294,
"eval_loss": 0.5538118481636047,
"eval_runtime": 196.6121,
"eval_samples_per_second": 96.657,
"eval_steps_per_second": 2.014,
"step": 17600
},
{
"epoch": 34.20542635658915,
"grad_norm": 0.9160548448562622,
"learning_rate": 6.318604651162791e-05,
"loss": 0.539161491394043,
"step": 17650
},
{
"epoch": 34.20542635658915,
"eval_loss": 0.5461863279342651,
"eval_runtime": 198.549,
"eval_samples_per_second": 95.714,
"eval_steps_per_second": 1.994,
"step": 17650
},
{
"epoch": 34.30232558139535,
"grad_norm": 1.0820066928863525,
"learning_rate": 6.27984496124031e-05,
"loss": 0.5332397842407226,
"step": 17700
},
{
"epoch": 34.30232558139535,
"eval_loss": 0.5470810532569885,
"eval_runtime": 199.5692,
"eval_samples_per_second": 95.225,
"eval_steps_per_second": 1.984,
"step": 17700
},
{
"epoch": 34.39922480620155,
"grad_norm": 1.0704736709594727,
"learning_rate": 6.24108527131783e-05,
"loss": 0.5254277801513672,
"step": 17750
},
{
"epoch": 34.39922480620155,
"eval_loss": 0.542071521282196,
"eval_runtime": 192.8985,
"eval_samples_per_second": 98.518,
"eval_steps_per_second": 2.053,
"step": 17750
},
{
"epoch": 34.49612403100775,
"grad_norm": 1.0921117067337036,
"learning_rate": 6.20232558139535e-05,
"loss": 0.5333176040649414,
"step": 17800
},
{
"epoch": 34.49612403100775,
"eval_loss": 0.550654947757721,
"eval_runtime": 186.5933,
"eval_samples_per_second": 101.847,
"eval_steps_per_second": 2.122,
"step": 17800
},
{
"epoch": 34.593023255813954,
"grad_norm": 1.2105690240859985,
"learning_rate": 6.163565891472869e-05,
"loss": 0.5238530349731445,
"step": 17850
},
{
"epoch": 34.593023255813954,
"eval_loss": 0.54400235414505,
"eval_runtime": 174.1456,
"eval_samples_per_second": 109.127,
"eval_steps_per_second": 2.274,
"step": 17850
},
{
"epoch": 34.689922480620154,
"grad_norm": 1.0428931713104248,
"learning_rate": 6.124806201550388e-05,
"loss": 0.5392517852783203,
"step": 17900
},
{
"epoch": 34.689922480620154,
"eval_loss": 0.5388475060462952,
"eval_runtime": 175.4521,
"eval_samples_per_second": 108.314,
"eval_steps_per_second": 2.257,
"step": 17900
},
{
"epoch": 34.786821705426355,
"grad_norm": 0.8393483757972717,
"learning_rate": 6.086046511627907e-05,
"loss": 0.5320695877075196,
"step": 17950
},
{
"epoch": 34.786821705426355,
"eval_loss": 0.5461158752441406,
"eval_runtime": 175.228,
"eval_samples_per_second": 108.453,
"eval_steps_per_second": 2.26,
"step": 17950
},
{
"epoch": 34.883720930232556,
"grad_norm": 0.833777129650116,
"learning_rate": 6.047286821705427e-05,
"loss": 0.5283815383911132,
"step": 18000
},
{
"epoch": 34.883720930232556,
"eval_loss": 0.5449761152267456,
"eval_runtime": 179.826,
"eval_samples_per_second": 105.68,
"eval_steps_per_second": 2.202,
"step": 18000
},
{
"epoch": 34.98062015503876,
"grad_norm": 0.9303448796272278,
"learning_rate": 6.008527131782946e-05,
"loss": 0.524686622619629,
"step": 18050
},
{
"epoch": 34.98062015503876,
"eval_loss": 0.5538901686668396,
"eval_runtime": 184.6088,
"eval_samples_per_second": 102.942,
"eval_steps_per_second": 2.145,
"step": 18050
},
{
"epoch": 35.07751937984496,
"grad_norm": 1.0190098285675049,
"learning_rate": 5.9697674418604657e-05,
"loss": 0.5224573516845703,
"step": 18100
},
{
"epoch": 35.07751937984496,
"eval_loss": 0.5498805642127991,
"eval_runtime": 180.9123,
"eval_samples_per_second": 105.045,
"eval_steps_per_second": 2.189,
"step": 18100
},
{
"epoch": 35.174418604651166,
"grad_norm": 0.8532817959785461,
"learning_rate": 5.9310077519379844e-05,
"loss": 0.5115080642700195,
"step": 18150
},
{
"epoch": 35.174418604651166,
"eval_loss": 0.5354583263397217,
"eval_runtime": 181.0288,
"eval_samples_per_second": 104.978,
"eval_steps_per_second": 2.187,
"step": 18150
},
{
"epoch": 35.27131782945737,
"grad_norm": 1.2438424825668335,
"learning_rate": 5.892248062015504e-05,
"loss": 0.5154667663574218,
"step": 18200
},
{
"epoch": 35.27131782945737,
"eval_loss": 0.5381179451942444,
"eval_runtime": 186.0022,
"eval_samples_per_second": 102.171,
"eval_steps_per_second": 2.129,
"step": 18200
},
{
"epoch": 35.36821705426357,
"grad_norm": 0.9005379676818848,
"learning_rate": 5.8534883720930234e-05,
"loss": 0.5120582199096679,
"step": 18250
},
{
"epoch": 35.36821705426357,
"eval_loss": 0.5399240851402283,
"eval_runtime": 183.0389,
"eval_samples_per_second": 103.825,
"eval_steps_per_second": 2.163,
"step": 18250
},
{
"epoch": 35.46511627906977,
"grad_norm": 1.5690606832504272,
"learning_rate": 5.814728682170543e-05,
"loss": 0.5389008331298828,
"step": 18300
},
{
"epoch": 35.46511627906977,
"eval_loss": 0.551359236240387,
"eval_runtime": 186.3615,
"eval_samples_per_second": 101.974,
"eval_steps_per_second": 2.125,
"step": 18300
},
{
"epoch": 35.56201550387597,
"grad_norm": 1.0620508193969727,
"learning_rate": 5.7759689922480617e-05,
"loss": 0.5309218215942383,
"step": 18350
},
{
"epoch": 35.56201550387597,
"eval_loss": 0.5498600006103516,
"eval_runtime": 186.8148,
"eval_samples_per_second": 101.726,
"eval_steps_per_second": 2.12,
"step": 18350
},
{
"epoch": 35.65891472868217,
"grad_norm": 0.8293824195861816,
"learning_rate": 5.737209302325581e-05,
"loss": 0.5250505065917969,
"step": 18400
},
{
"epoch": 35.65891472868217,
"eval_loss": 0.5435429215431213,
"eval_runtime": 186.1113,
"eval_samples_per_second": 102.111,
"eval_steps_per_second": 2.128,
"step": 18400
},
{
"epoch": 35.75581395348837,
"grad_norm": 0.9203481674194336,
"learning_rate": 5.6984496124031006e-05,
"loss": 0.5103243637084961,
"step": 18450
},
{
"epoch": 35.75581395348837,
"eval_loss": 0.5460033416748047,
"eval_runtime": 183.5687,
"eval_samples_per_second": 103.525,
"eval_steps_per_second": 2.157,
"step": 18450
},
{
"epoch": 35.85271317829457,
"grad_norm": 0.9886574745178223,
"learning_rate": 5.65968992248062e-05,
"loss": 0.5369464111328125,
"step": 18500
},
{
"epoch": 35.85271317829457,
"eval_loss": 0.5451639890670776,
"eval_runtime": 184.8581,
"eval_samples_per_second": 102.803,
"eval_steps_per_second": 2.142,
"step": 18500
},
{
"epoch": 35.94961240310077,
"grad_norm": 0.9914956092834473,
"learning_rate": 5.62093023255814e-05,
"loss": 0.5096720504760742,
"step": 18550
},
{
"epoch": 35.94961240310077,
"eval_loss": 0.55196213722229,
"eval_runtime": 186.5685,
"eval_samples_per_second": 101.861,
"eval_steps_per_second": 2.123,
"step": 18550
},
{
"epoch": 36.04651162790697,
"grad_norm": 1.1980777978897095,
"learning_rate": 5.58217054263566e-05,
"loss": 0.52376220703125,
"step": 18600
},
{
"epoch": 36.04651162790697,
"eval_loss": 0.5417166948318481,
"eval_runtime": 189.0404,
"eval_samples_per_second": 100.529,
"eval_steps_per_second": 2.095,
"step": 18600
},
{
"epoch": 36.14341085271318,
"grad_norm": 1.029856562614441,
"learning_rate": 5.543410852713179e-05,
"loss": 0.5111849975585937,
"step": 18650
},
{
"epoch": 36.14341085271318,
"eval_loss": 0.5434406995773315,
"eval_runtime": 194.8649,
"eval_samples_per_second": 97.524,
"eval_steps_per_second": 2.032,
"step": 18650
},
{
"epoch": 36.24031007751938,
"grad_norm": 0.9188030958175659,
"learning_rate": 5.504651162790698e-05,
"loss": 0.5124307250976563,
"step": 18700
},
{
"epoch": 36.24031007751938,
"eval_loss": 0.5369866490364075,
"eval_runtime": 191.9625,
"eval_samples_per_second": 98.999,
"eval_steps_per_second": 2.063,
"step": 18700
},
{
"epoch": 36.33720930232558,
"grad_norm": 0.8747699856758118,
"learning_rate": 5.4658914728682174e-05,
"loss": 0.5114467239379883,
"step": 18750
},
{
"epoch": 36.33720930232558,
"eval_loss": 0.5418548583984375,
"eval_runtime": 191.4833,
"eval_samples_per_second": 99.246,
"eval_steps_per_second": 2.068,
"step": 18750
},
{
"epoch": 36.434108527131784,
"grad_norm": 1.1166157722473145,
"learning_rate": 5.427131782945737e-05,
"loss": 0.5248300170898438,
"step": 18800
},
{
"epoch": 36.434108527131784,
"eval_loss": 0.5400023460388184,
"eval_runtime": 194.0727,
"eval_samples_per_second": 97.922,
"eval_steps_per_second": 2.04,
"step": 18800
},
{
"epoch": 36.531007751937985,
"grad_norm": 1.0363603830337524,
"learning_rate": 5.3883720930232564e-05,
"loss": 0.5214292526245117,
"step": 18850
},
{
"epoch": 36.531007751937985,
"eval_loss": 0.5220096111297607,
"eval_runtime": 198.424,
"eval_samples_per_second": 95.775,
"eval_steps_per_second": 1.996,
"step": 18850
},
{
"epoch": 36.627906976744185,
"grad_norm": 0.8305726647377014,
"learning_rate": 5.349612403100775e-05,
"loss": 0.5413295364379883,
"step": 18900
},
{
"epoch": 36.627906976744185,
"eval_loss": 0.5427697896957397,
"eval_runtime": 197.5266,
"eval_samples_per_second": 96.21,
"eval_steps_per_second": 2.005,
"step": 18900
},
{
"epoch": 36.724806201550386,
"grad_norm": 0.9412527680397034,
"learning_rate": 5.3108527131782947e-05,
"loss": 0.5180264282226562,
"step": 18950
},
{
"epoch": 36.724806201550386,
"eval_loss": 0.5371807813644409,
"eval_runtime": 193.0091,
"eval_samples_per_second": 98.462,
"eval_steps_per_second": 2.052,
"step": 18950
},
{
"epoch": 36.82170542635659,
"grad_norm": 1.1160436868667603,
"learning_rate": 5.272093023255814e-05,
"loss": 0.5069771575927734,
"step": 19000
},
{
"epoch": 36.82170542635659,
"eval_loss": 0.5473487973213196,
"eval_runtime": 193.0131,
"eval_samples_per_second": 98.46,
"eval_steps_per_second": 2.052,
"step": 19000
},
{
"epoch": 36.91860465116279,
"grad_norm": 0.9080687761306763,
"learning_rate": 5.2333333333333336e-05,
"loss": 0.5186505508422852,
"step": 19050
},
{
"epoch": 36.91860465116279,
"eval_loss": 0.5444969534873962,
"eval_runtime": 197.7748,
"eval_samples_per_second": 96.089,
"eval_steps_per_second": 2.002,
"step": 19050
},
{
"epoch": 37.01550387596899,
"grad_norm": 1.1593964099884033,
"learning_rate": 5.1945736434108524e-05,
"loss": 0.5240575790405273,
"step": 19100
},
{
"epoch": 37.01550387596899,
"eval_loss": 0.5428500175476074,
"eval_runtime": 194.6691,
"eval_samples_per_second": 97.622,
"eval_steps_per_second": 2.034,
"step": 19100
},
{
"epoch": 37.1124031007752,
"grad_norm": 1.0456364154815674,
"learning_rate": 5.155813953488372e-05,
"loss": 0.5200264739990235,
"step": 19150
},
{
"epoch": 37.1124031007752,
"eval_loss": 0.5357416868209839,
"eval_runtime": 193.2375,
"eval_samples_per_second": 98.345,
"eval_steps_per_second": 2.049,
"step": 19150
},
{
"epoch": 37.2093023255814,
"grad_norm": 0.9927902817726135,
"learning_rate": 5.117054263565891e-05,
"loss": 0.5228353500366211,
"step": 19200
},
{
"epoch": 37.2093023255814,
"eval_loss": 0.5418040156364441,
"eval_runtime": 196.1849,
"eval_samples_per_second": 96.868,
"eval_steps_per_second": 2.019,
"step": 19200
},
{
"epoch": 37.3062015503876,
"grad_norm": 1.0007529258728027,
"learning_rate": 5.078294573643411e-05,
"loss": 0.5095104217529297,
"step": 19250
},
{
"epoch": 37.3062015503876,
"eval_loss": 0.5377764105796814,
"eval_runtime": 195.1001,
"eval_samples_per_second": 97.406,
"eval_steps_per_second": 2.03,
"step": 19250
},
{
"epoch": 37.4031007751938,
"grad_norm": 1.0300018787384033,
"learning_rate": 5.03953488372093e-05,
"loss": 0.5201596069335938,
"step": 19300
},
{
"epoch": 37.4031007751938,
"eval_loss": 0.5398209095001221,
"eval_runtime": 186.0194,
"eval_samples_per_second": 102.161,
"eval_steps_per_second": 2.129,
"step": 19300
},
{
"epoch": 37.5,
"grad_norm": 0.8903720378875732,
"learning_rate": 5.0007751937984504e-05,
"loss": 0.5176564025878906,
"step": 19350
},
{
"epoch": 37.5,
"eval_loss": 0.5390880107879639,
"eval_runtime": 182.7414,
"eval_samples_per_second": 103.994,
"eval_steps_per_second": 2.167,
"step": 19350
},
{
"epoch": 37.5968992248062,
"grad_norm": 1.1395453214645386,
"learning_rate": 4.962015503875969e-05,
"loss": 0.5085873413085937,
"step": 19400
},
{
"epoch": 37.5968992248062,
"eval_loss": 0.5383904576301575,
"eval_runtime": 184.2422,
"eval_samples_per_second": 103.147,
"eval_steps_per_second": 2.149,
"step": 19400
},
{
"epoch": 37.6937984496124,
"grad_norm": 1.0433534383773804,
"learning_rate": 4.923255813953489e-05,
"loss": 0.5213092803955078,
"step": 19450
},
{
"epoch": 37.6937984496124,
"eval_loss": 0.5324852466583252,
"eval_runtime": 184.2591,
"eval_samples_per_second": 103.137,
"eval_steps_per_second": 2.149,
"step": 19450
},
{
"epoch": 37.7906976744186,
"grad_norm": 1.049198031425476,
"learning_rate": 4.8844961240310075e-05,
"loss": 0.5182023620605469,
"step": 19500
},
{
"epoch": 37.7906976744186,
"eval_loss": 0.5335138440132141,
"eval_runtime": 186.3364,
"eval_samples_per_second": 101.988,
"eval_steps_per_second": 2.125,
"step": 19500
},
{
"epoch": 37.8875968992248,
"grad_norm": 0.9317577481269836,
"learning_rate": 4.8457364341085276e-05,
"loss": 0.5240017318725586,
"step": 19550
},
{
"epoch": 37.8875968992248,
"eval_loss": 0.5385186076164246,
"eval_runtime": 185.6261,
"eval_samples_per_second": 102.378,
"eval_steps_per_second": 2.133,
"step": 19550
},
{
"epoch": 37.98449612403101,
"grad_norm": 0.9193382263183594,
"learning_rate": 4.806976744186047e-05,
"loss": 0.5023036193847656,
"step": 19600
},
{
"epoch": 37.98449612403101,
"eval_loss": 0.5272864103317261,
"eval_runtime": 186.6742,
"eval_samples_per_second": 101.803,
"eval_steps_per_second": 2.121,
"step": 19600
},
{
"epoch": 38.08139534883721,
"grad_norm": 0.9680274724960327,
"learning_rate": 4.768217054263566e-05,
"loss": 0.5128543853759766,
"step": 19650
},
{
"epoch": 38.08139534883721,
"eval_loss": 0.5302436351776123,
"eval_runtime": 188.7195,
"eval_samples_per_second": 100.7,
"eval_steps_per_second": 2.098,
"step": 19650
},
{
"epoch": 38.17829457364341,
"grad_norm": 0.9249178767204285,
"learning_rate": 4.7294573643410854e-05,
"loss": 0.5085428619384765,
"step": 19700
},
{
"epoch": 38.17829457364341,
"eval_loss": 0.5326287150382996,
"eval_runtime": 184.8359,
"eval_samples_per_second": 102.816,
"eval_steps_per_second": 2.142,
"step": 19700
},
{
"epoch": 38.275193798449614,
"grad_norm": 1.0035618543624878,
"learning_rate": 4.690697674418605e-05,
"loss": 0.5140943908691407,
"step": 19750
},
{
"epoch": 38.275193798449614,
"eval_loss": 0.5443971753120422,
"eval_runtime": 183.6324,
"eval_samples_per_second": 103.489,
"eval_steps_per_second": 2.156,
"step": 19750
},
{
"epoch": 38.372093023255815,
"grad_norm": 1.1001235246658325,
"learning_rate": 4.651937984496124e-05,
"loss": 0.502054328918457,
"step": 19800
},
{
"epoch": 38.372093023255815,
"eval_loss": 0.5287387371063232,
"eval_runtime": 188.4424,
"eval_samples_per_second": 100.848,
"eval_steps_per_second": 2.101,
"step": 19800
},
{
"epoch": 38.468992248062015,
"grad_norm": 1.0103139877319336,
"learning_rate": 4.613178294573644e-05,
"loss": 0.5115917587280273,
"step": 19850
},
{
"epoch": 38.468992248062015,
"eval_loss": 0.5320472121238708,
"eval_runtime": 182.8854,
"eval_samples_per_second": 103.912,
"eval_steps_per_second": 2.165,
"step": 19850
},
{
"epoch": 38.565891472868216,
"grad_norm": 0.9626783728599548,
"learning_rate": 4.5744186046511626e-05,
"loss": 0.5209971618652344,
"step": 19900
},
{
"epoch": 38.565891472868216,
"eval_loss": 0.5395954251289368,
"eval_runtime": 183.9199,
"eval_samples_per_second": 103.328,
"eval_steps_per_second": 2.153,
"step": 19900
},
{
"epoch": 38.66279069767442,
"grad_norm": 1.2447129487991333,
"learning_rate": 4.535658914728683e-05,
"loss": 0.5177993774414062,
"step": 19950
},
{
"epoch": 38.66279069767442,
"eval_loss": 0.5312191843986511,
"eval_runtime": 183.2959,
"eval_samples_per_second": 103.679,
"eval_steps_per_second": 2.16,
"step": 19950
},
{
"epoch": 38.75968992248062,
"grad_norm": 0.8827547430992126,
"learning_rate": 4.496899224806202e-05,
"loss": 0.5207798767089844,
"step": 20000
},
{
"epoch": 38.75968992248062,
"eval_loss": 0.5287414789199829,
"eval_runtime": 182.6558,
"eval_samples_per_second": 104.043,
"eval_steps_per_second": 2.168,
"step": 20000
},
{
"epoch": 38.85658914728682,
"grad_norm": 0.8883045315742493,
"learning_rate": 4.458139534883721e-05,
"loss": 0.5186112213134766,
"step": 20050
},
{
"epoch": 38.85658914728682,
"eval_loss": 0.5344378352165222,
"eval_runtime": 180.0159,
"eval_samples_per_second": 105.568,
"eval_steps_per_second": 2.2,
"step": 20050
},
{
"epoch": 38.95348837209303,
"grad_norm": 0.9383369088172913,
"learning_rate": 4.4193798449612405e-05,
"loss": 0.5147262573242187,
"step": 20100
},
{
"epoch": 38.95348837209303,
"eval_loss": 0.5442497730255127,
"eval_runtime": 168.8857,
"eval_samples_per_second": 112.526,
"eval_steps_per_second": 2.345,
"step": 20100
},
{
"epoch": 39.05038759689923,
"grad_norm": 0.8504248857498169,
"learning_rate": 4.38062015503876e-05,
"loss": 0.5029075241088867,
"step": 20150
},
{
"epoch": 39.05038759689923,
"eval_loss": 0.5233710408210754,
"eval_runtime": 176.4514,
"eval_samples_per_second": 107.701,
"eval_steps_per_second": 2.244,
"step": 20150
},
{
"epoch": 39.14728682170543,
"grad_norm": 1.0626157522201538,
"learning_rate": 4.3418604651162794e-05,
"loss": 0.5219727325439453,
"step": 20200
},
{
"epoch": 39.14728682170543,
"eval_loss": 0.5346750617027283,
"eval_runtime": 177.159,
"eval_samples_per_second": 107.271,
"eval_steps_per_second": 2.235,
"step": 20200
},
{
"epoch": 39.24418604651163,
"grad_norm": 0.8731338381767273,
"learning_rate": 4.303100775193798e-05,
"loss": 0.5061806869506836,
"step": 20250
},
{
"epoch": 39.24418604651163,
"eval_loss": 0.5310733318328857,
"eval_runtime": 178.8862,
"eval_samples_per_second": 106.235,
"eval_steps_per_second": 2.214,
"step": 20250
},
{
"epoch": 39.34108527131783,
"grad_norm": 0.8953520059585571,
"learning_rate": 4.264341085271318e-05,
"loss": 0.5171949768066406,
"step": 20300
},
{
"epoch": 39.34108527131783,
"eval_loss": 0.5359470248222351,
"eval_runtime": 169.9833,
"eval_samples_per_second": 111.799,
"eval_steps_per_second": 2.33,
"step": 20300
},
{
"epoch": 39.43798449612403,
"grad_norm": 0.8778860569000244,
"learning_rate": 4.225581395348838e-05,
"loss": 0.5076210021972656,
"step": 20350
},
{
"epoch": 39.43798449612403,
"eval_loss": 0.5338460803031921,
"eval_runtime": 177.8436,
"eval_samples_per_second": 106.858,
"eval_steps_per_second": 2.227,
"step": 20350
},
{
"epoch": 39.53488372093023,
"grad_norm": 1.1184332370758057,
"learning_rate": 4.186821705426357e-05,
"loss": 0.528587989807129,
"step": 20400
},
{
"epoch": 39.53488372093023,
"eval_loss": 0.5384491682052612,
"eval_runtime": 173.4459,
"eval_samples_per_second": 109.567,
"eval_steps_per_second": 2.283,
"step": 20400
},
{
"epoch": 39.63178294573643,
"grad_norm": 0.9835333824157715,
"learning_rate": 4.148062015503876e-05,
"loss": 0.5009258651733398,
"step": 20450
},
{
"epoch": 39.63178294573643,
"eval_loss": 0.5286412835121155,
"eval_runtime": 175.8829,
"eval_samples_per_second": 108.049,
"eval_steps_per_second": 2.251,
"step": 20450
},
{
"epoch": 39.72868217054263,
"grad_norm": 0.8886466026306152,
"learning_rate": 4.1093023255813956e-05,
"loss": 0.499103889465332,
"step": 20500
},
{
"epoch": 39.72868217054263,
"eval_loss": 0.5248314142227173,
"eval_runtime": 177.0445,
"eval_samples_per_second": 107.34,
"eval_steps_per_second": 2.237,
"step": 20500
},
{
"epoch": 39.825581395348834,
"grad_norm": 0.9667196869850159,
"learning_rate": 4.070542635658915e-05,
"loss": 0.5149082946777344,
"step": 20550
},
{
"epoch": 39.825581395348834,
"eval_loss": 0.5282675623893738,
"eval_runtime": 176.0589,
"eval_samples_per_second": 107.941,
"eval_steps_per_second": 2.249,
"step": 20550
},
{
"epoch": 39.92248062015504,
"grad_norm": 1.0895967483520508,
"learning_rate": 4.0317829457364345e-05,
"loss": 0.5068093109130859,
"step": 20600
},
{
"epoch": 39.92248062015504,
"eval_loss": 0.5327328443527222,
"eval_runtime": 175.0237,
"eval_samples_per_second": 108.58,
"eval_steps_per_second": 2.263,
"step": 20600
},
{
"epoch": 40.01937984496124,
"grad_norm": 1.0282702445983887,
"learning_rate": 3.993023255813953e-05,
"loss": 0.5070622634887695,
"step": 20650
},
{
"epoch": 40.01937984496124,
"eval_loss": 0.5209603309631348,
"eval_runtime": 174.6222,
"eval_samples_per_second": 108.829,
"eval_steps_per_second": 2.268,
"step": 20650
},
{
"epoch": 40.116279069767444,
"grad_norm": 1.0411657094955444,
"learning_rate": 3.954263565891473e-05,
"loss": 0.5045206451416016,
"step": 20700
},
{
"epoch": 40.116279069767444,
"eval_loss": 0.5327551364898682,
"eval_runtime": 177.2652,
"eval_samples_per_second": 107.207,
"eval_steps_per_second": 2.234,
"step": 20700
},
{
"epoch": 40.213178294573645,
"grad_norm": 0.9641968607902527,
"learning_rate": 3.915503875968993e-05,
"loss": 0.5015212631225586,
"step": 20750
},
{
"epoch": 40.213178294573645,
"eval_loss": 0.5277599096298218,
"eval_runtime": 173.5229,
"eval_samples_per_second": 109.519,
"eval_steps_per_second": 2.282,
"step": 20750
},
{
"epoch": 40.310077519379846,
"grad_norm": 0.9383164644241333,
"learning_rate": 3.876744186046512e-05,
"loss": 0.5052926254272461,
"step": 20800
},
{
"epoch": 40.310077519379846,
"eval_loss": 0.5317120552062988,
"eval_runtime": 180.5338,
"eval_samples_per_second": 105.266,
"eval_steps_per_second": 2.193,
"step": 20800
},
{
"epoch": 40.406976744186046,
"grad_norm": 0.9518958926200867,
"learning_rate": 3.837984496124031e-05,
"loss": 0.504459228515625,
"step": 20850
},
{
"epoch": 40.406976744186046,
"eval_loss": 0.5327459573745728,
"eval_runtime": 178.4024,
"eval_samples_per_second": 106.523,
"eval_steps_per_second": 2.22,
"step": 20850
},
{
"epoch": 40.50387596899225,
"grad_norm": 0.929793119430542,
"learning_rate": 3.799224806201551e-05,
"loss": 0.50721923828125,
"step": 20900
},
{
"epoch": 40.50387596899225,
"eval_loss": 0.5232871174812317,
"eval_runtime": 176.0828,
"eval_samples_per_second": 107.927,
"eval_steps_per_second": 2.249,
"step": 20900
},
{
"epoch": 40.60077519379845,
"grad_norm": 1.008422613143921,
"learning_rate": 3.76046511627907e-05,
"loss": 0.5139606475830079,
"step": 20950
},
{
"epoch": 40.60077519379845,
"eval_loss": 0.5307023525238037,
"eval_runtime": 176.7655,
"eval_samples_per_second": 107.51,
"eval_steps_per_second": 2.24,
"step": 20950
},
{
"epoch": 40.69767441860465,
"grad_norm": 0.9010120630264282,
"learning_rate": 3.721705426356589e-05,
"loss": 0.5006965255737305,
"step": 21000
},
{
"epoch": 40.69767441860465,
"eval_loss": 0.5244865417480469,
"eval_runtime": 568.7253,
"eval_samples_per_second": 33.415,
"eval_steps_per_second": 0.696,
"step": 21000
},
{
"epoch": 40.79457364341085,
"grad_norm": 0.8767653703689575,
"learning_rate": 3.6829457364341084e-05,
"loss": 0.5031109619140625,
"step": 21050
},
{
"epoch": 40.79457364341085,
"eval_loss": 0.5274307727813721,
"eval_runtime": 180.3181,
"eval_samples_per_second": 105.392,
"eval_steps_per_second": 2.196,
"step": 21050
},
{
"epoch": 40.89147286821706,
"grad_norm": 0.9830530881881714,
"learning_rate": 3.644186046511628e-05,
"loss": 0.5075841903686523,
"step": 21100
},
{
"epoch": 40.89147286821706,
"eval_loss": 0.5370468497276306,
"eval_runtime": 181.9153,
"eval_samples_per_second": 104.466,
"eval_steps_per_second": 2.177,
"step": 21100
},
{
"epoch": 40.98837209302326,
"grad_norm": 1.019087791442871,
"learning_rate": 3.605426356589148e-05,
"loss": 0.5040792846679687,
"step": 21150
},
{
"epoch": 40.98837209302326,
"eval_loss": 0.5332101583480835,
"eval_runtime": 183.2927,
"eval_samples_per_second": 103.681,
"eval_steps_per_second": 2.16,
"step": 21150
},
{
"epoch": 41.08527131782946,
"grad_norm": 0.8988721370697021,
"learning_rate": 3.566666666666667e-05,
"loss": 0.5045541381835937,
"step": 21200
},
{
"epoch": 41.08527131782946,
"eval_loss": 0.5272098183631897,
"eval_runtime": 184.5631,
"eval_samples_per_second": 102.967,
"eval_steps_per_second": 2.146,
"step": 21200
},
{
"epoch": 41.18217054263566,
"grad_norm": 1.0443159341812134,
"learning_rate": 3.527906976744186e-05,
"loss": 0.5021038818359375,
"step": 21250
},
{
"epoch": 41.18217054263566,
"eval_loss": 0.5261159539222717,
"eval_runtime": 187.475,
"eval_samples_per_second": 101.368,
"eval_steps_per_second": 2.112,
"step": 21250
},
{
"epoch": 41.27906976744186,
"grad_norm": 0.7973293662071228,
"learning_rate": 3.489147286821706e-05,
"loss": 0.5001279449462891,
"step": 21300
},
{
"epoch": 41.27906976744186,
"eval_loss": 0.5278809070587158,
"eval_runtime": 189.9745,
"eval_samples_per_second": 100.034,
"eval_steps_per_second": 2.084,
"step": 21300
},
{
"epoch": 41.37596899224806,
"grad_norm": 0.8979710936546326,
"learning_rate": 3.450387596899225e-05,
"loss": 0.5035758972167969,
"step": 21350
},
{
"epoch": 41.37596899224806,
"eval_loss": 0.521926760673523,
"eval_runtime": 188.1978,
"eval_samples_per_second": 100.979,
"eval_steps_per_second": 2.104,
"step": 21350
},
{
"epoch": 41.47286821705426,
"grad_norm": 0.9254922866821289,
"learning_rate": 3.411627906976744e-05,
"loss": 0.5044261932373046,
"step": 21400
},
{
"epoch": 41.47286821705426,
"eval_loss": 0.5313370227813721,
"eval_runtime": 190.0945,
"eval_samples_per_second": 99.971,
"eval_steps_per_second": 2.083,
"step": 21400
},
{
"epoch": 41.56976744186046,
"grad_norm": 0.9278942346572876,
"learning_rate": 3.3728682170542635e-05,
"loss": 0.5007383728027344,
"step": 21450
},
{
"epoch": 41.56976744186046,
"eval_loss": 0.5329434275627136,
"eval_runtime": 186.2051,
"eval_samples_per_second": 102.059,
"eval_steps_per_second": 2.127,
"step": 21450
},
{
"epoch": 41.666666666666664,
"grad_norm": 1.0791382789611816,
"learning_rate": 3.334108527131783e-05,
"loss": 0.504736557006836,
"step": 21500
},
{
"epoch": 41.666666666666664,
"eval_loss": 0.529855489730835,
"eval_runtime": 186.3264,
"eval_samples_per_second": 101.993,
"eval_steps_per_second": 2.125,
"step": 21500
},
{
"epoch": 41.763565891472865,
"grad_norm": 0.9192059636116028,
"learning_rate": 3.2953488372093025e-05,
"loss": 0.501381721496582,
"step": 21550
},
{
"epoch": 41.763565891472865,
"eval_loss": 0.5253894925117493,
"eval_runtime": 186.6125,
"eval_samples_per_second": 101.837,
"eval_steps_per_second": 2.122,
"step": 21550
},
{
"epoch": 41.86046511627907,
"grad_norm": 0.9805233478546143,
"learning_rate": 3.256589147286822e-05,
"loss": 0.5100160217285157,
"step": 21600
},
{
"epoch": 41.86046511627907,
"eval_loss": 0.5194661021232605,
"eval_runtime": 188.6485,
"eval_samples_per_second": 100.738,
"eval_steps_per_second": 2.099,
"step": 21600
},
{
"epoch": 41.957364341085274,
"grad_norm": 0.846781313419342,
"learning_rate": 3.2178294573643414e-05,
"loss": 0.5082344818115234,
"step": 21650
},
{
"epoch": 41.957364341085274,
"eval_loss": 0.5229784846305847,
"eval_runtime": 189.4238,
"eval_samples_per_second": 100.325,
"eval_steps_per_second": 2.091,
"step": 21650
},
{
"epoch": 42.054263565891475,
"grad_norm": 0.8974575996398926,
"learning_rate": 3.179069767441861e-05,
"loss": 0.504454231262207,
"step": 21700
},
{
"epoch": 42.054263565891475,
"eval_loss": 0.5280088782310486,
"eval_runtime": 187.0803,
"eval_samples_per_second": 101.582,
"eval_steps_per_second": 2.117,
"step": 21700
},
{
"epoch": 42.151162790697676,
"grad_norm": 0.933649480342865,
"learning_rate": 3.14031007751938e-05,
"loss": 0.501411361694336,
"step": 21750
},
{
"epoch": 42.151162790697676,
"eval_loss": 0.5128213167190552,
"eval_runtime": 187.1399,
"eval_samples_per_second": 101.55,
"eval_steps_per_second": 2.116,
"step": 21750
},
{
"epoch": 42.248062015503876,
"grad_norm": 0.8940353393554688,
"learning_rate": 3.101550387596899e-05,
"loss": 0.4968502807617188,
"step": 21800
},
{
"epoch": 42.248062015503876,
"eval_loss": 0.5252892971038818,
"eval_runtime": 186.0325,
"eval_samples_per_second": 102.154,
"eval_steps_per_second": 2.129,
"step": 21800
},
{
"epoch": 42.34496124031008,
"grad_norm": 0.8218147158622742,
"learning_rate": 3.0627906976744186e-05,
"loss": 0.5118127059936524,
"step": 21850
},
{
"epoch": 42.34496124031008,
"eval_loss": 0.5292873382568359,
"eval_runtime": 184.4715,
"eval_samples_per_second": 103.019,
"eval_steps_per_second": 2.147,
"step": 21850
},
{
"epoch": 42.44186046511628,
"grad_norm": 0.8893631100654602,
"learning_rate": 3.0240310077519378e-05,
"loss": 0.5024843978881836,
"step": 21900
},
{
"epoch": 42.44186046511628,
"eval_loss": 0.524248480796814,
"eval_runtime": 184.4676,
"eval_samples_per_second": 103.021,
"eval_steps_per_second": 2.147,
"step": 21900
},
{
"epoch": 42.53875968992248,
"grad_norm": 1.0226677656173706,
"learning_rate": 2.9852713178294572e-05,
"loss": 0.4969608688354492,
"step": 21950
},
{
"epoch": 42.53875968992248,
"eval_loss": 0.5290153622627258,
"eval_runtime": 185.7762,
"eval_samples_per_second": 102.295,
"eval_steps_per_second": 2.132,
"step": 21950
},
{
"epoch": 42.63565891472868,
"grad_norm": 0.8559462428092957,
"learning_rate": 2.946511627906977e-05,
"loss": 0.5093366622924804,
"step": 22000
},
{
"epoch": 42.63565891472868,
"eval_loss": 0.516994059085846,
"eval_runtime": 186.7511,
"eval_samples_per_second": 101.761,
"eval_steps_per_second": 2.12,
"step": 22000
},
{
"epoch": 42.73255813953488,
"grad_norm": 0.8686088919639587,
"learning_rate": 2.9077519379844965e-05,
"loss": 0.49697071075439453,
"step": 22050
},
{
"epoch": 42.73255813953488,
"eval_loss": 0.5155017971992493,
"eval_runtime": 195.3357,
"eval_samples_per_second": 97.289,
"eval_steps_per_second": 2.027,
"step": 22050
},
{
"epoch": 42.82945736434109,
"grad_norm": 0.8473958969116211,
"learning_rate": 2.8689922480620157e-05,
"loss": 0.5003182220458985,
"step": 22100
},
{
"epoch": 42.82945736434109,
"eval_loss": 0.5242047309875488,
"eval_runtime": 194.0238,
"eval_samples_per_second": 97.947,
"eval_steps_per_second": 2.041,
"step": 22100
},
{
"epoch": 42.92635658914729,
"grad_norm": 0.9314485788345337,
"learning_rate": 2.830232558139535e-05,
"loss": 0.5022520065307617,
"step": 22150
},
{
"epoch": 42.92635658914729,
"eval_loss": 0.5250500440597534,
"eval_runtime": 233.2841,
"eval_samples_per_second": 81.463,
"eval_steps_per_second": 1.698,
"step": 22150
},
{
"epoch": 43.02325581395349,
"grad_norm": 0.9403958320617676,
"learning_rate": 2.7914728682170543e-05,
"loss": 0.5011024856567383,
"step": 22200
},
{
"epoch": 43.02325581395349,
"eval_loss": 0.5284552574157715,
"eval_runtime": 224.6625,
"eval_samples_per_second": 84.589,
"eval_steps_per_second": 1.763,
"step": 22200
},
{
"epoch": 43.12015503875969,
"grad_norm": 0.8705305457115173,
"learning_rate": 2.7527131782945737e-05,
"loss": 0.4972893524169922,
"step": 22250
},
{
"epoch": 43.12015503875969,
"eval_loss": 0.5284101366996765,
"eval_runtime": 194.01,
"eval_samples_per_second": 97.954,
"eval_steps_per_second": 2.041,
"step": 22250
},
{
"epoch": 43.21705426356589,
"grad_norm": 0.9412527084350586,
"learning_rate": 2.713953488372093e-05,
"loss": 0.4995740509033203,
"step": 22300
},
{
"epoch": 43.21705426356589,
"eval_loss": 0.516463041305542,
"eval_runtime": 208.129,
"eval_samples_per_second": 91.309,
"eval_steps_per_second": 1.903,
"step": 22300
},
{
"epoch": 43.31395348837209,
"grad_norm": 1.1200988292694092,
"learning_rate": 2.6751937984496123e-05,
"loss": 0.49355110168457034,
"step": 22350
},
{
"epoch": 43.31395348837209,
"eval_loss": 0.5287572741508484,
"eval_runtime": 193.966,
"eval_samples_per_second": 97.976,
"eval_steps_per_second": 2.042,
"step": 22350
},
{
"epoch": 43.41085271317829,
"grad_norm": 0.8845555186271667,
"learning_rate": 2.636434108527132e-05,
"loss": 0.48946212768554687,
"step": 22400
},
{
"epoch": 43.41085271317829,
"eval_loss": 0.5179551243782043,
"eval_runtime": 191.2172,
"eval_samples_per_second": 99.384,
"eval_steps_per_second": 2.071,
"step": 22400
},
{
"epoch": 43.507751937984494,
"grad_norm": 1.2325315475463867,
"learning_rate": 2.5976744186046513e-05,
"loss": 0.48715110778808596,
"step": 22450
},
{
"epoch": 43.507751937984494,
"eval_loss": 0.5255248546600342,
"eval_runtime": 214.106,
"eval_samples_per_second": 88.76,
"eval_steps_per_second": 1.85,
"step": 22450
},
{
"epoch": 43.604651162790695,
"grad_norm": 0.8257450461387634,
"learning_rate": 2.5589147286821708e-05,
"loss": 0.5038369369506835,
"step": 22500
},
{
"epoch": 43.604651162790695,
"eval_loss": 0.5318561792373657,
"eval_runtime": 237.7016,
"eval_samples_per_second": 79.949,
"eval_steps_per_second": 1.666,
"step": 22500
},
{
"epoch": 43.701550387596896,
"grad_norm": 0.9131925702095032,
"learning_rate": 2.52015503875969e-05,
"loss": 0.48653671264648435,
"step": 22550
},
{
"epoch": 43.701550387596896,
"eval_loss": 0.5257322192192078,
"eval_runtime": 197.9576,
"eval_samples_per_second": 96.0,
"eval_steps_per_second": 2.0,
"step": 22550
},
{
"epoch": 43.798449612403104,
"grad_norm": 0.9946778416633606,
"learning_rate": 2.4813953488372094e-05,
"loss": 0.5035680389404297,
"step": 22600
},
{
"epoch": 43.798449612403104,
"eval_loss": 0.5150420665740967,
"eval_runtime": 206.912,
"eval_samples_per_second": 91.846,
"eval_steps_per_second": 1.914,
"step": 22600
},
{
"epoch": 43.895348837209305,
"grad_norm": 1.0072699785232544,
"learning_rate": 2.442635658914729e-05,
"loss": 0.49596702575683593,
"step": 22650
},
{
"epoch": 43.895348837209305,
"eval_loss": 0.5257139205932617,
"eval_runtime": 199.7615,
"eval_samples_per_second": 95.133,
"eval_steps_per_second": 1.982,
"step": 22650
},
{
"epoch": 43.992248062015506,
"grad_norm": 0.9366716742515564,
"learning_rate": 2.4038759689922483e-05,
"loss": 0.49716136932373045,
"step": 22700
},
{
"epoch": 43.992248062015506,
"eval_loss": 0.5198754668235779,
"eval_runtime": 193.3347,
"eval_samples_per_second": 98.296,
"eval_steps_per_second": 2.048,
"step": 22700
},
{
"epoch": 44.08914728682171,
"grad_norm": 0.90858393907547,
"learning_rate": 2.3651162790697675e-05,
"loss": 0.5002625656127929,
"step": 22750
},
{
"epoch": 44.08914728682171,
"eval_loss": 0.527286171913147,
"eval_runtime": 196.2426,
"eval_samples_per_second": 96.839,
"eval_steps_per_second": 2.018,
"step": 22750
},
{
"epoch": 44.18604651162791,
"grad_norm": 1.0151848793029785,
"learning_rate": 2.326356589147287e-05,
"loss": 0.4959259033203125,
"step": 22800
},
{
"epoch": 44.18604651162791,
"eval_loss": 0.5242442488670349,
"eval_runtime": 194.4345,
"eval_samples_per_second": 97.74,
"eval_steps_per_second": 2.037,
"step": 22800
},
{
"epoch": 44.28294573643411,
"grad_norm": 0.8512209057807922,
"learning_rate": 2.287596899224806e-05,
"loss": 0.500455093383789,
"step": 22850
},
{
"epoch": 44.28294573643411,
"eval_loss": 0.5242491960525513,
"eval_runtime": 193.4476,
"eval_samples_per_second": 98.238,
"eval_steps_per_second": 2.047,
"step": 22850
},
{
"epoch": 44.37984496124031,
"grad_norm": 0.8968107104301453,
"learning_rate": 2.248837209302326e-05,
"loss": 0.4897348022460937,
"step": 22900
},
{
"epoch": 44.37984496124031,
"eval_loss": 0.5159934759140015,
"eval_runtime": 189.2217,
"eval_samples_per_second": 100.432,
"eval_steps_per_second": 2.093,
"step": 22900
},
{
"epoch": 44.47674418604651,
"grad_norm": 0.8487868309020996,
"learning_rate": 2.210077519379845e-05,
"loss": 0.48243858337402346,
"step": 22950
},
{
"epoch": 44.47674418604651,
"eval_loss": 0.5207856893539429,
"eval_runtime": 191.921,
"eval_samples_per_second": 99.02,
"eval_steps_per_second": 2.063,
"step": 22950
},
{
"epoch": 44.57364341085271,
"grad_norm": 0.9809431433677673,
"learning_rate": 2.1713178294573645e-05,
"loss": 0.48758525848388673,
"step": 23000
},
{
"epoch": 44.57364341085271,
"eval_loss": 0.5163289308547974,
"eval_runtime": 196.6885,
"eval_samples_per_second": 96.62,
"eval_steps_per_second": 2.013,
"step": 23000
},
{
"epoch": 44.67054263565891,
"grad_norm": 0.9498407244682312,
"learning_rate": 2.1325581395348836e-05,
"loss": 0.48854473114013675,
"step": 23050
},
{
"epoch": 44.67054263565891,
"eval_loss": 0.5189388394355774,
"eval_runtime": 196.4657,
"eval_samples_per_second": 96.729,
"eval_steps_per_second": 2.016,
"step": 23050
},
{
"epoch": 44.76744186046512,
"grad_norm": 0.9875770807266235,
"learning_rate": 2.0937984496124034e-05,
"loss": 0.5109722900390625,
"step": 23100
},
{
"epoch": 44.76744186046512,
"eval_loss": 0.5207294821739197,
"eval_runtime": 194.5411,
"eval_samples_per_second": 97.686,
"eval_steps_per_second": 2.036,
"step": 23100
},
{
"epoch": 44.86434108527132,
"grad_norm": 0.8651441335678101,
"learning_rate": 2.0550387596899226e-05,
"loss": 0.4815971755981445,
"step": 23150
},
{
"epoch": 44.86434108527132,
"eval_loss": 0.5165086388587952,
"eval_runtime": 194.6518,
"eval_samples_per_second": 97.631,
"eval_steps_per_second": 2.034,
"step": 23150
},
{
"epoch": 44.96124031007752,
"grad_norm": 1.0780807733535767,
"learning_rate": 2.016279069767442e-05,
"loss": 0.49749275207519533,
"step": 23200
},
{
"epoch": 44.96124031007752,
"eval_loss": 0.5201721787452698,
"eval_runtime": 195.2209,
"eval_samples_per_second": 97.346,
"eval_steps_per_second": 2.028,
"step": 23200
},
{
"epoch": 45.05813953488372,
"grad_norm": 0.9796660542488098,
"learning_rate": 1.977519379844961e-05,
"loss": 0.4934458541870117,
"step": 23250
},
{
"epoch": 45.05813953488372,
"eval_loss": 0.5206090211868286,
"eval_runtime": 194.3101,
"eval_samples_per_second": 97.802,
"eval_steps_per_second": 2.038,
"step": 23250
},
{
"epoch": 45.15503875968992,
"grad_norm": 0.8599798679351807,
"learning_rate": 1.938759689922481e-05,
"loss": 0.5016697692871094,
"step": 23300
},
{
"epoch": 45.15503875968992,
"eval_loss": 0.5147153735160828,
"eval_runtime": 192.9359,
"eval_samples_per_second": 98.499,
"eval_steps_per_second": 2.052,
"step": 23300
},
{
"epoch": 45.251937984496124,
"grad_norm": 0.7895607352256775,
"learning_rate": 1.9e-05,
"loss": 0.49334972381591796,
"step": 23350
},
{
"epoch": 45.251937984496124,
"eval_loss": 0.515512228012085,
"eval_runtime": 193.0203,
"eval_samples_per_second": 98.456,
"eval_steps_per_second": 2.052,
"step": 23350
},
{
"epoch": 45.348837209302324,
"grad_norm": 1.0045746564865112,
"learning_rate": 1.8612403100775196e-05,
"loss": 0.49473106384277343,
"step": 23400
},
{
"epoch": 45.348837209302324,
"eval_loss": 0.5082274675369263,
"eval_runtime": 197.0156,
"eval_samples_per_second": 96.459,
"eval_steps_per_second": 2.01,
"step": 23400
},
{
"epoch": 45.445736434108525,
"grad_norm": 0.9885164499282837,
"learning_rate": 1.8224806201550387e-05,
"loss": 0.49016048431396486,
"step": 23450
},
{
"epoch": 45.445736434108525,
"eval_loss": 0.5208576917648315,
"eval_runtime": 196.5842,
"eval_samples_per_second": 96.671,
"eval_steps_per_second": 2.014,
"step": 23450
},
{
"epoch": 45.542635658914726,
"grad_norm": 0.9519304633140564,
"learning_rate": 1.7837209302325582e-05,
"loss": 0.501864242553711,
"step": 23500
},
{
"epoch": 45.542635658914726,
"eval_loss": 0.5219539403915405,
"eval_runtime": 196.4914,
"eval_samples_per_second": 96.717,
"eval_steps_per_second": 2.015,
"step": 23500
},
{
"epoch": 45.63953488372093,
"grad_norm": 0.8717612624168396,
"learning_rate": 1.7449612403100777e-05,
"loss": 0.4909215545654297,
"step": 23550
},
{
"epoch": 45.63953488372093,
"eval_loss": 0.5142674446105957,
"eval_runtime": 197.216,
"eval_samples_per_second": 96.361,
"eval_steps_per_second": 2.008,
"step": 23550
},
{
"epoch": 45.736434108527135,
"grad_norm": 0.986541748046875,
"learning_rate": 1.7062015503875968e-05,
"loss": 0.4898907470703125,
"step": 23600
},
{
"epoch": 45.736434108527135,
"eval_loss": 0.5174142122268677,
"eval_runtime": 194.9761,
"eval_samples_per_second": 97.468,
"eval_steps_per_second": 2.031,
"step": 23600
},
{
"epoch": 45.833333333333336,
"grad_norm": 1.0406668186187744,
"learning_rate": 1.6674418604651163e-05,
"loss": 0.48606651306152343,
"step": 23650
},
{
"epoch": 45.833333333333336,
"eval_loss": 0.5181837677955627,
"eval_runtime": 195.1438,
"eval_samples_per_second": 97.385,
"eval_steps_per_second": 2.029,
"step": 23650
},
{
"epoch": 45.93023255813954,
"grad_norm": 1.2656255960464478,
"learning_rate": 1.6286821705426357e-05,
"loss": 0.4883313751220703,
"step": 23700
},
{
"epoch": 45.93023255813954,
"eval_loss": 0.5129756927490234,
"eval_runtime": 191.7296,
"eval_samples_per_second": 99.119,
"eval_steps_per_second": 2.065,
"step": 23700
},
{
"epoch": 46.02713178294574,
"grad_norm": 0.9951698184013367,
"learning_rate": 1.5899224806201552e-05,
"loss": 0.4959673690795898,
"step": 23750
},
{
"epoch": 46.02713178294574,
"eval_loss": 0.5178924798965454,
"eval_runtime": 193.978,
"eval_samples_per_second": 97.97,
"eval_steps_per_second": 2.041,
"step": 23750
},
{
"epoch": 46.12403100775194,
"grad_norm": 0.8214923739433289,
"learning_rate": 1.5511627906976743e-05,
"loss": 0.47073410034179686,
"step": 23800
},
{
"epoch": 46.12403100775194,
"eval_loss": 0.5200338959693909,
"eval_runtime": 194.4058,
"eval_samples_per_second": 97.754,
"eval_steps_per_second": 2.037,
"step": 23800
},
{
"epoch": 46.22093023255814,
"grad_norm": 1.0531407594680786,
"learning_rate": 1.5124031007751938e-05,
"loss": 0.4762581253051758,
"step": 23850
},
{
"epoch": 46.22093023255814,
"eval_loss": 0.5189253687858582,
"eval_runtime": 192.1836,
"eval_samples_per_second": 98.885,
"eval_steps_per_second": 2.061,
"step": 23850
},
{
"epoch": 46.31782945736434,
"grad_norm": 0.8493297696113586,
"learning_rate": 1.4736434108527133e-05,
"loss": 0.47398033142089846,
"step": 23900
},
{
"epoch": 46.31782945736434,
"eval_loss": 0.5211531519889832,
"eval_runtime": 196.6,
"eval_samples_per_second": 96.663,
"eval_steps_per_second": 2.014,
"step": 23900
},
{
"epoch": 46.41472868217054,
"grad_norm": 0.9804657697677612,
"learning_rate": 1.4348837209302326e-05,
"loss": 0.4727859878540039,
"step": 23950
},
{
"epoch": 46.41472868217054,
"eval_loss": 0.5094043612480164,
"eval_runtime": 193.37,
"eval_samples_per_second": 98.278,
"eval_steps_per_second": 2.048,
"step": 23950
},
{
"epoch": 46.51162790697674,
"grad_norm": 0.9358029961585999,
"learning_rate": 1.3961240310077519e-05,
"loss": 0.4944017028808594,
"step": 24000
},
{
"epoch": 46.51162790697674,
"eval_loss": 0.5203186273574829,
"eval_runtime": 192.6269,
"eval_samples_per_second": 98.657,
"eval_steps_per_second": 2.056,
"step": 24000
},
{
"epoch": 46.60852713178294,
"grad_norm": 1.053912878036499,
"learning_rate": 1.3573643410852712e-05,
"loss": 0.48282398223876954,
"step": 24050
},
{
"epoch": 46.60852713178294,
"eval_loss": 0.5168351531028748,
"eval_runtime": 193.3422,
"eval_samples_per_second": 98.292,
"eval_steps_per_second": 2.048,
"step": 24050
},
{
"epoch": 46.70542635658915,
"grad_norm": 0.9707403182983398,
"learning_rate": 1.3186046511627908e-05,
"loss": 0.49057979583740235,
"step": 24100
},
{
"epoch": 46.70542635658915,
"eval_loss": 0.5090420842170715,
"eval_runtime": 194.8113,
"eval_samples_per_second": 97.551,
"eval_steps_per_second": 2.033,
"step": 24100
},
{
"epoch": 46.80232558139535,
"grad_norm": 0.8060054183006287,
"learning_rate": 1.2798449612403101e-05,
"loss": 0.4925830841064453,
"step": 24150
},
{
"epoch": 46.80232558139535,
"eval_loss": 0.5142449736595154,
"eval_runtime": 193.6387,
"eval_samples_per_second": 98.142,
"eval_steps_per_second": 2.045,
"step": 24150
},
{
"epoch": 46.89922480620155,
"grad_norm": 1.0535565614700317,
"learning_rate": 1.2410852713178294e-05,
"loss": 0.48466400146484373,
"step": 24200
},
{
"epoch": 46.89922480620155,
"eval_loss": 0.5140147805213928,
"eval_runtime": 194.8466,
"eval_samples_per_second": 97.533,
"eval_steps_per_second": 2.032,
"step": 24200
},
{
"epoch": 46.99612403100775,
"grad_norm": 0.8662691712379456,
"learning_rate": 1.202325581395349e-05,
"loss": 0.4897040939331055,
"step": 24250
},
{
"epoch": 46.99612403100775,
"eval_loss": 0.5235028862953186,
"eval_runtime": 193.8048,
"eval_samples_per_second": 98.057,
"eval_steps_per_second": 2.043,
"step": 24250
},
{
"epoch": 47.093023255813954,
"grad_norm": 1.1001732349395752,
"learning_rate": 1.1635658914728682e-05,
"loss": 0.49039371490478517,
"step": 24300
},
{
"epoch": 47.093023255813954,
"eval_loss": 0.5225379467010498,
"eval_runtime": 194.172,
"eval_samples_per_second": 97.872,
"eval_steps_per_second": 2.039,
"step": 24300
},
{
"epoch": 47.189922480620154,
"grad_norm": 0.8969790935516357,
"learning_rate": 1.1248062015503877e-05,
"loss": 0.47529258728027346,
"step": 24350
},
{
"epoch": 47.189922480620154,
"eval_loss": 0.5167431831359863,
"eval_runtime": 191.575,
"eval_samples_per_second": 99.199,
"eval_steps_per_second": 2.067,
"step": 24350
},
{
"epoch": 47.286821705426355,
"grad_norm": 0.8796032071113586,
"learning_rate": 1.086046511627907e-05,
"loss": 0.48536407470703125,
"step": 24400
},
{
"epoch": 47.286821705426355,
"eval_loss": 0.5132911205291748,
"eval_runtime": 192.842,
"eval_samples_per_second": 98.547,
"eval_steps_per_second": 2.053,
"step": 24400
},
{
"epoch": 47.383720930232556,
"grad_norm": 0.7977496981620789,
"learning_rate": 1.0472868217054265e-05,
"loss": 0.4799433517456055,
"step": 24450
},
{
"epoch": 47.383720930232556,
"eval_loss": 0.5126928687095642,
"eval_runtime": 192.6571,
"eval_samples_per_second": 98.642,
"eval_steps_per_second": 2.055,
"step": 24450
},
{
"epoch": 47.48062015503876,
"grad_norm": 1.0533097982406616,
"learning_rate": 1.0085271317829458e-05,
"loss": 0.49268310546875,
"step": 24500
},
{
"epoch": 47.48062015503876,
"eval_loss": 0.5129761695861816,
"eval_runtime": 193.297,
"eval_samples_per_second": 98.315,
"eval_steps_per_second": 2.049,
"step": 24500
},
{
"epoch": 47.57751937984496,
"grad_norm": 0.9760648608207703,
"learning_rate": 9.697674418604652e-06,
"loss": 0.49476985931396483,
"step": 24550
},
{
"epoch": 47.57751937984496,
"eval_loss": 0.5181257128715515,
"eval_runtime": 187.2145,
"eval_samples_per_second": 101.509,
"eval_steps_per_second": 2.115,
"step": 24550
},
{
"epoch": 47.674418604651166,
"grad_norm": 1.0375988483428955,
"learning_rate": 9.310077519379845e-06,
"loss": 0.4825825881958008,
"step": 24600
},
{
"epoch": 47.674418604651166,
"eval_loss": 0.5161689519882202,
"eval_runtime": 195.4323,
"eval_samples_per_second": 97.241,
"eval_steps_per_second": 2.026,
"step": 24600
},
{
"epoch": 47.77131782945737,
"grad_norm": 1.0778166055679321,
"learning_rate": 8.92248062015504e-06,
"loss": 0.47775421142578123,
"step": 24650
},
{
"epoch": 47.77131782945737,
"eval_loss": 0.5174992084503174,
"eval_runtime": 193.574,
"eval_samples_per_second": 98.174,
"eval_steps_per_second": 2.046,
"step": 24650
},
{
"epoch": 47.86821705426357,
"grad_norm": 0.8025913238525391,
"learning_rate": 8.534883720930233e-06,
"loss": 0.4860971450805664,
"step": 24700
},
{
"epoch": 47.86821705426357,
"eval_loss": 0.5220958590507507,
"eval_runtime": 187.9385,
"eval_samples_per_second": 101.118,
"eval_steps_per_second": 2.107,
"step": 24700
},
{
"epoch": 47.96511627906977,
"grad_norm": 0.9379816651344299,
"learning_rate": 8.147286821705428e-06,
"loss": 0.4851155471801758,
"step": 24750
},
{
"epoch": 47.96511627906977,
"eval_loss": 0.5218383073806763,
"eval_runtime": 191.0396,
"eval_samples_per_second": 99.477,
"eval_steps_per_second": 2.073,
"step": 24750
},
{
"epoch": 48.06201550387597,
"grad_norm": 0.8389429450035095,
"learning_rate": 7.759689922480621e-06,
"loss": 0.4862751770019531,
"step": 24800
},
{
"epoch": 48.06201550387597,
"eval_loss": 0.5244157314300537,
"eval_runtime": 194.1611,
"eval_samples_per_second": 97.878,
"eval_steps_per_second": 2.04,
"step": 24800
},
{
"epoch": 48.15891472868217,
"grad_norm": 1.1113231182098389,
"learning_rate": 7.372093023255815e-06,
"loss": 0.4865293502807617,
"step": 24850
},
{
"epoch": 48.15891472868217,
"eval_loss": 0.5153717398643494,
"eval_runtime": 188.1418,
"eval_samples_per_second": 101.009,
"eval_steps_per_second": 2.105,
"step": 24850
},
{
"epoch": 48.25581395348837,
"grad_norm": 0.9999150633811951,
"learning_rate": 6.984496124031008e-06,
"loss": 0.48201698303222656,
"step": 24900
},
{
"epoch": 48.25581395348837,
"eval_loss": 0.5217995047569275,
"eval_runtime": 202.7563,
"eval_samples_per_second": 93.728,
"eval_steps_per_second": 1.953,
"step": 24900
},
{
"epoch": 48.35271317829457,
"grad_norm": 0.9121875166893005,
"learning_rate": 6.596899224806203e-06,
"loss": 0.4848302459716797,
"step": 24950
},
{
"epoch": 48.35271317829457,
"eval_loss": 0.5181519389152527,
"eval_runtime": 203.5881,
"eval_samples_per_second": 93.345,
"eval_steps_per_second": 1.945,
"step": 24950
},
{
"epoch": 48.44961240310077,
"grad_norm": 0.9109277129173279,
"learning_rate": 6.209302325581396e-06,
"loss": 0.4896494293212891,
"step": 25000
},
{
"epoch": 48.44961240310077,
"eval_loss": 0.5163356065750122,
"eval_runtime": 205.6732,
"eval_samples_per_second": 92.399,
"eval_steps_per_second": 1.925,
"step": 25000
},
{
"epoch": 48.54651162790697,
"grad_norm": 0.8780000805854797,
"learning_rate": 5.8217054263565895e-06,
"loss": 0.4852371597290039,
"step": 25050
},
{
"epoch": 48.54651162790697,
"eval_loss": 0.5103786587715149,
"eval_runtime": 203.7202,
"eval_samples_per_second": 93.285,
"eval_steps_per_second": 1.944,
"step": 25050
},
{
"epoch": 48.64341085271318,
"grad_norm": 1.0753669738769531,
"learning_rate": 5.4341085271317826e-06,
"loss": 0.48256893157958985,
"step": 25100
},
{
"epoch": 48.64341085271318,
"eval_loss": 0.5188427567481995,
"eval_runtime": 206.588,
"eval_samples_per_second": 91.99,
"eval_steps_per_second": 1.917,
"step": 25100
},
{
"epoch": 48.74031007751938,
"grad_norm": 0.9151817560195923,
"learning_rate": 5.0465116279069764e-06,
"loss": 0.4795745849609375,
"step": 25150
},
{
"epoch": 48.74031007751938,
"eval_loss": 0.5129519104957581,
"eval_runtime": 217.8088,
"eval_samples_per_second": 87.251,
"eval_steps_per_second": 1.818,
"step": 25150
},
{
"epoch": 48.83720930232558,
"grad_norm": 0.8677839636802673,
"learning_rate": 4.65891472868217e-06,
"loss": 0.4724645233154297,
"step": 25200
},
{
"epoch": 48.83720930232558,
"eval_loss": 0.5128815174102783,
"eval_runtime": 222.6438,
"eval_samples_per_second": 85.356,
"eval_steps_per_second": 1.779,
"step": 25200
},
{
"epoch": 48.934108527131784,
"grad_norm": 1.0911849737167358,
"learning_rate": 4.271317829457364e-06,
"loss": 0.4840513610839844,
"step": 25250
},
{
"epoch": 48.934108527131784,
"eval_loss": 0.5184940695762634,
"eval_runtime": 212.3902,
"eval_samples_per_second": 89.477,
"eval_steps_per_second": 1.864,
"step": 25250
},
{
"epoch": 49.031007751937985,
"grad_norm": 1.049880027770996,
"learning_rate": 3.883720930232558e-06,
"loss": 0.4812747955322266,
"step": 25300
},
{
"epoch": 49.031007751937985,
"eval_loss": 0.5137081742286682,
"eval_runtime": 207.0601,
"eval_samples_per_second": 91.78,
"eval_steps_per_second": 1.912,
"step": 25300
},
{
"epoch": 49.127906976744185,
"grad_norm": 0.7971972823143005,
"learning_rate": 3.496124031007752e-06,
"loss": 0.4881282424926758,
"step": 25350
},
{
"epoch": 49.127906976744185,
"eval_loss": 0.5165645480155945,
"eval_runtime": 202.7666,
"eval_samples_per_second": 93.724,
"eval_steps_per_second": 1.953,
"step": 25350
},
{
"epoch": 49.224806201550386,
"grad_norm": 0.8528485298156738,
"learning_rate": 3.108527131782946e-06,
"loss": 0.48426227569580077,
"step": 25400
},
{
"epoch": 49.224806201550386,
"eval_loss": 0.5217786431312561,
"eval_runtime": 201.3146,
"eval_samples_per_second": 94.4,
"eval_steps_per_second": 1.967,
"step": 25400
},
{
"epoch": 49.32170542635659,
"grad_norm": 0.9365469813346863,
"learning_rate": 2.7209302325581397e-06,
"loss": 0.4818797302246094,
"step": 25450
},
{
"epoch": 49.32170542635659,
"eval_loss": 0.5201877951622009,
"eval_runtime": 204.7457,
"eval_samples_per_second": 92.818,
"eval_steps_per_second": 1.934,
"step": 25450
},
{
"epoch": 49.41860465116279,
"grad_norm": 0.787333071231842,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.48442951202392576,
"step": 25500
},
{
"epoch": 49.41860465116279,
"eval_loss": 0.5099871158599854,
"eval_runtime": 200.4033,
"eval_samples_per_second": 94.829,
"eval_steps_per_second": 1.976,
"step": 25500
},
{
"epoch": 49.51550387596899,
"grad_norm": 1.1736680269241333,
"learning_rate": 1.9457364341085275e-06,
"loss": 0.49098060607910154,
"step": 25550
},
{
"epoch": 49.51550387596899,
"eval_loss": 0.5097952485084534,
"eval_runtime": 202.1301,
"eval_samples_per_second": 94.019,
"eval_steps_per_second": 1.959,
"step": 25550
},
{
"epoch": 49.6124031007752,
"grad_norm": 0.8908654451370239,
"learning_rate": 1.558139534883721e-06,
"loss": 0.4801137924194336,
"step": 25600
},
{
"epoch": 49.6124031007752,
"eval_loss": 0.5076445937156677,
"eval_runtime": 205.4214,
"eval_samples_per_second": 92.512,
"eval_steps_per_second": 1.928,
"step": 25600
},
{
"epoch": 49.7093023255814,
"grad_norm": 0.9247883558273315,
"learning_rate": 1.1705426356589148e-06,
"loss": 0.47291782379150393,
"step": 25650
},
{
"epoch": 49.7093023255814,
"eval_loss": 0.5182952880859375,
"eval_runtime": 215.4281,
"eval_samples_per_second": 88.215,
"eval_steps_per_second": 1.838,
"step": 25650
},
{
"epoch": 49.8062015503876,
"grad_norm": 1.0541952848434448,
"learning_rate": 7.829457364341086e-07,
"loss": 0.4783976745605469,
"step": 25700
},
{
"epoch": 49.8062015503876,
"eval_loss": 0.5079155564308167,
"eval_runtime": 203.5265,
"eval_samples_per_second": 93.374,
"eval_steps_per_second": 1.946,
"step": 25700
},
{
"epoch": 49.9031007751938,
"grad_norm": 1.1467986106872559,
"learning_rate": 3.953488372093023e-07,
"loss": 0.48399200439453127,
"step": 25750
},
{
"epoch": 49.9031007751938,
"eval_loss": 0.5200158357620239,
"eval_runtime": 203.7997,
"eval_samples_per_second": 93.248,
"eval_steps_per_second": 1.943,
"step": 25750
},
{
"epoch": 50.0,
"grad_norm": 1.033553123474121,
"learning_rate": 7.751937984496125e-09,
"loss": 0.48281837463378907,
"step": 25800
},
{
"epoch": 50.0,
"eval_loss": 0.5126314163208008,
"eval_runtime": 203.2326,
"eval_samples_per_second": 93.509,
"eval_steps_per_second": 1.949,
"step": 25800
}
],
"logging_steps": 50,
"max_steps": 25800,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.204755448469504e+17,
"train_batch_size": 192,
"trial_name": null,
"trial_params": null
}