{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 543,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0055248618784530384,
"grad_norm": 7.4375,
"learning_rate": 3.6363636363636366e-07,
"loss": 2.4042,
"step": 1
},
{
"epoch": 0.027624309392265192,
"grad_norm": 7.59375,
"learning_rate": 1.8181818181818183e-06,
"loss": 2.4209,
"step": 5
},
{
"epoch": 0.055248618784530384,
"grad_norm": 7.1875,
"learning_rate": 3.6363636363636366e-06,
"loss": 2.4119,
"step": 10
},
{
"epoch": 0.08287292817679558,
"grad_norm": 7.03125,
"learning_rate": 5.4545454545454545e-06,
"loss": 2.3657,
"step": 15
},
{
"epoch": 0.11049723756906077,
"grad_norm": 7.28125,
"learning_rate": 7.272727272727273e-06,
"loss": 2.3123,
"step": 20
},
{
"epoch": 0.13812154696132597,
"grad_norm": 5.65625,
"learning_rate": 9.090909090909091e-06,
"loss": 2.1773,
"step": 25
},
{
"epoch": 0.16574585635359115,
"grad_norm": 3.84375,
"learning_rate": 1.0909090909090909e-05,
"loss": 2.0066,
"step": 30
},
{
"epoch": 0.19337016574585636,
"grad_norm": 3.0,
"learning_rate": 1.2727272727272728e-05,
"loss": 1.9028,
"step": 35
},
{
"epoch": 0.22099447513812154,
"grad_norm": 2.421875,
"learning_rate": 1.4545454545454546e-05,
"loss": 1.7204,
"step": 40
},
{
"epoch": 0.24861878453038674,
"grad_norm": 2.109375,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.5881,
"step": 45
},
{
"epoch": 0.27624309392265195,
"grad_norm": 1.8125,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.4278,
"step": 50
},
{
"epoch": 0.30386740331491713,
"grad_norm": 1.71875,
"learning_rate": 2e-05,
"loss": 1.2964,
"step": 55
},
{
"epoch": 0.3314917127071823,
"grad_norm": 1.2109375,
"learning_rate": 1.9994819965926346e-05,
"loss": 1.1979,
"step": 60
},
{
"epoch": 0.35911602209944754,
"grad_norm": 1.0078125,
"learning_rate": 1.997928523025598e-05,
"loss": 1.1192,
"step": 65
},
{
"epoch": 0.3867403314917127,
"grad_norm": 0.76171875,
"learning_rate": 1.9953411887080917e-05,
"loss": 1.0823,
"step": 70
},
{
"epoch": 0.4143646408839779,
"grad_norm": 0.70703125,
"learning_rate": 1.9917226741361014e-05,
"loss": 1.0383,
"step": 75
},
{
"epoch": 0.4419889502762431,
"grad_norm": 0.69921875,
"learning_rate": 1.987076728115383e-05,
"loss": 1.0168,
"step": 80
},
{
"epoch": 0.4696132596685083,
"grad_norm": 0.68359375,
"learning_rate": 1.9814081638776743e-05,
"loss": 1.004,
"step": 85
},
{
"epoch": 0.4972375690607735,
"grad_norm": 0.7109375,
"learning_rate": 1.9747228540941555e-05,
"loss": 0.993,
"step": 90
},
{
"epoch": 0.5248618784530387,
"grad_norm": 0.765625,
"learning_rate": 1.9670277247913205e-05,
"loss": 0.9644,
"step": 95
},
{
"epoch": 0.5524861878453039,
"grad_norm": 0.76953125,
"learning_rate": 1.958330748175568e-05,
"loss": 0.996,
"step": 100
},
{
"epoch": 0.580110497237569,
"grad_norm": 0.734375,
"learning_rate": 1.948640934373939e-05,
"loss": 0.9704,
"step": 105
},
{
"epoch": 0.6077348066298343,
"grad_norm": 0.76953125,
"learning_rate": 1.9379683220995657e-05,
"loss": 0.9572,
"step": 110
},
{
"epoch": 0.6353591160220995,
"grad_norm": 0.86328125,
"learning_rate": 1.9263239682514953e-05,
"loss": 0.9553,
"step": 115
},
{
"epoch": 0.6629834254143646,
"grad_norm": 0.75,
"learning_rate": 1.9137199364596673e-05,
"loss": 0.9454,
"step": 120
},
{
"epoch": 0.6906077348066298,
"grad_norm": 0.75390625,
"learning_rate": 1.9001692845869113e-05,
"loss": 0.939,
"step": 125
},
{
"epoch": 0.7182320441988951,
"grad_norm": 0.77734375,
"learning_rate": 1.8856860512009115e-05,
"loss": 0.9433,
"step": 130
},
{
"epoch": 0.7458563535911602,
"grad_norm": 0.76953125,
"learning_rate": 1.8702852410301556e-05,
"loss": 0.9329,
"step": 135
},
{
"epoch": 0.7734806629834254,
"grad_norm": 0.71484375,
"learning_rate": 1.853982809418932e-05,
"loss": 0.9416,
"step": 140
},
{
"epoch": 0.8011049723756906,
"grad_norm": 0.7265625,
"learning_rate": 1.8367956457974872e-05,
"loss": 0.914,
"step": 145
},
{
"epoch": 0.8287292817679558,
"grad_norm": 0.87890625,
"learning_rate": 1.8187415561844586e-05,
"loss": 0.9229,
"step": 150
},
{
"epoch": 0.856353591160221,
"grad_norm": 0.828125,
"learning_rate": 1.7998392447397197e-05,
"loss": 0.9259,
"step": 155
},
{
"epoch": 0.8839779005524862,
"grad_norm": 0.85546875,
"learning_rate": 1.7801082943867406e-05,
"loss": 0.9421,
"step": 160
},
{
"epoch": 0.9116022099447514,
"grad_norm": 0.8046875,
"learning_rate": 1.7595691465245484e-05,
"loss": 0.9225,
"step": 165
},
{
"epoch": 0.9392265193370166,
"grad_norm": 0.7109375,
"learning_rate": 1.7382430798502977e-05,
"loss": 0.9066,
"step": 170
},
{
"epoch": 0.9668508287292817,
"grad_norm": 0.83984375,
"learning_rate": 1.7161521883143936e-05,
"loss": 0.8903,
"step": 175
},
{
"epoch": 0.994475138121547,
"grad_norm": 0.875,
"learning_rate": 1.693319358231011e-05,
"loss": 0.9252,
"step": 180
},
{
"epoch": 1.022099447513812,
"grad_norm": 0.8203125,
"learning_rate": 1.6697682445677158e-05,
"loss": 0.9035,
"step": 185
},
{
"epoch": 1.0497237569060773,
"grad_norm": 0.69140625,
"learning_rate": 1.6455232464387587e-05,
"loss": 0.9036,
"step": 190
},
{
"epoch": 1.0773480662983426,
"grad_norm": 0.71875,
"learning_rate": 1.6206094818274228e-05,
"loss": 0.8932,
"step": 195
},
{
"epoch": 1.1049723756906078,
"grad_norm": 0.91796875,
"learning_rate": 1.595052761563627e-05,
"loss": 0.9065,
"step": 200
},
{
"epoch": 1.132596685082873,
"grad_norm": 0.859375,
"learning_rate": 1.5688795625837274e-05,
"loss": 0.8995,
"step": 205
},
{
"epoch": 1.160220994475138,
"grad_norm": 0.78125,
"learning_rate": 1.542117000500229e-05,
"loss": 0.8844,
"step": 210
},
{
"epoch": 1.1878453038674033,
"grad_norm": 0.765625,
"learning_rate": 1.5147928015098309e-05,
"loss": 0.8894,
"step": 215
},
{
"epoch": 1.2154696132596685,
"grad_norm": 0.81640625,
"learning_rate": 1.4869352736688938e-05,
"loss": 0.894,
"step": 220
},
{
"epoch": 1.2430939226519337,
"grad_norm": 0.76953125,
"learning_rate": 1.458573277566103e-05,
"loss": 0.9222,
"step": 225
},
{
"epoch": 1.270718232044199,
"grad_norm": 0.75390625,
"learning_rate": 1.4297361964227004e-05,
"loss": 0.9014,
"step": 230
},
{
"epoch": 1.298342541436464,
"grad_norm": 0.8125,
"learning_rate": 1.4004539056512667e-05,
"loss": 0.9052,
"step": 235
},
{
"epoch": 1.3259668508287292,
"grad_norm": 0.87109375,
"learning_rate": 1.3707567419045926e-05,
"loss": 0.894,
"step": 240
},
{
"epoch": 1.3535911602209945,
"grad_norm": 0.8125,
"learning_rate": 1.3406754716466978e-05,
"loss": 0.9045,
"step": 245
},
{
"epoch": 1.3812154696132597,
"grad_norm": 0.78515625,
"learning_rate": 1.3102412592785654e-05,
"loss": 0.8737,
"step": 250
},
{
"epoch": 1.408839779005525,
"grad_norm": 0.7734375,
"learning_rate": 1.2794856348516095e-05,
"loss": 0.9029,
"step": 255
},
{
"epoch": 1.43646408839779,
"grad_norm": 0.83984375,
"learning_rate": 1.248440461402328e-05,
"loss": 0.8883,
"step": 260
},
{
"epoch": 1.4640883977900552,
"grad_norm": 0.96875,
"learning_rate": 1.2171379019419786e-05,
"loss": 0.8932,
"step": 265
},
{
"epoch": 1.4917127071823204,
"grad_norm": 0.890625,
"learning_rate": 1.1856103861354809e-05,
"loss": 0.8917,
"step": 270
},
{
"epoch": 1.5193370165745856,
"grad_norm": 0.78515625,
"learning_rate": 1.153890576704062e-05,
"loss": 0.9033,
"step": 275
},
{
"epoch": 1.5469613259668509,
"grad_norm": 0.8046875,
"learning_rate": 1.1220113355864549e-05,
"loss": 0.8839,
"step": 280
},
{
"epoch": 1.5745856353591159,
"grad_norm": 0.8046875,
"learning_rate": 1.0900056898937055e-05,
"loss": 0.8887,
"step": 285
},
{
"epoch": 1.6022099447513813,
"grad_norm": 0.8046875,
"learning_rate": 1.0579067976928614e-05,
"loss": 0.8951,
"step": 290
},
{
"epoch": 1.6298342541436464,
"grad_norm": 0.78125,
"learning_rate": 1.0257479136549889e-05,
"loss": 0.8954,
"step": 295
},
{
"epoch": 1.6574585635359116,
"grad_norm": 0.8359375,
"learning_rate": 9.935623546031043e-06,
"loss": 0.9004,
"step": 300
},
{
"epoch": 1.6850828729281768,
"grad_norm": 0.85546875,
"learning_rate": 9.613834649957216e-06,
"loss": 0.9045,
"step": 305
},
{
"epoch": 1.7127071823204418,
"grad_norm": 0.765625,
"learning_rate": 9.292445823817647e-06,
"loss": 0.8737,
"step": 310
},
{
"epoch": 1.7403314917127073,
"grad_norm": 0.8203125,
"learning_rate": 8.971790028626395e-06,
"loss": 0.8722,
"step": 315
},
{
"epoch": 1.7679558011049723,
"grad_norm": 0.921875,
"learning_rate": 8.652199465972462e-06,
"loss": 0.8995,
"step": 320
},
{
"epoch": 1.7955801104972375,
"grad_norm": 0.8203125,
"learning_rate": 8.334005233856681e-06,
"loss": 0.9114,
"step": 325
},
{
"epoch": 1.8232044198895028,
"grad_norm": 0.79296875,
"learning_rate": 8.017536983671929e-06,
"loss": 0.891,
"step": 330
},
{
"epoch": 1.850828729281768,
"grad_norm": 0.75,
"learning_rate": 7.703122578682047e-06,
"loss": 0.8875,
"step": 335
},
{
"epoch": 1.8784530386740332,
"grad_norm": 0.8125,
"learning_rate": 7.391087754353252e-06,
"loss": 0.8779,
"step": 340
},
{
"epoch": 1.9060773480662982,
"grad_norm": 0.76953125,
"learning_rate": 7.081755780889978e-06,
"loss": 0.885,
"step": 345
},
{
"epoch": 1.9337016574585635,
"grad_norm": 0.828125,
"learning_rate": 6.7754471283247594e-06,
"loss": 0.8875,
"step": 350
},
{
"epoch": 1.9613259668508287,
"grad_norm": 0.9140625,
"learning_rate": 6.472479134509052e-06,
"loss": 0.9037,
"step": 355
},
{
"epoch": 1.988950276243094,
"grad_norm": 0.83203125,
"learning_rate": 6.173165676349103e-06,
"loss": 0.8817,
"step": 360
},
{
"epoch": 2.016574585635359,
"grad_norm": 0.81640625,
"learning_rate": 5.8778168446273045e-06,
"loss": 0.8876,
"step": 365
},
{
"epoch": 2.044198895027624,
"grad_norm": 0.77734375,
"learning_rate": 5.586738622746042e-06,
"loss": 0.891,
"step": 370
},
{
"epoch": 2.0718232044198897,
"grad_norm": 0.80078125,
"learning_rate": 5.300232569726805e-06,
"loss": 0.8843,
"step": 375
},
{
"epoch": 2.0994475138121547,
"grad_norm": 0.9140625,
"learning_rate": 5.0185955077929774e-06,
"loss": 0.8696,
"step": 380
},
{
"epoch": 2.12707182320442,
"grad_norm": 0.7578125,
"learning_rate": 4.742119214860009e-06,
"loss": 0.8775,
"step": 385
},
{
"epoch": 2.154696132596685,
"grad_norm": 0.7578125,
"learning_rate": 4.471090122251496e-06,
"loss": 0.8797,
"step": 390
},
{
"epoch": 2.18232044198895,
"grad_norm": 0.78125,
"learning_rate": 4.205789017954364e-06,
"loss": 0.8832,
"step": 395
},
{
"epoch": 2.2099447513812156,
"grad_norm": 0.859375,
"learning_rate": 3.946490755720621e-06,
"loss": 0.884,
"step": 400
},
{
"epoch": 2.2375690607734806,
"grad_norm": 0.9453125,
"learning_rate": 3.6934639703169905e-06,
"loss": 0.8737,
"step": 405
},
{
"epoch": 2.265193370165746,
"grad_norm": 0.86328125,
"learning_rate": 3.4469707992174607e-06,
"loss": 0.8981,
"step": 410
},
{
"epoch": 2.292817679558011,
"grad_norm": 0.80078125,
"learning_rate": 3.207266611027069e-06,
"loss": 0.8736,
"step": 415
},
{
"epoch": 2.320441988950276,
"grad_norm": 0.78125,
"learning_rate": 2.97459974091831e-06,
"loss": 0.8757,
"step": 420
},
{
"epoch": 2.3480662983425415,
"grad_norm": 0.8203125,
"learning_rate": 2.7492112333541744e-06,
"loss": 0.902,
"step": 425
},
{
"epoch": 2.3756906077348066,
"grad_norm": 0.90625,
"learning_rate": 2.531334592364457e-06,
"loss": 0.8766,
"step": 430
},
{
"epoch": 2.403314917127072,
"grad_norm": 0.7734375,
"learning_rate": 2.3211955396340003e-06,
"loss": 0.8982,
"step": 435
},
{
"epoch": 2.430939226519337,
"grad_norm": 0.74609375,
"learning_rate": 2.1190117806534714e-06,
"loss": 0.8801,
"step": 440
},
{
"epoch": 2.458563535911602,
"grad_norm": 0.828125,
"learning_rate": 1.924992779174999e-06,
"loss": 0.8707,
"step": 445
},
{
"epoch": 2.4861878453038675,
"grad_norm": 0.859375,
"learning_rate": 1.7393395402063085e-06,
"loss": 0.8939,
"step": 450
},
{
"epoch": 2.5138121546961325,
"grad_norm": 0.859375,
"learning_rate": 1.5622444017681438e-06,
"loss": 0.8707,
"step": 455
},
{
"epoch": 2.541436464088398,
"grad_norm": 0.78515625,
"learning_rate": 1.3938908356307846e-06,
"loss": 0.8771,
"step": 460
},
{
"epoch": 2.569060773480663,
"grad_norm": 0.76171875,
"learning_rate": 1.2344532572360325e-06,
"loss": 0.857,
"step": 465
},
{
"epoch": 2.596685082872928,
"grad_norm": 0.83984375,
"learning_rate": 1.0840968450016276e-06,
"loss": 0.885,
"step": 470
},
{
"epoch": 2.6243093922651934,
"grad_norm": 0.78125,
"learning_rate": 9.42977369195286e-07,
"loss": 0.9007,
"step": 475
},
{
"epoch": 2.6519337016574585,
"grad_norm": 0.90234375,
"learning_rate": 8.112410305556307e-07,
"loss": 0.8988,
"step": 480
},
{
"epoch": 2.679558011049724,
"grad_norm": 0.78125,
"learning_rate": 6.890243088272453e-07,
"loss": 0.8702,
"step": 485
},
{
"epoch": 2.707182320441989,
"grad_norm": 0.875,
"learning_rate": 5.764538213667103e-07,
"loss": 0.8981,
"step": 490
},
{
"epoch": 2.734806629834254,
"grad_norm": 0.89453125,
"learning_rate": 4.73646191966175e-07,
"loss": 0.8912,
"step": 495
},
{
"epoch": 2.7624309392265194,
"grad_norm": 0.875,
"learning_rate": 3.8070793003030296e-07,
"loss": 0.8967,
"step": 500
},
{
"epoch": 2.7900552486187844,
"grad_norm": 0.828125,
"learning_rate": 2.9773532023180897e-07,
"loss": 0.8933,
"step": 505
},
{
"epoch": 2.81767955801105,
"grad_norm": 0.7578125,
"learning_rate": 2.248143227598809e-07,
"loss": 0.9039,
"step": 510
},
{
"epoch": 2.845303867403315,
"grad_norm": 0.7578125,
"learning_rate": 1.6202048426483652e-07,
"loss": 0.8864,
"step": 515
},
{
"epoch": 2.87292817679558,
"grad_norm": 0.8515625,
"learning_rate": 1.094188595912804e-07,
"loss": 0.8997,
"step": 520
},
{
"epoch": 2.9005524861878453,
"grad_norm": 0.87109375,
"learning_rate": 6.706394438083962e-08,
"loss": 0.8805,
"step": 525
},
{
"epoch": 2.9281767955801103,
"grad_norm": 0.7890625,
"learning_rate": 3.4999618614309784e-08,
"loss": 0.8959,
"step": 530
},
{
"epoch": 2.955801104972376,
"grad_norm": 0.7265625,
"learning_rate": 1.325910115169471e-08,
"loss": 0.8552,
"step": 535
},
{
"epoch": 2.983425414364641,
"grad_norm": 0.82421875,
"learning_rate": 1.8649153172423106e-09,
"loss": 0.8753,
"step": 540
},
{
"epoch": 3.0,
"step": 543,
"total_flos": 2.5856227500097536e+16,
"train_loss": 1.017678025019103,
"train_runtime": 224.2528,
"train_samples_per_second": 38.568,
"train_steps_per_second": 2.421
}
],
"logging_steps": 5,
"max_steps": 543,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5856227500097536e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}