mistral-chatpro / checkpoint-800 /trainer_state.json
Doug240's picture
Upload folder using huggingface_hub
d50d315 verified
{
"best_global_step": 800,
"best_metric": 0.8280864357948303,
"best_model_checkpoint": "/workspace/model/finetuned/checkpoint-800",
"epoch": 4.848484848484849,
"eval_steps": 25,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.15151515151515152,
"grad_norm": 180.98402404785156,
"learning_rate": 2.7e-06,
"loss": 30.8939,
"step": 25
},
{
"epoch": 0.15151515151515152,
"eval_loss": 2.02256441116333,
"eval_runtime": 26.9436,
"eval_samples_per_second": 21.786,
"eval_steps_per_second": 2.746,
"step": 25
},
{
"epoch": 0.30303030303030304,
"grad_norm": 385.3758544921875,
"learning_rate": 6.45e-06,
"loss": 28.2375,
"step": 50
},
{
"epoch": 0.30303030303030304,
"eval_loss": 1.5823099613189697,
"eval_runtime": 27.0059,
"eval_samples_per_second": 21.736,
"eval_steps_per_second": 2.74,
"step": 50
},
{
"epoch": 0.45454545454545453,
"grad_norm": 30.632709503173828,
"learning_rate": 1.02e-05,
"loss": 21.1384,
"step": 75
},
{
"epoch": 0.45454545454545453,
"eval_loss": 1.216854214668274,
"eval_runtime": 26.9681,
"eval_samples_per_second": 21.766,
"eval_steps_per_second": 2.744,
"step": 75
},
{
"epoch": 0.6060606060606061,
"grad_norm": 9.063393592834473,
"learning_rate": 1.395e-05,
"loss": 18.6661,
"step": 100
},
{
"epoch": 0.6060606060606061,
"eval_loss": 1.1212154626846313,
"eval_runtime": 26.8008,
"eval_samples_per_second": 21.902,
"eval_steps_per_second": 2.761,
"step": 100
},
{
"epoch": 0.7575757575757576,
"grad_norm": 10.971136093139648,
"learning_rate": 1.77e-05,
"loss": 17.3687,
"step": 125
},
{
"epoch": 0.7575757575757576,
"eval_loss": 1.0298739671707153,
"eval_runtime": 26.8143,
"eval_samples_per_second": 21.891,
"eval_steps_per_second": 2.76,
"step": 125
},
{
"epoch": 0.9090909090909091,
"grad_norm": 8.818650245666504,
"learning_rate": 2.145e-05,
"loss": 15.8605,
"step": 150
},
{
"epoch": 0.9090909090909091,
"eval_loss": 0.9787100553512573,
"eval_runtime": 26.571,
"eval_samples_per_second": 22.092,
"eval_steps_per_second": 2.785,
"step": 150
},
{
"epoch": 1.0606060606060606,
"grad_norm": 11.700923919677734,
"learning_rate": 2.52e-05,
"loss": 15.163,
"step": 175
},
{
"epoch": 1.0606060606060606,
"eval_loss": 0.9414308667182922,
"eval_runtime": 26.626,
"eval_samples_per_second": 22.046,
"eval_steps_per_second": 2.779,
"step": 175
},
{
"epoch": 1.2121212121212122,
"grad_norm": 10.862292289733887,
"learning_rate": 2.895e-05,
"loss": 14.6871,
"step": 200
},
{
"epoch": 1.2121212121212122,
"eval_loss": 0.9198422431945801,
"eval_runtime": 26.7039,
"eval_samples_per_second": 21.982,
"eval_steps_per_second": 2.771,
"step": 200
},
{
"epoch": 1.3636363636363638,
"grad_norm": 11.315472602844238,
"learning_rate": 2.9136e-05,
"loss": 14.8471,
"step": 225
},
{
"epoch": 1.3636363636363638,
"eval_loss": 0.9041078686714172,
"eval_runtime": 26.5947,
"eval_samples_per_second": 22.072,
"eval_steps_per_second": 2.783,
"step": 225
},
{
"epoch": 1.5151515151515151,
"grad_norm": 12.701277732849121,
"learning_rate": 2.7936e-05,
"loss": 14.163,
"step": 250
},
{
"epoch": 1.5151515151515151,
"eval_loss": 0.8926578760147095,
"eval_runtime": 26.7046,
"eval_samples_per_second": 21.981,
"eval_steps_per_second": 2.771,
"step": 250
},
{
"epoch": 1.6666666666666665,
"grad_norm": 11.456862449645996,
"learning_rate": 2.6736e-05,
"loss": 14.0006,
"step": 275
},
{
"epoch": 1.6666666666666665,
"eval_loss": 0.8840105533599854,
"eval_runtime": 26.5642,
"eval_samples_per_second": 22.097,
"eval_steps_per_second": 2.786,
"step": 275
},
{
"epoch": 1.8181818181818183,
"grad_norm": 11.606865882873535,
"learning_rate": 2.5536e-05,
"loss": 14.0325,
"step": 300
},
{
"epoch": 1.8181818181818183,
"eval_loss": 0.8764263987541199,
"eval_runtime": 26.4677,
"eval_samples_per_second": 22.178,
"eval_steps_per_second": 2.796,
"step": 300
},
{
"epoch": 1.9696969696969697,
"grad_norm": 11.702313423156738,
"learning_rate": 2.4336000000000002e-05,
"loss": 14.1239,
"step": 325
},
{
"epoch": 1.9696969696969697,
"eval_loss": 0.8708490133285522,
"eval_runtime": 26.714,
"eval_samples_per_second": 21.973,
"eval_steps_per_second": 2.77,
"step": 325
},
{
"epoch": 2.121212121212121,
"grad_norm": 13.907278060913086,
"learning_rate": 2.3136e-05,
"loss": 13.6706,
"step": 350
},
{
"epoch": 2.121212121212121,
"eval_loss": 0.8656915426254272,
"eval_runtime": 26.7031,
"eval_samples_per_second": 21.982,
"eval_steps_per_second": 2.771,
"step": 350
},
{
"epoch": 2.2727272727272725,
"grad_norm": 13.098384857177734,
"learning_rate": 2.1935999999999998e-05,
"loss": 13.5478,
"step": 375
},
{
"epoch": 2.2727272727272725,
"eval_loss": 0.8606927394866943,
"eval_runtime": 26.6066,
"eval_samples_per_second": 22.062,
"eval_steps_per_second": 2.781,
"step": 375
},
{
"epoch": 2.4242424242424243,
"grad_norm": 15.584559440612793,
"learning_rate": 2.0736000000000003e-05,
"loss": 13.5654,
"step": 400
},
{
"epoch": 2.4242424242424243,
"eval_loss": 0.8570966720581055,
"eval_runtime": 26.7383,
"eval_samples_per_second": 21.954,
"eval_steps_per_second": 2.768,
"step": 400
},
{
"epoch": 2.5757575757575757,
"grad_norm": 14.500994682312012,
"learning_rate": 1.9536e-05,
"loss": 13.4998,
"step": 425
},
{
"epoch": 2.5757575757575757,
"eval_loss": 0.8537192940711975,
"eval_runtime": 26.82,
"eval_samples_per_second": 21.887,
"eval_steps_per_second": 2.759,
"step": 425
},
{
"epoch": 2.7272727272727275,
"grad_norm": 13.635045051574707,
"learning_rate": 1.8336e-05,
"loss": 13.3694,
"step": 450
},
{
"epoch": 2.7272727272727275,
"eval_loss": 0.8501807451248169,
"eval_runtime": 26.631,
"eval_samples_per_second": 22.042,
"eval_steps_per_second": 2.779,
"step": 450
},
{
"epoch": 2.878787878787879,
"grad_norm": 14.899593353271484,
"learning_rate": 1.7136000000000003e-05,
"loss": 13.3274,
"step": 475
},
{
"epoch": 2.878787878787879,
"eval_loss": 0.8472868204116821,
"eval_runtime": 26.7572,
"eval_samples_per_second": 21.938,
"eval_steps_per_second": 2.766,
"step": 475
},
{
"epoch": 3.0303030303030303,
"grad_norm": 14.57861614227295,
"learning_rate": 1.5936e-05,
"loss": 13.1797,
"step": 500
},
{
"epoch": 3.0303030303030303,
"eval_loss": 0.8451663255691528,
"eval_runtime": 26.9575,
"eval_samples_per_second": 21.775,
"eval_steps_per_second": 2.745,
"step": 500
},
{
"epoch": 3.1818181818181817,
"grad_norm": 15.23614501953125,
"learning_rate": 1.4736000000000001e-05,
"loss": 13.221,
"step": 525
},
{
"epoch": 3.1818181818181817,
"eval_loss": 0.8429368734359741,
"eval_runtime": 26.6839,
"eval_samples_per_second": 21.998,
"eval_steps_per_second": 2.773,
"step": 525
},
{
"epoch": 3.3333333333333335,
"grad_norm": 16.392993927001953,
"learning_rate": 1.3536e-05,
"loss": 13.1811,
"step": 550
},
{
"epoch": 3.3333333333333335,
"eval_loss": 0.8409376740455627,
"eval_runtime": 26.569,
"eval_samples_per_second": 22.093,
"eval_steps_per_second": 2.785,
"step": 550
},
{
"epoch": 3.484848484848485,
"grad_norm": 14.45429515838623,
"learning_rate": 1.2336e-05,
"loss": 12.7355,
"step": 575
},
{
"epoch": 3.484848484848485,
"eval_loss": 0.8386228084564209,
"eval_runtime": 26.5967,
"eval_samples_per_second": 22.07,
"eval_steps_per_second": 2.782,
"step": 575
},
{
"epoch": 3.6363636363636362,
"grad_norm": 15.168094635009766,
"learning_rate": 1.1136e-05,
"loss": 13.0834,
"step": 600
},
{
"epoch": 3.6363636363636362,
"eval_loss": 0.8364977240562439,
"eval_runtime": 26.5442,
"eval_samples_per_second": 22.114,
"eval_steps_per_second": 2.788,
"step": 600
},
{
"epoch": 3.787878787878788,
"grad_norm": 16.040002822875977,
"learning_rate": 9.936e-06,
"loss": 13.1575,
"step": 625
},
{
"epoch": 3.787878787878788,
"eval_loss": 0.8345832824707031,
"eval_runtime": 26.5067,
"eval_samples_per_second": 22.145,
"eval_steps_per_second": 2.792,
"step": 625
},
{
"epoch": 3.9393939393939394,
"grad_norm": 16.534528732299805,
"learning_rate": 8.736e-06,
"loss": 12.8282,
"step": 650
},
{
"epoch": 3.9393939393939394,
"eval_loss": 0.832955539226532,
"eval_runtime": 26.6209,
"eval_samples_per_second": 22.05,
"eval_steps_per_second": 2.78,
"step": 650
},
{
"epoch": 4.090909090909091,
"grad_norm": 15.697587966918945,
"learning_rate": 7.5359999999999995e-06,
"loss": 12.707,
"step": 675
},
{
"epoch": 4.090909090909091,
"eval_loss": 0.8321042656898499,
"eval_runtime": 26.6706,
"eval_samples_per_second": 22.009,
"eval_steps_per_second": 2.775,
"step": 675
},
{
"epoch": 4.242424242424242,
"grad_norm": 16.229135513305664,
"learning_rate": 6.336e-06,
"loss": 12.7864,
"step": 700
},
{
"epoch": 4.242424242424242,
"eval_loss": 0.8310558795928955,
"eval_runtime": 26.5267,
"eval_samples_per_second": 22.129,
"eval_steps_per_second": 2.79,
"step": 700
},
{
"epoch": 4.393939393939394,
"grad_norm": 16.64604377746582,
"learning_rate": 5.136e-06,
"loss": 12.937,
"step": 725
},
{
"epoch": 4.393939393939394,
"eval_loss": 0.8296888470649719,
"eval_runtime": 26.5659,
"eval_samples_per_second": 22.096,
"eval_steps_per_second": 2.786,
"step": 725
},
{
"epoch": 4.545454545454545,
"grad_norm": 16.23679542541504,
"learning_rate": 3.936e-06,
"loss": 12.8632,
"step": 750
},
{
"epoch": 4.545454545454545,
"eval_loss": 0.828894317150116,
"eval_runtime": 26.6204,
"eval_samples_per_second": 22.051,
"eval_steps_per_second": 2.78,
"step": 750
},
{
"epoch": 4.696969696969697,
"grad_norm": 16.246938705444336,
"learning_rate": 2.736e-06,
"loss": 13.111,
"step": 775
},
{
"epoch": 4.696969696969697,
"eval_loss": 0.828315794467926,
"eval_runtime": 26.5355,
"eval_samples_per_second": 22.121,
"eval_steps_per_second": 2.789,
"step": 775
},
{
"epoch": 4.848484848484849,
"grad_norm": 17.31324577331543,
"learning_rate": 1.5360000000000002e-06,
"loss": 13.0132,
"step": 800
},
{
"epoch": 4.848484848484849,
"eval_loss": 0.8280864357948303,
"eval_runtime": 26.4945,
"eval_samples_per_second": 22.156,
"eval_steps_per_second": 2.793,
"step": 800
}
],
"logging_steps": 25,
"max_steps": 825,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.596536164371661e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}