{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984810126582279,
"eval_steps": 500,
"global_step": 493,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020253164556962026,
"grad_norm": 2.882945391057014,
"learning_rate": 5e-06,
"loss": 0.7569,
"step": 10
},
{
"epoch": 0.04050632911392405,
"grad_norm": 2.215930707747834,
"learning_rate": 5e-06,
"loss": 0.6506,
"step": 20
},
{
"epoch": 0.060759493670886074,
"grad_norm": 1.894429021053666,
"learning_rate": 5e-06,
"loss": 0.6288,
"step": 30
},
{
"epoch": 0.0810126582278481,
"grad_norm": 1.4315710417507135,
"learning_rate": 5e-06,
"loss": 0.6136,
"step": 40
},
{
"epoch": 0.10126582278481013,
"grad_norm": 1.7157625882254233,
"learning_rate": 5e-06,
"loss": 0.6033,
"step": 50
},
{
"epoch": 0.12151898734177215,
"grad_norm": 1.4217599360047826,
"learning_rate": 5e-06,
"loss": 0.5971,
"step": 60
},
{
"epoch": 0.14177215189873418,
"grad_norm": 1.4919608736087413,
"learning_rate": 5e-06,
"loss": 0.5928,
"step": 70
},
{
"epoch": 0.1620253164556962,
"grad_norm": 1.3847069011356332,
"learning_rate": 5e-06,
"loss": 0.5879,
"step": 80
},
{
"epoch": 0.18227848101265823,
"grad_norm": 1.8689825245661746,
"learning_rate": 5e-06,
"loss": 0.5865,
"step": 90
},
{
"epoch": 0.20253164556962025,
"grad_norm": 1.6007319042790646,
"learning_rate": 5e-06,
"loss": 0.5891,
"step": 100
},
{
"epoch": 0.22278481012658227,
"grad_norm": 1.6661187893059985,
"learning_rate": 5e-06,
"loss": 0.5817,
"step": 110
},
{
"epoch": 0.2430379746835443,
"grad_norm": 1.5088824762043296,
"learning_rate": 5e-06,
"loss": 0.579,
"step": 120
},
{
"epoch": 0.26329113924050634,
"grad_norm": 2.4968134853778117,
"learning_rate": 5e-06,
"loss": 0.578,
"step": 130
},
{
"epoch": 0.28354430379746837,
"grad_norm": 1.7148820016172805,
"learning_rate": 5e-06,
"loss": 0.5678,
"step": 140
},
{
"epoch": 0.3037974683544304,
"grad_norm": 3.7110251410070583,
"learning_rate": 5e-06,
"loss": 0.5781,
"step": 150
},
{
"epoch": 0.3240506329113924,
"grad_norm": 2.7150715560626195,
"learning_rate": 5e-06,
"loss": 0.5763,
"step": 160
},
{
"epoch": 0.34430379746835443,
"grad_norm": 1.2312010583367314,
"learning_rate": 5e-06,
"loss": 0.5751,
"step": 170
},
{
"epoch": 0.36455696202531646,
"grad_norm": 1.9963116161483436,
"learning_rate": 5e-06,
"loss": 0.5706,
"step": 180
},
{
"epoch": 0.3848101265822785,
"grad_norm": 1.3042779714841761,
"learning_rate": 5e-06,
"loss": 0.5716,
"step": 190
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.4634415468631845,
"learning_rate": 5e-06,
"loss": 0.5675,
"step": 200
},
{
"epoch": 0.4253164556962025,
"grad_norm": 1.4367866258568494,
"learning_rate": 5e-06,
"loss": 0.5632,
"step": 210
},
{
"epoch": 0.44556962025316454,
"grad_norm": 1.9787155615814422,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 220
},
{
"epoch": 0.46582278481012657,
"grad_norm": 1.554761610864969,
"learning_rate": 5e-06,
"loss": 0.5679,
"step": 230
},
{
"epoch": 0.4860759493670886,
"grad_norm": 1.4050684281133805,
"learning_rate": 5e-06,
"loss": 0.5634,
"step": 240
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2.389886434286958,
"learning_rate": 5e-06,
"loss": 0.5679,
"step": 250
},
{
"epoch": 0.5265822784810127,
"grad_norm": 1.9930786400945644,
"learning_rate": 5e-06,
"loss": 0.5549,
"step": 260
},
{
"epoch": 0.5468354430379747,
"grad_norm": 2.241950368987,
"learning_rate": 5e-06,
"loss": 0.5627,
"step": 270
},
{
"epoch": 0.5670886075949367,
"grad_norm": 1.2762960468034965,
"learning_rate": 5e-06,
"loss": 0.557,
"step": 280
},
{
"epoch": 0.5873417721518988,
"grad_norm": 1.9147658341187483,
"learning_rate": 5e-06,
"loss": 0.558,
"step": 290
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.8505958524714776,
"learning_rate": 5e-06,
"loss": 0.5571,
"step": 300
},
{
"epoch": 0.6278481012658228,
"grad_norm": 1.9130752385076857,
"learning_rate": 5e-06,
"loss": 0.5619,
"step": 310
},
{
"epoch": 0.6481012658227848,
"grad_norm": 1.3375681147091247,
"learning_rate": 5e-06,
"loss": 0.5589,
"step": 320
},
{
"epoch": 0.6683544303797468,
"grad_norm": 1.7350301588890396,
"learning_rate": 5e-06,
"loss": 0.559,
"step": 330
},
{
"epoch": 0.6886075949367089,
"grad_norm": 1.3739045710832587,
"learning_rate": 5e-06,
"loss": 0.5577,
"step": 340
},
{
"epoch": 0.7088607594936709,
"grad_norm": 1.2476279822426537,
"learning_rate": 5e-06,
"loss": 0.5592,
"step": 350
},
{
"epoch": 0.7291139240506329,
"grad_norm": 1.5513351657124166,
"learning_rate": 5e-06,
"loss": 0.5624,
"step": 360
},
{
"epoch": 0.7493670886075949,
"grad_norm": 1.4242614435377774,
"learning_rate": 5e-06,
"loss": 0.5553,
"step": 370
},
{
"epoch": 0.769620253164557,
"grad_norm": 1.154791151204361,
"learning_rate": 5e-06,
"loss": 0.5523,
"step": 380
},
{
"epoch": 0.789873417721519,
"grad_norm": 1.2406846291234284,
"learning_rate": 5e-06,
"loss": 0.5564,
"step": 390
},
{
"epoch": 0.810126582278481,
"grad_norm": 1.3155697193277984,
"learning_rate": 5e-06,
"loss": 0.5572,
"step": 400
},
{
"epoch": 0.830379746835443,
"grad_norm": 1.290691328404162,
"learning_rate": 5e-06,
"loss": 0.5516,
"step": 410
},
{
"epoch": 0.850632911392405,
"grad_norm": 1.2175694217624276,
"learning_rate": 5e-06,
"loss": 0.5592,
"step": 420
},
{
"epoch": 0.8708860759493671,
"grad_norm": 1.201899124438492,
"learning_rate": 5e-06,
"loss": 0.5526,
"step": 430
},
{
"epoch": 0.8911392405063291,
"grad_norm": 1.182820314526854,
"learning_rate": 5e-06,
"loss": 0.5577,
"step": 440
},
{
"epoch": 0.9113924050632911,
"grad_norm": 1.4381251643237962,
"learning_rate": 5e-06,
"loss": 0.5494,
"step": 450
},
{
"epoch": 0.9316455696202531,
"grad_norm": 1.0475436039121593,
"learning_rate": 5e-06,
"loss": 0.5539,
"step": 460
},
{
"epoch": 0.9518987341772152,
"grad_norm": 1.3740300354445663,
"learning_rate": 5e-06,
"loss": 0.5549,
"step": 470
},
{
"epoch": 0.9721518987341772,
"grad_norm": 1.0792354520292615,
"learning_rate": 5e-06,
"loss": 0.5522,
"step": 480
},
{
"epoch": 0.9924050632911392,
"grad_norm": 1.1522113303559152,
"learning_rate": 5e-06,
"loss": 0.549,
"step": 490
},
{
"epoch": 0.9984810126582279,
"eval_loss": 0.0689920112490654,
"eval_runtime": 505.0744,
"eval_samples_per_second": 26.341,
"eval_steps_per_second": 0.412,
"step": 493
},
{
"epoch": 0.9984810126582279,
"step": 493,
"total_flos": 825583982346240.0,
"train_loss": 0.5740175109364198,
"train_runtime": 27939.3141,
"train_samples_per_second": 9.047,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 493,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 825583982346240.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}