phi2_pt_qa_1_v5 / trainer_state.json
vsvasconcelos's picture
Upload 8 files
d56a33e verified
{
"best_metric": 0.8981696963310242,
"best_model_checkpoint": "./Phi-2_PT_QA_1_v5/checkpoint-4100",
"epoch": 0.7496286138727003,
"eval_steps": 100,
"global_step": 4100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018283624728602444,
"grad_norm": 361.1343994140625,
"learning_rate": 1e-05,
"loss": 7.4107,
"step": 100
},
{
"epoch": 0.018283624728602444,
"eval_loss": 7.992985248565674,
"eval_runtime": 189.157,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 100
},
{
"epoch": 0.03656724945720489,
"grad_norm": 462.70465087890625,
"learning_rate": 2e-05,
"loss": 6.869,
"step": 200
},
{
"epoch": 0.03656724945720489,
"eval_loss": 2.2097244262695312,
"eval_runtime": 189.1384,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.169,
"step": 200
},
{
"epoch": 0.054850874185807336,
"grad_norm": 45.288211822509766,
"learning_rate": 3e-05,
"loss": 1.3255,
"step": 300
},
{
"epoch": 0.054850874185807336,
"eval_loss": 1.1541022062301636,
"eval_runtime": 188.9446,
"eval_samples_per_second": 2.646,
"eval_steps_per_second": 0.169,
"step": 300
},
{
"epoch": 0.07313449891440978,
"grad_norm": 157.0716094970703,
"learning_rate": 4e-05,
"loss": 1.1832,
"step": 400
},
{
"epoch": 0.07313449891440978,
"eval_loss": 1.1192466020584106,
"eval_runtime": 189.0814,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.169,
"step": 400
},
{
"epoch": 0.09141812364301223,
"grad_norm": 26.455596923828125,
"learning_rate": 5e-05,
"loss": 1.1759,
"step": 500
},
{
"epoch": 0.09141812364301223,
"eval_loss": 1.0633209943771362,
"eval_runtime": 188.7155,
"eval_samples_per_second": 2.649,
"eval_steps_per_second": 0.17,
"step": 500
},
{
"epoch": 0.10970174837161467,
"grad_norm": 34.8622932434082,
"learning_rate": 4.9950071057370804e-05,
"loss": 1.1282,
"step": 600
},
{
"epoch": 0.10970174837161467,
"eval_loss": 1.0417226552963257,
"eval_runtime": 189.0917,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.169,
"step": 600
},
{
"epoch": 0.12798537310021713,
"grad_norm": 25.281723022460938,
"learning_rate": 4.9800483661428186e-05,
"loss": 1.1376,
"step": 700
},
{
"epoch": 0.12798537310021713,
"eval_loss": 1.0301182270050049,
"eval_runtime": 189.2584,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 700
},
{
"epoch": 0.14626899782881955,
"grad_norm": 22.426198959350586,
"learning_rate": 4.9551835311412955e-05,
"loss": 1.1079,
"step": 800
},
{
"epoch": 0.14626899782881955,
"eval_loss": 1.0073903799057007,
"eval_runtime": 189.1597,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 800
},
{
"epoch": 0.164552622557422,
"grad_norm": 35.90736389160156,
"learning_rate": 4.920511918726131e-05,
"loss": 1.0631,
"step": 900
},
{
"epoch": 0.164552622557422,
"eval_loss": 0.9627833366394043,
"eval_runtime": 189.137,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.169,
"step": 900
},
{
"epoch": 0.18283624728602446,
"grad_norm": 23.095998764038086,
"learning_rate": 4.876172018253099e-05,
"loss": 1.0368,
"step": 1000
},
{
"epoch": 0.18283624728602446,
"eval_loss": 0.9648867249488831,
"eval_runtime": 188.6889,
"eval_samples_per_second": 2.65,
"eval_steps_per_second": 0.17,
"step": 1000
},
{
"epoch": 0.2011198720146269,
"grad_norm": 29.256031036376953,
"learning_rate": 4.82234093726995e-05,
"loss": 1.0427,
"step": 1100
},
{
"epoch": 0.2011198720146269,
"eval_loss": 0.9637861847877502,
"eval_runtime": 188.6111,
"eval_samples_per_second": 2.651,
"eval_steps_per_second": 0.17,
"step": 1100
},
{
"epoch": 0.21940349674322934,
"grad_norm": 37.598472595214844,
"learning_rate": 4.75923369409301e-05,
"loss": 1.0326,
"step": 1200
},
{
"epoch": 0.21940349674322934,
"eval_loss": 0.9529294967651367,
"eval_runtime": 188.8671,
"eval_samples_per_second": 2.647,
"eval_steps_per_second": 0.169,
"step": 1200
},
{
"epoch": 0.2376871214718318,
"grad_norm": 31.624174118041992,
"learning_rate": 4.6871023589562045e-05,
"loss": 1.0522,
"step": 1300
},
{
"epoch": 0.2376871214718318,
"eval_loss": 0.9683192372322083,
"eval_runtime": 189.2334,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 1300
},
{
"epoch": 0.25597074620043425,
"grad_norm": 20.714860916137695,
"learning_rate": 4.60623504716304e-05,
"loss": 1.1055,
"step": 1400
},
{
"epoch": 0.25597074620043425,
"eval_loss": 1.0250566005706787,
"eval_runtime": 189.0522,
"eval_samples_per_second": 2.645,
"eval_steps_per_second": 0.169,
"step": 1400
},
{
"epoch": 0.2742543709290367,
"grad_norm": 13.228219032287598,
"learning_rate": 4.516954768263203e-05,
"loss": 1.1199,
"step": 1500
},
{
"epoch": 0.2742543709290367,
"eval_loss": 0.9792861342430115,
"eval_runtime": 189.1669,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 1500
},
{
"epoch": 0.2925379956576391,
"grad_norm": 13.624972343444824,
"learning_rate": 4.419618135850542e-05,
"loss": 1.0705,
"step": 1600
},
{
"epoch": 0.2925379956576391,
"eval_loss": 0.975660502910614,
"eval_runtime": 189.2814,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 1600
},
{
"epoch": 0.31082162038624156,
"grad_norm": 28.702505111694336,
"learning_rate": 4.314613943135893e-05,
"loss": 1.0594,
"step": 1700
},
{
"epoch": 0.31082162038624156,
"eval_loss": 0.9642728567123413,
"eval_runtime": 189.2767,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 1700
},
{
"epoch": 0.329105245114844,
"grad_norm": 11.370965957641602,
"learning_rate": 4.2023616099843676e-05,
"loss": 1.0422,
"step": 1800
},
{
"epoch": 0.329105245114844,
"eval_loss": 0.9481159448623657,
"eval_runtime": 189.186,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 1800
},
{
"epoch": 0.34738886984344647,
"grad_norm": 20.44642448425293,
"learning_rate": 4.083309507620118e-05,
"loss": 1.0466,
"step": 1900
},
{
"epoch": 0.34738886984344647,
"eval_loss": 0.9562661051750183,
"eval_runtime": 189.281,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 1900
},
{
"epoch": 0.3656724945720489,
"grad_norm": 10.944358825683594,
"learning_rate": 3.95793316769025e-05,
"loss": 1.0369,
"step": 2000
},
{
"epoch": 0.3656724945720489,
"eval_loss": 0.9367031455039978,
"eval_runtime": 189.225,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 2000
},
{
"epoch": 0.3839561193006514,
"grad_norm": 11.027617454528809,
"learning_rate": 3.8267333828414366e-05,
"loss": 1.0167,
"step": 2100
},
{
"epoch": 0.3839561193006514,
"eval_loss": 0.9452427625656128,
"eval_runtime": 189.3516,
"eval_samples_per_second": 2.641,
"eval_steps_per_second": 0.169,
"step": 2100
},
{
"epoch": 0.4022397440292538,
"grad_norm": 13.287491798400879,
"learning_rate": 3.690234206396134e-05,
"loss": 1.0456,
"step": 2200
},
{
"epoch": 0.4022397440292538,
"eval_loss": 0.9392285943031311,
"eval_runtime": 189.33,
"eval_samples_per_second": 2.641,
"eval_steps_per_second": 0.169,
"step": 2200
},
{
"epoch": 0.42052336875785623,
"grad_norm": 11.296989440917969,
"learning_rate": 3.5489808591183144e-05,
"loss": 1.0243,
"step": 2300
},
{
"epoch": 0.42052336875785623,
"eval_loss": 0.926450252532959,
"eval_runtime": 189.2591,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 2300
},
{
"epoch": 0.4388069934864587,
"grad_norm": 8.259175300598145,
"learning_rate": 3.403537551429771e-05,
"loss": 0.9798,
"step": 2400
},
{
"epoch": 0.4388069934864587,
"eval_loss": 0.9248631596565247,
"eval_runtime": 189.2646,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 2400
},
{
"epoch": 0.45709061821506114,
"grad_norm": 8.901100158691406,
"learning_rate": 3.254485229775735e-05,
"loss": 1.0033,
"step": 2500
},
{
"epoch": 0.45709061821506114,
"eval_loss": 0.9127222895622253,
"eval_runtime": 189.2237,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 2500
},
{
"epoch": 0.4753742429436636,
"grad_norm": 10.157410621643066,
"learning_rate": 3.102419256141536e-05,
"loss": 1.0023,
"step": 2600
},
{
"epoch": 0.4753742429436636,
"eval_loss": 0.9139639735221863,
"eval_runtime": 189.3397,
"eval_samples_per_second": 2.641,
"eval_steps_per_second": 0.169,
"step": 2600
},
{
"epoch": 0.49365786767226605,
"grad_norm": 14.876816749572754,
"learning_rate": 2.9479470299890476e-05,
"loss": 1.0095,
"step": 2700
},
{
"epoch": 0.49365786767226605,
"eval_loss": 0.9182960987091064,
"eval_runtime": 189.2612,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 2700
},
{
"epoch": 0.5119414924008685,
"grad_norm": 10.439401626586914,
"learning_rate": 2.79168556211166e-05,
"loss": 0.983,
"step": 2800
},
{
"epoch": 0.5119414924008685,
"eval_loss": 0.9220383763313293,
"eval_runtime": 189.1245,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.169,
"step": 2800
},
{
"epoch": 0.530225117129471,
"grad_norm": 10.412522315979004,
"learning_rate": 2.6342590100985565e-05,
"loss": 1.0071,
"step": 2900
},
{
"epoch": 0.530225117129471,
"eval_loss": 0.9112712144851685,
"eval_runtime": 189.2472,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.169,
"step": 2900
},
{
"epoch": 0.5485087418580734,
"grad_norm": 7.356208801269531,
"learning_rate": 2.476296185252441e-05,
"loss": 0.979,
"step": 3000
},
{
"epoch": 0.5485087418580734,
"eval_loss": 0.9097906351089478,
"eval_runtime": 189.1646,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 3000
},
{
"epoch": 0.5667923665866758,
"grad_norm": 15.310144424438477,
"learning_rate": 2.318428040918855e-05,
"loss": 0.969,
"step": 3100
},
{
"epoch": 0.5667923665866758,
"eval_loss": 0.9059516787528992,
"eval_runtime": 189.1543,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 3100
},
{
"epoch": 0.5850759913152782,
"grad_norm": 12.159485816955566,
"learning_rate": 2.1612851522595114e-05,
"loss": 0.9923,
"step": 3200
},
{
"epoch": 0.5850759913152782,
"eval_loss": 0.9071580171585083,
"eval_runtime": 189.0679,
"eval_samples_per_second": 2.645,
"eval_steps_per_second": 0.169,
"step": 3200
},
{
"epoch": 0.6033596160438807,
"grad_norm": 7.6948676109313965,
"learning_rate": 2.0054951975362067e-05,
"loss": 0.9819,
"step": 3300
},
{
"epoch": 0.6033596160438807,
"eval_loss": 0.9018483757972717,
"eval_runtime": 189.0913,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.169,
"step": 3300
},
{
"epoch": 0.6216432407724831,
"grad_norm": 8.831514358520508,
"learning_rate": 1.8516804509658687e-05,
"loss": 0.968,
"step": 3400
},
{
"epoch": 0.6216432407724831,
"eval_loss": 0.9070228338241577,
"eval_runtime": 188.8425,
"eval_samples_per_second": 2.648,
"eval_steps_per_second": 0.169,
"step": 3400
},
{
"epoch": 0.6399268655010856,
"grad_norm": 13.450331687927246,
"learning_rate": 1.7004552971610604e-05,
"loss": 0.9681,
"step": 3500
},
{
"epoch": 0.6399268655010856,
"eval_loss": 0.9079869389533997,
"eval_runtime": 188.9533,
"eval_samples_per_second": 2.646,
"eval_steps_per_second": 0.169,
"step": 3500
},
{
"epoch": 0.658210490229688,
"grad_norm": 17.26088523864746,
"learning_rate": 1.552423777084053e-05,
"loss": 0.978,
"step": 3600
},
{
"epoch": 0.658210490229688,
"eval_loss": 0.9043178558349609,
"eval_runtime": 189.1442,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 3600
},
{
"epoch": 0.6764941149582905,
"grad_norm": 10.180910110473633,
"learning_rate": 1.4081771753167055e-05,
"loss": 0.9892,
"step": 3700
},
{
"epoch": 0.6764941149582905,
"eval_loss": 0.9008078575134277,
"eval_runtime": 189.3709,
"eval_samples_per_second": 2.64,
"eval_steps_per_second": 0.169,
"step": 3700
},
{
"epoch": 0.6947777396868929,
"grad_norm": 7.841503620147705,
"learning_rate": 1.2682916582833473e-05,
"loss": 0.9588,
"step": 3800
},
{
"epoch": 0.6947777396868929,
"eval_loss": 0.9007475972175598,
"eval_runtime": 189.4241,
"eval_samples_per_second": 2.64,
"eval_steps_per_second": 0.169,
"step": 3800
},
{
"epoch": 0.7130613644154954,
"grad_norm": 10.395474433898926,
"learning_rate": 1.133325972860347e-05,
"loss": 0.9645,
"step": 3900
},
{
"epoch": 0.7130613644154954,
"eval_loss": 0.9001818895339966,
"eval_runtime": 189.3239,
"eval_samples_per_second": 2.641,
"eval_steps_per_second": 0.169,
"step": 3900
},
{
"epoch": 0.7313449891440978,
"grad_norm": 8.191793441772461,
"learning_rate": 1.0038192145648567e-05,
"loss": 0.9627,
"step": 4000
},
{
"epoch": 0.7313449891440978,
"eval_loss": 0.8982793092727661,
"eval_runtime": 189.1681,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 4000
},
{
"epoch": 0.7496286138727003,
"grad_norm": 11.999282836914062,
"learning_rate": 8.802886742372774e-06,
"loss": 0.9633,
"step": 4100
},
{
"epoch": 0.7496286138727003,
"eval_loss": 0.8981696963310242,
"eval_runtime": 189.2147,
"eval_samples_per_second": 2.643,
"eval_steps_per_second": 0.169,
"step": 4100
}
],
"logging_steps": 100,
"max_steps": 5470,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"total_flos": 8.406940484736e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}