gpt2_m080_tiny-stories_1024_dpos / trainer_state.json
jonasknobloch's picture
Upload folder using huggingface_hub
c107b8a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 19173,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02607833933135138,
"grad_norm": 1.453355312347412,
"learning_rate": 4.8696083033432434e-05,
"loss": 4.3635,
"step": 500
},
{
"epoch": 0.05215667866270276,
"grad_norm": 2.793958902359009,
"learning_rate": 4.7392166066864866e-05,
"loss": 2.8808,
"step": 1000
},
{
"epoch": 0.05215667866270276,
"eval_accuracy": 0.45157249686137624,
"eval_loss": 2.42598557472229,
"eval_runtime": 55.0824,
"eval_samples_per_second": 111.996,
"eval_steps_per_second": 3.504,
"step": 1000
},
{
"epoch": 0.07823501799405413,
"grad_norm": 1.6672502756118774,
"learning_rate": 4.608824910029729e-05,
"loss": 2.2617,
"step": 1500
},
{
"epoch": 0.10431335732540552,
"grad_norm": 1.5480303764343262,
"learning_rate": 4.478433213372973e-05,
"loss": 1.9542,
"step": 2000
},
{
"epoch": 0.10431335732540552,
"eval_accuracy": 0.5731481168970384,
"eval_loss": 1.7747852802276611,
"eval_runtime": 54.447,
"eval_samples_per_second": 113.303,
"eval_steps_per_second": 3.545,
"step": 2000
},
{
"epoch": 0.1303916966567569,
"grad_norm": 1.4495408535003662,
"learning_rate": 4.348041516716216e-05,
"loss": 1.8025,
"step": 2500
},
{
"epoch": 0.15647003598810827,
"grad_norm": 1.5346728563308716,
"learning_rate": 4.2176498200594586e-05,
"loss": 1.7122,
"step": 3000
},
{
"epoch": 0.15647003598810827,
"eval_accuracy": 0.6041759264585153,
"eval_loss": 1.601155400276184,
"eval_runtime": 54.5006,
"eval_samples_per_second": 113.191,
"eval_steps_per_second": 3.541,
"step": 3000
},
{
"epoch": 0.18254837531945967,
"grad_norm": 1.4614065885543823,
"learning_rate": 4.087258123402702e-05,
"loss": 1.6441,
"step": 3500
},
{
"epoch": 0.20862671465081103,
"grad_norm": 1.3059850931167603,
"learning_rate": 3.956866426745945e-05,
"loss": 1.5962,
"step": 4000
},
{
"epoch": 0.20862671465081103,
"eval_accuracy": 0.6221138169642397,
"eval_loss": 1.500780701637268,
"eval_runtime": 54.6417,
"eval_samples_per_second": 112.899,
"eval_steps_per_second": 3.532,
"step": 4000
},
{
"epoch": 0.2347050539821624,
"grad_norm": 1.243977427482605,
"learning_rate": 3.826474730089188e-05,
"loss": 1.5564,
"step": 4500
},
{
"epoch": 0.2607833933135138,
"grad_norm": 1.3383909463882446,
"learning_rate": 3.696083033432431e-05,
"loss": 1.5228,
"step": 5000
},
{
"epoch": 0.2607833933135138,
"eval_accuracy": 0.6330674277641162,
"eval_loss": 1.4412604570388794,
"eval_runtime": 54.0686,
"eval_samples_per_second": 114.096,
"eval_steps_per_second": 3.57,
"step": 5000
},
{
"epoch": 0.2868617326448652,
"grad_norm": 1.200243592262268,
"learning_rate": 3.5656913367756745e-05,
"loss": 1.495,
"step": 5500
},
{
"epoch": 0.31294007197621654,
"grad_norm": 1.1654722690582275,
"learning_rate": 3.435299640118918e-05,
"loss": 1.4706,
"step": 6000
},
{
"epoch": 0.31294007197621654,
"eval_accuracy": 0.6413079492629166,
"eval_loss": 1.3981218338012695,
"eval_runtime": 53.8081,
"eval_samples_per_second": 114.648,
"eval_steps_per_second": 3.587,
"step": 6000
},
{
"epoch": 0.33901841130756794,
"grad_norm": 1.2035291194915771,
"learning_rate": 3.30490794346216e-05,
"loss": 1.4546,
"step": 6500
},
{
"epoch": 0.36509675063891933,
"grad_norm": 1.1253894567489624,
"learning_rate": 3.1745162468054033e-05,
"loss": 1.4342,
"step": 7000
},
{
"epoch": 0.36509675063891933,
"eval_accuracy": 0.6485064619284103,
"eval_loss": 1.3601430654525757,
"eval_runtime": 54.0475,
"eval_samples_per_second": 114.14,
"eval_steps_per_second": 3.571,
"step": 7000
},
{
"epoch": 0.3911750899702707,
"grad_norm": 1.087329626083374,
"learning_rate": 3.0441245501486465e-05,
"loss": 1.4142,
"step": 7500
},
{
"epoch": 0.41725342930162207,
"grad_norm": 1.2092727422714233,
"learning_rate": 2.9137328534918894e-05,
"loss": 1.4,
"step": 8000
},
{
"epoch": 0.41725342930162207,
"eval_accuracy": 0.6541551132194254,
"eval_loss": 1.3316493034362793,
"eval_runtime": 53.9118,
"eval_samples_per_second": 114.428,
"eval_steps_per_second": 3.58,
"step": 8000
},
{
"epoch": 0.44333176863297347,
"grad_norm": 1.1115912199020386,
"learning_rate": 2.7833411568351332e-05,
"loss": 1.3875,
"step": 8500
},
{
"epoch": 0.4694101079643248,
"grad_norm": 1.1426430940628052,
"learning_rate": 2.652949460178376e-05,
"loss": 1.3759,
"step": 9000
},
{
"epoch": 0.4694101079643248,
"eval_accuracy": 0.6584719390475539,
"eval_loss": 1.3087505102157593,
"eval_runtime": 53.1728,
"eval_samples_per_second": 116.018,
"eval_steps_per_second": 3.63,
"step": 9000
},
{
"epoch": 0.4954884472956762,
"grad_norm": 1.11874520778656,
"learning_rate": 2.5225577635216192e-05,
"loss": 1.3662,
"step": 9500
},
{
"epoch": 0.5215667866270276,
"grad_norm": 1.1779828071594238,
"learning_rate": 2.392166066864862e-05,
"loss": 1.3551,
"step": 10000
},
{
"epoch": 0.5215667866270276,
"eval_accuracy": 0.6621520873373268,
"eval_loss": 1.2907606363296509,
"eval_runtime": 52.8312,
"eval_samples_per_second": 116.768,
"eval_steps_per_second": 3.653,
"step": 10000
},
{
"epoch": 0.547645125958379,
"grad_norm": 1.0862423181533813,
"learning_rate": 2.2617743702081052e-05,
"loss": 1.3463,
"step": 10500
},
{
"epoch": 0.5737234652897304,
"grad_norm": 1.142232894897461,
"learning_rate": 2.1313826735513484e-05,
"loss": 1.3322,
"step": 11000
},
{
"epoch": 0.5737234652897304,
"eval_accuracy": 0.6656504862153292,
"eval_loss": 1.273500919342041,
"eval_runtime": 52.8062,
"eval_samples_per_second": 116.823,
"eval_steps_per_second": 3.655,
"step": 11000
},
{
"epoch": 0.5998018046210817,
"grad_norm": 1.1338053941726685,
"learning_rate": 2.0009909768945913e-05,
"loss": 1.3289,
"step": 11500
},
{
"epoch": 0.6258801439524331,
"grad_norm": 1.1444751024246216,
"learning_rate": 1.8705992802378344e-05,
"loss": 1.3179,
"step": 12000
},
{
"epoch": 0.6258801439524331,
"eval_accuracy": 0.66850412628209,
"eval_loss": 1.2587136030197144,
"eval_runtime": 52.8166,
"eval_samples_per_second": 116.8,
"eval_steps_per_second": 3.654,
"step": 12000
},
{
"epoch": 0.6519584832837845,
"grad_norm": 1.0998445749282837,
"learning_rate": 1.7402075835810776e-05,
"loss": 1.3157,
"step": 12500
},
{
"epoch": 0.6780368226151359,
"grad_norm": 1.1154826879501343,
"learning_rate": 1.6098158869243208e-05,
"loss": 1.3075,
"step": 13000
},
{
"epoch": 0.6780368226151359,
"eval_accuracy": 0.6710853799156917,
"eval_loss": 1.2457832098007202,
"eval_runtime": 53.0417,
"eval_samples_per_second": 116.305,
"eval_steps_per_second": 3.639,
"step": 13000
},
{
"epoch": 0.7041151619464873,
"grad_norm": 1.135167121887207,
"learning_rate": 1.4794241902675638e-05,
"loss": 1.3038,
"step": 13500
},
{
"epoch": 0.7301935012778387,
"grad_norm": 1.1100085973739624,
"learning_rate": 1.349032493610807e-05,
"loss": 1.2997,
"step": 14000
},
{
"epoch": 0.7301935012778387,
"eval_accuracy": 0.6729637212645385,
"eval_loss": 1.236211895942688,
"eval_runtime": 52.7336,
"eval_samples_per_second": 116.984,
"eval_steps_per_second": 3.66,
"step": 14000
},
{
"epoch": 0.7562718406091901,
"grad_norm": 1.1251678466796875,
"learning_rate": 1.21864079695405e-05,
"loss": 1.2897,
"step": 14500
},
{
"epoch": 0.7823501799405413,
"grad_norm": 1.146391749382019,
"learning_rate": 1.0882491002972931e-05,
"loss": 1.2869,
"step": 15000
},
{
"epoch": 0.7823501799405413,
"eval_accuracy": 0.6746791695050157,
"eval_loss": 1.2276582717895508,
"eval_runtime": 52.7689,
"eval_samples_per_second": 116.906,
"eval_steps_per_second": 3.657,
"step": 15000
},
{
"epoch": 0.8084285192718927,
"grad_norm": 1.096614122390747,
"learning_rate": 9.578574036405362e-06,
"loss": 1.2794,
"step": 15500
},
{
"epoch": 0.8345068586032441,
"grad_norm": 1.1054586172103882,
"learning_rate": 8.274657069837793e-06,
"loss": 1.2766,
"step": 16000
},
{
"epoch": 0.8345068586032441,
"eval_accuracy": 0.676915463705815,
"eval_loss": 1.2177897691726685,
"eval_runtime": 53.1333,
"eval_samples_per_second": 116.104,
"eval_steps_per_second": 3.632,
"step": 16000
},
{
"epoch": 0.8605851979345955,
"grad_norm": 1.099358320236206,
"learning_rate": 6.970740103270223e-06,
"loss": 1.2751,
"step": 16500
},
{
"epoch": 0.8866635372659469,
"grad_norm": 1.1102577447891235,
"learning_rate": 5.666823136702655e-06,
"loss": 1.271,
"step": 17000
},
{
"epoch": 0.8866635372659469,
"eval_accuracy": 0.6781523738263734,
"eval_loss": 1.2116661071777344,
"eval_runtime": 53.2421,
"eval_samples_per_second": 115.867,
"eval_steps_per_second": 3.625,
"step": 17000
},
{
"epoch": 0.9127418765972983,
"grad_norm": 1.1219637393951416,
"learning_rate": 4.362906170135086e-06,
"loss": 1.2648,
"step": 17500
},
{
"epoch": 0.9388202159286496,
"grad_norm": 1.1664655208587646,
"learning_rate": 3.058989203567517e-06,
"loss": 1.2624,
"step": 18000
},
{
"epoch": 0.9388202159286496,
"eval_accuracy": 0.6793829456936877,
"eval_loss": 1.2054765224456787,
"eval_runtime": 54.1549,
"eval_samples_per_second": 113.914,
"eval_steps_per_second": 3.564,
"step": 18000
},
{
"epoch": 0.964898555260001,
"grad_norm": 1.1197658777236938,
"learning_rate": 1.7550722369999478e-06,
"loss": 1.2625,
"step": 18500
},
{
"epoch": 0.9909768945913524,
"grad_norm": 1.1014546155929565,
"learning_rate": 4.511552704323789e-07,
"loss": 1.2593,
"step": 19000
},
{
"epoch": 0.9909768945913524,
"eval_accuracy": 0.6801362470917321,
"eval_loss": 1.202091932296753,
"eval_runtime": 52.9233,
"eval_samples_per_second": 116.565,
"eval_steps_per_second": 3.647,
"step": 19000
},
{
"epoch": 1.0,
"step": 19173,
"total_flos": 3.20619433033728e+17,
"train_loss": 1.5428574307948617,
"train_runtime": 7427.5337,
"train_samples_per_second": 82.602,
"train_steps_per_second": 2.581
}
],
"logging_steps": 500,
"max_steps": 19173,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.20619433033728e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}