gpt2_m050_tiny-stories_1024_dpos / trainer_state.json
jonasknobloch's picture
Upload folder using huggingface_hub
f305ed8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 19061,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026231572320444888,
"grad_norm": 1.3818440437316895,
"learning_rate": 4.868842138397776e-05,
"loss": 4.3761,
"step": 500
},
{
"epoch": 0.052463144640889775,
"grad_norm": 2.8775651454925537,
"learning_rate": 4.7376842767955515e-05,
"loss": 2.9069,
"step": 1000
},
{
"epoch": 0.052463144640889775,
"eval_accuracy": 0.44778331183560627,
"eval_loss": 2.44579815864563,
"eval_runtime": 53.4802,
"eval_samples_per_second": 114.697,
"eval_steps_per_second": 3.59,
"step": 1000
},
{
"epoch": 0.07869471696133466,
"grad_norm": 2.087350368499756,
"learning_rate": 4.606526415193327e-05,
"loss": 2.2789,
"step": 1500
},
{
"epoch": 0.10492628928177955,
"grad_norm": 1.8893406391143799,
"learning_rate": 4.475368553591103e-05,
"loss": 1.9731,
"step": 2000
},
{
"epoch": 0.10492628928177955,
"eval_accuracy": 0.5686255574030745,
"eval_loss": 1.793091058731079,
"eval_runtime": 53.0446,
"eval_samples_per_second": 115.639,
"eval_steps_per_second": 3.62,
"step": 2000
},
{
"epoch": 0.13115786160222442,
"grad_norm": 1.4629850387573242,
"learning_rate": 4.3442106919888784e-05,
"loss": 1.8193,
"step": 2500
},
{
"epoch": 0.15738943392266933,
"grad_norm": 1.3035223484039307,
"learning_rate": 4.213052830386653e-05,
"loss": 1.7234,
"step": 3000
},
{
"epoch": 0.15738943392266933,
"eval_accuracy": 0.6008603552909747,
"eval_loss": 1.6119489669799805,
"eval_runtime": 53.4678,
"eval_samples_per_second": 114.723,
"eval_steps_per_second": 3.591,
"step": 3000
},
{
"epoch": 0.1836210062431142,
"grad_norm": 1.390590786933899,
"learning_rate": 4.0818949687844296e-05,
"loss": 1.6587,
"step": 3500
},
{
"epoch": 0.2098525785635591,
"grad_norm": 1.2905086278915405,
"learning_rate": 3.9507371071822046e-05,
"loss": 1.6063,
"step": 4000
},
{
"epoch": 0.2098525785635591,
"eval_accuracy": 0.6191917492074207,
"eval_loss": 1.511812686920166,
"eval_runtime": 53.5523,
"eval_samples_per_second": 114.542,
"eval_steps_per_second": 3.585,
"step": 4000
},
{
"epoch": 0.23608415088400397,
"grad_norm": 1.2503050565719604,
"learning_rate": 3.81957924557998e-05,
"loss": 1.5624,
"step": 4500
},
{
"epoch": 0.26231572320444885,
"grad_norm": 1.2357672452926636,
"learning_rate": 3.688421383977756e-05,
"loss": 1.5331,
"step": 5000
},
{
"epoch": 0.26231572320444885,
"eval_accuracy": 0.6299159118558132,
"eval_loss": 1.4537405967712402,
"eval_runtime": 54.1401,
"eval_samples_per_second": 113.299,
"eval_steps_per_second": 3.546,
"step": 5000
},
{
"epoch": 0.2885472955248938,
"grad_norm": 1.2465555667877197,
"learning_rate": 3.5572635223755315e-05,
"loss": 1.5044,
"step": 5500
},
{
"epoch": 0.31477886784533865,
"grad_norm": 1.2649658918380737,
"learning_rate": 3.426105660773307e-05,
"loss": 1.4812,
"step": 6000
},
{
"epoch": 0.31477886784533865,
"eval_accuracy": 0.6392144357635486,
"eval_loss": 1.405892252922058,
"eval_runtime": 53.4127,
"eval_samples_per_second": 114.842,
"eval_steps_per_second": 3.595,
"step": 6000
},
{
"epoch": 0.3410104401657835,
"grad_norm": 1.2671959400177002,
"learning_rate": 3.294947799171083e-05,
"loss": 1.4634,
"step": 6500
},
{
"epoch": 0.3672420124862284,
"grad_norm": 1.1386276483535767,
"learning_rate": 3.163789937568858e-05,
"loss": 1.4428,
"step": 7000
},
{
"epoch": 0.3672420124862284,
"eval_accuracy": 0.6457219204466172,
"eval_loss": 1.3719512224197388,
"eval_runtime": 53.3481,
"eval_samples_per_second": 114.981,
"eval_steps_per_second": 3.599,
"step": 7000
},
{
"epoch": 0.3934735848066733,
"grad_norm": 1.2336556911468506,
"learning_rate": 3.0326320759666336e-05,
"loss": 1.4271,
"step": 7500
},
{
"epoch": 0.4197051571271182,
"grad_norm": 1.1540193557739258,
"learning_rate": 2.9014742143644093e-05,
"loss": 1.4149,
"step": 8000
},
{
"epoch": 0.4197051571271182,
"eval_accuracy": 0.6509898994148602,
"eval_loss": 1.343773603439331,
"eval_runtime": 53.3234,
"eval_samples_per_second": 115.034,
"eval_steps_per_second": 3.601,
"step": 8000
},
{
"epoch": 0.4459367294475631,
"grad_norm": 1.0972239971160889,
"learning_rate": 2.7703163527621846e-05,
"loss": 1.3979,
"step": 8500
},
{
"epoch": 0.47216830176800795,
"grad_norm": 1.1413357257843018,
"learning_rate": 2.6391584911599605e-05,
"loss": 1.3857,
"step": 9000
},
{
"epoch": 0.47216830176800795,
"eval_accuracy": 0.6563627375706007,
"eval_loss": 1.3179402351379395,
"eval_runtime": 52.7311,
"eval_samples_per_second": 116.326,
"eval_steps_per_second": 3.641,
"step": 9000
},
{
"epoch": 0.4983998740884529,
"grad_norm": 1.1442935466766357,
"learning_rate": 2.5080006295577358e-05,
"loss": 1.3798,
"step": 9500
},
{
"epoch": 0.5246314464088977,
"grad_norm": 1.1634732484817505,
"learning_rate": 2.3768427679555114e-05,
"loss": 1.3654,
"step": 10000
},
{
"epoch": 0.5246314464088977,
"eval_accuracy": 0.6599810169811327,
"eval_loss": 1.2988349199295044,
"eval_runtime": 52.7299,
"eval_samples_per_second": 116.329,
"eval_steps_per_second": 3.641,
"step": 10000
},
{
"epoch": 0.5508630187293426,
"grad_norm": 1.173746943473816,
"learning_rate": 2.245684906353287e-05,
"loss": 1.3583,
"step": 10500
},
{
"epoch": 0.5770945910497876,
"grad_norm": 1.1500358581542969,
"learning_rate": 2.1145270447510627e-05,
"loss": 1.3449,
"step": 11000
},
{
"epoch": 0.5770945910497876,
"eval_accuracy": 0.6630447219653862,
"eval_loss": 1.2830150127410889,
"eval_runtime": 52.9419,
"eval_samples_per_second": 115.863,
"eval_steps_per_second": 3.627,
"step": 11000
},
{
"epoch": 0.6033261633702324,
"grad_norm": 1.18638277053833,
"learning_rate": 1.983369183148838e-05,
"loss": 1.3371,
"step": 11500
},
{
"epoch": 0.6295577356906773,
"grad_norm": 1.1297719478607178,
"learning_rate": 1.8522113215466136e-05,
"loss": 1.3302,
"step": 12000
},
{
"epoch": 0.6295577356906773,
"eval_accuracy": 0.6660457982859825,
"eval_loss": 1.2687662839889526,
"eval_runtime": 52.8433,
"eval_samples_per_second": 116.079,
"eval_steps_per_second": 3.633,
"step": 12000
},
{
"epoch": 0.6557893080111222,
"grad_norm": 1.1766819953918457,
"learning_rate": 1.7210534599443892e-05,
"loss": 1.3227,
"step": 12500
},
{
"epoch": 0.682020880331567,
"grad_norm": 1.128738284111023,
"learning_rate": 1.589895598342165e-05,
"loss": 1.3174,
"step": 13000
},
{
"epoch": 0.682020880331567,
"eval_accuracy": 0.6682683987237139,
"eval_loss": 1.2574915885925293,
"eval_runtime": 53.1648,
"eval_samples_per_second": 115.377,
"eval_steps_per_second": 3.611,
"step": 13000
},
{
"epoch": 0.708252452652012,
"grad_norm": 1.147472620010376,
"learning_rate": 1.4587377367399401e-05,
"loss": 1.3118,
"step": 13500
},
{
"epoch": 0.7344840249724568,
"grad_norm": 1.12171471118927,
"learning_rate": 1.3275798751377158e-05,
"loss": 1.3052,
"step": 14000
},
{
"epoch": 0.7344840249724568,
"eval_accuracy": 0.6708049392820683,
"eval_loss": 1.2456800937652588,
"eval_runtime": 52.8767,
"eval_samples_per_second": 116.006,
"eval_steps_per_second": 3.631,
"step": 14000
},
{
"epoch": 0.7607155972929017,
"grad_norm": 1.106832504272461,
"learning_rate": 1.1964220135354914e-05,
"loss": 1.3006,
"step": 14500
},
{
"epoch": 0.7869471696133467,
"grad_norm": 1.1112337112426758,
"learning_rate": 1.0652641519332669e-05,
"loss": 1.2959,
"step": 15000
},
{
"epoch": 0.7869471696133467,
"eval_accuracy": 0.6724595790142662,
"eval_loss": 1.2370907068252563,
"eval_runtime": 52.7887,
"eval_samples_per_second": 116.199,
"eval_steps_per_second": 3.637,
"step": 15000
},
{
"epoch": 0.8131787419337915,
"grad_norm": 1.1115854978561401,
"learning_rate": 9.341062903310425e-06,
"loss": 1.2901,
"step": 15500
},
{
"epoch": 0.8394103142542364,
"grad_norm": 1.1028845310211182,
"learning_rate": 8.029484287288181e-06,
"loss": 1.2847,
"step": 16000
},
{
"epoch": 0.8394103142542364,
"eval_accuracy": 0.6743223435167859,
"eval_loss": 1.2278393507003784,
"eval_runtime": 52.9124,
"eval_samples_per_second": 115.927,
"eval_steps_per_second": 3.629,
"step": 16000
},
{
"epoch": 0.8656418865746813,
"grad_norm": 1.1097549200057983,
"learning_rate": 6.717905671265937e-06,
"loss": 1.2842,
"step": 16500
},
{
"epoch": 0.8918734588951261,
"grad_norm": 1.1161189079284668,
"learning_rate": 5.406327055243691e-06,
"loss": 1.28,
"step": 17000
},
{
"epoch": 0.8918734588951261,
"eval_accuracy": 0.6759062272014932,
"eval_loss": 1.2205840349197388,
"eval_runtime": 52.9403,
"eval_samples_per_second": 115.866,
"eval_steps_per_second": 3.627,
"step": 17000
},
{
"epoch": 0.9181050312155711,
"grad_norm": 1.112290859222412,
"learning_rate": 4.0947484392214475e-06,
"loss": 1.2749,
"step": 17500
},
{
"epoch": 0.9443366035360159,
"grad_norm": 1.10244619846344,
"learning_rate": 2.7831698231992025e-06,
"loss": 1.27,
"step": 18000
},
{
"epoch": 0.9443366035360159,
"eval_accuracy": 0.6768434261098102,
"eval_loss": 1.2162342071533203,
"eval_runtime": 52.5316,
"eval_samples_per_second": 116.768,
"eval_steps_per_second": 3.655,
"step": 18000
},
{
"epoch": 0.9705681758564608,
"grad_norm": 1.1379034519195557,
"learning_rate": 1.4715912071769583e-06,
"loss": 1.2756,
"step": 18500
},
{
"epoch": 0.9967997481769058,
"grad_norm": 1.120842456817627,
"learning_rate": 1.6001259115471381e-07,
"loss": 1.272,
"step": 19000
},
{
"epoch": 0.9967997481769058,
"eval_accuracy": 0.6775780778641618,
"eval_loss": 1.2128527164459229,
"eval_runtime": 53.048,
"eval_samples_per_second": 115.631,
"eval_steps_per_second": 3.619,
"step": 19000
},
{
"epoch": 1.0,
"step": 19061,
"total_flos": 3.18739175571456e+17,
"train_loss": 1.5558179828629188,
"train_runtime": 7460.0601,
"train_samples_per_second": 81.759,
"train_steps_per_second": 2.555
}
],
"logging_steps": 500,
"max_steps": 19061,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.18739175571456e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}