gpt2_u040_tiny-stories_1024_dpos / trainer_state.json
jonasknobloch's picture
Upload folder using huggingface_hub
142ffd2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 19122,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02614789247986612,
"grad_norm": 1.8049263954162598,
"learning_rate": 4.8692605376006696e-05,
"loss": 4.386,
"step": 500
},
{
"epoch": 0.05229578495973224,
"grad_norm": 2.1683125495910645,
"learning_rate": 4.738521075201339e-05,
"loss": 2.8993,
"step": 1000
},
{
"epoch": 0.05229578495973224,
"eval_accuracy": 0.4572555901412006,
"eval_loss": 2.412806749343872,
"eval_runtime": 55.3639,
"eval_samples_per_second": 111.137,
"eval_steps_per_second": 3.486,
"step": 1000
},
{
"epoch": 0.07844367743959837,
"grad_norm": 2.7267353534698486,
"learning_rate": 4.6077816128020084e-05,
"loss": 2.2597,
"step": 1500
},
{
"epoch": 0.10459156991946449,
"grad_norm": 1.5406957864761353,
"learning_rate": 4.477042150402678e-05,
"loss": 1.9582,
"step": 2000
},
{
"epoch": 0.10459156991946449,
"eval_accuracy": 0.573149274789702,
"eval_loss": 1.7776833772659302,
"eval_runtime": 53.5813,
"eval_samples_per_second": 114.835,
"eval_steps_per_second": 3.602,
"step": 2000
},
{
"epoch": 0.13073946239933062,
"grad_norm": 1.8701914548873901,
"learning_rate": 4.346302688003347e-05,
"loss": 1.8076,
"step": 2500
},
{
"epoch": 0.15688735487919675,
"grad_norm": 1.4391542673110962,
"learning_rate": 4.2155632256040165e-05,
"loss": 1.7164,
"step": 3000
},
{
"epoch": 0.15688735487919675,
"eval_accuracy": 0.6048887929323908,
"eval_loss": 1.5985183715820312,
"eval_runtime": 53.9723,
"eval_samples_per_second": 114.003,
"eval_steps_per_second": 3.576,
"step": 3000
},
{
"epoch": 0.18303524735906285,
"grad_norm": 1.450477957725525,
"learning_rate": 4.084823763204686e-05,
"loss": 1.6474,
"step": 3500
},
{
"epoch": 0.20918313983892897,
"grad_norm": 1.2564880847930908,
"learning_rate": 3.954084300805355e-05,
"loss": 1.5976,
"step": 4000
},
{
"epoch": 0.20918313983892897,
"eval_accuracy": 0.6224453369669708,
"eval_loss": 1.5014071464538574,
"eval_runtime": 53.8551,
"eval_samples_per_second": 114.251,
"eval_steps_per_second": 3.584,
"step": 4000
},
{
"epoch": 0.2353310323187951,
"grad_norm": 1.2992570400238037,
"learning_rate": 3.8233448384060246e-05,
"loss": 1.5568,
"step": 4500
},
{
"epoch": 0.26147892479866125,
"grad_norm": 1.233372688293457,
"learning_rate": 3.692605376006694e-05,
"loss": 1.5257,
"step": 5000
},
{
"epoch": 0.26147892479866125,
"eval_accuracy": 0.634163627117497,
"eval_loss": 1.439920425415039,
"eval_runtime": 52.748,
"eval_samples_per_second": 116.649,
"eval_steps_per_second": 3.659,
"step": 5000
},
{
"epoch": 0.28762681727852735,
"grad_norm": 1.2047041654586792,
"learning_rate": 3.5618659136073633e-05,
"loss": 1.4979,
"step": 5500
},
{
"epoch": 0.3137747097583935,
"grad_norm": 1.3004554510116577,
"learning_rate": 3.431126451208033e-05,
"loss": 1.4723,
"step": 6000
},
{
"epoch": 0.3137747097583935,
"eval_accuracy": 0.6424584626720485,
"eval_loss": 1.3954555988311768,
"eval_runtime": 52.3736,
"eval_samples_per_second": 117.483,
"eval_steps_per_second": 3.685,
"step": 6000
},
{
"epoch": 0.3399226022382596,
"grad_norm": 1.1953301429748535,
"learning_rate": 3.300386988808703e-05,
"loss": 1.453,
"step": 6500
},
{
"epoch": 0.3660704947181257,
"grad_norm": 1.2482521533966064,
"learning_rate": 3.1696475264093715e-05,
"loss": 1.4337,
"step": 7000
},
{
"epoch": 0.3660704947181257,
"eval_accuracy": 0.6488618113631875,
"eval_loss": 1.3617639541625977,
"eval_runtime": 53.7239,
"eval_samples_per_second": 114.53,
"eval_steps_per_second": 3.592,
"step": 7000
},
{
"epoch": 0.39221838719799185,
"grad_norm": 1.1894769668579102,
"learning_rate": 3.038908064010041e-05,
"loss": 1.417,
"step": 7500
},
{
"epoch": 0.41836627967785794,
"grad_norm": 1.125042200088501,
"learning_rate": 2.9081686016107102e-05,
"loss": 1.4068,
"step": 8000
},
{
"epoch": 0.41836627967785794,
"eval_accuracy": 0.6549491708580115,
"eval_loss": 1.3317630290985107,
"eval_runtime": 53.4037,
"eval_samples_per_second": 115.217,
"eval_steps_per_second": 3.614,
"step": 8000
},
{
"epoch": 0.4445141721577241,
"grad_norm": 1.270693063735962,
"learning_rate": 2.77742913921138e-05,
"loss": 1.388,
"step": 8500
},
{
"epoch": 0.4706620646375902,
"grad_norm": 1.1014580726623535,
"learning_rate": 2.6466896768120493e-05,
"loss": 1.3766,
"step": 9000
},
{
"epoch": 0.4706620646375902,
"eval_accuracy": 0.6593696833705641,
"eval_loss": 1.3081881999969482,
"eval_runtime": 53.9417,
"eval_samples_per_second": 114.068,
"eval_steps_per_second": 3.578,
"step": 9000
},
{
"epoch": 0.49680995711745635,
"grad_norm": 1.1594932079315186,
"learning_rate": 2.5159502144127183e-05,
"loss": 1.3686,
"step": 9500
},
{
"epoch": 0.5229578495973225,
"grad_norm": 1.1389472484588623,
"learning_rate": 2.385210752013388e-05,
"loss": 1.3567,
"step": 10000
},
{
"epoch": 0.5229578495973225,
"eval_accuracy": 0.6632432120706919,
"eval_loss": 1.2883645296096802,
"eval_runtime": 53.6247,
"eval_samples_per_second": 114.742,
"eval_steps_per_second": 3.599,
"step": 10000
},
{
"epoch": 0.5491057420771885,
"grad_norm": 1.08192777633667,
"learning_rate": 2.254471289614057e-05,
"loss": 1.3455,
"step": 10500
},
{
"epoch": 0.5752536345570547,
"grad_norm": 1.1092888116836548,
"learning_rate": 2.1237318272147268e-05,
"loss": 1.3373,
"step": 11000
},
{
"epoch": 0.5752536345570547,
"eval_accuracy": 0.6666822357673398,
"eval_loss": 1.2716636657714844,
"eval_runtime": 53.5985,
"eval_samples_per_second": 114.798,
"eval_steps_per_second": 3.601,
"step": 11000
},
{
"epoch": 0.6014015270369208,
"grad_norm": 1.1305238008499146,
"learning_rate": 1.9929923648153958e-05,
"loss": 1.3273,
"step": 11500
},
{
"epoch": 0.627549419516787,
"grad_norm": 1.1033620834350586,
"learning_rate": 1.8622529024160655e-05,
"loss": 1.3231,
"step": 12000
},
{
"epoch": 0.627549419516787,
"eval_accuracy": 0.6691524801180201,
"eval_loss": 1.2592648267745972,
"eval_runtime": 54.0954,
"eval_samples_per_second": 113.743,
"eval_steps_per_second": 3.568,
"step": 12000
},
{
"epoch": 0.653697311996653,
"grad_norm": 1.108879566192627,
"learning_rate": 1.7315134400167346e-05,
"loss": 1.3152,
"step": 12500
},
{
"epoch": 0.6798452044765192,
"grad_norm": 1.1043877601623535,
"learning_rate": 1.6007739776174043e-05,
"loss": 1.3101,
"step": 13000
},
{
"epoch": 0.6798452044765192,
"eval_accuracy": 0.671657040037531,
"eval_loss": 1.2451061010360718,
"eval_runtime": 53.6863,
"eval_samples_per_second": 114.61,
"eval_steps_per_second": 3.595,
"step": 13000
},
{
"epoch": 0.7059930969563853,
"grad_norm": 1.0998560190200806,
"learning_rate": 1.4700345152180736e-05,
"loss": 1.3042,
"step": 13500
},
{
"epoch": 0.7321409894362514,
"grad_norm": 1.1624592542648315,
"learning_rate": 1.3392950528187429e-05,
"loss": 1.2962,
"step": 14000
},
{
"epoch": 0.7321409894362514,
"eval_accuracy": 0.6740324399688046,
"eval_loss": 1.2344375848770142,
"eval_runtime": 53.7465,
"eval_samples_per_second": 114.482,
"eval_steps_per_second": 3.591,
"step": 14000
},
{
"epoch": 0.7582888819161175,
"grad_norm": 1.0827239751815796,
"learning_rate": 1.2085555904194122e-05,
"loss": 1.2915,
"step": 14500
},
{
"epoch": 0.7844367743959837,
"grad_norm": 1.1423588991165161,
"learning_rate": 1.0778161280200816e-05,
"loss": 1.2842,
"step": 15000
},
{
"epoch": 0.7844367743959837,
"eval_accuracy": 0.6758603794825307,
"eval_loss": 1.22589910030365,
"eval_runtime": 53.8871,
"eval_samples_per_second": 114.183,
"eval_steps_per_second": 3.582,
"step": 15000
},
{
"epoch": 0.8105846668758498,
"grad_norm": 1.1340547800064087,
"learning_rate": 9.47076665620751e-06,
"loss": 1.2796,
"step": 15500
},
{
"epoch": 0.8367325593557159,
"grad_norm": 1.1284784078598022,
"learning_rate": 8.163372032214205e-06,
"loss": 1.2752,
"step": 16000
},
{
"epoch": 0.8367325593557159,
"eval_accuracy": 0.6775672612951045,
"eval_loss": 1.2177867889404297,
"eval_runtime": 53.7311,
"eval_samples_per_second": 114.515,
"eval_steps_per_second": 3.592,
"step": 16000
},
{
"epoch": 0.862880451835582,
"grad_norm": 1.138634443283081,
"learning_rate": 6.855977408220898e-06,
"loss": 1.2715,
"step": 16500
},
{
"epoch": 0.8890283443154482,
"grad_norm": 1.1163477897644043,
"learning_rate": 5.548582784227591e-06,
"loss": 1.2718,
"step": 17000
},
{
"epoch": 0.8890283443154482,
"eval_accuracy": 0.6789395663115799,
"eval_loss": 1.2103002071380615,
"eval_runtime": 53.645,
"eval_samples_per_second": 114.699,
"eval_steps_per_second": 3.598,
"step": 17000
},
{
"epoch": 0.9151762367953143,
"grad_norm": 1.133549451828003,
"learning_rate": 4.241188160234285e-06,
"loss": 1.2698,
"step": 17500
},
{
"epoch": 0.9413241292751804,
"grad_norm": 1.1056196689605713,
"learning_rate": 2.933793536240979e-06,
"loss": 1.2628,
"step": 18000
},
{
"epoch": 0.9413241292751804,
"eval_accuracy": 0.6802642108157907,
"eval_loss": 1.204982876777649,
"eval_runtime": 53.7116,
"eval_samples_per_second": 114.556,
"eval_steps_per_second": 3.593,
"step": 18000
},
{
"epoch": 0.9674720217550465,
"grad_norm": 1.126265525817871,
"learning_rate": 1.6263989122476732e-06,
"loss": 1.2634,
"step": 18500
},
{
"epoch": 0.9936199142349127,
"grad_norm": 1.1292709112167358,
"learning_rate": 3.190042882543667e-07,
"loss": 1.2576,
"step": 19000
},
{
"epoch": 0.9936199142349127,
"eval_accuracy": 0.6809181130440626,
"eval_loss": 1.2015681266784668,
"eval_runtime": 53.7272,
"eval_samples_per_second": 114.523,
"eval_steps_per_second": 3.592,
"step": 19000
},
{
"epoch": 1.0,
"step": 19122,
"total_flos": 3.19770233929728e+17,
"train_loss": 1.5458110461810084,
"train_runtime": 7443.7007,
"train_samples_per_second": 82.204,
"train_steps_per_second": 2.569
}
],
"logging_steps": 500,
"max_steps": 19122,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.19770233929728e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}