gpt2_m030_tiny-stories_1024 / trainer_state.json
jonasknobloch's picture
Upload folder using huggingface_hub
4cc2695 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 19042,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02625774603508035,
"grad_norm": 1.9965242147445679,
"learning_rate": 4.868711269824598e-05,
"loss": 4.4037,
"step": 500
},
{
"epoch": 0.0525154920701607,
"grad_norm": 1.66657555103302,
"learning_rate": 4.737422539649197e-05,
"loss": 2.9308,
"step": 1000
},
{
"epoch": 0.0525154920701607,
"eval_accuracy": 0.4407849232661833,
"eval_loss": 2.475210666656494,
"eval_runtime": 52.4616,
"eval_samples_per_second": 116.809,
"eval_steps_per_second": 3.66,
"step": 1000
},
{
"epoch": 0.07877323810524105,
"grad_norm": 1.709204912185669,
"learning_rate": 4.606133809473795e-05,
"loss": 2.3035,
"step": 1500
},
{
"epoch": 0.1050309841403214,
"grad_norm": 1.9202218055725098,
"learning_rate": 4.4748450792983934e-05,
"loss": 1.9919,
"step": 2000
},
{
"epoch": 0.1050309841403214,
"eval_accuracy": 0.5647940067737086,
"eval_loss": 1.8136076927185059,
"eval_runtime": 52.2754,
"eval_samples_per_second": 117.225,
"eval_steps_per_second": 3.673,
"step": 2000
},
{
"epoch": 0.13128873017540174,
"grad_norm": 1.549422025680542,
"learning_rate": 4.3435563491229914e-05,
"loss": 1.8363,
"step": 2500
},
{
"epoch": 0.1575464762104821,
"grad_norm": 1.7258355617523193,
"learning_rate": 4.2122676189475893e-05,
"loss": 1.7406,
"step": 3000
},
{
"epoch": 0.1575464762104821,
"eval_accuracy": 0.5983668062755066,
"eval_loss": 1.6234792470932007,
"eval_runtime": 52.6459,
"eval_samples_per_second": 116.4,
"eval_steps_per_second": 3.647,
"step": 3000
},
{
"epoch": 0.18380422224556245,
"grad_norm": 1.4436798095703125,
"learning_rate": 4.080978888772188e-05,
"loss": 1.6711,
"step": 3500
},
{
"epoch": 0.2100619682806428,
"grad_norm": 1.3306225538253784,
"learning_rate": 3.949690158596786e-05,
"loss": 1.6185,
"step": 4000
},
{
"epoch": 0.2100619682806428,
"eval_accuracy": 0.6164799047495081,
"eval_loss": 1.5258336067199707,
"eval_runtime": 52.3533,
"eval_samples_per_second": 117.051,
"eval_steps_per_second": 3.667,
"step": 4000
},
{
"epoch": 0.23631971431572313,
"grad_norm": 1.235955834388733,
"learning_rate": 3.8184014284213846e-05,
"loss": 1.5788,
"step": 4500
},
{
"epoch": 0.2625774603508035,
"grad_norm": 1.1904724836349487,
"learning_rate": 3.6871126982459825e-05,
"loss": 1.5461,
"step": 5000
},
{
"epoch": 0.2625774603508035,
"eval_accuracy": 0.6282321233049777,
"eval_loss": 1.4625129699707031,
"eval_runtime": 52.375,
"eval_samples_per_second": 117.002,
"eval_steps_per_second": 3.666,
"step": 5000
},
{
"epoch": 0.28883520638588384,
"grad_norm": 1.2717902660369873,
"learning_rate": 3.555823968070581e-05,
"loss": 1.5198,
"step": 5500
},
{
"epoch": 0.3150929524209642,
"grad_norm": 1.1882010698318481,
"learning_rate": 3.42453523789518e-05,
"loss": 1.4955,
"step": 6000
},
{
"epoch": 0.3150929524209642,
"eval_accuracy": 0.6368267127605542,
"eval_loss": 1.4170297384262085,
"eval_runtime": 52.2927,
"eval_samples_per_second": 117.187,
"eval_steps_per_second": 3.672,
"step": 6000
},
{
"epoch": 0.34135069845604454,
"grad_norm": 1.1391606330871582,
"learning_rate": 3.293246507719778e-05,
"loss": 1.4737,
"step": 6500
},
{
"epoch": 0.3676084444911249,
"grad_norm": 1.325378179550171,
"learning_rate": 3.161957777544376e-05,
"loss": 1.4553,
"step": 7000
},
{
"epoch": 0.3676084444911249,
"eval_accuracy": 0.643254111059215,
"eval_loss": 1.3824151754379272,
"eval_runtime": 52.618,
"eval_samples_per_second": 116.462,
"eval_steps_per_second": 3.649,
"step": 7000
},
{
"epoch": 0.39386619052620525,
"grad_norm": 1.1828022003173828,
"learning_rate": 3.030669047368974e-05,
"loss": 1.4368,
"step": 7500
},
{
"epoch": 0.4201239365612856,
"grad_norm": 1.1762062311172485,
"learning_rate": 2.8993803171935723e-05,
"loss": 1.4218,
"step": 8000
},
{
"epoch": 0.4201239365612856,
"eval_accuracy": 0.6492348631603664,
"eval_loss": 1.3531708717346191,
"eval_runtime": 52.4914,
"eval_samples_per_second": 116.743,
"eval_steps_per_second": 3.658,
"step": 8000
},
{
"epoch": 0.4463816825963659,
"grad_norm": 1.1989212036132812,
"learning_rate": 2.7680915870181706e-05,
"loss": 1.4113,
"step": 8500
},
{
"epoch": 0.47263942863144626,
"grad_norm": 1.2929445505142212,
"learning_rate": 2.636802856842769e-05,
"loss": 1.3986,
"step": 9000
},
{
"epoch": 0.47263942863144626,
"eval_accuracy": 0.6536616055271829,
"eval_loss": 1.3304531574249268,
"eval_runtime": 52.4713,
"eval_samples_per_second": 116.788,
"eval_steps_per_second": 3.659,
"step": 9000
},
{
"epoch": 0.4988971746665266,
"grad_norm": 1.1339648962020874,
"learning_rate": 2.5055141266673672e-05,
"loss": 1.3867,
"step": 9500
},
{
"epoch": 0.525154920701607,
"grad_norm": 1.1032936573028564,
"learning_rate": 2.374225396491965e-05,
"loss": 1.3722,
"step": 10000
},
{
"epoch": 0.525154920701607,
"eval_accuracy": 0.6575196715746703,
"eval_loss": 1.310016393661499,
"eval_runtime": 52.3673,
"eval_samples_per_second": 117.02,
"eval_steps_per_second": 3.666,
"step": 10000
},
{
"epoch": 0.5514126667366873,
"grad_norm": 1.0910552740097046,
"learning_rate": 2.2429366663165635e-05,
"loss": 1.3665,
"step": 10500
},
{
"epoch": 0.5776704127717677,
"grad_norm": 1.133002758026123,
"learning_rate": 2.1116479361411618e-05,
"loss": 1.3573,
"step": 11000
},
{
"epoch": 0.5776704127717677,
"eval_accuracy": 0.660813846797802,
"eval_loss": 1.2933967113494873,
"eval_runtime": 52.0286,
"eval_samples_per_second": 117.781,
"eval_steps_per_second": 3.69,
"step": 11000
},
{
"epoch": 0.603928158806848,
"grad_norm": 1.1310782432556152,
"learning_rate": 1.98035920596576e-05,
"loss": 1.3486,
"step": 11500
},
{
"epoch": 0.6301859048419284,
"grad_norm": 1.0979714393615723,
"learning_rate": 1.8490704757903583e-05,
"loss": 1.3448,
"step": 12000
},
{
"epoch": 0.6301859048419284,
"eval_accuracy": 0.6639449961588427,
"eval_loss": 1.2785269021987915,
"eval_runtime": 52.6733,
"eval_samples_per_second": 116.34,
"eval_steps_per_second": 3.645,
"step": 12000
},
{
"epoch": 0.6564436508770087,
"grad_norm": 1.1284676790237427,
"learning_rate": 1.7177817456149563e-05,
"loss": 1.3377,
"step": 12500
},
{
"epoch": 0.6827013969120891,
"grad_norm": 1.0903546810150146,
"learning_rate": 1.5864930154395546e-05,
"loss": 1.3291,
"step": 13000
},
{
"epoch": 0.6827013969120891,
"eval_accuracy": 0.6664656758777874,
"eval_loss": 1.2657496929168701,
"eval_runtime": 53.0667,
"eval_samples_per_second": 115.477,
"eval_steps_per_second": 3.618,
"step": 13000
},
{
"epoch": 0.7089591429471694,
"grad_norm": 1.1001683473587036,
"learning_rate": 1.4552042852641529e-05,
"loss": 1.3225,
"step": 13500
},
{
"epoch": 0.7352168889822498,
"grad_norm": 1.1466083526611328,
"learning_rate": 1.3239155550887514e-05,
"loss": 1.3174,
"step": 14000
},
{
"epoch": 0.7352168889822498,
"eval_accuracy": 0.6685889999974477,
"eval_loss": 1.2551158666610718,
"eval_runtime": 52.488,
"eval_samples_per_second": 116.75,
"eval_steps_per_second": 3.658,
"step": 14000
},
{
"epoch": 0.7614746350173301,
"grad_norm": 1.1432747840881348,
"learning_rate": 1.1926268249133495e-05,
"loss": 1.3109,
"step": 14500
},
{
"epoch": 0.7877323810524105,
"grad_norm": 1.111801266670227,
"learning_rate": 1.0613380947379476e-05,
"loss": 1.3052,
"step": 15000
},
{
"epoch": 0.7877323810524105,
"eval_accuracy": 0.670427906199194,
"eval_loss": 1.2463113069534302,
"eval_runtime": 51.9603,
"eval_samples_per_second": 117.936,
"eval_steps_per_second": 3.695,
"step": 15000
},
{
"epoch": 0.8139901270874909,
"grad_norm": 1.1736992597579956,
"learning_rate": 9.30049364562546e-06,
"loss": 1.3038,
"step": 15500
},
{
"epoch": 0.8402478731225712,
"grad_norm": 1.1293760538101196,
"learning_rate": 7.987606343871442e-06,
"loss": 1.2968,
"step": 16000
},
{
"epoch": 0.8402478731225712,
"eval_accuracy": 0.6724532871884005,
"eval_loss": 1.236586570739746,
"eval_runtime": 52.6147,
"eval_samples_per_second": 116.469,
"eval_steps_per_second": 3.649,
"step": 16000
},
{
"epoch": 0.8665056191576515,
"grad_norm": 1.1446099281311035,
"learning_rate": 6.674719042117425e-06,
"loss": 1.295,
"step": 16500
},
{
"epoch": 0.8927633651927318,
"grad_norm": 1.1120586395263672,
"learning_rate": 5.361831740363407e-06,
"loss": 1.2856,
"step": 17000
},
{
"epoch": 0.8927633651927318,
"eval_accuracy": 0.673523483380933,
"eval_loss": 1.230779767036438,
"eval_runtime": 52.5434,
"eval_samples_per_second": 116.627,
"eval_steps_per_second": 3.654,
"step": 17000
},
{
"epoch": 0.9190211112278122,
"grad_norm": 1.148032546043396,
"learning_rate": 4.04894443860939e-06,
"loss": 1.2862,
"step": 17500
},
{
"epoch": 0.9452788572628925,
"grad_norm": 1.116765022277832,
"learning_rate": 2.7360571368553723e-06,
"loss": 1.2817,
"step": 18000
},
{
"epoch": 0.9452788572628925,
"eval_accuracy": 0.6749144672531769,
"eval_loss": 1.2248890399932861,
"eval_runtime": 52.4248,
"eval_samples_per_second": 116.891,
"eval_steps_per_second": 3.662,
"step": 18000
},
{
"epoch": 0.9715366032979729,
"grad_norm": 1.1118154525756836,
"learning_rate": 1.423169835101355e-06,
"loss": 1.2816,
"step": 18500
},
{
"epoch": 0.9977943493330532,
"grad_norm": 1.120948076248169,
"learning_rate": 1.1028253334733746e-07,
"loss": 1.2814,
"step": 19000
},
{
"epoch": 0.9977943493330532,
"eval_accuracy": 0.6756562189740409,
"eval_loss": 1.2216291427612305,
"eval_runtime": 53.1168,
"eval_samples_per_second": 115.368,
"eval_steps_per_second": 3.615,
"step": 19000
},
{
"epoch": 1.0,
"step": 19042,
"total_flos": 3.18433463894016e+17,
"train_loss": 1.5689714319777832,
"train_runtime": 7397.2028,
"train_samples_per_second": 82.375,
"train_steps_per_second": 2.574
}
],
"logging_steps": 500,
"max_steps": 19042,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.18433463894016e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}