{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.429204003563381,
"eval_steps": 1000000,
"global_step": 49143,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0043668885046027,
"grad_norm": 2.018752336502075,
"learning_rate": 5.000000000000001e-07,
"loss": 10.1855,
"step": 500
},
{
"epoch": 0.0087337770092054,
"grad_norm": 2.0094995498657227,
"learning_rate": 1.0000000000000002e-06,
"loss": 9.2041,
"step": 1000
},
{
"epoch": 0.013100665513808101,
"grad_norm": 1.4977174997329712,
"learning_rate": 1.5e-06,
"loss": 8.6586,
"step": 1500
},
{
"epoch": 0.0174675540184108,
"grad_norm": 1.407737135887146,
"learning_rate": 2.0000000000000003e-06,
"loss": 8.1201,
"step": 2000
},
{
"epoch": 0.021834442523013503,
"grad_norm": 1.2318576574325562,
"learning_rate": 2.5e-06,
"loss": 7.6276,
"step": 2500
},
{
"epoch": 0.026201331027616202,
"grad_norm": 1.2920515537261963,
"learning_rate": 3e-06,
"loss": 7.2474,
"step": 3000
},
{
"epoch": 0.030568219532218905,
"grad_norm": 1.1667238473892212,
"learning_rate": 3.5e-06,
"loss": 7.0004,
"step": 3500
},
{
"epoch": 0.0349351080368216,
"grad_norm": 1.1691466569900513,
"learning_rate": 4.000000000000001e-06,
"loss": 6.8215,
"step": 4000
},
{
"epoch": 0.03930199654142431,
"grad_norm": 1.065576195716858,
"learning_rate": 4.5e-06,
"loss": 6.652,
"step": 4500
},
{
"epoch": 0.043668885046027006,
"grad_norm": 1.477279782295227,
"learning_rate": 5e-06,
"loss": 6.5107,
"step": 5000
},
{
"epoch": 0.048035773550629705,
"grad_norm": 1.583247184753418,
"learning_rate": 5.500000000000001e-06,
"loss": 6.3804,
"step": 5500
},
{
"epoch": 0.052402662055232405,
"grad_norm": 1.5069547891616821,
"learning_rate": 6e-06,
"loss": 6.2661,
"step": 6000
},
{
"epoch": 0.056769550559835104,
"grad_norm": 1.7084009647369385,
"learning_rate": 6.5000000000000004e-06,
"loss": 6.1654,
"step": 6500
},
{
"epoch": 0.06113643906443781,
"grad_norm": 1.6150327920913696,
"learning_rate": 7e-06,
"loss": 6.0775,
"step": 7000
},
{
"epoch": 0.06550332756904051,
"grad_norm": 1.9077385663986206,
"learning_rate": 7.500000000000001e-06,
"loss": 5.9847,
"step": 7500
},
{
"epoch": 0.0698702160736432,
"grad_norm": 2.038984537124634,
"learning_rate": 8.000000000000001e-06,
"loss": 5.8919,
"step": 8000
},
{
"epoch": 0.07423710457824591,
"grad_norm": 2.5044028759002686,
"learning_rate": 8.5e-06,
"loss": 5.8066,
"step": 8500
},
{
"epoch": 0.07860399308284861,
"grad_norm": 2.200798511505127,
"learning_rate": 9e-06,
"loss": 5.7103,
"step": 9000
},
{
"epoch": 0.0829708815874513,
"grad_norm": 2.3363890647888184,
"learning_rate": 9.5e-06,
"loss": 5.6353,
"step": 9500
},
{
"epoch": 0.08733777009205401,
"grad_norm": 2.091174602508545,
"learning_rate": 1e-05,
"loss": 5.5577,
"step": 10000
},
{
"epoch": 0.0917046585966567,
"grad_norm": 2.0803627967834473,
"learning_rate": 9.999562929421844e-06,
"loss": 5.4809,
"step": 10500
},
{
"epoch": 0.09607154710125941,
"grad_norm": 2.083531141281128,
"learning_rate": 9.999125858843687e-06,
"loss": 5.4045,
"step": 11000
},
{
"epoch": 0.10043843560586212,
"grad_norm": 1.9286231994628906,
"learning_rate": 9.99868878826553e-06,
"loss": 5.3344,
"step": 11500
},
{
"epoch": 0.10480532411046481,
"grad_norm": 2.1751465797424316,
"learning_rate": 9.998251717687372e-06,
"loss": 5.2562,
"step": 12000
},
{
"epoch": 0.10917221261506752,
"grad_norm": 2.120579957962036,
"learning_rate": 9.997814647109217e-06,
"loss": 5.2069,
"step": 12500
},
{
"epoch": 0.11353910111967021,
"grad_norm": 2.0757505893707275,
"learning_rate": 9.997377576531058e-06,
"loss": 5.1558,
"step": 13000
},
{
"epoch": 0.11790598962427291,
"grad_norm": 2.013015031814575,
"learning_rate": 9.996940505952902e-06,
"loss": 5.0984,
"step": 13500
},
{
"epoch": 0.12227287812887562,
"grad_norm": 1.910936951637268,
"learning_rate": 9.996503435374745e-06,
"loss": 5.0394,
"step": 14000
},
{
"epoch": 0.1266397666334783,
"grad_norm": 2.150876045227051,
"learning_rate": 9.996066364796588e-06,
"loss": 5.0009,
"step": 14500
},
{
"epoch": 0.13100665513808102,
"grad_norm": 1.9558886289596558,
"learning_rate": 9.995629294218431e-06,
"loss": 4.9463,
"step": 15000
},
{
"epoch": 0.13537354364268372,
"grad_norm": 1.999248743057251,
"learning_rate": 9.995192223640275e-06,
"loss": 4.9029,
"step": 15500
},
{
"epoch": 0.1397404321472864,
"grad_norm": 2.1909689903259277,
"learning_rate": 9.994755153062116e-06,
"loss": 4.843,
"step": 16000
},
{
"epoch": 0.1441073206518891,
"grad_norm": 2.090623140335083,
"learning_rate": 9.994318082483961e-06,
"loss": 4.7988,
"step": 16500
},
{
"epoch": 0.14847420915649182,
"grad_norm": 2.2268435955047607,
"learning_rate": 9.993881011905804e-06,
"loss": 4.7431,
"step": 17000
},
{
"epoch": 0.15284109766109452,
"grad_norm": 2.164546251296997,
"learning_rate": 9.993443941327646e-06,
"loss": 4.6919,
"step": 17500
},
{
"epoch": 0.15720798616569723,
"grad_norm": 2.255798101425171,
"learning_rate": 9.993006870749489e-06,
"loss": 4.6517,
"step": 18000
},
{
"epoch": 0.1615748746702999,
"grad_norm": 2.169243812561035,
"learning_rate": 9.992569800171332e-06,
"loss": 4.6128,
"step": 18500
},
{
"epoch": 0.1659417631749026,
"grad_norm": 2.106949806213379,
"learning_rate": 9.992132729593176e-06,
"loss": 4.5726,
"step": 19000
},
{
"epoch": 0.17030865167950532,
"grad_norm": 2.143815040588379,
"learning_rate": 9.991695659015019e-06,
"loss": 4.5344,
"step": 19500
},
{
"epoch": 0.17467554018410802,
"grad_norm": 2.406649589538574,
"learning_rate": 9.991258588436862e-06,
"loss": 4.5041,
"step": 20000
},
{
"epoch": 0.17904242868871073,
"grad_norm": 2.092935085296631,
"learning_rate": 9.990821517858704e-06,
"loss": 4.4631,
"step": 20500
},
{
"epoch": 0.1834093171933134,
"grad_norm": 2.0865073204040527,
"learning_rate": 9.990384447280548e-06,
"loss": 4.4408,
"step": 21000
},
{
"epoch": 0.18777620569791612,
"grad_norm": 2.061974287033081,
"learning_rate": 9.98994737670239e-06,
"loss": 4.4113,
"step": 21500
},
{
"epoch": 0.19214309420251882,
"grad_norm": 1.916175365447998,
"learning_rate": 9.989510306124233e-06,
"loss": 4.3833,
"step": 22000
},
{
"epoch": 0.19650998270712153,
"grad_norm": 1.9595962762832642,
"learning_rate": 9.989073235546078e-06,
"loss": 4.353,
"step": 22500
},
{
"epoch": 0.20087687121172423,
"grad_norm": 2.0971903800964355,
"learning_rate": 9.98863616496792e-06,
"loss": 4.3333,
"step": 23000
},
{
"epoch": 0.2052437597163269,
"grad_norm": 2.0486457347869873,
"learning_rate": 9.988199094389763e-06,
"loss": 4.3109,
"step": 23500
},
{
"epoch": 0.20961064822092962,
"grad_norm": 1.9522242546081543,
"learning_rate": 9.987762023811606e-06,
"loss": 4.279,
"step": 24000
},
{
"epoch": 0.21397753672553232,
"grad_norm": 2.1979501247406006,
"learning_rate": 9.98732495323345e-06,
"loss": 4.2567,
"step": 24500
},
{
"epoch": 0.21834442523013503,
"grad_norm": 1.9632526636123657,
"learning_rate": 9.986887882655293e-06,
"loss": 4.2416,
"step": 25000
},
{
"epoch": 0.22271131373473774,
"grad_norm": 2.0721206665039062,
"learning_rate": 9.986450812077136e-06,
"loss": 4.22,
"step": 25500
},
{
"epoch": 0.22707820223934042,
"grad_norm": 2.1473758220672607,
"learning_rate": 9.986013741498977e-06,
"loss": 4.2079,
"step": 26000
},
{
"epoch": 0.23144509074394312,
"grad_norm": 2.040027379989624,
"learning_rate": 9.98557667092082e-06,
"loss": 4.182,
"step": 26500
},
{
"epoch": 0.23581197924854583,
"grad_norm": 2.2831156253814697,
"learning_rate": 9.985139600342664e-06,
"loss": 4.1655,
"step": 27000
},
{
"epoch": 0.24017886775314853,
"grad_norm": 1.9741929769515991,
"learning_rate": 9.984702529764507e-06,
"loss": 4.1594,
"step": 27500
},
{
"epoch": 0.24454575625775124,
"grad_norm": 2.0276150703430176,
"learning_rate": 9.98426545918635e-06,
"loss": 4.1374,
"step": 28000
},
{
"epoch": 0.24891264476235392,
"grad_norm": 1.9253956079483032,
"learning_rate": 9.983828388608193e-06,
"loss": 4.1206,
"step": 28500
},
{
"epoch": 0.2532795332669566,
"grad_norm": 1.9739083051681519,
"learning_rate": 9.983391318030037e-06,
"loss": 4.1049,
"step": 29000
},
{
"epoch": 0.25764642177155933,
"grad_norm": 2.0716798305511475,
"learning_rate": 9.98295424745188e-06,
"loss": 4.0768,
"step": 29500
},
{
"epoch": 0.26201331027616204,
"grad_norm": 2.0090582370758057,
"learning_rate": 9.982517176873723e-06,
"loss": 4.0771,
"step": 30000
},
{
"epoch": 0.26638019878076474,
"grad_norm": 1.9497123956680298,
"learning_rate": 9.982080106295565e-06,
"loss": 4.0663,
"step": 30500
},
{
"epoch": 0.27074708728536745,
"grad_norm": 2.1742944717407227,
"learning_rate": 9.98164303571741e-06,
"loss": 4.0483,
"step": 31000
},
{
"epoch": 0.27511397578997016,
"grad_norm": 1.954126238822937,
"learning_rate": 9.981205965139251e-06,
"loss": 4.0313,
"step": 31500
},
{
"epoch": 0.2794808642945728,
"grad_norm": 2.0533246994018555,
"learning_rate": 9.980768894561094e-06,
"loss": 4.0169,
"step": 32000
},
{
"epoch": 0.2838477527991755,
"grad_norm": 1.8938665390014648,
"learning_rate": 9.980331823982938e-06,
"loss": 4.0087,
"step": 32500
},
{
"epoch": 0.2882146413037782,
"grad_norm": 1.9633103609085083,
"learning_rate": 9.97989475340478e-06,
"loss": 3.9909,
"step": 33000
},
{
"epoch": 0.2925815298083809,
"grad_norm": 1.903270959854126,
"learning_rate": 9.979457682826622e-06,
"loss": 3.9817,
"step": 33500
},
{
"epoch": 0.29694841831298363,
"grad_norm": 2.009631395339966,
"learning_rate": 9.979020612248467e-06,
"loss": 3.9712,
"step": 34000
},
{
"epoch": 0.30131530681758634,
"grad_norm": 1.9002183675765991,
"learning_rate": 9.97858354167031e-06,
"loss": 3.9701,
"step": 34500
},
{
"epoch": 0.30568219532218904,
"grad_norm": 1.9432848691940308,
"learning_rate": 9.978146471092152e-06,
"loss": 3.9536,
"step": 35000
},
{
"epoch": 0.31004908382679175,
"grad_norm": 1.9657421112060547,
"learning_rate": 9.977709400513997e-06,
"loss": 3.9392,
"step": 35500
},
{
"epoch": 0.31441597233139446,
"grad_norm": 1.9874509572982788,
"learning_rate": 9.977272329935838e-06,
"loss": 3.9299,
"step": 36000
},
{
"epoch": 0.31878286083599716,
"grad_norm": 2.0522308349609375,
"learning_rate": 9.976835259357682e-06,
"loss": 3.9203,
"step": 36500
},
{
"epoch": 0.3231497493405998,
"grad_norm": 1.9851490259170532,
"learning_rate": 9.976398188779525e-06,
"loss": 3.9125,
"step": 37000
},
{
"epoch": 0.3275166378452025,
"grad_norm": 2.00964093208313,
"learning_rate": 9.975961118201368e-06,
"loss": 3.9056,
"step": 37500
},
{
"epoch": 0.3318835263498052,
"grad_norm": 1.8827855587005615,
"learning_rate": 9.975524047623211e-06,
"loss": 3.8967,
"step": 38000
},
{
"epoch": 0.33625041485440793,
"grad_norm": 1.8730061054229736,
"learning_rate": 9.975086977045055e-06,
"loss": 3.891,
"step": 38500
},
{
"epoch": 0.34061730335901064,
"grad_norm": 1.9370964765548706,
"learning_rate": 9.974649906466896e-06,
"loss": 3.8769,
"step": 39000
},
{
"epoch": 0.34498419186361334,
"grad_norm": 1.959948182106018,
"learning_rate": 9.97421283588874e-06,
"loss": 3.8707,
"step": 39500
},
{
"epoch": 0.34935108036821605,
"grad_norm": 1.9862849712371826,
"learning_rate": 9.973775765310583e-06,
"loss": 3.8568,
"step": 40000
},
{
"epoch": 0.35371796887281876,
"grad_norm": 1.9760117530822754,
"learning_rate": 9.973338694732426e-06,
"loss": 3.8536,
"step": 40500
},
{
"epoch": 0.35808485737742146,
"grad_norm": 2.0359582901000977,
"learning_rate": 9.972901624154269e-06,
"loss": 3.8466,
"step": 41000
},
{
"epoch": 0.36245174588202417,
"grad_norm": 1.9113123416900635,
"learning_rate": 9.972464553576112e-06,
"loss": 3.8345,
"step": 41500
},
{
"epoch": 0.3668186343866268,
"grad_norm": 1.9586379528045654,
"learning_rate": 9.972027482997956e-06,
"loss": 3.8306,
"step": 42000
},
{
"epoch": 0.3711855228912295,
"grad_norm": 1.887161374092102,
"learning_rate": 9.971590412419799e-06,
"loss": 3.8178,
"step": 42500
},
{
"epoch": 0.37555241139583223,
"grad_norm": 1.8756746053695679,
"learning_rate": 9.971153341841642e-06,
"loss": 3.8145,
"step": 43000
},
{
"epoch": 0.37991929990043494,
"grad_norm": 1.9797776937484741,
"learning_rate": 9.970716271263484e-06,
"loss": 3.8106,
"step": 43500
},
{
"epoch": 0.38428618840503764,
"grad_norm": 1.9709391593933105,
"learning_rate": 9.970279200685328e-06,
"loss": 3.7974,
"step": 44000
},
{
"epoch": 0.38865307690964035,
"grad_norm": 1.8535213470458984,
"learning_rate": 9.96984213010717e-06,
"loss": 3.7866,
"step": 44500
},
{
"epoch": 0.39301996541424306,
"grad_norm": 1.8140771389007568,
"learning_rate": 9.969405059529013e-06,
"loss": 3.788,
"step": 45000
},
{
"epoch": 0.39738685391884576,
"grad_norm": 1.8980203866958618,
"learning_rate": 9.968967988950856e-06,
"loss": 3.7813,
"step": 45500
},
{
"epoch": 0.40175374242344847,
"grad_norm": 1.88387131690979,
"learning_rate": 9.9685309183727e-06,
"loss": 3.7782,
"step": 46000
},
{
"epoch": 0.4061206309280511,
"grad_norm": 2.057882785797119,
"learning_rate": 9.968093847794543e-06,
"loss": 3.7688,
"step": 46500
},
{
"epoch": 0.4104875194326538,
"grad_norm": 1.9363012313842773,
"learning_rate": 9.967656777216386e-06,
"loss": 3.7582,
"step": 47000
},
{
"epoch": 0.41485440793725653,
"grad_norm": 1.8989619016647339,
"learning_rate": 9.96721970663823e-06,
"loss": 3.7643,
"step": 47500
},
{
"epoch": 0.41922129644185924,
"grad_norm": 1.9946751594543457,
"learning_rate": 9.966782636060071e-06,
"loss": 3.7483,
"step": 48000
},
{
"epoch": 0.42358818494646194,
"grad_norm": 1.8525508642196655,
"learning_rate": 9.966345565481916e-06,
"loss": 3.7442,
"step": 48500
},
{
"epoch": 0.42795507345106465,
"grad_norm": 1.8581886291503906,
"learning_rate": 9.965908494903757e-06,
"loss": 3.7361,
"step": 49000
},
{
"epoch": 0.429204003563381,
"step": 49143,
"total_flos": 1.2327047290505134e+18,
"train_loss": 4.777673066357365,
"train_runtime": 46797.1012,
"train_samples_per_second": 11744.031,
"train_steps_per_second": 244.669
}
],
"logging_steps": 500,
"max_steps": 11449800,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2327047290505134e+18,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}