{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9988776655443323,
"eval_steps": 500,
"global_step": 1002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029928918817807706,
"grad_norm": 1.3197502899687559,
"learning_rate": 5e-06,
"loss": 0.7729,
"step": 10
},
{
"epoch": 0.05985783763561541,
"grad_norm": 0.7957246856225982,
"learning_rate": 5e-06,
"loss": 0.694,
"step": 20
},
{
"epoch": 0.08978675645342311,
"grad_norm": 0.7023919119728642,
"learning_rate": 5e-06,
"loss": 0.6692,
"step": 30
},
{
"epoch": 0.11971567527123082,
"grad_norm": 0.6852471491957217,
"learning_rate": 5e-06,
"loss": 0.6652,
"step": 40
},
{
"epoch": 0.14964459408903855,
"grad_norm": 0.7441284057804172,
"learning_rate": 5e-06,
"loss": 0.6567,
"step": 50
},
{
"epoch": 0.17957351290684623,
"grad_norm": 0.6737131800519109,
"learning_rate": 5e-06,
"loss": 0.6489,
"step": 60
},
{
"epoch": 0.20950243172465394,
"grad_norm": 0.8645698704938743,
"learning_rate": 5e-06,
"loss": 0.6458,
"step": 70
},
{
"epoch": 0.23943135054246165,
"grad_norm": 0.6824403788440216,
"learning_rate": 5e-06,
"loss": 0.6472,
"step": 80
},
{
"epoch": 0.26936026936026936,
"grad_norm": 0.8355879527708924,
"learning_rate": 5e-06,
"loss": 0.6382,
"step": 90
},
{
"epoch": 0.2992891881780771,
"grad_norm": 0.6566317269166482,
"learning_rate": 5e-06,
"loss": 0.6394,
"step": 100
},
{
"epoch": 0.3292181069958848,
"grad_norm": 0.7025002610859795,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 110
},
{
"epoch": 0.35914702581369246,
"grad_norm": 0.7294514273893201,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 120
},
{
"epoch": 0.3890759446315002,
"grad_norm": 0.7204998726570041,
"learning_rate": 5e-06,
"loss": 0.6342,
"step": 130
},
{
"epoch": 0.4190048634493079,
"grad_norm": 0.9245929000779519,
"learning_rate": 5e-06,
"loss": 0.6279,
"step": 140
},
{
"epoch": 0.4489337822671156,
"grad_norm": 0.8312008040431372,
"learning_rate": 5e-06,
"loss": 0.6298,
"step": 150
},
{
"epoch": 0.4788627010849233,
"grad_norm": 0.6941447661619787,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 160
},
{
"epoch": 0.508791619902731,
"grad_norm": 0.7880713474277835,
"learning_rate": 5e-06,
"loss": 0.623,
"step": 170
},
{
"epoch": 0.5387205387205387,
"grad_norm": 0.7199931353143368,
"learning_rate": 5e-06,
"loss": 0.6247,
"step": 180
},
{
"epoch": 0.5686494575383464,
"grad_norm": 0.7680737861171968,
"learning_rate": 5e-06,
"loss": 0.6223,
"step": 190
},
{
"epoch": 0.5985783763561542,
"grad_norm": 0.7601491643468152,
"learning_rate": 5e-06,
"loss": 0.6299,
"step": 200
},
{
"epoch": 0.6285072951739619,
"grad_norm": 0.76786698349262,
"learning_rate": 5e-06,
"loss": 0.6236,
"step": 210
},
{
"epoch": 0.6584362139917695,
"grad_norm": 0.9029566246000676,
"learning_rate": 5e-06,
"loss": 0.6224,
"step": 220
},
{
"epoch": 0.6883651328095772,
"grad_norm": 0.7045261038164553,
"learning_rate": 5e-06,
"loss": 0.6245,
"step": 230
},
{
"epoch": 0.7182940516273849,
"grad_norm": 0.6774924026654922,
"learning_rate": 5e-06,
"loss": 0.6234,
"step": 240
},
{
"epoch": 0.7482229704451927,
"grad_norm": 0.6011441610841004,
"learning_rate": 5e-06,
"loss": 0.6201,
"step": 250
},
{
"epoch": 0.7781518892630004,
"grad_norm": 0.6589701033868924,
"learning_rate": 5e-06,
"loss": 0.6188,
"step": 260
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.7793955701511873,
"learning_rate": 5e-06,
"loss": 0.6263,
"step": 270
},
{
"epoch": 0.8380097268986157,
"grad_norm": 0.6801997659823543,
"learning_rate": 5e-06,
"loss": 0.6164,
"step": 280
},
{
"epoch": 0.8679386457164235,
"grad_norm": 0.7863034594758392,
"learning_rate": 5e-06,
"loss": 0.6133,
"step": 290
},
{
"epoch": 0.8978675645342312,
"grad_norm": 0.8097674195506819,
"learning_rate": 5e-06,
"loss": 0.6145,
"step": 300
},
{
"epoch": 0.9277964833520389,
"grad_norm": 0.6976711471967793,
"learning_rate": 5e-06,
"loss": 0.6132,
"step": 310
},
{
"epoch": 0.9577254021698466,
"grad_norm": 0.6845188306806823,
"learning_rate": 5e-06,
"loss": 0.615,
"step": 320
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.9820656021369418,
"learning_rate": 5e-06,
"loss": 0.6085,
"step": 330
},
{
"epoch": 0.9996258885147774,
"eval_loss": 0.6192271709442139,
"eval_runtime": 270.7058,
"eval_samples_per_second": 33.25,
"eval_steps_per_second": 0.521,
"step": 334
},
{
"epoch": 1.017583239805462,
"grad_norm": 0.8670558349646079,
"learning_rate": 5e-06,
"loss": 0.6314,
"step": 340
},
{
"epoch": 1.0475121586232696,
"grad_norm": 1.0194196930172406,
"learning_rate": 5e-06,
"loss": 0.5525,
"step": 350
},
{
"epoch": 1.0774410774410774,
"grad_norm": 0.7571264421758325,
"learning_rate": 5e-06,
"loss": 0.5475,
"step": 360
},
{
"epoch": 1.1073699962588852,
"grad_norm": 0.716142300432686,
"learning_rate": 5e-06,
"loss": 0.5479,
"step": 370
},
{
"epoch": 1.1372989150766928,
"grad_norm": 0.7134968159345548,
"learning_rate": 5e-06,
"loss": 0.5483,
"step": 380
},
{
"epoch": 1.1672278338945006,
"grad_norm": 0.7093422370162528,
"learning_rate": 5e-06,
"loss": 0.5497,
"step": 390
},
{
"epoch": 1.1971567527123081,
"grad_norm": 0.6758306313904245,
"learning_rate": 5e-06,
"loss": 0.5499,
"step": 400
},
{
"epoch": 1.227085671530116,
"grad_norm": 0.6590188596738886,
"learning_rate": 5e-06,
"loss": 0.5523,
"step": 410
},
{
"epoch": 1.2570145903479237,
"grad_norm": 0.7115281205352587,
"learning_rate": 5e-06,
"loss": 0.557,
"step": 420
},
{
"epoch": 1.2869435091657313,
"grad_norm": 0.6651956769462775,
"learning_rate": 5e-06,
"loss": 0.5521,
"step": 430
},
{
"epoch": 1.316872427983539,
"grad_norm": 0.686904033471436,
"learning_rate": 5e-06,
"loss": 0.5542,
"step": 440
},
{
"epoch": 1.3468013468013469,
"grad_norm": 0.7052326227629313,
"learning_rate": 5e-06,
"loss": 0.5473,
"step": 450
},
{
"epoch": 1.3767302656191545,
"grad_norm": 0.6603203892732427,
"learning_rate": 5e-06,
"loss": 0.558,
"step": 460
},
{
"epoch": 1.4066591844369623,
"grad_norm": 0.7204930864199384,
"learning_rate": 5e-06,
"loss": 0.557,
"step": 470
},
{
"epoch": 1.43658810325477,
"grad_norm": 0.6582974125304011,
"learning_rate": 5e-06,
"loss": 0.5598,
"step": 480
},
{
"epoch": 1.4665170220725776,
"grad_norm": 0.653408089340934,
"learning_rate": 5e-06,
"loss": 0.5589,
"step": 490
},
{
"epoch": 1.4964459408903854,
"grad_norm": 0.7722703692356943,
"learning_rate": 5e-06,
"loss": 0.5549,
"step": 500
},
{
"epoch": 1.5263748597081932,
"grad_norm": 0.6410463952946445,
"learning_rate": 5e-06,
"loss": 0.5571,
"step": 510
},
{
"epoch": 1.5563037785260008,
"grad_norm": 0.6788292489082296,
"learning_rate": 5e-06,
"loss": 0.5567,
"step": 520
},
{
"epoch": 1.5862326973438083,
"grad_norm": 0.7347495173956178,
"learning_rate": 5e-06,
"loss": 0.5557,
"step": 530
},
{
"epoch": 1.6161616161616161,
"grad_norm": 0.7489697605253047,
"learning_rate": 5e-06,
"loss": 0.56,
"step": 540
},
{
"epoch": 1.646090534979424,
"grad_norm": 0.6649594456868578,
"learning_rate": 5e-06,
"loss": 0.5596,
"step": 550
},
{
"epoch": 1.6760194537972315,
"grad_norm": 0.6944801894329058,
"learning_rate": 5e-06,
"loss": 0.5499,
"step": 560
},
{
"epoch": 1.7059483726150393,
"grad_norm": 0.7516636245416078,
"learning_rate": 5e-06,
"loss": 0.5519,
"step": 570
},
{
"epoch": 1.735877291432847,
"grad_norm": 0.834145985540098,
"learning_rate": 5e-06,
"loss": 0.5525,
"step": 580
},
{
"epoch": 1.7658062102506547,
"grad_norm": 0.6940488546001392,
"learning_rate": 5e-06,
"loss": 0.5561,
"step": 590
},
{
"epoch": 1.7957351290684624,
"grad_norm": 0.6996951151429136,
"learning_rate": 5e-06,
"loss": 0.5568,
"step": 600
},
{
"epoch": 1.8256640478862702,
"grad_norm": 0.6321044767548653,
"learning_rate": 5e-06,
"loss": 0.5543,
"step": 610
},
{
"epoch": 1.8555929667040778,
"grad_norm": 0.6380400908901183,
"learning_rate": 5e-06,
"loss": 0.5521,
"step": 620
},
{
"epoch": 1.8855218855218854,
"grad_norm": 0.6726516418071744,
"learning_rate": 5e-06,
"loss": 0.5536,
"step": 630
},
{
"epoch": 1.9154508043396934,
"grad_norm": 0.6952484734366503,
"learning_rate": 5e-06,
"loss": 0.5556,
"step": 640
},
{
"epoch": 1.945379723157501,
"grad_norm": 0.6339074390401458,
"learning_rate": 5e-06,
"loss": 0.554,
"step": 650
},
{
"epoch": 1.9753086419753085,
"grad_norm": 0.8335015680516275,
"learning_rate": 5e-06,
"loss": 0.5595,
"step": 660
},
{
"epoch": 1.9992517770295548,
"eval_loss": 0.615513026714325,
"eval_runtime": 271.7727,
"eval_samples_per_second": 33.12,
"eval_steps_per_second": 0.519,
"step": 668
},
{
"epoch": 2.0052375607931165,
"grad_norm": 1.095470898789856,
"learning_rate": 5e-06,
"loss": 0.5957,
"step": 670
},
{
"epoch": 2.035166479610924,
"grad_norm": 0.7925440628175368,
"learning_rate": 5e-06,
"loss": 0.4851,
"step": 680
},
{
"epoch": 2.0650953984287317,
"grad_norm": 0.7600969395946293,
"learning_rate": 5e-06,
"loss": 0.4799,
"step": 690
},
{
"epoch": 2.0950243172465393,
"grad_norm": 0.8685890982294241,
"learning_rate": 5e-06,
"loss": 0.4845,
"step": 700
},
{
"epoch": 2.1249532360643473,
"grad_norm": 0.7159762779954674,
"learning_rate": 5e-06,
"loss": 0.4862,
"step": 710
},
{
"epoch": 2.154882154882155,
"grad_norm": 0.7850425626912287,
"learning_rate": 5e-06,
"loss": 0.4882,
"step": 720
},
{
"epoch": 2.1848110736999624,
"grad_norm": 0.7829173560959974,
"learning_rate": 5e-06,
"loss": 0.4894,
"step": 730
},
{
"epoch": 2.2147399925177704,
"grad_norm": 0.7053202412118417,
"learning_rate": 5e-06,
"loss": 0.4898,
"step": 740
},
{
"epoch": 2.244668911335578,
"grad_norm": 0.7275797577145928,
"learning_rate": 5e-06,
"loss": 0.4917,
"step": 750
},
{
"epoch": 2.2745978301533856,
"grad_norm": 0.684143630508004,
"learning_rate": 5e-06,
"loss": 0.4878,
"step": 760
},
{
"epoch": 2.3045267489711936,
"grad_norm": 0.778690697436679,
"learning_rate": 5e-06,
"loss": 0.4936,
"step": 770
},
{
"epoch": 2.334455667789001,
"grad_norm": 0.6973756438711023,
"learning_rate": 5e-06,
"loss": 0.4885,
"step": 780
},
{
"epoch": 2.3643845866068087,
"grad_norm": 0.7512378015475496,
"learning_rate": 5e-06,
"loss": 0.4902,
"step": 790
},
{
"epoch": 2.3943135054246163,
"grad_norm": 0.6954041240036626,
"learning_rate": 5e-06,
"loss": 0.4947,
"step": 800
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.7661445266388807,
"learning_rate": 5e-06,
"loss": 0.4995,
"step": 810
},
{
"epoch": 2.454171343060232,
"grad_norm": 0.7288724567709918,
"learning_rate": 5e-06,
"loss": 0.4979,
"step": 820
},
{
"epoch": 2.48410026187804,
"grad_norm": 0.7507674417043292,
"learning_rate": 5e-06,
"loss": 0.4968,
"step": 830
},
{
"epoch": 2.5140291806958475,
"grad_norm": 0.6886877322873068,
"learning_rate": 5e-06,
"loss": 0.4951,
"step": 840
},
{
"epoch": 2.543958099513655,
"grad_norm": 0.710314562589874,
"learning_rate": 5e-06,
"loss": 0.498,
"step": 850
},
{
"epoch": 2.5738870183314626,
"grad_norm": 0.6994762876301733,
"learning_rate": 5e-06,
"loss": 0.4959,
"step": 860
},
{
"epoch": 2.6038159371492706,
"grad_norm": 0.7582356365854407,
"learning_rate": 5e-06,
"loss": 0.4938,
"step": 870
},
{
"epoch": 2.633744855967078,
"grad_norm": 0.731935619090177,
"learning_rate": 5e-06,
"loss": 0.4921,
"step": 880
},
{
"epoch": 2.6636737747848858,
"grad_norm": 0.74782144362319,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 890
},
{
"epoch": 2.6936026936026938,
"grad_norm": 0.6942188030457457,
"learning_rate": 5e-06,
"loss": 0.5011,
"step": 900
},
{
"epoch": 2.7235316124205013,
"grad_norm": 0.6881327401867374,
"learning_rate": 5e-06,
"loss": 0.4947,
"step": 910
},
{
"epoch": 2.753460531238309,
"grad_norm": 0.6856202337817314,
"learning_rate": 5e-06,
"loss": 0.4959,
"step": 920
},
{
"epoch": 2.7833894500561165,
"grad_norm": 0.7141040450174527,
"learning_rate": 5e-06,
"loss": 0.5016,
"step": 930
},
{
"epoch": 2.8133183688739245,
"grad_norm": 0.6857610208401852,
"learning_rate": 5e-06,
"loss": 0.4985,
"step": 940
},
{
"epoch": 2.843247287691732,
"grad_norm": 0.6698180625003869,
"learning_rate": 5e-06,
"loss": 0.4986,
"step": 950
},
{
"epoch": 2.87317620650954,
"grad_norm": 0.8039520213911328,
"learning_rate": 5e-06,
"loss": 0.502,
"step": 960
},
{
"epoch": 2.9031051253273477,
"grad_norm": 0.7415409936401505,
"learning_rate": 5e-06,
"loss": 0.4959,
"step": 970
},
{
"epoch": 2.9330340441451552,
"grad_norm": 0.7856625436324756,
"learning_rate": 5e-06,
"loss": 0.5023,
"step": 980
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.761345605606732,
"learning_rate": 5e-06,
"loss": 0.5048,
"step": 990
},
{
"epoch": 2.992891881780771,
"grad_norm": 0.7302412373936236,
"learning_rate": 5e-06,
"loss": 0.5047,
"step": 1000
},
{
"epoch": 2.9988776655443323,
"eval_loss": 0.6354114413261414,
"eval_runtime": 270.917,
"eval_samples_per_second": 33.224,
"eval_steps_per_second": 0.52,
"step": 1002
},
{
"epoch": 2.9988776655443323,
"step": 1002,
"total_flos": 3817854814126080.0,
"train_loss": 0.5626692353727337,
"train_runtime": 46778.8688,
"train_samples_per_second": 10.967,
"train_steps_per_second": 0.021
}
],
"logging_steps": 10,
"max_steps": 1002,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3817854814126080.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}