{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.991501416430595,
"eval_steps": 500,
"global_step": 792,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03777148253068933,
"grad_norm": 2.3805434235210083,
"learning_rate": 5e-06,
"loss": 1.0395,
"step": 10
},
{
"epoch": 0.07554296506137866,
"grad_norm": 2.856513616647182,
"learning_rate": 5e-06,
"loss": 0.9048,
"step": 20
},
{
"epoch": 0.11331444759206799,
"grad_norm": 1.44752023730039,
"learning_rate": 5e-06,
"loss": 0.8676,
"step": 30
},
{
"epoch": 0.1510859301227573,
"grad_norm": 2.724838836111604,
"learning_rate": 5e-06,
"loss": 0.8452,
"step": 40
},
{
"epoch": 0.18885741265344666,
"grad_norm": 1.5098321508092247,
"learning_rate": 5e-06,
"loss": 0.8261,
"step": 50
},
{
"epoch": 0.22662889518413598,
"grad_norm": 1.1562207951359371,
"learning_rate": 5e-06,
"loss": 0.8055,
"step": 60
},
{
"epoch": 0.26440037771482533,
"grad_norm": 1.0400869852117345,
"learning_rate": 5e-06,
"loss": 0.7953,
"step": 70
},
{
"epoch": 0.3021718602455146,
"grad_norm": 0.9148701519417262,
"learning_rate": 5e-06,
"loss": 0.786,
"step": 80
},
{
"epoch": 0.33994334277620397,
"grad_norm": 0.8722664324312153,
"learning_rate": 5e-06,
"loss": 0.7725,
"step": 90
},
{
"epoch": 0.3777148253068933,
"grad_norm": 1.0427331036560754,
"learning_rate": 5e-06,
"loss": 0.7813,
"step": 100
},
{
"epoch": 0.4154863078375826,
"grad_norm": 0.6602583497680015,
"learning_rate": 5e-06,
"loss": 0.7702,
"step": 110
},
{
"epoch": 0.45325779036827196,
"grad_norm": 0.6261264742184848,
"learning_rate": 5e-06,
"loss": 0.7601,
"step": 120
},
{
"epoch": 0.4910292728989613,
"grad_norm": 0.8612076277718839,
"learning_rate": 5e-06,
"loss": 0.765,
"step": 130
},
{
"epoch": 0.5288007554296507,
"grad_norm": 0.7081715175245555,
"learning_rate": 5e-06,
"loss": 0.7585,
"step": 140
},
{
"epoch": 0.56657223796034,
"grad_norm": 0.6464092566763289,
"learning_rate": 5e-06,
"loss": 0.7582,
"step": 150
},
{
"epoch": 0.6043437204910292,
"grad_norm": 0.7867913648175029,
"learning_rate": 5e-06,
"loss": 0.7535,
"step": 160
},
{
"epoch": 0.6421152030217187,
"grad_norm": 0.8335496759284264,
"learning_rate": 5e-06,
"loss": 0.7506,
"step": 170
},
{
"epoch": 0.6798866855524079,
"grad_norm": 0.9045739681846007,
"learning_rate": 5e-06,
"loss": 0.7538,
"step": 180
},
{
"epoch": 0.7176581680830972,
"grad_norm": 0.642733706975797,
"learning_rate": 5e-06,
"loss": 0.7512,
"step": 190
},
{
"epoch": 0.7554296506137866,
"grad_norm": 0.7751098599889861,
"learning_rate": 5e-06,
"loss": 0.7432,
"step": 200
},
{
"epoch": 0.7932011331444759,
"grad_norm": 0.9169213826519828,
"learning_rate": 5e-06,
"loss": 0.7523,
"step": 210
},
{
"epoch": 0.8309726156751652,
"grad_norm": 0.751413803169088,
"learning_rate": 5e-06,
"loss": 0.7475,
"step": 220
},
{
"epoch": 0.8687440982058546,
"grad_norm": 0.7640332235725673,
"learning_rate": 5e-06,
"loss": 0.7384,
"step": 230
},
{
"epoch": 0.9065155807365439,
"grad_norm": 0.728990276372915,
"learning_rate": 5e-06,
"loss": 0.742,
"step": 240
},
{
"epoch": 0.9442870632672332,
"grad_norm": 0.6854770933941848,
"learning_rate": 5e-06,
"loss": 0.7415,
"step": 250
},
{
"epoch": 0.9820585457979226,
"grad_norm": 0.7112160685903344,
"learning_rate": 5e-06,
"loss": 0.7413,
"step": 260
},
{
"epoch": 0.9971671388101983,
"eval_loss": 0.7381541132926941,
"eval_runtime": 185.7861,
"eval_samples_per_second": 38.399,
"eval_steps_per_second": 0.603,
"step": 264
},
{
"epoch": 1.019830028328612,
"grad_norm": 0.8806260533344807,
"learning_rate": 5e-06,
"loss": 0.7317,
"step": 270
},
{
"epoch": 1.0576015108593013,
"grad_norm": 0.8463597906884003,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 280
},
{
"epoch": 1.0953729933899905,
"grad_norm": 0.998627698983325,
"learning_rate": 5e-06,
"loss": 0.6903,
"step": 290
},
{
"epoch": 1.13314447592068,
"grad_norm": 0.8372558817429967,
"learning_rate": 5e-06,
"loss": 0.684,
"step": 300
},
{
"epoch": 1.1709159584513693,
"grad_norm": 0.6335303040398027,
"learning_rate": 5e-06,
"loss": 0.6899,
"step": 310
},
{
"epoch": 1.2086874409820585,
"grad_norm": 0.8131802909561154,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 320
},
{
"epoch": 1.246458923512748,
"grad_norm": 0.8544395006729588,
"learning_rate": 5e-06,
"loss": 0.6876,
"step": 330
},
{
"epoch": 1.284230406043437,
"grad_norm": 0.7496941658426767,
"learning_rate": 5e-06,
"loss": 0.6841,
"step": 340
},
{
"epoch": 1.3220018885741265,
"grad_norm": 0.6225060094101681,
"learning_rate": 5e-06,
"loss": 0.6875,
"step": 350
},
{
"epoch": 1.3597733711048159,
"grad_norm": 0.8041619905694252,
"learning_rate": 5e-06,
"loss": 0.6835,
"step": 360
},
{
"epoch": 1.3975448536355053,
"grad_norm": 0.6579082421853544,
"learning_rate": 5e-06,
"loss": 0.6839,
"step": 370
},
{
"epoch": 1.4353163361661945,
"grad_norm": 0.6914768615360496,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 380
},
{
"epoch": 1.4730878186968839,
"grad_norm": 0.5488530522257256,
"learning_rate": 5e-06,
"loss": 0.6844,
"step": 390
},
{
"epoch": 1.510859301227573,
"grad_norm": 0.5967231206297695,
"learning_rate": 5e-06,
"loss": 0.6835,
"step": 400
},
{
"epoch": 1.5486307837582625,
"grad_norm": 0.8534656126840098,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 410
},
{
"epoch": 1.5864022662889519,
"grad_norm": 0.8114796417385948,
"learning_rate": 5e-06,
"loss": 0.6892,
"step": 420
},
{
"epoch": 1.6241737488196413,
"grad_norm": 0.7087354816053721,
"learning_rate": 5e-06,
"loss": 0.6854,
"step": 430
},
{
"epoch": 1.6619452313503305,
"grad_norm": 0.5687873916760361,
"learning_rate": 5e-06,
"loss": 0.6874,
"step": 440
},
{
"epoch": 1.6997167138810199,
"grad_norm": 0.7321157421532287,
"learning_rate": 5e-06,
"loss": 0.6891,
"step": 450
},
{
"epoch": 1.737488196411709,
"grad_norm": 0.9059336248026789,
"learning_rate": 5e-06,
"loss": 0.6813,
"step": 460
},
{
"epoch": 1.7752596789423984,
"grad_norm": 0.675756534213701,
"learning_rate": 5e-06,
"loss": 0.6835,
"step": 470
},
{
"epoch": 1.8130311614730878,
"grad_norm": 0.6005853815924641,
"learning_rate": 5e-06,
"loss": 0.6835,
"step": 480
},
{
"epoch": 1.8508026440037773,
"grad_norm": 0.7612226170902838,
"learning_rate": 5e-06,
"loss": 0.68,
"step": 490
},
{
"epoch": 1.8885741265344664,
"grad_norm": 0.7562478445031421,
"learning_rate": 5e-06,
"loss": 0.6805,
"step": 500
},
{
"epoch": 1.9263456090651558,
"grad_norm": 0.7011514213635397,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 510
},
{
"epoch": 1.964117091595845,
"grad_norm": 0.668105825093532,
"learning_rate": 5e-06,
"loss": 0.6815,
"step": 520
},
{
"epoch": 1.9981114258734656,
"eval_loss": 0.7243772149085999,
"eval_runtime": 177.4303,
"eval_samples_per_second": 40.207,
"eval_steps_per_second": 0.631,
"step": 529
},
{
"epoch": 2.0018885741265344,
"grad_norm": 1.0061690465784974,
"learning_rate": 5e-06,
"loss": 0.7,
"step": 530
},
{
"epoch": 2.039660056657224,
"grad_norm": 0.7877880100051793,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 540
},
{
"epoch": 2.0774315391879132,
"grad_norm": 0.793030648640271,
"learning_rate": 5e-06,
"loss": 0.6269,
"step": 550
},
{
"epoch": 2.1152030217186026,
"grad_norm": 0.7589773401731925,
"learning_rate": 5e-06,
"loss": 0.6306,
"step": 560
},
{
"epoch": 2.1529745042492916,
"grad_norm": 0.6507155946743034,
"learning_rate": 5e-06,
"loss": 0.6302,
"step": 570
},
{
"epoch": 2.190745986779981,
"grad_norm": 0.7706192007874249,
"learning_rate": 5e-06,
"loss": 0.6278,
"step": 580
},
{
"epoch": 2.2285174693106704,
"grad_norm": 0.6327752250601594,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 590
},
{
"epoch": 2.26628895184136,
"grad_norm": 0.6327979312894738,
"learning_rate": 5e-06,
"loss": 0.6326,
"step": 600
},
{
"epoch": 2.304060434372049,
"grad_norm": 0.5816500449436098,
"learning_rate": 5e-06,
"loss": 0.6322,
"step": 610
},
{
"epoch": 2.3418319169027386,
"grad_norm": 0.7685458410506589,
"learning_rate": 5e-06,
"loss": 0.632,
"step": 620
},
{
"epoch": 2.3796033994334276,
"grad_norm": 0.7331975528938945,
"learning_rate": 5e-06,
"loss": 0.6359,
"step": 630
},
{
"epoch": 2.417374881964117,
"grad_norm": 0.6048367664881513,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 640
},
{
"epoch": 2.4551463644948064,
"grad_norm": 0.7225741561090323,
"learning_rate": 5e-06,
"loss": 0.6304,
"step": 650
},
{
"epoch": 2.492917847025496,
"grad_norm": 0.6762661427796176,
"learning_rate": 5e-06,
"loss": 0.6348,
"step": 660
},
{
"epoch": 2.530689329556185,
"grad_norm": 0.6888475213512071,
"learning_rate": 5e-06,
"loss": 0.6335,
"step": 670
},
{
"epoch": 2.568460812086874,
"grad_norm": 0.7082247476426633,
"learning_rate": 5e-06,
"loss": 0.6349,
"step": 680
},
{
"epoch": 2.6062322946175636,
"grad_norm": 0.7648839755479956,
"learning_rate": 5e-06,
"loss": 0.6285,
"step": 690
},
{
"epoch": 2.644003777148253,
"grad_norm": 0.7473110651002637,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 700
},
{
"epoch": 2.6817752596789424,
"grad_norm": 0.6406025398016005,
"learning_rate": 5e-06,
"loss": 0.6393,
"step": 710
},
{
"epoch": 2.7195467422096318,
"grad_norm": 0.6180603575164161,
"learning_rate": 5e-06,
"loss": 0.6397,
"step": 720
},
{
"epoch": 2.757318224740321,
"grad_norm": 0.7188210996816503,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 730
},
{
"epoch": 2.7950897072710106,
"grad_norm": 0.7014279573066574,
"learning_rate": 5e-06,
"loss": 0.6394,
"step": 740
},
{
"epoch": 2.8328611898017,
"grad_norm": 0.6189845652330979,
"learning_rate": 5e-06,
"loss": 0.6321,
"step": 750
},
{
"epoch": 2.870632672332389,
"grad_norm": 0.6750791101100844,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 760
},
{
"epoch": 2.9084041548630784,
"grad_norm": 0.645156423238826,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 770
},
{
"epoch": 2.9461756373937678,
"grad_norm": 0.7328592259216773,
"learning_rate": 5e-06,
"loss": 0.6412,
"step": 780
},
{
"epoch": 2.983947119924457,
"grad_norm": 0.5893339209675429,
"learning_rate": 5e-06,
"loss": 0.6392,
"step": 790
},
{
"epoch": 2.991501416430595,
"eval_loss": 0.7273694276809692,
"eval_runtime": 177.6532,
"eval_samples_per_second": 40.157,
"eval_steps_per_second": 0.63,
"step": 792
},
{
"epoch": 2.991501416430595,
"step": 792,
"total_flos": 1326420118732800.0,
"train_loss": 0.7023529005472107,
"train_runtime": 26189.6442,
"train_samples_per_second": 15.525,
"train_steps_per_second": 0.03
}
],
"logging_steps": 10,
"max_steps": 792,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1326420118732800.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}