{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9960988296488944,
"eval_steps": 500,
"global_step": 864,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.034677069787602946,
"grad_norm": 4.0249637874471995,
"learning_rate": 5e-06,
"loss": 1.0488,
"step": 10
},
{
"epoch": 0.06935413957520589,
"grad_norm": 1.7273291496388328,
"learning_rate": 5e-06,
"loss": 0.9328,
"step": 20
},
{
"epoch": 0.10403120936280884,
"grad_norm": 2.588347397545261,
"learning_rate": 5e-06,
"loss": 0.8892,
"step": 30
},
{
"epoch": 0.13870827915041178,
"grad_norm": 1.5360583750484056,
"learning_rate": 5e-06,
"loss": 0.8717,
"step": 40
},
{
"epoch": 0.17338534893801474,
"grad_norm": 1.1396695772844396,
"learning_rate": 5e-06,
"loss": 0.8475,
"step": 50
},
{
"epoch": 0.20806241872561768,
"grad_norm": 1.0308386132806844,
"learning_rate": 5e-06,
"loss": 0.8293,
"step": 60
},
{
"epoch": 0.24273948851322064,
"grad_norm": 1.0528463497860754,
"learning_rate": 5e-06,
"loss": 0.8176,
"step": 70
},
{
"epoch": 0.27741655830082357,
"grad_norm": 1.0138322547682082,
"learning_rate": 5e-06,
"loss": 0.8068,
"step": 80
},
{
"epoch": 0.31209362808842656,
"grad_norm": 1.0784904725043016,
"learning_rate": 5e-06,
"loss": 0.7992,
"step": 90
},
{
"epoch": 0.3467706978760295,
"grad_norm": 0.9906052569468718,
"learning_rate": 5e-06,
"loss": 0.7991,
"step": 100
},
{
"epoch": 0.3814477676636324,
"grad_norm": 1.1832971820048892,
"learning_rate": 5e-06,
"loss": 0.7894,
"step": 110
},
{
"epoch": 0.41612483745123535,
"grad_norm": 0.773999499704406,
"learning_rate": 5e-06,
"loss": 0.7832,
"step": 120
},
{
"epoch": 0.45080190723883834,
"grad_norm": 1.1303467514316676,
"learning_rate": 5e-06,
"loss": 0.7807,
"step": 130
},
{
"epoch": 0.48547897702644127,
"grad_norm": 0.6069961967445706,
"learning_rate": 5e-06,
"loss": 0.7857,
"step": 140
},
{
"epoch": 0.5201560468140443,
"grad_norm": 0.6816537291509634,
"learning_rate": 5e-06,
"loss": 0.7779,
"step": 150
},
{
"epoch": 0.5548331166016471,
"grad_norm": 0.7684116239237637,
"learning_rate": 5e-06,
"loss": 0.7739,
"step": 160
},
{
"epoch": 0.5895101863892501,
"grad_norm": 0.8491777059398649,
"learning_rate": 5e-06,
"loss": 0.7757,
"step": 170
},
{
"epoch": 0.6241872561768531,
"grad_norm": 1.0896111775038086,
"learning_rate": 5e-06,
"loss": 0.7675,
"step": 180
},
{
"epoch": 0.658864325964456,
"grad_norm": 0.8510614586902955,
"learning_rate": 5e-06,
"loss": 0.7732,
"step": 190
},
{
"epoch": 0.693541395752059,
"grad_norm": 0.6064417861325208,
"learning_rate": 5e-06,
"loss": 0.7677,
"step": 200
},
{
"epoch": 0.7282184655396619,
"grad_norm": 0.5980019145794307,
"learning_rate": 5e-06,
"loss": 0.7667,
"step": 210
},
{
"epoch": 0.7628955353272648,
"grad_norm": 0.7017739167578199,
"learning_rate": 5e-06,
"loss": 0.7644,
"step": 220
},
{
"epoch": 0.7975726051148678,
"grad_norm": 0.6027062835922619,
"learning_rate": 5e-06,
"loss": 0.7664,
"step": 230
},
{
"epoch": 0.8322496749024707,
"grad_norm": 0.6596293739585312,
"learning_rate": 5e-06,
"loss": 0.7559,
"step": 240
},
{
"epoch": 0.8669267446900737,
"grad_norm": 0.7441210106410687,
"learning_rate": 5e-06,
"loss": 0.7612,
"step": 250
},
{
"epoch": 0.9016038144776767,
"grad_norm": 0.7762267182363182,
"learning_rate": 5e-06,
"loss": 0.758,
"step": 260
},
{
"epoch": 0.9362808842652796,
"grad_norm": 0.6385587861511055,
"learning_rate": 5e-06,
"loss": 0.759,
"step": 270
},
{
"epoch": 0.9709579540528825,
"grad_norm": 0.6361978634019184,
"learning_rate": 5e-06,
"loss": 0.7583,
"step": 280
},
{
"epoch": 0.9986996098829649,
"eval_loss": 0.753886342048645,
"eval_runtime": 197.8618,
"eval_samples_per_second": 39.265,
"eval_steps_per_second": 0.617,
"step": 288
},
{
"epoch": 1.0056350238404854,
"grad_norm": 0.9912660198339807,
"learning_rate": 5e-06,
"loss": 0.7976,
"step": 290
},
{
"epoch": 1.0403120936280885,
"grad_norm": 1.535110610753484,
"learning_rate": 5e-06,
"loss": 0.7106,
"step": 300
},
{
"epoch": 1.0749891634156914,
"grad_norm": 0.7729921866431072,
"learning_rate": 5e-06,
"loss": 0.7066,
"step": 310
},
{
"epoch": 1.1096662332032943,
"grad_norm": 0.6463456897522241,
"learning_rate": 5e-06,
"loss": 0.7058,
"step": 320
},
{
"epoch": 1.1443433029908974,
"grad_norm": 0.6035036766400345,
"learning_rate": 5e-06,
"loss": 0.7031,
"step": 330
},
{
"epoch": 1.1790203727785002,
"grad_norm": 0.7290582812008339,
"learning_rate": 5e-06,
"loss": 0.6997,
"step": 340
},
{
"epoch": 1.2136974425661031,
"grad_norm": 0.6606339548104747,
"learning_rate": 5e-06,
"loss": 0.7048,
"step": 350
},
{
"epoch": 1.2483745123537062,
"grad_norm": 0.7747101369350952,
"learning_rate": 5e-06,
"loss": 0.7049,
"step": 360
},
{
"epoch": 1.283051582141309,
"grad_norm": 0.6447861318282291,
"learning_rate": 5e-06,
"loss": 0.7025,
"step": 370
},
{
"epoch": 1.317728651928912,
"grad_norm": 0.5446961444193648,
"learning_rate": 5e-06,
"loss": 0.7071,
"step": 380
},
{
"epoch": 1.352405721716515,
"grad_norm": 0.6316405546311903,
"learning_rate": 5e-06,
"loss": 0.7062,
"step": 390
},
{
"epoch": 1.387082791504118,
"grad_norm": 0.7714724505395334,
"learning_rate": 5e-06,
"loss": 0.7013,
"step": 400
},
{
"epoch": 1.4217598612917208,
"grad_norm": 0.6435708845447846,
"learning_rate": 5e-06,
"loss": 0.7043,
"step": 410
},
{
"epoch": 1.456436931079324,
"grad_norm": 0.6549313245316034,
"learning_rate": 5e-06,
"loss": 0.705,
"step": 420
},
{
"epoch": 1.4911140008669268,
"grad_norm": 0.6778607750028962,
"learning_rate": 5e-06,
"loss": 0.7035,
"step": 430
},
{
"epoch": 1.5257910706545297,
"grad_norm": 0.6526217069666287,
"learning_rate": 5e-06,
"loss": 0.7062,
"step": 440
},
{
"epoch": 1.5604681404421328,
"grad_norm": 0.7252842545795193,
"learning_rate": 5e-06,
"loss": 0.7056,
"step": 450
},
{
"epoch": 1.5951452102297354,
"grad_norm": 0.6026254905163209,
"learning_rate": 5e-06,
"loss": 0.7019,
"step": 460
},
{
"epoch": 1.6298222800173385,
"grad_norm": 0.7348281623426512,
"learning_rate": 5e-06,
"loss": 0.7019,
"step": 470
},
{
"epoch": 1.6644993498049416,
"grad_norm": 0.7277211190473597,
"learning_rate": 5e-06,
"loss": 0.7057,
"step": 480
},
{
"epoch": 1.6991764195925443,
"grad_norm": 0.8075931048690591,
"learning_rate": 5e-06,
"loss": 0.7043,
"step": 490
},
{
"epoch": 1.7338534893801474,
"grad_norm": 0.703136267805299,
"learning_rate": 5e-06,
"loss": 0.704,
"step": 500
},
{
"epoch": 1.7685305591677505,
"grad_norm": 0.6867057678356385,
"learning_rate": 5e-06,
"loss": 0.7046,
"step": 510
},
{
"epoch": 1.8032076289553531,
"grad_norm": 0.7168492824064608,
"learning_rate": 5e-06,
"loss": 0.7031,
"step": 520
},
{
"epoch": 1.8378846987429562,
"grad_norm": 0.6468146069695732,
"learning_rate": 5e-06,
"loss": 0.7026,
"step": 530
},
{
"epoch": 1.8725617685305593,
"grad_norm": 0.5599596596574505,
"learning_rate": 5e-06,
"loss": 0.7006,
"step": 540
},
{
"epoch": 1.907238838318162,
"grad_norm": 0.7333684396811263,
"learning_rate": 5e-06,
"loss": 0.7033,
"step": 550
},
{
"epoch": 1.941915908105765,
"grad_norm": 0.6683061963261424,
"learning_rate": 5e-06,
"loss": 0.6992,
"step": 560
},
{
"epoch": 1.976592977893368,
"grad_norm": 0.655884340314605,
"learning_rate": 5e-06,
"loss": 0.6958,
"step": 570
},
{
"epoch": 1.9973992197659298,
"eval_loss": 0.7401416897773743,
"eval_runtime": 195.3569,
"eval_samples_per_second": 39.768,
"eval_steps_per_second": 0.624,
"step": 576
},
{
"epoch": 2.011270047680971,
"grad_norm": 1.0541355293769905,
"learning_rate": 5e-06,
"loss": 0.7266,
"step": 580
},
{
"epoch": 2.045947117468574,
"grad_norm": 0.8515109317795075,
"learning_rate": 5e-06,
"loss": 0.6496,
"step": 590
},
{
"epoch": 2.080624187256177,
"grad_norm": 0.8609098344070957,
"learning_rate": 5e-06,
"loss": 0.6459,
"step": 600
},
{
"epoch": 2.1153012570437797,
"grad_norm": 0.6709051306842824,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 610
},
{
"epoch": 2.149978326831383,
"grad_norm": 0.6884941523677242,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 620
},
{
"epoch": 2.184655396618986,
"grad_norm": 0.6822156803842125,
"learning_rate": 5e-06,
"loss": 0.6454,
"step": 630
},
{
"epoch": 2.2193324664065885,
"grad_norm": 0.783762019991312,
"learning_rate": 5e-06,
"loss": 0.6502,
"step": 640
},
{
"epoch": 2.2540095361941916,
"grad_norm": 0.7183875213713674,
"learning_rate": 5e-06,
"loss": 0.6523,
"step": 650
},
{
"epoch": 2.2886866059817947,
"grad_norm": 0.7034570349597838,
"learning_rate": 5e-06,
"loss": 0.6512,
"step": 660
},
{
"epoch": 2.3233636757693974,
"grad_norm": 0.6107483226470054,
"learning_rate": 5e-06,
"loss": 0.6528,
"step": 670
},
{
"epoch": 2.3580407455570005,
"grad_norm": 0.6709721031936152,
"learning_rate": 5e-06,
"loss": 0.6514,
"step": 680
},
{
"epoch": 2.3927178153446036,
"grad_norm": 0.717931740489821,
"learning_rate": 5e-06,
"loss": 0.6504,
"step": 690
},
{
"epoch": 2.4273948851322062,
"grad_norm": 0.6775786736254632,
"learning_rate": 5e-06,
"loss": 0.6568,
"step": 700
},
{
"epoch": 2.4620719549198093,
"grad_norm": 0.6141649062955427,
"learning_rate": 5e-06,
"loss": 0.6505,
"step": 710
},
{
"epoch": 2.4967490247074124,
"grad_norm": 0.6919942537111052,
"learning_rate": 5e-06,
"loss": 0.6542,
"step": 720
},
{
"epoch": 2.531426094495015,
"grad_norm": 0.7226456763829804,
"learning_rate": 5e-06,
"loss": 0.6546,
"step": 730
},
{
"epoch": 2.566103164282618,
"grad_norm": 0.56441351482389,
"learning_rate": 5e-06,
"loss": 0.6547,
"step": 740
},
{
"epoch": 2.6007802340702213,
"grad_norm": 0.6207794336554665,
"learning_rate": 5e-06,
"loss": 0.6539,
"step": 750
},
{
"epoch": 2.635457303857824,
"grad_norm": 0.5967792415368525,
"learning_rate": 5e-06,
"loss": 0.6549,
"step": 760
},
{
"epoch": 2.670134373645427,
"grad_norm": 0.7202470628059912,
"learning_rate": 5e-06,
"loss": 0.6535,
"step": 770
},
{
"epoch": 2.70481144343303,
"grad_norm": 0.6000428861128503,
"learning_rate": 5e-06,
"loss": 0.6558,
"step": 780
},
{
"epoch": 2.739488513220633,
"grad_norm": 0.6627746592450424,
"learning_rate": 5e-06,
"loss": 0.6584,
"step": 790
},
{
"epoch": 2.774165583008236,
"grad_norm": 0.6990438570732993,
"learning_rate": 5e-06,
"loss": 0.6528,
"step": 800
},
{
"epoch": 2.808842652795839,
"grad_norm": 0.6611463955257642,
"learning_rate": 5e-06,
"loss": 0.6569,
"step": 810
},
{
"epoch": 2.8435197225834417,
"grad_norm": 0.6625666916962145,
"learning_rate": 5e-06,
"loss": 0.656,
"step": 820
},
{
"epoch": 2.8781967923710448,
"grad_norm": 0.6263198113296461,
"learning_rate": 5e-06,
"loss": 0.6535,
"step": 830
},
{
"epoch": 2.912873862158648,
"grad_norm": 0.6889694820528142,
"learning_rate": 5e-06,
"loss": 0.6523,
"step": 840
},
{
"epoch": 2.9475509319462505,
"grad_norm": 0.6566291665898417,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 850
},
{
"epoch": 2.9822280017338536,
"grad_norm": 0.5999353492283839,
"learning_rate": 5e-06,
"loss": 0.6557,
"step": 860
},
{
"epoch": 2.9960988296488944,
"eval_loss": 0.7398399114608765,
"eval_runtime": 196.1227,
"eval_samples_per_second": 39.613,
"eval_steps_per_second": 0.622,
"step": 864
},
{
"epoch": 2.9960988296488944,
"step": 864,
"total_flos": 1447022800404480.0,
"train_loss": 0.7206557989120483,
"train_runtime": 28697.4972,
"train_samples_per_second": 15.431,
"train_steps_per_second": 0.03
}
],
"logging_steps": 10,
"max_steps": 864,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1447022800404480.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}