{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.994350282485876,
"eval_steps": 500,
"global_step": 795,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03766478342749529,
"grad_norm": 7.01964876508617,
"learning_rate": 5e-06,
"loss": 1.0512,
"step": 10
},
{
"epoch": 0.07532956685499058,
"grad_norm": 1.7064746932399855,
"learning_rate": 5e-06,
"loss": 0.9195,
"step": 20
},
{
"epoch": 0.11299435028248588,
"grad_norm": 1.5997363297482388,
"learning_rate": 5e-06,
"loss": 0.8822,
"step": 30
},
{
"epoch": 0.15065913370998116,
"grad_norm": 1.2080439221780226,
"learning_rate": 5e-06,
"loss": 0.8521,
"step": 40
},
{
"epoch": 0.18832391713747645,
"grad_norm": 1.0679693197690212,
"learning_rate": 5e-06,
"loss": 0.8377,
"step": 50
},
{
"epoch": 0.22598870056497175,
"grad_norm": 0.9608926103732748,
"learning_rate": 5e-06,
"loss": 0.8208,
"step": 60
},
{
"epoch": 0.263653483992467,
"grad_norm": 1.1752578271193257,
"learning_rate": 5e-06,
"loss": 0.8101,
"step": 70
},
{
"epoch": 0.3013182674199623,
"grad_norm": 0.7689747973770075,
"learning_rate": 5e-06,
"loss": 0.8014,
"step": 80
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.8655895794881668,
"learning_rate": 5e-06,
"loss": 0.7916,
"step": 90
},
{
"epoch": 0.3766478342749529,
"grad_norm": 0.7877262798626278,
"learning_rate": 5e-06,
"loss": 0.7837,
"step": 100
},
{
"epoch": 0.4143126177024482,
"grad_norm": 0.670131754521518,
"learning_rate": 5e-06,
"loss": 0.7818,
"step": 110
},
{
"epoch": 0.4519774011299435,
"grad_norm": 0.6897806811013272,
"learning_rate": 5e-06,
"loss": 0.7761,
"step": 120
},
{
"epoch": 0.4896421845574388,
"grad_norm": 0.9612838426596706,
"learning_rate": 5e-06,
"loss": 0.7801,
"step": 130
},
{
"epoch": 0.527306967984934,
"grad_norm": 0.6741872992998229,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 140
},
{
"epoch": 0.5649717514124294,
"grad_norm": 0.7449128082443985,
"learning_rate": 5e-06,
"loss": 0.7697,
"step": 150
},
{
"epoch": 0.6026365348399246,
"grad_norm": 0.702009783976525,
"learning_rate": 5e-06,
"loss": 0.7661,
"step": 160
},
{
"epoch": 0.64030131826742,
"grad_norm": 0.9412048703347311,
"learning_rate": 5e-06,
"loss": 0.7701,
"step": 170
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.8925674460202105,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 180
},
{
"epoch": 0.7156308851224106,
"grad_norm": 0.6803231082977221,
"learning_rate": 5e-06,
"loss": 0.7655,
"step": 190
},
{
"epoch": 0.7532956685499058,
"grad_norm": 0.63167541083718,
"learning_rate": 5e-06,
"loss": 0.7626,
"step": 200
},
{
"epoch": 0.7909604519774012,
"grad_norm": 0.6839121568063468,
"learning_rate": 5e-06,
"loss": 0.7584,
"step": 210
},
{
"epoch": 0.8286252354048964,
"grad_norm": 0.5919745674111999,
"learning_rate": 5e-06,
"loss": 0.7546,
"step": 220
},
{
"epoch": 0.8662900188323918,
"grad_norm": 0.8606865903126677,
"learning_rate": 5e-06,
"loss": 0.7622,
"step": 230
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.8112347024571781,
"learning_rate": 5e-06,
"loss": 0.7542,
"step": 240
},
{
"epoch": 0.9416195856873822,
"grad_norm": 0.7929648837913739,
"learning_rate": 5e-06,
"loss": 0.7598,
"step": 250
},
{
"epoch": 0.9792843691148776,
"grad_norm": 0.6660909939006495,
"learning_rate": 5e-06,
"loss": 0.7563,
"step": 260
},
{
"epoch": 0.9981167608286252,
"eval_loss": 0.7525370717048645,
"eval_runtime": 282.6778,
"eval_samples_per_second": 25.301,
"eval_steps_per_second": 0.396,
"step": 265
},
{
"epoch": 1.0169491525423728,
"grad_norm": 1.3153331677845188,
"learning_rate": 5e-06,
"loss": 0.775,
"step": 270
},
{
"epoch": 1.054613935969868,
"grad_norm": 0.8617206715995941,
"learning_rate": 5e-06,
"loss": 0.6997,
"step": 280
},
{
"epoch": 1.0922787193973635,
"grad_norm": 0.6944168653287546,
"learning_rate": 5e-06,
"loss": 0.7005,
"step": 290
},
{
"epoch": 1.1299435028248588,
"grad_norm": 0.9411772868385534,
"learning_rate": 5e-06,
"loss": 0.6973,
"step": 300
},
{
"epoch": 1.167608286252354,
"grad_norm": 0.6694366573751462,
"learning_rate": 5e-06,
"loss": 0.7051,
"step": 310
},
{
"epoch": 1.2052730696798493,
"grad_norm": 0.7156857674386213,
"learning_rate": 5e-06,
"loss": 0.7018,
"step": 320
},
{
"epoch": 1.2429378531073447,
"grad_norm": 0.8805127872682743,
"learning_rate": 5e-06,
"loss": 0.7009,
"step": 330
},
{
"epoch": 1.28060263653484,
"grad_norm": 0.7003145394069171,
"learning_rate": 5e-06,
"loss": 0.7003,
"step": 340
},
{
"epoch": 1.3182674199623352,
"grad_norm": 0.9332351519153556,
"learning_rate": 5e-06,
"loss": 0.7039,
"step": 350
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.5931688673479261,
"learning_rate": 5e-06,
"loss": 0.6953,
"step": 360
},
{
"epoch": 1.3935969868173257,
"grad_norm": 0.7444491178664134,
"learning_rate": 5e-06,
"loss": 0.7021,
"step": 370
},
{
"epoch": 1.4312617702448212,
"grad_norm": 0.6201120088074494,
"learning_rate": 5e-06,
"loss": 0.6989,
"step": 380
},
{
"epoch": 1.4689265536723164,
"grad_norm": 0.6394360492158847,
"learning_rate": 5e-06,
"loss": 0.7006,
"step": 390
},
{
"epoch": 1.5065913370998116,
"grad_norm": 0.5968659634612707,
"learning_rate": 5e-06,
"loss": 0.7003,
"step": 400
},
{
"epoch": 1.544256120527307,
"grad_norm": 0.6066424887033086,
"learning_rate": 5e-06,
"loss": 0.7013,
"step": 410
},
{
"epoch": 1.5819209039548023,
"grad_norm": 0.7667292931170824,
"learning_rate": 5e-06,
"loss": 0.7022,
"step": 420
},
{
"epoch": 1.6195856873822976,
"grad_norm": 0.5682752376913638,
"learning_rate": 5e-06,
"loss": 0.6947,
"step": 430
},
{
"epoch": 1.6572504708097928,
"grad_norm": 0.6764654533865712,
"learning_rate": 5e-06,
"loss": 0.7,
"step": 440
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.6800954995181525,
"learning_rate": 5e-06,
"loss": 0.7044,
"step": 450
},
{
"epoch": 1.7325800376647833,
"grad_norm": 0.5796176798333441,
"learning_rate": 5e-06,
"loss": 0.6957,
"step": 460
},
{
"epoch": 1.7702448210922788,
"grad_norm": 0.6427876261770084,
"learning_rate": 5e-06,
"loss": 0.6982,
"step": 470
},
{
"epoch": 1.807909604519774,
"grad_norm": 0.7709175834054774,
"learning_rate": 5e-06,
"loss": 0.6927,
"step": 480
},
{
"epoch": 1.8455743879472695,
"grad_norm": 0.62498067885703,
"learning_rate": 5e-06,
"loss": 0.6966,
"step": 490
},
{
"epoch": 1.8832391713747647,
"grad_norm": 0.6802211302093598,
"learning_rate": 5e-06,
"loss": 0.7008,
"step": 500
},
{
"epoch": 1.92090395480226,
"grad_norm": 0.7015346819134285,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 510
},
{
"epoch": 1.9585687382297552,
"grad_norm": 0.6679611093712937,
"learning_rate": 5e-06,
"loss": 0.7,
"step": 520
},
{
"epoch": 1.9962335216572504,
"grad_norm": 0.6337629504219587,
"learning_rate": 5e-06,
"loss": 0.6959,
"step": 530
},
{
"epoch": 2.0,
"eval_loss": 0.7400202751159668,
"eval_runtime": 282.1309,
"eval_samples_per_second": 25.35,
"eval_steps_per_second": 0.397,
"step": 531
},
{
"epoch": 2.0338983050847457,
"grad_norm": 1.0318615681166752,
"learning_rate": 5e-06,
"loss": 0.6778,
"step": 540
},
{
"epoch": 2.071563088512241,
"grad_norm": 0.6334271963975697,
"learning_rate": 5e-06,
"loss": 0.6449,
"step": 550
},
{
"epoch": 2.109227871939736,
"grad_norm": 1.1074270168911127,
"learning_rate": 5e-06,
"loss": 0.6411,
"step": 560
},
{
"epoch": 2.146892655367232,
"grad_norm": 0.7758960956340408,
"learning_rate": 5e-06,
"loss": 0.642,
"step": 570
},
{
"epoch": 2.184557438794727,
"grad_norm": 0.6355849787561325,
"learning_rate": 5e-06,
"loss": 0.6437,
"step": 580
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.6846746599985979,
"learning_rate": 5e-06,
"loss": 0.6436,
"step": 590
},
{
"epoch": 2.2598870056497176,
"grad_norm": 0.6043112947114725,
"learning_rate": 5e-06,
"loss": 0.645,
"step": 600
},
{
"epoch": 2.297551789077213,
"grad_norm": 0.817414985795291,
"learning_rate": 5e-06,
"loss": 0.6457,
"step": 610
},
{
"epoch": 2.335216572504708,
"grad_norm": 0.698455857380687,
"learning_rate": 5e-06,
"loss": 0.6455,
"step": 620
},
{
"epoch": 2.3728813559322033,
"grad_norm": 0.6479771691734695,
"learning_rate": 5e-06,
"loss": 0.6462,
"step": 630
},
{
"epoch": 2.4105461393596985,
"grad_norm": 0.7093548332196533,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 640
},
{
"epoch": 2.4482109227871938,
"grad_norm": 0.8925613153879164,
"learning_rate": 5e-06,
"loss": 0.6466,
"step": 650
},
{
"epoch": 2.4858757062146895,
"grad_norm": 0.6757355564710857,
"learning_rate": 5e-06,
"loss": 0.6436,
"step": 660
},
{
"epoch": 2.5235404896421847,
"grad_norm": 0.6081496236845628,
"learning_rate": 5e-06,
"loss": 0.648,
"step": 670
},
{
"epoch": 2.56120527306968,
"grad_norm": 0.7447422506860626,
"learning_rate": 5e-06,
"loss": 0.6477,
"step": 680
},
{
"epoch": 2.598870056497175,
"grad_norm": 0.8278808479195525,
"learning_rate": 5e-06,
"loss": 0.6456,
"step": 690
},
{
"epoch": 2.6365348399246704,
"grad_norm": 0.8544558166675883,
"learning_rate": 5e-06,
"loss": 0.6459,
"step": 700
},
{
"epoch": 2.6741996233521657,
"grad_norm": 0.7429520936151375,
"learning_rate": 5e-06,
"loss": 0.6459,
"step": 710
},
{
"epoch": 2.711864406779661,
"grad_norm": 0.7127745829357879,
"learning_rate": 5e-06,
"loss": 0.6498,
"step": 720
},
{
"epoch": 2.7495291902071566,
"grad_norm": 0.6076001592702167,
"learning_rate": 5e-06,
"loss": 0.651,
"step": 730
},
{
"epoch": 2.7871939736346514,
"grad_norm": 0.5977493204127112,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 740
},
{
"epoch": 2.824858757062147,
"grad_norm": 0.6943542767826754,
"learning_rate": 5e-06,
"loss": 0.652,
"step": 750
},
{
"epoch": 2.8625235404896423,
"grad_norm": 0.7797150538117115,
"learning_rate": 5e-06,
"loss": 0.651,
"step": 760
},
{
"epoch": 2.9001883239171375,
"grad_norm": 0.5704333728088763,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 770
},
{
"epoch": 2.937853107344633,
"grad_norm": 0.5780251500512679,
"learning_rate": 5e-06,
"loss": 0.6448,
"step": 780
},
{
"epoch": 2.975517890772128,
"grad_norm": 0.6517902655313229,
"learning_rate": 5e-06,
"loss": 0.6504,
"step": 790
},
{
"epoch": 2.994350282485876,
"eval_loss": 0.741532027721405,
"eval_runtime": 280.9763,
"eval_samples_per_second": 25.454,
"eval_steps_per_second": 0.399,
"step": 795
},
{
"epoch": 2.994350282485876,
"step": 795,
"total_flos": 1331445230469120.0,
"train_loss": 0.7161390892364694,
"train_runtime": 46894.4924,
"train_samples_per_second": 8.693,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 795,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1331445230469120.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}