{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9956063268892796,
"eval_steps": 500,
"global_step": 852,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0351493848857645,
"grad_norm": 6.865658287926604,
"learning_rate": 5e-06,
"loss": 1.0224,
"step": 10
},
{
"epoch": 0.070298769771529,
"grad_norm": 1.300129454515723,
"learning_rate": 5e-06,
"loss": 0.8893,
"step": 20
},
{
"epoch": 0.1054481546572935,
"grad_norm": 1.662141333626877,
"learning_rate": 5e-06,
"loss": 0.8461,
"step": 30
},
{
"epoch": 0.140597539543058,
"grad_norm": 1.1867317674423972,
"learning_rate": 5e-06,
"loss": 0.8221,
"step": 40
},
{
"epoch": 0.1757469244288225,
"grad_norm": 1.3406663541941202,
"learning_rate": 5e-06,
"loss": 0.8082,
"step": 50
},
{
"epoch": 0.210896309314587,
"grad_norm": 0.9748883756928828,
"learning_rate": 5e-06,
"loss": 0.7926,
"step": 60
},
{
"epoch": 0.2460456942003515,
"grad_norm": 0.9049088495136057,
"learning_rate": 5e-06,
"loss": 0.7786,
"step": 70
},
{
"epoch": 0.281195079086116,
"grad_norm": 0.7756246074861972,
"learning_rate": 5e-06,
"loss": 0.7672,
"step": 80
},
{
"epoch": 0.3163444639718805,
"grad_norm": 0.6803113923201367,
"learning_rate": 5e-06,
"loss": 0.7666,
"step": 90
},
{
"epoch": 0.351493848857645,
"grad_norm": 0.7087390396178761,
"learning_rate": 5e-06,
"loss": 0.7626,
"step": 100
},
{
"epoch": 0.3866432337434095,
"grad_norm": 0.6904558645177432,
"learning_rate": 5e-06,
"loss": 0.7558,
"step": 110
},
{
"epoch": 0.421792618629174,
"grad_norm": 0.5406381121382178,
"learning_rate": 5e-06,
"loss": 0.751,
"step": 120
},
{
"epoch": 0.45694200351493847,
"grad_norm": 0.7263515515258443,
"learning_rate": 5e-06,
"loss": 0.7541,
"step": 130
},
{
"epoch": 0.492091388400703,
"grad_norm": 0.6381225954297634,
"learning_rate": 5e-06,
"loss": 0.7507,
"step": 140
},
{
"epoch": 0.5272407732864675,
"grad_norm": 0.9282926398227679,
"learning_rate": 5e-06,
"loss": 0.7425,
"step": 150
},
{
"epoch": 0.562390158172232,
"grad_norm": 0.703837658050583,
"learning_rate": 5e-06,
"loss": 0.7473,
"step": 160
},
{
"epoch": 0.5975395430579965,
"grad_norm": 0.7762432087380096,
"learning_rate": 5e-06,
"loss": 0.7408,
"step": 170
},
{
"epoch": 0.632688927943761,
"grad_norm": 0.5947662547404722,
"learning_rate": 5e-06,
"loss": 0.7347,
"step": 180
},
{
"epoch": 0.6678383128295254,
"grad_norm": 0.628944705791063,
"learning_rate": 5e-06,
"loss": 0.7406,
"step": 190
},
{
"epoch": 0.70298769771529,
"grad_norm": 0.5977406811055224,
"learning_rate": 5e-06,
"loss": 0.7347,
"step": 200
},
{
"epoch": 0.7381370826010545,
"grad_norm": 0.582107769314153,
"learning_rate": 5e-06,
"loss": 0.737,
"step": 210
},
{
"epoch": 0.773286467486819,
"grad_norm": 0.6326552735959291,
"learning_rate": 5e-06,
"loss": 0.7328,
"step": 220
},
{
"epoch": 0.8084358523725835,
"grad_norm": 0.6637547116847639,
"learning_rate": 5e-06,
"loss": 0.7311,
"step": 230
},
{
"epoch": 0.843585237258348,
"grad_norm": 0.6997143410926964,
"learning_rate": 5e-06,
"loss": 0.7341,
"step": 240
},
{
"epoch": 0.8787346221441125,
"grad_norm": 0.6162729226466245,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 250
},
{
"epoch": 0.9138840070298769,
"grad_norm": 0.6199166403621413,
"learning_rate": 5e-06,
"loss": 0.7262,
"step": 260
},
{
"epoch": 0.9490333919156415,
"grad_norm": 0.6034966296550427,
"learning_rate": 5e-06,
"loss": 0.729,
"step": 270
},
{
"epoch": 0.984182776801406,
"grad_norm": 0.6195682554180708,
"learning_rate": 5e-06,
"loss": 0.7264,
"step": 280
},
{
"epoch": 0.9982425307557118,
"eval_loss": 0.721891462802887,
"eval_runtime": 302.9942,
"eval_samples_per_second": 25.304,
"eval_steps_per_second": 0.396,
"step": 284
},
{
"epoch": 1.0197715289982425,
"grad_norm": 0.6617437136575776,
"learning_rate": 5e-06,
"loss": 0.7503,
"step": 290
},
{
"epoch": 1.054920913884007,
"grad_norm": 0.596469668606961,
"learning_rate": 5e-06,
"loss": 0.6752,
"step": 300
},
{
"epoch": 1.0900702987697715,
"grad_norm": 0.7286257897811691,
"learning_rate": 5e-06,
"loss": 0.6745,
"step": 310
},
{
"epoch": 1.1252196836555362,
"grad_norm": 0.6703518701287363,
"learning_rate": 5e-06,
"loss": 0.6772,
"step": 320
},
{
"epoch": 1.1603690685413006,
"grad_norm": 0.6678193952959378,
"learning_rate": 5e-06,
"loss": 0.6757,
"step": 330
},
{
"epoch": 1.195518453427065,
"grad_norm": 0.6272061731880971,
"learning_rate": 5e-06,
"loss": 0.6772,
"step": 340
},
{
"epoch": 1.2306678383128296,
"grad_norm": 0.5618776589312474,
"learning_rate": 5e-06,
"loss": 0.6767,
"step": 350
},
{
"epoch": 1.265817223198594,
"grad_norm": 0.656461597570214,
"learning_rate": 5e-06,
"loss": 0.6742,
"step": 360
},
{
"epoch": 1.3009666080843585,
"grad_norm": 0.95088298783439,
"learning_rate": 5e-06,
"loss": 0.6803,
"step": 370
},
{
"epoch": 1.336115992970123,
"grad_norm": 0.6256062888068228,
"learning_rate": 5e-06,
"loss": 0.6789,
"step": 380
},
{
"epoch": 1.3712653778558876,
"grad_norm": 0.5178529345876333,
"learning_rate": 5e-06,
"loss": 0.6757,
"step": 390
},
{
"epoch": 1.406414762741652,
"grad_norm": 0.6633111117626306,
"learning_rate": 5e-06,
"loss": 0.6786,
"step": 400
},
{
"epoch": 1.4415641476274166,
"grad_norm": 0.5753214727933854,
"learning_rate": 5e-06,
"loss": 0.6686,
"step": 410
},
{
"epoch": 1.476713532513181,
"grad_norm": 0.7023169996268164,
"learning_rate": 5e-06,
"loss": 0.674,
"step": 420
},
{
"epoch": 1.5118629173989455,
"grad_norm": 0.601050736097527,
"learning_rate": 5e-06,
"loss": 0.676,
"step": 430
},
{
"epoch": 1.54701230228471,
"grad_norm": 0.6375081303020413,
"learning_rate": 5e-06,
"loss": 0.6779,
"step": 440
},
{
"epoch": 1.5821616871704745,
"grad_norm": 0.6076189491485879,
"learning_rate": 5e-06,
"loss": 0.6811,
"step": 450
},
{
"epoch": 1.6173110720562391,
"grad_norm": 0.6123755131309624,
"learning_rate": 5e-06,
"loss": 0.6749,
"step": 460
},
{
"epoch": 1.6524604569420034,
"grad_norm": 0.5847478738087437,
"learning_rate": 5e-06,
"loss": 0.6747,
"step": 470
},
{
"epoch": 1.687609841827768,
"grad_norm": 0.6041574755100807,
"learning_rate": 5e-06,
"loss": 0.6681,
"step": 480
},
{
"epoch": 1.7227592267135325,
"grad_norm": 0.6972984159432736,
"learning_rate": 5e-06,
"loss": 0.6686,
"step": 490
},
{
"epoch": 1.757908611599297,
"grad_norm": 0.592332789109461,
"learning_rate": 5e-06,
"loss": 0.6763,
"step": 500
},
{
"epoch": 1.7930579964850615,
"grad_norm": 0.7081266254056617,
"learning_rate": 5e-06,
"loss": 0.6707,
"step": 510
},
{
"epoch": 1.828207381370826,
"grad_norm": 0.5655551892586738,
"learning_rate": 5e-06,
"loss": 0.675,
"step": 520
},
{
"epoch": 1.8633567662565906,
"grad_norm": 0.5912936045849521,
"learning_rate": 5e-06,
"loss": 0.6746,
"step": 530
},
{
"epoch": 1.8985061511423549,
"grad_norm": 0.6418543236430647,
"learning_rate": 5e-06,
"loss": 0.6743,
"step": 540
},
{
"epoch": 1.9336555360281196,
"grad_norm": 0.8406203952305934,
"learning_rate": 5e-06,
"loss": 0.6777,
"step": 550
},
{
"epoch": 1.968804920913884,
"grad_norm": 0.6740623987469322,
"learning_rate": 5e-06,
"loss": 0.6783,
"step": 560
},
{
"epoch": 1.9969244288224957,
"eval_loss": 0.7088373899459839,
"eval_runtime": 302.5633,
"eval_samples_per_second": 25.34,
"eval_steps_per_second": 0.397,
"step": 568
},
{
"epoch": 2.0043936731107204,
"grad_norm": 0.8982450142296012,
"learning_rate": 5e-06,
"loss": 0.7079,
"step": 570
},
{
"epoch": 2.039543057996485,
"grad_norm": 0.9691008221084222,
"learning_rate": 5e-06,
"loss": 0.6203,
"step": 580
},
{
"epoch": 2.0746924428822497,
"grad_norm": 0.6447824341633516,
"learning_rate": 5e-06,
"loss": 0.6244,
"step": 590
},
{
"epoch": 2.109841827768014,
"grad_norm": 0.7064323342581214,
"learning_rate": 5e-06,
"loss": 0.6189,
"step": 600
},
{
"epoch": 2.1449912126537787,
"grad_norm": 0.5819596596280016,
"learning_rate": 5e-06,
"loss": 0.6207,
"step": 610
},
{
"epoch": 2.180140597539543,
"grad_norm": 0.7981926624790863,
"learning_rate": 5e-06,
"loss": 0.6203,
"step": 620
},
{
"epoch": 2.2152899824253076,
"grad_norm": 0.6853162161955834,
"learning_rate": 5e-06,
"loss": 0.6281,
"step": 630
},
{
"epoch": 2.2504393673110723,
"grad_norm": 0.6819271490957453,
"learning_rate": 5e-06,
"loss": 0.6246,
"step": 640
},
{
"epoch": 2.2855887521968365,
"grad_norm": 0.678545369804577,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 650
},
{
"epoch": 2.3207381370826012,
"grad_norm": 0.6597702524075268,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 660
},
{
"epoch": 2.3558875219683655,
"grad_norm": 0.5352899370053985,
"learning_rate": 5e-06,
"loss": 0.627,
"step": 670
},
{
"epoch": 2.39103690685413,
"grad_norm": 0.575976735916134,
"learning_rate": 5e-06,
"loss": 0.6252,
"step": 680
},
{
"epoch": 2.4261862917398944,
"grad_norm": 0.6538224434833726,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 690
},
{
"epoch": 2.461335676625659,
"grad_norm": 0.8281376962806699,
"learning_rate": 5e-06,
"loss": 0.6238,
"step": 700
},
{
"epoch": 2.4964850615114234,
"grad_norm": 0.5971561231648772,
"learning_rate": 5e-06,
"loss": 0.6244,
"step": 710
},
{
"epoch": 2.531634446397188,
"grad_norm": 0.5668390272889466,
"learning_rate": 5e-06,
"loss": 0.6254,
"step": 720
},
{
"epoch": 2.5667838312829527,
"grad_norm": 0.7378544776528181,
"learning_rate": 5e-06,
"loss": 0.6248,
"step": 730
},
{
"epoch": 2.601933216168717,
"grad_norm": 0.6067368368819991,
"learning_rate": 5e-06,
"loss": 0.6256,
"step": 740
},
{
"epoch": 2.6370826010544817,
"grad_norm": 0.6816545127839443,
"learning_rate": 5e-06,
"loss": 0.6286,
"step": 750
},
{
"epoch": 2.672231985940246,
"grad_norm": 0.787032141068753,
"learning_rate": 5e-06,
"loss": 0.628,
"step": 760
},
{
"epoch": 2.7073813708260106,
"grad_norm": 0.6393338928189319,
"learning_rate": 5e-06,
"loss": 0.6267,
"step": 770
},
{
"epoch": 2.7425307557117753,
"grad_norm": 0.5562264277034894,
"learning_rate": 5e-06,
"loss": 0.6261,
"step": 780
},
{
"epoch": 2.7776801405975395,
"grad_norm": 0.5896436524802737,
"learning_rate": 5e-06,
"loss": 0.6256,
"step": 790
},
{
"epoch": 2.812829525483304,
"grad_norm": 0.5828475505687344,
"learning_rate": 5e-06,
"loss": 0.6247,
"step": 800
},
{
"epoch": 2.8479789103690685,
"grad_norm": 0.634394806473084,
"learning_rate": 5e-06,
"loss": 0.6269,
"step": 810
},
{
"epoch": 2.883128295254833,
"grad_norm": 0.6117384621451529,
"learning_rate": 5e-06,
"loss": 0.6279,
"step": 820
},
{
"epoch": 2.9182776801405974,
"grad_norm": 0.5540272640106404,
"learning_rate": 5e-06,
"loss": 0.6212,
"step": 830
},
{
"epoch": 2.953427065026362,
"grad_norm": 0.5600169828318418,
"learning_rate": 5e-06,
"loss": 0.6282,
"step": 840
},
{
"epoch": 2.9885764499121263,
"grad_norm": 0.7592332443324643,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 850
},
{
"epoch": 2.9956063268892796,
"eval_loss": 0.7103263735771179,
"eval_runtime": 302.7761,
"eval_samples_per_second": 25.322,
"eval_steps_per_second": 0.396,
"step": 852
},
{
"epoch": 2.9956063268892796,
"step": 852,
"total_flos": 1426922353459200.0,
"train_loss": 0.6913109551852857,
"train_runtime": 50624.2334,
"train_samples_per_second": 8.632,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 852,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1426922353459200.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}