{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9961795606494745,
"eval_steps": 500,
"global_step": 783,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.038204393505253106,
"grad_norm": 17.106196027203886,
"learning_rate": 5e-06,
"loss": 1.0699,
"step": 10
},
{
"epoch": 0.07640878701050621,
"grad_norm": 3.7575313843264073,
"learning_rate": 5e-06,
"loss": 0.9412,
"step": 20
},
{
"epoch": 0.11461318051575932,
"grad_norm": 2.0783280143086302,
"learning_rate": 5e-06,
"loss": 0.8972,
"step": 30
},
{
"epoch": 0.15281757402101243,
"grad_norm": 1.419216051491444,
"learning_rate": 5e-06,
"loss": 0.8682,
"step": 40
},
{
"epoch": 0.19102196752626552,
"grad_norm": 1.5492013803153966,
"learning_rate": 5e-06,
"loss": 0.8507,
"step": 50
},
{
"epoch": 0.22922636103151864,
"grad_norm": 0.9979261775157234,
"learning_rate": 5e-06,
"loss": 0.8317,
"step": 60
},
{
"epoch": 0.26743075453677173,
"grad_norm": 1.1453499414283712,
"learning_rate": 5e-06,
"loss": 0.8257,
"step": 70
},
{
"epoch": 0.30563514804202485,
"grad_norm": 1.0657509981340374,
"learning_rate": 5e-06,
"loss": 0.8136,
"step": 80
},
{
"epoch": 0.3438395415472779,
"grad_norm": 0.8154629296380118,
"learning_rate": 5e-06,
"loss": 0.8012,
"step": 90
},
{
"epoch": 0.38204393505253104,
"grad_norm": 0.7783174666894658,
"learning_rate": 5e-06,
"loss": 0.7944,
"step": 100
},
{
"epoch": 0.42024832855778416,
"grad_norm": 0.7557295182511866,
"learning_rate": 5e-06,
"loss": 0.7976,
"step": 110
},
{
"epoch": 0.4584527220630373,
"grad_norm": 0.7419048503084669,
"learning_rate": 5e-06,
"loss": 0.7838,
"step": 120
},
{
"epoch": 0.49665711556829034,
"grad_norm": 0.6023078446753443,
"learning_rate": 5e-06,
"loss": 0.7861,
"step": 130
},
{
"epoch": 0.5348615090735435,
"grad_norm": 0.8029384495653265,
"learning_rate": 5e-06,
"loss": 0.7794,
"step": 140
},
{
"epoch": 0.5730659025787965,
"grad_norm": 0.8049936585803824,
"learning_rate": 5e-06,
"loss": 0.7824,
"step": 150
},
{
"epoch": 0.6112702960840497,
"grad_norm": 0.8903153692892993,
"learning_rate": 5e-06,
"loss": 0.7716,
"step": 160
},
{
"epoch": 0.6494746895893028,
"grad_norm": 0.7481918362618383,
"learning_rate": 5e-06,
"loss": 0.7781,
"step": 170
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.8571380886679489,
"learning_rate": 5e-06,
"loss": 0.7728,
"step": 180
},
{
"epoch": 0.725883476599809,
"grad_norm": 0.603577733589318,
"learning_rate": 5e-06,
"loss": 0.7693,
"step": 190
},
{
"epoch": 0.7640878701050621,
"grad_norm": 0.6986000212250895,
"learning_rate": 5e-06,
"loss": 0.7708,
"step": 200
},
{
"epoch": 0.8022922636103151,
"grad_norm": 0.6654581267839026,
"learning_rate": 5e-06,
"loss": 0.7686,
"step": 210
},
{
"epoch": 0.8404966571155683,
"grad_norm": 0.7161329148587753,
"learning_rate": 5e-06,
"loss": 0.7639,
"step": 220
},
{
"epoch": 0.8787010506208214,
"grad_norm": 0.7291450225202621,
"learning_rate": 5e-06,
"loss": 0.7653,
"step": 230
},
{
"epoch": 0.9169054441260746,
"grad_norm": 0.7588585120562266,
"learning_rate": 5e-06,
"loss": 0.7633,
"step": 240
},
{
"epoch": 0.9551098376313276,
"grad_norm": 0.8527681409489486,
"learning_rate": 5e-06,
"loss": 0.758,
"step": 250
},
{
"epoch": 0.9933142311365807,
"grad_norm": 0.5733961547084557,
"learning_rate": 5e-06,
"loss": 0.7556,
"step": 260
},
{
"epoch": 0.997134670487106,
"eval_loss": 0.7598350644111633,
"eval_runtime": 277.7141,
"eval_samples_per_second": 25.397,
"eval_steps_per_second": 0.4,
"step": 261
},
{
"epoch": 1.033906399235912,
"grad_norm": 0.8279780719142003,
"learning_rate": 5e-06,
"loss": 0.7812,
"step": 270
},
{
"epoch": 1.0721107927411653,
"grad_norm": 0.8247306859370325,
"learning_rate": 5e-06,
"loss": 0.7055,
"step": 280
},
{
"epoch": 1.1103151862464182,
"grad_norm": 0.6966889761138212,
"learning_rate": 5e-06,
"loss": 0.7083,
"step": 290
},
{
"epoch": 1.1485195797516714,
"grad_norm": 0.7158212384271236,
"learning_rate": 5e-06,
"loss": 0.7028,
"step": 300
},
{
"epoch": 1.1867239732569246,
"grad_norm": 0.6513073930120158,
"learning_rate": 5e-06,
"loss": 0.7105,
"step": 310
},
{
"epoch": 1.2249283667621778,
"grad_norm": 0.6782643522162839,
"learning_rate": 5e-06,
"loss": 0.7092,
"step": 320
},
{
"epoch": 1.2631327602674307,
"grad_norm": 0.7229522703779768,
"learning_rate": 5e-06,
"loss": 0.7027,
"step": 330
},
{
"epoch": 1.3013371537726839,
"grad_norm": 0.686689277377695,
"learning_rate": 5e-06,
"loss": 0.7092,
"step": 340
},
{
"epoch": 1.3395415472779368,
"grad_norm": 0.6686838666832164,
"learning_rate": 5e-06,
"loss": 0.7076,
"step": 350
},
{
"epoch": 1.37774594078319,
"grad_norm": 0.8906041633095456,
"learning_rate": 5e-06,
"loss": 0.703,
"step": 360
},
{
"epoch": 1.4159503342884432,
"grad_norm": 0.6554966723136705,
"learning_rate": 5e-06,
"loss": 0.7078,
"step": 370
},
{
"epoch": 1.4541547277936964,
"grad_norm": 0.6044694699607192,
"learning_rate": 5e-06,
"loss": 0.7064,
"step": 380
},
{
"epoch": 1.4923591212989493,
"grad_norm": 0.6795649168516978,
"learning_rate": 5e-06,
"loss": 0.7042,
"step": 390
},
{
"epoch": 1.5305635148042025,
"grad_norm": 0.6518471106071317,
"learning_rate": 5e-06,
"loss": 0.7055,
"step": 400
},
{
"epoch": 1.5687679083094554,
"grad_norm": 0.656923516770704,
"learning_rate": 5e-06,
"loss": 0.707,
"step": 410
},
{
"epoch": 1.6069723018147086,
"grad_norm": 0.6225351538335282,
"learning_rate": 5e-06,
"loss": 0.7045,
"step": 420
},
{
"epoch": 1.6451766953199618,
"grad_norm": 0.777993155808027,
"learning_rate": 5e-06,
"loss": 0.7056,
"step": 430
},
{
"epoch": 1.683381088825215,
"grad_norm": 0.6859522609411438,
"learning_rate": 5e-06,
"loss": 0.7047,
"step": 440
},
{
"epoch": 1.7215854823304682,
"grad_norm": 0.6651740116728566,
"learning_rate": 5e-06,
"loss": 0.7004,
"step": 450
},
{
"epoch": 1.759789875835721,
"grad_norm": 0.5969267067929888,
"learning_rate": 5e-06,
"loss": 0.7012,
"step": 460
},
{
"epoch": 1.797994269340974,
"grad_norm": 0.613011364797191,
"learning_rate": 5e-06,
"loss": 0.7073,
"step": 470
},
{
"epoch": 1.8361986628462272,
"grad_norm": 0.6207675725240294,
"learning_rate": 5e-06,
"loss": 0.7002,
"step": 480
},
{
"epoch": 1.8744030563514804,
"grad_norm": 0.5957548029122114,
"learning_rate": 5e-06,
"loss": 0.7063,
"step": 490
},
{
"epoch": 1.9126074498567336,
"grad_norm": 0.6972405923078006,
"learning_rate": 5e-06,
"loss": 0.7046,
"step": 500
},
{
"epoch": 1.9508118433619868,
"grad_norm": 0.6246007349854923,
"learning_rate": 5e-06,
"loss": 0.7074,
"step": 510
},
{
"epoch": 1.9890162368672397,
"grad_norm": 0.6291588284508969,
"learning_rate": 5e-06,
"loss": 0.6989,
"step": 520
},
{
"epoch": 1.9966571155682904,
"eval_loss": 0.7465963363647461,
"eval_runtime": 278.0536,
"eval_samples_per_second": 25.366,
"eval_steps_per_second": 0.399,
"step": 522
},
{
"epoch": 2.029608404966571,
"grad_norm": 0.9162417337081611,
"learning_rate": 5e-06,
"loss": 0.7192,
"step": 530
},
{
"epoch": 2.067812798471824,
"grad_norm": 0.6666792440916017,
"learning_rate": 5e-06,
"loss": 0.6447,
"step": 540
},
{
"epoch": 2.1060171919770774,
"grad_norm": 0.6320270044953581,
"learning_rate": 5e-06,
"loss": 0.6504,
"step": 550
},
{
"epoch": 2.1442215854823305,
"grad_norm": 0.8407791173957067,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 560
},
{
"epoch": 2.1824259789875837,
"grad_norm": 0.9821964292842589,
"learning_rate": 5e-06,
"loss": 0.6528,
"step": 570
},
{
"epoch": 2.2206303724928365,
"grad_norm": 0.9530943320810575,
"learning_rate": 5e-06,
"loss": 0.6536,
"step": 580
},
{
"epoch": 2.2588347659980896,
"grad_norm": 0.7065407366963496,
"learning_rate": 5e-06,
"loss": 0.6515,
"step": 590
},
{
"epoch": 2.297039159503343,
"grad_norm": 0.9143814042284045,
"learning_rate": 5e-06,
"loss": 0.6516,
"step": 600
},
{
"epoch": 2.335243553008596,
"grad_norm": 1.0356120889247198,
"learning_rate": 5e-06,
"loss": 0.6522,
"step": 610
},
{
"epoch": 2.373447946513849,
"grad_norm": 0.5955479754526213,
"learning_rate": 5e-06,
"loss": 0.6484,
"step": 620
},
{
"epoch": 2.4116523400191023,
"grad_norm": 0.668667957683453,
"learning_rate": 5e-06,
"loss": 0.6517,
"step": 630
},
{
"epoch": 2.4498567335243555,
"grad_norm": 0.5916712868955213,
"learning_rate": 5e-06,
"loss": 0.6492,
"step": 640
},
{
"epoch": 2.4880611270296082,
"grad_norm": 0.7078011059499841,
"learning_rate": 5e-06,
"loss": 0.6499,
"step": 650
},
{
"epoch": 2.5262655205348614,
"grad_norm": 0.7113167536030103,
"learning_rate": 5e-06,
"loss": 0.6491,
"step": 660
},
{
"epoch": 2.5644699140401146,
"grad_norm": 0.7286729281959395,
"learning_rate": 5e-06,
"loss": 0.6559,
"step": 670
},
{
"epoch": 2.6026743075453678,
"grad_norm": 0.8393175922718014,
"learning_rate": 5e-06,
"loss": 0.6537,
"step": 680
},
{
"epoch": 2.640878701050621,
"grad_norm": 0.9635082597402534,
"learning_rate": 5e-06,
"loss": 0.645,
"step": 690
},
{
"epoch": 2.6790830945558737,
"grad_norm": 0.6376260449080609,
"learning_rate": 5e-06,
"loss": 0.6516,
"step": 700
},
{
"epoch": 2.7172874880611273,
"grad_norm": 0.9042073773765085,
"learning_rate": 5e-06,
"loss": 0.6538,
"step": 710
},
{
"epoch": 2.75549188156638,
"grad_norm": 0.8795780646670239,
"learning_rate": 5e-06,
"loss": 0.6537,
"step": 720
},
{
"epoch": 2.793696275071633,
"grad_norm": 0.7101546769683508,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 730
},
{
"epoch": 2.8319006685768864,
"grad_norm": 0.6112049740364579,
"learning_rate": 5e-06,
"loss": 0.6536,
"step": 740
},
{
"epoch": 2.8701050620821396,
"grad_norm": 0.6240740305145582,
"learning_rate": 5e-06,
"loss": 0.6525,
"step": 750
},
{
"epoch": 2.9083094555873927,
"grad_norm": 0.6687610050145816,
"learning_rate": 5e-06,
"loss": 0.6569,
"step": 760
},
{
"epoch": 2.9465138490926455,
"grad_norm": 0.7981405552978358,
"learning_rate": 5e-06,
"loss": 0.655,
"step": 770
},
{
"epoch": 2.9847182425978986,
"grad_norm": 0.6901040178181519,
"learning_rate": 5e-06,
"loss": 0.6567,
"step": 780
},
{
"epoch": 2.9961795606494745,
"eval_loss": 0.7494649887084961,
"eval_runtime": 280.3949,
"eval_samples_per_second": 25.154,
"eval_steps_per_second": 0.396,
"step": 783
},
{
"epoch": 2.9961795606494745,
"step": 783,
"total_flos": 1311344783523840.0,
"train_loss": 0.7237713550090181,
"train_runtime": 46210.0208,
"train_samples_per_second": 8.699,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 783,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1311344783523840.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}