{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 819,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03663003663003663,
"grad_norm": 2.6755562756699156,
"learning_rate": 5e-06,
"loss": 1.035,
"step": 10
},
{
"epoch": 0.07326007326007326,
"grad_norm": 5.71293803806716,
"learning_rate": 5e-06,
"loss": 0.9104,
"step": 20
},
{
"epoch": 0.10989010989010989,
"grad_norm": 1.2085741029349113,
"learning_rate": 5e-06,
"loss": 0.8736,
"step": 30
},
{
"epoch": 0.14652014652014653,
"grad_norm": 1.059768802396023,
"learning_rate": 5e-06,
"loss": 0.8444,
"step": 40
},
{
"epoch": 0.18315018315018314,
"grad_norm": 1.2824869965245793,
"learning_rate": 5e-06,
"loss": 0.8236,
"step": 50
},
{
"epoch": 0.21978021978021978,
"grad_norm": 1.0163565118914315,
"learning_rate": 5e-06,
"loss": 0.8101,
"step": 60
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.8142514953905634,
"learning_rate": 5e-06,
"loss": 0.798,
"step": 70
},
{
"epoch": 0.29304029304029305,
"grad_norm": 0.78627587936006,
"learning_rate": 5e-06,
"loss": 0.7823,
"step": 80
},
{
"epoch": 0.32967032967032966,
"grad_norm": 1.219228058219057,
"learning_rate": 5e-06,
"loss": 0.7843,
"step": 90
},
{
"epoch": 0.3663003663003663,
"grad_norm": 0.856017546736893,
"learning_rate": 5e-06,
"loss": 0.7814,
"step": 100
},
{
"epoch": 0.40293040293040294,
"grad_norm": 0.8398588201896897,
"learning_rate": 5e-06,
"loss": 0.7689,
"step": 110
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.6452439779107471,
"learning_rate": 5e-06,
"loss": 0.7629,
"step": 120
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.7382887179450202,
"learning_rate": 5e-06,
"loss": 0.764,
"step": 130
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.8641898972221077,
"learning_rate": 5e-06,
"loss": 0.7579,
"step": 140
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.6968648193284981,
"learning_rate": 5e-06,
"loss": 0.7583,
"step": 150
},
{
"epoch": 0.5860805860805861,
"grad_norm": 0.7876021013623588,
"learning_rate": 5e-06,
"loss": 0.7603,
"step": 160
},
{
"epoch": 0.6227106227106227,
"grad_norm": 0.6628535279447354,
"learning_rate": 5e-06,
"loss": 0.7549,
"step": 170
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.5808515472437231,
"learning_rate": 5e-06,
"loss": 0.7532,
"step": 180
},
{
"epoch": 0.6959706959706959,
"grad_norm": 0.6897807179523675,
"learning_rate": 5e-06,
"loss": 0.7538,
"step": 190
},
{
"epoch": 0.7326007326007326,
"grad_norm": 0.6799003012866335,
"learning_rate": 5e-06,
"loss": 0.7468,
"step": 200
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.6218959361844152,
"learning_rate": 5e-06,
"loss": 0.7472,
"step": 210
},
{
"epoch": 0.8058608058608059,
"grad_norm": 0.7883865245529573,
"learning_rate": 5e-06,
"loss": 0.7439,
"step": 220
},
{
"epoch": 0.8424908424908425,
"grad_norm": 0.827056331164966,
"learning_rate": 5e-06,
"loss": 0.7387,
"step": 230
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.8475964615734182,
"learning_rate": 5e-06,
"loss": 0.7457,
"step": 240
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.6631462090502517,
"learning_rate": 5e-06,
"loss": 0.7432,
"step": 250
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.7263929092757935,
"learning_rate": 5e-06,
"loss": 0.7394,
"step": 260
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.6555923863615204,
"learning_rate": 5e-06,
"loss": 0.7388,
"step": 270
},
{
"epoch": 1.0,
"eval_loss": 0.7403956651687622,
"eval_runtime": 27.4968,
"eval_samples_per_second": 266.758,
"eval_steps_per_second": 1.055,
"step": 273
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.7513303784704211,
"learning_rate": 5e-06,
"loss": 0.7059,
"step": 280
},
{
"epoch": 1.0622710622710623,
"grad_norm": 0.9557884526645677,
"learning_rate": 5e-06,
"loss": 0.6889,
"step": 290
},
{
"epoch": 1.098901098901099,
"grad_norm": 0.632082904582309,
"learning_rate": 5e-06,
"loss": 0.6931,
"step": 300
},
{
"epoch": 1.1355311355311355,
"grad_norm": 0.7054504488885992,
"learning_rate": 5e-06,
"loss": 0.692,
"step": 310
},
{
"epoch": 1.1721611721611722,
"grad_norm": 0.9218108981577618,
"learning_rate": 5e-06,
"loss": 0.6906,
"step": 320
},
{
"epoch": 1.2087912087912087,
"grad_norm": 0.6597799156184799,
"learning_rate": 5e-06,
"loss": 0.6875,
"step": 330
},
{
"epoch": 1.2454212454212454,
"grad_norm": 0.5920093744179588,
"learning_rate": 5e-06,
"loss": 0.6854,
"step": 340
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.6066160644410351,
"learning_rate": 5e-06,
"loss": 0.6848,
"step": 350
},
{
"epoch": 1.3186813186813187,
"grad_norm": 0.7191350945649401,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 360
},
{
"epoch": 1.3553113553113554,
"grad_norm": 0.6853021132203265,
"learning_rate": 5e-06,
"loss": 0.6858,
"step": 370
},
{
"epoch": 1.3919413919413919,
"grad_norm": 0.6295828110796835,
"learning_rate": 5e-06,
"loss": 0.6906,
"step": 380
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.6225605880403292,
"learning_rate": 5e-06,
"loss": 0.6872,
"step": 390
},
{
"epoch": 1.4652014652014653,
"grad_norm": 0.5721545540006036,
"learning_rate": 5e-06,
"loss": 0.6844,
"step": 400
},
{
"epoch": 1.5018315018315018,
"grad_norm": 0.6495541464106162,
"learning_rate": 5e-06,
"loss": 0.687,
"step": 410
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.6180233995820079,
"learning_rate": 5e-06,
"loss": 0.6836,
"step": 420
},
{
"epoch": 1.575091575091575,
"grad_norm": 0.5924680089205238,
"learning_rate": 5e-06,
"loss": 0.6881,
"step": 430
},
{
"epoch": 1.6117216117216118,
"grad_norm": 0.7715824466204046,
"learning_rate": 5e-06,
"loss": 0.6925,
"step": 440
},
{
"epoch": 1.6483516483516483,
"grad_norm": 0.5635809156525354,
"learning_rate": 5e-06,
"loss": 0.6808,
"step": 450
},
{
"epoch": 1.684981684981685,
"grad_norm": 0.7211075324292373,
"learning_rate": 5e-06,
"loss": 0.6841,
"step": 460
},
{
"epoch": 1.7216117216117217,
"grad_norm": 0.7574566804669711,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 470
},
{
"epoch": 1.7582417582417582,
"grad_norm": 0.714666214669077,
"learning_rate": 5e-06,
"loss": 0.6837,
"step": 480
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.6715552536826968,
"learning_rate": 5e-06,
"loss": 0.6883,
"step": 490
},
{
"epoch": 1.8315018315018317,
"grad_norm": 0.8501456353850813,
"learning_rate": 5e-06,
"loss": 0.6824,
"step": 500
},
{
"epoch": 1.8681318681318682,
"grad_norm": 0.8210564757608708,
"learning_rate": 5e-06,
"loss": 0.6826,
"step": 510
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.6233607070658583,
"learning_rate": 5e-06,
"loss": 0.6823,
"step": 520
},
{
"epoch": 1.9413919413919414,
"grad_norm": 0.6807825420722514,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 530
},
{
"epoch": 1.978021978021978,
"grad_norm": 0.6629344076568074,
"learning_rate": 5e-06,
"loss": 0.6824,
"step": 540
},
{
"epoch": 2.0,
"eval_loss": 0.730824887752533,
"eval_runtime": 27.3193,
"eval_samples_per_second": 268.491,
"eval_steps_per_second": 1.062,
"step": 546
},
{
"epoch": 2.0146520146520146,
"grad_norm": 0.9129368210722825,
"learning_rate": 5e-06,
"loss": 0.6558,
"step": 550
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.8503780606175464,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 560
},
{
"epoch": 2.087912087912088,
"grad_norm": 0.7964894282266023,
"learning_rate": 5e-06,
"loss": 0.6286,
"step": 570
},
{
"epoch": 2.1245421245421245,
"grad_norm": 0.7422264580205383,
"learning_rate": 5e-06,
"loss": 0.6314,
"step": 580
},
{
"epoch": 2.161172161172161,
"grad_norm": 0.8546952236590375,
"learning_rate": 5e-06,
"loss": 0.6283,
"step": 590
},
{
"epoch": 2.197802197802198,
"grad_norm": 0.5776994000174623,
"learning_rate": 5e-06,
"loss": 0.6314,
"step": 600
},
{
"epoch": 2.2344322344322345,
"grad_norm": 1.0037526840488868,
"learning_rate": 5e-06,
"loss": 0.6335,
"step": 610
},
{
"epoch": 2.271062271062271,
"grad_norm": 0.7062352774981094,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 620
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.8407157088448836,
"learning_rate": 5e-06,
"loss": 0.633,
"step": 630
},
{
"epoch": 2.3443223443223444,
"grad_norm": 0.5822705575557771,
"learning_rate": 5e-06,
"loss": 0.6349,
"step": 640
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.7810470709804241,
"learning_rate": 5e-06,
"loss": 0.6358,
"step": 650
},
{
"epoch": 2.4175824175824174,
"grad_norm": 0.6792571477446749,
"learning_rate": 5e-06,
"loss": 0.634,
"step": 660
},
{
"epoch": 2.4542124542124544,
"grad_norm": 0.6582561005269292,
"learning_rate": 5e-06,
"loss": 0.6332,
"step": 670
},
{
"epoch": 2.490842490842491,
"grad_norm": 0.7333034032018606,
"learning_rate": 5e-06,
"loss": 0.636,
"step": 680
},
{
"epoch": 2.5274725274725274,
"grad_norm": 0.6514490686612251,
"learning_rate": 5e-06,
"loss": 0.6342,
"step": 690
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.7738245201171186,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 700
},
{
"epoch": 2.600732600732601,
"grad_norm": 0.6223909134135305,
"learning_rate": 5e-06,
"loss": 0.6373,
"step": 710
},
{
"epoch": 2.6373626373626373,
"grad_norm": 0.6265203120796818,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 720
},
{
"epoch": 2.6739926739926743,
"grad_norm": 0.7274455241516222,
"learning_rate": 5e-06,
"loss": 0.6382,
"step": 730
},
{
"epoch": 2.7106227106227108,
"grad_norm": 0.7062825571332273,
"learning_rate": 5e-06,
"loss": 0.638,
"step": 740
},
{
"epoch": 2.7472527472527473,
"grad_norm": 0.7066746525423848,
"learning_rate": 5e-06,
"loss": 0.6349,
"step": 750
},
{
"epoch": 2.7838827838827838,
"grad_norm": 0.6012743699495499,
"learning_rate": 5e-06,
"loss": 0.6379,
"step": 760
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.6253902136875567,
"learning_rate": 5e-06,
"loss": 0.6337,
"step": 770
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.799587316373445,
"learning_rate": 5e-06,
"loss": 0.641,
"step": 780
},
{
"epoch": 2.8937728937728937,
"grad_norm": 0.6530718377415161,
"learning_rate": 5e-06,
"loss": 0.6388,
"step": 790
},
{
"epoch": 2.9304029304029307,
"grad_norm": 0.6532704811463808,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 800
},
{
"epoch": 2.967032967032967,
"grad_norm": 0.6925982583545915,
"learning_rate": 5e-06,
"loss": 0.6421,
"step": 810
},
{
"epoch": 3.0,
"eval_loss": 0.7364696264266968,
"eval_runtime": 26.2679,
"eval_samples_per_second": 279.238,
"eval_steps_per_second": 1.104,
"step": 819
},
{
"epoch": 3.0,
"step": 819,
"total_flos": 1371855504015360.0,
"train_loss": 0.7025235231979426,
"train_runtime": 5539.9882,
"train_samples_per_second": 75.468,
"train_steps_per_second": 0.148
}
],
"logging_steps": 10,
"max_steps": 819,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1371855504015360.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}