{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 819,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03663003663003663,
"grad_norm": 7.746065665628135,
"learning_rate": 5e-06,
"loss": 1.0405,
"step": 10
},
{
"epoch": 0.07326007326007326,
"grad_norm": 2.858450692340174,
"learning_rate": 5e-06,
"loss": 0.9101,
"step": 20
},
{
"epoch": 0.10989010989010989,
"grad_norm": 5.616678502031101,
"learning_rate": 5e-06,
"loss": 0.8722,
"step": 30
},
{
"epoch": 0.14652014652014653,
"grad_norm": 1.8086289320620221,
"learning_rate": 5e-06,
"loss": 0.8529,
"step": 40
},
{
"epoch": 0.18315018315018314,
"grad_norm": 1.1491569593758013,
"learning_rate": 5e-06,
"loss": 0.8259,
"step": 50
},
{
"epoch": 0.21978021978021978,
"grad_norm": 1.1648582184563463,
"learning_rate": 5e-06,
"loss": 0.8144,
"step": 60
},
{
"epoch": 0.2564102564102564,
"grad_norm": 1.0006985861658089,
"learning_rate": 5e-06,
"loss": 0.7986,
"step": 70
},
{
"epoch": 0.29304029304029305,
"grad_norm": 1.1821538573997477,
"learning_rate": 5e-06,
"loss": 0.7911,
"step": 80
},
{
"epoch": 0.32967032967032966,
"grad_norm": 1.5897294264378354,
"learning_rate": 5e-06,
"loss": 0.7807,
"step": 90
},
{
"epoch": 0.3663003663003663,
"grad_norm": 1.9362698717241806,
"learning_rate": 5e-06,
"loss": 0.7819,
"step": 100
},
{
"epoch": 0.40293040293040294,
"grad_norm": 1.0281035872096098,
"learning_rate": 5e-06,
"loss": 0.7791,
"step": 110
},
{
"epoch": 0.43956043956043955,
"grad_norm": 1.6454534962205227,
"learning_rate": 5e-06,
"loss": 0.7688,
"step": 120
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.9583089645191222,
"learning_rate": 5e-06,
"loss": 0.7669,
"step": 130
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.7745194128699995,
"learning_rate": 5e-06,
"loss": 0.764,
"step": 140
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.7276307011919524,
"learning_rate": 5e-06,
"loss": 0.7586,
"step": 150
},
{
"epoch": 0.5860805860805861,
"grad_norm": 0.8554951216643023,
"learning_rate": 5e-06,
"loss": 0.7572,
"step": 160
},
{
"epoch": 0.6227106227106227,
"grad_norm": 0.7121498821814782,
"learning_rate": 5e-06,
"loss": 0.7574,
"step": 170
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.6839163656685533,
"learning_rate": 5e-06,
"loss": 0.7568,
"step": 180
},
{
"epoch": 0.6959706959706959,
"grad_norm": 0.9093255308667819,
"learning_rate": 5e-06,
"loss": 0.7472,
"step": 190
},
{
"epoch": 0.7326007326007326,
"grad_norm": 0.6638464031533008,
"learning_rate": 5e-06,
"loss": 0.751,
"step": 200
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.5927695218746816,
"learning_rate": 5e-06,
"loss": 0.7477,
"step": 210
},
{
"epoch": 0.8058608058608059,
"grad_norm": 0.6423957250273598,
"learning_rate": 5e-06,
"loss": 0.7473,
"step": 220
},
{
"epoch": 0.8424908424908425,
"grad_norm": 0.6576759690318968,
"learning_rate": 5e-06,
"loss": 0.7422,
"step": 230
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.6476360077544144,
"learning_rate": 5e-06,
"loss": 0.7417,
"step": 240
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.6320860393471378,
"learning_rate": 5e-06,
"loss": 0.7409,
"step": 250
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.5912257284209596,
"learning_rate": 5e-06,
"loss": 0.7431,
"step": 260
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.6319178097643761,
"learning_rate": 5e-06,
"loss": 0.742,
"step": 270
},
{
"epoch": 1.0,
"eval_loss": 0.7415268421173096,
"eval_runtime": 26.4114,
"eval_samples_per_second": 278.062,
"eval_steps_per_second": 1.098,
"step": 273
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.7382077696725204,
"learning_rate": 5e-06,
"loss": 0.7069,
"step": 280
},
{
"epoch": 1.0622710622710623,
"grad_norm": 0.7190349411351752,
"learning_rate": 5e-06,
"loss": 0.6897,
"step": 290
},
{
"epoch": 1.098901098901099,
"grad_norm": 0.7067376226598984,
"learning_rate": 5e-06,
"loss": 0.683,
"step": 300
},
{
"epoch": 1.1355311355311355,
"grad_norm": 0.5710281059044071,
"learning_rate": 5e-06,
"loss": 0.6868,
"step": 310
},
{
"epoch": 1.1721611721611722,
"grad_norm": 0.6626952213065733,
"learning_rate": 5e-06,
"loss": 0.6831,
"step": 320
},
{
"epoch": 1.2087912087912087,
"grad_norm": 0.7005105428895475,
"learning_rate": 5e-06,
"loss": 0.6869,
"step": 330
},
{
"epoch": 1.2454212454212454,
"grad_norm": 0.6619897882086011,
"learning_rate": 5e-06,
"loss": 0.6878,
"step": 340
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.6508408490480377,
"learning_rate": 5e-06,
"loss": 0.6925,
"step": 350
},
{
"epoch": 1.3186813186813187,
"grad_norm": 0.5432662746572642,
"learning_rate": 5e-06,
"loss": 0.6876,
"step": 360
},
{
"epoch": 1.3553113553113554,
"grad_norm": 0.6107105594531901,
"learning_rate": 5e-06,
"loss": 0.6931,
"step": 370
},
{
"epoch": 1.3919413919413919,
"grad_norm": 0.739761075105529,
"learning_rate": 5e-06,
"loss": 0.6866,
"step": 380
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.8191906114187061,
"learning_rate": 5e-06,
"loss": 0.6906,
"step": 390
},
{
"epoch": 1.4652014652014653,
"grad_norm": 0.7122072456112296,
"learning_rate": 5e-06,
"loss": 0.6887,
"step": 400
},
{
"epoch": 1.5018315018315018,
"grad_norm": 0.636634621554325,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 410
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.6281126431822524,
"learning_rate": 5e-06,
"loss": 0.6843,
"step": 420
},
{
"epoch": 1.575091575091575,
"grad_norm": 0.6457324290442519,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 430
},
{
"epoch": 1.6117216117216118,
"grad_norm": 0.6262969556153274,
"learning_rate": 5e-06,
"loss": 0.6839,
"step": 440
},
{
"epoch": 1.6483516483516483,
"grad_norm": 0.5690984785192538,
"learning_rate": 5e-06,
"loss": 0.6928,
"step": 450
},
{
"epoch": 1.684981684981685,
"grad_norm": 0.6172111079980963,
"learning_rate": 5e-06,
"loss": 0.6834,
"step": 460
},
{
"epoch": 1.7216117216117217,
"grad_norm": 0.7070197339375575,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 470
},
{
"epoch": 1.7582417582417582,
"grad_norm": 0.6047694711574751,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 480
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.5464670796066172,
"learning_rate": 5e-06,
"loss": 0.6853,
"step": 490
},
{
"epoch": 1.8315018315018317,
"grad_norm": 0.7356761200809793,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 500
},
{
"epoch": 1.8681318681318682,
"grad_norm": 0.7493300019111138,
"learning_rate": 5e-06,
"loss": 0.6857,
"step": 510
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.5706346610150999,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 520
},
{
"epoch": 1.9413919413919414,
"grad_norm": 0.6273732081753551,
"learning_rate": 5e-06,
"loss": 0.6867,
"step": 530
},
{
"epoch": 1.978021978021978,
"grad_norm": 0.608424571172591,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 540
},
{
"epoch": 2.0,
"eval_loss": 0.7290822267532349,
"eval_runtime": 26.4229,
"eval_samples_per_second": 277.941,
"eval_steps_per_second": 1.098,
"step": 546
},
{
"epoch": 2.0146520146520146,
"grad_norm": 1.1435676214887527,
"learning_rate": 5e-06,
"loss": 0.6593,
"step": 550
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.8302719819904981,
"learning_rate": 5e-06,
"loss": 0.6276,
"step": 560
},
{
"epoch": 2.087912087912088,
"grad_norm": 0.8821536275021126,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 570
},
{
"epoch": 2.1245421245421245,
"grad_norm": 0.7567116572690076,
"learning_rate": 5e-06,
"loss": 0.6317,
"step": 580
},
{
"epoch": 2.161172161172161,
"grad_norm": 0.7854569193284114,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 590
},
{
"epoch": 2.197802197802198,
"grad_norm": 0.7432443286862768,
"learning_rate": 5e-06,
"loss": 0.6336,
"step": 600
},
{
"epoch": 2.2344322344322345,
"grad_norm": 0.6585567960235167,
"learning_rate": 5e-06,
"loss": 0.6324,
"step": 610
},
{
"epoch": 2.271062271062271,
"grad_norm": 0.655030138483751,
"learning_rate": 5e-06,
"loss": 0.6354,
"step": 620
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.7081546371367903,
"learning_rate": 5e-06,
"loss": 0.6355,
"step": 630
},
{
"epoch": 2.3443223443223444,
"grad_norm": 0.6817413886775435,
"learning_rate": 5e-06,
"loss": 0.6323,
"step": 640
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.7087190344600633,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 650
},
{
"epoch": 2.4175824175824174,
"grad_norm": 0.6282200808159406,
"learning_rate": 5e-06,
"loss": 0.6309,
"step": 660
},
{
"epoch": 2.4542124542124544,
"grad_norm": 0.7257484686385395,
"learning_rate": 5e-06,
"loss": 0.6335,
"step": 670
},
{
"epoch": 2.490842490842491,
"grad_norm": 0.7901264231040732,
"learning_rate": 5e-06,
"loss": 0.6383,
"step": 680
},
{
"epoch": 2.5274725274725274,
"grad_norm": 0.5844396831283855,
"learning_rate": 5e-06,
"loss": 0.6342,
"step": 690
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.7132030040768085,
"learning_rate": 5e-06,
"loss": 0.6354,
"step": 700
},
{
"epoch": 2.600732600732601,
"grad_norm": 0.7576742849969995,
"learning_rate": 5e-06,
"loss": 0.6332,
"step": 710
},
{
"epoch": 2.6373626373626373,
"grad_norm": 0.9618623866692995,
"learning_rate": 5e-06,
"loss": 0.6382,
"step": 720
},
{
"epoch": 2.6739926739926743,
"grad_norm": 0.7036643018293387,
"learning_rate": 5e-06,
"loss": 0.637,
"step": 730
},
{
"epoch": 2.7106227106227108,
"grad_norm": 0.594473573698193,
"learning_rate": 5e-06,
"loss": 0.6336,
"step": 740
},
{
"epoch": 2.7472527472527473,
"grad_norm": 0.6437871707069581,
"learning_rate": 5e-06,
"loss": 0.6371,
"step": 750
},
{
"epoch": 2.7838827838827838,
"grad_norm": 0.6660532045165727,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 760
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.592751889739025,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 770
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.5902791558514751,
"learning_rate": 5e-06,
"loss": 0.6336,
"step": 780
},
{
"epoch": 2.8937728937728937,
"grad_norm": 0.5484108433220659,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 790
},
{
"epoch": 2.9304029304029307,
"grad_norm": 0.6657396541061575,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 800
},
{
"epoch": 2.967032967032967,
"grad_norm": 0.68952795495605,
"learning_rate": 5e-06,
"loss": 0.6339,
"step": 810
},
{
"epoch": 3.0,
"eval_loss": 0.7330417037010193,
"eval_runtime": 25.8793,
"eval_samples_per_second": 283.778,
"eval_steps_per_second": 1.121,
"step": 819
},
{
"epoch": 3.0,
"step": 819,
"total_flos": 1371855504015360.0,
"train_loss": 0.7030115820403792,
"train_runtime": 5219.8536,
"train_samples_per_second": 80.188,
"train_steps_per_second": 0.157
}
],
"logging_steps": 10,
"max_steps": 819,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1371855504015360.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}