{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.994548758328286,
"eval_steps": 500,
"global_step": 1236,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024227740763173834,
"grad_norm": 3.9645989545939098,
"learning_rate": 5e-06,
"loss": 0.9024,
"step": 10
},
{
"epoch": 0.04845548152634767,
"grad_norm": 2.6091656491728203,
"learning_rate": 5e-06,
"loss": 0.7644,
"step": 20
},
{
"epoch": 0.0726832222895215,
"grad_norm": 1.6703261434511345,
"learning_rate": 5e-06,
"loss": 0.7238,
"step": 30
},
{
"epoch": 0.09691096305269534,
"grad_norm": 1.461094134640432,
"learning_rate": 5e-06,
"loss": 0.6977,
"step": 40
},
{
"epoch": 0.12113870381586916,
"grad_norm": 0.8067370661660782,
"learning_rate": 5e-06,
"loss": 0.6812,
"step": 50
},
{
"epoch": 0.145366444579043,
"grad_norm": 1.4104496160894404,
"learning_rate": 5e-06,
"loss": 0.6629,
"step": 60
},
{
"epoch": 0.16959418534221685,
"grad_norm": 1.2867495210279274,
"learning_rate": 5e-06,
"loss": 0.6521,
"step": 70
},
{
"epoch": 0.19382192610539067,
"grad_norm": 0.9866240288945327,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 80
},
{
"epoch": 0.2180496668685645,
"grad_norm": 0.7803813977069882,
"learning_rate": 5e-06,
"loss": 0.6285,
"step": 90
},
{
"epoch": 0.24227740763173833,
"grad_norm": 0.5198909215148648,
"learning_rate": 5e-06,
"loss": 0.6244,
"step": 100
},
{
"epoch": 0.2665051483949122,
"grad_norm": 0.5514332756156626,
"learning_rate": 5e-06,
"loss": 0.6184,
"step": 110
},
{
"epoch": 0.290732889158086,
"grad_norm": 1.2655246772691944,
"learning_rate": 5e-06,
"loss": 0.6167,
"step": 120
},
{
"epoch": 0.31496062992125984,
"grad_norm": 0.5276778250168523,
"learning_rate": 5e-06,
"loss": 0.6077,
"step": 130
},
{
"epoch": 0.3391883706844337,
"grad_norm": 0.5180368157026216,
"learning_rate": 5e-06,
"loss": 0.6025,
"step": 140
},
{
"epoch": 0.3634161114476075,
"grad_norm": 0.5457057952002654,
"learning_rate": 5e-06,
"loss": 0.6051,
"step": 150
},
{
"epoch": 0.38764385221078135,
"grad_norm": 0.6644602782199635,
"learning_rate": 5e-06,
"loss": 0.6054,
"step": 160
},
{
"epoch": 0.4118715929739552,
"grad_norm": 0.5187189241493725,
"learning_rate": 5e-06,
"loss": 0.6004,
"step": 170
},
{
"epoch": 0.436099333737129,
"grad_norm": 0.5056757936675704,
"learning_rate": 5e-06,
"loss": 0.5938,
"step": 180
},
{
"epoch": 0.46032707450030286,
"grad_norm": 0.5710728558545756,
"learning_rate": 5e-06,
"loss": 0.598,
"step": 190
},
{
"epoch": 0.48455481526347666,
"grad_norm": 0.5927928756195961,
"learning_rate": 5e-06,
"loss": 0.5983,
"step": 200
},
{
"epoch": 0.5087825560266506,
"grad_norm": 0.48351208074872803,
"learning_rate": 5e-06,
"loss": 0.59,
"step": 210
},
{
"epoch": 0.5330102967898244,
"grad_norm": 0.5309314168696151,
"learning_rate": 5e-06,
"loss": 0.5882,
"step": 220
},
{
"epoch": 0.5572380375529982,
"grad_norm": 0.5313229865997172,
"learning_rate": 5e-06,
"loss": 0.5926,
"step": 230
},
{
"epoch": 0.581465778316172,
"grad_norm": 0.5147829777258456,
"learning_rate": 5e-06,
"loss": 0.5851,
"step": 240
},
{
"epoch": 0.6056935190793459,
"grad_norm": 0.8359724219646064,
"learning_rate": 5e-06,
"loss": 0.5863,
"step": 250
},
{
"epoch": 0.6299212598425197,
"grad_norm": 0.4652543495079898,
"learning_rate": 5e-06,
"loss": 0.5813,
"step": 260
},
{
"epoch": 0.6541490006056935,
"grad_norm": 0.9372041174809007,
"learning_rate": 5e-06,
"loss": 0.5794,
"step": 270
},
{
"epoch": 0.6783767413688674,
"grad_norm": 0.5998442061578241,
"learning_rate": 5e-06,
"loss": 0.5828,
"step": 280
},
{
"epoch": 0.7026044821320412,
"grad_norm": 0.6092016010381666,
"learning_rate": 5e-06,
"loss": 0.5777,
"step": 290
},
{
"epoch": 0.726832222895215,
"grad_norm": 0.5895950017891558,
"learning_rate": 5e-06,
"loss": 0.5753,
"step": 300
},
{
"epoch": 0.7510599636583889,
"grad_norm": 0.45074902625510205,
"learning_rate": 5e-06,
"loss": 0.569,
"step": 310
},
{
"epoch": 0.7752877044215627,
"grad_norm": 0.6565760889840895,
"learning_rate": 5e-06,
"loss": 0.578,
"step": 320
},
{
"epoch": 0.7995154451847365,
"grad_norm": 0.6113007537505482,
"learning_rate": 5e-06,
"loss": 0.5792,
"step": 330
},
{
"epoch": 0.8237431859479104,
"grad_norm": 0.49731635826965837,
"learning_rate": 5e-06,
"loss": 0.5811,
"step": 340
},
{
"epoch": 0.8479709267110842,
"grad_norm": 0.5433055892052826,
"learning_rate": 5e-06,
"loss": 0.5727,
"step": 350
},
{
"epoch": 0.872198667474258,
"grad_norm": 0.5791498183091245,
"learning_rate": 5e-06,
"loss": 0.5722,
"step": 360
},
{
"epoch": 0.8964264082374318,
"grad_norm": 0.6563753601996463,
"learning_rate": 5e-06,
"loss": 0.5716,
"step": 370
},
{
"epoch": 0.9206541490006057,
"grad_norm": 0.5662298833739237,
"learning_rate": 5e-06,
"loss": 0.5673,
"step": 380
},
{
"epoch": 0.9448818897637795,
"grad_norm": 0.5441406943018641,
"learning_rate": 5e-06,
"loss": 0.5708,
"step": 390
},
{
"epoch": 0.9691096305269533,
"grad_norm": 0.6030868477920374,
"learning_rate": 5e-06,
"loss": 0.5707,
"step": 400
},
{
"epoch": 0.9933373712901272,
"grad_norm": 0.6009876519163959,
"learning_rate": 5e-06,
"loss": 0.5624,
"step": 410
},
{
"epoch": 0.9981829194427619,
"eval_loss": 0.5625064373016357,
"eval_runtime": 220.8608,
"eval_samples_per_second": 50.353,
"eval_steps_per_second": 0.394,
"step": 412
},
{
"epoch": 1.0175651120533011,
"grad_norm": 0.7010348009420115,
"learning_rate": 5e-06,
"loss": 0.542,
"step": 420
},
{
"epoch": 1.0417928528164748,
"grad_norm": 0.48384852790918653,
"learning_rate": 5e-06,
"loss": 0.5311,
"step": 430
},
{
"epoch": 1.0660205935796487,
"grad_norm": 0.7302174149962929,
"learning_rate": 5e-06,
"loss": 0.5346,
"step": 440
},
{
"epoch": 1.0902483343428226,
"grad_norm": 0.6089180089825021,
"learning_rate": 5e-06,
"loss": 0.5341,
"step": 450
},
{
"epoch": 1.1144760751059963,
"grad_norm": 0.6370901566175011,
"learning_rate": 5e-06,
"loss": 0.5278,
"step": 460
},
{
"epoch": 1.1387038158691702,
"grad_norm": 1.0810998793516953,
"learning_rate": 5e-06,
"loss": 0.5282,
"step": 470
},
{
"epoch": 1.1629315566323442,
"grad_norm": 0.5546290586428997,
"learning_rate": 5e-06,
"loss": 0.5294,
"step": 480
},
{
"epoch": 1.1871592973955178,
"grad_norm": 0.5393162492751344,
"learning_rate": 5e-06,
"loss": 0.5312,
"step": 490
},
{
"epoch": 1.2113870381586918,
"grad_norm": 0.539293775840461,
"learning_rate": 5e-06,
"loss": 0.5327,
"step": 500
},
{
"epoch": 1.2356147789218654,
"grad_norm": 0.5510862357568564,
"learning_rate": 5e-06,
"loss": 0.5292,
"step": 510
},
{
"epoch": 1.2598425196850394,
"grad_norm": 0.7420323514260083,
"learning_rate": 5e-06,
"loss": 0.5282,
"step": 520
},
{
"epoch": 1.2840702604482133,
"grad_norm": 0.7139637253621622,
"learning_rate": 5e-06,
"loss": 0.5305,
"step": 530
},
{
"epoch": 1.3082980012113872,
"grad_norm": 0.5530723883454477,
"learning_rate": 5e-06,
"loss": 0.5262,
"step": 540
},
{
"epoch": 1.3325257419745609,
"grad_norm": 0.5575001199886374,
"learning_rate": 5e-06,
"loss": 0.5219,
"step": 550
},
{
"epoch": 1.3567534827377348,
"grad_norm": 0.5598952872055329,
"learning_rate": 5e-06,
"loss": 0.5238,
"step": 560
},
{
"epoch": 1.3809812235009085,
"grad_norm": 0.541421977102063,
"learning_rate": 5e-06,
"loss": 0.5277,
"step": 570
},
{
"epoch": 1.4052089642640824,
"grad_norm": 0.4779799353516479,
"learning_rate": 5e-06,
"loss": 0.5233,
"step": 580
},
{
"epoch": 1.4294367050272563,
"grad_norm": 0.4967009823362501,
"learning_rate": 5e-06,
"loss": 0.5259,
"step": 590
},
{
"epoch": 1.45366444579043,
"grad_norm": 0.671567804923544,
"learning_rate": 5e-06,
"loss": 0.52,
"step": 600
},
{
"epoch": 1.4778921865536039,
"grad_norm": 0.5142557306648593,
"learning_rate": 5e-06,
"loss": 0.5219,
"step": 610
},
{
"epoch": 1.5021199273167776,
"grad_norm": 0.48133107485569593,
"learning_rate": 5e-06,
"loss": 0.5227,
"step": 620
},
{
"epoch": 1.5263476680799517,
"grad_norm": 0.4824602880697514,
"learning_rate": 5e-06,
"loss": 0.5254,
"step": 630
},
{
"epoch": 1.5505754088431254,
"grad_norm": 0.5818160150007355,
"learning_rate": 5e-06,
"loss": 0.5167,
"step": 640
},
{
"epoch": 1.574803149606299,
"grad_norm": 0.5210634892581837,
"learning_rate": 5e-06,
"loss": 0.5188,
"step": 650
},
{
"epoch": 1.5990308903694732,
"grad_norm": 0.5785709705676149,
"learning_rate": 5e-06,
"loss": 0.5247,
"step": 660
},
{
"epoch": 1.623258631132647,
"grad_norm": 0.49452413290015934,
"learning_rate": 5e-06,
"loss": 0.5163,
"step": 670
},
{
"epoch": 1.6474863718958206,
"grad_norm": 0.548802848702529,
"learning_rate": 5e-06,
"loss": 0.5156,
"step": 680
},
{
"epoch": 1.6717141126589945,
"grad_norm": 0.49091799711876205,
"learning_rate": 5e-06,
"loss": 0.519,
"step": 690
},
{
"epoch": 1.6959418534221684,
"grad_norm": 0.49801605722138825,
"learning_rate": 5e-06,
"loss": 0.5168,
"step": 700
},
{
"epoch": 1.720169594185342,
"grad_norm": 0.5325592423676614,
"learning_rate": 5e-06,
"loss": 0.5159,
"step": 710
},
{
"epoch": 1.744397334948516,
"grad_norm": 0.5638908149569352,
"learning_rate": 5e-06,
"loss": 0.5178,
"step": 720
},
{
"epoch": 1.76862507571169,
"grad_norm": 0.5592361495425977,
"learning_rate": 5e-06,
"loss": 0.5201,
"step": 730
},
{
"epoch": 1.7928528164748636,
"grad_norm": 0.6123941159516728,
"learning_rate": 5e-06,
"loss": 0.5158,
"step": 740
},
{
"epoch": 1.8170805572380375,
"grad_norm": 0.6140218865131343,
"learning_rate": 5e-06,
"loss": 0.5212,
"step": 750
},
{
"epoch": 1.8413082980012114,
"grad_norm": 0.5120361752286156,
"learning_rate": 5e-06,
"loss": 0.5165,
"step": 760
},
{
"epoch": 1.8655360387643851,
"grad_norm": 0.6257809652633152,
"learning_rate": 5e-06,
"loss": 0.5154,
"step": 770
},
{
"epoch": 1.889763779527559,
"grad_norm": 0.5517067441775361,
"learning_rate": 5e-06,
"loss": 0.5149,
"step": 780
},
{
"epoch": 1.913991520290733,
"grad_norm": 0.6184341957543669,
"learning_rate": 5e-06,
"loss": 0.5161,
"step": 790
},
{
"epoch": 1.9382192610539066,
"grad_norm": 0.45820885449224846,
"learning_rate": 5e-06,
"loss": 0.5163,
"step": 800
},
{
"epoch": 1.9624470018170805,
"grad_norm": 0.47230402419753703,
"learning_rate": 5e-06,
"loss": 0.5097,
"step": 810
},
{
"epoch": 1.9866747425802544,
"grad_norm": 0.5117535404655704,
"learning_rate": 5e-06,
"loss": 0.5104,
"step": 820
},
{
"epoch": 1.9987886129618413,
"eval_loss": 0.5373826026916504,
"eval_runtime": 221.5646,
"eval_samples_per_second": 50.193,
"eval_steps_per_second": 0.393,
"step": 825
},
{
"epoch": 2.010902483343428,
"grad_norm": 0.7236309822644943,
"learning_rate": 5e-06,
"loss": 0.4992,
"step": 830
},
{
"epoch": 2.0351302241066023,
"grad_norm": 0.51924831256788,
"learning_rate": 5e-06,
"loss": 0.4661,
"step": 840
},
{
"epoch": 2.059357964869776,
"grad_norm": 0.6287431376712824,
"learning_rate": 5e-06,
"loss": 0.4782,
"step": 850
},
{
"epoch": 2.0835857056329496,
"grad_norm": 0.5560237407779974,
"learning_rate": 5e-06,
"loss": 0.4752,
"step": 860
},
{
"epoch": 2.107813446396124,
"grad_norm": 0.4951419799289088,
"learning_rate": 5e-06,
"loss": 0.4783,
"step": 870
},
{
"epoch": 2.1320411871592975,
"grad_norm": 0.6200927154763336,
"learning_rate": 5e-06,
"loss": 0.4744,
"step": 880
},
{
"epoch": 2.156268927922471,
"grad_norm": 0.5434006657342179,
"learning_rate": 5e-06,
"loss": 0.4805,
"step": 890
},
{
"epoch": 2.1804966686856453,
"grad_norm": 0.5223425161148432,
"learning_rate": 5e-06,
"loss": 0.4754,
"step": 900
},
{
"epoch": 2.204724409448819,
"grad_norm": 0.6825783856408546,
"learning_rate": 5e-06,
"loss": 0.4758,
"step": 910
},
{
"epoch": 2.2289521502119927,
"grad_norm": 0.7441341191332547,
"learning_rate": 5e-06,
"loss": 0.4745,
"step": 920
},
{
"epoch": 2.253179890975167,
"grad_norm": 0.5442102121881107,
"learning_rate": 5e-06,
"loss": 0.4784,
"step": 930
},
{
"epoch": 2.2774076317383405,
"grad_norm": 0.6283995286149499,
"learning_rate": 5e-06,
"loss": 0.4804,
"step": 940
},
{
"epoch": 2.301635372501514,
"grad_norm": 0.5816653838824051,
"learning_rate": 5e-06,
"loss": 0.4778,
"step": 950
},
{
"epoch": 2.3258631132646883,
"grad_norm": 0.5549081081953725,
"learning_rate": 5e-06,
"loss": 0.4769,
"step": 960
},
{
"epoch": 2.350090854027862,
"grad_norm": 0.5279973698132591,
"learning_rate": 5e-06,
"loss": 0.4784,
"step": 970
},
{
"epoch": 2.3743185947910357,
"grad_norm": 0.6086532168897438,
"learning_rate": 5e-06,
"loss": 0.4859,
"step": 980
},
{
"epoch": 2.39854633555421,
"grad_norm": 0.5428630520508304,
"learning_rate": 5e-06,
"loss": 0.4797,
"step": 990
},
{
"epoch": 2.4227740763173835,
"grad_norm": 0.5159093369572579,
"learning_rate": 5e-06,
"loss": 0.4767,
"step": 1000
},
{
"epoch": 2.447001817080557,
"grad_norm": 0.6376648956193381,
"learning_rate": 5e-06,
"loss": 0.4797,
"step": 1010
},
{
"epoch": 2.471229557843731,
"grad_norm": 0.5547640740107239,
"learning_rate": 5e-06,
"loss": 0.4747,
"step": 1020
},
{
"epoch": 2.495457298606905,
"grad_norm": 0.4712927664253549,
"learning_rate": 5e-06,
"loss": 0.483,
"step": 1030
},
{
"epoch": 2.5196850393700787,
"grad_norm": 0.5246957682925482,
"learning_rate": 5e-06,
"loss": 0.4831,
"step": 1040
},
{
"epoch": 2.543912780133253,
"grad_norm": 0.49350610935172784,
"learning_rate": 5e-06,
"loss": 0.4761,
"step": 1050
},
{
"epoch": 2.5681405208964265,
"grad_norm": 0.5385640165749516,
"learning_rate": 5e-06,
"loss": 0.4781,
"step": 1060
},
{
"epoch": 2.5923682616596,
"grad_norm": 0.5103595421050182,
"learning_rate": 5e-06,
"loss": 0.4745,
"step": 1070
},
{
"epoch": 2.6165960024227743,
"grad_norm": 0.4917765675984819,
"learning_rate": 5e-06,
"loss": 0.4744,
"step": 1080
},
{
"epoch": 2.640823743185948,
"grad_norm": 0.5057838998647487,
"learning_rate": 5e-06,
"loss": 0.4812,
"step": 1090
},
{
"epoch": 2.6650514839491217,
"grad_norm": 0.5132155115777407,
"learning_rate": 5e-06,
"loss": 0.4714,
"step": 1100
},
{
"epoch": 2.6892792247122954,
"grad_norm": 0.5268065417493973,
"learning_rate": 5e-06,
"loss": 0.4761,
"step": 1110
},
{
"epoch": 2.7135069654754695,
"grad_norm": 0.552687045729179,
"learning_rate": 5e-06,
"loss": 0.4752,
"step": 1120
},
{
"epoch": 2.7377347062386432,
"grad_norm": 0.5709091633003863,
"learning_rate": 5e-06,
"loss": 0.4761,
"step": 1130
},
{
"epoch": 2.761962447001817,
"grad_norm": 0.4924939447065247,
"learning_rate": 5e-06,
"loss": 0.4817,
"step": 1140
},
{
"epoch": 2.786190187764991,
"grad_norm": 0.613145736032828,
"learning_rate": 5e-06,
"loss": 0.4758,
"step": 1150
},
{
"epoch": 2.8104179285281647,
"grad_norm": 0.5517385387267716,
"learning_rate": 5e-06,
"loss": 0.4732,
"step": 1160
},
{
"epoch": 2.8346456692913384,
"grad_norm": 0.5952199661627586,
"learning_rate": 5e-06,
"loss": 0.4832,
"step": 1170
},
{
"epoch": 2.8588734100545126,
"grad_norm": 0.6646202856678104,
"learning_rate": 5e-06,
"loss": 0.4767,
"step": 1180
},
{
"epoch": 2.8831011508176863,
"grad_norm": 0.7488420183599482,
"learning_rate": 5e-06,
"loss": 0.4773,
"step": 1190
},
{
"epoch": 2.90732889158086,
"grad_norm": 0.634712672540643,
"learning_rate": 5e-06,
"loss": 0.4803,
"step": 1200
},
{
"epoch": 2.931556632344034,
"grad_norm": 0.479455288955399,
"learning_rate": 5e-06,
"loss": 0.476,
"step": 1210
},
{
"epoch": 2.9557843731072078,
"grad_norm": 0.4788695949163765,
"learning_rate": 5e-06,
"loss": 0.4792,
"step": 1220
},
{
"epoch": 2.9800121138703815,
"grad_norm": 0.5724033736756394,
"learning_rate": 5e-06,
"loss": 0.4815,
"step": 1230
},
{
"epoch": 2.994548758328286,
"eval_loss": 0.530586838722229,
"eval_runtime": 221.8544,
"eval_samples_per_second": 50.127,
"eval_steps_per_second": 0.392,
"step": 1236
},
{
"epoch": 2.994548758328286,
"step": 1236,
"total_flos": 2069927276052480.0,
"train_loss": 0.5379253575716976,
"train_runtime": 37041.7776,
"train_samples_per_second": 17.113,
"train_steps_per_second": 0.033
}
],
"logging_steps": 10,
"max_steps": 1236,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2069927276052480.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}