{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.992688870836718,
"eval_steps": 500,
"global_step": 921,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03249390739236393,
"grad_norm": 2.5250117454296688,
"learning_rate": 5e-06,
"loss": 0.822,
"step": 10
},
{
"epoch": 0.06498781478472786,
"grad_norm": 1.1696004146252543,
"learning_rate": 5e-06,
"loss": 0.7337,
"step": 20
},
{
"epoch": 0.09748172217709179,
"grad_norm": 1.0985755367960321,
"learning_rate": 5e-06,
"loss": 0.7074,
"step": 30
},
{
"epoch": 0.12997562956945571,
"grad_norm": 0.9210546044771958,
"learning_rate": 5e-06,
"loss": 0.6975,
"step": 40
},
{
"epoch": 0.16246953696181965,
"grad_norm": 1.205608692095207,
"learning_rate": 5e-06,
"loss": 0.6824,
"step": 50
},
{
"epoch": 0.19496344435418358,
"grad_norm": 0.9213739096950729,
"learning_rate": 5e-06,
"loss": 0.6687,
"step": 60
},
{
"epoch": 0.22745735174654752,
"grad_norm": 1.1268962045000313,
"learning_rate": 5e-06,
"loss": 0.6663,
"step": 70
},
{
"epoch": 0.25995125913891143,
"grad_norm": 0.7485158028875047,
"learning_rate": 5e-06,
"loss": 0.651,
"step": 80
},
{
"epoch": 0.2924451665312754,
"grad_norm": 1.1057133229023792,
"learning_rate": 5e-06,
"loss": 0.6586,
"step": 90
},
{
"epoch": 0.3249390739236393,
"grad_norm": 0.6278383743313128,
"learning_rate": 5e-06,
"loss": 0.6473,
"step": 100
},
{
"epoch": 0.35743298131600326,
"grad_norm": 0.5769967627285304,
"learning_rate": 5e-06,
"loss": 0.6511,
"step": 110
},
{
"epoch": 0.38992688870836717,
"grad_norm": 0.4919697275903666,
"learning_rate": 5e-06,
"loss": 0.6441,
"step": 120
},
{
"epoch": 0.42242079610073113,
"grad_norm": 0.9547279637518388,
"learning_rate": 5e-06,
"loss": 0.6543,
"step": 130
},
{
"epoch": 0.45491470349309504,
"grad_norm": 0.6806699547566448,
"learning_rate": 5e-06,
"loss": 0.6445,
"step": 140
},
{
"epoch": 0.487408610885459,
"grad_norm": 0.7128274001677305,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 150
},
{
"epoch": 0.5199025182778229,
"grad_norm": 0.5534613632007734,
"learning_rate": 5e-06,
"loss": 0.6325,
"step": 160
},
{
"epoch": 0.5523964256701869,
"grad_norm": 0.6402876644636555,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 170
},
{
"epoch": 0.5848903330625508,
"grad_norm": 0.5061595935903519,
"learning_rate": 5e-06,
"loss": 0.629,
"step": 180
},
{
"epoch": 0.6173842404549147,
"grad_norm": 0.5151243164239879,
"learning_rate": 5e-06,
"loss": 0.6383,
"step": 190
},
{
"epoch": 0.6498781478472786,
"grad_norm": 0.5209850069165646,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 200
},
{
"epoch": 0.6823720552396426,
"grad_norm": 0.53784457051734,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 210
},
{
"epoch": 0.7148659626320065,
"grad_norm": 0.6457223875849278,
"learning_rate": 5e-06,
"loss": 0.6358,
"step": 220
},
{
"epoch": 0.7473598700243704,
"grad_norm": 0.4969903844480984,
"learning_rate": 5e-06,
"loss": 0.6331,
"step": 230
},
{
"epoch": 0.7798537774167343,
"grad_norm": 1.0671390274200754,
"learning_rate": 5e-06,
"loss": 0.6224,
"step": 240
},
{
"epoch": 0.8123476848090982,
"grad_norm": 1.9438042138910412,
"learning_rate": 5e-06,
"loss": 0.6198,
"step": 250
},
{
"epoch": 0.8448415922014623,
"grad_norm": 0.6831461136116804,
"learning_rate": 5e-06,
"loss": 0.6157,
"step": 260
},
{
"epoch": 0.8773354995938262,
"grad_norm": 0.548956486134229,
"learning_rate": 5e-06,
"loss": 0.6116,
"step": 270
},
{
"epoch": 0.9098294069861901,
"grad_norm": 0.5540177414913263,
"learning_rate": 5e-06,
"loss": 0.6326,
"step": 280
},
{
"epoch": 0.942323314378554,
"grad_norm": 0.7706090314968392,
"learning_rate": 5e-06,
"loss": 0.6331,
"step": 290
},
{
"epoch": 0.974817221770918,
"grad_norm": 0.59177053251295,
"learning_rate": 5e-06,
"loss": 0.6185,
"step": 300
},
{
"epoch": 0.9975629569455727,
"eval_loss": 0.6177791357040405,
"eval_runtime": 107.8067,
"eval_samples_per_second": 76.878,
"eval_steps_per_second": 0.603,
"step": 307
},
{
"epoch": 1.007311129163282,
"grad_norm": 1.4811327109031245,
"learning_rate": 5e-06,
"loss": 0.6074,
"step": 310
},
{
"epoch": 1.0398050365556457,
"grad_norm": 0.924221358685102,
"learning_rate": 5e-06,
"loss": 0.5701,
"step": 320
},
{
"epoch": 1.0722989439480097,
"grad_norm": 0.7293714439923176,
"learning_rate": 5e-06,
"loss": 0.5836,
"step": 330
},
{
"epoch": 1.1047928513403737,
"grad_norm": 0.5947409116978826,
"learning_rate": 5e-06,
"loss": 0.5676,
"step": 340
},
{
"epoch": 1.1372867587327375,
"grad_norm": 0.7000520581211176,
"learning_rate": 5e-06,
"loss": 0.5722,
"step": 350
},
{
"epoch": 1.1697806661251016,
"grad_norm": 0.5866183764561665,
"learning_rate": 5e-06,
"loss": 0.5692,
"step": 360
},
{
"epoch": 1.2022745735174656,
"grad_norm": 0.5380736496064281,
"learning_rate": 5e-06,
"loss": 0.5716,
"step": 370
},
{
"epoch": 1.2347684809098294,
"grad_norm": 0.4885682503135813,
"learning_rate": 5e-06,
"loss": 0.573,
"step": 380
},
{
"epoch": 1.2672623883021934,
"grad_norm": 0.48634335674401963,
"learning_rate": 5e-06,
"loss": 0.5759,
"step": 390
},
{
"epoch": 1.2997562956945572,
"grad_norm": 0.6079027143194111,
"learning_rate": 5e-06,
"loss": 0.5812,
"step": 400
},
{
"epoch": 1.3322502030869212,
"grad_norm": 0.5095044846593028,
"learning_rate": 5e-06,
"loss": 0.5781,
"step": 410
},
{
"epoch": 1.3647441104792852,
"grad_norm": 0.7887736133740892,
"learning_rate": 5e-06,
"loss": 0.5829,
"step": 420
},
{
"epoch": 1.397238017871649,
"grad_norm": 0.546342811666024,
"learning_rate": 5e-06,
"loss": 0.5743,
"step": 430
},
{
"epoch": 1.429731925264013,
"grad_norm": 0.49153267657911065,
"learning_rate": 5e-06,
"loss": 0.568,
"step": 440
},
{
"epoch": 1.4622258326563768,
"grad_norm": 0.5236274223462694,
"learning_rate": 5e-06,
"loss": 0.5688,
"step": 450
},
{
"epoch": 1.4947197400487409,
"grad_norm": 0.4672834690085644,
"learning_rate": 5e-06,
"loss": 0.5765,
"step": 460
},
{
"epoch": 1.5272136474411049,
"grad_norm": 0.5572476008854027,
"learning_rate": 5e-06,
"loss": 0.5709,
"step": 470
},
{
"epoch": 1.5597075548334687,
"grad_norm": 0.6292993165160258,
"learning_rate": 5e-06,
"loss": 0.5744,
"step": 480
},
{
"epoch": 1.5922014622258327,
"grad_norm": 0.7710487341485719,
"learning_rate": 5e-06,
"loss": 0.5736,
"step": 490
},
{
"epoch": 1.6246953696181965,
"grad_norm": 0.5087134161939325,
"learning_rate": 5e-06,
"loss": 0.5644,
"step": 500
},
{
"epoch": 1.6571892770105605,
"grad_norm": 0.7400405946910072,
"learning_rate": 5e-06,
"loss": 0.5631,
"step": 510
},
{
"epoch": 1.6896831844029245,
"grad_norm": 0.7142563791899093,
"learning_rate": 5e-06,
"loss": 0.5736,
"step": 520
},
{
"epoch": 1.7221770917952883,
"grad_norm": 0.5733442942289497,
"learning_rate": 5e-06,
"loss": 0.565,
"step": 530
},
{
"epoch": 1.7546709991876523,
"grad_norm": 0.5068224110056896,
"learning_rate": 5e-06,
"loss": 0.5735,
"step": 540
},
{
"epoch": 1.7871649065800161,
"grad_norm": 0.5248635762437218,
"learning_rate": 5e-06,
"loss": 0.5673,
"step": 550
},
{
"epoch": 1.8196588139723802,
"grad_norm": 0.5160341588912364,
"learning_rate": 5e-06,
"loss": 0.5678,
"step": 560
},
{
"epoch": 1.8521527213647442,
"grad_norm": 0.5597640569403803,
"learning_rate": 5e-06,
"loss": 0.5685,
"step": 570
},
{
"epoch": 1.8846466287571082,
"grad_norm": 0.5725824151776507,
"learning_rate": 5e-06,
"loss": 0.5717,
"step": 580
},
{
"epoch": 1.917140536149472,
"grad_norm": 0.6677340004988277,
"learning_rate": 5e-06,
"loss": 0.5744,
"step": 590
},
{
"epoch": 1.9496344435418358,
"grad_norm": 0.48981238508621994,
"learning_rate": 5e-06,
"loss": 0.5646,
"step": 600
},
{
"epoch": 1.9821283509341998,
"grad_norm": 0.5139052165921854,
"learning_rate": 5e-06,
"loss": 0.5652,
"step": 610
},
{
"epoch": 1.9983753046303818,
"eval_loss": 0.6079972982406616,
"eval_runtime": 105.9269,
"eval_samples_per_second": 78.243,
"eval_steps_per_second": 0.614,
"step": 615
},
{
"epoch": 2.014622258326564,
"grad_norm": 0.8669205295109212,
"learning_rate": 5e-06,
"loss": 0.5462,
"step": 620
},
{
"epoch": 2.047116165718928,
"grad_norm": 0.5112180094253919,
"learning_rate": 5e-06,
"loss": 0.5103,
"step": 630
},
{
"epoch": 2.0796100731112914,
"grad_norm": 0.6398716969047498,
"learning_rate": 5e-06,
"loss": 0.5161,
"step": 640
},
{
"epoch": 2.1121039805036554,
"grad_norm": 0.6665002162537883,
"learning_rate": 5e-06,
"loss": 0.5166,
"step": 650
},
{
"epoch": 2.1445978878960195,
"grad_norm": 0.611641023084313,
"learning_rate": 5e-06,
"loss": 0.5057,
"step": 660
},
{
"epoch": 2.1770917952883835,
"grad_norm": 0.5429214844688849,
"learning_rate": 5e-06,
"loss": 0.5193,
"step": 670
},
{
"epoch": 2.2095857026807475,
"grad_norm": 0.590201521626834,
"learning_rate": 5e-06,
"loss": 0.5261,
"step": 680
},
{
"epoch": 2.2420796100731115,
"grad_norm": 0.5384309983189058,
"learning_rate": 5e-06,
"loss": 0.5216,
"step": 690
},
{
"epoch": 2.274573517465475,
"grad_norm": 0.6332831563355791,
"learning_rate": 5e-06,
"loss": 0.5239,
"step": 700
},
{
"epoch": 2.307067424857839,
"grad_norm": 0.5774048376947836,
"learning_rate": 5e-06,
"loss": 0.518,
"step": 710
},
{
"epoch": 2.339561332250203,
"grad_norm": 0.5715485935547556,
"learning_rate": 5e-06,
"loss": 0.5311,
"step": 720
},
{
"epoch": 2.372055239642567,
"grad_norm": 0.6112818977221842,
"learning_rate": 5e-06,
"loss": 0.5225,
"step": 730
},
{
"epoch": 2.404549147034931,
"grad_norm": 0.5043334772464461,
"learning_rate": 5e-06,
"loss": 0.5228,
"step": 740
},
{
"epoch": 2.4370430544272947,
"grad_norm": 0.5427442635398124,
"learning_rate": 5e-06,
"loss": 0.5269,
"step": 750
},
{
"epoch": 2.4695369618196588,
"grad_norm": 0.48115676937900054,
"learning_rate": 5e-06,
"loss": 0.5175,
"step": 760
},
{
"epoch": 2.502030869212023,
"grad_norm": 0.5657773427745864,
"learning_rate": 5e-06,
"loss": 0.5219,
"step": 770
},
{
"epoch": 2.534524776604387,
"grad_norm": 0.6022938542861616,
"learning_rate": 5e-06,
"loss": 0.5196,
"step": 780
},
{
"epoch": 2.567018683996751,
"grad_norm": 0.501463044421915,
"learning_rate": 5e-06,
"loss": 0.5167,
"step": 790
},
{
"epoch": 2.5995125913891144,
"grad_norm": 0.5141918965616773,
"learning_rate": 5e-06,
"loss": 0.5186,
"step": 800
},
{
"epoch": 2.6320064987814784,
"grad_norm": 0.549194326600886,
"learning_rate": 5e-06,
"loss": 0.5269,
"step": 810
},
{
"epoch": 2.6645004061738424,
"grad_norm": 0.6157662245530849,
"learning_rate": 5e-06,
"loss": 0.5241,
"step": 820
},
{
"epoch": 2.6969943135662064,
"grad_norm": 0.6492362584129086,
"learning_rate": 5e-06,
"loss": 0.52,
"step": 830
},
{
"epoch": 2.7294882209585705,
"grad_norm": 0.532657814156255,
"learning_rate": 5e-06,
"loss": 0.5218,
"step": 840
},
{
"epoch": 2.761982128350934,
"grad_norm": 0.5072209705902165,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 850
},
{
"epoch": 2.794476035743298,
"grad_norm": 0.5590113792642208,
"learning_rate": 5e-06,
"loss": 0.5237,
"step": 860
},
{
"epoch": 2.826969943135662,
"grad_norm": 0.5766994462509862,
"learning_rate": 5e-06,
"loss": 0.522,
"step": 870
},
{
"epoch": 2.859463850528026,
"grad_norm": 0.5390978647646242,
"learning_rate": 5e-06,
"loss": 0.5188,
"step": 880
},
{
"epoch": 2.89195775792039,
"grad_norm": 0.5874802051466262,
"learning_rate": 5e-06,
"loss": 0.5252,
"step": 890
},
{
"epoch": 2.9244516653127537,
"grad_norm": 0.4949352435636162,
"learning_rate": 5e-06,
"loss": 0.5181,
"step": 900
},
{
"epoch": 2.9569455727051177,
"grad_norm": 0.5960858577490875,
"learning_rate": 5e-06,
"loss": 0.5282,
"step": 910
},
{
"epoch": 2.9894394800974817,
"grad_norm": 0.551557074132645,
"learning_rate": 5e-06,
"loss": 0.5197,
"step": 920
},
{
"epoch": 2.992688870836718,
"eval_loss": 0.6120628714561462,
"eval_runtime": 104.3409,
"eval_samples_per_second": 79.432,
"eval_steps_per_second": 0.623,
"step": 921
},
{
"epoch": 2.992688870836718,
"step": 921,
"total_flos": 1542290543738880.0,
"train_loss": 0.5815144646245416,
"train_runtime": 15433.4875,
"train_samples_per_second": 30.607,
"train_steps_per_second": 0.06
}
],
"logging_steps": 10,
"max_steps": 921,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1542290543738880.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}