oh_v1.3_airoboros_x8 / trainer_state.json
sedrickkeh's picture
End of training
5aff591 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998778998778999,
"eval_steps": 500,
"global_step": 921,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03256003256003256,
"grad_norm": 5.323436630800234,
"learning_rate": 5e-06,
"loss": 1.0327,
"step": 10
},
{
"epoch": 0.06512006512006512,
"grad_norm": 1.688128003809466,
"learning_rate": 5e-06,
"loss": 0.9062,
"step": 20
},
{
"epoch": 0.09768009768009768,
"grad_norm": 1.2140455994223702,
"learning_rate": 5e-06,
"loss": 0.8619,
"step": 30
},
{
"epoch": 0.13024013024013023,
"grad_norm": 1.892078953122378,
"learning_rate": 5e-06,
"loss": 0.8392,
"step": 40
},
{
"epoch": 0.1628001628001628,
"grad_norm": 1.3493280342186107,
"learning_rate": 5e-06,
"loss": 0.8302,
"step": 50
},
{
"epoch": 0.19536019536019536,
"grad_norm": 1.404398870605967,
"learning_rate": 5e-06,
"loss": 0.8146,
"step": 60
},
{
"epoch": 0.22792022792022792,
"grad_norm": 1.716904927996173,
"learning_rate": 5e-06,
"loss": 0.7984,
"step": 70
},
{
"epoch": 0.26048026048026046,
"grad_norm": 1.1043041468212171,
"learning_rate": 5e-06,
"loss": 0.7822,
"step": 80
},
{
"epoch": 0.29304029304029305,
"grad_norm": 1.1394393992724225,
"learning_rate": 5e-06,
"loss": 0.7747,
"step": 90
},
{
"epoch": 0.3256003256003256,
"grad_norm": 0.9041488774288327,
"learning_rate": 5e-06,
"loss": 0.7717,
"step": 100
},
{
"epoch": 0.3581603581603582,
"grad_norm": 0.9299755697835647,
"learning_rate": 5e-06,
"loss": 0.7678,
"step": 110
},
{
"epoch": 0.3907203907203907,
"grad_norm": 0.993036115619975,
"learning_rate": 5e-06,
"loss": 0.7683,
"step": 120
},
{
"epoch": 0.42328042328042326,
"grad_norm": 0.6582951208355411,
"learning_rate": 5e-06,
"loss": 0.7628,
"step": 130
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.7519239236084948,
"learning_rate": 5e-06,
"loss": 0.7565,
"step": 140
},
{
"epoch": 0.4884004884004884,
"grad_norm": 0.6532749233523701,
"learning_rate": 5e-06,
"loss": 0.7542,
"step": 150
},
{
"epoch": 0.5209605209605209,
"grad_norm": 0.6538729701357949,
"learning_rate": 5e-06,
"loss": 0.752,
"step": 160
},
{
"epoch": 0.5535205535205535,
"grad_norm": 0.5799393154553659,
"learning_rate": 5e-06,
"loss": 0.7495,
"step": 170
},
{
"epoch": 0.5860805860805861,
"grad_norm": 0.5825669893589298,
"learning_rate": 5e-06,
"loss": 0.7428,
"step": 180
},
{
"epoch": 0.6186406186406186,
"grad_norm": 0.7509089505557253,
"learning_rate": 5e-06,
"loss": 0.7449,
"step": 190
},
{
"epoch": 0.6512006512006512,
"grad_norm": 0.8999008774030093,
"learning_rate": 5e-06,
"loss": 0.7512,
"step": 200
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.6383792756493567,
"learning_rate": 5e-06,
"loss": 0.7357,
"step": 210
},
{
"epoch": 0.7163207163207164,
"grad_norm": 0.7456320321321589,
"learning_rate": 5e-06,
"loss": 0.7407,
"step": 220
},
{
"epoch": 0.7488807488807488,
"grad_norm": 0.6143588238010131,
"learning_rate": 5e-06,
"loss": 0.7365,
"step": 230
},
{
"epoch": 0.7814407814407814,
"grad_norm": 0.7213470058027147,
"learning_rate": 5e-06,
"loss": 0.74,
"step": 240
},
{
"epoch": 0.814000814000814,
"grad_norm": 0.6247707373236687,
"learning_rate": 5e-06,
"loss": 0.737,
"step": 250
},
{
"epoch": 0.8465608465608465,
"grad_norm": 0.7179277915838116,
"learning_rate": 5e-06,
"loss": 0.7391,
"step": 260
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.5751713737264728,
"learning_rate": 5e-06,
"loss": 0.7256,
"step": 270
},
{
"epoch": 0.9116809116809117,
"grad_norm": 1.0076623893106156,
"learning_rate": 5e-06,
"loss": 0.7292,
"step": 280
},
{
"epoch": 0.9442409442409443,
"grad_norm": 0.9261278576758778,
"learning_rate": 5e-06,
"loss": 0.7307,
"step": 290
},
{
"epoch": 0.9768009768009768,
"grad_norm": 0.5709384175023418,
"learning_rate": 5e-06,
"loss": 0.7273,
"step": 300
},
{
"epoch": 0.9995929995929996,
"eval_loss": 0.727676272392273,
"eval_runtime": 323.9497,
"eval_samples_per_second": 25.544,
"eval_steps_per_second": 0.401,
"step": 307
},
{
"epoch": 1.0093610093610093,
"grad_norm": 0.9162973627143661,
"learning_rate": 5e-06,
"loss": 0.775,
"step": 310
},
{
"epoch": 1.0419210419210418,
"grad_norm": 0.6625376118384887,
"learning_rate": 5e-06,
"loss": 0.679,
"step": 320
},
{
"epoch": 1.0744810744810744,
"grad_norm": 0.6807366677436052,
"learning_rate": 5e-06,
"loss": 0.6787,
"step": 330
},
{
"epoch": 1.107041107041107,
"grad_norm": 0.7495116413050847,
"learning_rate": 5e-06,
"loss": 0.6805,
"step": 340
},
{
"epoch": 1.1396011396011396,
"grad_norm": 0.6693097946610393,
"learning_rate": 5e-06,
"loss": 0.6759,
"step": 350
},
{
"epoch": 1.1721611721611722,
"grad_norm": 0.6269071277974789,
"learning_rate": 5e-06,
"loss": 0.6793,
"step": 360
},
{
"epoch": 1.2047212047212048,
"grad_norm": 0.701447231936067,
"learning_rate": 5e-06,
"loss": 0.6739,
"step": 370
},
{
"epoch": 1.2372812372812372,
"grad_norm": 0.6286463507729596,
"learning_rate": 5e-06,
"loss": 0.6738,
"step": 380
},
{
"epoch": 1.2698412698412698,
"grad_norm": 0.678099904425436,
"learning_rate": 5e-06,
"loss": 0.6775,
"step": 390
},
{
"epoch": 1.3024013024013024,
"grad_norm": 0.7402015170342834,
"learning_rate": 5e-06,
"loss": 0.6755,
"step": 400
},
{
"epoch": 1.334961334961335,
"grad_norm": 0.8963076513215479,
"learning_rate": 5e-06,
"loss": 0.6747,
"step": 410
},
{
"epoch": 1.3675213675213675,
"grad_norm": 0.758606230124057,
"learning_rate": 5e-06,
"loss": 0.6795,
"step": 420
},
{
"epoch": 1.4000814000814001,
"grad_norm": 0.755225029704983,
"learning_rate": 5e-06,
"loss": 0.6782,
"step": 430
},
{
"epoch": 1.4326414326414327,
"grad_norm": 0.9296144227043265,
"learning_rate": 5e-06,
"loss": 0.6706,
"step": 440
},
{
"epoch": 1.4652014652014653,
"grad_norm": 0.6465031575836382,
"learning_rate": 5e-06,
"loss": 0.6708,
"step": 450
},
{
"epoch": 1.4977614977614977,
"grad_norm": 0.5928157369911277,
"learning_rate": 5e-06,
"loss": 0.6763,
"step": 460
},
{
"epoch": 1.5303215303215303,
"grad_norm": 0.6210442459071823,
"learning_rate": 5e-06,
"loss": 0.6749,
"step": 470
},
{
"epoch": 1.5628815628815629,
"grad_norm": 0.6163699649091662,
"learning_rate": 5e-06,
"loss": 0.6755,
"step": 480
},
{
"epoch": 1.5954415954415955,
"grad_norm": 0.8040173316442683,
"learning_rate": 5e-06,
"loss": 0.6704,
"step": 490
},
{
"epoch": 1.6280016280016278,
"grad_norm": 0.6887993451391516,
"learning_rate": 5e-06,
"loss": 0.6701,
"step": 500
},
{
"epoch": 1.6605616605616604,
"grad_norm": 0.6197281649463939,
"learning_rate": 5e-06,
"loss": 0.6726,
"step": 510
},
{
"epoch": 1.693121693121693,
"grad_norm": 0.619478860918107,
"learning_rate": 5e-06,
"loss": 0.6751,
"step": 520
},
{
"epoch": 1.7256817256817256,
"grad_norm": 0.6773051427286838,
"learning_rate": 5e-06,
"loss": 0.6641,
"step": 530
},
{
"epoch": 1.7582417582417582,
"grad_norm": 0.6286720338866559,
"learning_rate": 5e-06,
"loss": 0.6796,
"step": 540
},
{
"epoch": 1.7908017908017908,
"grad_norm": 0.7533774989219207,
"learning_rate": 5e-06,
"loss": 0.672,
"step": 550
},
{
"epoch": 1.8233618233618234,
"grad_norm": 0.7731184689615994,
"learning_rate": 5e-06,
"loss": 0.6659,
"step": 560
},
{
"epoch": 1.855921855921856,
"grad_norm": 0.7416671731262793,
"learning_rate": 5e-06,
"loss": 0.6746,
"step": 570
},
{
"epoch": 1.8884818884818886,
"grad_norm": 0.6128076594680967,
"learning_rate": 5e-06,
"loss": 0.6716,
"step": 580
},
{
"epoch": 1.9210419210419212,
"grad_norm": 0.7891628980046747,
"learning_rate": 5e-06,
"loss": 0.6697,
"step": 590
},
{
"epoch": 1.9536019536019538,
"grad_norm": 0.6796937767570254,
"learning_rate": 5e-06,
"loss": 0.6651,
"step": 600
},
{
"epoch": 1.9861619861619861,
"grad_norm": 0.6743284971968594,
"learning_rate": 5e-06,
"loss": 0.6701,
"step": 610
},
{
"epoch": 1.999185999185999,
"eval_loss": 0.711691677570343,
"eval_runtime": 324.9391,
"eval_samples_per_second": 25.466,
"eval_steps_per_second": 0.4,
"step": 614
},
{
"epoch": 2.0187220187220185,
"grad_norm": 1.025163234157577,
"learning_rate": 5e-06,
"loss": 0.6947,
"step": 620
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.8628252708559125,
"learning_rate": 5e-06,
"loss": 0.6187,
"step": 630
},
{
"epoch": 2.0838420838420837,
"grad_norm": 0.6783100628721863,
"learning_rate": 5e-06,
"loss": 0.6179,
"step": 640
},
{
"epoch": 2.1164021164021163,
"grad_norm": 0.6421565201150183,
"learning_rate": 5e-06,
"loss": 0.6162,
"step": 650
},
{
"epoch": 2.148962148962149,
"grad_norm": 0.685352763219904,
"learning_rate": 5e-06,
"loss": 0.6177,
"step": 660
},
{
"epoch": 2.1815221815221815,
"grad_norm": 0.6061382299294098,
"learning_rate": 5e-06,
"loss": 0.613,
"step": 670
},
{
"epoch": 2.214082214082214,
"grad_norm": 0.690472583201057,
"learning_rate": 5e-06,
"loss": 0.6157,
"step": 680
},
{
"epoch": 2.2466422466422467,
"grad_norm": 0.627437676785234,
"learning_rate": 5e-06,
"loss": 0.6187,
"step": 690
},
{
"epoch": 2.2792022792022792,
"grad_norm": 0.6938080734685778,
"learning_rate": 5e-06,
"loss": 0.6226,
"step": 700
},
{
"epoch": 2.311762311762312,
"grad_norm": 0.772959190894534,
"learning_rate": 5e-06,
"loss": 0.6182,
"step": 710
},
{
"epoch": 2.3443223443223444,
"grad_norm": 0.5713521350519779,
"learning_rate": 5e-06,
"loss": 0.6213,
"step": 720
},
{
"epoch": 2.376882376882377,
"grad_norm": 0.6443040936760203,
"learning_rate": 5e-06,
"loss": 0.6224,
"step": 730
},
{
"epoch": 2.4094424094424096,
"grad_norm": 0.5889564557828441,
"learning_rate": 5e-06,
"loss": 0.6203,
"step": 740
},
{
"epoch": 2.442002442002442,
"grad_norm": 0.709826472700304,
"learning_rate": 5e-06,
"loss": 0.6193,
"step": 750
},
{
"epoch": 2.4745624745624744,
"grad_norm": 0.7335788363502472,
"learning_rate": 5e-06,
"loss": 0.623,
"step": 760
},
{
"epoch": 2.5071225071225074,
"grad_norm": 0.6283405015720556,
"learning_rate": 5e-06,
"loss": 0.6188,
"step": 770
},
{
"epoch": 2.5396825396825395,
"grad_norm": 0.6952325423084712,
"learning_rate": 5e-06,
"loss": 0.6209,
"step": 780
},
{
"epoch": 2.572242572242572,
"grad_norm": 0.6559620535420857,
"learning_rate": 5e-06,
"loss": 0.6201,
"step": 790
},
{
"epoch": 2.6048026048026047,
"grad_norm": 0.7118809496834119,
"learning_rate": 5e-06,
"loss": 0.6198,
"step": 800
},
{
"epoch": 2.6373626373626373,
"grad_norm": 0.7214373132810955,
"learning_rate": 5e-06,
"loss": 0.621,
"step": 810
},
{
"epoch": 2.66992266992267,
"grad_norm": 0.6415485259710608,
"learning_rate": 5e-06,
"loss": 0.6229,
"step": 820
},
{
"epoch": 2.7024827024827025,
"grad_norm": 0.5655240384143257,
"learning_rate": 5e-06,
"loss": 0.623,
"step": 830
},
{
"epoch": 2.735042735042735,
"grad_norm": 0.7379491136681863,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 840
},
{
"epoch": 2.7676027676027677,
"grad_norm": 0.6816392250287217,
"learning_rate": 5e-06,
"loss": 0.6213,
"step": 850
},
{
"epoch": 2.8001628001628003,
"grad_norm": 0.6788149050134666,
"learning_rate": 5e-06,
"loss": 0.6219,
"step": 860
},
{
"epoch": 2.832722832722833,
"grad_norm": 0.5660568888358906,
"learning_rate": 5e-06,
"loss": 0.6211,
"step": 870
},
{
"epoch": 2.8652828652828655,
"grad_norm": 0.59814839030772,
"learning_rate": 5e-06,
"loss": 0.6172,
"step": 880
},
{
"epoch": 2.8978428978428976,
"grad_norm": 0.7047473448081863,
"learning_rate": 5e-06,
"loss": 0.6207,
"step": 890
},
{
"epoch": 2.9304029304029307,
"grad_norm": 0.9367887506446253,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 900
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.5836189827444674,
"learning_rate": 5e-06,
"loss": 0.6189,
"step": 910
},
{
"epoch": 2.9955229955229954,
"grad_norm": 0.8008103377337297,
"learning_rate": 5e-06,
"loss": 0.6223,
"step": 920
},
{
"epoch": 2.998778998778999,
"eval_loss": 0.7124439477920532,
"eval_runtime": 324.9155,
"eval_samples_per_second": 25.468,
"eval_steps_per_second": 0.4,
"step": 921
},
{
"epoch": 2.998778998778999,
"step": 921,
"total_flos": 1542499923394560.0,
"train_loss": 0.6911445101490498,
"train_runtime": 53981.7348,
"train_samples_per_second": 8.737,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 921,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1542499923394560.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}