{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9972247918593897,
"eval_steps": 500,
"global_step": 810,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03700277520814061,
"grad_norm": 2.5574069547071865,
"learning_rate": 5e-06,
"loss": 1.0349,
"step": 10
},
{
"epoch": 0.07400555041628122,
"grad_norm": 5.900418380831986,
"learning_rate": 5e-06,
"loss": 0.9193,
"step": 20
},
{
"epoch": 0.11100832562442182,
"grad_norm": 1.343234502219187,
"learning_rate": 5e-06,
"loss": 0.8706,
"step": 30
},
{
"epoch": 0.14801110083256244,
"grad_norm": 1.3710163438367058,
"learning_rate": 5e-06,
"loss": 0.8479,
"step": 40
},
{
"epoch": 0.18501387604070305,
"grad_norm": 1.0033978639288033,
"learning_rate": 5e-06,
"loss": 0.8243,
"step": 50
},
{
"epoch": 0.22201665124884365,
"grad_norm": 1.3608933171871491,
"learning_rate": 5e-06,
"loss": 0.8107,
"step": 60
},
{
"epoch": 0.2590194264569843,
"grad_norm": 1.1028614841829054,
"learning_rate": 5e-06,
"loss": 0.799,
"step": 70
},
{
"epoch": 0.2960222016651249,
"grad_norm": 1.0434200254799575,
"learning_rate": 5e-06,
"loss": 0.7885,
"step": 80
},
{
"epoch": 0.3330249768732655,
"grad_norm": 1.229274798797147,
"learning_rate": 5e-06,
"loss": 0.7834,
"step": 90
},
{
"epoch": 0.3700277520814061,
"grad_norm": 0.9205856097213055,
"learning_rate": 5e-06,
"loss": 0.7777,
"step": 100
},
{
"epoch": 0.4070305272895467,
"grad_norm": 0.7497767563492606,
"learning_rate": 5e-06,
"loss": 0.7724,
"step": 110
},
{
"epoch": 0.4440333024976873,
"grad_norm": 0.8373352830011821,
"learning_rate": 5e-06,
"loss": 0.7683,
"step": 120
},
{
"epoch": 0.48103607770582796,
"grad_norm": 0.9313579962563663,
"learning_rate": 5e-06,
"loss": 0.7616,
"step": 130
},
{
"epoch": 0.5180388529139686,
"grad_norm": 0.6590049252216683,
"learning_rate": 5e-06,
"loss": 0.761,
"step": 140
},
{
"epoch": 0.5550416281221091,
"grad_norm": 0.7121322312495858,
"learning_rate": 5e-06,
"loss": 0.7581,
"step": 150
},
{
"epoch": 0.5920444033302498,
"grad_norm": 0.6148417146408288,
"learning_rate": 5e-06,
"loss": 0.7596,
"step": 160
},
{
"epoch": 0.6290471785383904,
"grad_norm": 0.7859140889960612,
"learning_rate": 5e-06,
"loss": 0.7607,
"step": 170
},
{
"epoch": 0.666049953746531,
"grad_norm": 0.6262995603184957,
"learning_rate": 5e-06,
"loss": 0.7536,
"step": 180
},
{
"epoch": 0.7030527289546716,
"grad_norm": 0.8388208254030984,
"learning_rate": 5e-06,
"loss": 0.7509,
"step": 190
},
{
"epoch": 0.7400555041628122,
"grad_norm": 0.6363523499932093,
"learning_rate": 5e-06,
"loss": 0.7477,
"step": 200
},
{
"epoch": 0.7770582793709528,
"grad_norm": 0.557587555852944,
"learning_rate": 5e-06,
"loss": 0.7452,
"step": 210
},
{
"epoch": 0.8140610545790934,
"grad_norm": 0.5789377197252322,
"learning_rate": 5e-06,
"loss": 0.7475,
"step": 220
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.7489244924234153,
"learning_rate": 5e-06,
"loss": 0.7423,
"step": 230
},
{
"epoch": 0.8880666049953746,
"grad_norm": 0.7658948623639423,
"learning_rate": 5e-06,
"loss": 0.7407,
"step": 240
},
{
"epoch": 0.9250693802035153,
"grad_norm": 0.6418871797978494,
"learning_rate": 5e-06,
"loss": 0.7429,
"step": 250
},
{
"epoch": 0.9620721554116559,
"grad_norm": 0.6374125796179182,
"learning_rate": 5e-06,
"loss": 0.7457,
"step": 260
},
{
"epoch": 0.9990749306197965,
"grad_norm": 0.6225924769184604,
"learning_rate": 5e-06,
"loss": 0.7398,
"step": 270
},
{
"epoch": 0.9990749306197965,
"eval_loss": 0.7373877167701721,
"eval_runtime": 285.6741,
"eval_samples_per_second": 25.487,
"eval_steps_per_second": 0.399,
"step": 270
},
{
"epoch": 1.0360777058279371,
"grad_norm": 1.0999156797786174,
"learning_rate": 5e-06,
"loss": 0.7465,
"step": 280
},
{
"epoch": 1.0730804810360777,
"grad_norm": 0.9033567059221405,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 290
},
{
"epoch": 1.1100832562442182,
"grad_norm": 0.7225203751531558,
"learning_rate": 5e-06,
"loss": 0.6891,
"step": 300
},
{
"epoch": 1.147086031452359,
"grad_norm": 0.7744016164334468,
"learning_rate": 5e-06,
"loss": 0.6924,
"step": 310
},
{
"epoch": 1.1840888066604995,
"grad_norm": 0.5778670419847766,
"learning_rate": 5e-06,
"loss": 0.6847,
"step": 320
},
{
"epoch": 1.22109158186864,
"grad_norm": 0.7216085851109396,
"learning_rate": 5e-06,
"loss": 0.6874,
"step": 330
},
{
"epoch": 1.2580943570767809,
"grad_norm": 0.6834497791089044,
"learning_rate": 5e-06,
"loss": 0.685,
"step": 340
},
{
"epoch": 1.2950971322849214,
"grad_norm": 0.5864187616860316,
"learning_rate": 5e-06,
"loss": 0.6887,
"step": 350
},
{
"epoch": 1.332099907493062,
"grad_norm": 0.6435850583829653,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 360
},
{
"epoch": 1.3691026827012025,
"grad_norm": 1.026308036346174,
"learning_rate": 5e-06,
"loss": 0.6914,
"step": 370
},
{
"epoch": 1.4061054579093433,
"grad_norm": 0.6177344760653564,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 380
},
{
"epoch": 1.4431082331174838,
"grad_norm": 0.5769701433909521,
"learning_rate": 5e-06,
"loss": 0.6899,
"step": 390
},
{
"epoch": 1.4801110083256244,
"grad_norm": 0.6884699856368363,
"learning_rate": 5e-06,
"loss": 0.6846,
"step": 400
},
{
"epoch": 1.5171137835337651,
"grad_norm": 0.6251420323292153,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 410
},
{
"epoch": 1.5541165587419057,
"grad_norm": 0.6980683262318477,
"learning_rate": 5e-06,
"loss": 0.6845,
"step": 420
},
{
"epoch": 1.5911193339500462,
"grad_norm": 0.5739588719749933,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 430
},
{
"epoch": 1.6281221091581868,
"grad_norm": 0.5948644114545361,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 440
},
{
"epoch": 1.6651248843663273,
"grad_norm": 0.5752931750826541,
"learning_rate": 5e-06,
"loss": 0.6844,
"step": 450
},
{
"epoch": 1.702127659574468,
"grad_norm": 0.6226615522398108,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 460
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.709209259109901,
"learning_rate": 5e-06,
"loss": 0.6822,
"step": 470
},
{
"epoch": 1.7761332099907494,
"grad_norm": 0.6748952075449096,
"learning_rate": 5e-06,
"loss": 0.6866,
"step": 480
},
{
"epoch": 1.81313598519889,
"grad_norm": 0.7823171266955319,
"learning_rate": 5e-06,
"loss": 0.6865,
"step": 490
},
{
"epoch": 1.8501387604070305,
"grad_norm": 0.9392626078254421,
"learning_rate": 5e-06,
"loss": 0.6879,
"step": 500
},
{
"epoch": 1.887141535615171,
"grad_norm": 0.7275512298704835,
"learning_rate": 5e-06,
"loss": 0.6826,
"step": 510
},
{
"epoch": 1.9241443108233116,
"grad_norm": 0.8091982613912542,
"learning_rate": 5e-06,
"loss": 0.6841,
"step": 520
},
{
"epoch": 1.9611470860314524,
"grad_norm": 0.6148273009527061,
"learning_rate": 5e-06,
"loss": 0.6882,
"step": 530
},
{
"epoch": 1.998149861239593,
"grad_norm": 0.6033253124158411,
"learning_rate": 5e-06,
"loss": 0.6816,
"step": 540
},
{
"epoch": 1.998149861239593,
"eval_loss": 0.7247459888458252,
"eval_runtime": 286.0565,
"eval_samples_per_second": 25.453,
"eval_steps_per_second": 0.399,
"step": 540
},
{
"epoch": 2.0351526364477337,
"grad_norm": 0.6676285360839004,
"learning_rate": 5e-06,
"loss": 0.6817,
"step": 550
},
{
"epoch": 2.0721554116558742,
"grad_norm": 0.6670166248278999,
"learning_rate": 5e-06,
"loss": 0.6302,
"step": 560
},
{
"epoch": 2.109158186864015,
"grad_norm": 0.7594092171720456,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 570
},
{
"epoch": 2.1461609620721553,
"grad_norm": 0.734159893671992,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 580
},
{
"epoch": 2.183163737280296,
"grad_norm": 0.7620304153842595,
"learning_rate": 5e-06,
"loss": 0.6331,
"step": 590
},
{
"epoch": 2.2201665124884364,
"grad_norm": 0.8062383311336945,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 600
},
{
"epoch": 2.2571692876965774,
"grad_norm": 0.6298902178051499,
"learning_rate": 5e-06,
"loss": 0.6324,
"step": 610
},
{
"epoch": 2.294172062904718,
"grad_norm": 0.6917188810207621,
"learning_rate": 5e-06,
"loss": 0.6373,
"step": 620
},
{
"epoch": 2.3311748381128585,
"grad_norm": 0.9550757835002316,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 630
},
{
"epoch": 2.368177613320999,
"grad_norm": 0.5913508720714069,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 640
},
{
"epoch": 2.4051803885291396,
"grad_norm": 0.6758217219286833,
"learning_rate": 5e-06,
"loss": 0.6349,
"step": 650
},
{
"epoch": 2.44218316373728,
"grad_norm": 0.5787936091702227,
"learning_rate": 5e-06,
"loss": 0.632,
"step": 660
},
{
"epoch": 2.4791859389454207,
"grad_norm": 0.5976179095981347,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 670
},
{
"epoch": 2.5161887141535617,
"grad_norm": 0.7954067986819094,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 680
},
{
"epoch": 2.5531914893617023,
"grad_norm": 0.6891125209012705,
"learning_rate": 5e-06,
"loss": 0.6383,
"step": 690
},
{
"epoch": 2.590194264569843,
"grad_norm": 0.8230201955415605,
"learning_rate": 5e-06,
"loss": 0.6371,
"step": 700
},
{
"epoch": 2.6271970397779834,
"grad_norm": 0.6635633756561687,
"learning_rate": 5e-06,
"loss": 0.6382,
"step": 710
},
{
"epoch": 2.664199814986124,
"grad_norm": 0.6306477728740528,
"learning_rate": 5e-06,
"loss": 0.6411,
"step": 720
},
{
"epoch": 2.7012025901942645,
"grad_norm": 0.5984777601069516,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 730
},
{
"epoch": 2.738205365402405,
"grad_norm": 0.7644851120709378,
"learning_rate": 5e-06,
"loss": 0.6348,
"step": 740
},
{
"epoch": 2.7752081406105455,
"grad_norm": 0.6478127083239548,
"learning_rate": 5e-06,
"loss": 0.636,
"step": 750
},
{
"epoch": 2.8122109158186865,
"grad_norm": 0.6453201797896143,
"learning_rate": 5e-06,
"loss": 0.6396,
"step": 760
},
{
"epoch": 2.849213691026827,
"grad_norm": 0.7223841425019709,
"learning_rate": 5e-06,
"loss": 0.639,
"step": 770
},
{
"epoch": 2.8862164662349676,
"grad_norm": 0.8854103875073065,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 780
},
{
"epoch": 2.923219241443108,
"grad_norm": 0.74480497953526,
"learning_rate": 5e-06,
"loss": 0.6387,
"step": 790
},
{
"epoch": 2.9602220166512487,
"grad_norm": 0.5468951542823913,
"learning_rate": 5e-06,
"loss": 0.6346,
"step": 800
},
{
"epoch": 2.9972247918593897,
"grad_norm": 0.7117523367143715,
"learning_rate": 5e-06,
"loss": 0.6406,
"step": 810
},
{
"epoch": 2.9972247918593897,
"eval_loss": 0.7273637056350708,
"eval_runtime": 286.127,
"eval_samples_per_second": 25.447,
"eval_steps_per_second": 0.398,
"step": 810
},
{
"epoch": 2.9972247918593897,
"step": 810,
"total_flos": 1356570789150720.0,
"train_loss": 0.7045574435481319,
"train_runtime": 47714.0524,
"train_samples_per_second": 8.697,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 810,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1356570789150720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}