{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9972041006523766,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03727865796831314,
"grad_norm": 2.863966708076063,
"learning_rate": 5e-06,
"loss": 1.026,
"step": 10
},
{
"epoch": 0.07455731593662628,
"grad_norm": 1.3926538767403338,
"learning_rate": 5e-06,
"loss": 0.9052,
"step": 20
},
{
"epoch": 0.11183597390493942,
"grad_norm": 1.2844171360679957,
"learning_rate": 5e-06,
"loss": 0.8595,
"step": 30
},
{
"epoch": 0.14911463187325255,
"grad_norm": 1.2804154175067717,
"learning_rate": 5e-06,
"loss": 0.8369,
"step": 40
},
{
"epoch": 0.1863932898415657,
"grad_norm": 1.2295434002540855,
"learning_rate": 5e-06,
"loss": 0.8189,
"step": 50
},
{
"epoch": 0.22367194780987884,
"grad_norm": 0.9533085893615539,
"learning_rate": 5e-06,
"loss": 0.8029,
"step": 60
},
{
"epoch": 0.26095060577819196,
"grad_norm": 0.9173543178817729,
"learning_rate": 5e-06,
"loss": 0.7942,
"step": 70
},
{
"epoch": 0.2982292637465051,
"grad_norm": 1.1035649744908476,
"learning_rate": 5e-06,
"loss": 0.7819,
"step": 80
},
{
"epoch": 0.33550792171481825,
"grad_norm": 1.0858240553462641,
"learning_rate": 5e-06,
"loss": 0.7749,
"step": 90
},
{
"epoch": 0.3727865796831314,
"grad_norm": 0.7409831961561815,
"learning_rate": 5e-06,
"loss": 0.7747,
"step": 100
},
{
"epoch": 0.41006523765144454,
"grad_norm": 0.7383266991759111,
"learning_rate": 5e-06,
"loss": 0.771,
"step": 110
},
{
"epoch": 0.4473438956197577,
"grad_norm": 0.729136950048857,
"learning_rate": 5e-06,
"loss": 0.7661,
"step": 120
},
{
"epoch": 0.4846225535880708,
"grad_norm": 0.7091298599299047,
"learning_rate": 5e-06,
"loss": 0.7686,
"step": 130
},
{
"epoch": 0.5219012115563839,
"grad_norm": 0.8897155658891847,
"learning_rate": 5e-06,
"loss": 0.7585,
"step": 140
},
{
"epoch": 0.5591798695246971,
"grad_norm": 0.991237889992658,
"learning_rate": 5e-06,
"loss": 0.7596,
"step": 150
},
{
"epoch": 0.5964585274930102,
"grad_norm": 0.6956773386703347,
"learning_rate": 5e-06,
"loss": 0.7567,
"step": 160
},
{
"epoch": 0.6337371854613234,
"grad_norm": 0.6627254486504695,
"learning_rate": 5e-06,
"loss": 0.7553,
"step": 170
},
{
"epoch": 0.6710158434296365,
"grad_norm": 0.8294084043245143,
"learning_rate": 5e-06,
"loss": 0.7507,
"step": 180
},
{
"epoch": 0.7082945013979497,
"grad_norm": 0.8385421799416569,
"learning_rate": 5e-06,
"loss": 0.752,
"step": 190
},
{
"epoch": 0.7455731593662628,
"grad_norm": 1.2138081527115805,
"learning_rate": 5e-06,
"loss": 0.7478,
"step": 200
},
{
"epoch": 0.782851817334576,
"grad_norm": 0.6817842666893509,
"learning_rate": 5e-06,
"loss": 0.7459,
"step": 210
},
{
"epoch": 0.8201304753028891,
"grad_norm": 0.6011497469129173,
"learning_rate": 5e-06,
"loss": 0.7457,
"step": 220
},
{
"epoch": 0.8574091332712023,
"grad_norm": 0.737363078092173,
"learning_rate": 5e-06,
"loss": 0.7454,
"step": 230
},
{
"epoch": 0.8946877912395154,
"grad_norm": 0.9165088326689114,
"learning_rate": 5e-06,
"loss": 0.7399,
"step": 240
},
{
"epoch": 0.9319664492078286,
"grad_norm": 0.631133171846698,
"learning_rate": 5e-06,
"loss": 0.7459,
"step": 250
},
{
"epoch": 0.9692451071761417,
"grad_norm": 0.6095804573659136,
"learning_rate": 5e-06,
"loss": 0.7451,
"step": 260
},
{
"epoch": 0.9990680335507922,
"eval_loss": 0.7414608001708984,
"eval_runtime": 285.7705,
"eval_samples_per_second": 25.286,
"eval_steps_per_second": 0.395,
"step": 268
},
{
"epoch": 1.0065237651444547,
"grad_norm": 0.991121087496217,
"learning_rate": 5e-06,
"loss": 0.7852,
"step": 270
},
{
"epoch": 1.0438024231127678,
"grad_norm": 0.9766463352882261,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 280
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.9726663140123732,
"learning_rate": 5e-06,
"loss": 0.6941,
"step": 290
},
{
"epoch": 1.1183597390493942,
"grad_norm": 0.7674442614556171,
"learning_rate": 5e-06,
"loss": 0.6928,
"step": 300
},
{
"epoch": 1.1556383970177073,
"grad_norm": 0.686949635032861,
"learning_rate": 5e-06,
"loss": 0.6868,
"step": 310
},
{
"epoch": 1.1929170549860204,
"grad_norm": 0.7040486633398215,
"learning_rate": 5e-06,
"loss": 0.6923,
"step": 320
},
{
"epoch": 1.2301957129543337,
"grad_norm": 0.757865125886295,
"learning_rate": 5e-06,
"loss": 0.6901,
"step": 330
},
{
"epoch": 1.2674743709226468,
"grad_norm": 0.9846801239791743,
"learning_rate": 5e-06,
"loss": 0.6918,
"step": 340
},
{
"epoch": 1.30475302889096,
"grad_norm": 0.6230374762078432,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 350
},
{
"epoch": 1.342031686859273,
"grad_norm": 0.6833293101908209,
"learning_rate": 5e-06,
"loss": 0.6882,
"step": 360
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.7286800883906255,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 370
},
{
"epoch": 1.4165890027958994,
"grad_norm": 0.7749966544687281,
"learning_rate": 5e-06,
"loss": 0.6913,
"step": 380
},
{
"epoch": 1.4538676607642125,
"grad_norm": 0.77958879320336,
"learning_rate": 5e-06,
"loss": 0.6897,
"step": 390
},
{
"epoch": 1.4911463187325256,
"grad_norm": 0.7140550646519259,
"learning_rate": 5e-06,
"loss": 0.685,
"step": 400
},
{
"epoch": 1.5284249767008387,
"grad_norm": 0.6584304607146931,
"learning_rate": 5e-06,
"loss": 0.6902,
"step": 410
},
{
"epoch": 1.565703634669152,
"grad_norm": 0.7452382115118451,
"learning_rate": 5e-06,
"loss": 0.6848,
"step": 420
},
{
"epoch": 1.602982292637465,
"grad_norm": 0.8519625836288258,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 430
},
{
"epoch": 1.6402609506057781,
"grad_norm": 0.562624844847511,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 440
},
{
"epoch": 1.6775396085740915,
"grad_norm": 0.7282578944985719,
"learning_rate": 5e-06,
"loss": 0.6925,
"step": 450
},
{
"epoch": 1.7148182665424043,
"grad_norm": 0.8007629798945024,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 460
},
{
"epoch": 1.7520969245107176,
"grad_norm": 0.6847302844270698,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 470
},
{
"epoch": 1.7893755824790307,
"grad_norm": 0.6294374666555999,
"learning_rate": 5e-06,
"loss": 0.6845,
"step": 480
},
{
"epoch": 1.8266542404473438,
"grad_norm": 0.7641590639056253,
"learning_rate": 5e-06,
"loss": 0.6862,
"step": 490
},
{
"epoch": 1.8639328984156571,
"grad_norm": 0.666418306068689,
"learning_rate": 5e-06,
"loss": 0.6899,
"step": 500
},
{
"epoch": 1.9012115563839702,
"grad_norm": 0.5887624682915402,
"learning_rate": 5e-06,
"loss": 0.6839,
"step": 510
},
{
"epoch": 1.9384902143522833,
"grad_norm": 0.6878912984211528,
"learning_rate": 5e-06,
"loss": 0.6822,
"step": 520
},
{
"epoch": 1.9757688723205966,
"grad_norm": 0.6704088372022132,
"learning_rate": 5e-06,
"loss": 0.6865,
"step": 530
},
{
"epoch": 1.9981360671015844,
"eval_loss": 0.7287164330482483,
"eval_runtime": 286.215,
"eval_samples_per_second": 25.247,
"eval_steps_per_second": 0.395,
"step": 536
},
{
"epoch": 2.0130475302889095,
"grad_norm": 0.7652802012798461,
"learning_rate": 5e-06,
"loss": 0.7188,
"step": 540
},
{
"epoch": 2.050326188257223,
"grad_norm": 0.6143634312705478,
"learning_rate": 5e-06,
"loss": 0.6374,
"step": 550
},
{
"epoch": 2.0876048462255357,
"grad_norm": 0.7815576376342297,
"learning_rate": 5e-06,
"loss": 0.627,
"step": 560
},
{
"epoch": 2.124883504193849,
"grad_norm": 0.6461505452236371,
"learning_rate": 5e-06,
"loss": 0.6339,
"step": 570
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.6479801052923935,
"learning_rate": 5e-06,
"loss": 0.6367,
"step": 580
},
{
"epoch": 2.199440820130475,
"grad_norm": 0.795040730406223,
"learning_rate": 5e-06,
"loss": 0.6361,
"step": 590
},
{
"epoch": 2.2367194780987885,
"grad_norm": 0.718949197084576,
"learning_rate": 5e-06,
"loss": 0.6361,
"step": 600
},
{
"epoch": 2.2739981360671018,
"grad_norm": 0.6166412328819074,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 610
},
{
"epoch": 2.3112767940354146,
"grad_norm": 0.6766175977068568,
"learning_rate": 5e-06,
"loss": 0.639,
"step": 620
},
{
"epoch": 2.348555452003728,
"grad_norm": 0.7034913016991963,
"learning_rate": 5e-06,
"loss": 0.6411,
"step": 630
},
{
"epoch": 2.385834109972041,
"grad_norm": 0.6509863784269144,
"learning_rate": 5e-06,
"loss": 0.6368,
"step": 640
},
{
"epoch": 2.423112767940354,
"grad_norm": 0.715734199522274,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 650
},
{
"epoch": 2.4603914259086674,
"grad_norm": 0.6552323242186081,
"learning_rate": 5e-06,
"loss": 0.634,
"step": 660
},
{
"epoch": 2.4976700838769803,
"grad_norm": 0.6632367791952104,
"learning_rate": 5e-06,
"loss": 0.6404,
"step": 670
},
{
"epoch": 2.5349487418452936,
"grad_norm": 0.7300194823086572,
"learning_rate": 5e-06,
"loss": 0.6421,
"step": 680
},
{
"epoch": 2.572227399813607,
"grad_norm": 0.5964985990739192,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 690
},
{
"epoch": 2.60950605778192,
"grad_norm": 0.8180239269430768,
"learning_rate": 5e-06,
"loss": 0.6376,
"step": 700
},
{
"epoch": 2.646784715750233,
"grad_norm": 0.6860649388075701,
"learning_rate": 5e-06,
"loss": 0.6368,
"step": 710
},
{
"epoch": 2.684063373718546,
"grad_norm": 1.1051628648207943,
"learning_rate": 5e-06,
"loss": 0.6375,
"step": 720
},
{
"epoch": 2.7213420316868593,
"grad_norm": 0.6214860543682473,
"learning_rate": 5e-06,
"loss": 0.6398,
"step": 730
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.7826068718341928,
"learning_rate": 5e-06,
"loss": 0.6378,
"step": 740
},
{
"epoch": 2.7958993476234855,
"grad_norm": 0.801079615885771,
"learning_rate": 5e-06,
"loss": 0.6413,
"step": 750
},
{
"epoch": 2.8331780055917988,
"grad_norm": 0.5970154468549519,
"learning_rate": 5e-06,
"loss": 0.641,
"step": 760
},
{
"epoch": 2.8704566635601116,
"grad_norm": 0.7276565987601038,
"learning_rate": 5e-06,
"loss": 0.6388,
"step": 770
},
{
"epoch": 2.907735321528425,
"grad_norm": 0.690824403756736,
"learning_rate": 5e-06,
"loss": 0.6391,
"step": 780
},
{
"epoch": 2.9450139794967383,
"grad_norm": 0.761736653534516,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 790
},
{
"epoch": 2.982292637465051,
"grad_norm": 0.6454438184790852,
"learning_rate": 5e-06,
"loss": 0.6415,
"step": 800
},
{
"epoch": 2.9972041006523766,
"eval_loss": 0.7303594946861267,
"eval_runtime": 285.1186,
"eval_samples_per_second": 25.344,
"eval_steps_per_second": 0.396,
"step": 804
},
{
"epoch": 2.9972041006523766,
"step": 804,
"total_flos": 1346520565678080.0,
"train_loss": 0.7048911662837166,
"train_runtime": 47750.7809,
"train_samples_per_second": 8.626,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1346520565678080.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}