{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.995667244367418,
"eval_steps": 500,
"global_step": 864,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03466204506065858,
"grad_norm": 1.8127493746878947,
"learning_rate": 5e-06,
"loss": 1.0247,
"step": 10
},
{
"epoch": 0.06932409012131716,
"grad_norm": 1.892292516893664,
"learning_rate": 5e-06,
"loss": 0.9061,
"step": 20
},
{
"epoch": 0.10398613518197573,
"grad_norm": 2.439531384426821,
"learning_rate": 5e-06,
"loss": 0.8729,
"step": 30
},
{
"epoch": 0.1386481802426343,
"grad_norm": 1.0105961240246888,
"learning_rate": 5e-06,
"loss": 0.8422,
"step": 40
},
{
"epoch": 0.1733102253032929,
"grad_norm": 1.0541427004735089,
"learning_rate": 5e-06,
"loss": 0.8226,
"step": 50
},
{
"epoch": 0.20797227036395147,
"grad_norm": 1.185839497666169,
"learning_rate": 5e-06,
"loss": 0.8035,
"step": 60
},
{
"epoch": 0.24263431542461006,
"grad_norm": 1.1025425925379357,
"learning_rate": 5e-06,
"loss": 0.7971,
"step": 70
},
{
"epoch": 0.2772963604852686,
"grad_norm": 1.0489992399782422,
"learning_rate": 5e-06,
"loss": 0.7891,
"step": 80
},
{
"epoch": 0.3119584055459272,
"grad_norm": 0.668603535180152,
"learning_rate": 5e-06,
"loss": 0.7823,
"step": 90
},
{
"epoch": 0.3466204506065858,
"grad_norm": 0.569376657027902,
"learning_rate": 5e-06,
"loss": 0.7697,
"step": 100
},
{
"epoch": 0.38128249566724437,
"grad_norm": 0.6909448594076529,
"learning_rate": 5e-06,
"loss": 0.7641,
"step": 110
},
{
"epoch": 0.41594454072790293,
"grad_norm": 0.6532682299793838,
"learning_rate": 5e-06,
"loss": 0.7695,
"step": 120
},
{
"epoch": 0.4506065857885615,
"grad_norm": 0.9741442291789676,
"learning_rate": 5e-06,
"loss": 0.7702,
"step": 130
},
{
"epoch": 0.4852686308492201,
"grad_norm": 0.8963697523822133,
"learning_rate": 5e-06,
"loss": 0.7635,
"step": 140
},
{
"epoch": 0.5199306759098787,
"grad_norm": 0.7797154633763044,
"learning_rate": 5e-06,
"loss": 0.7577,
"step": 150
},
{
"epoch": 0.5545927209705372,
"grad_norm": 0.8410605236589601,
"learning_rate": 5e-06,
"loss": 0.7597,
"step": 160
},
{
"epoch": 0.5892547660311959,
"grad_norm": 0.7051595274843617,
"learning_rate": 5e-06,
"loss": 0.752,
"step": 170
},
{
"epoch": 0.6239168110918544,
"grad_norm": 0.6800181939208395,
"learning_rate": 5e-06,
"loss": 0.7527,
"step": 180
},
{
"epoch": 0.658578856152513,
"grad_norm": 0.7986625471943152,
"learning_rate": 5e-06,
"loss": 0.7491,
"step": 190
},
{
"epoch": 0.6932409012131716,
"grad_norm": 0.8468221058427845,
"learning_rate": 5e-06,
"loss": 0.7471,
"step": 200
},
{
"epoch": 0.7279029462738301,
"grad_norm": 0.7527636890957969,
"learning_rate": 5e-06,
"loss": 0.7488,
"step": 210
},
{
"epoch": 0.7625649913344887,
"grad_norm": 0.672904711451661,
"learning_rate": 5e-06,
"loss": 0.744,
"step": 220
},
{
"epoch": 0.7972270363951474,
"grad_norm": 0.9298264839873263,
"learning_rate": 5e-06,
"loss": 0.7438,
"step": 230
},
{
"epoch": 0.8318890814558059,
"grad_norm": 0.6925885250176548,
"learning_rate": 5e-06,
"loss": 0.7402,
"step": 240
},
{
"epoch": 0.8665511265164645,
"grad_norm": 0.6976668007067893,
"learning_rate": 5e-06,
"loss": 0.7449,
"step": 250
},
{
"epoch": 0.901213171577123,
"grad_norm": 0.7134513511376641,
"learning_rate": 5e-06,
"loss": 0.7378,
"step": 260
},
{
"epoch": 0.9358752166377816,
"grad_norm": 0.5758590804698668,
"learning_rate": 5e-06,
"loss": 0.7439,
"step": 270
},
{
"epoch": 0.9705372616984402,
"grad_norm": 0.7076061848472048,
"learning_rate": 5e-06,
"loss": 0.7382,
"step": 280
},
{
"epoch": 0.9982668977469671,
"eval_loss": 0.7344536185264587,
"eval_runtime": 308.2237,
"eval_samples_per_second": 25.215,
"eval_steps_per_second": 0.396,
"step": 288
},
{
"epoch": 1.005632582322357,
"grad_norm": 0.7185681802232957,
"learning_rate": 5e-06,
"loss": 0.779,
"step": 290
},
{
"epoch": 1.0402946273830156,
"grad_norm": 0.9393905325717241,
"learning_rate": 5e-06,
"loss": 0.6889,
"step": 300
},
{
"epoch": 1.074956672443674,
"grad_norm": 0.8787089784301063,
"learning_rate": 5e-06,
"loss": 0.687,
"step": 310
},
{
"epoch": 1.1096187175043328,
"grad_norm": 0.7560092649402328,
"learning_rate": 5e-06,
"loss": 0.6872,
"step": 320
},
{
"epoch": 1.1442807625649913,
"grad_norm": 0.6643286211815734,
"learning_rate": 5e-06,
"loss": 0.6858,
"step": 330
},
{
"epoch": 1.1789428076256498,
"grad_norm": 0.7127668776455044,
"learning_rate": 5e-06,
"loss": 0.684,
"step": 340
},
{
"epoch": 1.2136048526863086,
"grad_norm": 0.655292316893117,
"learning_rate": 5e-06,
"loss": 0.6855,
"step": 350
},
{
"epoch": 1.248266897746967,
"grad_norm": 0.8839088016848645,
"learning_rate": 5e-06,
"loss": 0.686,
"step": 360
},
{
"epoch": 1.2829289428076256,
"grad_norm": 0.624864756502428,
"learning_rate": 5e-06,
"loss": 0.6819,
"step": 370
},
{
"epoch": 1.317590987868284,
"grad_norm": 0.7439571552243042,
"learning_rate": 5e-06,
"loss": 0.6851,
"step": 380
},
{
"epoch": 1.3522530329289428,
"grad_norm": 0.5854034874524795,
"learning_rate": 5e-06,
"loss": 0.6868,
"step": 390
},
{
"epoch": 1.3869150779896013,
"grad_norm": 0.6734106560005542,
"learning_rate": 5e-06,
"loss": 0.6834,
"step": 400
},
{
"epoch": 1.4215771230502598,
"grad_norm": 0.6926581209135775,
"learning_rate": 5e-06,
"loss": 0.6832,
"step": 410
},
{
"epoch": 1.4562391681109186,
"grad_norm": 1.1324386970749247,
"learning_rate": 5e-06,
"loss": 0.6842,
"step": 420
},
{
"epoch": 1.490901213171577,
"grad_norm": 0.7226777314119034,
"learning_rate": 5e-06,
"loss": 0.6844,
"step": 430
},
{
"epoch": 1.5255632582322356,
"grad_norm": 0.7481904787146205,
"learning_rate": 5e-06,
"loss": 0.6791,
"step": 440
},
{
"epoch": 1.5602253032928943,
"grad_norm": 0.6135505957665759,
"learning_rate": 5e-06,
"loss": 0.6817,
"step": 450
},
{
"epoch": 1.5948873483535528,
"grad_norm": 0.7553340277380959,
"learning_rate": 5e-06,
"loss": 0.684,
"step": 460
},
{
"epoch": 1.6295493934142113,
"grad_norm": 0.7233556793224363,
"learning_rate": 5e-06,
"loss": 0.681,
"step": 470
},
{
"epoch": 1.66421143847487,
"grad_norm": 0.5547213886367687,
"learning_rate": 5e-06,
"loss": 0.6806,
"step": 480
},
{
"epoch": 1.6988734835355286,
"grad_norm": 0.6625866861543885,
"learning_rate": 5e-06,
"loss": 0.6792,
"step": 490
},
{
"epoch": 1.733535528596187,
"grad_norm": 0.8682937684926717,
"learning_rate": 5e-06,
"loss": 0.6789,
"step": 500
},
{
"epoch": 1.7681975736568458,
"grad_norm": 0.6685275937902929,
"learning_rate": 5e-06,
"loss": 0.6822,
"step": 510
},
{
"epoch": 1.8028596187175043,
"grad_norm": 1.0295956431263236,
"learning_rate": 5e-06,
"loss": 0.6825,
"step": 520
},
{
"epoch": 1.8375216637781628,
"grad_norm": 0.784814610980589,
"learning_rate": 5e-06,
"loss": 0.6769,
"step": 530
},
{
"epoch": 1.8721837088388216,
"grad_norm": 0.7570247170470147,
"learning_rate": 5e-06,
"loss": 0.6782,
"step": 540
},
{
"epoch": 1.90684575389948,
"grad_norm": 0.5807065830422653,
"learning_rate": 5e-06,
"loss": 0.6846,
"step": 550
},
{
"epoch": 1.9415077989601386,
"grad_norm": 0.6301636959503909,
"learning_rate": 5e-06,
"loss": 0.6787,
"step": 560
},
{
"epoch": 1.9761698440207973,
"grad_norm": 0.6686036844283325,
"learning_rate": 5e-06,
"loss": 0.6785,
"step": 570
},
{
"epoch": 1.9969670710571923,
"eval_loss": 0.7195846438407898,
"eval_runtime": 306.775,
"eval_samples_per_second": 25.335,
"eval_steps_per_second": 0.398,
"step": 576
},
{
"epoch": 2.011265164644714,
"grad_norm": 1.229519224308769,
"learning_rate": 5e-06,
"loss": 0.7064,
"step": 580
},
{
"epoch": 2.0459272097053725,
"grad_norm": 0.9032143353484438,
"learning_rate": 5e-06,
"loss": 0.6269,
"step": 590
},
{
"epoch": 2.080589254766031,
"grad_norm": 0.7420064693943627,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 600
},
{
"epoch": 2.11525129982669,
"grad_norm": 1.2914353849457911,
"learning_rate": 5e-06,
"loss": 0.6251,
"step": 610
},
{
"epoch": 2.149913344887348,
"grad_norm": 0.7778946515270286,
"learning_rate": 5e-06,
"loss": 0.6274,
"step": 620
},
{
"epoch": 2.184575389948007,
"grad_norm": 0.7043162671127772,
"learning_rate": 5e-06,
"loss": 0.6267,
"step": 630
},
{
"epoch": 2.2192374350086657,
"grad_norm": 0.5973234209878199,
"learning_rate": 5e-06,
"loss": 0.6284,
"step": 640
},
{
"epoch": 2.253899480069324,
"grad_norm": 0.666095113544406,
"learning_rate": 5e-06,
"loss": 0.6334,
"step": 650
},
{
"epoch": 2.2885615251299827,
"grad_norm": 0.6767024363263829,
"learning_rate": 5e-06,
"loss": 0.6292,
"step": 660
},
{
"epoch": 2.3232235701906414,
"grad_norm": 0.5737190679416464,
"learning_rate": 5e-06,
"loss": 0.6299,
"step": 670
},
{
"epoch": 2.3578856152512997,
"grad_norm": 0.5600750074108755,
"learning_rate": 5e-06,
"loss": 0.6342,
"step": 680
},
{
"epoch": 2.3925476603119584,
"grad_norm": 0.5910347547974553,
"learning_rate": 5e-06,
"loss": 0.6315,
"step": 690
},
{
"epoch": 2.427209705372617,
"grad_norm": 0.6226740928701757,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 700
},
{
"epoch": 2.4618717504332754,
"grad_norm": 0.6210136062823411,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 710
},
{
"epoch": 2.496533795493934,
"grad_norm": 0.5748749993993215,
"learning_rate": 5e-06,
"loss": 0.6315,
"step": 720
},
{
"epoch": 2.5311958405545925,
"grad_norm": 0.6967001634339309,
"learning_rate": 5e-06,
"loss": 0.6362,
"step": 730
},
{
"epoch": 2.565857885615251,
"grad_norm": 0.6258079849864094,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 740
},
{
"epoch": 2.60051993067591,
"grad_norm": 0.6125604920957239,
"learning_rate": 5e-06,
"loss": 0.6285,
"step": 750
},
{
"epoch": 2.635181975736568,
"grad_norm": 0.5972379433259742,
"learning_rate": 5e-06,
"loss": 0.6339,
"step": 760
},
{
"epoch": 2.669844020797227,
"grad_norm": 0.6758633252723798,
"learning_rate": 5e-06,
"loss": 0.6326,
"step": 770
},
{
"epoch": 2.7045060658578857,
"grad_norm": 0.6607811157555928,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 780
},
{
"epoch": 2.739168110918544,
"grad_norm": 0.7251327172929152,
"learning_rate": 5e-06,
"loss": 0.6253,
"step": 790
},
{
"epoch": 2.7738301559792027,
"grad_norm": 0.5734616475373774,
"learning_rate": 5e-06,
"loss": 0.6328,
"step": 800
},
{
"epoch": 2.8084922010398614,
"grad_norm": 0.5940604342669007,
"learning_rate": 5e-06,
"loss": 0.6346,
"step": 810
},
{
"epoch": 2.8431542461005197,
"grad_norm": 0.6989887403612659,
"learning_rate": 5e-06,
"loss": 0.6331,
"step": 820
},
{
"epoch": 2.8778162911611784,
"grad_norm": 0.592871012328308,
"learning_rate": 5e-06,
"loss": 0.6274,
"step": 830
},
{
"epoch": 2.912478336221837,
"grad_norm": 0.7052513186995701,
"learning_rate": 5e-06,
"loss": 0.632,
"step": 840
},
{
"epoch": 2.9471403812824954,
"grad_norm": 0.6220289067550866,
"learning_rate": 5e-06,
"loss": 0.6307,
"step": 850
},
{
"epoch": 2.981802426343154,
"grad_norm": 0.6904590828336521,
"learning_rate": 5e-06,
"loss": 0.6293,
"step": 860
},
{
"epoch": 2.995667244367418,
"eval_loss": 0.720524787902832,
"eval_runtime": 307.7205,
"eval_samples_per_second": 25.257,
"eval_steps_per_second": 0.396,
"step": 864
},
{
"epoch": 2.995667244367418,
"step": 864,
"total_flos": 1447022800404480.0,
"train_loss": 0.6994864764036955,
"train_runtime": 51001.1529,
"train_samples_per_second": 8.686,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 864,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1447022800404480.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}