{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.993049522154648,
"eval_steps": 500,
"global_step": 861,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03475238922675934,
"grad_norm": 4.204217083914466,
"learning_rate": 5e-06,
"loss": 1.0615,
"step": 10
},
{
"epoch": 0.06950477845351868,
"grad_norm": 1.422784268276639,
"learning_rate": 5e-06,
"loss": 0.9179,
"step": 20
},
{
"epoch": 0.10425716768027801,
"grad_norm": 1.7251757262516607,
"learning_rate": 5e-06,
"loss": 0.8716,
"step": 30
},
{
"epoch": 0.13900955690703737,
"grad_norm": 1.1547403296199443,
"learning_rate": 5e-06,
"loss": 0.8538,
"step": 40
},
{
"epoch": 0.1737619461337967,
"grad_norm": 1.3147732137734534,
"learning_rate": 5e-06,
"loss": 0.8315,
"step": 50
},
{
"epoch": 0.20851433536055602,
"grad_norm": 1.2212040088296596,
"learning_rate": 5e-06,
"loss": 0.8137,
"step": 60
},
{
"epoch": 0.24326672458731538,
"grad_norm": 1.0105503239364757,
"learning_rate": 5e-06,
"loss": 0.8079,
"step": 70
},
{
"epoch": 0.27801911381407474,
"grad_norm": 1.0559222477057983,
"learning_rate": 5e-06,
"loss": 0.794,
"step": 80
},
{
"epoch": 0.31277150304083406,
"grad_norm": 0.7128592380359542,
"learning_rate": 5e-06,
"loss": 0.79,
"step": 90
},
{
"epoch": 0.3475238922675934,
"grad_norm": 0.7316521398436672,
"learning_rate": 5e-06,
"loss": 0.7808,
"step": 100
},
{
"epoch": 0.3822762814943527,
"grad_norm": 0.9194116933386736,
"learning_rate": 5e-06,
"loss": 0.7743,
"step": 110
},
{
"epoch": 0.41702867072111205,
"grad_norm": 0.724708916285198,
"learning_rate": 5e-06,
"loss": 0.7727,
"step": 120
},
{
"epoch": 0.45178105994787143,
"grad_norm": 0.6304484292299692,
"learning_rate": 5e-06,
"loss": 0.7708,
"step": 130
},
{
"epoch": 0.48653344917463076,
"grad_norm": 0.8580390763526664,
"learning_rate": 5e-06,
"loss": 0.7663,
"step": 140
},
{
"epoch": 0.5212858384013901,
"grad_norm": 0.680533196209476,
"learning_rate": 5e-06,
"loss": 0.7666,
"step": 150
},
{
"epoch": 0.5560382276281495,
"grad_norm": 0.5754590021742806,
"learning_rate": 5e-06,
"loss": 0.7677,
"step": 160
},
{
"epoch": 0.5907906168549087,
"grad_norm": 0.6990861984775781,
"learning_rate": 5e-06,
"loss": 0.7586,
"step": 170
},
{
"epoch": 0.6255430060816681,
"grad_norm": 0.8182472984882369,
"learning_rate": 5e-06,
"loss": 0.7581,
"step": 180
},
{
"epoch": 0.6602953953084274,
"grad_norm": 0.8231305567479303,
"learning_rate": 5e-06,
"loss": 0.7578,
"step": 190
},
{
"epoch": 0.6950477845351868,
"grad_norm": 0.6513447002925578,
"learning_rate": 5e-06,
"loss": 0.755,
"step": 200
},
{
"epoch": 0.7298001737619462,
"grad_norm": 0.5789033103066399,
"learning_rate": 5e-06,
"loss": 0.756,
"step": 210
},
{
"epoch": 0.7645525629887054,
"grad_norm": 0.6090095361122515,
"learning_rate": 5e-06,
"loss": 0.7547,
"step": 220
},
{
"epoch": 0.7993049522154648,
"grad_norm": 0.5529700900735016,
"learning_rate": 5e-06,
"loss": 0.7511,
"step": 230
},
{
"epoch": 0.8340573414422241,
"grad_norm": 0.6891665664865682,
"learning_rate": 5e-06,
"loss": 0.7536,
"step": 240
},
{
"epoch": 0.8688097306689835,
"grad_norm": 0.8354216869977991,
"learning_rate": 5e-06,
"loss": 0.7495,
"step": 250
},
{
"epoch": 0.9035621198957429,
"grad_norm": 0.750227289167969,
"learning_rate": 5e-06,
"loss": 0.7473,
"step": 260
},
{
"epoch": 0.9383145091225021,
"grad_norm": 0.590848403292903,
"learning_rate": 5e-06,
"loss": 0.7475,
"step": 270
},
{
"epoch": 0.9730668983492615,
"grad_norm": 0.7308589738950354,
"learning_rate": 5e-06,
"loss": 0.7466,
"step": 280
},
{
"epoch": 0.9973935708079931,
"eval_loss": 0.7464137077331543,
"eval_runtime": 306.0354,
"eval_samples_per_second": 25.33,
"eval_steps_per_second": 0.399,
"step": 287
},
{
"epoch": 1.0082536924413554,
"grad_norm": 0.8888839481114714,
"learning_rate": 5e-06,
"loss": 0.7656,
"step": 290
},
{
"epoch": 1.0430060816681146,
"grad_norm": 0.7736290679704527,
"learning_rate": 5e-06,
"loss": 0.6927,
"step": 300
},
{
"epoch": 1.077758470894874,
"grad_norm": 0.6086798023669154,
"learning_rate": 5e-06,
"loss": 0.6937,
"step": 310
},
{
"epoch": 1.1125108601216334,
"grad_norm": 0.7277120931979707,
"learning_rate": 5e-06,
"loss": 0.6956,
"step": 320
},
{
"epoch": 1.1472632493483927,
"grad_norm": 0.6477654174773422,
"learning_rate": 5e-06,
"loss": 0.6945,
"step": 330
},
{
"epoch": 1.1820156385751521,
"grad_norm": 0.6746834402435494,
"learning_rate": 5e-06,
"loss": 0.6928,
"step": 340
},
{
"epoch": 1.2167680278019113,
"grad_norm": 0.7909338588584797,
"learning_rate": 5e-06,
"loss": 0.6975,
"step": 350
},
{
"epoch": 1.2515204170286707,
"grad_norm": 0.6996994953219825,
"learning_rate": 5e-06,
"loss": 0.693,
"step": 360
},
{
"epoch": 1.28627280625543,
"grad_norm": 0.7572207346579433,
"learning_rate": 5e-06,
"loss": 0.6919,
"step": 370
},
{
"epoch": 1.3210251954821894,
"grad_norm": 0.6315378764527892,
"learning_rate": 5e-06,
"loss": 0.7012,
"step": 380
},
{
"epoch": 1.3557775847089486,
"grad_norm": 0.7377025919254079,
"learning_rate": 5e-06,
"loss": 0.6976,
"step": 390
},
{
"epoch": 1.390529973935708,
"grad_norm": 0.6170787424269338,
"learning_rate": 5e-06,
"loss": 0.6923,
"step": 400
},
{
"epoch": 1.4252823631624674,
"grad_norm": 0.6720319783859334,
"learning_rate": 5e-06,
"loss": 0.6957,
"step": 410
},
{
"epoch": 1.4600347523892268,
"grad_norm": 0.5493772411957378,
"learning_rate": 5e-06,
"loss": 0.6945,
"step": 420
},
{
"epoch": 1.4947871416159861,
"grad_norm": 0.6013975213270655,
"learning_rate": 5e-06,
"loss": 0.6929,
"step": 430
},
{
"epoch": 1.5295395308427455,
"grad_norm": 0.6939858737269444,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 440
},
{
"epoch": 1.564291920069505,
"grad_norm": 0.5880706906541838,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 450
},
{
"epoch": 1.599044309296264,
"grad_norm": 0.7548284182746593,
"learning_rate": 5e-06,
"loss": 0.6902,
"step": 460
},
{
"epoch": 1.6337966985230234,
"grad_norm": 0.6183278356571063,
"learning_rate": 5e-06,
"loss": 0.6925,
"step": 470
},
{
"epoch": 1.6685490877497828,
"grad_norm": 0.9398095529537541,
"learning_rate": 5e-06,
"loss": 0.6857,
"step": 480
},
{
"epoch": 1.703301476976542,
"grad_norm": 0.6367587092612689,
"learning_rate": 5e-06,
"loss": 0.689,
"step": 490
},
{
"epoch": 1.7380538662033014,
"grad_norm": 0.7639239518306171,
"learning_rate": 5e-06,
"loss": 0.6939,
"step": 500
},
{
"epoch": 1.7728062554300608,
"grad_norm": 0.706056725045481,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 510
},
{
"epoch": 1.8075586446568201,
"grad_norm": 0.7327907255249769,
"learning_rate": 5e-06,
"loss": 0.6913,
"step": 520
},
{
"epoch": 1.8423110338835795,
"grad_norm": 0.7090737638783119,
"learning_rate": 5e-06,
"loss": 0.687,
"step": 530
},
{
"epoch": 1.877063423110339,
"grad_norm": 0.641981559236637,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 540
},
{
"epoch": 1.9118158123370983,
"grad_norm": 0.6003290685340094,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 550
},
{
"epoch": 1.9465682015638577,
"grad_norm": 0.5991204401446062,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 560
},
{
"epoch": 1.9813205907906168,
"grad_norm": 0.6603357286091912,
"learning_rate": 5e-06,
"loss": 0.693,
"step": 570
},
{
"epoch": 1.9986967854039965,
"eval_loss": 0.7325075268745422,
"eval_runtime": 306.5804,
"eval_samples_per_second": 25.285,
"eval_steps_per_second": 0.398,
"step": 575
},
{
"epoch": 2.016507384882711,
"grad_norm": 0.9010986091915655,
"learning_rate": 5e-06,
"loss": 0.694,
"step": 580
},
{
"epoch": 2.05125977410947,
"grad_norm": 0.6963098532911728,
"learning_rate": 5e-06,
"loss": 0.6416,
"step": 590
},
{
"epoch": 2.086012163336229,
"grad_norm": 0.7981128982665585,
"learning_rate": 5e-06,
"loss": 0.6347,
"step": 600
},
{
"epoch": 2.1207645525629886,
"grad_norm": 0.7045115982630096,
"learning_rate": 5e-06,
"loss": 0.6382,
"step": 610
},
{
"epoch": 2.155516941789748,
"grad_norm": 0.6178456961772154,
"learning_rate": 5e-06,
"loss": 0.6316,
"step": 620
},
{
"epoch": 2.1902693310165073,
"grad_norm": 0.6984364299074612,
"learning_rate": 5e-06,
"loss": 0.64,
"step": 630
},
{
"epoch": 2.2250217202432667,
"grad_norm": 0.6153473997901224,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 640
},
{
"epoch": 2.259774109470026,
"grad_norm": 0.7551289583137311,
"learning_rate": 5e-06,
"loss": 0.6415,
"step": 650
},
{
"epoch": 2.2945264986967855,
"grad_norm": 1.0727937828194958,
"learning_rate": 5e-06,
"loss": 0.6409,
"step": 660
},
{
"epoch": 2.329278887923545,
"grad_norm": 0.6961000957869337,
"learning_rate": 5e-06,
"loss": 0.641,
"step": 670
},
{
"epoch": 2.3640312771503043,
"grad_norm": 1.5380586860018097,
"learning_rate": 5e-06,
"loss": 0.6343,
"step": 680
},
{
"epoch": 2.3987836663770636,
"grad_norm": 1.473915330175611,
"learning_rate": 5e-06,
"loss": 0.6424,
"step": 690
},
{
"epoch": 2.4335360556038226,
"grad_norm": 1.3707384922893442,
"learning_rate": 5e-06,
"loss": 0.6402,
"step": 700
},
{
"epoch": 2.468288444830582,
"grad_norm": 1.2856734997208332,
"learning_rate": 5e-06,
"loss": 0.6405,
"step": 710
},
{
"epoch": 2.5030408340573413,
"grad_norm": 1.0655758845498153,
"learning_rate": 5e-06,
"loss": 0.639,
"step": 720
},
{
"epoch": 2.5377932232841007,
"grad_norm": 0.7063563183382034,
"learning_rate": 5e-06,
"loss": 0.6451,
"step": 730
},
{
"epoch": 2.57254561251086,
"grad_norm": 1.0030690870134182,
"learning_rate": 5e-06,
"loss": 0.6451,
"step": 740
},
{
"epoch": 2.6072980017376195,
"grad_norm": 0.642378453539176,
"learning_rate": 5e-06,
"loss": 0.6452,
"step": 750
},
{
"epoch": 2.642050390964379,
"grad_norm": 0.5960942912485575,
"learning_rate": 5e-06,
"loss": 0.6435,
"step": 760
},
{
"epoch": 2.6768027801911383,
"grad_norm": 0.6526728525085794,
"learning_rate": 5e-06,
"loss": 0.6393,
"step": 770
},
{
"epoch": 2.711555169417897,
"grad_norm": 0.7230688644533029,
"learning_rate": 5e-06,
"loss": 0.6416,
"step": 780
},
{
"epoch": 2.7463075586446566,
"grad_norm": 0.7601768679175701,
"learning_rate": 5e-06,
"loss": 0.6413,
"step": 790
},
{
"epoch": 2.781059947871416,
"grad_norm": 0.9110327754774364,
"learning_rate": 5e-06,
"loss": 0.6441,
"step": 800
},
{
"epoch": 2.8158123370981754,
"grad_norm": 0.8925826699350575,
"learning_rate": 5e-06,
"loss": 0.6432,
"step": 810
},
{
"epoch": 2.8505647263249347,
"grad_norm": 0.6237089071073183,
"learning_rate": 5e-06,
"loss": 0.644,
"step": 820
},
{
"epoch": 2.885317115551694,
"grad_norm": 0.6110859257293438,
"learning_rate": 5e-06,
"loss": 0.6399,
"step": 830
},
{
"epoch": 2.9200695047784535,
"grad_norm": 0.6421968773513084,
"learning_rate": 5e-06,
"loss": 0.6454,
"step": 840
},
{
"epoch": 2.954821894005213,
"grad_norm": 0.5712412822442149,
"learning_rate": 5e-06,
"loss": 0.6416,
"step": 850
},
{
"epoch": 2.9895742832319723,
"grad_norm": 0.5672582221675043,
"learning_rate": 5e-06,
"loss": 0.644,
"step": 860
},
{
"epoch": 2.993049522154648,
"eval_loss": 0.7338809370994568,
"eval_runtime": 308.0146,
"eval_samples_per_second": 25.168,
"eval_steps_per_second": 0.396,
"step": 861
},
{
"epoch": 2.993049522154648,
"step": 861,
"total_flos": 1441997688668160.0,
"train_loss": 0.7089061699677843,
"train_runtime": 50775.3746,
"train_samples_per_second": 8.702,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 861,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1441997688668160.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}