collect1_test / trainer_state.json

init

e067ea8 verified 11 months ago

17.1 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 9.90625,
	"eval_steps": 500,
	"global_step": 951,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.10416666666666667,
	"grad_norm": 2.861802577972412,
	"learning_rate": 4.166666666666667e-05,
	"loss": 0.9682,
	"step": 10
	},
	{
	"epoch": 0.20833333333333334,
	"grad_norm": 3.0431478023529053,
	"learning_rate": 8.333333333333334e-05,
	"loss": 0.3898,
	"step": 20
	},
	{
	"epoch": 0.3125,
	"grad_norm": 1.3093934059143066,
	"learning_rate": 0.000125,
	"loss": 0.2177,
	"step": 30
	},
	{
	"epoch": 0.4166666666666667,
	"grad_norm": 0.8621488809585571,
	"learning_rate": 0.0001666666666666667,
	"loss": 0.1536,
	"step": 40
	},
	{
	"epoch": 0.5208333333333334,
	"grad_norm": 0.9496772289276123,
	"learning_rate": 0.00019999757923579923,
	"loss": 0.1156,
	"step": 50
	},
	{
	"epoch": 0.625,
	"grad_norm": 1.193084716796875,
	"learning_rate": 0.00019991286479434454,
	"loss": 0.1079,
	"step": 60
	},
	{
	"epoch": 0.7291666666666666,
	"grad_norm": 0.9372045397758484,
	"learning_rate": 0.00019970722931933287,
	"loss": 0.0979,
	"step": 70
	},
	{
	"epoch": 0.8333333333333334,
	"grad_norm": 1.1392306089401245,
	"learning_rate": 0.0001993809216841623,
	"loss": 0.0873,
	"step": 80
	},
	{
	"epoch": 0.9375,
	"grad_norm": 0.5865321159362793,
	"learning_rate": 0.00019893433680751103,
	"loss": 0.0788,
	"step": 90
	},
	{
	"epoch": 1.0416666666666667,
	"grad_norm": 0.41245004534721375,
	"learning_rate": 0.00019836801517538125,
	"loss": 0.0613,
	"step": 100
	},
	{
	"epoch": 1.1458333333333333,
	"grad_norm": 0.5878641605377197,
	"learning_rate": 0.00019768264218696772,
	"loss": 0.0642,
	"step": 110
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.4363374710083008,
	"learning_rate": 0.0001968790473251434,
	"loss": 0.0544,
	"step": 120
	},
	{
	"epoch": 1.3541666666666667,
	"grad_norm": 0.4858635663986206,
	"learning_rate": 0.0001959582031525653,
	"loss": 0.0524,
	"step": 130
	},
	{
	"epoch": 1.4583333333333333,
	"grad_norm": 0.6191464066505432,
	"learning_rate": 0.00019492122413461603,
	"loss": 0.0555,
	"step": 140
	},
	{
	"epoch": 1.5625,
	"grad_norm": 0.3185575008392334,
	"learning_rate": 0.00019376936529060554,
	"loss": 0.0494,
	"step": 150
	},
	{
	"epoch": 1.6666666666666665,
	"grad_norm": 0.266720175743103,
	"learning_rate": 0.00019250402067486522,
	"loss": 0.0474,
	"step": 160
	},
	{
	"epoch": 1.7708333333333335,
	"grad_norm": 0.24431076645851135,
	"learning_rate": 0.00019112672168957292,
	"loss": 0.0462,
	"step": 170
	},
	{
	"epoch": 1.875,
	"grad_norm": 0.2719191014766693,
	"learning_rate": 0.0001896391352313506,
	"loss": 0.0395,
	"step": 180
	},
	{
	"epoch": 1.9791666666666665,
	"grad_norm": 0.40286824107170105,
	"learning_rate": 0.00018804306167387796,
	"loss": 0.0449,
	"step": 190
	},
	{
	"epoch": 2.0833333333333335,
	"grad_norm": 0.4218553304672241,
	"learning_rate": 0.00018634043268896323,
	"loss": 0.0361,
	"step": 200
	},
	{
	"epoch": 2.1875,
	"grad_norm": 0.40750882029533386,
	"learning_rate": 0.00018453330890870855,
	"loss": 0.0418,
	"step": 210
	},
	{
	"epoch": 2.2916666666666665,
	"grad_norm": 1.9637465476989746,
	"learning_rate": 0.0001826238774315995,
	"loss": 0.0925,
	"step": 220
	},
	{
	"epoch": 2.3958333333333335,
	"grad_norm": 0.2965734004974365,
	"learning_rate": 0.00018061444917553629,
	"loss": 0.052,
	"step": 230
	},
	{
	"epoch": 2.5,
	"grad_norm": 0.4165923297405243,
	"learning_rate": 0.0001785074560810111,
	"loss": 0.0512,
	"step": 240
	},
	{
	"epoch": 2.6041666666666665,
	"grad_norm": 0.44070684909820557,
	"learning_rate": 0.00017630544816781577,
	"loss": 0.0418,
	"step": 250
	},
	{
	"epoch": 2.7083333333333335,
	"grad_norm": 0.3902498185634613,
	"learning_rate": 0.00017401109044884246,
	"loss": 0.0423,
	"step": 260
	},
	{
	"epoch": 2.8125,
	"grad_norm": 0.458552747964859,
	"learning_rate": 0.0001716271597047119,
	"loss": 0.0398,
	"step": 270
	},
	{
	"epoch": 2.9166666666666665,
	"grad_norm": 0.3544536530971527,
	"learning_rate": 0.00016915654112313345,
	"loss": 0.0376,
	"step": 280
	},
	{
	"epoch": 3.0208333333333335,
	"grad_norm": 0.5818161368370056,
	"learning_rate": 0.00016660222480706355,
	"loss": 0.0447,
	"step": 290
	},
	{
	"epoch": 3.125,
	"grad_norm": 0.35342398285865784,
	"learning_rate": 0.00016396730215588915,
	"loss": 0.0401,
	"step": 300
	},
	{
	"epoch": 3.2291666666666665,
	"grad_norm": 0.28917449712753296,
	"learning_rate": 0.0001612549621240154,
	"loss": 0.0447,
	"step": 310
	},
	{
	"epoch": 3.3333333333333335,
	"grad_norm": 0.34957313537597656,
	"learning_rate": 0.00015846848736138623,
	"loss": 0.034,
	"step": 320
	},
	{
	"epoch": 3.4375,
	"grad_norm": 0.2229030877351761,
	"learning_rate": 0.00015561125024060826,
	"loss": 0.0351,
	"step": 330
	},
	{
	"epoch": 3.5416666666666665,
	"grad_norm": 0.1731082648038864,
	"learning_rate": 0.00015268670877548648,
	"loss": 0.0369,
	"step": 340
	},
	{
	"epoch": 3.6458333333333335,
	"grad_norm": 0.33026209473609924,
	"learning_rate": 0.00014969840243591177,
	"loss": 0.0324,
	"step": 350
	},
	{
	"epoch": 3.75,
	"grad_norm": 0.22994904220104218,
	"learning_rate": 0.0001466499478641644,
	"loss": 0.0377,
	"step": 360
	},
	{
	"epoch": 3.8541666666666665,
	"grad_norm": 0.25043389201164246,
	"learning_rate": 0.00014354503449781912,
	"loss": 0.0334,
	"step": 370
	},
	{
	"epoch": 3.9583333333333335,
	"grad_norm": 0.26702672243118286,
	"learning_rate": 0.00014038742010454814,
	"loss": 0.0311,
	"step": 380
	},
	{
	"epoch": 4.0625,
	"grad_norm": 0.28817203640937805,
	"learning_rate": 0.00013718092623422686,
	"loss": 0.0339,
	"step": 390
	},
	{
	"epoch": 4.166666666666667,
	"grad_norm": 0.32753250002861023,
	"learning_rate": 0.00013392943359384624,
	"loss": 0.0313,
	"step": 400
	},
	{
	"epoch": 4.270833333333333,
	"grad_norm": 0.26729685068130493,
	"learning_rate": 0.00013063687735082933,
	"loss": 0.0354,
	"step": 410
	},
	{
	"epoch": 4.375,
	"grad_norm": 0.25145024061203003,
	"learning_rate": 0.00012730724237043615,
	"loss": 0.0316,
	"step": 420
	},
	{
	"epoch": 4.479166666666667,
	"grad_norm": 0.21019020676612854,
	"learning_rate": 0.00012394455839302113,
	"loss": 0.0341,
	"step": 430
	},
	{
	"epoch": 4.583333333333333,
	"grad_norm": 0.25795239210128784,
	"learning_rate": 0.00012055289515698007,
	"loss": 0.0309,
	"step": 440
	},
	{
	"epoch": 4.6875,
	"grad_norm": 0.34662681818008423,
	"learning_rate": 0.00011713635747328818,
	"loss": 0.0274,
	"step": 450
	},
	{
	"epoch": 4.791666666666667,
	"grad_norm": 0.26628291606903076,
	"learning_rate": 0.00011369908025759167,
	"loss": 0.028,
	"step": 460
	},
	{
	"epoch": 4.895833333333333,
	"grad_norm": 0.38035184144973755,
	"learning_rate": 0.00011024522352586452,
	"loss": 0.0273,
	"step": 470
	},
	{
	"epoch": 5.0,
	"grad_norm": 0.515480637550354,
	"learning_rate": 0.00010677896735968693,
	"loss": 0.028,
	"step": 480
	},
	{
	"epoch": 5.104166666666667,
	"grad_norm": 0.2834020256996155,
	"learning_rate": 0.00010330450684723955,
	"loss": 0.0241,
	"step": 490
	},
	{
	"epoch": 5.208333333333333,
	"grad_norm": 0.2605891823768616,
	"learning_rate": 9.982604700613529e-05,
	"loss": 0.0285,
	"step": 500
	},
	{
	"epoch": 5.3125,
	"grad_norm": 0.20520137250423431,
	"learning_rate": 9.63477976942341e-05,
	"loss": 0.0233,
	"step": 510
	},
	{
	"epoch": 5.416666666666667,
	"grad_norm": 0.34367069602012634,
	"learning_rate": 9.287396851460008e-05,
	"loss": 0.0248,
	"step": 520
	},
	{
	"epoch": 5.520833333333333,
	"grad_norm": 0.17647576332092285,
	"learning_rate": 8.940876372076603e-05,
	"loss": 0.0249,
	"step": 530
	},
	{
	"epoch": 5.625,
	"grad_norm": 0.18092995882034302,
	"learning_rate": 8.595637712847358e-05,
	"loss": 0.0299,
	"step": 540
	},
	{
	"epoch": 5.729166666666667,
	"grad_norm": 0.4016890823841095,
	"learning_rate": 8.252098704004479e-05,
	"loss": 0.0221,
	"step": 550
	},
	{
	"epoch": 5.833333333333333,
	"grad_norm": 0.20710507035255432,
	"learning_rate": 7.910675118752977e-05,
	"loss": 0.0227,
	"step": 560
	},
	{
	"epoch": 5.9375,
	"grad_norm": 0.16734477877616882,
	"learning_rate": 7.57178017007492e-05,
	"loss": 0.0272,
	"step": 570
	},
	{
	"epoch": 6.041666666666667,
	"grad_norm": 0.25709211826324463,
	"learning_rate": 7.235824010632283e-05,
	"loss": 0.0262,
	"step": 580
	},
	{
	"epoch": 6.145833333333333,
	"grad_norm": 0.20443040132522583,
	"learning_rate": 6.903213236373591e-05,
	"loss": 0.0248,
	"step": 590
	},
	{
	"epoch": 6.25,
	"grad_norm": 0.33397573232650757,
	"learning_rate": 6.574350394445074e-05,
	"loss": 0.0232,
	"step": 600
	},
	{
	"epoch": 6.354166666666667,
	"grad_norm": 0.29977497458457947,
	"learning_rate": 6.249633496002016e-05,
	"loss": 0.0259,
	"step": 610
	},
	{
	"epoch": 6.458333333333333,
	"grad_norm": 0.2301492542028427,
	"learning_rate": 5.929455534509818e-05,
	"loss": 0.0225,
	"step": 620
	},
	{
	"epoch": 6.5625,
	"grad_norm": 0.17564110457897186,
	"learning_rate": 5.614204010117785e-05,
	"loss": 0.0242,
	"step": 630
	},
	{
	"epoch": 6.666666666666667,
	"grad_norm": 0.22045820951461792,
	"learning_rate": 5.304260460681309e-05,
	"loss": 0.0241,
	"step": 640
	},
	{
	"epoch": 6.770833333333333,
	"grad_norm": 0.19701404869556427,
	"learning_rate": 5.000000000000002e-05,
	"loss": 0.0227,
	"step": 650
	},
	{
	"epoch": 6.875,
	"grad_norm": 0.16471460461616516,
	"learning_rate": 4.7017908638305995e-05,
	"loss": 0.0232,
	"step": 660
	},
	{
	"epoch": 6.979166666666667,
	"grad_norm": 0.19613026082515717,
	"learning_rate": 4.4099939642241795e-05,
	"loss": 0.0231,
	"step": 670
	},
	{
	"epoch": 7.083333333333333,
	"grad_norm": 0.2281058430671692,
	"learning_rate": 4.124962452726969e-05,
	"loss": 0.0194,
	"step": 680
	},
	{
	"epoch": 7.1875,
	"grad_norm": 0.33613333106040955,
	"learning_rate": 3.84704129297339e-05,
	"loss": 0.0189,
	"step": 690
	},
	{
	"epoch": 7.291666666666667,
	"grad_norm": 0.11981873214244843,
	"learning_rate": 3.576566843188729e-05,
	"loss": 0.0193,
	"step": 700
	},
	{
	"epoch": 7.395833333333333,
	"grad_norm": 0.14438898861408234,
	"learning_rate": 3.313866449106555e-05,
	"loss": 0.0195,
	"step": 710
	},
	{
	"epoch": 7.5,
	"grad_norm": 0.157650426030159,
	"learning_rate": 3.059258047793661e-05,
	"loss": 0.0213,
	"step": 720
	},
	{
	"epoch": 7.604166666666667,
	"grad_norm": 0.19574569165706635,
	"learning_rate": 2.8130497828620128e-05,
	"loss": 0.0213,
	"step": 730
	},
	{
	"epoch": 7.708333333333333,
	"grad_norm": 0.18361669778823853,
	"learning_rate": 2.5755396315333324e-05,
	"loss": 0.0195,
	"step": 740
	},
	{
	"epoch": 7.8125,
	"grad_norm": 0.17926767468452454,
	"learning_rate": 2.3470150440077266e-05,
	"loss": 0.0214,
	"step": 750
	},
	{
	"epoch": 7.916666666666667,
	"grad_norm": 0.14856334030628204,
	"learning_rate": 2.1277525955728138e-05,
	"loss": 0.0214,
	"step": 760
	},
	{
	"epoch": 8.020833333333334,
	"grad_norm": 0.2279294729232788,
	"learning_rate": 1.9180176518743476e-05,
	"loss": 0.0196,
	"step": 770
	},
	{
	"epoch": 8.125,
	"grad_norm": 0.17617039382457733,
	"learning_rate": 1.7180640477534847e-05,
	"loss": 0.0216,
	"step": 780
	},
	{
	"epoch": 8.229166666666666,
	"grad_norm": 0.09398578852415085,
	"learning_rate": 1.5281337800393968e-05,
	"loss": 0.0186,
	"step": 790
	},
	{
	"epoch": 8.333333333333334,
	"grad_norm": 0.14784985780715942,
	"learning_rate": 1.3484567146690009e-05,
	"loss": 0.018,
	"step": 800
	},
	{
	"epoch": 8.4375,
	"grad_norm": 0.18601654469966888,
	"learning_rate": 1.1792503084882789e-05,
	"loss": 0.0187,
	"step": 810
	},
	{
	"epoch": 8.541666666666666,
	"grad_norm": 0.22347238659858704,
	"learning_rate": 1.0207193460718856e-05,
	"loss": 0.0171,
	"step": 820
	},
	{
	"epoch": 8.645833333333334,
	"grad_norm": 0.16696269810199738,
	"learning_rate": 8.730556918795785e-06,
	"loss": 0.0171,
	"step": 830
	},
	{
	"epoch": 8.75,
	"grad_norm": 0.2363879382610321,
	"learning_rate": 7.364380580493813e-06,
	"loss": 0.0175,
	"step": 840
	},
	{
	"epoch": 8.854166666666666,
	"grad_norm": 0.14943011105060577,
	"learning_rate": 6.1103178810856364e-06,
	"loss": 0.0208,
	"step": 850
	},
	{
	"epoch": 8.958333333333334,
	"grad_norm": 0.18391437828540802,
	"learning_rate": 4.969886568641757e-06,
	"loss": 0.0172,
	"step": 860
	},
	{
	"epoch": 9.0625,
	"grad_norm": 0.17471212148666382,
	"learning_rate": 3.944466867153218e-06,
	"loss": 0.0199,
	"step": 870
	},
	{
	"epoch": 9.166666666666666,
	"grad_norm": 0.2174932211637497,
	"learning_rate": 3.0352998060949155e-06,
	"loss": 0.0173,
	"step": 880
	},
	{
	"epoch": 9.270833333333334,
	"grad_norm": 0.15788139402866364,
	"learning_rate": 2.2434857184512435e-06,
	"loss": 0.0188,
	"step": 890
	},
	{
	"epoch": 9.375,
	"grad_norm": 0.12695789337158203,
	"learning_rate": 1.5699829090217278e-06,
	"loss": 0.0183,
	"step": 900
	},
	{
	"epoch": 9.479166666666666,
	"grad_norm": 0.21467889845371246,
	"learning_rate": 1.0156064946182376e-06,
	"loss": 0.0213,
	"step": 910
	},
	{
	"epoch": 9.583333333333334,
	"grad_norm": 0.11011941730976105,
	"learning_rate": 5.810274175578445e-07,
	"loss": 0.0165,
	"step": 920
	},
	{
	"epoch": 9.6875,
	"grad_norm": 0.1231321394443512,
	"learning_rate": 2.667716336448356e-07,
	"loss": 0.0153,
	"step": 930
	},
	{
	"epoch": 9.791666666666666,
	"grad_norm": 0.23924382030963898,
	"learning_rate": 7.321947562484166e-08,
	"loss": 0.0176,
	"step": 940
	},
	{
	"epoch": 9.895833333333334,
	"grad_norm": 0.1052466407418251,
	"learning_rate": 6.051928814865271e-10,
	"loss": 0.0171,
	"step": 950
	},
	{
	"epoch": 9.90625,
	"step": 951,
	"total_flos": 1.2838499503164576e+17,
	"train_loss": 0.05094248948170246,
	"train_runtime": 1084.1029,
	"train_samples_per_second": 56.142,
	"train_steps_per_second": 0.877
	}
	],
	"logging_steps": 10,
	"max_steps": 951,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 10,
	"save_steps": 10000,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 1.2838499503164576e+17,
	"train_batch_size": 64,
	"trial_name": null,
	"trial_params": null
	}