{
"best_global_step": 300,
"best_metric": 1.6826636791229248,
"best_model_checkpoint": "./vit-ena24-MD/checkpoint-300",
"epoch": 2.0,
"eval_steps": 100,
"global_step": 1588,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012594458438287154,
"grad_norm": 33.30289077758789,
"learning_rate": 0.00019886649874055416,
"loss": 0.3147,
"step": 10
},
{
"epoch": 0.02518891687657431,
"grad_norm": 0.3888566792011261,
"learning_rate": 0.00019760705289672545,
"loss": 0.1926,
"step": 20
},
{
"epoch": 0.037783375314861464,
"grad_norm": 0.45531752705574036,
"learning_rate": 0.00019634760705289673,
"loss": 0.244,
"step": 30
},
{
"epoch": 0.05037783375314862,
"grad_norm": 17.49447250366211,
"learning_rate": 0.000195088161209068,
"loss": 0.4402,
"step": 40
},
{
"epoch": 0.06297229219143577,
"grad_norm": 10.337971687316895,
"learning_rate": 0.0001938287153652393,
"loss": 0.2908,
"step": 50
},
{
"epoch": 0.07556675062972293,
"grad_norm": 0.1199137419462204,
"learning_rate": 0.00019256926952141058,
"loss": 0.1088,
"step": 60
},
{
"epoch": 0.08816120906801007,
"grad_norm": 12.398459434509277,
"learning_rate": 0.0001913098236775819,
"loss": 0.2769,
"step": 70
},
{
"epoch": 0.10075566750629723,
"grad_norm": 4.823121547698975,
"learning_rate": 0.00019005037783375317,
"loss": 0.2812,
"step": 80
},
{
"epoch": 0.11335012594458438,
"grad_norm": 0.09492120891809464,
"learning_rate": 0.00018879093198992443,
"loss": 0.3205,
"step": 90
},
{
"epoch": 0.12594458438287154,
"grad_norm": 13.602778434753418,
"learning_rate": 0.00018753148614609571,
"loss": 0.4013,
"step": 100
},
{
"epoch": 0.12594458438287154,
"eval_accuracy": 0.6630859375,
"eval_loss": 1.89939546585083,
"eval_runtime": 8.9391,
"eval_samples_per_second": 114.553,
"eval_steps_per_second": 14.319,
"step": 100
},
{
"epoch": 0.1385390428211587,
"grad_norm": 0.17718783020973206,
"learning_rate": 0.00018627204030226703,
"loss": 0.2671,
"step": 110
},
{
"epoch": 0.15113350125944586,
"grad_norm": 0.10247470438480377,
"learning_rate": 0.0001850125944584383,
"loss": 0.0324,
"step": 120
},
{
"epoch": 0.163727959697733,
"grad_norm": 2.346425771713257,
"learning_rate": 0.00018375314861460957,
"loss": 0.1711,
"step": 130
},
{
"epoch": 0.17632241813602015,
"grad_norm": 16.59246063232422,
"learning_rate": 0.00018249370277078085,
"loss": 0.1502,
"step": 140
},
{
"epoch": 0.1889168765743073,
"grad_norm": 10.356197357177734,
"learning_rate": 0.00018123425692695216,
"loss": 0.3752,
"step": 150
},
{
"epoch": 0.20151133501259447,
"grad_norm": 3.4677155017852783,
"learning_rate": 0.00017997481108312344,
"loss": 0.2305,
"step": 160
},
{
"epoch": 0.2141057934508816,
"grad_norm": 15.256316184997559,
"learning_rate": 0.00017871536523929473,
"loss": 0.1325,
"step": 170
},
{
"epoch": 0.22670025188916876,
"grad_norm": 15.606841087341309,
"learning_rate": 0.000177455919395466,
"loss": 0.6504,
"step": 180
},
{
"epoch": 0.23929471032745592,
"grad_norm": 0.047608401626348495,
"learning_rate": 0.0001761964735516373,
"loss": 0.402,
"step": 190
},
{
"epoch": 0.2518891687657431,
"grad_norm": 11.901758193969727,
"learning_rate": 0.00017493702770780858,
"loss": 0.4178,
"step": 200
},
{
"epoch": 0.2518891687657431,
"eval_accuracy": 0.59375,
"eval_loss": 2.070011615753174,
"eval_runtime": 9.3891,
"eval_samples_per_second": 109.062,
"eval_steps_per_second": 13.633,
"step": 200
},
{
"epoch": 0.26448362720403024,
"grad_norm": 20.353191375732422,
"learning_rate": 0.00017367758186397986,
"loss": 0.1999,
"step": 210
},
{
"epoch": 0.2770780856423174,
"grad_norm": 0.20118139684200287,
"learning_rate": 0.00017241813602015114,
"loss": 0.4375,
"step": 220
},
{
"epoch": 0.28967254408060455,
"grad_norm": 1.5593441724777222,
"learning_rate": 0.00017115869017632243,
"loss": 0.2035,
"step": 230
},
{
"epoch": 0.3022670025188917,
"grad_norm": 10.367003440856934,
"learning_rate": 0.0001698992443324937,
"loss": 0.3008,
"step": 240
},
{
"epoch": 0.3148614609571788,
"grad_norm": 4.51542329788208,
"learning_rate": 0.00016876574307304786,
"loss": 0.3633,
"step": 250
},
{
"epoch": 0.327455919395466,
"grad_norm": 16.005172729492188,
"learning_rate": 0.00016750629722921915,
"loss": 0.1696,
"step": 260
},
{
"epoch": 0.34005037783375314,
"grad_norm": 29.806106567382812,
"learning_rate": 0.00016624685138539046,
"loss": 0.152,
"step": 270
},
{
"epoch": 0.3526448362720403,
"grad_norm": 0.061621200293302536,
"learning_rate": 0.00016498740554156171,
"loss": 0.4195,
"step": 280
},
{
"epoch": 0.36523929471032746,
"grad_norm": 0.22019930183887482,
"learning_rate": 0.000163727959697733,
"loss": 0.3925,
"step": 290
},
{
"epoch": 0.3778337531486146,
"grad_norm": 0.03678397834300995,
"learning_rate": 0.00016246851385390428,
"loss": 0.336,
"step": 300
},
{
"epoch": 0.3778337531486146,
"eval_accuracy": 0.6826171875,
"eval_loss": 1.6826636791229248,
"eval_runtime": 8.0131,
"eval_samples_per_second": 127.791,
"eval_steps_per_second": 15.974,
"step": 300
},
{
"epoch": 0.3904282115869018,
"grad_norm": 7.445373058319092,
"learning_rate": 0.0001612090680100756,
"loss": 0.2124,
"step": 310
},
{
"epoch": 0.40302267002518893,
"grad_norm": 0.45029667019844055,
"learning_rate": 0.00015994962216624685,
"loss": 0.2362,
"step": 320
},
{
"epoch": 0.4156171284634761,
"grad_norm": 6.546263694763184,
"learning_rate": 0.00015869017632241813,
"loss": 0.2603,
"step": 330
},
{
"epoch": 0.4282115869017632,
"grad_norm": 6.475110054016113,
"learning_rate": 0.00015743073047858942,
"loss": 0.2096,
"step": 340
},
{
"epoch": 0.44080604534005036,
"grad_norm": 20.233562469482422,
"learning_rate": 0.00015617128463476073,
"loss": 0.1777,
"step": 350
},
{
"epoch": 0.4534005037783375,
"grad_norm": 14.09807014465332,
"learning_rate": 0.000154911838790932,
"loss": 0.1531,
"step": 360
},
{
"epoch": 0.4659949622166247,
"grad_norm": 0.7836583256721497,
"learning_rate": 0.00015365239294710327,
"loss": 0.1541,
"step": 370
},
{
"epoch": 0.47858942065491183,
"grad_norm": 0.07820595055818558,
"learning_rate": 0.00015239294710327455,
"loss": 0.4512,
"step": 380
},
{
"epoch": 0.491183879093199,
"grad_norm": 24.061058044433594,
"learning_rate": 0.00015113350125944586,
"loss": 0.2958,
"step": 390
},
{
"epoch": 0.5037783375314862,
"grad_norm": 2.0599920749664307,
"learning_rate": 0.00014987405541561715,
"loss": 0.2547,
"step": 400
},
{
"epoch": 0.5037783375314862,
"eval_accuracy": 0.658203125,
"eval_loss": 1.8337910175323486,
"eval_runtime": 8.3563,
"eval_samples_per_second": 122.543,
"eval_steps_per_second": 15.318,
"step": 400
},
{
"epoch": 0.5163727959697733,
"grad_norm": 0.04411542788147926,
"learning_rate": 0.0001486146095717884,
"loss": 0.1933,
"step": 410
},
{
"epoch": 0.5289672544080605,
"grad_norm": 6.006237030029297,
"learning_rate": 0.0001473551637279597,
"loss": 0.3889,
"step": 420
},
{
"epoch": 0.5415617128463476,
"grad_norm": 5.069452285766602,
"learning_rate": 0.000146095717884131,
"loss": 0.2128,
"step": 430
},
{
"epoch": 0.5541561712846348,
"grad_norm": 7.733457565307617,
"learning_rate": 0.00014483627204030228,
"loss": 0.3503,
"step": 440
},
{
"epoch": 0.5667506297229219,
"grad_norm": 14.06003475189209,
"learning_rate": 0.00014357682619647356,
"loss": 0.3558,
"step": 450
},
{
"epoch": 0.5793450881612091,
"grad_norm": 5.533696174621582,
"learning_rate": 0.00014231738035264485,
"loss": 0.1261,
"step": 460
},
{
"epoch": 0.5919395465994962,
"grad_norm": 0.04644688963890076,
"learning_rate": 0.00014105793450881613,
"loss": 0.2681,
"step": 470
},
{
"epoch": 0.6045340050377834,
"grad_norm": 13.318643569946289,
"learning_rate": 0.00013979848866498741,
"loss": 0.186,
"step": 480
},
{
"epoch": 0.6171284634760705,
"grad_norm": 0.22986146807670593,
"learning_rate": 0.0001385390428211587,
"loss": 0.1415,
"step": 490
},
{
"epoch": 0.6297229219143576,
"grad_norm": 0.17749029397964478,
"learning_rate": 0.00013727959697732998,
"loss": 0.1166,
"step": 500
},
{
"epoch": 0.6297229219143576,
"eval_accuracy": 0.669921875,
"eval_loss": 1.9548735618591309,
"eval_runtime": 8.0737,
"eval_samples_per_second": 126.832,
"eval_steps_per_second": 15.854,
"step": 500
},
{
"epoch": 0.6423173803526449,
"grad_norm": 0.04894278571009636,
"learning_rate": 0.00013602015113350126,
"loss": 0.0819,
"step": 510
},
{
"epoch": 0.654911838790932,
"grad_norm": 4.449767112731934,
"learning_rate": 0.00013476070528967255,
"loss": 0.3191,
"step": 520
},
{
"epoch": 0.6675062972292192,
"grad_norm": 0.7904582023620605,
"learning_rate": 0.00013350125944584383,
"loss": 0.0115,
"step": 530
},
{
"epoch": 0.6801007556675063,
"grad_norm": 0.6529366374015808,
"learning_rate": 0.00013224181360201514,
"loss": 0.0362,
"step": 540
},
{
"epoch": 0.6926952141057935,
"grad_norm": 17.554079055786133,
"learning_rate": 0.0001309823677581864,
"loss": 0.442,
"step": 550
},
{
"epoch": 0.7052896725440806,
"grad_norm": 0.08190945535898209,
"learning_rate": 0.00012972292191435768,
"loss": 0.0196,
"step": 560
},
{
"epoch": 0.7178841309823678,
"grad_norm": 5.79897928237915,
"learning_rate": 0.00012846347607052897,
"loss": 0.1356,
"step": 570
},
{
"epoch": 0.7304785894206549,
"grad_norm": 9.713973999023438,
"learning_rate": 0.00012720403022670028,
"loss": 0.1532,
"step": 580
},
{
"epoch": 0.743073047858942,
"grad_norm": 0.020533636212348938,
"learning_rate": 0.00012594458438287153,
"loss": 0.1312,
"step": 590
},
{
"epoch": 0.7556675062972292,
"grad_norm": 0.024593239650130272,
"learning_rate": 0.00012468513853904282,
"loss": 0.0555,
"step": 600
},
{
"epoch": 0.7556675062972292,
"eval_accuracy": 0.7021484375,
"eval_loss": 1.7758601903915405,
"eval_runtime": 8.0497,
"eval_samples_per_second": 127.209,
"eval_steps_per_second": 15.901,
"step": 600
},
{
"epoch": 0.7682619647355163,
"grad_norm": 0.01405579037964344,
"learning_rate": 0.0001234256926952141,
"loss": 0.0614,
"step": 610
},
{
"epoch": 0.7808564231738035,
"grad_norm": 0.2251737415790558,
"learning_rate": 0.0001221662468513854,
"loss": 0.1868,
"step": 620
},
{
"epoch": 0.7934508816120907,
"grad_norm": 0.03519393131136894,
"learning_rate": 0.0001209068010075567,
"loss": 0.1195,
"step": 630
},
{
"epoch": 0.8060453400503779,
"grad_norm": 5.647881507873535,
"learning_rate": 0.00011964735516372796,
"loss": 0.0161,
"step": 640
},
{
"epoch": 0.818639798488665,
"grad_norm": 11.489825248718262,
"learning_rate": 0.00011838790931989925,
"loss": 0.1974,
"step": 650
},
{
"epoch": 0.8312342569269522,
"grad_norm": 0.1198825016617775,
"learning_rate": 0.00011712846347607053,
"loss": 0.1037,
"step": 660
},
{
"epoch": 0.8438287153652393,
"grad_norm": 0.7424900531768799,
"learning_rate": 0.00011586901763224183,
"loss": 0.068,
"step": 670
},
{
"epoch": 0.8564231738035264,
"grad_norm": 0.7058230042457581,
"learning_rate": 0.0001146095717884131,
"loss": 0.1471,
"step": 680
},
{
"epoch": 0.8690176322418136,
"grad_norm": 5.266252040863037,
"learning_rate": 0.00011335012594458438,
"loss": 0.2159,
"step": 690
},
{
"epoch": 0.8816120906801007,
"grad_norm": 48.79673385620117,
"learning_rate": 0.00011209068010075568,
"loss": 0.3521,
"step": 700
},
{
"epoch": 0.8816120906801007,
"eval_accuracy": 0.6591796875,
"eval_loss": 2.015519618988037,
"eval_runtime": 8.1531,
"eval_samples_per_second": 125.597,
"eval_steps_per_second": 15.7,
"step": 700
},
{
"epoch": 0.8942065491183879,
"grad_norm": 0.0208774134516716,
"learning_rate": 0.00011083123425692696,
"loss": 0.2564,
"step": 710
},
{
"epoch": 0.906801007556675,
"grad_norm": 0.03130019083619118,
"learning_rate": 0.00010957178841309825,
"loss": 0.0802,
"step": 720
},
{
"epoch": 0.9193954659949622,
"grad_norm": 15.187126159667969,
"learning_rate": 0.00010831234256926952,
"loss": 0.0679,
"step": 730
},
{
"epoch": 0.9319899244332494,
"grad_norm": 0.01843099482357502,
"learning_rate": 0.00010705289672544081,
"loss": 0.112,
"step": 740
},
{
"epoch": 0.9445843828715366,
"grad_norm": 0.07539524137973785,
"learning_rate": 0.0001057934508816121,
"loss": 0.0254,
"step": 750
},
{
"epoch": 0.9571788413098237,
"grad_norm": 3.864802598953247,
"learning_rate": 0.0001045340050377834,
"loss": 0.0793,
"step": 760
},
{
"epoch": 0.9697732997481109,
"grad_norm": 28.977079391479492,
"learning_rate": 0.00010327455919395465,
"loss": 0.2016,
"step": 770
},
{
"epoch": 0.982367758186398,
"grad_norm": 0.019700728356838226,
"learning_rate": 0.00010201511335012595,
"loss": 0.1451,
"step": 780
},
{
"epoch": 0.9949622166246851,
"grad_norm": 0.01537514291703701,
"learning_rate": 0.00010075566750629723,
"loss": 0.1757,
"step": 790
},
{
"epoch": 1.0075566750629723,
"grad_norm": 0.07689725607633591,
"learning_rate": 9.949622166246852e-05,
"loss": 0.0144,
"step": 800
},
{
"epoch": 1.0075566750629723,
"eval_accuracy": 0.673828125,
"eval_loss": 1.9692714214324951,
"eval_runtime": 8.2858,
"eval_samples_per_second": 123.584,
"eval_steps_per_second": 15.448,
"step": 800
},
{
"epoch": 1.0201511335012594,
"grad_norm": 0.01374890562146902,
"learning_rate": 9.82367758186398e-05,
"loss": 0.1439,
"step": 810
},
{
"epoch": 1.0327455919395465,
"grad_norm": 0.048850249499082565,
"learning_rate": 9.69773299748111e-05,
"loss": 0.2261,
"step": 820
},
{
"epoch": 1.0453400503778338,
"grad_norm": 0.02125728130340576,
"learning_rate": 9.571788413098237e-05,
"loss": 0.141,
"step": 830
},
{
"epoch": 1.057934508816121,
"grad_norm": 0.47563159465789795,
"learning_rate": 9.445843828715366e-05,
"loss": 0.0288,
"step": 840
},
{
"epoch": 1.070528967254408,
"grad_norm": 0.14305871725082397,
"learning_rate": 9.319899244332493e-05,
"loss": 0.0726,
"step": 850
},
{
"epoch": 1.0831234256926952,
"grad_norm": 0.00739708449691534,
"learning_rate": 9.193954659949623e-05,
"loss": 0.0061,
"step": 860
},
{
"epoch": 1.0957178841309823,
"grad_norm": 0.019515765830874443,
"learning_rate": 9.068010075566751e-05,
"loss": 0.0061,
"step": 870
},
{
"epoch": 1.1083123425692696,
"grad_norm": 0.012037048116326332,
"learning_rate": 8.94206549118388e-05,
"loss": 0.0062,
"step": 880
},
{
"epoch": 1.1209068010075567,
"grad_norm": 0.017209310084581375,
"learning_rate": 8.816120906801008e-05,
"loss": 0.004,
"step": 890
},
{
"epoch": 1.1335012594458438,
"grad_norm": 0.01974736526608467,
"learning_rate": 8.690176322418136e-05,
"loss": 0.0847,
"step": 900
},
{
"epoch": 1.1335012594458438,
"eval_accuracy": 0.72265625,
"eval_loss": 1.7930188179016113,
"eval_runtime": 9.0857,
"eval_samples_per_second": 112.705,
"eval_steps_per_second": 14.088,
"step": 900
},
{
"epoch": 1.146095717884131,
"grad_norm": 0.04939868301153183,
"learning_rate": 8.564231738035265e-05,
"loss": 0.0919,
"step": 910
},
{
"epoch": 1.1586901763224182,
"grad_norm": 0.0755525752902031,
"learning_rate": 8.438287153652393e-05,
"loss": 0.0596,
"step": 920
},
{
"epoch": 1.1712846347607053,
"grad_norm": 0.40348997712135315,
"learning_rate": 8.312342569269523e-05,
"loss": 0.0092,
"step": 930
},
{
"epoch": 1.1838790931989924,
"grad_norm": 0.012245587073266506,
"learning_rate": 8.18639798488665e-05,
"loss": 0.0699,
"step": 940
},
{
"epoch": 1.1964735516372795,
"grad_norm": 0.03490450978279114,
"learning_rate": 8.06045340050378e-05,
"loss": 0.0036,
"step": 950
},
{
"epoch": 1.2090680100755669,
"grad_norm": 0.052859097719192505,
"learning_rate": 7.934508816120907e-05,
"loss": 0.0103,
"step": 960
},
{
"epoch": 1.221662468513854,
"grad_norm": 0.01732555963099003,
"learning_rate": 7.808564231738036e-05,
"loss": 0.0529,
"step": 970
},
{
"epoch": 1.234256926952141,
"grad_norm": 0.01187656819820404,
"learning_rate": 7.682619647355163e-05,
"loss": 0.0055,
"step": 980
},
{
"epoch": 1.2468513853904282,
"grad_norm": 0.009664223529398441,
"learning_rate": 7.556675062972293e-05,
"loss": 0.0392,
"step": 990
},
{
"epoch": 1.2594458438287153,
"grad_norm": 0.014954408630728722,
"learning_rate": 7.43073047858942e-05,
"loss": 0.0041,
"step": 1000
},
{
"epoch": 1.2594458438287153,
"eval_accuracy": 0.7333984375,
"eval_loss": 1.710068941116333,
"eval_runtime": 8.8,
"eval_samples_per_second": 116.364,
"eval_steps_per_second": 14.545,
"step": 1000
},
{
"epoch": 1.2720403022670026,
"grad_norm": 1.0215097665786743,
"learning_rate": 7.30478589420655e-05,
"loss": 0.0057,
"step": 1010
},
{
"epoch": 1.2846347607052897,
"grad_norm": 0.020603695884346962,
"learning_rate": 7.178841309823678e-05,
"loss": 0.007,
"step": 1020
},
{
"epoch": 1.2972292191435768,
"grad_norm": 0.09024298191070557,
"learning_rate": 7.052896725440807e-05,
"loss": 0.0438,
"step": 1030
},
{
"epoch": 1.309823677581864,
"grad_norm": 0.08804096281528473,
"learning_rate": 6.926952141057935e-05,
"loss": 0.0512,
"step": 1040
},
{
"epoch": 1.322418136020151,
"grad_norm": 2.246978521347046,
"learning_rate": 6.801007556675063e-05,
"loss": 0.0056,
"step": 1050
},
{
"epoch": 1.3350125944584383,
"grad_norm": 0.05517780780792236,
"learning_rate": 6.675062972292192e-05,
"loss": 0.0561,
"step": 1060
},
{
"epoch": 1.3476070528967254,
"grad_norm": 0.21189695596694946,
"learning_rate": 6.54911838790932e-05,
"loss": 0.0571,
"step": 1070
},
{
"epoch": 1.3602015113350125,
"grad_norm": 0.010511963628232479,
"learning_rate": 6.423173803526448e-05,
"loss": 0.0039,
"step": 1080
},
{
"epoch": 1.3727959697732999,
"grad_norm": 0.009107470512390137,
"learning_rate": 6.297229219143577e-05,
"loss": 0.1101,
"step": 1090
},
{
"epoch": 1.385390428211587,
"grad_norm": 0.016693545505404472,
"learning_rate": 6.171284634760705e-05,
"loss": 0.0232,
"step": 1100
},
{
"epoch": 1.385390428211587,
"eval_accuracy": 0.732421875,
"eval_loss": 1.71272611618042,
"eval_runtime": 8.7899,
"eval_samples_per_second": 116.497,
"eval_steps_per_second": 14.562,
"step": 1100
},
{
"epoch": 1.397984886649874,
"grad_norm": 0.01695523038506508,
"learning_rate": 6.045340050377835e-05,
"loss": 0.0544,
"step": 1110
},
{
"epoch": 1.4105793450881612,
"grad_norm": 0.044825177639722824,
"learning_rate": 5.9193954659949624e-05,
"loss": 0.045,
"step": 1120
},
{
"epoch": 1.4231738035264483,
"grad_norm": 0.010057768784463406,
"learning_rate": 5.7934508816120914e-05,
"loss": 0.1501,
"step": 1130
},
{
"epoch": 1.4357682619647356,
"grad_norm": 0.005191602278500795,
"learning_rate": 5.667506297229219e-05,
"loss": 0.0097,
"step": 1140
},
{
"epoch": 1.4483627204030227,
"grad_norm": 0.0071839033626019955,
"learning_rate": 5.541561712846348e-05,
"loss": 0.0154,
"step": 1150
},
{
"epoch": 1.4609571788413098,
"grad_norm": 0.009370822459459305,
"learning_rate": 5.415617128463476e-05,
"loss": 0.1309,
"step": 1160
},
{
"epoch": 1.473551637279597,
"grad_norm": 0.010776126757264137,
"learning_rate": 5.289672544080605e-05,
"loss": 0.0048,
"step": 1170
},
{
"epoch": 1.486146095717884,
"grad_norm": 0.17925524711608887,
"learning_rate": 5.1637279596977326e-05,
"loss": 0.0181,
"step": 1180
},
{
"epoch": 1.4987405541561714,
"grad_norm": 0.013006888329982758,
"learning_rate": 5.0377833753148616e-05,
"loss": 0.0648,
"step": 1190
},
{
"epoch": 1.5113350125944585,
"grad_norm": 0.015756677836179733,
"learning_rate": 4.91183879093199e-05,
"loss": 0.1154,
"step": 1200
},
{
"epoch": 1.5113350125944585,
"eval_accuracy": 0.7236328125,
"eval_loss": 1.8169147968292236,
"eval_runtime": 8.2529,
"eval_samples_per_second": 124.078,
"eval_steps_per_second": 15.51,
"step": 1200
},
{
"epoch": 1.5239294710327456,
"grad_norm": 0.014958159998059273,
"learning_rate": 4.785894206549118e-05,
"loss": 0.0025,
"step": 1210
},
{
"epoch": 1.536523929471033,
"grad_norm": 0.29688045382499695,
"learning_rate": 4.659949622166247e-05,
"loss": 0.0052,
"step": 1220
},
{
"epoch": 1.5491183879093198,
"grad_norm": 0.06546491384506226,
"learning_rate": 4.534005037783376e-05,
"loss": 0.0425,
"step": 1230
},
{
"epoch": 1.561712846347607,
"grad_norm": 0.0268222838640213,
"learning_rate": 4.408060453400504e-05,
"loss": 0.0056,
"step": 1240
},
{
"epoch": 1.5743073047858942,
"grad_norm": 0.03548605740070343,
"learning_rate": 4.2821158690176324e-05,
"loss": 0.0047,
"step": 1250
},
{
"epoch": 1.5869017632241813,
"grad_norm": 2.136223554611206,
"learning_rate": 4.1561712846347615e-05,
"loss": 0.1031,
"step": 1260
},
{
"epoch": 1.5994962216624686,
"grad_norm": 0.0036976351402699947,
"learning_rate": 4.03022670025189e-05,
"loss": 0.0024,
"step": 1270
},
{
"epoch": 1.6120906801007555,
"grad_norm": 0.00549653172492981,
"learning_rate": 3.904282115869018e-05,
"loss": 0.006,
"step": 1280
},
{
"epoch": 1.6246851385390428,
"grad_norm": 0.008129788562655449,
"learning_rate": 3.7783375314861465e-05,
"loss": 0.0026,
"step": 1290
},
{
"epoch": 1.63727959697733,
"grad_norm": 0.006744919810444117,
"learning_rate": 3.652392947103275e-05,
"loss": 0.0355,
"step": 1300
},
{
"epoch": 1.63727959697733,
"eval_accuracy": 0.7333984375,
"eval_loss": 1.7981336116790771,
"eval_runtime": 7.7358,
"eval_samples_per_second": 132.372,
"eval_steps_per_second": 16.546,
"step": 1300
},
{
"epoch": 1.649874055415617,
"grad_norm": 0.09279919415712357,
"learning_rate": 3.526448362720403e-05,
"loss": 0.0353,
"step": 1310
},
{
"epoch": 1.6624685138539044,
"grad_norm": 0.005968036130070686,
"learning_rate": 3.4005037783375316e-05,
"loss": 0.0017,
"step": 1320
},
{
"epoch": 1.6750629722921915,
"grad_norm": 0.00884359423071146,
"learning_rate": 3.27455919395466e-05,
"loss": 0.0039,
"step": 1330
},
{
"epoch": 1.6876574307304786,
"grad_norm": 0.007703159470111132,
"learning_rate": 3.148614609571788e-05,
"loss": 0.0124,
"step": 1340
},
{
"epoch": 1.700251889168766,
"grad_norm": 0.01291657891124487,
"learning_rate": 3.0226700251889174e-05,
"loss": 0.0036,
"step": 1350
},
{
"epoch": 1.7128463476070528,
"grad_norm": 0.008787489496171474,
"learning_rate": 2.8967254408060457e-05,
"loss": 0.0021,
"step": 1360
},
{
"epoch": 1.7254408060453401,
"grad_norm": 0.04295135661959648,
"learning_rate": 2.770780856423174e-05,
"loss": 0.0061,
"step": 1370
},
{
"epoch": 1.7380352644836272,
"grad_norm": 2.3927783966064453,
"learning_rate": 2.6448362720403024e-05,
"loss": 0.1051,
"step": 1380
},
{
"epoch": 1.7506297229219143,
"grad_norm": 0.034302860498428345,
"learning_rate": 2.5188916876574308e-05,
"loss": 0.0023,
"step": 1390
},
{
"epoch": 1.7632241813602016,
"grad_norm": 0.01246937457472086,
"learning_rate": 2.392947103274559e-05,
"loss": 0.0042,
"step": 1400
},
{
"epoch": 1.7632241813602016,
"eval_accuracy": 0.7451171875,
"eval_loss": 1.7518525123596191,
"eval_runtime": 7.828,
"eval_samples_per_second": 130.813,
"eval_steps_per_second": 16.352,
"step": 1400
},
{
"epoch": 1.7758186397984885,
"grad_norm": 0.03391415998339653,
"learning_rate": 2.267002518891688e-05,
"loss": 0.1411,
"step": 1410
},
{
"epoch": 1.7884130982367759,
"grad_norm": 0.004355714190751314,
"learning_rate": 2.1410579345088162e-05,
"loss": 0.0714,
"step": 1420
},
{
"epoch": 1.801007556675063,
"grad_norm": 5.468477249145508,
"learning_rate": 2.015113350125945e-05,
"loss": 0.1146,
"step": 1430
},
{
"epoch": 1.81360201511335,
"grad_norm": 0.005633903201669455,
"learning_rate": 1.8891687657430733e-05,
"loss": 0.0023,
"step": 1440
},
{
"epoch": 1.8261964735516374,
"grad_norm": 0.09885888546705246,
"learning_rate": 1.7632241813602016e-05,
"loss": 0.002,
"step": 1450
},
{
"epoch": 1.8387909319899243,
"grad_norm": 0.0057752556167542934,
"learning_rate": 1.63727959697733e-05,
"loss": 0.0792,
"step": 1460
},
{
"epoch": 1.8513853904282116,
"grad_norm": 0.009123986586928368,
"learning_rate": 1.5113350125944587e-05,
"loss": 0.0127,
"step": 1470
},
{
"epoch": 1.8639798488664987,
"grad_norm": 0.07375753670930862,
"learning_rate": 1.385390428211587e-05,
"loss": 0.0621,
"step": 1480
},
{
"epoch": 1.8765743073047858,
"grad_norm": 0.007538340985774994,
"learning_rate": 1.2594458438287154e-05,
"loss": 0.0763,
"step": 1490
},
{
"epoch": 1.8891687657430731,
"grad_norm": 0.009298047050833702,
"learning_rate": 1.133501259445844e-05,
"loss": 0.0435,
"step": 1500
},
{
"epoch": 1.8891687657430731,
"eval_accuracy": 0.734375,
"eval_loss": 1.8184525966644287,
"eval_runtime": 7.6441,
"eval_samples_per_second": 133.959,
"eval_steps_per_second": 16.745,
"step": 1500
},
{
"epoch": 1.9017632241813602,
"grad_norm": 0.018270719796419144,
"learning_rate": 1.0075566750629725e-05,
"loss": 0.0034,
"step": 1510
},
{
"epoch": 1.9143576826196473,
"grad_norm": 0.03833504393696785,
"learning_rate": 8.816120906801008e-06,
"loss": 0.0213,
"step": 1520
},
{
"epoch": 1.9269521410579347,
"grad_norm": 0.014325067400932312,
"learning_rate": 7.556675062972293e-06,
"loss": 0.0808,
"step": 1530
},
{
"epoch": 1.9395465994962215,
"grad_norm": 0.01397955697029829,
"learning_rate": 6.297229219143577e-06,
"loss": 0.0122,
"step": 1540
},
{
"epoch": 1.9521410579345089,
"grad_norm": 21.65605926513672,
"learning_rate": 5.037783375314862e-06,
"loss": 0.0355,
"step": 1550
},
{
"epoch": 1.964735516372796,
"grad_norm": 0.008561785332858562,
"learning_rate": 3.7783375314861467e-06,
"loss": 0.0032,
"step": 1560
},
{
"epoch": 1.977329974811083,
"grad_norm": 0.006672169081866741,
"learning_rate": 2.518891687657431e-06,
"loss": 0.003,
"step": 1570
},
{
"epoch": 1.9899244332493704,
"grad_norm": 0.007904536090791225,
"learning_rate": 1.2594458438287156e-06,
"loss": 0.0062,
"step": 1580
},
{
"epoch": 2.0,
"step": 1588,
"total_flos": 9.84315859688448e+17,
"train_loss": 0.13015312990478664,
"train_runtime": 869.5131,
"train_samples_per_second": 14.606,
"train_steps_per_second": 1.826
}
],
"logging_steps": 10,
"max_steps": 1588,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.84315859688448e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}