| { |
| "best_global_step": 300, |
| "best_metric": 1.6826636791229248, |
| "best_model_checkpoint": "./vit-ena24-MD/checkpoint-300", |
| "epoch": 2.0, |
| "eval_steps": 100, |
| "global_step": 1588, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012594458438287154, |
| "grad_norm": 33.30289077758789, |
| "learning_rate": 0.00019886649874055416, |
| "loss": 0.3147, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02518891687657431, |
| "grad_norm": 0.3888566792011261, |
| "learning_rate": 0.00019760705289672545, |
| "loss": 0.1926, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.037783375314861464, |
| "grad_norm": 0.45531752705574036, |
| "learning_rate": 0.00019634760705289673, |
| "loss": 0.244, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05037783375314862, |
| "grad_norm": 17.49447250366211, |
| "learning_rate": 0.000195088161209068, |
| "loss": 0.4402, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06297229219143577, |
| "grad_norm": 10.337971687316895, |
| "learning_rate": 0.0001938287153652393, |
| "loss": 0.2908, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.07556675062972293, |
| "grad_norm": 0.1199137419462204, |
| "learning_rate": 0.00019256926952141058, |
| "loss": 0.1088, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.08816120906801007, |
| "grad_norm": 12.398459434509277, |
| "learning_rate": 0.0001913098236775819, |
| "loss": 0.2769, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10075566750629723, |
| "grad_norm": 4.823121547698975, |
| "learning_rate": 0.00019005037783375317, |
| "loss": 0.2812, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.11335012594458438, |
| "grad_norm": 0.09492120891809464, |
| "learning_rate": 0.00018879093198992443, |
| "loss": 0.3205, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.12594458438287154, |
| "grad_norm": 13.602778434753418, |
| "learning_rate": 0.00018753148614609571, |
| "loss": 0.4013, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.12594458438287154, |
| "eval_accuracy": 0.6630859375, |
| "eval_loss": 1.89939546585083, |
| "eval_runtime": 8.9391, |
| "eval_samples_per_second": 114.553, |
| "eval_steps_per_second": 14.319, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1385390428211587, |
| "grad_norm": 0.17718783020973206, |
| "learning_rate": 0.00018627204030226703, |
| "loss": 0.2671, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.15113350125944586, |
| "grad_norm": 0.10247470438480377, |
| "learning_rate": 0.0001850125944584383, |
| "loss": 0.0324, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.163727959697733, |
| "grad_norm": 2.346425771713257, |
| "learning_rate": 0.00018375314861460957, |
| "loss": 0.1711, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.17632241813602015, |
| "grad_norm": 16.59246063232422, |
| "learning_rate": 0.00018249370277078085, |
| "loss": 0.1502, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1889168765743073, |
| "grad_norm": 10.356197357177734, |
| "learning_rate": 0.00018123425692695216, |
| "loss": 0.3752, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.20151133501259447, |
| "grad_norm": 3.4677155017852783, |
| "learning_rate": 0.00017997481108312344, |
| "loss": 0.2305, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2141057934508816, |
| "grad_norm": 15.256316184997559, |
| "learning_rate": 0.00017871536523929473, |
| "loss": 0.1325, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.22670025188916876, |
| "grad_norm": 15.606841087341309, |
| "learning_rate": 0.000177455919395466, |
| "loss": 0.6504, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.23929471032745592, |
| "grad_norm": 0.047608401626348495, |
| "learning_rate": 0.0001761964735516373, |
| "loss": 0.402, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2518891687657431, |
| "grad_norm": 11.901758193969727, |
| "learning_rate": 0.00017493702770780858, |
| "loss": 0.4178, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2518891687657431, |
| "eval_accuracy": 0.59375, |
| "eval_loss": 2.070011615753174, |
| "eval_runtime": 9.3891, |
| "eval_samples_per_second": 109.062, |
| "eval_steps_per_second": 13.633, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.26448362720403024, |
| "grad_norm": 20.353191375732422, |
| "learning_rate": 0.00017367758186397986, |
| "loss": 0.1999, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2770780856423174, |
| "grad_norm": 0.20118139684200287, |
| "learning_rate": 0.00017241813602015114, |
| "loss": 0.4375, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.28967254408060455, |
| "grad_norm": 1.5593441724777222, |
| "learning_rate": 0.00017115869017632243, |
| "loss": 0.2035, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3022670025188917, |
| "grad_norm": 10.367003440856934, |
| "learning_rate": 0.0001698992443324937, |
| "loss": 0.3008, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3148614609571788, |
| "grad_norm": 4.51542329788208, |
| "learning_rate": 0.00016876574307304786, |
| "loss": 0.3633, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.327455919395466, |
| "grad_norm": 16.005172729492188, |
| "learning_rate": 0.00016750629722921915, |
| "loss": 0.1696, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.34005037783375314, |
| "grad_norm": 29.806106567382812, |
| "learning_rate": 0.00016624685138539046, |
| "loss": 0.152, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3526448362720403, |
| "grad_norm": 0.061621200293302536, |
| "learning_rate": 0.00016498740554156171, |
| "loss": 0.4195, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.36523929471032746, |
| "grad_norm": 0.22019930183887482, |
| "learning_rate": 0.000163727959697733, |
| "loss": 0.3925, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3778337531486146, |
| "grad_norm": 0.03678397834300995, |
| "learning_rate": 0.00016246851385390428, |
| "loss": 0.336, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3778337531486146, |
| "eval_accuracy": 0.6826171875, |
| "eval_loss": 1.6826636791229248, |
| "eval_runtime": 8.0131, |
| "eval_samples_per_second": 127.791, |
| "eval_steps_per_second": 15.974, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3904282115869018, |
| "grad_norm": 7.445373058319092, |
| "learning_rate": 0.0001612090680100756, |
| "loss": 0.2124, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.40302267002518893, |
| "grad_norm": 0.45029667019844055, |
| "learning_rate": 0.00015994962216624685, |
| "loss": 0.2362, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4156171284634761, |
| "grad_norm": 6.546263694763184, |
| "learning_rate": 0.00015869017632241813, |
| "loss": 0.2603, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4282115869017632, |
| "grad_norm": 6.475110054016113, |
| "learning_rate": 0.00015743073047858942, |
| "loss": 0.2096, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.44080604534005036, |
| "grad_norm": 20.233562469482422, |
| "learning_rate": 0.00015617128463476073, |
| "loss": 0.1777, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4534005037783375, |
| "grad_norm": 14.09807014465332, |
| "learning_rate": 0.000154911838790932, |
| "loss": 0.1531, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.4659949622166247, |
| "grad_norm": 0.7836583256721497, |
| "learning_rate": 0.00015365239294710327, |
| "loss": 0.1541, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.47858942065491183, |
| "grad_norm": 0.07820595055818558, |
| "learning_rate": 0.00015239294710327455, |
| "loss": 0.4512, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.491183879093199, |
| "grad_norm": 24.061058044433594, |
| "learning_rate": 0.00015113350125944586, |
| "loss": 0.2958, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5037783375314862, |
| "grad_norm": 2.0599920749664307, |
| "learning_rate": 0.00014987405541561715, |
| "loss": 0.2547, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5037783375314862, |
| "eval_accuracy": 0.658203125, |
| "eval_loss": 1.8337910175323486, |
| "eval_runtime": 8.3563, |
| "eval_samples_per_second": 122.543, |
| "eval_steps_per_second": 15.318, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5163727959697733, |
| "grad_norm": 0.04411542788147926, |
| "learning_rate": 0.0001486146095717884, |
| "loss": 0.1933, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5289672544080605, |
| "grad_norm": 6.006237030029297, |
| "learning_rate": 0.0001473551637279597, |
| "loss": 0.3889, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5415617128463476, |
| "grad_norm": 5.069452285766602, |
| "learning_rate": 0.000146095717884131, |
| "loss": 0.2128, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5541561712846348, |
| "grad_norm": 7.733457565307617, |
| "learning_rate": 0.00014483627204030228, |
| "loss": 0.3503, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5667506297229219, |
| "grad_norm": 14.06003475189209, |
| "learning_rate": 0.00014357682619647356, |
| "loss": 0.3558, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5793450881612091, |
| "grad_norm": 5.533696174621582, |
| "learning_rate": 0.00014231738035264485, |
| "loss": 0.1261, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.5919395465994962, |
| "grad_norm": 0.04644688963890076, |
| "learning_rate": 0.00014105793450881613, |
| "loss": 0.2681, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6045340050377834, |
| "grad_norm": 13.318643569946289, |
| "learning_rate": 0.00013979848866498741, |
| "loss": 0.186, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6171284634760705, |
| "grad_norm": 0.22986146807670593, |
| "learning_rate": 0.0001385390428211587, |
| "loss": 0.1415, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6297229219143576, |
| "grad_norm": 0.17749029397964478, |
| "learning_rate": 0.00013727959697732998, |
| "loss": 0.1166, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6297229219143576, |
| "eval_accuracy": 0.669921875, |
| "eval_loss": 1.9548735618591309, |
| "eval_runtime": 8.0737, |
| "eval_samples_per_second": 126.832, |
| "eval_steps_per_second": 15.854, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6423173803526449, |
| "grad_norm": 0.04894278571009636, |
| "learning_rate": 0.00013602015113350126, |
| "loss": 0.0819, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.654911838790932, |
| "grad_norm": 4.449767112731934, |
| "learning_rate": 0.00013476070528967255, |
| "loss": 0.3191, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6675062972292192, |
| "grad_norm": 0.7904582023620605, |
| "learning_rate": 0.00013350125944584383, |
| "loss": 0.0115, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6801007556675063, |
| "grad_norm": 0.6529366374015808, |
| "learning_rate": 0.00013224181360201514, |
| "loss": 0.0362, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.6926952141057935, |
| "grad_norm": 17.554079055786133, |
| "learning_rate": 0.0001309823677581864, |
| "loss": 0.442, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7052896725440806, |
| "grad_norm": 0.08190945535898209, |
| "learning_rate": 0.00012972292191435768, |
| "loss": 0.0196, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7178841309823678, |
| "grad_norm": 5.79897928237915, |
| "learning_rate": 0.00012846347607052897, |
| "loss": 0.1356, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7304785894206549, |
| "grad_norm": 9.713973999023438, |
| "learning_rate": 0.00012720403022670028, |
| "loss": 0.1532, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.743073047858942, |
| "grad_norm": 0.020533636212348938, |
| "learning_rate": 0.00012594458438287153, |
| "loss": 0.1312, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7556675062972292, |
| "grad_norm": 0.024593239650130272, |
| "learning_rate": 0.00012468513853904282, |
| "loss": 0.0555, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7556675062972292, |
| "eval_accuracy": 0.7021484375, |
| "eval_loss": 1.7758601903915405, |
| "eval_runtime": 8.0497, |
| "eval_samples_per_second": 127.209, |
| "eval_steps_per_second": 15.901, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7682619647355163, |
| "grad_norm": 0.01405579037964344, |
| "learning_rate": 0.0001234256926952141, |
| "loss": 0.0614, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.7808564231738035, |
| "grad_norm": 0.2251737415790558, |
| "learning_rate": 0.0001221662468513854, |
| "loss": 0.1868, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.7934508816120907, |
| "grad_norm": 0.03519393131136894, |
| "learning_rate": 0.0001209068010075567, |
| "loss": 0.1195, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8060453400503779, |
| "grad_norm": 5.647881507873535, |
| "learning_rate": 0.00011964735516372796, |
| "loss": 0.0161, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.818639798488665, |
| "grad_norm": 11.489825248718262, |
| "learning_rate": 0.00011838790931989925, |
| "loss": 0.1974, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.8312342569269522, |
| "grad_norm": 0.1198825016617775, |
| "learning_rate": 0.00011712846347607053, |
| "loss": 0.1037, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.8438287153652393, |
| "grad_norm": 0.7424900531768799, |
| "learning_rate": 0.00011586901763224183, |
| "loss": 0.068, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.8564231738035264, |
| "grad_norm": 0.7058230042457581, |
| "learning_rate": 0.0001146095717884131, |
| "loss": 0.1471, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.8690176322418136, |
| "grad_norm": 5.266252040863037, |
| "learning_rate": 0.00011335012594458438, |
| "loss": 0.2159, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.8816120906801007, |
| "grad_norm": 48.79673385620117, |
| "learning_rate": 0.00011209068010075568, |
| "loss": 0.3521, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8816120906801007, |
| "eval_accuracy": 0.6591796875, |
| "eval_loss": 2.015519618988037, |
| "eval_runtime": 8.1531, |
| "eval_samples_per_second": 125.597, |
| "eval_steps_per_second": 15.7, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8942065491183879, |
| "grad_norm": 0.0208774134516716, |
| "learning_rate": 0.00011083123425692696, |
| "loss": 0.2564, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.906801007556675, |
| "grad_norm": 0.03130019083619118, |
| "learning_rate": 0.00010957178841309825, |
| "loss": 0.0802, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.9193954659949622, |
| "grad_norm": 15.187126159667969, |
| "learning_rate": 0.00010831234256926952, |
| "loss": 0.0679, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.9319899244332494, |
| "grad_norm": 0.01843099482357502, |
| "learning_rate": 0.00010705289672544081, |
| "loss": 0.112, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.9445843828715366, |
| "grad_norm": 0.07539524137973785, |
| "learning_rate": 0.0001057934508816121, |
| "loss": 0.0254, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.9571788413098237, |
| "grad_norm": 3.864802598953247, |
| "learning_rate": 0.0001045340050377834, |
| "loss": 0.0793, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.9697732997481109, |
| "grad_norm": 28.977079391479492, |
| "learning_rate": 0.00010327455919395465, |
| "loss": 0.2016, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.982367758186398, |
| "grad_norm": 0.019700728356838226, |
| "learning_rate": 0.00010201511335012595, |
| "loss": 0.1451, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.9949622166246851, |
| "grad_norm": 0.01537514291703701, |
| "learning_rate": 0.00010075566750629723, |
| "loss": 0.1757, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.0075566750629723, |
| "grad_norm": 0.07689725607633591, |
| "learning_rate": 9.949622166246852e-05, |
| "loss": 0.0144, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.0075566750629723, |
| "eval_accuracy": 0.673828125, |
| "eval_loss": 1.9692714214324951, |
| "eval_runtime": 8.2858, |
| "eval_samples_per_second": 123.584, |
| "eval_steps_per_second": 15.448, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.0201511335012594, |
| "grad_norm": 0.01374890562146902, |
| "learning_rate": 9.82367758186398e-05, |
| "loss": 0.1439, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.0327455919395465, |
| "grad_norm": 0.048850249499082565, |
| "learning_rate": 9.69773299748111e-05, |
| "loss": 0.2261, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.0453400503778338, |
| "grad_norm": 0.02125728130340576, |
| "learning_rate": 9.571788413098237e-05, |
| "loss": 0.141, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.057934508816121, |
| "grad_norm": 0.47563159465789795, |
| "learning_rate": 9.445843828715366e-05, |
| "loss": 0.0288, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.070528967254408, |
| "grad_norm": 0.14305871725082397, |
| "learning_rate": 9.319899244332493e-05, |
| "loss": 0.0726, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.0831234256926952, |
| "grad_norm": 0.00739708449691534, |
| "learning_rate": 9.193954659949623e-05, |
| "loss": 0.0061, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.0957178841309823, |
| "grad_norm": 0.019515765830874443, |
| "learning_rate": 9.068010075566751e-05, |
| "loss": 0.0061, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.1083123425692696, |
| "grad_norm": 0.012037048116326332, |
| "learning_rate": 8.94206549118388e-05, |
| "loss": 0.0062, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.1209068010075567, |
| "grad_norm": 0.017209310084581375, |
| "learning_rate": 8.816120906801008e-05, |
| "loss": 0.004, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.1335012594458438, |
| "grad_norm": 0.01974736526608467, |
| "learning_rate": 8.690176322418136e-05, |
| "loss": 0.0847, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.1335012594458438, |
| "eval_accuracy": 0.72265625, |
| "eval_loss": 1.7930188179016113, |
| "eval_runtime": 9.0857, |
| "eval_samples_per_second": 112.705, |
| "eval_steps_per_second": 14.088, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.146095717884131, |
| "grad_norm": 0.04939868301153183, |
| "learning_rate": 8.564231738035265e-05, |
| "loss": 0.0919, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.1586901763224182, |
| "grad_norm": 0.0755525752902031, |
| "learning_rate": 8.438287153652393e-05, |
| "loss": 0.0596, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.1712846347607053, |
| "grad_norm": 0.40348997712135315, |
| "learning_rate": 8.312342569269523e-05, |
| "loss": 0.0092, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.1838790931989924, |
| "grad_norm": 0.012245587073266506, |
| "learning_rate": 8.18639798488665e-05, |
| "loss": 0.0699, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.1964735516372795, |
| "grad_norm": 0.03490450978279114, |
| "learning_rate": 8.06045340050378e-05, |
| "loss": 0.0036, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.2090680100755669, |
| "grad_norm": 0.052859097719192505, |
| "learning_rate": 7.934508816120907e-05, |
| "loss": 0.0103, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.221662468513854, |
| "grad_norm": 0.01732555963099003, |
| "learning_rate": 7.808564231738036e-05, |
| "loss": 0.0529, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.234256926952141, |
| "grad_norm": 0.01187656819820404, |
| "learning_rate": 7.682619647355163e-05, |
| "loss": 0.0055, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.2468513853904282, |
| "grad_norm": 0.009664223529398441, |
| "learning_rate": 7.556675062972293e-05, |
| "loss": 0.0392, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.2594458438287153, |
| "grad_norm": 0.014954408630728722, |
| "learning_rate": 7.43073047858942e-05, |
| "loss": 0.0041, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.2594458438287153, |
| "eval_accuracy": 0.7333984375, |
| "eval_loss": 1.710068941116333, |
| "eval_runtime": 8.8, |
| "eval_samples_per_second": 116.364, |
| "eval_steps_per_second": 14.545, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.2720403022670026, |
| "grad_norm": 1.0215097665786743, |
| "learning_rate": 7.30478589420655e-05, |
| "loss": 0.0057, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.2846347607052897, |
| "grad_norm": 0.020603695884346962, |
| "learning_rate": 7.178841309823678e-05, |
| "loss": 0.007, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.2972292191435768, |
| "grad_norm": 0.09024298191070557, |
| "learning_rate": 7.052896725440807e-05, |
| "loss": 0.0438, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.309823677581864, |
| "grad_norm": 0.08804096281528473, |
| "learning_rate": 6.926952141057935e-05, |
| "loss": 0.0512, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.322418136020151, |
| "grad_norm": 2.246978521347046, |
| "learning_rate": 6.801007556675063e-05, |
| "loss": 0.0056, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.3350125944584383, |
| "grad_norm": 0.05517780780792236, |
| "learning_rate": 6.675062972292192e-05, |
| "loss": 0.0561, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.3476070528967254, |
| "grad_norm": 0.21189695596694946, |
| "learning_rate": 6.54911838790932e-05, |
| "loss": 0.0571, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.3602015113350125, |
| "grad_norm": 0.010511963628232479, |
| "learning_rate": 6.423173803526448e-05, |
| "loss": 0.0039, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.3727959697732999, |
| "grad_norm": 0.009107470512390137, |
| "learning_rate": 6.297229219143577e-05, |
| "loss": 0.1101, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.385390428211587, |
| "grad_norm": 0.016693545505404472, |
| "learning_rate": 6.171284634760705e-05, |
| "loss": 0.0232, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.385390428211587, |
| "eval_accuracy": 0.732421875, |
| "eval_loss": 1.71272611618042, |
| "eval_runtime": 8.7899, |
| "eval_samples_per_second": 116.497, |
| "eval_steps_per_second": 14.562, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.397984886649874, |
| "grad_norm": 0.01695523038506508, |
| "learning_rate": 6.045340050377835e-05, |
| "loss": 0.0544, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.4105793450881612, |
| "grad_norm": 0.044825177639722824, |
| "learning_rate": 5.9193954659949624e-05, |
| "loss": 0.045, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.4231738035264483, |
| "grad_norm": 0.010057768784463406, |
| "learning_rate": 5.7934508816120914e-05, |
| "loss": 0.1501, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.4357682619647356, |
| "grad_norm": 0.005191602278500795, |
| "learning_rate": 5.667506297229219e-05, |
| "loss": 0.0097, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.4483627204030227, |
| "grad_norm": 0.0071839033626019955, |
| "learning_rate": 5.541561712846348e-05, |
| "loss": 0.0154, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.4609571788413098, |
| "grad_norm": 0.009370822459459305, |
| "learning_rate": 5.415617128463476e-05, |
| "loss": 0.1309, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.473551637279597, |
| "grad_norm": 0.010776126757264137, |
| "learning_rate": 5.289672544080605e-05, |
| "loss": 0.0048, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.486146095717884, |
| "grad_norm": 0.17925524711608887, |
| "learning_rate": 5.1637279596977326e-05, |
| "loss": 0.0181, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.4987405541561714, |
| "grad_norm": 0.013006888329982758, |
| "learning_rate": 5.0377833753148616e-05, |
| "loss": 0.0648, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.5113350125944585, |
| "grad_norm": 0.015756677836179733, |
| "learning_rate": 4.91183879093199e-05, |
| "loss": 0.1154, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.5113350125944585, |
| "eval_accuracy": 0.7236328125, |
| "eval_loss": 1.8169147968292236, |
| "eval_runtime": 8.2529, |
| "eval_samples_per_second": 124.078, |
| "eval_steps_per_second": 15.51, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.5239294710327456, |
| "grad_norm": 0.014958159998059273, |
| "learning_rate": 4.785894206549118e-05, |
| "loss": 0.0025, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.536523929471033, |
| "grad_norm": 0.29688045382499695, |
| "learning_rate": 4.659949622166247e-05, |
| "loss": 0.0052, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.5491183879093198, |
| "grad_norm": 0.06546491384506226, |
| "learning_rate": 4.534005037783376e-05, |
| "loss": 0.0425, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.561712846347607, |
| "grad_norm": 0.0268222838640213, |
| "learning_rate": 4.408060453400504e-05, |
| "loss": 0.0056, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.5743073047858942, |
| "grad_norm": 0.03548605740070343, |
| "learning_rate": 4.2821158690176324e-05, |
| "loss": 0.0047, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.5869017632241813, |
| "grad_norm": 2.136223554611206, |
| "learning_rate": 4.1561712846347615e-05, |
| "loss": 0.1031, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.5994962216624686, |
| "grad_norm": 0.0036976351402699947, |
| "learning_rate": 4.03022670025189e-05, |
| "loss": 0.0024, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.6120906801007555, |
| "grad_norm": 0.00549653172492981, |
| "learning_rate": 3.904282115869018e-05, |
| "loss": 0.006, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.6246851385390428, |
| "grad_norm": 0.008129788562655449, |
| "learning_rate": 3.7783375314861465e-05, |
| "loss": 0.0026, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.63727959697733, |
| "grad_norm": 0.006744919810444117, |
| "learning_rate": 3.652392947103275e-05, |
| "loss": 0.0355, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.63727959697733, |
| "eval_accuracy": 0.7333984375, |
| "eval_loss": 1.7981336116790771, |
| "eval_runtime": 7.7358, |
| "eval_samples_per_second": 132.372, |
| "eval_steps_per_second": 16.546, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.649874055415617, |
| "grad_norm": 0.09279919415712357, |
| "learning_rate": 3.526448362720403e-05, |
| "loss": 0.0353, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.6624685138539044, |
| "grad_norm": 0.005968036130070686, |
| "learning_rate": 3.4005037783375316e-05, |
| "loss": 0.0017, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.6750629722921915, |
| "grad_norm": 0.00884359423071146, |
| "learning_rate": 3.27455919395466e-05, |
| "loss": 0.0039, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.6876574307304786, |
| "grad_norm": 0.007703159470111132, |
| "learning_rate": 3.148614609571788e-05, |
| "loss": 0.0124, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.700251889168766, |
| "grad_norm": 0.01291657891124487, |
| "learning_rate": 3.0226700251889174e-05, |
| "loss": 0.0036, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.7128463476070528, |
| "grad_norm": 0.008787489496171474, |
| "learning_rate": 2.8967254408060457e-05, |
| "loss": 0.0021, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.7254408060453401, |
| "grad_norm": 0.04295135661959648, |
| "learning_rate": 2.770780856423174e-05, |
| "loss": 0.0061, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.7380352644836272, |
| "grad_norm": 2.3927783966064453, |
| "learning_rate": 2.6448362720403024e-05, |
| "loss": 0.1051, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.7506297229219143, |
| "grad_norm": 0.034302860498428345, |
| "learning_rate": 2.5188916876574308e-05, |
| "loss": 0.0023, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.7632241813602016, |
| "grad_norm": 0.01246937457472086, |
| "learning_rate": 2.392947103274559e-05, |
| "loss": 0.0042, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.7632241813602016, |
| "eval_accuracy": 0.7451171875, |
| "eval_loss": 1.7518525123596191, |
| "eval_runtime": 7.828, |
| "eval_samples_per_second": 130.813, |
| "eval_steps_per_second": 16.352, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.7758186397984885, |
| "grad_norm": 0.03391415998339653, |
| "learning_rate": 2.267002518891688e-05, |
| "loss": 0.1411, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.7884130982367759, |
| "grad_norm": 0.004355714190751314, |
| "learning_rate": 2.1410579345088162e-05, |
| "loss": 0.0714, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.801007556675063, |
| "grad_norm": 5.468477249145508, |
| "learning_rate": 2.015113350125945e-05, |
| "loss": 0.1146, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.81360201511335, |
| "grad_norm": 0.005633903201669455, |
| "learning_rate": 1.8891687657430733e-05, |
| "loss": 0.0023, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.8261964735516374, |
| "grad_norm": 0.09885888546705246, |
| "learning_rate": 1.7632241813602016e-05, |
| "loss": 0.002, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.8387909319899243, |
| "grad_norm": 0.0057752556167542934, |
| "learning_rate": 1.63727959697733e-05, |
| "loss": 0.0792, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.8513853904282116, |
| "grad_norm": 0.009123986586928368, |
| "learning_rate": 1.5113350125944587e-05, |
| "loss": 0.0127, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.8639798488664987, |
| "grad_norm": 0.07375753670930862, |
| "learning_rate": 1.385390428211587e-05, |
| "loss": 0.0621, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.8765743073047858, |
| "grad_norm": 0.007538340985774994, |
| "learning_rate": 1.2594458438287154e-05, |
| "loss": 0.0763, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.8891687657430731, |
| "grad_norm": 0.009298047050833702, |
| "learning_rate": 1.133501259445844e-05, |
| "loss": 0.0435, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.8891687657430731, |
| "eval_accuracy": 0.734375, |
| "eval_loss": 1.8184525966644287, |
| "eval_runtime": 7.6441, |
| "eval_samples_per_second": 133.959, |
| "eval_steps_per_second": 16.745, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.9017632241813602, |
| "grad_norm": 0.018270719796419144, |
| "learning_rate": 1.0075566750629725e-05, |
| "loss": 0.0034, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.9143576826196473, |
| "grad_norm": 0.03833504393696785, |
| "learning_rate": 8.816120906801008e-06, |
| "loss": 0.0213, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.9269521410579347, |
| "grad_norm": 0.014325067400932312, |
| "learning_rate": 7.556675062972293e-06, |
| "loss": 0.0808, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.9395465994962215, |
| "grad_norm": 0.01397955697029829, |
| "learning_rate": 6.297229219143577e-06, |
| "loss": 0.0122, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.9521410579345089, |
| "grad_norm": 21.65605926513672, |
| "learning_rate": 5.037783375314862e-06, |
| "loss": 0.0355, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.964735516372796, |
| "grad_norm": 0.008561785332858562, |
| "learning_rate": 3.7783375314861467e-06, |
| "loss": 0.0032, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.977329974811083, |
| "grad_norm": 0.006672169081866741, |
| "learning_rate": 2.518891687657431e-06, |
| "loss": 0.003, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.9899244332493704, |
| "grad_norm": 0.007904536090791225, |
| "learning_rate": 1.2594458438287156e-06, |
| "loss": 0.0062, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 1588, |
| "total_flos": 9.84315859688448e+17, |
| "train_loss": 0.13015312990478664, |
| "train_runtime": 869.5131, |
| "train_samples_per_second": 14.606, |
| "train_steps_per_second": 1.826 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1588, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.84315859688448e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|