diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5746 @@ +{ + "best_metric": 0.03600074350833893, + "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold3/checkpoint-1300", + "epoch": 9.995295888606643, + "eval_steps": 50, + "global_step": 6640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.015053156458744943, + "grad_norm": 4.750461578369141, + "learning_rate": 1.5060240963855423e-06, + "loss": 1.5851, + "step": 10 + }, + { + "epoch": 0.030106312917489886, + "grad_norm": 5.361246109008789, + "learning_rate": 3.0120481927710846e-06, + "loss": 1.5514, + "step": 20 + }, + { + "epoch": 0.04515946937623483, + "grad_norm": 5.0164594650268555, + "learning_rate": 4.518072289156626e-06, + "loss": 1.4768, + "step": 30 + }, + { + "epoch": 0.06021262583497977, + "grad_norm": 5.635066032409668, + "learning_rate": 6.024096385542169e-06, + "loss": 1.4376, + "step": 40 + }, + { + "epoch": 0.07526578229372471, + "grad_norm": 2.2061386108398438, + "learning_rate": 7.530120481927712e-06, + "loss": 0.9136, + "step": 50 + }, + { + "epoch": 0.07526578229372471, + "eval_loss": 0.7079544067382812, + "eval_runtime": 160.6919, + "eval_samples_per_second": 7.349, + "eval_steps_per_second": 7.349, + "step": 50 + }, + { + "epoch": 0.09031893875246966, + "grad_norm": 1.8459008932113647, + "learning_rate": 9.036144578313253e-06, + "loss": 0.6329, + "step": 60 + }, + { + "epoch": 0.1053720952112146, + "grad_norm": 1.8084211349487305, + "learning_rate": 1.0542168674698796e-05, + "loss": 0.5347, + "step": 70 + }, + { + "epoch": 0.12042525166995954, + "grad_norm": 0.8255929350852966, + "learning_rate": 1.2048192771084338e-05, + "loss": 0.4346, + "step": 80 + }, + { + "epoch": 0.1354784081287045, + "grad_norm": 1.4775186777114868, + "learning_rate": 1.3554216867469879e-05, + "loss": 0.2425, + "step": 90 + }, + { + "epoch": 0.15053156458744943, + "grad_norm": 1.001645565032959, + "learning_rate": 1.5060240963855424e-05, + "loss": 0.2044, + "step": 100 + }, + { + "epoch": 0.15053156458744943, + "eval_loss": 0.1460021436214447, + "eval_runtime": 160.5645, + "eval_samples_per_second": 7.355, + "eval_steps_per_second": 7.355, + "step": 100 + }, + { + "epoch": 0.16558472104619437, + "grad_norm": 1.0702455043792725, + "learning_rate": 1.6566265060240965e-05, + "loss": 0.1307, + "step": 110 + }, + { + "epoch": 0.18063787750493931, + "grad_norm": 0.8481782674789429, + "learning_rate": 1.8072289156626505e-05, + "loss": 0.1432, + "step": 120 + }, + { + "epoch": 0.19569103396368426, + "grad_norm": 0.9463543891906738, + "learning_rate": 1.957831325301205e-05, + "loss": 0.0993, + "step": 130 + }, + { + "epoch": 0.2107441904224292, + "grad_norm": 1.6875685453414917, + "learning_rate": 2.1084337349397593e-05, + "loss": 0.1025, + "step": 140 + }, + { + "epoch": 0.22579734688117414, + "grad_norm": 1.484510898590088, + "learning_rate": 2.2590361445783133e-05, + "loss": 0.1028, + "step": 150 + }, + { + "epoch": 0.22579734688117414, + "eval_loss": 0.08280546218156815, + "eval_runtime": 160.7165, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 150 + }, + { + "epoch": 0.2408505033399191, + "grad_norm": 0.7870819568634033, + "learning_rate": 2.4096385542168677e-05, + "loss": 0.0848, + "step": 160 + }, + { + "epoch": 0.25590365979866403, + "grad_norm": 1.0159631967544556, + "learning_rate": 2.560240963855422e-05, + "loss": 0.0862, + "step": 170 + }, + { + "epoch": 0.270956816257409, + "grad_norm": 0.9531567692756653, + "learning_rate": 2.7108433734939758e-05, + "loss": 0.0782, + "step": 180 + }, + { + "epoch": 0.2860099727161539, + "grad_norm": 1.486837387084961, + "learning_rate": 2.86144578313253e-05, + "loss": 0.0697, + "step": 190 + }, + { + "epoch": 0.30106312917489886, + "grad_norm": 1.089078426361084, + "learning_rate": 3.012048192771085e-05, + "loss": 0.08, + "step": 200 + }, + { + "epoch": 0.30106312917489886, + "eval_loss": 0.06571684032678604, + "eval_runtime": 160.7264, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 200 + }, + { + "epoch": 0.3161162856336438, + "grad_norm": 1.0826783180236816, + "learning_rate": 3.162650602409639e-05, + "loss": 0.0737, + "step": 210 + }, + { + "epoch": 0.33116944209238874, + "grad_norm": 0.972507894039154, + "learning_rate": 3.313253012048193e-05, + "loss": 0.0669, + "step": 220 + }, + { + "epoch": 0.3462225985511337, + "grad_norm": 0.792308509349823, + "learning_rate": 3.463855421686747e-05, + "loss": 0.074, + "step": 230 + }, + { + "epoch": 0.36127575500987863, + "grad_norm": 1.3287421464920044, + "learning_rate": 3.614457831325301e-05, + "loss": 0.0656, + "step": 240 + }, + { + "epoch": 0.3763289114686236, + "grad_norm": 1.1908459663391113, + "learning_rate": 3.765060240963856e-05, + "loss": 0.0625, + "step": 250 + }, + { + "epoch": 0.3763289114686236, + "eval_loss": 0.05646319314837456, + "eval_runtime": 160.6166, + "eval_samples_per_second": 7.353, + "eval_steps_per_second": 7.353, + "step": 250 + }, + { + "epoch": 0.3913820679273685, + "grad_norm": 0.7318952679634094, + "learning_rate": 3.91566265060241e-05, + "loss": 0.0439, + "step": 260 + }, + { + "epoch": 0.40643522438611346, + "grad_norm": 0.5888552069664001, + "learning_rate": 4.066265060240964e-05, + "loss": 0.0557, + "step": 270 + }, + { + "epoch": 0.4214883808448584, + "grad_norm": 0.6811500787734985, + "learning_rate": 4.2168674698795186e-05, + "loss": 0.0623, + "step": 280 + }, + { + "epoch": 0.43654153730360334, + "grad_norm": 0.4946328103542328, + "learning_rate": 4.3674698795180726e-05, + "loss": 0.0531, + "step": 290 + }, + { + "epoch": 0.4515946937623483, + "grad_norm": 1.1131876707077026, + "learning_rate": 4.5180722891566266e-05, + "loss": 0.0631, + "step": 300 + }, + { + "epoch": 0.4515946937623483, + "eval_loss": 0.062427956610918045, + "eval_runtime": 160.5499, + "eval_samples_per_second": 7.356, + "eval_steps_per_second": 7.356, + "step": 300 + }, + { + "epoch": 0.46664785022109323, + "grad_norm": 0.3793734908103943, + "learning_rate": 4.668674698795181e-05, + "loss": 0.049, + "step": 310 + }, + { + "epoch": 0.4817010066798382, + "grad_norm": 1.2404499053955078, + "learning_rate": 4.8192771084337354e-05, + "loss": 0.0771, + "step": 320 + }, + { + "epoch": 0.4967541631385831, + "grad_norm": 1.1948049068450928, + "learning_rate": 4.9698795180722894e-05, + "loss": 0.0562, + "step": 330 + }, + { + "epoch": 0.5118073195973281, + "grad_norm": 0.8342772126197815, + "learning_rate": 5.120481927710844e-05, + "loss": 0.0399, + "step": 340 + }, + { + "epoch": 0.526860476056073, + "grad_norm": 0.6116824150085449, + "learning_rate": 5.271084337349398e-05, + "loss": 0.0453, + "step": 350 + }, + { + "epoch": 0.526860476056073, + "eval_loss": 0.05592818930745125, + "eval_runtime": 160.8997, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, + "step": 350 + }, + { + "epoch": 0.541913632514818, + "grad_norm": 0.5662557482719421, + "learning_rate": 5.4216867469879516e-05, + "loss": 0.058, + "step": 360 + }, + { + "epoch": 0.5569667889735629, + "grad_norm": 0.7923420667648315, + "learning_rate": 5.572289156626507e-05, + "loss": 0.05, + "step": 370 + }, + { + "epoch": 0.5720199454323078, + "grad_norm": 0.4902084767818451, + "learning_rate": 5.72289156626506e-05, + "loss": 0.0471, + "step": 380 + }, + { + "epoch": 0.5870731018910528, + "grad_norm": 0.9782336354255676, + "learning_rate": 5.8734939759036144e-05, + "loss": 0.0474, + "step": 390 + }, + { + "epoch": 0.6021262583497977, + "grad_norm": 0.5204954147338867, + "learning_rate": 6.02409638554217e-05, + "loss": 0.057, + "step": 400 + }, + { + "epoch": 0.6021262583497977, + "eval_loss": 0.05005372315645218, + "eval_runtime": 160.8158, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, + "step": 400 + }, + { + "epoch": 0.6171794148085427, + "grad_norm": 0.3158038854598999, + "learning_rate": 6.174698795180724e-05, + "loss": 0.0412, + "step": 410 + }, + { + "epoch": 0.6322325712672876, + "grad_norm": 1.257641077041626, + "learning_rate": 6.325301204819278e-05, + "loss": 0.0626, + "step": 420 + }, + { + "epoch": 0.6472857277260325, + "grad_norm": 0.5771623849868774, + "learning_rate": 6.47590361445783e-05, + "loss": 0.0481, + "step": 430 + }, + { + "epoch": 0.6623388841847775, + "grad_norm": 0.4072902202606201, + "learning_rate": 6.626506024096386e-05, + "loss": 0.0425, + "step": 440 + }, + { + "epoch": 0.6773920406435224, + "grad_norm": 1.0025973320007324, + "learning_rate": 6.77710843373494e-05, + "loss": 0.0534, + "step": 450 + }, + { + "epoch": 0.6773920406435224, + "eval_loss": 0.0463910847902298, + "eval_runtime": 160.8868, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, + "step": 450 + }, + { + "epoch": 0.6924451971022674, + "grad_norm": 1.0996001958847046, + "learning_rate": 6.927710843373494e-05, + "loss": 0.0504, + "step": 460 + }, + { + "epoch": 0.7074983535610123, + "grad_norm": 0.5003724694252014, + "learning_rate": 7.07831325301205e-05, + "loss": 0.0454, + "step": 470 + }, + { + "epoch": 0.7225515100197573, + "grad_norm": 0.360058456659317, + "learning_rate": 7.228915662650602e-05, + "loss": 0.0372, + "step": 480 + }, + { + "epoch": 0.7376046664785022, + "grad_norm": 1.0568592548370361, + "learning_rate": 7.379518072289156e-05, + "loss": 0.062, + "step": 490 + }, + { + "epoch": 0.7526578229372471, + "grad_norm": 0.5876762270927429, + "learning_rate": 7.530120481927712e-05, + "loss": 0.0377, + "step": 500 + }, + { + "epoch": 0.7526578229372471, + "eval_loss": 0.04563547670841217, + "eval_runtime": 160.8707, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, + "step": 500 + }, + { + "epoch": 0.7677109793959921, + "grad_norm": 0.3074174225330353, + "learning_rate": 7.680722891566266e-05, + "loss": 0.0418, + "step": 510 + }, + { + "epoch": 0.782764135854737, + "grad_norm": 0.5924628376960754, + "learning_rate": 7.83132530120482e-05, + "loss": 0.0538, + "step": 520 + }, + { + "epoch": 0.797817292313482, + "grad_norm": 0.28745847940444946, + "learning_rate": 7.981927710843375e-05, + "loss": 0.0599, + "step": 530 + }, + { + "epoch": 0.8128704487722269, + "grad_norm": 0.4864930510520935, + "learning_rate": 8.132530120481928e-05, + "loss": 0.0624, + "step": 540 + }, + { + "epoch": 0.8279236052309719, + "grad_norm": 0.6496531963348389, + "learning_rate": 8.283132530120482e-05, + "loss": 0.0504, + "step": 550 + }, + { + "epoch": 0.8279236052309719, + "eval_loss": 0.04653579369187355, + "eval_runtime": 160.9903, + "eval_samples_per_second": 7.336, + "eval_steps_per_second": 7.336, + "step": 550 + }, + { + "epoch": 0.8429767616897168, + "grad_norm": 0.7302563190460205, + "learning_rate": 8.433734939759037e-05, + "loss": 0.067, + "step": 560 + }, + { + "epoch": 0.8580299181484617, + "grad_norm": 0.8676456809043884, + "learning_rate": 8.584337349397591e-05, + "loss": 0.0496, + "step": 570 + }, + { + "epoch": 0.8730830746072067, + "grad_norm": 0.34765201807022095, + "learning_rate": 8.734939759036145e-05, + "loss": 0.0542, + "step": 580 + }, + { + "epoch": 0.8881362310659516, + "grad_norm": 1.036482810974121, + "learning_rate": 8.885542168674699e-05, + "loss": 0.0652, + "step": 590 + }, + { + "epoch": 0.9031893875246966, + "grad_norm": 0.4210595488548279, + "learning_rate": 9.036144578313253e-05, + "loss": 0.0379, + "step": 600 + }, + { + "epoch": 0.9031893875246966, + "eval_loss": 0.04480915144085884, + "eval_runtime": 160.8215, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, + "step": 600 + }, + { + "epoch": 0.9182425439834415, + "grad_norm": 0.5708878636360168, + "learning_rate": 9.186746987951807e-05, + "loss": 0.0419, + "step": 610 + }, + { + "epoch": 0.9332957004421865, + "grad_norm": 0.3627513349056244, + "learning_rate": 9.337349397590361e-05, + "loss": 0.0436, + "step": 620 + }, + { + "epoch": 0.9483488569009314, + "grad_norm": 0.5482935309410095, + "learning_rate": 9.487951807228917e-05, + "loss": 0.0536, + "step": 630 + }, + { + "epoch": 0.9634020133596763, + "grad_norm": 0.3744940459728241, + "learning_rate": 9.638554216867471e-05, + "loss": 0.0474, + "step": 640 + }, + { + "epoch": 0.9784551698184213, + "grad_norm": 0.4197324216365814, + "learning_rate": 9.789156626506025e-05, + "loss": 0.0403, + "step": 650 + }, + { + "epoch": 0.9784551698184213, + "eval_loss": 0.04545089974999428, + "eval_runtime": 160.9346, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 7.338, + "step": 650 + }, + { + "epoch": 0.9935083262771662, + "grad_norm": 0.2738643288612366, + "learning_rate": 9.939759036144579e-05, + "loss": 0.0338, + "step": 660 + }, + { + "epoch": 1.0085614827359113, + "grad_norm": 0.3260810971260071, + "learning_rate": 9.99997512742683e-05, + "loss": 0.0486, + "step": 670 + }, + { + "epoch": 1.0236146391946561, + "grad_norm": 0.3396519720554352, + "learning_rate": 9.999823129264712e-05, + "loss": 0.0499, + "step": 680 + }, + { + "epoch": 1.0386677956534012, + "grad_norm": 0.3331776559352875, + "learning_rate": 9.999532955232234e-05, + "loss": 0.0422, + "step": 690 + }, + { + "epoch": 1.053720952112146, + "grad_norm": 0.2971383035182953, + "learning_rate": 9.999104613348688e-05, + "loss": 0.0331, + "step": 700 + }, + { + "epoch": 1.053720952112146, + "eval_loss": 0.04420311748981476, + "eval_runtime": 160.7051, + "eval_samples_per_second": 7.349, + "eval_steps_per_second": 7.349, + "step": 700 + }, + { + "epoch": 1.068774108570891, + "grad_norm": 0.19040505588054657, + "learning_rate": 9.998538115451798e-05, + "loss": 0.0393, + "step": 710 + }, + { + "epoch": 1.083827265029636, + "grad_norm": 0.42369094491004944, + "learning_rate": 9.997833477197385e-05, + "loss": 0.0293, + "step": 720 + }, + { + "epoch": 1.098880421488381, + "grad_norm": 0.25379225611686707, + "learning_rate": 9.996990718058939e-05, + "loss": 0.0453, + "step": 730 + }, + { + "epoch": 1.1139335779471258, + "grad_norm": 0.36474576592445374, + "learning_rate": 9.996009861327077e-05, + "loss": 0.0389, + "step": 740 + }, + { + "epoch": 1.1289867344058706, + "grad_norm": 0.28880634903907776, + "learning_rate": 9.994890934108907e-05, + "loss": 0.0376, + "step": 750 + }, + { + "epoch": 1.1289867344058706, + "eval_loss": 0.039407871663570404, + "eval_runtime": 160.7315, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 750 + }, + { + "epoch": 1.1440398908646157, + "grad_norm": 0.537286639213562, + "learning_rate": 9.993633967327269e-05, + "loss": 0.0333, + "step": 760 + }, + { + "epoch": 1.1590930473233607, + "grad_norm": 0.5258364081382751, + "learning_rate": 9.99223899571989e-05, + "loss": 0.038, + "step": 770 + }, + { + "epoch": 1.1741462037821055, + "grad_norm": 0.44215118885040283, + "learning_rate": 9.990706057838416e-05, + "loss": 0.0312, + "step": 780 + }, + { + "epoch": 1.1891993602408506, + "grad_norm": 0.3708779811859131, + "learning_rate": 9.98903519604735e-05, + "loss": 0.0407, + "step": 790 + }, + { + "epoch": 1.2042525166995954, + "grad_norm": 0.2657548189163208, + "learning_rate": 9.987226456522884e-05, + "loss": 0.0425, + "step": 800 + }, + { + "epoch": 1.2042525166995954, + "eval_loss": 0.0430581271648407, + "eval_runtime": 160.9064, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, + "step": 800 + }, + { + "epoch": 1.2193056731583405, + "grad_norm": 0.325986385345459, + "learning_rate": 9.985279889251615e-05, + "loss": 0.0396, + "step": 810 + }, + { + "epoch": 1.2343588296170853, + "grad_norm": 0.5112910866737366, + "learning_rate": 9.983195548029173e-05, + "loss": 0.0462, + "step": 820 + }, + { + "epoch": 1.2494119860758304, + "grad_norm": 0.1505061537027359, + "learning_rate": 9.980973490458728e-05, + "loss": 0.0364, + "step": 830 + }, + { + "epoch": 1.2644651425345752, + "grad_norm": 0.20219314098358154, + "learning_rate": 9.9786137779494e-05, + "loss": 0.0444, + "step": 840 + }, + { + "epoch": 1.2795182989933203, + "grad_norm": 0.3308413326740265, + "learning_rate": 9.976116475714563e-05, + "loss": 0.0351, + "step": 850 + }, + { + "epoch": 1.2795182989933203, + "eval_loss": 0.03973166272044182, + "eval_runtime": 160.8199, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, + "step": 850 + }, + { + "epoch": 1.294571455452065, + "grad_norm": 0.2993268072605133, + "learning_rate": 9.973481652770038e-05, + "loss": 0.0469, + "step": 860 + }, + { + "epoch": 1.30962461191081, + "grad_norm": 0.16391193866729736, + "learning_rate": 9.970709381932193e-05, + "loss": 0.0376, + "step": 870 + }, + { + "epoch": 1.324677768369555, + "grad_norm": 0.26282191276550293, + "learning_rate": 9.967799739815925e-05, + "loss": 0.0384, + "step": 880 + }, + { + "epoch": 1.3397309248283, + "grad_norm": 0.3372248113155365, + "learning_rate": 9.964752806832545e-05, + "loss": 0.0419, + "step": 890 + }, + { + "epoch": 1.3547840812870449, + "grad_norm": 0.5041816234588623, + "learning_rate": 9.961568667187556e-05, + "loss": 0.047, + "step": 900 + }, + { + "epoch": 1.3547840812870449, + "eval_loss": 0.04260425642132759, + "eval_runtime": 160.8075, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, + "step": 900 + }, + { + "epoch": 1.3698372377457897, + "grad_norm": 0.27931326627731323, + "learning_rate": 9.958247408878321e-05, + "loss": 0.0372, + "step": 910 + }, + { + "epoch": 1.3848903942045347, + "grad_norm": 0.21397525072097778, + "learning_rate": 9.954789123691642e-05, + "loss": 0.0372, + "step": 920 + }, + { + "epoch": 1.3999435506632798, + "grad_norm": 0.2363465428352356, + "learning_rate": 9.951193907201213e-05, + "loss": 0.0329, + "step": 930 + }, + { + "epoch": 1.4149967071220246, + "grad_norm": 0.1953222006559372, + "learning_rate": 9.947461858764978e-05, + "loss": 0.0416, + "step": 940 + }, + { + "epoch": 1.4300498635807695, + "grad_norm": 0.563061535358429, + "learning_rate": 9.943593081522397e-05, + "loss": 0.0369, + "step": 950 + }, + { + "epoch": 1.4300498635807695, + "eval_loss": 0.03970416262745857, + "eval_runtime": 160.9086, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 7.34, + "step": 950 + }, + { + "epoch": 1.4451030200395145, + "grad_norm": 0.6983281373977661, + "learning_rate": 9.939587682391586e-05, + "loss": 0.0434, + "step": 960 + }, + { + "epoch": 1.4601561764982596, + "grad_norm": 0.30356237292289734, + "learning_rate": 9.93544577206636e-05, + "loss": 0.0317, + "step": 970 + }, + { + "epoch": 1.4752093329570044, + "grad_norm": 0.4461734890937805, + "learning_rate": 9.931167465013182e-05, + "loss": 0.034, + "step": 980 + }, + { + "epoch": 1.4902624894157492, + "grad_norm": 0.5264511704444885, + "learning_rate": 9.926752879467996e-05, + "loss": 0.0351, + "step": 990 + }, + { + "epoch": 1.5053156458744943, + "grad_norm": 0.2561458349227905, + "learning_rate": 9.922202137432955e-05, + "loss": 0.0327, + "step": 1000 + }, + { + "epoch": 1.5053156458744943, + "eval_loss": 0.04186021536588669, + "eval_runtime": 160.7321, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 1000 + }, + { + "epoch": 1.5203688023332393, + "grad_norm": 0.26379063725471497, + "learning_rate": 9.917515364673056e-05, + "loss": 0.0378, + "step": 1010 + }, + { + "epoch": 1.5354219587919842, + "grad_norm": 0.32819533348083496, + "learning_rate": 9.912692690712665e-05, + "loss": 0.0348, + "step": 1020 + }, + { + "epoch": 1.550475115250729, + "grad_norm": 0.31579750776290894, + "learning_rate": 9.907734248831931e-05, + "loss": 0.0364, + "step": 1030 + }, + { + "epoch": 1.565528271709474, + "grad_norm": 0.6729230880737305, + "learning_rate": 9.902640176063103e-05, + "loss": 0.0428, + "step": 1040 + }, + { + "epoch": 1.5805814281682191, + "grad_norm": 0.3203141689300537, + "learning_rate": 9.89741061318675e-05, + "loss": 0.0323, + "step": 1050 + }, + { + "epoch": 1.5805814281682191, + "eval_loss": 0.04276812821626663, + "eval_runtime": 160.8693, + "eval_samples_per_second": 7.341, + "eval_steps_per_second": 7.341, + "step": 1050 + }, + { + "epoch": 1.595634584626964, + "grad_norm": 0.5274858474731445, + "learning_rate": 9.892045704727864e-05, + "loss": 0.0437, + "step": 1060 + }, + { + "epoch": 1.6106877410857088, + "grad_norm": 0.5357206463813782, + "learning_rate": 9.886545598951871e-05, + "loss": 0.0449, + "step": 1070 + }, + { + "epoch": 1.6257408975444538, + "grad_norm": 0.26878616213798523, + "learning_rate": 9.880910447860527e-05, + "loss": 0.0275, + "step": 1080 + }, + { + "epoch": 1.6407940540031989, + "grad_norm": 0.17348457872867584, + "learning_rate": 9.875140407187721e-05, + "loss": 0.0282, + "step": 1090 + }, + { + "epoch": 1.6558472104619437, + "grad_norm": 0.13969603180885315, + "learning_rate": 9.869235636395177e-05, + "loss": 0.0264, + "step": 1100 + }, + { + "epoch": 1.6558472104619437, + "eval_loss": 0.03832107409834862, + "eval_runtime": 160.7845, + "eval_samples_per_second": 7.345, + "eval_steps_per_second": 7.345, + "step": 1100 + }, + { + "epoch": 1.6709003669206886, + "grad_norm": 0.2004968523979187, + "learning_rate": 9.863196298668032e-05, + "loss": 0.0451, + "step": 1110 + }, + { + "epoch": 1.6859535233794336, + "grad_norm": 0.17377223074436188, + "learning_rate": 9.857022560910338e-05, + "loss": 0.0328, + "step": 1120 + }, + { + "epoch": 1.7010066798381787, + "grad_norm": 0.2831920087337494, + "learning_rate": 9.850714593740453e-05, + "loss": 0.0265, + "step": 1130 + }, + { + "epoch": 1.7160598362969235, + "grad_norm": 0.2152077555656433, + "learning_rate": 9.844272571486311e-05, + "loss": 0.0331, + "step": 1140 + }, + { + "epoch": 1.7311129927556683, + "grad_norm": 0.23021051287651062, + "learning_rate": 9.837696672180618e-05, + "loss": 0.0445, + "step": 1150 + }, + { + "epoch": 1.7311129927556683, + "eval_loss": 0.036994002759456635, + "eval_runtime": 160.6249, + "eval_samples_per_second": 7.353, + "eval_steps_per_second": 7.353, + "step": 1150 + }, + { + "epoch": 1.7461661492144134, + "grad_norm": 0.309906542301178, + "learning_rate": 9.830987077555924e-05, + "loss": 0.0356, + "step": 1160 + }, + { + "epoch": 1.7612193056731584, + "grad_norm": 0.21372509002685547, + "learning_rate": 9.824143973039603e-05, + "loss": 0.0426, + "step": 1170 + }, + { + "epoch": 1.7762724621319033, + "grad_norm": 0.3771088421344757, + "learning_rate": 9.817167547748729e-05, + "loss": 0.0354, + "step": 1180 + }, + { + "epoch": 1.791325618590648, + "grad_norm": 0.22261619567871094, + "learning_rate": 9.810057994484852e-05, + "loss": 0.0406, + "step": 1190 + }, + { + "epoch": 1.8063787750493931, + "grad_norm": 0.6612097024917603, + "learning_rate": 9.802815509728662e-05, + "loss": 0.0357, + "step": 1200 + }, + { + "epoch": 1.8063787750493931, + "eval_loss": 0.037844400852918625, + "eval_runtime": 160.7204, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 1200 + }, + { + "epoch": 1.8214319315081382, + "grad_norm": 0.682697057723999, + "learning_rate": 9.795440293634567e-05, + "loss": 0.0363, + "step": 1210 + }, + { + "epoch": 1.836485087966883, + "grad_norm": 0.31857362389564514, + "learning_rate": 9.787932550025158e-05, + "loss": 0.0381, + "step": 1220 + }, + { + "epoch": 1.8515382444256279, + "grad_norm": 0.27617108821868896, + "learning_rate": 9.780292486385574e-05, + "loss": 0.0351, + "step": 1230 + }, + { + "epoch": 1.866591400884373, + "grad_norm": 0.23234552145004272, + "learning_rate": 9.772520313857775e-05, + "loss": 0.0401, + "step": 1240 + }, + { + "epoch": 1.881644557343118, + "grad_norm": 0.1940830796957016, + "learning_rate": 9.764616247234701e-05, + "loss": 0.0358, + "step": 1250 + }, + { + "epoch": 1.881644557343118, + "eval_loss": 0.036553166806697845, + "eval_runtime": 160.7904, + "eval_samples_per_second": 7.345, + "eval_steps_per_second": 7.345, + "step": 1250 + }, + { + "epoch": 1.8966977138018628, + "grad_norm": 0.2719970941543579, + "learning_rate": 9.756580504954334e-05, + "loss": 0.0441, + "step": 1260 + }, + { + "epoch": 1.9117508702606076, + "grad_norm": 0.19048522412776947, + "learning_rate": 9.748413309093666e-05, + "loss": 0.0309, + "step": 1270 + }, + { + "epoch": 1.9268040267193527, + "grad_norm": 0.1486659049987793, + "learning_rate": 9.740114885362562e-05, + "loss": 0.0412, + "step": 1280 + }, + { + "epoch": 1.9418571831780977, + "grad_norm": 0.4442104995250702, + "learning_rate": 9.731685463097518e-05, + "loss": 0.036, + "step": 1290 + }, + { + "epoch": 1.9569103396368426, + "grad_norm": 0.2808210253715515, + "learning_rate": 9.723125275255325e-05, + "loss": 0.0336, + "step": 1300 + }, + { + "epoch": 1.9569103396368426, + "eval_loss": 0.03600074350833893, + "eval_runtime": 160.7695, + "eval_samples_per_second": 7.346, + "eval_steps_per_second": 7.346, + "step": 1300 + }, + { + "epoch": 1.9719634960955874, + "grad_norm": 0.3234440088272095, + "learning_rate": 9.714434558406636e-05, + "loss": 0.0318, + "step": 1310 + }, + { + "epoch": 1.9870166525543325, + "grad_norm": 0.16435784101486206, + "learning_rate": 9.705613552729415e-05, + "loss": 0.0336, + "step": 1320 + }, + { + "epoch": 2.0020698090130775, + "grad_norm": 0.5010185837745667, + "learning_rate": 9.69666250200232e-05, + "loss": 0.041, + "step": 1330 + }, + { + "epoch": 2.0171229654718226, + "grad_norm": 0.14287133514881134, + "learning_rate": 9.68758165359794e-05, + "loss": 0.0213, + "step": 1340 + }, + { + "epoch": 2.032176121930567, + "grad_norm": 0.1827498823404312, + "learning_rate": 9.678371258475982e-05, + "loss": 0.0233, + "step": 1350 + }, + { + "epoch": 2.032176121930567, + "eval_loss": 0.0407937727868557, + "eval_runtime": 160.5838, + "eval_samples_per_second": 7.354, + "eval_steps_per_second": 7.354, + "step": 1350 + }, + { + "epoch": 2.0472292783893122, + "grad_norm": 0.09340988099575043, + "learning_rate": 9.669031571176322e-05, + "loss": 0.0158, + "step": 1360 + }, + { + "epoch": 2.0622824348480573, + "grad_norm": 0.23635055124759674, + "learning_rate": 9.659562849811976e-05, + "loss": 0.0234, + "step": 1370 + }, + { + "epoch": 2.0773355913068023, + "grad_norm": 0.16082511842250824, + "learning_rate": 9.64996535606196e-05, + "loss": 0.0227, + "step": 1380 + }, + { + "epoch": 2.092388747765547, + "grad_norm": 0.16143430769443512, + "learning_rate": 9.640239355164073e-05, + "loss": 0.0185, + "step": 1390 + }, + { + "epoch": 2.107441904224292, + "grad_norm": 0.5148758292198181, + "learning_rate": 9.630385115907545e-05, + "loss": 0.0311, + "step": 1400 + }, + { + "epoch": 2.107441904224292, + "eval_loss": 0.037585336714982986, + "eval_runtime": 160.5735, + "eval_samples_per_second": 7.355, + "eval_steps_per_second": 7.355, + "step": 1400 + }, + { + "epoch": 2.122495060683037, + "grad_norm": 0.4294039309024811, + "learning_rate": 9.620402910625631e-05, + "loss": 0.0313, + "step": 1410 + }, + { + "epoch": 2.137548217141782, + "grad_norm": 0.45958212018013, + "learning_rate": 9.610293015188067e-05, + "loss": 0.0264, + "step": 1420 + }, + { + "epoch": 2.1526013736005267, + "grad_norm": 0.22370566427707672, + "learning_rate": 9.600055708993461e-05, + "loss": 0.0286, + "step": 1430 + }, + { + "epoch": 2.167654530059272, + "grad_norm": 0.37099528312683105, + "learning_rate": 9.589691274961556e-05, + "loss": 0.0312, + "step": 1440 + }, + { + "epoch": 2.182707686518017, + "grad_norm": 0.2500664293766022, + "learning_rate": 9.579199999525424e-05, + "loss": 0.0243, + "step": 1450 + }, + { + "epoch": 2.182707686518017, + "eval_loss": 0.03909142687916756, + "eval_runtime": 160.8135, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, + "step": 1450 + }, + { + "epoch": 2.197760842976762, + "grad_norm": 0.3968459367752075, + "learning_rate": 9.568582172623544e-05, + "loss": 0.0244, + "step": 1460 + }, + { + "epoch": 2.2128139994355065, + "grad_norm": 0.1588933914899826, + "learning_rate": 9.557838087691791e-05, + "loss": 0.031, + "step": 1470 + }, + { + "epoch": 2.2278671558942515, + "grad_norm": 0.25690123438835144, + "learning_rate": 9.546968041655326e-05, + "loss": 0.0284, + "step": 1480 + }, + { + "epoch": 2.2429203123529966, + "grad_norm": 0.08418586850166321, + "learning_rate": 9.53597233492039e-05, + "loss": 0.0226, + "step": 1490 + }, + { + "epoch": 2.257973468811741, + "grad_norm": 0.22872009873390198, + "learning_rate": 9.524851271366001e-05, + "loss": 0.0299, + "step": 1500 + }, + { + "epoch": 2.257973468811741, + "eval_loss": 0.04315732792019844, + "eval_runtime": 160.7791, + "eval_samples_per_second": 7.345, + "eval_steps_per_second": 7.345, + "step": 1500 + }, + { + "epoch": 2.2730266252704863, + "grad_norm": 0.32274970412254333, + "learning_rate": 9.513605158335562e-05, + "loss": 0.0248, + "step": 1510 + }, + { + "epoch": 2.2880797817292313, + "grad_norm": 0.14836372435092926, + "learning_rate": 9.502234306628355e-05, + "loss": 0.0375, + "step": 1520 + }, + { + "epoch": 2.3031329381879764, + "grad_norm": 0.2848995327949524, + "learning_rate": 9.490739030490963e-05, + "loss": 0.0313, + "step": 1530 + }, + { + "epoch": 2.3181860946467214, + "grad_norm": 0.14220879971981049, + "learning_rate": 9.47911964760858e-05, + "loss": 0.0213, + "step": 1540 + }, + { + "epoch": 2.333239251105466, + "grad_norm": 0.24286463856697083, + "learning_rate": 9.467376479096235e-05, + "loss": 0.0351, + "step": 1550 + }, + { + "epoch": 2.333239251105466, + "eval_loss": 0.04003646597266197, + "eval_runtime": 160.6314, + "eval_samples_per_second": 7.352, + "eval_steps_per_second": 7.352, + "step": 1550 + }, + { + "epoch": 2.348292407564211, + "grad_norm": 0.15897639095783234, + "learning_rate": 9.455509849489915e-05, + "loss": 0.0327, + "step": 1560 + }, + { + "epoch": 2.363345564022956, + "grad_norm": 0.2880462110042572, + "learning_rate": 9.443520086737594e-05, + "loss": 0.0256, + "step": 1570 + }, + { + "epoch": 2.378398720481701, + "grad_norm": 0.21580329537391663, + "learning_rate": 9.431407522190175e-05, + "loss": 0.0195, + "step": 1580 + }, + { + "epoch": 2.393451876940446, + "grad_norm": 0.1254677027463913, + "learning_rate": 9.41917249059233e-05, + "loss": 0.0234, + "step": 1590 + }, + { + "epoch": 2.408505033399191, + "grad_norm": 0.19677628576755524, + "learning_rate": 9.406815330073244e-05, + "loss": 0.0189, + "step": 1600 + }, + { + "epoch": 2.408505033399191, + "eval_loss": 0.04029300436377525, + "eval_runtime": 160.7089, + "eval_samples_per_second": 7.349, + "eval_steps_per_second": 7.349, + "step": 1600 + }, + { + "epoch": 2.423558189857936, + "grad_norm": 0.42127513885498047, + "learning_rate": 9.394336382137285e-05, + "loss": 0.0286, + "step": 1610 + }, + { + "epoch": 2.438611346316681, + "grad_norm": 0.24628838896751404, + "learning_rate": 9.381735991654546e-05, + "loss": 0.0298, + "step": 1620 + }, + { + "epoch": 2.4536645027754256, + "grad_norm": 0.3270810842514038, + "learning_rate": 9.369014506851333e-05, + "loss": 0.0294, + "step": 1630 + }, + { + "epoch": 2.4687176592341706, + "grad_norm": 0.10504765808582306, + "learning_rate": 9.356172279300528e-05, + "loss": 0.0231, + "step": 1640 + }, + { + "epoch": 2.4837708156929157, + "grad_norm": 0.18838824331760406, + "learning_rate": 9.343209663911881e-05, + "loss": 0.0226, + "step": 1650 + }, + { + "epoch": 2.4837708156929157, + "eval_loss": 0.04215184599161148, + "eval_runtime": 160.6773, + "eval_samples_per_second": 7.35, + "eval_steps_per_second": 7.35, + "step": 1650 + }, + { + "epoch": 2.4988239721516607, + "grad_norm": 0.39783433079719543, + "learning_rate": 9.330127018922194e-05, + "loss": 0.0263, + "step": 1660 + }, + { + "epoch": 2.5138771286104054, + "grad_norm": 0.12182117253541946, + "learning_rate": 9.31692470588543e-05, + "loss": 0.018, + "step": 1670 + }, + { + "epoch": 2.5289302850691504, + "grad_norm": 0.21732597053050995, + "learning_rate": 9.303603089662716e-05, + "loss": 0.0263, + "step": 1680 + }, + { + "epoch": 2.5439834415278955, + "grad_norm": 0.6598740220069885, + "learning_rate": 9.290162538412256e-05, + "loss": 0.0292, + "step": 1690 + }, + { + "epoch": 2.5590365979866405, + "grad_norm": 0.35163193941116333, + "learning_rate": 9.276603423579164e-05, + "loss": 0.0313, + "step": 1700 + }, + { + "epoch": 2.5590365979866405, + "eval_loss": 0.03919665887951851, + "eval_runtime": 160.6585, + "eval_samples_per_second": 7.351, + "eval_steps_per_second": 7.351, + "step": 1700 + }, + { + "epoch": 2.574089754445385, + "grad_norm": 0.11442722380161285, + "learning_rate": 9.262926119885196e-05, + "loss": 0.0245, + "step": 1710 + }, + { + "epoch": 2.58914291090413, + "grad_norm": 0.40782737731933594, + "learning_rate": 9.249131005318387e-05, + "loss": 0.0281, + "step": 1720 + }, + { + "epoch": 2.6041960673628752, + "grad_norm": 0.3015223443508148, + "learning_rate": 9.235218461122621e-05, + "loss": 0.0292, + "step": 1730 + }, + { + "epoch": 2.61924922382162, + "grad_norm": 0.36578717827796936, + "learning_rate": 9.221188871787075e-05, + "loss": 0.0264, + "step": 1740 + }, + { + "epoch": 2.634302380280365, + "grad_norm": 0.3577549159526825, + "learning_rate": 9.207042625035612e-05, + "loss": 0.0456, + "step": 1750 + }, + { + "epoch": 2.634302380280365, + "eval_loss": 0.03835190087556839, + "eval_runtime": 160.6146, + "eval_samples_per_second": 7.353, + "eval_steps_per_second": 7.353, + "step": 1750 + }, + { + "epoch": 2.64935553673911, + "grad_norm": 0.18290165066719055, + "learning_rate": 9.192780111816047e-05, + "loss": 0.0314, + "step": 1760 + }, + { + "epoch": 2.664408693197855, + "grad_norm": 0.25759264826774597, + "learning_rate": 9.178401726289366e-05, + "loss": 0.025, + "step": 1770 + }, + { + "epoch": 2.6794618496566, + "grad_norm": 0.3371202051639557, + "learning_rate": 9.163907865818806e-05, + "loss": 0.0222, + "step": 1780 + }, + { + "epoch": 2.6945150061153447, + "grad_norm": 0.3024255931377411, + "learning_rate": 9.149298930958896e-05, + "loss": 0.0265, + "step": 1790 + }, + { + "epoch": 2.7095681625740897, + "grad_norm": 0.4768078327178955, + "learning_rate": 9.134575325444376e-05, + "loss": 0.0297, + "step": 1800 + }, + { + "epoch": 2.7095681625740897, + "eval_loss": 0.03965345025062561, + "eval_runtime": 160.5352, + "eval_samples_per_second": 7.357, + "eval_steps_per_second": 7.357, + "step": 1800 + }, + { + "epoch": 2.7246213190328348, + "grad_norm": 0.19347694516181946, + "learning_rate": 9.11973745617904e-05, + "loss": 0.0329, + "step": 1810 + }, + { + "epoch": 2.7396744754915794, + "grad_norm": 0.3491796553134918, + "learning_rate": 9.104785733224496e-05, + "loss": 0.0303, + "step": 1820 + }, + { + "epoch": 2.7547276319503244, + "grad_norm": 0.15914735198020935, + "learning_rate": 9.089720569788824e-05, + "loss": 0.0303, + "step": 1830 + }, + { + "epoch": 2.7697807884090695, + "grad_norm": 0.27251091599464417, + "learning_rate": 9.07454238221517e-05, + "loss": 0.0278, + "step": 1840 + }, + { + "epoch": 2.7848339448678145, + "grad_norm": 0.3514344096183777, + "learning_rate": 9.059251589970223e-05, + "loss": 0.0276, + "step": 1850 + }, + { + "epoch": 2.7848339448678145, + "eval_loss": 0.037223633378744125, + "eval_runtime": 160.5698, + "eval_samples_per_second": 7.355, + "eval_steps_per_second": 7.355, + "step": 1850 + }, + { + "epoch": 2.7998871013265596, + "grad_norm": 0.3742121160030365, + "learning_rate": 9.043848615632642e-05, + "loss": 0.0221, + "step": 1860 + }, + { + "epoch": 2.814940257785304, + "grad_norm": 0.31676194071769714, + "learning_rate": 9.028333884881357e-05, + "loss": 0.0253, + "step": 1870 + }, + { + "epoch": 2.8299934142440493, + "grad_norm": 0.2500811219215393, + "learning_rate": 9.012707826483823e-05, + "loss": 0.0205, + "step": 1880 + }, + { + "epoch": 2.8450465707027943, + "grad_norm": 0.40287694334983826, + "learning_rate": 8.996970872284158e-05, + "loss": 0.0225, + "step": 1890 + }, + { + "epoch": 2.860099727161539, + "grad_norm": 0.22694632411003113, + "learning_rate": 8.98112345719122e-05, + "loss": 0.0309, + "step": 1900 + }, + { + "epoch": 2.860099727161539, + "eval_loss": 0.04200906306505203, + "eval_runtime": 160.5185, + "eval_samples_per_second": 7.357, + "eval_steps_per_second": 7.357, + "step": 1900 + }, + { + "epoch": 2.875152883620284, + "grad_norm": 0.23344391584396362, + "learning_rate": 8.965166019166571e-05, + "loss": 0.0254, + "step": 1910 + }, + { + "epoch": 2.890206040079029, + "grad_norm": 0.24605628848075867, + "learning_rate": 8.949098999212391e-05, + "loss": 0.0202, + "step": 1920 + }, + { + "epoch": 2.905259196537774, + "grad_norm": 0.433034211397171, + "learning_rate": 8.932922841359281e-05, + "loss": 0.0312, + "step": 1930 + }, + { + "epoch": 2.920312352996519, + "grad_norm": 0.5052774548530579, + "learning_rate": 8.916637992653991e-05, + "loss": 0.0239, + "step": 1940 + }, + { + "epoch": 2.9353655094552638, + "grad_norm": 0.5289516448974609, + "learning_rate": 8.90024490314707e-05, + "loss": 0.0272, + "step": 1950 + }, + { + "epoch": 2.9353655094552638, + "eval_loss": 0.03794412687420845, + "eval_runtime": 160.4841, + "eval_samples_per_second": 7.359, + "eval_steps_per_second": 7.359, + "step": 1950 + }, + { + "epoch": 2.950418665914009, + "grad_norm": 0.26798203587532043, + "learning_rate": 8.883744025880428e-05, + "loss": 0.0225, + "step": 1960 + }, + { + "epoch": 2.965471822372754, + "grad_norm": 0.16808480024337769, + "learning_rate": 8.867135816874811e-05, + "loss": 0.0322, + "step": 1970 + }, + { + "epoch": 2.9805249788314985, + "grad_norm": 0.1027529239654541, + "learning_rate": 8.850420735117202e-05, + "loss": 0.0229, + "step": 1980 + }, + { + "epoch": 2.9955781352902435, + "grad_norm": 0.2596558630466461, + "learning_rate": 8.833599242548137e-05, + "loss": 0.0231, + "step": 1990 + }, + { + "epoch": 3.0106312917489886, + "grad_norm": 0.04692067205905914, + "learning_rate": 8.816671804048933e-05, + "loss": 0.0186, + "step": 2000 + }, + { + "epoch": 3.0106312917489886, + "eval_loss": 0.03820064291357994, + "eval_runtime": 160.4606, + "eval_samples_per_second": 7.36, + "eval_steps_per_second": 7.36, + "step": 2000 + }, + { + "epoch": 3.0256844482077336, + "grad_norm": 0.17519836127758026, + "learning_rate": 8.79963888742885e-05, + "loss": 0.0171, + "step": 2010 + }, + { + "epoch": 3.0407376046664787, + "grad_norm": 0.06432295590639114, + "learning_rate": 8.782500963412156e-05, + "loss": 0.0152, + "step": 2020 + }, + { + "epoch": 3.0557907611252233, + "grad_norm": 0.34834784269332886, + "learning_rate": 8.765258505625117e-05, + "loss": 0.0111, + "step": 2030 + }, + { + "epoch": 3.0708439175839684, + "grad_norm": 0.3122510313987732, + "learning_rate": 8.747911990582912e-05, + "loss": 0.0144, + "step": 2040 + }, + { + "epoch": 3.0858970740427134, + "grad_norm": 0.4063514769077301, + "learning_rate": 8.730461897676464e-05, + "loss": 0.0154, + "step": 2050 + }, + { + "epoch": 3.0858970740427134, + "eval_loss": 0.04377365484833717, + "eval_runtime": 160.5906, + "eval_samples_per_second": 7.354, + "eval_steps_per_second": 7.354, + "step": 2050 + }, + { + "epoch": 3.1009502305014585, + "grad_norm": 0.46541234850883484, + "learning_rate": 8.712908709159183e-05, + "loss": 0.0129, + "step": 2060 + }, + { + "epoch": 3.116003386960203, + "grad_norm": 0.09646399319171906, + "learning_rate": 8.695252910133652e-05, + "loss": 0.0223, + "step": 2070 + }, + { + "epoch": 3.131056543418948, + "grad_norm": 0.3003600537776947, + "learning_rate": 8.677494988538211e-05, + "loss": 0.0165, + "step": 2080 + }, + { + "epoch": 3.146109699877693, + "grad_norm": 0.109312504529953, + "learning_rate": 8.659635435133476e-05, + "loss": 0.0153, + "step": 2090 + }, + { + "epoch": 3.1611628563364382, + "grad_norm": 0.17565420269966125, + "learning_rate": 8.641674743488769e-05, + "loss": 0.0148, + "step": 2100 + }, + { + "epoch": 3.1611628563364382, + "eval_loss": 0.042058054357767105, + "eval_runtime": 160.6004, + "eval_samples_per_second": 7.354, + "eval_steps_per_second": 7.354, + "step": 2100 + }, + { + "epoch": 3.176216012795183, + "grad_norm": 0.05583309382200241, + "learning_rate": 8.623613409968492e-05, + "loss": 0.0136, + "step": 2110 + }, + { + "epoch": 3.191269169253928, + "grad_norm": 0.2509431838989258, + "learning_rate": 8.605451933718397e-05, + "loss": 0.0209, + "step": 2120 + }, + { + "epoch": 3.206322325712673, + "grad_norm": 0.18518169224262238, + "learning_rate": 8.5871908166518e-05, + "loss": 0.0202, + "step": 2130 + }, + { + "epoch": 3.221375482171418, + "grad_norm": 0.19731983542442322, + "learning_rate": 8.568830563435694e-05, + "loss": 0.0213, + "step": 2140 + }, + { + "epoch": 3.2364286386301626, + "grad_norm": 0.4004783630371094, + "learning_rate": 8.550371681476829e-05, + "loss": 0.0191, + "step": 2150 + }, + { + "epoch": 3.2364286386301626, + "eval_loss": 0.040727224200963974, + "eval_runtime": 160.6764, + "eval_samples_per_second": 7.35, + "eval_steps_per_second": 7.35, + "step": 2150 + }, + { + "epoch": 3.2514817950889077, + "grad_norm": 0.32673612236976624, + "learning_rate": 8.531814680907664e-05, + "loss": 0.0169, + "step": 2160 + }, + { + "epoch": 3.2665349515476527, + "grad_norm": 0.23968438804149628, + "learning_rate": 8.513160074572279e-05, + "loss": 0.0245, + "step": 2170 + }, + { + "epoch": 3.2815881080063978, + "grad_norm": 0.18281219899654388, + "learning_rate": 8.494408378012209e-05, + "loss": 0.017, + "step": 2180 + }, + { + "epoch": 3.2966412644651424, + "grad_norm": 0.13286392390727997, + "learning_rate": 8.475560109452181e-05, + "loss": 0.0162, + "step": 2190 + }, + { + "epoch": 3.3116944209238874, + "grad_norm": 0.36538711190223694, + "learning_rate": 8.456615789785804e-05, + "loss": 0.0223, + "step": 2200 + }, + { + "epoch": 3.3116944209238874, + "eval_loss": 0.04104102775454521, + "eval_runtime": 160.4918, + "eval_samples_per_second": 7.359, + "eval_steps_per_second": 7.359, + "step": 2200 + }, + { + "epoch": 3.3267475773826325, + "grad_norm": 0.37908822298049927, + "learning_rate": 8.437575942561172e-05, + "loss": 0.0232, + "step": 2210 + }, + { + "epoch": 3.3418007338413775, + "grad_norm": 0.7264277935028076, + "learning_rate": 8.418441093966385e-05, + "loss": 0.0187, + "step": 2220 + }, + { + "epoch": 3.356853890300122, + "grad_norm": 0.40312430262565613, + "learning_rate": 8.39921177281503e-05, + "loss": 0.0104, + "step": 2230 + }, + { + "epoch": 3.371907046758867, + "grad_norm": 0.26982536911964417, + "learning_rate": 8.379888510531535e-05, + "loss": 0.0171, + "step": 2240 + }, + { + "epoch": 3.3869602032176123, + "grad_norm": 0.6724857091903687, + "learning_rate": 8.360471841136513e-05, + "loss": 0.0212, + "step": 2250 + }, + { + "epoch": 3.3869602032176123, + "eval_loss": 0.04086815565824509, + "eval_runtime": 160.6146, + "eval_samples_per_second": 7.353, + "eval_steps_per_second": 7.353, + "step": 2250 + }, + { + "epoch": 3.4020133596763573, + "grad_norm": 0.11547891795635223, + "learning_rate": 8.340962301231981e-05, + "loss": 0.0089, + "step": 2260 + }, + { + "epoch": 3.417066516135102, + "grad_norm": 0.29867950081825256, + "learning_rate": 8.321360429986543e-05, + "loss": 0.0176, + "step": 2270 + }, + { + "epoch": 3.432119672593847, + "grad_norm": 0.2638017237186432, + "learning_rate": 8.301666769120488e-05, + "loss": 0.0152, + "step": 2280 + }, + { + "epoch": 3.447172829052592, + "grad_norm": 0.46061062812805176, + "learning_rate": 8.281881862890813e-05, + "loss": 0.0187, + "step": 2290 + }, + { + "epoch": 3.462225985511337, + "grad_norm": 0.037824250757694244, + "learning_rate": 8.262006258076187e-05, + "loss": 0.0159, + "step": 2300 + }, + { + "epoch": 3.462225985511337, + "eval_loss": 0.04359050467610359, + "eval_runtime": 160.7318, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 2300 + }, + { + "epoch": 3.4772791419700817, + "grad_norm": 0.5161665678024292, + "learning_rate": 8.242040503961844e-05, + "loss": 0.0228, + "step": 2310 + }, + { + "epoch": 3.4923322984288268, + "grad_norm": 0.2847847640514374, + "learning_rate": 8.221985152324385e-05, + "loss": 0.0157, + "step": 2320 + }, + { + "epoch": 3.507385454887572, + "grad_norm": 0.49681392312049866, + "learning_rate": 8.201840757416557e-05, + "loss": 0.0158, + "step": 2330 + }, + { + "epoch": 3.522438611346317, + "grad_norm": 0.14823944866657257, + "learning_rate": 8.18160787595191e-05, + "loss": 0.0202, + "step": 2340 + }, + { + "epoch": 3.5374917678050615, + "grad_norm": 0.40550848841667175, + "learning_rate": 8.161287067089426e-05, + "loss": 0.0109, + "step": 2350 + }, + { + "epoch": 3.5374917678050615, + "eval_loss": 0.04591783136129379, + "eval_runtime": 160.7063, + "eval_samples_per_second": 7.349, + "eval_steps_per_second": 7.349, + "step": 2350 + }, + { + "epoch": 3.5525449242638065, + "grad_norm": 0.3373641073703766, + "learning_rate": 8.14087889241806e-05, + "loss": 0.0192, + "step": 2360 + }, + { + "epoch": 3.5675980807225516, + "grad_norm": 0.38119205832481384, + "learning_rate": 8.120383915941223e-05, + "loss": 0.0196, + "step": 2370 + }, + { + "epoch": 3.582651237181296, + "grad_norm": 0.5549781918525696, + "learning_rate": 8.099802704061195e-05, + "loss": 0.0195, + "step": 2380 + }, + { + "epoch": 3.5977043936400412, + "grad_norm": 0.3053273558616638, + "learning_rate": 8.079135825563466e-05, + "loss": 0.0184, + "step": 2390 + }, + { + "epoch": 3.6127575500987863, + "grad_norm": 0.3129936456680298, + "learning_rate": 8.058383851601027e-05, + "loss": 0.0183, + "step": 2400 + }, + { + "epoch": 3.6127575500987863, + "eval_loss": 0.043686240911483765, + "eval_runtime": 160.6716, + "eval_samples_per_second": 7.35, + "eval_steps_per_second": 7.35, + "step": 2400 + }, + { + "epoch": 3.6278107065575314, + "grad_norm": 0.07833557575941086, + "learning_rate": 8.037547355678577e-05, + "loss": 0.0163, + "step": 2410 + }, + { + "epoch": 3.6428638630162764, + "grad_norm": 0.3321286737918854, + "learning_rate": 8.01662691363668e-05, + "loss": 0.0134, + "step": 2420 + }, + { + "epoch": 3.657917019475021, + "grad_norm": 0.2881935238838196, + "learning_rate": 7.995623103635843e-05, + "loss": 0.0182, + "step": 2430 + }, + { + "epoch": 3.672970175933766, + "grad_norm": 0.3763083815574646, + "learning_rate": 7.974536506140547e-05, + "loss": 0.0189, + "step": 2440 + }, + { + "epoch": 3.688023332392511, + "grad_norm": 0.12157958000898361, + "learning_rate": 7.953367703903196e-05, + "loss": 0.0209, + "step": 2450 + }, + { + "epoch": 3.688023332392511, + "eval_loss": 0.04248056560754776, + "eval_runtime": 160.6004, + "eval_samples_per_second": 7.354, + "eval_steps_per_second": 7.354, + "step": 2450 + }, + { + "epoch": 3.7030764888512557, + "grad_norm": 0.20593494176864624, + "learning_rate": 7.932117281948021e-05, + "loss": 0.0155, + "step": 2460 + }, + { + "epoch": 3.718129645310001, + "grad_norm": 0.18196265399456024, + "learning_rate": 7.910785827554909e-05, + "loss": 0.0143, + "step": 2470 + }, + { + "epoch": 3.733182801768746, + "grad_norm": 0.05911887064576149, + "learning_rate": 7.889373930243164e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 3.748235958227491, + "grad_norm": 0.27829521894454956, + "learning_rate": 7.86788218175523e-05, + "loss": 0.0219, + "step": 2490 + }, + { + "epoch": 3.763289114686236, + "grad_norm": 0.08826201409101486, + "learning_rate": 7.846311176040331e-05, + "loss": 0.0187, + "step": 2500 + }, + { + "epoch": 3.763289114686236, + "eval_loss": 0.04148360714316368, + "eval_runtime": 160.7202, + "eval_samples_per_second": 7.348, + "eval_steps_per_second": 7.348, + "step": 2500 + }, + { + "epoch": 3.7783422711449806, + "grad_norm": 0.17570102214813232, + "learning_rate": 7.824661509238048e-05, + "loss": 0.0145, + "step": 2510 + }, + { + "epoch": 3.7933954276037256, + "grad_norm": 0.49462220072746277, + "learning_rate": 7.802933779661859e-05, + "loss": 0.0205, + "step": 2520 + }, + { + "epoch": 3.8084485840624707, + "grad_norm": 0.2543872892856598, + "learning_rate": 7.781128587782595e-05, + "loss": 0.0171, + "step": 2530 + }, + { + "epoch": 3.8235017405212153, + "grad_norm": 0.13087032735347748, + "learning_rate": 7.759246536211844e-05, + "loss": 0.0143, + "step": 2540 + }, + { + "epoch": 3.8385548969799603, + "grad_norm": 0.17477837204933167, + "learning_rate": 7.737288229685303e-05, + "loss": 0.0176, + "step": 2550 + }, + { + "epoch": 3.8385548969799603, + "eval_loss": 0.041211917996406555, + "eval_runtime": 160.8042, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 7.344, + "step": 2550 + }, + { + "epoch": 3.8536080534387054, + "grad_norm": 0.32363876700401306, + "learning_rate": 7.715254275046062e-05, + "loss": 0.0159, + "step": 2560 + }, + { + "epoch": 3.8686612098974504, + "grad_norm": 0.2837449312210083, + "learning_rate": 7.693145281227834e-05, + "loss": 0.0124, + "step": 2570 + }, + { + "epoch": 3.8837143663561955, + "grad_norm": 0.3394205868244171, + "learning_rate": 7.670961859238124e-05, + "loss": 0.0135, + "step": 2580 + }, + { + "epoch": 3.89876752281494, + "grad_norm": 0.3454907238483429, + "learning_rate": 7.648704622141347e-05, + "loss": 0.0182, + "step": 2590 + }, + { + "epoch": 3.913820679273685, + "grad_norm": 0.43332821130752563, + "learning_rate": 7.626374185041886e-05, + "loss": 0.0201, + "step": 2600 + }, + { + "epoch": 3.913820679273685, + "eval_loss": 0.04203461855649948, + "eval_runtime": 160.6666, + "eval_samples_per_second": 7.351, + "eval_steps_per_second": 7.351, + "step": 2600 + }, + { + "epoch": 3.92887383573243, + "grad_norm": 0.08007007092237473, + "learning_rate": 7.603971165067086e-05, + "loss": 0.0127, + "step": 2610 + }, + { + "epoch": 3.943926992191175, + "grad_norm": 0.3678800165653229, + "learning_rate": 7.581496181350203e-05, + "loss": 0.0155, + "step": 2620 + }, + { + "epoch": 3.95898014864992, + "grad_norm": 0.48592865467071533, + "learning_rate": 7.558949855013299e-05, + "loss": 0.0154, + "step": 2630 + }, + { + "epoch": 3.974033305108665, + "grad_norm": 0.3304588794708252, + "learning_rate": 7.536332809150067e-05, + "loss": 0.0176, + "step": 2640 + }, + { + "epoch": 3.98908646156741, + "grad_norm": 0.16497576236724854, + "learning_rate": 7.513645668808616e-05, + "loss": 0.021, + "step": 2650 + }, + { + "epoch": 3.98908646156741, + "eval_loss": 0.04175440967082977, + "eval_runtime": 160.5638, + "eval_samples_per_second": 7.355, + "eval_steps_per_second": 7.355, + "step": 2650 + }, + { + "epoch": 4.004139618026155, + "grad_norm": 0.24761489033699036, + "learning_rate": 7.490889060974201e-05, + "loss": 0.0188, + "step": 2660 + }, + { + "epoch": 4.0191927744849, + "grad_norm": 0.12707805633544922, + "learning_rate": 7.468063614551884e-05, + "loss": 0.0133, + "step": 2670 + }, + { + "epoch": 4.034245930943645, + "grad_norm": 0.26704150438308716, + "learning_rate": 7.445169960349167e-05, + "loss": 0.0096, + "step": 2680 + }, + { + "epoch": 4.04929908740239, + "grad_norm": 0.33614397048950195, + "learning_rate": 7.422208731058549e-05, + "loss": 0.0099, + "step": 2690 + }, + { + "epoch": 4.064352243861134, + "grad_norm": 0.39196062088012695, + "learning_rate": 7.399180561240044e-05, + "loss": 0.0096, + "step": 2700 + }, + { + "epoch": 4.064352243861134, + "eval_loss": 0.05126947537064552, + "eval_runtime": 160.5262, + "eval_samples_per_second": 7.357, + "eval_steps_per_second": 7.357, + "step": 2700 + }, + { + "epoch": 4.07940540031988, + "grad_norm": 0.22677403688430786, + "learning_rate": 7.376086087303648e-05, + "loss": 0.011, + "step": 2710 + }, + { + "epoch": 4.0944585567786245, + "grad_norm": 0.30137506127357483, + "learning_rate": 7.352925947491746e-05, + "loss": 0.0111, + "step": 2720 + }, + { + "epoch": 4.109511713237369, + "grad_norm": 0.062418729066848755, + "learning_rate": 7.329700781861472e-05, + "loss": 0.0066, + "step": 2730 + }, + { + "epoch": 4.124564869696115, + "grad_norm": 0.07959133386611938, + "learning_rate": 7.306411232267029e-05, + "loss": 0.0055, + "step": 2740 + }, + { + "epoch": 4.139618026154859, + "grad_norm": 0.1270930916070938, + "learning_rate": 7.283057942341939e-05, + "loss": 0.0075, + "step": 2750 + }, + { + "epoch": 4.139618026154859, + "eval_loss": 0.051892608404159546, + "eval_runtime": 160.5415, + "eval_samples_per_second": 7.356, + "eval_steps_per_second": 7.356, + "step": 2750 + }, + { + "epoch": 4.154671182613605, + "grad_norm": 0.1968042105436325, + "learning_rate": 7.259641557481269e-05, + "loss": 0.0065, + "step": 2760 + }, + { + "epoch": 4.169724339072349, + "grad_norm": 0.5984161496162415, + "learning_rate": 7.23616272482378e-05, + "loss": 0.0066, + "step": 2770 + }, + { + "epoch": 4.184777495531094, + "grad_norm": 0.02378808706998825, + "learning_rate": 7.212622093234049e-05, + "loss": 0.0057, + "step": 2780 + }, + { + "epoch": 4.199830651989839, + "grad_norm": 0.5045300126075745, + "learning_rate": 7.18902031328455e-05, + "loss": 0.0133, + "step": 2790 + }, + { + "epoch": 4.214883808448584, + "grad_norm": 0.10351201891899109, + "learning_rate": 7.165358037237643e-05, + "loss": 0.0116, + "step": 2800 + }, + { + "epoch": 4.214883808448584, + "eval_loss": 0.05397958308458328, + "eval_runtime": 160.4211, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 7.362, + "step": 2800 + }, + { + "epoch": 4.229936964907329, + "grad_norm": 0.5372568964958191, + "learning_rate": 7.141635919027587e-05, + "loss": 0.0086, + "step": 2810 + }, + { + "epoch": 4.244990121366074, + "grad_norm": 0.2798357605934143, + "learning_rate": 7.117854614242434e-05, + "loss": 0.01, + "step": 2820 + }, + { + "epoch": 4.260043277824819, + "grad_norm": 0.20511843264102936, + "learning_rate": 7.094014780105931e-05, + "loss": 0.0111, + "step": 2830 + }, + { + "epoch": 4.275096434283564, + "grad_norm": 0.13015638291835785, + "learning_rate": 7.070117075459352e-05, + "loss": 0.0074, + "step": 2840 + }, + { + "epoch": 4.290149590742309, + "grad_norm": 0.6621993780136108, + "learning_rate": 7.046162160743283e-05, + "loss": 0.0088, + "step": 2850 + }, + { + "epoch": 4.290149590742309, + "eval_loss": 0.05007839575409889, + "eval_runtime": 160.4675, + "eval_samples_per_second": 7.36, + "eval_steps_per_second": 7.36, + "step": 2850 + }, + { + "epoch": 4.3052027472010534, + "grad_norm": 0.26351553201675415, + "learning_rate": 7.022150697979384e-05, + "loss": 0.0043, + "step": 2860 + }, + { + "epoch": 4.320255903659799, + "grad_norm": 0.2611856162548065, + "learning_rate": 6.998083350752083e-05, + "loss": 0.0082, + "step": 2870 + }, + { + "epoch": 4.335309060118544, + "grad_norm": 0.04866206645965576, + "learning_rate": 6.973960784190237e-05, + "loss": 0.0056, + "step": 2880 + }, + { + "epoch": 4.350362216577288, + "grad_norm": 0.2370246946811676, + "learning_rate": 6.949783664948753e-05, + "loss": 0.0086, + "step": 2890 + }, + { + "epoch": 4.365415373036034, + "grad_norm": 0.37018248438835144, + "learning_rate": 6.925552661190166e-05, + "loss": 0.0166, + "step": 2900 + }, + { + "epoch": 4.365415373036034, + "eval_loss": 0.050742682069540024, + "eval_runtime": 160.5448, + "eval_samples_per_second": 7.356, + "eval_steps_per_second": 7.356, + "step": 2900 + }, + { + "epoch": 4.380468529494778, + "grad_norm": 0.1219954788684845, + "learning_rate": 6.901268442566172e-05, + "loss": 0.0059, + "step": 2910 + }, + { + "epoch": 4.395521685953524, + "grad_norm": 0.32057642936706543, + "learning_rate": 6.876931680199121e-05, + "loss": 0.0087, + "step": 2920 + }, + { + "epoch": 4.410574842412268, + "grad_norm": 0.339565247297287, + "learning_rate": 6.852543046663467e-05, + "loss": 0.0152, + "step": 2930 + }, + { + "epoch": 4.425627998871013, + "grad_norm": 0.21275486052036285, + "learning_rate": 6.828103215967186e-05, + "loss": 0.0112, + "step": 2940 + }, + { + "epoch": 4.4406811553297585, + "grad_norm": 0.1512928307056427, + "learning_rate": 6.803612863533148e-05, + "loss": 0.0098, + "step": 2950 + }, + { + "epoch": 4.4406811553297585, + "eval_loss": 0.05037567764520645, + "eval_runtime": 160.4337, + "eval_samples_per_second": 7.361, + "eval_steps_per_second": 7.361, + "step": 2950 + }, + { + "epoch": 4.455734311788503, + "grad_norm": 0.05528683960437775, + "learning_rate": 6.779072666180446e-05, + "loss": 0.0095, + "step": 2960 + }, + { + "epoch": 4.470787468247248, + "grad_norm": 0.3433815836906433, + "learning_rate": 6.754483302105696e-05, + "loss": 0.0069, + "step": 2970 + }, + { + "epoch": 4.485840624705993, + "grad_norm": 0.4260675013065338, + "learning_rate": 6.729845450864294e-05, + "loss": 0.0089, + "step": 2980 + }, + { + "epoch": 4.500893781164738, + "grad_norm": 0.18459519743919373, + "learning_rate": 6.705159793351634e-05, + "loss": 0.0106, + "step": 2990 + }, + { + "epoch": 4.515946937623482, + "grad_norm": 0.35230037569999695, + "learning_rate": 6.680427011784292e-05, + "loss": 0.0076, + "step": 3000 + }, + { + "epoch": 4.515946937623482, + "eval_loss": 0.0548754557967186, + "eval_runtime": 160.4116, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 7.362, + "step": 3000 + }, + { + "epoch": 4.531000094082228, + "grad_norm": 0.19794873893260956, + "learning_rate": 6.655647789681166e-05, + "loss": 0.0108, + "step": 3010 + }, + { + "epoch": 4.5460532505409725, + "grad_norm": 0.21396826207637787, + "learning_rate": 6.630822811844604e-05, + "loss": 0.0087, + "step": 3020 + }, + { + "epoch": 4.561106406999718, + "grad_norm": 0.19450531899929047, + "learning_rate": 6.605952764341453e-05, + "loss": 0.0093, + "step": 3030 + }, + { + "epoch": 4.576159563458463, + "grad_norm": 0.4334748387336731, + "learning_rate": 6.58103833448412e-05, + "loss": 0.0177, + "step": 3040 + }, + { + "epoch": 4.591212719917207, + "grad_norm": 0.2196117788553238, + "learning_rate": 6.556080210811569e-05, + "loss": 0.0175, + "step": 3050 + }, + { + "epoch": 4.591212719917207, + "eval_loss": 0.04699844494462013, + "eval_runtime": 160.3917, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3050 + }, + { + "epoch": 4.606265876375953, + "grad_norm": 0.14767540991306305, + "learning_rate": 6.531079083070288e-05, + "loss": 0.009, + "step": 3060 + }, + { + "epoch": 4.621319032834697, + "grad_norm": 0.10351859033107758, + "learning_rate": 6.506035642195238e-05, + "loss": 0.006, + "step": 3070 + }, + { + "epoch": 4.636372189293443, + "grad_norm": 0.21928033232688904, + "learning_rate": 6.480950580290752e-05, + "loss": 0.0124, + "step": 3080 + }, + { + "epoch": 4.6514253457521875, + "grad_norm": 0.1401652842760086, + "learning_rate": 6.455824590611398e-05, + "loss": 0.0112, + "step": 3090 + }, + { + "epoch": 4.666478502210932, + "grad_norm": 0.3622473478317261, + "learning_rate": 6.430658367542843e-05, + "loss": 0.0116, + "step": 3100 + }, + { + "epoch": 4.666478502210932, + "eval_loss": 0.05732714757323265, + "eval_runtime": 160.4003, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3100 + }, + { + "epoch": 4.681531658669678, + "grad_norm": 0.32070496678352356, + "learning_rate": 6.405452606582647e-05, + "loss": 0.0194, + "step": 3110 + }, + { + "epoch": 4.696584815128422, + "grad_norm": 0.19401530921459198, + "learning_rate": 6.380208004321036e-05, + "loss": 0.018, + "step": 3120 + }, + { + "epoch": 4.711637971587168, + "grad_norm": 0.11467122286558151, + "learning_rate": 6.354925258421675e-05, + "loss": 0.0096, + "step": 3130 + }, + { + "epoch": 4.726691128045912, + "grad_norm": 0.04464598000049591, + "learning_rate": 6.32960506760236e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 4.741744284504657, + "grad_norm": 0.19807831943035126, + "learning_rate": 6.304248131615724e-05, + "loss": 0.0157, + "step": 3150 + }, + { + "epoch": 4.741744284504657, + "eval_loss": 0.04563054069876671, + "eval_runtime": 160.4474, + "eval_samples_per_second": 7.361, + "eval_steps_per_second": 7.361, + "step": 3150 + }, + { + "epoch": 4.756797440963402, + "grad_norm": 0.14441685378551483, + "learning_rate": 6.278855151229901e-05, + "loss": 0.0072, + "step": 3160 + }, + { + "epoch": 4.771850597422147, + "grad_norm": 0.1856829822063446, + "learning_rate": 6.253426828209143e-05, + "loss": 0.0087, + "step": 3170 + }, + { + "epoch": 4.786903753880892, + "grad_norm": 0.21941307187080383, + "learning_rate": 6.227963865294444e-05, + "loss": 0.0054, + "step": 3180 + }, + { + "epoch": 4.801956910339637, + "grad_norm": 0.3553565442562103, + "learning_rate": 6.202466966184112e-05, + "loss": 0.0108, + "step": 3190 + }, + { + "epoch": 4.817010066798382, + "grad_norm": 0.44985368847846985, + "learning_rate": 6.176936835514312e-05, + "loss": 0.0079, + "step": 3200 + }, + { + "epoch": 4.817010066798382, + "eval_loss": 0.05075014382600784, + "eval_runtime": 160.3914, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3200 + }, + { + "epoch": 4.832063223257126, + "grad_norm": 0.22725753486156464, + "learning_rate": 6.151374178839614e-05, + "loss": 0.0068, + "step": 3210 + }, + { + "epoch": 4.847116379715872, + "grad_norm": 0.2145383358001709, + "learning_rate": 6.125779702613471e-05, + "loss": 0.006, + "step": 3220 + }, + { + "epoch": 4.8621695361746164, + "grad_norm": 0.11892041563987732, + "learning_rate": 6.1001541141687105e-05, + "loss": 0.0085, + "step": 3230 + }, + { + "epoch": 4.877222692633362, + "grad_norm": 0.4456254243850708, + "learning_rate": 6.074498121697983e-05, + "loss": 0.0189, + "step": 3240 + }, + { + "epoch": 4.8922758490921066, + "grad_norm": 0.26635292172431946, + "learning_rate": 6.048812434234189e-05, + "loss": 0.0091, + "step": 3250 + }, + { + "epoch": 4.8922758490921066, + "eval_loss": 0.0511915497481823, + "eval_runtime": 160.5138, + "eval_samples_per_second": 7.358, + "eval_steps_per_second": 7.358, + "step": 3250 + }, + { + "epoch": 4.907329005550851, + "grad_norm": 0.07702851295471191, + "learning_rate": 6.023097761630879e-05, + "loss": 0.0088, + "step": 3260 + }, + { + "epoch": 4.922382162009597, + "grad_norm": 0.4732000529766083, + "learning_rate": 5.997354814542649e-05, + "loss": 0.0129, + "step": 3270 + }, + { + "epoch": 4.937435318468341, + "grad_norm": 0.16142740845680237, + "learning_rate": 5.971584304405489e-05, + "loss": 0.0139, + "step": 3280 + }, + { + "epoch": 4.952488474927087, + "grad_norm": 0.06730391830205917, + "learning_rate": 5.9457869434171234e-05, + "loss": 0.0091, + "step": 3290 + }, + { + "epoch": 4.967541631385831, + "grad_norm": 0.2761211097240448, + "learning_rate": 5.919963444517338e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 4.967541631385831, + "eval_loss": 0.04786451533436775, + "eval_runtime": 160.4067, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3300 + }, + { + "epoch": 4.982594787844576, + "grad_norm": 0.3200295567512512, + "learning_rate": 5.8941145213682594e-05, + "loss": 0.0083, + "step": 3310 + }, + { + "epoch": 4.9976479443033215, + "grad_norm": 0.781211793422699, + "learning_rate": 5.868240888334653e-05, + "loss": 0.0065, + "step": 3320 + }, + { + "epoch": 5.012701100762066, + "grad_norm": 0.4308778643608093, + "learning_rate": 5.8423432604641636e-05, + "loss": 0.0049, + "step": 3330 + }, + { + "epoch": 5.027754257220811, + "grad_norm": 0.19739733636379242, + "learning_rate": 5.816422353467562e-05, + "loss": 0.0075, + "step": 3340 + }, + { + "epoch": 5.042807413679556, + "grad_norm": 0.3251270353794098, + "learning_rate": 5.79047888369897e-05, + "loss": 0.0042, + "step": 3350 + }, + { + "epoch": 5.042807413679556, + "eval_loss": 0.05960267782211304, + "eval_runtime": 160.4056, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3350 + }, + { + "epoch": 5.057860570138301, + "grad_norm": 0.051357630640268326, + "learning_rate": 5.7645135681360496e-05, + "loss": 0.0038, + "step": 3360 + }, + { + "epoch": 5.072913726597045, + "grad_norm": 0.19079819321632385, + "learning_rate": 5.738527124360199e-05, + "loss": 0.0036, + "step": 3370 + }, + { + "epoch": 5.087966883055791, + "grad_norm": 0.512081503868103, + "learning_rate": 5.7125202705367234e-05, + "loss": 0.0046, + "step": 3380 + }, + { + "epoch": 5.1030200395145355, + "grad_norm": 0.27560409903526306, + "learning_rate": 5.686493725394978e-05, + "loss": 0.0064, + "step": 3390 + }, + { + "epoch": 5.118073195973281, + "grad_norm": 0.3109549880027771, + "learning_rate": 5.660448208208513e-05, + "loss": 0.0059, + "step": 3400 + }, + { + "epoch": 5.118073195973281, + "eval_loss": 0.05848437920212746, + "eval_runtime": 160.5028, + "eval_samples_per_second": 7.358, + "eval_steps_per_second": 7.358, + "step": 3400 + }, + { + "epoch": 5.133126352432026, + "grad_norm": 0.1946442574262619, + "learning_rate": 5.63438443877519e-05, + "loss": 0.0058, + "step": 3410 + }, + { + "epoch": 5.14817950889077, + "grad_norm": 0.3648136258125305, + "learning_rate": 5.608303137397294e-05, + "loss": 0.0105, + "step": 3420 + }, + { + "epoch": 5.163232665349516, + "grad_norm": 0.09449704736471176, + "learning_rate": 5.5822050248616285e-05, + "loss": 0.0024, + "step": 3430 + }, + { + "epoch": 5.17828582180826, + "grad_norm": 0.008435610681772232, + "learning_rate": 5.5560908224195886e-05, + "loss": 0.0047, + "step": 3440 + }, + { + "epoch": 5.193338978267005, + "grad_norm": 0.09024719893932343, + "learning_rate": 5.5299612517672325e-05, + "loss": 0.0041, + "step": 3450 + }, + { + "epoch": 5.193338978267005, + "eval_loss": 0.06566525995731354, + "eval_runtime": 160.3913, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3450 + }, + { + "epoch": 5.2083921347257505, + "grad_norm": 0.4751967787742615, + "learning_rate": 5.503817035025342e-05, + "loss": 0.0046, + "step": 3460 + }, + { + "epoch": 5.223445291184495, + "grad_norm": 0.48634153604507446, + "learning_rate": 5.4776588947194526e-05, + "loss": 0.0026, + "step": 3470 + }, + { + "epoch": 5.238498447643241, + "grad_norm": 0.32989442348480225, + "learning_rate": 5.4514875537598985e-05, + "loss": 0.0065, + "step": 3480 + }, + { + "epoch": 5.253551604101985, + "grad_norm": 0.057971298694610596, + "learning_rate": 5.425303735421828e-05, + "loss": 0.0082, + "step": 3490 + }, + { + "epoch": 5.26860476056073, + "grad_norm": 0.05405077338218689, + "learning_rate": 5.399108163325217e-05, + "loss": 0.0027, + "step": 3500 + }, + { + "epoch": 5.26860476056073, + "eval_loss": 0.06462710350751877, + "eval_runtime": 160.4488, + "eval_samples_per_second": 7.361, + "eval_steps_per_second": 7.361, + "step": 3500 + }, + { + "epoch": 5.283657917019475, + "grad_norm": 0.4236180782318115, + "learning_rate": 5.3729015614148693e-05, + "loss": 0.0065, + "step": 3510 + }, + { + "epoch": 5.29871107347822, + "grad_norm": 0.05975193902850151, + "learning_rate": 5.346684653940408e-05, + "loss": 0.0028, + "step": 3520 + }, + { + "epoch": 5.3137642299369645, + "grad_norm": 0.16054610908031464, + "learning_rate": 5.320458165436268e-05, + "loss": 0.0053, + "step": 3530 + }, + { + "epoch": 5.32881738639571, + "grad_norm": 0.1953929215669632, + "learning_rate": 5.294222820701661e-05, + "loss": 0.0068, + "step": 3540 + }, + { + "epoch": 5.343870542854455, + "grad_norm": 0.15068404376506805, + "learning_rate": 5.267979344780555e-05, + "loss": 0.0021, + "step": 3550 + }, + { + "epoch": 5.343870542854455, + "eval_loss": 0.059755634516477585, + "eval_runtime": 160.4103, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 7.362, + "step": 3550 + }, + { + "epoch": 5.3589236993132, + "grad_norm": 0.14032387733459473, + "learning_rate": 5.24172846294163e-05, + "loss": 0.0041, + "step": 3560 + }, + { + "epoch": 5.373976855771945, + "grad_norm": 0.054425228387117386, + "learning_rate": 5.215470900658237e-05, + "loss": 0.006, + "step": 3570 + }, + { + "epoch": 5.389030012230689, + "grad_norm": 0.3048708438873291, + "learning_rate": 5.1892073835883524e-05, + "loss": 0.0106, + "step": 3580 + }, + { + "epoch": 5.404083168689435, + "grad_norm": 0.22547312080860138, + "learning_rate": 5.162938637554516e-05, + "loss": 0.004, + "step": 3590 + }, + { + "epoch": 5.4191363251481794, + "grad_norm": 0.020732874050736427, + "learning_rate": 5.136665388523778e-05, + "loss": 0.0011, + "step": 3600 + }, + { + "epoch": 5.4191363251481794, + "eval_loss": 0.059289801865816116, + "eval_runtime": 160.3839, + "eval_samples_per_second": 7.364, + "eval_steps_per_second": 7.364, + "step": 3600 + }, + { + "epoch": 5.434189481606924, + "grad_norm": 0.04097124934196472, + "learning_rate": 5.1103883625876335e-05, + "loss": 0.0048, + "step": 3610 + }, + { + "epoch": 5.4492426380656696, + "grad_norm": 0.04979607090353966, + "learning_rate": 5.0841082859419585e-05, + "loss": 0.0047, + "step": 3620 + }, + { + "epoch": 5.464295794524414, + "grad_norm": 0.0677928477525711, + "learning_rate": 5.057825884866935e-05, + "loss": 0.0067, + "step": 3630 + }, + { + "epoch": 5.47934895098316, + "grad_norm": 0.30655401945114136, + "learning_rate": 5.031541885706987e-05, + "loss": 0.0044, + "step": 3640 + }, + { + "epoch": 5.494402107441904, + "grad_norm": 0.5029659271240234, + "learning_rate": 5.005257014850701e-05, + "loss": 0.0028, + "step": 3650 + }, + { + "epoch": 5.494402107441904, + "eval_loss": 0.05830836668610573, + "eval_runtime": 160.5019, + "eval_samples_per_second": 7.358, + "eval_steps_per_second": 7.358, + "step": 3650 + }, + { + "epoch": 5.509455263900649, + "grad_norm": 0.4105170667171478, + "learning_rate": 4.9789719987107545e-05, + "loss": 0.0078, + "step": 3660 + }, + { + "epoch": 5.524508420359394, + "grad_norm": 0.0587475448846817, + "learning_rate": 4.952687563703841e-05, + "loss": 0.0032, + "step": 3670 + }, + { + "epoch": 5.539561576818139, + "grad_norm": 0.8135388493537903, + "learning_rate": 4.926404436230596e-05, + "loss": 0.0084, + "step": 3680 + }, + { + "epoch": 5.554614733276884, + "grad_norm": 0.07115081697702408, + "learning_rate": 4.900123342655511e-05, + "loss": 0.0055, + "step": 3690 + }, + { + "epoch": 5.569667889735629, + "grad_norm": 0.2204965054988861, + "learning_rate": 4.8738450092868785e-05, + "loss": 0.0069, + "step": 3700 + }, + { + "epoch": 5.569667889735629, + "eval_loss": 0.0527074933052063, + "eval_runtime": 160.3354, + "eval_samples_per_second": 7.366, + "eval_steps_per_second": 7.366, + "step": 3700 + }, + { + "epoch": 5.584721046194374, + "grad_norm": 0.07341840863227844, + "learning_rate": 4.847570162356703e-05, + "loss": 0.0039, + "step": 3710 + }, + { + "epoch": 5.599774202653119, + "grad_norm": 0.003633906366303563, + "learning_rate": 4.8212995280006426e-05, + "loss": 0.0037, + "step": 3720 + }, + { + "epoch": 5.614827359111864, + "grad_norm": 0.25106242299079895, + "learning_rate": 4.7950338322379294e-05, + "loss": 0.0038, + "step": 3730 + }, + { + "epoch": 5.629880515570608, + "grad_norm": 0.23045752942562103, + "learning_rate": 4.76877380095132e-05, + "loss": 0.0077, + "step": 3740 + }, + { + "epoch": 5.644933672029354, + "grad_norm": 0.09880348294973373, + "learning_rate": 4.742520159867018e-05, + "loss": 0.0044, + "step": 3750 + }, + { + "epoch": 5.644933672029354, + "eval_loss": 0.05745174363255501, + "eval_runtime": 160.2876, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 3750 + }, + { + "epoch": 5.6599868284880985, + "grad_norm": 0.1467890739440918, + "learning_rate": 4.7162736345346303e-05, + "loss": 0.0034, + "step": 3760 + }, + { + "epoch": 5.675039984946844, + "grad_norm": 0.24932341277599335, + "learning_rate": 4.690034950307115e-05, + "loss": 0.0046, + "step": 3770 + }, + { + "epoch": 5.690093141405589, + "grad_norm": 0.4808225631713867, + "learning_rate": 4.663804832320726e-05, + "loss": 0.0064, + "step": 3780 + }, + { + "epoch": 5.705146297864333, + "grad_norm": 0.33825623989105225, + "learning_rate": 4.637584005474987e-05, + "loss": 0.0072, + "step": 3790 + }, + { + "epoch": 5.720199454323079, + "grad_norm": 0.6621521711349487, + "learning_rate": 4.6113731944126406e-05, + "loss": 0.005, + "step": 3800 + }, + { + "epoch": 5.720199454323079, + "eval_loss": 0.06031941995024681, + "eval_runtime": 160.3617, + "eval_samples_per_second": 7.365, + "eval_steps_per_second": 7.365, + "step": 3800 + }, + { + "epoch": 5.735252610781823, + "grad_norm": 0.2407289445400238, + "learning_rate": 4.58517312349964e-05, + "loss": 0.0033, + "step": 3810 + }, + { + "epoch": 5.750305767240568, + "grad_norm": 0.047928955405950546, + "learning_rate": 4.558984516805118e-05, + "loss": 0.002, + "step": 3820 + }, + { + "epoch": 5.7653589236993135, + "grad_norm": 0.3384752571582794, + "learning_rate": 4.5328080980813815e-05, + "loss": 0.0021, + "step": 3830 + }, + { + "epoch": 5.780412080158058, + "grad_norm": 0.03273696079850197, + "learning_rate": 4.5066445907439104e-05, + "loss": 0.0046, + "step": 3840 + }, + { + "epoch": 5.795465236616803, + "grad_norm": 0.0050367917865514755, + "learning_rate": 4.480494717851359e-05, + "loss": 0.0034, + "step": 3850 + }, + { + "epoch": 5.795465236616803, + "eval_loss": 0.0663178339600563, + "eval_runtime": 160.4267, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 7.362, + "step": 3850 + }, + { + "epoch": 5.810518393075548, + "grad_norm": 0.22678084671497345, + "learning_rate": 4.454359202085582e-05, + "loss": 0.0058, + "step": 3860 + }, + { + "epoch": 5.825571549534293, + "grad_norm": 0.18142001330852509, + "learning_rate": 4.4282387657316574e-05, + "loss": 0.01, + "step": 3870 + }, + { + "epoch": 5.840624705993038, + "grad_norm": 0.03086080029606819, + "learning_rate": 4.402134130657925e-05, + "loss": 0.0039, + "step": 3880 + }, + { + "epoch": 5.855677862451783, + "grad_norm": 1.3688685894012451, + "learning_rate": 4.376046018296043e-05, + "loss": 0.0062, + "step": 3890 + }, + { + "epoch": 5.8707310189105275, + "grad_norm": 0.03379492834210396, + "learning_rate": 4.349975149621039e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 5.8707310189105275, + "eval_loss": 0.05438974127173424, + "eval_runtime": 160.355, + "eval_samples_per_second": 7.365, + "eval_steps_per_second": 7.365, + "step": 3900 + }, + { + "epoch": 5.885784175369273, + "grad_norm": 0.1113867238163948, + "learning_rate": 4.3239222451313924e-05, + "loss": 0.0048, + "step": 3910 + }, + { + "epoch": 5.900837331828018, + "grad_norm": 0.6470983028411865, + "learning_rate": 4.297888024829126e-05, + "loss": 0.0058, + "step": 3920 + }, + { + "epoch": 5.915890488286763, + "grad_norm": 0.04651109501719475, + "learning_rate": 4.2718732081998985e-05, + "loss": 0.0071, + "step": 3930 + }, + { + "epoch": 5.930943644745508, + "grad_norm": 0.20141349732875824, + "learning_rate": 4.2458785141931314e-05, + "loss": 0.0104, + "step": 3940 + }, + { + "epoch": 5.945996801204252, + "grad_norm": 0.08963126689195633, + "learning_rate": 4.21990466120213e-05, + "loss": 0.0059, + "step": 3950 + }, + { + "epoch": 5.945996801204252, + "eval_loss": 0.05330498889088631, + "eval_runtime": 160.4006, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 3950 + }, + { + "epoch": 5.961049957662998, + "grad_norm": 0.0755634531378746, + "learning_rate": 4.1939523670442316e-05, + "loss": 0.0031, + "step": 3960 + }, + { + "epoch": 5.976103114121742, + "grad_norm": 0.007666358258575201, + "learning_rate": 4.168022348940978e-05, + "loss": 0.0081, + "step": 3970 + }, + { + "epoch": 5.991156270580487, + "grad_norm": 0.014299154281616211, + "learning_rate": 4.14211532349828e-05, + "loss": 0.0049, + "step": 3980 + }, + { + "epoch": 6.0062094270392326, + "grad_norm": 0.008767153136432171, + "learning_rate": 4.1162320066866236e-05, + "loss": 0.0035, + "step": 3990 + }, + { + "epoch": 6.021262583497977, + "grad_norm": 0.05362703651189804, + "learning_rate": 4.090373113821281e-05, + "loss": 0.0021, + "step": 4000 + }, + { + "epoch": 6.021262583497977, + "eval_loss": 0.05973425135016441, + "eval_runtime": 160.4067, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 7.363, + "step": 4000 + }, + { + "epoch": 6.036315739956722, + "grad_norm": 0.05829249694943428, + "learning_rate": 4.0645393595425323e-05, + "loss": 0.0015, + "step": 4010 + }, + { + "epoch": 6.051368896415467, + "grad_norm": 0.024488981813192368, + "learning_rate": 4.0387314577959315e-05, + "loss": 0.0057, + "step": 4020 + }, + { + "epoch": 6.066422052874212, + "grad_norm": 0.18860554695129395, + "learning_rate": 4.012950121812565e-05, + "loss": 0.0032, + "step": 4030 + }, + { + "epoch": 6.081475209332957, + "grad_norm": 0.020918726921081543, + "learning_rate": 3.987196064089346e-05, + "loss": 0.0024, + "step": 4040 + }, + { + "epoch": 6.096528365791702, + "grad_norm": 0.00897147599607706, + "learning_rate": 3.961469996369319e-05, + "loss": 0.0017, + "step": 4050 + }, + { + "epoch": 6.096528365791702, + "eval_loss": 0.06118471547961235, + "eval_runtime": 160.445, + "eval_samples_per_second": 7.361, + "eval_steps_per_second": 7.361, + "step": 4050 + }, + { + "epoch": 6.111581522250447, + "grad_norm": 0.23106351494789124, + "learning_rate": 3.935772629621995e-05, + "loss": 0.004, + "step": 4060 + }, + { + "epoch": 6.126634678709192, + "grad_norm": 0.034407466650009155, + "learning_rate": 3.910104674023696e-05, + "loss": 0.001, + "step": 4070 + }, + { + "epoch": 6.141687835167937, + "grad_norm": 0.20165956020355225, + "learning_rate": 3.8844668389379396e-05, + "loss": 0.0033, + "step": 4080 + }, + { + "epoch": 6.156740991626681, + "grad_norm": 0.025097308680415154, + "learning_rate": 3.858859832895822e-05, + "loss": 0.0028, + "step": 4090 + }, + { + "epoch": 6.171794148085427, + "grad_norm": 0.12382583320140839, + "learning_rate": 3.833284363576447e-05, + "loss": 0.0065, + "step": 4100 + }, + { + "epoch": 6.171794148085427, + "eval_loss": 0.060556113719940186, + "eval_runtime": 160.2461, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 7.37, + "step": 4100 + }, + { + "epoch": 6.186847304544171, + "grad_norm": 0.00555534940212965, + "learning_rate": 3.8077411377873675e-05, + "loss": 0.0019, + "step": 4110 + }, + { + "epoch": 6.201900461002917, + "grad_norm": 0.0636376440525055, + "learning_rate": 3.7822308614450406e-05, + "loss": 0.0022, + "step": 4120 + }, + { + "epoch": 6.2169536174616615, + "grad_norm": 0.12849709391593933, + "learning_rate": 3.7567542395553345e-05, + "loss": 0.0009, + "step": 4130 + }, + { + "epoch": 6.232006773920406, + "grad_norm": 0.007057442329823971, + "learning_rate": 3.7313119761940375e-05, + "loss": 0.0013, + "step": 4140 + }, + { + "epoch": 6.247059930379152, + "grad_norm": 0.4326237440109253, + "learning_rate": 3.705904774487396e-05, + "loss": 0.001, + "step": 4150 + }, + { + "epoch": 6.247059930379152, + "eval_loss": 0.0654623731970787, + "eval_runtime": 160.3832, + "eval_samples_per_second": 7.364, + "eval_steps_per_second": 7.364, + "step": 4150 + }, + { + "epoch": 6.262113086837896, + "grad_norm": 0.815345823764801, + "learning_rate": 3.680533336592694e-05, + "loss": 0.0063, + "step": 4160 + }, + { + "epoch": 6.277166243296641, + "grad_norm": 0.19209644198417664, + "learning_rate": 3.6551983636788336e-05, + "loss": 0.0019, + "step": 4170 + }, + { + "epoch": 6.292219399755386, + "grad_norm": 0.02730114571750164, + "learning_rate": 3.62990055590697e-05, + "loss": 0.0021, + "step": 4180 + }, + { + "epoch": 6.307272556214131, + "grad_norm": 0.06521335989236832, + "learning_rate": 3.604640612411156e-05, + "loss": 0.0019, + "step": 4190 + }, + { + "epoch": 6.3223257126728765, + "grad_norm": 0.09775713831186295, + "learning_rate": 3.579419231279023e-05, + "loss": 0.0035, + "step": 4200 + }, + { + "epoch": 6.3223257126728765, + "eval_loss": 0.06749049574136734, + "eval_runtime": 160.3287, + "eval_samples_per_second": 7.366, + "eval_steps_per_second": 7.366, + "step": 4200 + }, + { + "epoch": 6.337378869131621, + "grad_norm": 0.2489928901195526, + "learning_rate": 3.554237109532483e-05, + "loss": 0.0045, + "step": 4210 + }, + { + "epoch": 6.352432025590366, + "grad_norm": 0.06490439176559448, + "learning_rate": 3.529094943108475e-05, + "loss": 0.0025, + "step": 4220 + }, + { + "epoch": 6.367485182049111, + "grad_norm": 0.4271223545074463, + "learning_rate": 3.5039934268397225e-05, + "loss": 0.0015, + "step": 4230 + }, + { + "epoch": 6.382538338507856, + "grad_norm": 0.1668974757194519, + "learning_rate": 3.478933254435534e-05, + "loss": 0.0051, + "step": 4240 + }, + { + "epoch": 6.3975914949666, + "grad_norm": 0.012954922392964363, + "learning_rate": 3.4539151184626385e-05, + "loss": 0.0058, + "step": 4250 + }, + { + "epoch": 6.3975914949666, + "eval_loss": 0.061912406235933304, + "eval_runtime": 160.2963, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 4250 + }, + { + "epoch": 6.412644651425346, + "grad_norm": 0.20001620054244995, + "learning_rate": 3.4289397103260346e-05, + "loss": 0.0039, + "step": 4260 + }, + { + "epoch": 6.4276978078840905, + "grad_norm": 0.15622283518314362, + "learning_rate": 3.4040077202498916e-05, + "loss": 0.0027, + "step": 4270 + }, + { + "epoch": 6.442750964342836, + "grad_norm": 0.3287314772605896, + "learning_rate": 3.3791198372584664e-05, + "loss": 0.0036, + "step": 4280 + }, + { + "epoch": 6.457804120801581, + "grad_norm": 0.05680296942591667, + "learning_rate": 3.3542767491570695e-05, + "loss": 0.001, + "step": 4290 + }, + { + "epoch": 6.472857277260325, + "grad_norm": 0.006026843097060919, + "learning_rate": 3.329479142513051e-05, + "loss": 0.0025, + "step": 4300 + }, + { + "epoch": 6.472857277260325, + "eval_loss": 0.06358759850263596, + "eval_runtime": 160.3526, + "eval_samples_per_second": 7.365, + "eval_steps_per_second": 7.365, + "step": 4300 + }, + { + "epoch": 6.487910433719071, + "grad_norm": 0.04567964747548103, + "learning_rate": 3.304727702636832e-05, + "loss": 0.001, + "step": 4310 + }, + { + "epoch": 6.502963590177815, + "grad_norm": 0.013553903438150883, + "learning_rate": 3.280023113562957e-05, + "loss": 0.0037, + "step": 4320 + }, + { + "epoch": 6.51801674663656, + "grad_norm": 0.20995531976222992, + "learning_rate": 3.255366058031196e-05, + "loss": 0.0027, + "step": 4330 + }, + { + "epoch": 6.533069903095305, + "grad_norm": 0.06071958318352699, + "learning_rate": 3.230757217467677e-05, + "loss": 0.0009, + "step": 4340 + }, + { + "epoch": 6.54812305955405, + "grad_norm": 0.12979112565517426, + "learning_rate": 3.206197271966049e-05, + "loss": 0.002, + "step": 4350 + }, + { + "epoch": 6.54812305955405, + "eval_loss": 0.06813929229974747, + "eval_runtime": 160.2628, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 4350 + }, + { + "epoch": 6.5631762160127955, + "grad_norm": 0.11185190081596375, + "learning_rate": 3.1816869002686936e-05, + "loss": 0.0022, + "step": 4360 + }, + { + "epoch": 6.57822937247154, + "grad_norm": 0.3746158480644226, + "learning_rate": 3.157226779747958e-05, + "loss": 0.0008, + "step": 4370 + }, + { + "epoch": 6.593282528930285, + "grad_norm": 0.005143607500940561, + "learning_rate": 3.1328175863874464e-05, + "loss": 0.0021, + "step": 4380 + }, + { + "epoch": 6.60833568538903, + "grad_norm": 0.010787163861095905, + "learning_rate": 3.1084599947633256e-05, + "loss": 0.0021, + "step": 4390 + }, + { + "epoch": 6.623388841847775, + "grad_norm": 0.004909005016088486, + "learning_rate": 3.084154678025692e-05, + "loss": 0.0008, + "step": 4400 + }, + { + "epoch": 6.623388841847775, + "eval_loss": 0.07205173373222351, + "eval_runtime": 160.2709, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 4400 + }, + { + "epoch": 6.63844199830652, + "grad_norm": 0.5811508297920227, + "learning_rate": 3.059902307879967e-05, + "loss": 0.0077, + "step": 4410 + }, + { + "epoch": 6.653495154765265, + "grad_norm": 0.020976049825549126, + "learning_rate": 3.035703554568331e-05, + "loss": 0.0028, + "step": 4420 + }, + { + "epoch": 6.66854831122401, + "grad_norm": 0.04433680698275566, + "learning_rate": 3.011559086851201e-05, + "loss": 0.0103, + "step": 4430 + }, + { + "epoch": 6.683601467682755, + "grad_norm": 0.2687906324863434, + "learning_rate": 2.9874695719887464e-05, + "loss": 0.0041, + "step": 4440 + }, + { + "epoch": 6.6986546241415, + "grad_norm": 0.015897583216428757, + "learning_rate": 2.9634356757224563e-05, + "loss": 0.0025, + "step": 4450 + }, + { + "epoch": 6.6986546241415, + "eval_loss": 0.06026773527264595, + "eval_runtime": 160.4208, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 7.362, + "step": 4450 + }, + { + "epoch": 6.713707780600244, + "grad_norm": 0.028956053778529167, + "learning_rate": 2.9394580622567312e-05, + "loss": 0.0011, + "step": 4460 + }, + { + "epoch": 6.72876093705899, + "grad_norm": 0.25893718004226685, + "learning_rate": 2.9155373942405372e-05, + "loss": 0.0051, + "step": 4470 + }, + { + "epoch": 6.743814093517734, + "grad_norm": 0.015201851725578308, + "learning_rate": 2.8916743327490803e-05, + "loss": 0.0032, + "step": 4480 + }, + { + "epoch": 6.758867249976479, + "grad_norm": 0.5931967496871948, + "learning_rate": 2.8678695372655496e-05, + "loss": 0.0035, + "step": 4490 + }, + { + "epoch": 6.7739204064352245, + "grad_norm": 0.0724746510386467, + "learning_rate": 2.8441236656628828e-05, + "loss": 0.0021, + "step": 4500 + }, + { + "epoch": 6.7739204064352245, + "eval_loss": 0.0615006722509861, + "eval_runtime": 160.2833, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 4500 + }, + { + "epoch": 6.788973562893969, + "grad_norm": 0.03488875925540924, + "learning_rate": 2.820437374185587e-05, + "loss": 0.0023, + "step": 4510 + }, + { + "epoch": 6.804026719352715, + "grad_norm": 0.2586740255355835, + "learning_rate": 2.79681131743161e-05, + "loss": 0.0019, + "step": 4520 + }, + { + "epoch": 6.819079875811459, + "grad_norm": 0.0354912094771862, + "learning_rate": 2.7732461483342393e-05, + "loss": 0.0049, + "step": 4530 + }, + { + "epoch": 6.834133032270204, + "grad_norm": 0.44501015543937683, + "learning_rate": 2.7497425181440607e-05, + "loss": 0.0016, + "step": 4540 + }, + { + "epoch": 6.849186188728949, + "grad_norm": 0.019058121368288994, + "learning_rate": 2.726301076410963e-05, + "loss": 0.0032, + "step": 4550 + }, + { + "epoch": 6.849186188728949, + "eval_loss": 0.060836534947156906, + "eval_runtime": 160.2053, + "eval_samples_per_second": 7.372, + "eval_steps_per_second": 7.372, + "step": 4550 + }, + { + "epoch": 6.864239345187694, + "grad_norm": 0.20955052971839905, + "learning_rate": 2.702922470966187e-05, + "loss": 0.0025, + "step": 4560 + }, + { + "epoch": 6.8792925016464395, + "grad_norm": 0.014856183901429176, + "learning_rate": 2.6796073479044174e-05, + "loss": 0.0023, + "step": 4570 + }, + { + "epoch": 6.894345658105184, + "grad_norm": 0.18584220111370087, + "learning_rate": 2.6563563515659306e-05, + "loss": 0.0026, + "step": 4580 + }, + { + "epoch": 6.909398814563929, + "grad_norm": 0.09641899913549423, + "learning_rate": 2.6331701245187934e-05, + "loss": 0.0007, + "step": 4590 + }, + { + "epoch": 6.924451971022674, + "grad_norm": 0.020394498482346535, + "learning_rate": 2.6100493075410848e-05, + "loss": 0.0046, + "step": 4600 + }, + { + "epoch": 6.924451971022674, + "eval_loss": 0.06560589373111725, + "eval_runtime": 160.2173, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 4600 + }, + { + "epoch": 6.939505127481419, + "grad_norm": 0.571601927280426, + "learning_rate": 2.586994539603217e-05, + "loss": 0.0031, + "step": 4610 + }, + { + "epoch": 6.954558283940163, + "grad_norm": 0.8144958019256592, + "learning_rate": 2.5640064578502497e-05, + "loss": 0.0017, + "step": 4620 + }, + { + "epoch": 6.969611440398909, + "grad_norm": 0.015226438641548157, + "learning_rate": 2.5410856975842996e-05, + "loss": 0.0029, + "step": 4630 + }, + { + "epoch": 6.9846645968576535, + "grad_norm": 0.02755417861044407, + "learning_rate": 2.5182328922469723e-05, + "loss": 0.0009, + "step": 4640 + }, + { + "epoch": 6.999717753316398, + "grad_norm": 0.0061939009465277195, + "learning_rate": 2.4954486734018618e-05, + "loss": 0.0021, + "step": 4650 + }, + { + "epoch": 6.999717753316398, + "eval_loss": 0.06949563324451447, + "eval_runtime": 160.157, + "eval_samples_per_second": 7.374, + "eval_steps_per_second": 7.374, + "step": 4650 + }, + { + "epoch": 7.014770909775144, + "grad_norm": 0.06838443875312805, + "learning_rate": 2.4727336707170973e-05, + "loss": 0.0025, + "step": 4660 + }, + { + "epoch": 7.029824066233888, + "grad_norm": 0.13910618424415588, + "learning_rate": 2.450088511947936e-05, + "loss": 0.0032, + "step": 4670 + }, + { + "epoch": 7.044877222692634, + "grad_norm": 0.012873644009232521, + "learning_rate": 2.427513822919424e-05, + "loss": 0.0006, + "step": 4680 + }, + { + "epoch": 7.059930379151378, + "grad_norm": 0.33798831701278687, + "learning_rate": 2.4050102275090898e-05, + "loss": 0.0015, + "step": 4690 + }, + { + "epoch": 7.074983535610123, + "grad_norm": 0.0062183113768696785, + "learning_rate": 2.3825783476297087e-05, + "loss": 0.0008, + "step": 4700 + }, + { + "epoch": 7.074983535610123, + "eval_loss": 0.06688717007637024, + "eval_runtime": 160.2297, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 4700 + }, + { + "epoch": 7.090036692068868, + "grad_norm": 0.00765349343419075, + "learning_rate": 2.3602188032121163e-05, + "loss": 0.0005, + "step": 4710 + }, + { + "epoch": 7.105089848527613, + "grad_norm": 0.02176281437277794, + "learning_rate": 2.337932212188073e-05, + "loss": 0.0009, + "step": 4720 + }, + { + "epoch": 7.120143004986358, + "grad_norm": 0.11797157675027847, + "learning_rate": 2.3157191904731874e-05, + "loss": 0.0019, + "step": 4730 + }, + { + "epoch": 7.135196161445103, + "grad_norm": 0.006601026281714439, + "learning_rate": 2.2935803519499e-05, + "loss": 0.0005, + "step": 4740 + }, + { + "epoch": 7.150249317903848, + "grad_norm": 0.024187704548239708, + "learning_rate": 2.271516308450511e-05, + "loss": 0.0001, + "step": 4750 + }, + { + "epoch": 7.150249317903848, + "eval_loss": 0.07288970053195953, + "eval_runtime": 160.2559, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 4750 + }, + { + "epoch": 7.165302474362593, + "grad_norm": 0.005969291087239981, + "learning_rate": 2.2495276697402662e-05, + "loss": 0.0003, + "step": 4760 + }, + { + "epoch": 7.180355630821338, + "grad_norm": 0.010023161768913269, + "learning_rate": 2.227615043500527e-05, + "loss": 0.0009, + "step": 4770 + }, + { + "epoch": 7.1954087872800825, + "grad_norm": 0.23527194559574127, + "learning_rate": 2.2057790353119535e-05, + "loss": 0.0039, + "step": 4780 + }, + { + "epoch": 7.210461943738828, + "grad_norm": 0.01431242935359478, + "learning_rate": 2.1840202486377797e-05, + "loss": 0.0006, + "step": 4790 + }, + { + "epoch": 7.225515100197573, + "grad_norm": 0.32648417353630066, + "learning_rate": 2.1623392848071354e-05, + "loss": 0.0016, + "step": 4800 + }, + { + "epoch": 7.225515100197573, + "eval_loss": 0.07545964419841766, + "eval_runtime": 160.2328, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 4800 + }, + { + "epoch": 7.240568256656317, + "grad_norm": 0.010650348849594593, + "learning_rate": 2.1407367429984242e-05, + "loss": 0.0013, + "step": 4810 + }, + { + "epoch": 7.255621413115063, + "grad_norm": 0.0733296275138855, + "learning_rate": 2.1192132202227677e-05, + "loss": 0.0002, + "step": 4820 + }, + { + "epoch": 7.270674569573807, + "grad_norm": 0.0039123608730733395, + "learning_rate": 2.0977693113075085e-05, + "loss": 0.0012, + "step": 4830 + }, + { + "epoch": 7.285727726032553, + "grad_norm": 0.0044445618987083435, + "learning_rate": 2.0764056088797645e-05, + "loss": 0.0001, + "step": 4840 + }, + { + "epoch": 7.300780882491297, + "grad_norm": 0.02226909063756466, + "learning_rate": 2.055122703350057e-05, + "loss": 0.0011, + "step": 4850 + }, + { + "epoch": 7.300780882491297, + "eval_loss": 0.07717882096767426, + "eval_runtime": 160.1835, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 7.373, + "step": 4850 + }, + { + "epoch": 7.315834038950042, + "grad_norm": 0.005666263867169619, + "learning_rate": 2.0339211828959904e-05, + "loss": 0.0008, + "step": 4860 + }, + { + "epoch": 7.3308871954087875, + "grad_norm": 0.028113720938563347, + "learning_rate": 2.0128016334459997e-05, + "loss": 0.0025, + "step": 4870 + }, + { + "epoch": 7.345940351867532, + "grad_norm": 0.0033202129416167736, + "learning_rate": 1.9917646386631577e-05, + "loss": 0.0013, + "step": 4880 + }, + { + "epoch": 7.360993508326278, + "grad_norm": 0.03852800279855728, + "learning_rate": 1.970810779929041e-05, + "loss": 0.0006, + "step": 4890 + }, + { + "epoch": 7.376046664785022, + "grad_norm": 0.012712639756500721, + "learning_rate": 1.949940636327671e-05, + "loss": 0.0037, + "step": 4900 + }, + { + "epoch": 7.376046664785022, + "eval_loss": 0.07560543715953827, + "eval_runtime": 160.2227, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 4900 + }, + { + "epoch": 7.391099821243767, + "grad_norm": 0.07468613237142563, + "learning_rate": 1.9291547846295004e-05, + "loss": 0.0003, + "step": 4910 + }, + { + "epoch": 7.406152977702512, + "grad_norm": 0.2812354564666748, + "learning_rate": 1.9084537992754792e-05, + "loss": 0.0034, + "step": 4920 + }, + { + "epoch": 7.421206134161257, + "grad_norm": 0.0016331238439306617, + "learning_rate": 1.8878382523611786e-05, + "loss": 0.0007, + "step": 4930 + }, + { + "epoch": 7.436259290620002, + "grad_norm": 0.006154115777462721, + "learning_rate": 1.8673087136209803e-05, + "loss": 0.0008, + "step": 4940 + }, + { + "epoch": 7.451312447078747, + "grad_norm": 0.0049999612383544445, + "learning_rate": 1.8468657504123287e-05, + "loss": 0.0002, + "step": 4950 + }, + { + "epoch": 7.451312447078747, + "eval_loss": 0.07704046368598938, + "eval_runtime": 160.2983, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 4950 + }, + { + "epoch": 7.466365603537492, + "grad_norm": 0.011083276011049747, + "learning_rate": 1.8265099277000614e-05, + "loss": 0.0036, + "step": 4960 + }, + { + "epoch": 7.481418759996236, + "grad_norm": 0.009901138022542, + "learning_rate": 1.806241808040776e-05, + "loss": 0.0021, + "step": 4970 + }, + { + "epoch": 7.496471916454982, + "grad_norm": 0.2185642123222351, + "learning_rate": 1.7860619515673033e-05, + "loss": 0.0013, + "step": 4980 + }, + { + "epoch": 7.511525072913726, + "grad_norm": 0.0034970121923834085, + "learning_rate": 1.76597091597322e-05, + "loss": 0.0001, + "step": 4990 + }, + { + "epoch": 7.526578229372472, + "grad_norm": 0.03339719772338867, + "learning_rate": 1.7459692564974316e-05, + "loss": 0.0013, + "step": 5000 + }, + { + "epoch": 7.526578229372472, + "eval_loss": 0.07092679291963577, + "eval_runtime": 160.3188, + "eval_samples_per_second": 7.367, + "eval_steps_per_second": 7.367, + "step": 5000 + }, + { + "epoch": 7.5416313858312165, + "grad_norm": 0.02087845653295517, + "learning_rate": 1.7260575259088317e-05, + "loss": 0.0003, + "step": 5010 + }, + { + "epoch": 7.556684542289961, + "grad_norm": 0.007836734876036644, + "learning_rate": 1.7062362744910322e-05, + "loss": 0.0006, + "step": 5020 + }, + { + "epoch": 7.571737698748707, + "grad_norm": 0.012868853285908699, + "learning_rate": 1.6865060500271383e-05, + "loss": 0.0003, + "step": 5030 + }, + { + "epoch": 7.586790855207451, + "grad_norm": 0.002863493515178561, + "learning_rate": 1.6668673977846254e-05, + "loss": 0.0012, + "step": 5040 + }, + { + "epoch": 7.601844011666197, + "grad_norm": 0.010151112452149391, + "learning_rate": 1.6473208605002704e-05, + "loss": 0.0016, + "step": 5050 + }, + { + "epoch": 7.601844011666197, + "eval_loss": 0.0706048458814621, + "eval_runtime": 160.1944, + "eval_samples_per_second": 7.372, + "eval_steps_per_second": 7.372, + "step": 5050 + }, + { + "epoch": 7.616897168124941, + "grad_norm": 0.2096201479434967, + "learning_rate": 1.6278669783651395e-05, + "loss": 0.0016, + "step": 5060 + }, + { + "epoch": 7.631950324583686, + "grad_norm": 0.018149644136428833, + "learning_rate": 1.6085062890096708e-05, + "loss": 0.001, + "step": 5070 + }, + { + "epoch": 7.647003481042431, + "grad_norm": 0.047165073454380035, + "learning_rate": 1.589239327488812e-05, + "loss": 0.001, + "step": 5080 + }, + { + "epoch": 7.662056637501176, + "grad_norm": 0.016901487484574318, + "learning_rate": 1.5700666262672324e-05, + "loss": 0.0006, + "step": 5090 + }, + { + "epoch": 7.677109793959921, + "grad_norm": 0.00693804444745183, + "learning_rate": 1.5509887152046137e-05, + "loss": 0.0003, + "step": 5100 + }, + { + "epoch": 7.677109793959921, + "eval_loss": 0.07184600830078125, + "eval_runtime": 160.248, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 7.37, + "step": 5100 + }, + { + "epoch": 7.692162950418666, + "grad_norm": 0.011177941225469112, + "learning_rate": 1.5320061215409958e-05, + "loss": 0.0014, + "step": 5110 + }, + { + "epoch": 7.707216106877411, + "grad_norm": 0.1834590882062912, + "learning_rate": 1.5131193698822232e-05, + "loss": 0.0016, + "step": 5120 + }, + { + "epoch": 7.722269263336155, + "grad_norm": 0.6408646702766418, + "learning_rate": 1.4943289821854212e-05, + "loss": 0.0028, + "step": 5130 + }, + { + "epoch": 7.737322419794901, + "grad_norm": 0.06593596935272217, + "learning_rate": 1.4756354777446001e-05, + "loss": 0.0009, + "step": 5140 + }, + { + "epoch": 7.7523755762536455, + "grad_norm": 0.049006663262844086, + "learning_rate": 1.4570393731762821e-05, + "loss": 0.0024, + "step": 5150 + }, + { + "epoch": 7.7523755762536455, + "eval_loss": 0.07241707295179367, + "eval_runtime": 160.2714, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 5150 + }, + { + "epoch": 7.767428732712391, + "grad_norm": 0.0026186048053205013, + "learning_rate": 1.4385411824052342e-05, + "loss": 0.0011, + "step": 5160 + }, + { + "epoch": 7.782481889171136, + "grad_norm": 0.12721039354801178, + "learning_rate": 1.4201414166502597e-05, + "loss": 0.0014, + "step": 5170 + }, + { + "epoch": 7.79753504562988, + "grad_norm": 0.0161359254270792, + "learning_rate": 1.4018405844100812e-05, + "loss": 0.0003, + "step": 5180 + }, + { + "epoch": 7.812588202088626, + "grad_norm": 0.002176119713112712, + "learning_rate": 1.3836391914492697e-05, + "loss": 0.0002, + "step": 5190 + }, + { + "epoch": 7.82764135854737, + "grad_norm": 0.013269235379993916, + "learning_rate": 1.3655377407842812e-05, + "loss": 0.0024, + "step": 5200 + }, + { + "epoch": 7.82764135854737, + "eval_loss": 0.07497136294841766, + "eval_runtime": 160.2476, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 7.37, + "step": 5200 + }, + { + "epoch": 7.842694515006116, + "grad_norm": 0.00417544599622488, + "learning_rate": 1.3475367326695559e-05, + "loss": 0.0005, + "step": 5210 + }, + { + "epoch": 7.85774767146486, + "grad_norm": 0.004082761239260435, + "learning_rate": 1.3296366645836822e-05, + "loss": 0.0003, + "step": 5220 + }, + { + "epoch": 7.872800827923605, + "grad_norm": 0.008688686415553093, + "learning_rate": 1.311838031215657e-05, + "loss": 0.0003, + "step": 5230 + }, + { + "epoch": 7.8878539843823505, + "grad_norm": 0.019071508198976517, + "learning_rate": 1.2941413244512113e-05, + "loss": 0.0031, + "step": 5240 + }, + { + "epoch": 7.902907140841095, + "grad_norm": 0.07568395137786865, + "learning_rate": 1.2765470333592178e-05, + "loss": 0.0003, + "step": 5250 + }, + { + "epoch": 7.902907140841095, + "eval_loss": 0.07531905174255371, + "eval_runtime": 160.265, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 5250 + }, + { + "epoch": 7.91796029729984, + "grad_norm": 0.0057370299473404884, + "learning_rate": 1.2590556441781725e-05, + "loss": 0.0011, + "step": 5260 + }, + { + "epoch": 7.933013453758585, + "grad_norm": 0.0013195904903113842, + "learning_rate": 1.2416676403027621e-05, + "loss": 0.0007, + "step": 5270 + }, + { + "epoch": 7.94806661021733, + "grad_norm": 0.004281977657228708, + "learning_rate": 1.2243835022705003e-05, + "loss": 0.0006, + "step": 5280 + }, + { + "epoch": 7.9631197666760745, + "grad_norm": 0.01802857592701912, + "learning_rate": 1.2072037077484416e-05, + "loss": 0.0004, + "step": 5290 + }, + { + "epoch": 7.97817292313482, + "grad_norm": 0.020572487264871597, + "learning_rate": 1.1901287315199977e-05, + "loss": 0.0008, + "step": 5300 + }, + { + "epoch": 7.97817292313482, + "eval_loss": 0.07622922956943512, + "eval_runtime": 160.3449, + "eval_samples_per_second": 7.365, + "eval_steps_per_second": 7.365, + "step": 5300 + }, + { + "epoch": 7.993226079593565, + "grad_norm": 0.0023284931667149067, + "learning_rate": 1.173159045471801e-05, + "loss": 0.0014, + "step": 5310 + }, + { + "epoch": 8.00827923605231, + "grad_norm": 0.004363782238215208, + "learning_rate": 1.1562951185806676e-05, + "loss": 0.0004, + "step": 5320 + }, + { + "epoch": 8.023332392511055, + "grad_norm": 0.010162640362977982, + "learning_rate": 1.1395374169006407e-05, + "loss": 0.0002, + "step": 5330 + }, + { + "epoch": 8.0383855489698, + "grad_norm": 0.007003723178058863, + "learning_rate": 1.1228864035501069e-05, + "loss": 0.0005, + "step": 5340 + }, + { + "epoch": 8.053438705428544, + "grad_norm": 0.04383144527673721, + "learning_rate": 1.1063425386989912e-05, + "loss": 0.0002, + "step": 5350 + }, + { + "epoch": 8.053438705428544, + "eval_loss": 0.07978023588657379, + "eval_runtime": 160.3568, + "eval_samples_per_second": 7.365, + "eval_steps_per_second": 7.365, + "step": 5350 + }, + { + "epoch": 8.06849186188729, + "grad_norm": 0.029918311163783073, + "learning_rate": 1.0899062795560573e-05, + "loss": 0.0008, + "step": 5360 + }, + { + "epoch": 8.083545018346035, + "grad_norm": 0.005771939642727375, + "learning_rate": 1.0735780803562539e-05, + "loss": 0.0002, + "step": 5370 + }, + { + "epoch": 8.09859817480478, + "grad_norm": 0.00218608183786273, + "learning_rate": 1.0573583923481711e-05, + "loss": 0.0005, + "step": 5380 + }, + { + "epoch": 8.113651331263524, + "grad_norm": 0.0029254136607050896, + "learning_rate": 1.0412476637815665e-05, + "loss": 0.0001, + "step": 5390 + }, + { + "epoch": 8.128704487722269, + "grad_norm": 0.2637246251106262, + "learning_rate": 1.0252463398949792e-05, + "loss": 0.0014, + "step": 5400 + }, + { + "epoch": 8.128704487722269, + "eval_loss": 0.08101807534694672, + "eval_runtime": 160.3113, + "eval_samples_per_second": 7.367, + "eval_steps_per_second": 7.367, + "step": 5400 + }, + { + "epoch": 8.143757644181013, + "grad_norm": 0.007723491173237562, + "learning_rate": 1.0093548629034216e-05, + "loss": 0.0012, + "step": 5410 + }, + { + "epoch": 8.15881080063976, + "grad_norm": 0.004306530114263296, + "learning_rate": 9.935736719861622e-06, + "loss": 0.0006, + "step": 5420 + }, + { + "epoch": 8.173863957098504, + "grad_norm": 0.08289399743080139, + "learning_rate": 9.779032032745889e-06, + "loss": 0.0004, + "step": 5430 + }, + { + "epoch": 8.188917113557249, + "grad_norm": 0.003008614992722869, + "learning_rate": 9.62343889840151e-06, + "loss": 0.0004, + "step": 5440 + }, + { + "epoch": 8.203970270015994, + "grad_norm": 0.0040671988390386105, + "learning_rate": 9.468961616823941e-06, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 8.203970270015994, + "eval_loss": 0.08068478107452393, + "eval_runtime": 160.2828, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 5450 + }, + { + "epoch": 8.219023426474738, + "grad_norm": 0.004215168301016092, + "learning_rate": 9.315604457170768e-06, + "loss": 0.0004, + "step": 5460 + }, + { + "epoch": 8.234076582933485, + "grad_norm": 0.007035973947495222, + "learning_rate": 9.163371657643716e-06, + "loss": 0.0002, + "step": 5470 + }, + { + "epoch": 8.24912973939223, + "grad_norm": 0.00959786120802164, + "learning_rate": 9.012267425371513e-06, + "loss": 0.0001, + "step": 5480 + }, + { + "epoch": 8.264182895850974, + "grad_norm": 0.000956021947786212, + "learning_rate": 8.862295936293658e-06, + "loss": 0.0002, + "step": 5490 + }, + { + "epoch": 8.279236052309718, + "grad_norm": 0.004889221396297216, + "learning_rate": 8.71346133504498e-06, + "loss": 0.0022, + "step": 5500 + }, + { + "epoch": 8.279236052309718, + "eval_loss": 0.08257721364498138, + "eval_runtime": 160.2572, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 5500 + }, + { + "epoch": 8.294289208768463, + "grad_norm": 0.005491503980010748, + "learning_rate": 8.565767734841057e-06, + "loss": 0.001, + "step": 5510 + }, + { + "epoch": 8.30934236522721, + "grad_norm": 0.00465207127854228, + "learning_rate": 8.419219217364654e-06, + "loss": 0.0002, + "step": 5520 + }, + { + "epoch": 8.324395521685954, + "grad_norm": 0.008887565694749355, + "learning_rate": 8.273819832652824e-06, + "loss": 0.0004, + "step": 5530 + }, + { + "epoch": 8.339448678144699, + "grad_norm": 0.03552303835749626, + "learning_rate": 8.129573598984997e-06, + "loss": 0.0002, + "step": 5540 + }, + { + "epoch": 8.354501834603443, + "grad_norm": 0.009291916154325008, + "learning_rate": 7.986484502772013e-06, + "loss": 0.0011, + "step": 5550 + }, + { + "epoch": 8.354501834603443, + "eval_loss": 0.08401281386613846, + "eval_runtime": 160.2178, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 5550 + }, + { + "epoch": 8.369554991062188, + "grad_norm": 0.006055537611246109, + "learning_rate": 7.844556498445788e-06, + "loss": 0.0002, + "step": 5560 + }, + { + "epoch": 8.384608147520932, + "grad_norm": 0.004059657454490662, + "learning_rate": 7.703793508350188e-06, + "loss": 0.0001, + "step": 5570 + }, + { + "epoch": 8.399661303979679, + "grad_norm": 0.002961810678243637, + "learning_rate": 7.564199422632579e-06, + "loss": 0.0011, + "step": 5580 + }, + { + "epoch": 8.414714460438423, + "grad_norm": 0.0019825524650514126, + "learning_rate": 7.425778099136271e-06, + "loss": 0.0005, + "step": 5590 + }, + { + "epoch": 8.429767616897168, + "grad_norm": 0.010841657407581806, + "learning_rate": 7.288533363293959e-06, + "loss": 0.0001, + "step": 5600 + }, + { + "epoch": 8.429767616897168, + "eval_loss": 0.0857808068394661, + "eval_runtime": 160.2918, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 5600 + }, + { + "epoch": 8.444820773355913, + "grad_norm": 0.005581530276685953, + "learning_rate": 7.152469008021984e-06, + "loss": 0.0001, + "step": 5610 + }, + { + "epoch": 8.459873929814657, + "grad_norm": 0.0017037424258887768, + "learning_rate": 7.017588793615498e-06, + "loss": 0.0015, + "step": 5620 + }, + { + "epoch": 8.474927086273404, + "grad_norm": 0.005070498678833246, + "learning_rate": 6.8838964476445554e-06, + "loss": 0.0002, + "step": 5630 + }, + { + "epoch": 8.489980242732148, + "grad_norm": 0.00680957967415452, + "learning_rate": 6.751395664851135e-06, + "loss": 0.0004, + "step": 5640 + }, + { + "epoch": 8.505033399190893, + "grad_norm": 0.19991876184940338, + "learning_rate": 6.62009010704695e-06, + "loss": 0.0012, + "step": 5650 + }, + { + "epoch": 8.505033399190893, + "eval_loss": 0.08782170712947845, + "eval_runtime": 160.2761, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 5650 + }, + { + "epoch": 8.520086555649637, + "grad_norm": 0.0022044023498892784, + "learning_rate": 6.489983403012312e-06, + "loss": 0.0002, + "step": 5660 + }, + { + "epoch": 8.535139712108382, + "grad_norm": 0.6040512919425964, + "learning_rate": 6.361079148395837e-06, + "loss": 0.003, + "step": 5670 + }, + { + "epoch": 8.550192868567128, + "grad_norm": 0.0023957875091582537, + "learning_rate": 6.233380905615049e-06, + "loss": 0.0002, + "step": 5680 + }, + { + "epoch": 8.565246025025873, + "grad_norm": 0.0027232368011027575, + "learning_rate": 6.106892203757953e-06, + "loss": 0.0003, + "step": 5690 + }, + { + "epoch": 8.580299181484618, + "grad_norm": 0.20856763422489166, + "learning_rate": 5.981616538485496e-06, + "loss": 0.0006, + "step": 5700 + }, + { + "epoch": 8.580299181484618, + "eval_loss": 0.08700068295001984, + "eval_runtime": 160.236, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 7.37, + "step": 5700 + }, + { + "epoch": 8.595352337943362, + "grad_norm": 0.007333498448133469, + "learning_rate": 5.857557371934991e-06, + "loss": 0.0003, + "step": 5710 + }, + { + "epoch": 8.610405494402107, + "grad_norm": 1.0139929056167603, + "learning_rate": 5.73471813262435e-06, + "loss": 0.0009, + "step": 5720 + }, + { + "epoch": 8.625458650860853, + "grad_norm": 0.022938523441553116, + "learning_rate": 5.613102215357424e-06, + "loss": 0.0003, + "step": 5730 + }, + { + "epoch": 8.640511807319598, + "grad_norm": 0.15222594141960144, + "learning_rate": 5.4927129811301715e-06, + "loss": 0.0014, + "step": 5740 + }, + { + "epoch": 8.655564963778343, + "grad_norm": 0.028255749493837357, + "learning_rate": 5.37355375703772e-06, + "loss": 0.0005, + "step": 5750 + }, + { + "epoch": 8.655564963778343, + "eval_loss": 0.08826266974210739, + "eval_runtime": 160.185, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 7.373, + "step": 5750 + }, + { + "epoch": 8.670618120237087, + "grad_norm": 0.003581064287573099, + "learning_rate": 5.255627836182453e-06, + "loss": 0.0011, + "step": 5760 + }, + { + "epoch": 8.685671276695832, + "grad_norm": 0.029965421184897423, + "learning_rate": 5.138938477583016e-06, + "loss": 0.0006, + "step": 5770 + }, + { + "epoch": 8.700724433154576, + "grad_norm": 0.000929945323150605, + "learning_rate": 5.0234889060842176e-06, + "loss": 0.0015, + "step": 5780 + }, + { + "epoch": 8.715777589613323, + "grad_norm": 0.011362021788954735, + "learning_rate": 4.909282312267916e-06, + "loss": 0.0003, + "step": 5790 + }, + { + "epoch": 8.730830746072067, + "grad_norm": 0.0025621161330491304, + "learning_rate": 4.796321852364877e-06, + "loss": 0.0016, + "step": 5800 + }, + { + "epoch": 8.730830746072067, + "eval_loss": 0.08825913816690445, + "eval_runtime": 160.198, + "eval_samples_per_second": 7.372, + "eval_steps_per_second": 7.372, + "step": 5800 + }, + { + "epoch": 8.745883902530812, + "grad_norm": 0.00716409832239151, + "learning_rate": 4.684610648167503e-06, + "loss": 0.0001, + "step": 5810 + }, + { + "epoch": 8.760937058989557, + "grad_norm": 0.08832676708698273, + "learning_rate": 4.5741517869435706e-06, + "loss": 0.0002, + "step": 5820 + }, + { + "epoch": 8.775990215448301, + "grad_norm": 0.0006533529958687723, + "learning_rate": 4.464948321350925e-06, + "loss": 0.0, + "step": 5830 + }, + { + "epoch": 8.791043371907048, + "grad_norm": 0.003266278887167573, + "learning_rate": 4.357003269353105e-06, + "loss": 0.0001, + "step": 5840 + }, + { + "epoch": 8.806096528365792, + "grad_norm": 0.0014703174820169806, + "learning_rate": 4.2503196141359335e-06, + "loss": 0.0004, + "step": 5850 + }, + { + "epoch": 8.806096528365792, + "eval_loss": 0.08919917047023773, + "eval_runtime": 160.2623, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 7.369, + "step": 5850 + }, + { + "epoch": 8.821149684824537, + "grad_norm": 0.0031188959255814552, + "learning_rate": 4.144900304025101e-06, + "loss": 0.0001, + "step": 5860 + }, + { + "epoch": 8.836202841283281, + "grad_norm": 0.007266889326274395, + "learning_rate": 4.0407482524046524e-06, + "loss": 0.0004, + "step": 5870 + }, + { + "epoch": 8.851255997742026, + "grad_norm": 0.2118658423423767, + "learning_rate": 3.937866337636459e-06, + "loss": 0.0027, + "step": 5880 + }, + { + "epoch": 8.86630915420077, + "grad_norm": 0.0021141062024980783, + "learning_rate": 3.8362574029807475e-06, + "loss": 0.0001, + "step": 5890 + }, + { + "epoch": 8.881362310659517, + "grad_norm": 0.0021142575424164534, + "learning_rate": 3.7359242565174423e-06, + "loss": 0.0003, + "step": 5900 + }, + { + "epoch": 8.881362310659517, + "eval_loss": 0.08780890703201294, + "eval_runtime": 160.1793, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 7.373, + "step": 5900 + }, + { + "epoch": 8.896415467118262, + "grad_norm": 0.004410842899233103, + "learning_rate": 3.6368696710685877e-06, + "loss": 0.0019, + "step": 5910 + }, + { + "epoch": 8.911468623577006, + "grad_norm": 0.0010047026444226503, + "learning_rate": 3.539096384121743e-06, + "loss": 0.0001, + "step": 5920 + }, + { + "epoch": 8.92652178003575, + "grad_norm": 0.0010828308295458555, + "learning_rate": 3.4426070977542914e-06, + "loss": 0.0005, + "step": 5930 + }, + { + "epoch": 8.941574936494495, + "grad_norm": 0.003045717952772975, + "learning_rate": 3.34740447855878e-06, + "loss": 0.0002, + "step": 5940 + }, + { + "epoch": 8.956628092953242, + "grad_norm": 0.0016286061145365238, + "learning_rate": 3.253491157569244e-06, + "loss": 0.0009, + "step": 5950 + }, + { + "epoch": 8.956628092953242, + "eval_loss": 0.0877390056848526, + "eval_runtime": 160.2243, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 5950 + }, + { + "epoch": 8.971681249411986, + "grad_norm": 0.0003505848872009665, + "learning_rate": 3.160869730188465e-06, + "loss": 0.0006, + "step": 5960 + }, + { + "epoch": 8.986734405870731, + "grad_norm": 0.0039598834700882435, + "learning_rate": 3.069542756116256e-06, + "loss": 0.0001, + "step": 5970 + }, + { + "epoch": 9.001787562329476, + "grad_norm": 0.0012256280751898885, + "learning_rate": 2.9795127592787186e-06, + "loss": 0.0, + "step": 5980 + }, + { + "epoch": 9.01684071878822, + "grad_norm": 0.13050001859664917, + "learning_rate": 2.890782227758515e-06, + "loss": 0.0003, + "step": 5990 + }, + { + "epoch": 9.031893875246967, + "grad_norm": 0.07357563078403473, + "learning_rate": 2.803353613726056e-06, + "loss": 0.0007, + "step": 6000 + }, + { + "epoch": 9.031893875246967, + "eval_loss": 0.0878191664814949, + "eval_runtime": 160.2188, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 6000 + }, + { + "epoch": 9.046947031705711, + "grad_norm": 0.09042318910360336, + "learning_rate": 2.7172293333717848e-06, + "loss": 0.0002, + "step": 6010 + }, + { + "epoch": 9.062000188164456, + "grad_norm": 0.0021275978069752455, + "learning_rate": 2.6324117668393877e-06, + "loss": 0.0004, + "step": 6020 + }, + { + "epoch": 9.0770533446232, + "grad_norm": 0.005650477483868599, + "learning_rate": 2.5489032581600016e-06, + "loss": 0.0003, + "step": 6030 + }, + { + "epoch": 9.092106501081945, + "grad_norm": 0.0027991284150630236, + "learning_rate": 2.466706115187406e-06, + "loss": 0.0001, + "step": 6040 + }, + { + "epoch": 9.107159657540691, + "grad_norm": 0.004402808845043182, + "learning_rate": 2.385822609534344e-06, + "loss": 0.0001, + "step": 6050 + }, + { + "epoch": 9.107159657540691, + "eval_loss": 0.08879312872886658, + "eval_runtime": 160.2215, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 6050 + }, + { + "epoch": 9.122212813999436, + "grad_norm": 0.0011018068762496114, + "learning_rate": 2.3062549765096364e-06, + "loss": 0.0, + "step": 6060 + }, + { + "epoch": 9.13726597045818, + "grad_norm": 0.1328442096710205, + "learning_rate": 2.22800541505645e-06, + "loss": 0.0004, + "step": 6070 + }, + { + "epoch": 9.152319126916925, + "grad_norm": 0.00035094571649096906, + "learning_rate": 2.1510760876915505e-06, + "loss": 0.0001, + "step": 6080 + }, + { + "epoch": 9.16737228337567, + "grad_norm": 0.004552965052425861, + "learning_rate": 2.0754691204454835e-06, + "loss": 0.0008, + "step": 6090 + }, + { + "epoch": 9.182425439834415, + "grad_norm": 0.01151742972433567, + "learning_rate": 2.0011866028038617e-06, + "loss": 0.0013, + "step": 6100 + }, + { + "epoch": 9.182425439834415, + "eval_loss": 0.08881905674934387, + "eval_runtime": 160.2237, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 6100 + }, + { + "epoch": 9.197478596293161, + "grad_norm": 0.006344900466501713, + "learning_rate": 1.928230587649621e-06, + "loss": 0.0005, + "step": 6110 + }, + { + "epoch": 9.212531752751906, + "grad_norm": 0.008713200688362122, + "learning_rate": 1.8566030912062549e-06, + "loss": 0.0001, + "step": 6120 + }, + { + "epoch": 9.22758490921065, + "grad_norm": 0.0007548349676653743, + "learning_rate": 1.7863060929821208e-06, + "loss": 0.0005, + "step": 6130 + }, + { + "epoch": 9.242638065669395, + "grad_norm": 0.005701607558876276, + "learning_rate": 1.717341535715733e-06, + "loss": 0.0001, + "step": 6140 + }, + { + "epoch": 9.25769122212814, + "grad_norm": 0.017252963036298752, + "learning_rate": 1.6497113253220398e-06, + "loss": 0.0006, + "step": 6150 + }, + { + "epoch": 9.25769122212814, + "eval_loss": 0.08880797773599625, + "eval_runtime": 160.2863, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 6150 + }, + { + "epoch": 9.272744378586886, + "grad_norm": 0.03114139288663864, + "learning_rate": 1.5834173308397982e-06, + "loss": 0.0008, + "step": 6160 + }, + { + "epoch": 9.28779753504563, + "grad_norm": 0.0019799680449068546, + "learning_rate": 1.5184613843799045e-06, + "loss": 0.0, + "step": 6170 + }, + { + "epoch": 9.302850691504375, + "grad_norm": 0.00201089377515018, + "learning_rate": 1.4548452810747403e-06, + "loss": 0.0, + "step": 6180 + }, + { + "epoch": 9.31790384796312, + "grad_norm": 0.004385457839816809, + "learning_rate": 1.3925707790285846e-06, + "loss": 0.0009, + "step": 6190 + }, + { + "epoch": 9.332957004421864, + "grad_norm": 0.0023628007620573044, + "learning_rate": 1.33163959926903e-06, + "loss": 0.0006, + "step": 6200 + }, + { + "epoch": 9.332957004421864, + "eval_loss": 0.08912111818790436, + "eval_runtime": 160.1885, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 7.373, + "step": 6200 + }, + { + "epoch": 9.348010160880609, + "grad_norm": 0.0051778145134449005, + "learning_rate": 1.2720534256993877e-06, + "loss": 0.0, + "step": 6210 + }, + { + "epoch": 9.363063317339355, + "grad_norm": 0.004946765024214983, + "learning_rate": 1.2138139050522023e-06, + "loss": 0.0003, + "step": 6220 + }, + { + "epoch": 9.3781164737981, + "grad_norm": 0.00025816867128014565, + "learning_rate": 1.156922646843689e-06, + "loss": 0.0, + "step": 6230 + }, + { + "epoch": 9.393169630256844, + "grad_norm": 0.141653910279274, + "learning_rate": 1.101381223329301e-06, + "loss": 0.0004, + "step": 6240 + }, + { + "epoch": 9.408222786715589, + "grad_norm": 0.005238122306764126, + "learning_rate": 1.0471911694602321e-06, + "loss": 0.0003, + "step": 6250 + }, + { + "epoch": 9.408222786715589, + "eval_loss": 0.08936414867639542, + "eval_runtime": 160.212, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 6250 + }, + { + "epoch": 9.423275943174334, + "grad_norm": 0.0011284928768873215, + "learning_rate": 9.943539828410342e-07, + "loss": 0.0001, + "step": 6260 + }, + { + "epoch": 9.43832909963308, + "grad_norm": 0.0021875326056033373, + "learning_rate": 9.428711236882104e-07, + "loss": 0.0003, + "step": 6270 + }, + { + "epoch": 9.453382256091825, + "grad_norm": 0.005923525895923376, + "learning_rate": 8.927440147898702e-07, + "loss": 0.0001, + "step": 6280 + }, + { + "epoch": 9.46843541255057, + "grad_norm": 0.0015720590017735958, + "learning_rate": 8.439740414663833e-07, + "loss": 0.0005, + "step": 6290 + }, + { + "epoch": 9.483488569009314, + "grad_norm": 0.001961724366992712, + "learning_rate": 7.96562551532154e-07, + "loss": 0.0, + "step": 6300 + }, + { + "epoch": 9.483488569009314, + "eval_loss": 0.08965995907783508, + "eval_runtime": 160.2006, + "eval_samples_per_second": 7.372, + "eval_steps_per_second": 7.372, + "step": 6300 + }, + { + "epoch": 9.498541725468058, + "grad_norm": 0.1116378903388977, + "learning_rate": 7.505108552582852e-07, + "loss": 0.0004, + "step": 6310 + }, + { + "epoch": 9.513594881926805, + "grad_norm": 0.013178065419197083, + "learning_rate": 7.05820225336451e-07, + "loss": 0.0002, + "step": 6320 + }, + { + "epoch": 9.52864803838555, + "grad_norm": 0.0026883843820542097, + "learning_rate": 6.624918968436811e-07, + "loss": 0.0003, + "step": 6330 + }, + { + "epoch": 9.543701194844294, + "grad_norm": 0.04128742590546608, + "learning_rate": 6.20527067208232e-07, + "loss": 0.0004, + "step": 6340 + }, + { + "epoch": 9.558754351303039, + "grad_norm": 0.008797229267656803, + "learning_rate": 5.79926896176497e-07, + "loss": 0.0001, + "step": 6350 + }, + { + "epoch": 9.558754351303039, + "eval_loss": 0.09018374979496002, + "eval_runtime": 160.1879, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 7.373, + "step": 6350 + }, + { + "epoch": 9.573807507761783, + "grad_norm": 0.029039381071925163, + "learning_rate": 5.406925057809653e-07, + "loss": 0.0006, + "step": 6360 + }, + { + "epoch": 9.58886066422053, + "grad_norm": 0.0030565778724849224, + "learning_rate": 5.028249803091966e-07, + "loss": 0.0008, + "step": 6370 + }, + { + "epoch": 9.603913820679274, + "grad_norm": 0.0006165514350868762, + "learning_rate": 4.6632536627386756e-07, + "loss": 0.0011, + "step": 6380 + }, + { + "epoch": 9.618966977138019, + "grad_norm": 0.0003506843058858067, + "learning_rate": 4.311946723838556e-07, + "loss": 0.0007, + "step": 6390 + }, + { + "epoch": 9.634020133596763, + "grad_norm": 0.01982015185058117, + "learning_rate": 3.974338695163393e-07, + "loss": 0.0004, + "step": 6400 + }, + { + "epoch": 9.634020133596763, + "eval_loss": 0.09014550596475601, + "eval_runtime": 160.2801, + "eval_samples_per_second": 7.368, + "eval_steps_per_second": 7.368, + "step": 6400 + }, + { + "epoch": 9.649073290055508, + "grad_norm": 0.0015386652667075396, + "learning_rate": 3.6504389068998665e-07, + "loss": 0.0004, + "step": 6410 + }, + { + "epoch": 9.664126446514253, + "grad_norm": 0.022146394476294518, + "learning_rate": 3.3402563103916984e-07, + "loss": 0.0001, + "step": 6420 + }, + { + "epoch": 9.679179602972999, + "grad_norm": 0.013505291193723679, + "learning_rate": 3.043799477892129e-07, + "loss": 0.0005, + "step": 6430 + }, + { + "epoch": 9.694232759431744, + "grad_norm": 0.024992825463414192, + "learning_rate": 2.7610766023271615e-07, + "loss": 0.0002, + "step": 6440 + }, + { + "epoch": 9.709285915890488, + "grad_norm": 0.11639195680618286, + "learning_rate": 2.49209549706908e-07, + "loss": 0.0008, + "step": 6450 + }, + { + "epoch": 9.709285915890488, + "eval_loss": 0.08995959162712097, + "eval_runtime": 160.2999, + "eval_samples_per_second": 7.367, + "eval_steps_per_second": 7.367, + "step": 6450 + }, + { + "epoch": 9.724339072349233, + "grad_norm": 0.00043936833390034735, + "learning_rate": 2.2368635957205618e-07, + "loss": 0.0003, + "step": 6460 + }, + { + "epoch": 9.739392228807978, + "grad_norm": 0.0009845566237345338, + "learning_rate": 1.9953879519090113e-07, + "loss": 0.0003, + "step": 6470 + }, + { + "epoch": 9.754445385266724, + "grad_norm": 0.00036533811362460256, + "learning_rate": 1.7676752390920482e-07, + "loss": 0.0004, + "step": 6480 + }, + { + "epoch": 9.769498541725468, + "grad_norm": 0.011934688314795494, + "learning_rate": 1.5537317503727111e-07, + "loss": 0.0001, + "step": 6490 + }, + { + "epoch": 9.784551698184213, + "grad_norm": 0.0017583414446562529, + "learning_rate": 1.3535633983257078e-07, + "loss": 0.0002, + "step": 6500 + }, + { + "epoch": 9.784551698184213, + "eval_loss": 0.09014325588941574, + "eval_runtime": 160.2498, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 7.37, + "step": 6500 + }, + { + "epoch": 9.799604854642958, + "grad_norm": 0.007195598445832729, + "learning_rate": 1.1671757148339901e-07, + "loss": 0.0, + "step": 6510 + }, + { + "epoch": 9.814658011101702, + "grad_norm": 0.09654410183429718, + "learning_rate": 9.945738509358205e-08, + "loss": 0.0002, + "step": 6520 + }, + { + "epoch": 9.829711167560447, + "grad_norm": 0.0014568920014426112, + "learning_rate": 8.357625766824973e-08, + "loss": 0.0, + "step": 6530 + }, + { + "epoch": 9.844764324019193, + "grad_norm": 0.0006679772632196546, + "learning_rate": 6.907462810065158e-08, + "loss": 0.0001, + "step": 6540 + }, + { + "epoch": 9.859817480477938, + "grad_norm": 0.1340363323688507, + "learning_rate": 5.59528971600165e-08, + "loss": 0.0003, + "step": 6550 + }, + { + "epoch": 9.859817480477938, + "eval_loss": 0.09023192524909973, + "eval_runtime": 160.2199, + "eval_samples_per_second": 7.371, + "eval_steps_per_second": 7.371, + "step": 6550 + }, + { + "epoch": 9.874870636936683, + "grad_norm": 0.002330332761630416, + "learning_rate": 4.4211427480500554e-08, + "loss": 0.0015, + "step": 6560 + }, + { + "epoch": 9.889923793395427, + "grad_norm": 0.2291540503501892, + "learning_rate": 3.385054355112827e-08, + "loss": 0.0006, + "step": 6570 + }, + { + "epoch": 9.904976949854172, + "grad_norm": 0.0013285197783261538, + "learning_rate": 2.4870531706872034e-08, + "loss": 0.0008, + "step": 6580 + }, + { + "epoch": 9.920030106312918, + "grad_norm": 0.0007674365187995136, + "learning_rate": 1.7271640120686272e-08, + "loss": 0.0004, + "step": 6590 + }, + { + "epoch": 9.935083262771663, + "grad_norm": 0.0018663301598280668, + "learning_rate": 1.105407879670728e-08, + "loss": 0.0001, + "step": 6600 + }, + { + "epoch": 9.935083262771663, + "eval_loss": 0.09038501977920532, + "eval_runtime": 160.1706, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 7.373, + "step": 6600 + }, + { + "epoch": 9.950136419230407, + "grad_norm": 0.0004858894972130656, + "learning_rate": 6.218019564391275e-09, + "loss": 0.0002, + "step": 6610 + }, + { + "epoch": 9.965189575689152, + "grad_norm": 0.022663401439785957, + "learning_rate": 2.763596073807051e-09, + "loss": 0.0005, + "step": 6620 + }, + { + "epoch": 9.980242732147897, + "grad_norm": 0.002418066142126918, + "learning_rate": 6.909037919333728e-10, + "loss": 0.001, + "step": 6630 + }, + { + "epoch": 9.995295888606643, + "grad_norm": 0.0342492014169693, + "learning_rate": 0.0, + "loss": 0.0, + "step": 6640 + }, + { + "epoch": 9.995295888606643, + "step": 6640, + "total_flos": 1.667956184857903e+18, + "train_loss": 0.0289062451225399, + "train_runtime": 73167.6228, + "train_samples_per_second": 1.453, + "train_steps_per_second": 0.091 + } + ], + "logging_steps": 10, + "max_steps": 6640, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.667956184857903e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}