{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9954430379746837, "eval_steps": 500, "global_step": 1972, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020253164556962026, "grad_norm": 2.8833762486177865, "learning_rate": 5e-06, "loss": 0.7568, "step": 10 }, { "epoch": 0.04050632911392405, "grad_norm": 2.390286912889809, "learning_rate": 5e-06, "loss": 0.6504, "step": 20 }, { "epoch": 0.060759493670886074, "grad_norm": 1.489395380260759, "learning_rate": 5e-06, "loss": 0.6287, "step": 30 }, { "epoch": 0.0810126582278481, "grad_norm": 1.589470284066133, "learning_rate": 5e-06, "loss": 0.6135, "step": 40 }, { "epoch": 0.10126582278481013, "grad_norm": 1.4199327904724377, "learning_rate": 5e-06, "loss": 0.6037, "step": 50 }, { "epoch": 0.12151898734177215, "grad_norm": 1.4578167281681993, "learning_rate": 5e-06, "loss": 0.5974, "step": 60 }, { "epoch": 0.14177215189873418, "grad_norm": 1.3411623258601368, "learning_rate": 5e-06, "loss": 0.5923, "step": 70 }, { "epoch": 0.1620253164556962, "grad_norm": 2.0256881877625204, "learning_rate": 5e-06, "loss": 0.5879, "step": 80 }, { "epoch": 0.18227848101265823, "grad_norm": 1.865143294695778, "learning_rate": 5e-06, "loss": 0.5869, "step": 90 }, { "epoch": 0.20253164556962025, "grad_norm": 2.850869507593689, "learning_rate": 5e-06, "loss": 0.5894, "step": 100 }, { "epoch": 0.22278481012658227, "grad_norm": 2.780764039633903, "learning_rate": 5e-06, "loss": 0.5825, "step": 110 }, { "epoch": 0.2430379746835443, "grad_norm": 2.24595769639212, "learning_rate": 5e-06, "loss": 0.5792, "step": 120 }, { "epoch": 0.26329113924050634, "grad_norm": 1.898945973266495, "learning_rate": 5e-06, "loss": 0.5765, "step": 130 }, { "epoch": 0.28354430379746837, "grad_norm": 2.0985064075463518, "learning_rate": 5e-06, "loss": 0.5657, "step": 140 }, { "epoch": 0.3037974683544304, "grad_norm": 2.1947025380420393, "learning_rate": 5e-06, "loss": 0.576, "step": 150 }, { "epoch": 0.3240506329113924, "grad_norm": 1.9010440206949761, "learning_rate": 5e-06, "loss": 0.5748, "step": 160 }, { "epoch": 0.34430379746835443, "grad_norm": 1.5587857210363367, "learning_rate": 5e-06, "loss": 0.5762, "step": 170 }, { "epoch": 0.36455696202531646, "grad_norm": 1.940350997849539, "learning_rate": 5e-06, "loss": 0.5726, "step": 180 }, { "epoch": 0.3848101265822785, "grad_norm": 1.653915136046625, "learning_rate": 5e-06, "loss": 0.5724, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 2.4614019893025274, "learning_rate": 5e-06, "loss": 0.567, "step": 200 }, { "epoch": 0.4253164556962025, "grad_norm": 1.8586854938543989, "learning_rate": 5e-06, "loss": 0.5628, "step": 210 }, { "epoch": 0.44556962025316454, "grad_norm": 1.4449652115361937, "learning_rate": 5e-06, "loss": 0.5679, "step": 220 }, { "epoch": 0.46582278481012657, "grad_norm": 1.3739469268351123, "learning_rate": 5e-06, "loss": 0.5676, "step": 230 }, { "epoch": 0.4860759493670886, "grad_norm": 1.165737020699501, "learning_rate": 5e-06, "loss": 0.5644, "step": 240 }, { "epoch": 0.5063291139240507, "grad_norm": 1.357956082726849, "learning_rate": 5e-06, "loss": 0.5686, "step": 250 }, { "epoch": 0.5265822784810127, "grad_norm": 1.4295099868510899, "learning_rate": 5e-06, "loss": 0.5554, "step": 260 }, { "epoch": 0.5468354430379747, "grad_norm": 1.4688068967033727, "learning_rate": 5e-06, "loss": 0.5643, "step": 270 }, { "epoch": 0.5670886075949367, "grad_norm": 1.3744420183733652, "learning_rate": 5e-06, "loss": 0.56, "step": 280 }, { "epoch": 0.5873417721518988, "grad_norm": 1.4423763395622504, "learning_rate": 5e-06, "loss": 0.56, "step": 290 }, { "epoch": 0.6075949367088608, "grad_norm": 1.4527708533649162, "learning_rate": 5e-06, "loss": 0.5596, "step": 300 }, { "epoch": 0.6278481012658228, "grad_norm": 1.4417603290241021, "learning_rate": 5e-06, "loss": 0.5655, "step": 310 }, { "epoch": 0.6481012658227848, "grad_norm": 1.6835798959760988, "learning_rate": 5e-06, "loss": 0.5611, "step": 320 }, { "epoch": 0.6683544303797468, "grad_norm": 1.3523279967838302, "learning_rate": 5e-06, "loss": 0.5607, "step": 330 }, { "epoch": 0.6886075949367089, "grad_norm": 1.2500179332984684, "learning_rate": 5e-06, "loss": 0.5597, "step": 340 }, { "epoch": 0.7088607594936709, "grad_norm": 1.2523150009107906, "learning_rate": 5e-06, "loss": 0.5614, "step": 350 }, { "epoch": 0.7291139240506329, "grad_norm": 1.0953008806392797, "learning_rate": 5e-06, "loss": 0.5638, "step": 360 }, { "epoch": 0.7493670886075949, "grad_norm": 1.6446500941969269, "learning_rate": 5e-06, "loss": 0.5566, "step": 370 }, { "epoch": 0.769620253164557, "grad_norm": 1.4079831931594988, "learning_rate": 5e-06, "loss": 0.5533, "step": 380 }, { "epoch": 0.789873417721519, "grad_norm": 1.1110346875099018, "learning_rate": 5e-06, "loss": 0.5559, "step": 390 }, { "epoch": 0.810126582278481, "grad_norm": 1.4268171466763837, "learning_rate": 5e-06, "loss": 0.5566, "step": 400 }, { "epoch": 0.830379746835443, "grad_norm": 1.2447639143712546, "learning_rate": 5e-06, "loss": 0.5514, "step": 410 }, { "epoch": 0.850632911392405, "grad_norm": 1.195155734961536, "learning_rate": 5e-06, "loss": 0.5587, "step": 420 }, { "epoch": 0.8708860759493671, "grad_norm": 1.1003141204515203, "learning_rate": 5e-06, "loss": 0.5521, "step": 430 }, { "epoch": 0.8911392405063291, "grad_norm": 1.3030562652810704, "learning_rate": 5e-06, "loss": 0.5573, "step": 440 }, { "epoch": 0.9113924050632911, "grad_norm": 1.2728739765574453, "learning_rate": 5e-06, "loss": 0.5488, "step": 450 }, { "epoch": 0.9316455696202531, "grad_norm": 1.1393805858707695, "learning_rate": 5e-06, "loss": 0.5532, "step": 460 }, { "epoch": 0.9518987341772152, "grad_norm": 1.252449871115967, "learning_rate": 5e-06, "loss": 0.5539, "step": 470 }, { "epoch": 0.9721518987341772, "grad_norm": 1.3675922433289114, "learning_rate": 5e-06, "loss": 0.5515, "step": 480 }, { "epoch": 0.9924050632911392, "grad_norm": 1.0986012485203254, "learning_rate": 5e-06, "loss": 0.548, "step": 490 }, { "epoch": 0.9984810126582279, "eval_loss": 0.13772521913051605, "eval_runtime": 252.9462, "eval_samples_per_second": 52.596, "eval_steps_per_second": 0.411, "step": 493 }, { "epoch": 1.0131645569620253, "grad_norm": 1.8664570600716093, "learning_rate": 5e-06, "loss": 0.5056, "step": 500 }, { "epoch": 1.0334177215189873, "grad_norm": 1.690550013794204, "learning_rate": 5e-06, "loss": 0.4717, "step": 510 }, { "epoch": 1.0536708860759494, "grad_norm": 1.7450606923145526, "learning_rate": 5e-06, "loss": 0.4683, "step": 520 }, { "epoch": 1.0739240506329113, "grad_norm": 1.4739241775856946, "learning_rate": 5e-06, "loss": 0.4682, "step": 530 }, { "epoch": 1.0941772151898734, "grad_norm": 2.196710788601052, "learning_rate": 5e-06, "loss": 0.467, "step": 540 }, { "epoch": 1.1144303797468353, "grad_norm": 1.4622782937118193, "learning_rate": 5e-06, "loss": 0.4664, "step": 550 }, { "epoch": 1.1346835443037975, "grad_norm": 1.204681750576354, "learning_rate": 5e-06, "loss": 0.4629, "step": 560 }, { "epoch": 1.1549367088607596, "grad_norm": 1.2919169624567832, "learning_rate": 5e-06, "loss": 0.4723, "step": 570 }, { "epoch": 1.1751898734177215, "grad_norm": 1.1738791165204896, "learning_rate": 5e-06, "loss": 0.4696, "step": 580 }, { "epoch": 1.1954430379746834, "grad_norm": 1.2749973090696376, "learning_rate": 5e-06, "loss": 0.4655, "step": 590 }, { "epoch": 1.2156962025316456, "grad_norm": 1.3069834745764197, "learning_rate": 5e-06, "loss": 0.4738, "step": 600 }, { "epoch": 1.2359493670886077, "grad_norm": 1.268141928024117, "learning_rate": 5e-06, "loss": 0.4689, "step": 610 }, { "epoch": 1.2562025316455696, "grad_norm": 2.349539426930866, "learning_rate": 5e-06, "loss": 0.4738, "step": 620 }, { "epoch": 1.2764556962025315, "grad_norm": 1.3038280118636631, "learning_rate": 5e-06, "loss": 0.4724, "step": 630 }, { "epoch": 1.2967088607594937, "grad_norm": 1.2561549442694693, "learning_rate": 5e-06, "loss": 0.4717, "step": 640 }, { "epoch": 1.3169620253164558, "grad_norm": 1.2353995568966352, "learning_rate": 5e-06, "loss": 0.4756, "step": 650 }, { "epoch": 1.3372151898734177, "grad_norm": 1.3143696543162657, "learning_rate": 5e-06, "loss": 0.4653, "step": 660 }, { "epoch": 1.3574683544303796, "grad_norm": 1.8332487196614191, "learning_rate": 5e-06, "loss": 0.4791, "step": 670 }, { "epoch": 1.3777215189873417, "grad_norm": 1.5782907788998026, "learning_rate": 5e-06, "loss": 0.4743, "step": 680 }, { "epoch": 1.3979746835443039, "grad_norm": 1.43844179760011, "learning_rate": 5e-06, "loss": 0.4733, "step": 690 }, { "epoch": 1.4182278481012658, "grad_norm": 1.2104409020838625, "learning_rate": 5e-06, "loss": 0.4695, "step": 700 }, { "epoch": 1.438481012658228, "grad_norm": 1.2829692982799845, "learning_rate": 5e-06, "loss": 0.4756, "step": 710 }, { "epoch": 1.4587341772151898, "grad_norm": 1.2115512985925798, "learning_rate": 5e-06, "loss": 0.4757, "step": 720 }, { "epoch": 1.478987341772152, "grad_norm": 1.1708137297250578, "learning_rate": 5e-06, "loss": 0.479, "step": 730 }, { "epoch": 1.4992405063291139, "grad_norm": 1.1860297553473809, "learning_rate": 5e-06, "loss": 0.4737, "step": 740 }, { "epoch": 1.5194936708860758, "grad_norm": 1.159382227654447, "learning_rate": 5e-06, "loss": 0.4764, "step": 750 }, { "epoch": 1.539746835443038, "grad_norm": 1.1694651947442445, "learning_rate": 5e-06, "loss": 0.4765, "step": 760 }, { "epoch": 1.56, "grad_norm": 1.160166128209198, "learning_rate": 5e-06, "loss": 0.4765, "step": 770 }, { "epoch": 1.5802531645569622, "grad_norm": 1.1966330785907486, "learning_rate": 5e-06, "loss": 0.4762, "step": 780 }, { "epoch": 1.600506329113924, "grad_norm": 1.2884100928471158, "learning_rate": 5e-06, "loss": 0.4797, "step": 790 }, { "epoch": 1.620759493670886, "grad_norm": 1.50350568469768, "learning_rate": 5e-06, "loss": 0.4765, "step": 800 }, { "epoch": 1.6410126582278481, "grad_norm": 1.1416454468827366, "learning_rate": 5e-06, "loss": 0.4685, "step": 810 }, { "epoch": 1.6612658227848103, "grad_norm": 1.3289305602168267, "learning_rate": 5e-06, "loss": 0.4771, "step": 820 }, { "epoch": 1.6815189873417722, "grad_norm": 1.242003954761086, "learning_rate": 5e-06, "loss": 0.4721, "step": 830 }, { "epoch": 1.701772151898734, "grad_norm": 1.1084079600907246, "learning_rate": 5e-06, "loss": 0.4731, "step": 840 }, { "epoch": 1.7220253164556962, "grad_norm": 1.268075113702087, "learning_rate": 5e-06, "loss": 0.4766, "step": 850 }, { "epoch": 1.7422784810126584, "grad_norm": 1.152924839399553, "learning_rate": 5e-06, "loss": 0.4767, "step": 860 }, { "epoch": 1.7625316455696203, "grad_norm": 1.0955649434062678, "learning_rate": 5e-06, "loss": 0.4776, "step": 870 }, { "epoch": 1.7827848101265822, "grad_norm": 1.3082664193765503, "learning_rate": 5e-06, "loss": 0.4815, "step": 880 }, { "epoch": 1.8030379746835443, "grad_norm": 1.166931921170078, "learning_rate": 5e-06, "loss": 0.4788, "step": 890 }, { "epoch": 1.8232911392405065, "grad_norm": 1.1423266879559995, "learning_rate": 5e-06, "loss": 0.477, "step": 900 }, { "epoch": 1.8435443037974684, "grad_norm": 1.3005388006776002, "learning_rate": 5e-06, "loss": 0.4789, "step": 910 }, { "epoch": 1.8637974683544303, "grad_norm": 1.5362991154311363, "learning_rate": 5e-06, "loss": 0.4791, "step": 920 }, { "epoch": 1.8840506329113924, "grad_norm": 1.0615549431669202, "learning_rate": 5e-06, "loss": 0.4746, "step": 930 }, { "epoch": 1.9043037974683545, "grad_norm": 1.1359371350640348, "learning_rate": 5e-06, "loss": 0.4774, "step": 940 }, { "epoch": 1.9245569620253165, "grad_norm": 1.338873863277151, "learning_rate": 5e-06, "loss": 0.4799, "step": 950 }, { "epoch": 1.9448101265822784, "grad_norm": 1.0562817698500642, "learning_rate": 5e-06, "loss": 0.4834, "step": 960 }, { "epoch": 1.9650632911392405, "grad_norm": 1.1976389472965596, "learning_rate": 5e-06, "loss": 0.481, "step": 970 }, { "epoch": 1.9853164556962026, "grad_norm": 1.1318384251326352, "learning_rate": 5e-06, "loss": 0.4823, "step": 980 }, { "epoch": 1.999493670886076, "eval_loss": 0.13791824877262115, "eval_runtime": 252.4301, "eval_samples_per_second": 52.704, "eval_steps_per_second": 0.412, "step": 987 }, { "epoch": 2.0060759493670886, "grad_norm": 2.24179568741783, "learning_rate": 5e-06, "loss": 0.4515, "step": 990 }, { "epoch": 2.0263291139240507, "grad_norm": 1.6981313244359, "learning_rate": 5e-06, "loss": 0.3891, "step": 1000 }, { "epoch": 2.046582278481013, "grad_norm": 1.5796760743871119, "learning_rate": 5e-06, "loss": 0.3814, "step": 1010 }, { "epoch": 2.0668354430379745, "grad_norm": 1.7079048311976808, "learning_rate": 5e-06, "loss": 0.3794, "step": 1020 }, { "epoch": 2.0870886075949366, "grad_norm": 1.4027876167762106, "learning_rate": 5e-06, "loss": 0.3793, "step": 1030 }, { "epoch": 2.1073417721518988, "grad_norm": 1.2776686544662923, "learning_rate": 5e-06, "loss": 0.3758, "step": 1040 }, { "epoch": 2.127594936708861, "grad_norm": 1.3728804833099486, "learning_rate": 5e-06, "loss": 0.3868, "step": 1050 }, { "epoch": 2.1478481012658226, "grad_norm": 1.4205398673755427, "learning_rate": 5e-06, "loss": 0.3827, "step": 1060 }, { "epoch": 2.1681012658227847, "grad_norm": 1.3832130487234917, "learning_rate": 5e-06, "loss": 0.3852, "step": 1070 }, { "epoch": 2.188354430379747, "grad_norm": 1.5929946445474668, "learning_rate": 5e-06, "loss": 0.3841, "step": 1080 }, { "epoch": 2.208607594936709, "grad_norm": 1.4600989820317136, "learning_rate": 5e-06, "loss": 0.3818, "step": 1090 }, { "epoch": 2.2288607594936707, "grad_norm": 1.4225880515710438, "learning_rate": 5e-06, "loss": 0.3909, "step": 1100 }, { "epoch": 2.249113924050633, "grad_norm": 1.780076806234694, "learning_rate": 5e-06, "loss": 0.3884, "step": 1110 }, { "epoch": 2.269367088607595, "grad_norm": 1.694668518355522, "learning_rate": 5e-06, "loss": 0.387, "step": 1120 }, { "epoch": 2.289620253164557, "grad_norm": 1.7667024915838094, "learning_rate": 5e-06, "loss": 0.3894, "step": 1130 }, { "epoch": 2.309873417721519, "grad_norm": 1.410793594228534, "learning_rate": 5e-06, "loss": 0.3842, "step": 1140 }, { "epoch": 2.330126582278481, "grad_norm": 1.5234909172570381, "learning_rate": 5e-06, "loss": 0.3908, "step": 1150 }, { "epoch": 2.350379746835443, "grad_norm": 1.2213254079078641, "learning_rate": 5e-06, "loss": 0.3903, "step": 1160 }, { "epoch": 2.370632911392405, "grad_norm": 1.4202255106744544, "learning_rate": 5e-06, "loss": 0.3915, "step": 1170 }, { "epoch": 2.390886075949367, "grad_norm": 1.4023194734455904, "learning_rate": 5e-06, "loss": 0.3924, "step": 1180 }, { "epoch": 2.411139240506329, "grad_norm": 1.7343746037355243, "learning_rate": 5e-06, "loss": 0.3905, "step": 1190 }, { "epoch": 2.431392405063291, "grad_norm": 1.4482110997116577, "learning_rate": 5e-06, "loss": 0.3899, "step": 1200 }, { "epoch": 2.4516455696202533, "grad_norm": 1.9131070141039026, "learning_rate": 5e-06, "loss": 0.3878, "step": 1210 }, { "epoch": 2.4718987341772154, "grad_norm": 1.3699773358857317, "learning_rate": 5e-06, "loss": 0.3907, "step": 1220 }, { "epoch": 2.492151898734177, "grad_norm": 1.2435143983285588, "learning_rate": 5e-06, "loss": 0.3937, "step": 1230 }, { "epoch": 2.512405063291139, "grad_norm": 1.271353869459015, "learning_rate": 5e-06, "loss": 0.393, "step": 1240 }, { "epoch": 2.5326582278481014, "grad_norm": 1.9458404824086235, "learning_rate": 5e-06, "loss": 0.3921, "step": 1250 }, { "epoch": 2.552911392405063, "grad_norm": 1.324983619143349, "learning_rate": 5e-06, "loss": 0.386, "step": 1260 }, { "epoch": 2.573164556962025, "grad_norm": 1.5086572692565505, "learning_rate": 5e-06, "loss": 0.3929, "step": 1270 }, { "epoch": 2.5934177215189873, "grad_norm": 1.2387524175778437, "learning_rate": 5e-06, "loss": 0.3956, "step": 1280 }, { "epoch": 2.6136708860759494, "grad_norm": 1.4032957006294775, "learning_rate": 5e-06, "loss": 0.3985, "step": 1290 }, { "epoch": 2.6339240506329116, "grad_norm": 1.4186988248363785, "learning_rate": 5e-06, "loss": 0.3949, "step": 1300 }, { "epoch": 2.6541772151898733, "grad_norm": 1.474152675372263, "learning_rate": 5e-06, "loss": 0.3949, "step": 1310 }, { "epoch": 2.6744303797468354, "grad_norm": 1.388800748370683, "learning_rate": 5e-06, "loss": 0.3972, "step": 1320 }, { "epoch": 2.6946835443037975, "grad_norm": 1.3404535288890715, "learning_rate": 5e-06, "loss": 0.3975, "step": 1330 }, { "epoch": 2.714936708860759, "grad_norm": 1.3485784987514158, "learning_rate": 5e-06, "loss": 0.3998, "step": 1340 }, { "epoch": 2.7351898734177214, "grad_norm": 1.3182287721338872, "learning_rate": 5e-06, "loss": 0.3972, "step": 1350 }, { "epoch": 2.7554430379746835, "grad_norm": 1.2129071324788472, "learning_rate": 5e-06, "loss": 0.3932, "step": 1360 }, { "epoch": 2.7756962025316456, "grad_norm": 1.4082013444619639, "learning_rate": 5e-06, "loss": 0.3999, "step": 1370 }, { "epoch": 2.7959493670886078, "grad_norm": 1.4390382641502573, "learning_rate": 5e-06, "loss": 0.3974, "step": 1380 }, { "epoch": 2.81620253164557, "grad_norm": 1.3463527613931228, "learning_rate": 5e-06, "loss": 0.4004, "step": 1390 }, { "epoch": 2.8364556962025316, "grad_norm": 1.2190952668881454, "learning_rate": 5e-06, "loss": 0.3945, "step": 1400 }, { "epoch": 2.8567088607594937, "grad_norm": 1.2959901372391192, "learning_rate": 5e-06, "loss": 0.4014, "step": 1410 }, { "epoch": 2.876962025316456, "grad_norm": 1.2846058903861945, "learning_rate": 5e-06, "loss": 0.3987, "step": 1420 }, { "epoch": 2.8972151898734175, "grad_norm": 1.340897452737586, "learning_rate": 5e-06, "loss": 0.3983, "step": 1430 }, { "epoch": 2.9174683544303797, "grad_norm": 1.3196158163944824, "learning_rate": 5e-06, "loss": 0.4016, "step": 1440 }, { "epoch": 2.937721518987342, "grad_norm": 1.275687217034204, "learning_rate": 5e-06, "loss": 0.4105, "step": 1450 }, { "epoch": 2.957974683544304, "grad_norm": 1.4052855567194664, "learning_rate": 5e-06, "loss": 0.3992, "step": 1460 }, { "epoch": 2.978227848101266, "grad_norm": 1.3721809126891413, "learning_rate": 5e-06, "loss": 0.403, "step": 1470 }, { "epoch": 2.9984810126582278, "grad_norm": 1.3733986827541713, "learning_rate": 5e-06, "loss": 0.4008, "step": 1480 }, { "epoch": 2.9984810126582278, "eval_loss": 0.1451626569032669, "eval_runtime": 252.5403, "eval_samples_per_second": 52.681, "eval_steps_per_second": 0.412, "step": 1480 }, { "epoch": 3.019240506329114, "grad_norm": 2.1233881502453333, "learning_rate": 5e-06, "loss": 0.3123, "step": 1490 }, { "epoch": 3.039493670886076, "grad_norm": 1.733976300717687, "learning_rate": 5e-06, "loss": 0.2924, "step": 1500 }, { "epoch": 3.059746835443038, "grad_norm": 1.7085808896440553, "learning_rate": 5e-06, "loss": 0.2942, "step": 1510 }, { "epoch": 3.08, "grad_norm": 1.4544968881558145, "learning_rate": 5e-06, "loss": 0.2933, "step": 1520 }, { "epoch": 3.100253164556962, "grad_norm": 1.6261331378733803, "learning_rate": 5e-06, "loss": 0.2944, "step": 1530 }, { "epoch": 3.120506329113924, "grad_norm": 1.5609283167478611, "learning_rate": 5e-06, "loss": 0.292, "step": 1540 }, { "epoch": 3.140759493670886, "grad_norm": 1.6902454180217539, "learning_rate": 5e-06, "loss": 0.29, "step": 1550 }, { "epoch": 3.161012658227848, "grad_norm": 1.47961286622397, "learning_rate": 5e-06, "loss": 0.2964, "step": 1560 }, { "epoch": 3.1812658227848103, "grad_norm": 1.6098815338414532, "learning_rate": 5e-06, "loss": 0.2967, "step": 1570 }, { "epoch": 3.201518987341772, "grad_norm": 1.4570975957744257, "learning_rate": 5e-06, "loss": 0.2999, "step": 1580 }, { "epoch": 3.221772151898734, "grad_norm": 1.5616376984698326, "learning_rate": 5e-06, "loss": 0.3003, "step": 1590 }, { "epoch": 3.2420253164556962, "grad_norm": 1.5261875412769847, "learning_rate": 5e-06, "loss": 0.3007, "step": 1600 }, { "epoch": 3.2622784810126584, "grad_norm": 1.5927243321009188, "learning_rate": 5e-06, "loss": 0.2992, "step": 1610 }, { "epoch": 3.28253164556962, "grad_norm": 1.4687003886067878, "learning_rate": 5e-06, "loss": 0.3016, "step": 1620 }, { "epoch": 3.302784810126582, "grad_norm": 1.6212181828314611, "learning_rate": 5e-06, "loss": 0.3046, "step": 1630 }, { "epoch": 3.3230379746835443, "grad_norm": 1.7006560112850782, "learning_rate": 5e-06, "loss": 0.3014, "step": 1640 }, { "epoch": 3.3432911392405065, "grad_norm": 1.6369351197517492, "learning_rate": 5e-06, "loss": 0.3031, "step": 1650 }, { "epoch": 3.363544303797468, "grad_norm": 1.8997824836659962, "learning_rate": 5e-06, "loss": 0.3052, "step": 1660 }, { "epoch": 3.3837974683544303, "grad_norm": 1.7686559384037446, "learning_rate": 5e-06, "loss": 0.2994, "step": 1670 }, { "epoch": 3.4040506329113924, "grad_norm": 1.9852860129783203, "learning_rate": 5e-06, "loss": 0.2987, "step": 1680 }, { "epoch": 3.4243037974683546, "grad_norm": 1.6687727310492642, "learning_rate": 5e-06, "loss": 0.3028, "step": 1690 }, { "epoch": 3.4445569620253167, "grad_norm": 1.555588423959256, "learning_rate": 5e-06, "loss": 0.3012, "step": 1700 }, { "epoch": 3.4648101265822784, "grad_norm": 1.570937882764948, "learning_rate": 5e-06, "loss": 0.3033, "step": 1710 }, { "epoch": 3.4850632911392405, "grad_norm": 1.762794823286088, "learning_rate": 5e-06, "loss": 0.3078, "step": 1720 }, { "epoch": 3.5053164556962026, "grad_norm": 1.5085676767074419, "learning_rate": 5e-06, "loss": 0.3076, "step": 1730 }, { "epoch": 3.5255696202531643, "grad_norm": 1.846690362471807, "learning_rate": 5e-06, "loss": 0.3058, "step": 1740 }, { "epoch": 3.5458227848101265, "grad_norm": 1.888656474961098, "learning_rate": 5e-06, "loss": 0.3092, "step": 1750 }, { "epoch": 3.5660759493670886, "grad_norm": 1.5522432118926228, "learning_rate": 5e-06, "loss": 0.3031, "step": 1760 }, { "epoch": 3.5863291139240507, "grad_norm": 1.7040265273376634, "learning_rate": 5e-06, "loss": 0.3097, "step": 1770 }, { "epoch": 3.606582278481013, "grad_norm": 1.6319175616422348, "learning_rate": 5e-06, "loss": 0.3076, "step": 1780 }, { "epoch": 3.6268354430379746, "grad_norm": 1.6539455217449284, "learning_rate": 5e-06, "loss": 0.3054, "step": 1790 }, { "epoch": 3.6470886075949367, "grad_norm": 1.6323498992182317, "learning_rate": 5e-06, "loss": 0.3089, "step": 1800 }, { "epoch": 3.667341772151899, "grad_norm": 1.9811098470470307, "learning_rate": 5e-06, "loss": 0.3141, "step": 1810 }, { "epoch": 3.6875949367088605, "grad_norm": 1.5750523511747638, "learning_rate": 5e-06, "loss": 0.3093, "step": 1820 }, { "epoch": 3.7078481012658226, "grad_norm": 1.5810037258388918, "learning_rate": 5e-06, "loss": 0.3105, "step": 1830 }, { "epoch": 3.728101265822785, "grad_norm": 1.554399518805066, "learning_rate": 5e-06, "loss": 0.3124, "step": 1840 }, { "epoch": 3.748354430379747, "grad_norm": 1.6069072450441366, "learning_rate": 5e-06, "loss": 0.3099, "step": 1850 }, { "epoch": 3.768607594936709, "grad_norm": 1.6769246041867307, "learning_rate": 5e-06, "loss": 0.3173, "step": 1860 }, { "epoch": 3.7888607594936707, "grad_norm": 1.579060667933051, "learning_rate": 5e-06, "loss": 0.3145, "step": 1870 }, { "epoch": 3.809113924050633, "grad_norm": 2.0501104971409396, "learning_rate": 5e-06, "loss": 0.3154, "step": 1880 }, { "epoch": 3.829367088607595, "grad_norm": 2.2983760386486214, "learning_rate": 5e-06, "loss": 0.315, "step": 1890 }, { "epoch": 3.8496202531645567, "grad_norm": 1.780585915706879, "learning_rate": 5e-06, "loss": 0.3157, "step": 1900 }, { "epoch": 3.869873417721519, "grad_norm": 1.77785313158742, "learning_rate": 5e-06, "loss": 0.3117, "step": 1910 }, { "epoch": 3.890126582278481, "grad_norm": 1.6198590987029773, "learning_rate": 5e-06, "loss": 0.3172, "step": 1920 }, { "epoch": 3.910379746835443, "grad_norm": 1.64334724662568, "learning_rate": 5e-06, "loss": 0.3147, "step": 1930 }, { "epoch": 3.9306329113924052, "grad_norm": 1.5274532742672213, "learning_rate": 5e-06, "loss": 0.312, "step": 1940 }, { "epoch": 3.9508860759493674, "grad_norm": 1.7941457309206295, "learning_rate": 5e-06, "loss": 0.3164, "step": 1950 }, { "epoch": 3.971139240506329, "grad_norm": 1.6138102060492845, "learning_rate": 5e-06, "loss": 0.3175, "step": 1960 }, { "epoch": 3.991392405063291, "grad_norm": 1.7439224514084468, "learning_rate": 5e-06, "loss": 0.3172, "step": 1970 }, { "epoch": 3.9954430379746837, "eval_loss": 0.1617216169834137, "eval_runtime": 254.4743, "eval_samples_per_second": 52.28, "eval_steps_per_second": 0.409, "step": 1972 }, { "epoch": 3.9954430379746837, "step": 1972, "total_flos": 3302754688696320.0, "train_loss": 0.4368948165108176, "train_runtime": 56763.579, "train_samples_per_second": 17.811, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 1972, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3302754688696320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }