| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9954430379746837, | |
| "eval_steps": 500, | |
| "global_step": 1972, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020253164556962026, | |
| "grad_norm": 2.8833762486177865, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7568, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04050632911392405, | |
| "grad_norm": 2.390286912889809, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6504, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.060759493670886074, | |
| "grad_norm": 1.489395380260759, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6287, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0810126582278481, | |
| "grad_norm": 1.589470284066133, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6135, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10126582278481013, | |
| "grad_norm": 1.4199327904724377, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6037, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.12151898734177215, | |
| "grad_norm": 1.4578167281681993, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5974, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14177215189873418, | |
| "grad_norm": 1.3411623258601368, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5923, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1620253164556962, | |
| "grad_norm": 2.0256881877625204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5879, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.18227848101265823, | |
| "grad_norm": 1.865143294695778, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5869, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.20253164556962025, | |
| "grad_norm": 2.850869507593689, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5894, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.22278481012658227, | |
| "grad_norm": 2.780764039633903, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5825, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2430379746835443, | |
| "grad_norm": 2.24595769639212, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5792, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.26329113924050634, | |
| "grad_norm": 1.898945973266495, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5765, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.28354430379746837, | |
| "grad_norm": 2.0985064075463518, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5657, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3037974683544304, | |
| "grad_norm": 2.1947025380420393, | |
| "learning_rate": 5e-06, | |
| "loss": 0.576, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3240506329113924, | |
| "grad_norm": 1.9010440206949761, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5748, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.34430379746835443, | |
| "grad_norm": 1.5587857210363367, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5762, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.36455696202531646, | |
| "grad_norm": 1.940350997849539, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5726, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3848101265822785, | |
| "grad_norm": 1.653915136046625, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5724, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4050632911392405, | |
| "grad_norm": 2.4614019893025274, | |
| "learning_rate": 5e-06, | |
| "loss": 0.567, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4253164556962025, | |
| "grad_norm": 1.8586854938543989, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5628, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.44556962025316454, | |
| "grad_norm": 1.4449652115361937, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5679, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.46582278481012657, | |
| "grad_norm": 1.3739469268351123, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5676, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4860759493670886, | |
| "grad_norm": 1.165737020699501, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5644, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5063291139240507, | |
| "grad_norm": 1.357956082726849, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5686, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5265822784810127, | |
| "grad_norm": 1.4295099868510899, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5554, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5468354430379747, | |
| "grad_norm": 1.4688068967033727, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5643, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5670886075949367, | |
| "grad_norm": 1.3744420183733652, | |
| "learning_rate": 5e-06, | |
| "loss": 0.56, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5873417721518988, | |
| "grad_norm": 1.4423763395622504, | |
| "learning_rate": 5e-06, | |
| "loss": 0.56, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6075949367088608, | |
| "grad_norm": 1.4527708533649162, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5596, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6278481012658228, | |
| "grad_norm": 1.4417603290241021, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5655, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6481012658227848, | |
| "grad_norm": 1.6835798959760988, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5611, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6683544303797468, | |
| "grad_norm": 1.3523279967838302, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5607, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6886075949367089, | |
| "grad_norm": 1.2500179332984684, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5597, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7088607594936709, | |
| "grad_norm": 1.2523150009107906, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5614, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7291139240506329, | |
| "grad_norm": 1.0953008806392797, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5638, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7493670886075949, | |
| "grad_norm": 1.6446500941969269, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5566, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.769620253164557, | |
| "grad_norm": 1.4079831931594988, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5533, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.789873417721519, | |
| "grad_norm": 1.1110346875099018, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5559, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.810126582278481, | |
| "grad_norm": 1.4268171466763837, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5566, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.830379746835443, | |
| "grad_norm": 1.2447639143712546, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5514, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.850632911392405, | |
| "grad_norm": 1.195155734961536, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5587, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8708860759493671, | |
| "grad_norm": 1.1003141204515203, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5521, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.8911392405063291, | |
| "grad_norm": 1.3030562652810704, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5573, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9113924050632911, | |
| "grad_norm": 1.2728739765574453, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5488, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9316455696202531, | |
| "grad_norm": 1.1393805858707695, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5532, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9518987341772152, | |
| "grad_norm": 1.252449871115967, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5539, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9721518987341772, | |
| "grad_norm": 1.3675922433289114, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5515, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.9924050632911392, | |
| "grad_norm": 1.0986012485203254, | |
| "learning_rate": 5e-06, | |
| "loss": 0.548, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9984810126582279, | |
| "eval_loss": 0.13772521913051605, | |
| "eval_runtime": 252.9462, | |
| "eval_samples_per_second": 52.596, | |
| "eval_steps_per_second": 0.411, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.0131645569620253, | |
| "grad_norm": 1.8664570600716093, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5056, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0334177215189873, | |
| "grad_norm": 1.690550013794204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4717, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0536708860759494, | |
| "grad_norm": 1.7450606923145526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4683, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.0739240506329113, | |
| "grad_norm": 1.4739241775856946, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4682, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.0941772151898734, | |
| "grad_norm": 2.196710788601052, | |
| "learning_rate": 5e-06, | |
| "loss": 0.467, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.1144303797468353, | |
| "grad_norm": 1.4622782937118193, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4664, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1346835443037975, | |
| "grad_norm": 1.204681750576354, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4629, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.1549367088607596, | |
| "grad_norm": 1.2919169624567832, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4723, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.1751898734177215, | |
| "grad_norm": 1.1738791165204896, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4696, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.1954430379746834, | |
| "grad_norm": 1.2749973090696376, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4655, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.2156962025316456, | |
| "grad_norm": 1.3069834745764197, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4738, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2359493670886077, | |
| "grad_norm": 1.268141928024117, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4689, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.2562025316455696, | |
| "grad_norm": 2.349539426930866, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4738, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.2764556962025315, | |
| "grad_norm": 1.3038280118636631, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4724, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.2967088607594937, | |
| "grad_norm": 1.2561549442694693, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4717, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.3169620253164558, | |
| "grad_norm": 1.2353995568966352, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4756, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.3372151898734177, | |
| "grad_norm": 1.3143696543162657, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4653, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.3574683544303796, | |
| "grad_norm": 1.8332487196614191, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4791, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.3777215189873417, | |
| "grad_norm": 1.5782907788998026, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4743, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.3979746835443039, | |
| "grad_norm": 1.43844179760011, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4733, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.4182278481012658, | |
| "grad_norm": 1.2104409020838625, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4695, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.438481012658228, | |
| "grad_norm": 1.2829692982799845, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4756, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.4587341772151898, | |
| "grad_norm": 1.2115512985925798, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4757, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.478987341772152, | |
| "grad_norm": 1.1708137297250578, | |
| "learning_rate": 5e-06, | |
| "loss": 0.479, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.4992405063291139, | |
| "grad_norm": 1.1860297553473809, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4737, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.5194936708860758, | |
| "grad_norm": 1.159382227654447, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4764, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.539746835443038, | |
| "grad_norm": 1.1694651947442445, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4765, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.160166128209198, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4765, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.5802531645569622, | |
| "grad_norm": 1.1966330785907486, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4762, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.600506329113924, | |
| "grad_norm": 1.2884100928471158, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4797, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.620759493670886, | |
| "grad_norm": 1.50350568469768, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4765, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.6410126582278481, | |
| "grad_norm": 1.1416454468827366, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4685, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.6612658227848103, | |
| "grad_norm": 1.3289305602168267, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4771, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.6815189873417722, | |
| "grad_norm": 1.242003954761086, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4721, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.701772151898734, | |
| "grad_norm": 1.1084079600907246, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4731, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.7220253164556962, | |
| "grad_norm": 1.268075113702087, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4766, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.7422784810126584, | |
| "grad_norm": 1.152924839399553, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4767, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.7625316455696203, | |
| "grad_norm": 1.0955649434062678, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4776, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.7827848101265822, | |
| "grad_norm": 1.3082664193765503, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4815, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.8030379746835443, | |
| "grad_norm": 1.166931921170078, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4788, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.8232911392405065, | |
| "grad_norm": 1.1423266879559995, | |
| "learning_rate": 5e-06, | |
| "loss": 0.477, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.8435443037974684, | |
| "grad_norm": 1.3005388006776002, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4789, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.8637974683544303, | |
| "grad_norm": 1.5362991154311363, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4791, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.8840506329113924, | |
| "grad_norm": 1.0615549431669202, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4746, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.9043037974683545, | |
| "grad_norm": 1.1359371350640348, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4774, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.9245569620253165, | |
| "grad_norm": 1.338873863277151, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4799, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.9448101265822784, | |
| "grad_norm": 1.0562817698500642, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4834, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.9650632911392405, | |
| "grad_norm": 1.1976389472965596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.481, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.9853164556962026, | |
| "grad_norm": 1.1318384251326352, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4823, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.999493670886076, | |
| "eval_loss": 0.13791824877262115, | |
| "eval_runtime": 252.4301, | |
| "eval_samples_per_second": 52.704, | |
| "eval_steps_per_second": 0.412, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 2.0060759493670886, | |
| "grad_norm": 2.24179568741783, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4515, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.0263291139240507, | |
| "grad_norm": 1.6981313244359, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3891, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.046582278481013, | |
| "grad_norm": 1.5796760743871119, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3814, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.0668354430379745, | |
| "grad_norm": 1.7079048311976808, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3794, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.0870886075949366, | |
| "grad_norm": 1.4027876167762106, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3793, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.1073417721518988, | |
| "grad_norm": 1.2776686544662923, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3758, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.127594936708861, | |
| "grad_norm": 1.3728804833099486, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3868, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.1478481012658226, | |
| "grad_norm": 1.4205398673755427, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3827, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.1681012658227847, | |
| "grad_norm": 1.3832130487234917, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3852, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.188354430379747, | |
| "grad_norm": 1.5929946445474668, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3841, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.208607594936709, | |
| "grad_norm": 1.4600989820317136, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3818, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.2288607594936707, | |
| "grad_norm": 1.4225880515710438, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3909, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.249113924050633, | |
| "grad_norm": 1.780076806234694, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3884, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.269367088607595, | |
| "grad_norm": 1.694668518355522, | |
| "learning_rate": 5e-06, | |
| "loss": 0.387, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.289620253164557, | |
| "grad_norm": 1.7667024915838094, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3894, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.309873417721519, | |
| "grad_norm": 1.410793594228534, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3842, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.330126582278481, | |
| "grad_norm": 1.5234909172570381, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3908, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.350379746835443, | |
| "grad_norm": 1.2213254079078641, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3903, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.370632911392405, | |
| "grad_norm": 1.4202255106744544, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3915, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.390886075949367, | |
| "grad_norm": 1.4023194734455904, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3924, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.411139240506329, | |
| "grad_norm": 1.7343746037355243, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3905, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.431392405063291, | |
| "grad_norm": 1.4482110997116577, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3899, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.4516455696202533, | |
| "grad_norm": 1.9131070141039026, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3878, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.4718987341772154, | |
| "grad_norm": 1.3699773358857317, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3907, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.492151898734177, | |
| "grad_norm": 1.2435143983285588, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3937, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.512405063291139, | |
| "grad_norm": 1.271353869459015, | |
| "learning_rate": 5e-06, | |
| "loss": 0.393, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.5326582278481014, | |
| "grad_norm": 1.9458404824086235, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3921, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.552911392405063, | |
| "grad_norm": 1.324983619143349, | |
| "learning_rate": 5e-06, | |
| "loss": 0.386, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.573164556962025, | |
| "grad_norm": 1.5086572692565505, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3929, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.5934177215189873, | |
| "grad_norm": 1.2387524175778437, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3956, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.6136708860759494, | |
| "grad_norm": 1.4032957006294775, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3985, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.6339240506329116, | |
| "grad_norm": 1.4186988248363785, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3949, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.6541772151898733, | |
| "grad_norm": 1.474152675372263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3949, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.6744303797468354, | |
| "grad_norm": 1.388800748370683, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3972, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.6946835443037975, | |
| "grad_norm": 1.3404535288890715, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3975, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.714936708860759, | |
| "grad_norm": 1.3485784987514158, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3998, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.7351898734177214, | |
| "grad_norm": 1.3182287721338872, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3972, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.7554430379746835, | |
| "grad_norm": 1.2129071324788472, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3932, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.7756962025316456, | |
| "grad_norm": 1.4082013444619639, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3999, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.7959493670886078, | |
| "grad_norm": 1.4390382641502573, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3974, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.81620253164557, | |
| "grad_norm": 1.3463527613931228, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4004, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.8364556962025316, | |
| "grad_norm": 1.2190952668881454, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3945, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.8567088607594937, | |
| "grad_norm": 1.2959901372391192, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4014, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.876962025316456, | |
| "grad_norm": 1.2846058903861945, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3987, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.8972151898734175, | |
| "grad_norm": 1.340897452737586, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3983, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.9174683544303797, | |
| "grad_norm": 1.3196158163944824, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4016, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.937721518987342, | |
| "grad_norm": 1.275687217034204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4105, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.957974683544304, | |
| "grad_norm": 1.4052855567194664, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3992, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.978227848101266, | |
| "grad_norm": 1.3721809126891413, | |
| "learning_rate": 5e-06, | |
| "loss": 0.403, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.9984810126582278, | |
| "grad_norm": 1.3733986827541713, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4008, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.9984810126582278, | |
| "eval_loss": 0.1451626569032669, | |
| "eval_runtime": 252.5403, | |
| "eval_samples_per_second": 52.681, | |
| "eval_steps_per_second": 0.412, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.019240506329114, | |
| "grad_norm": 2.1233881502453333, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3123, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.039493670886076, | |
| "grad_norm": 1.733976300717687, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2924, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.059746835443038, | |
| "grad_norm": 1.7085808896440553, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2942, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 1.4544968881558145, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2933, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.100253164556962, | |
| "grad_norm": 1.6261331378733803, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2944, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.120506329113924, | |
| "grad_norm": 1.5609283167478611, | |
| "learning_rate": 5e-06, | |
| "loss": 0.292, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.140759493670886, | |
| "grad_norm": 1.6902454180217539, | |
| "learning_rate": 5e-06, | |
| "loss": 0.29, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.161012658227848, | |
| "grad_norm": 1.47961286622397, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2964, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.1812658227848103, | |
| "grad_norm": 1.6098815338414532, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2967, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.201518987341772, | |
| "grad_norm": 1.4570975957744257, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2999, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.221772151898734, | |
| "grad_norm": 1.5616376984698326, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3003, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.2420253164556962, | |
| "grad_norm": 1.5261875412769847, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3007, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.2622784810126584, | |
| "grad_norm": 1.5927243321009188, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2992, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.28253164556962, | |
| "grad_norm": 1.4687003886067878, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3016, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.302784810126582, | |
| "grad_norm": 1.6212181828314611, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3046, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.3230379746835443, | |
| "grad_norm": 1.7006560112850782, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3014, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.3432911392405065, | |
| "grad_norm": 1.6369351197517492, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3031, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.363544303797468, | |
| "grad_norm": 1.8997824836659962, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3052, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.3837974683544303, | |
| "grad_norm": 1.7686559384037446, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2994, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.4040506329113924, | |
| "grad_norm": 1.9852860129783203, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2987, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.4243037974683546, | |
| "grad_norm": 1.6687727310492642, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3028, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.4445569620253167, | |
| "grad_norm": 1.555588423959256, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3012, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.4648101265822784, | |
| "grad_norm": 1.570937882764948, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3033, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.4850632911392405, | |
| "grad_norm": 1.762794823286088, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3078, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.5053164556962026, | |
| "grad_norm": 1.5085676767074419, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3076, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.5255696202531643, | |
| "grad_norm": 1.846690362471807, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3058, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.5458227848101265, | |
| "grad_norm": 1.888656474961098, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3092, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.5660759493670886, | |
| "grad_norm": 1.5522432118926228, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3031, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.5863291139240507, | |
| "grad_norm": 1.7040265273376634, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3097, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.606582278481013, | |
| "grad_norm": 1.6319175616422348, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3076, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.6268354430379746, | |
| "grad_norm": 1.6539455217449284, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3054, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.6470886075949367, | |
| "grad_norm": 1.6323498992182317, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3089, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.667341772151899, | |
| "grad_norm": 1.9811098470470307, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3141, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.6875949367088605, | |
| "grad_norm": 1.5750523511747638, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3093, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.7078481012658226, | |
| "grad_norm": 1.5810037258388918, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3105, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.728101265822785, | |
| "grad_norm": 1.554399518805066, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3124, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.748354430379747, | |
| "grad_norm": 1.6069072450441366, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3099, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.768607594936709, | |
| "grad_norm": 1.6769246041867307, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3173, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.7888607594936707, | |
| "grad_norm": 1.579060667933051, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3145, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.809113924050633, | |
| "grad_norm": 2.0501104971409396, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3154, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.829367088607595, | |
| "grad_norm": 2.2983760386486214, | |
| "learning_rate": 5e-06, | |
| "loss": 0.315, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.8496202531645567, | |
| "grad_norm": 1.780585915706879, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3157, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.869873417721519, | |
| "grad_norm": 1.77785313158742, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3117, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.890126582278481, | |
| "grad_norm": 1.6198590987029773, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3172, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.910379746835443, | |
| "grad_norm": 1.64334724662568, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3147, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.9306329113924052, | |
| "grad_norm": 1.5274532742672213, | |
| "learning_rate": 5e-06, | |
| "loss": 0.312, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.9508860759493674, | |
| "grad_norm": 1.7941457309206295, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3164, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.971139240506329, | |
| "grad_norm": 1.6138102060492845, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3175, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.991392405063291, | |
| "grad_norm": 1.7439224514084468, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3172, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.9954430379746837, | |
| "eval_loss": 0.1617216169834137, | |
| "eval_runtime": 254.4743, | |
| "eval_samples_per_second": 52.28, | |
| "eval_steps_per_second": 0.409, | |
| "step": 1972 | |
| }, | |
| { | |
| "epoch": 3.9954430379746837, | |
| "step": 1972, | |
| "total_flos": 3302754688696320.0, | |
| "train_loss": 0.4368948165108176, | |
| "train_runtime": 56763.579, | |
| "train_samples_per_second": 17.811, | |
| "train_steps_per_second": 0.035 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1972, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3302754688696320.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |