{ "best_metric": 0.24075740575790405, "best_model_checkpoint": "./results/checkpoint-6750", "epoch": 3.0, "global_step": 6750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.9992592592592596e-05, "loss": 6.945, "step": 1 }, { "epoch": 0.0, "learning_rate": 4.9925925925925926e-05, "loss": 1.4118, "step": 10 }, { "epoch": 0.01, "learning_rate": 4.9851851851851855e-05, "loss": 0.7118, "step": 20 }, { "epoch": 0.01, "eval_loss": 0.6554884910583496, "eval_runtime": 6.9152, "eval_samples_per_second": 144.754, "eval_steps_per_second": 36.297, "step": 25 }, { "epoch": 0.01, "learning_rate": 4.977777777777778e-05, "loss": 0.8383, "step": 30 }, { "epoch": 0.02, "learning_rate": 4.970370370370371e-05, "loss": 0.6623, "step": 40 }, { "epoch": 0.02, "learning_rate": 4.962962962962963e-05, "loss": 0.6282, "step": 50 }, { "epoch": 0.02, "eval_loss": 0.5736271739006042, "eval_runtime": 6.8557, "eval_samples_per_second": 146.01, "eval_steps_per_second": 36.612, "step": 50 }, { "epoch": 0.03, "learning_rate": 4.955555555555556e-05, "loss": 0.5446, "step": 60 }, { "epoch": 0.03, "learning_rate": 4.9481481481481485e-05, "loss": 0.5983, "step": 70 }, { "epoch": 0.03, "eval_loss": 0.5498507022857666, "eval_runtime": 6.8866, "eval_samples_per_second": 145.356, "eval_steps_per_second": 36.448, "step": 75 }, { "epoch": 0.04, "learning_rate": 4.940740740740741e-05, "loss": 0.5406, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.933333333333334e-05, "loss": 0.618, "step": 90 }, { "epoch": 0.04, "learning_rate": 4.925925925925926e-05, "loss": 0.6346, "step": 100 }, { "epoch": 0.04, "eval_loss": 0.5268338918685913, "eval_runtime": 6.9999, "eval_samples_per_second": 143.001, "eval_steps_per_second": 35.857, "step": 100 }, { "epoch": 0.05, "learning_rate": 4.918518518518519e-05, "loss": 0.53, "step": 110 }, { "epoch": 0.05, "learning_rate": 4.9111111111111114e-05, "loss": 0.4268, "step": 120 }, { "epoch": 0.06, "eval_loss": 0.5140772461891174, "eval_runtime": 7.002, "eval_samples_per_second": 142.959, "eval_steps_per_second": 35.847, "step": 125 }, { "epoch": 0.06, "learning_rate": 4.903703703703704e-05, "loss": 0.5474, "step": 130 }, { "epoch": 0.06, "learning_rate": 4.896296296296297e-05, "loss": 0.5773, "step": 140 }, { "epoch": 0.07, "learning_rate": 4.888888888888889e-05, "loss": 0.5611, "step": 150 }, { "epoch": 0.07, "eval_loss": 0.5012452006340027, "eval_runtime": 7.2224, "eval_samples_per_second": 138.597, "eval_steps_per_second": 34.753, "step": 150 }, { "epoch": 0.07, "learning_rate": 4.881481481481482e-05, "loss": 0.5421, "step": 160 }, { "epoch": 0.08, "learning_rate": 4.874074074074074e-05, "loss": 0.5056, "step": 170 }, { "epoch": 0.08, "eval_loss": 0.492546945810318, "eval_runtime": 7.468, "eval_samples_per_second": 134.038, "eval_steps_per_second": 33.61, "step": 175 }, { "epoch": 0.08, "learning_rate": 4.866666666666667e-05, "loss": 0.5086, "step": 180 }, { "epoch": 0.08, "learning_rate": 4.8592592592592596e-05, "loss": 0.4423, "step": 190 }, { "epoch": 0.09, "learning_rate": 4.851851851851852e-05, "loss": 0.5391, "step": 200 }, { "epoch": 0.09, "eval_loss": 0.4847224950790405, "eval_runtime": 7.3642, "eval_samples_per_second": 135.928, "eval_steps_per_second": 34.084, "step": 200 }, { "epoch": 0.09, "learning_rate": 4.844444444444445e-05, "loss": 0.4785, "step": 210 }, { "epoch": 0.1, "learning_rate": 4.837037037037037e-05, "loss": 0.4303, "step": 220 }, { "epoch": 0.1, "eval_loss": 0.4781314730644226, "eval_runtime": 7.4029, "eval_samples_per_second": 135.217, "eval_steps_per_second": 33.906, "step": 225 }, { "epoch": 0.1, "learning_rate": 4.82962962962963e-05, "loss": 0.5914, "step": 230 }, { "epoch": 0.11, "learning_rate": 4.8222222222222225e-05, "loss": 0.4892, "step": 240 }, { "epoch": 0.11, "learning_rate": 4.814814814814815e-05, "loss": 0.5442, "step": 250 }, { "epoch": 0.11, "eval_loss": 0.4688411355018616, "eval_runtime": 7.5871, "eval_samples_per_second": 131.935, "eval_steps_per_second": 33.083, "step": 250 }, { "epoch": 0.12, "learning_rate": 4.807407407407408e-05, "loss": 0.4823, "step": 260 }, { "epoch": 0.12, "learning_rate": 4.8e-05, "loss": 0.4739, "step": 270 }, { "epoch": 0.12, "eval_loss": 0.462319940328598, "eval_runtime": 7.2477, "eval_samples_per_second": 138.112, "eval_steps_per_second": 34.632, "step": 275 }, { "epoch": 0.12, "learning_rate": 4.792592592592593e-05, "loss": 0.5542, "step": 280 }, { "epoch": 0.13, "learning_rate": 4.7851851851851854e-05, "loss": 0.4593, "step": 290 }, { "epoch": 0.13, "learning_rate": 4.7777777777777784e-05, "loss": 0.4388, "step": 300 }, { "epoch": 0.13, "eval_loss": 0.4547964036464691, "eval_runtime": 7.2941, "eval_samples_per_second": 137.235, "eval_steps_per_second": 34.412, "step": 300 }, { "epoch": 0.14, "learning_rate": 4.770370370370371e-05, "loss": 0.4762, "step": 310 }, { "epoch": 0.14, "learning_rate": 4.762962962962963e-05, "loss": 0.428, "step": 320 }, { "epoch": 0.14, "eval_loss": 0.45131900906562805, "eval_runtime": 7.2669, "eval_samples_per_second": 137.748, "eval_steps_per_second": 34.54, "step": 325 }, { "epoch": 0.15, "learning_rate": 4.755555555555556e-05, "loss": 0.4719, "step": 330 }, { "epoch": 0.15, "learning_rate": 4.7481481481481483e-05, "loss": 0.4293, "step": 340 }, { "epoch": 0.16, "learning_rate": 4.740740740740741e-05, "loss": 0.4112, "step": 350 }, { "epoch": 0.16, "eval_loss": 0.4421839714050293, "eval_runtime": 7.3019, "eval_samples_per_second": 137.087, "eval_steps_per_second": 34.374, "step": 350 }, { "epoch": 0.16, "learning_rate": 4.7333333333333336e-05, "loss": 0.3981, "step": 360 }, { "epoch": 0.16, "learning_rate": 4.7259259259259266e-05, "loss": 0.501, "step": 370 }, { "epoch": 0.17, "eval_loss": 0.43612053990364075, "eval_runtime": 7.1757, "eval_samples_per_second": 139.499, "eval_steps_per_second": 34.979, "step": 375 }, { "epoch": 0.17, "learning_rate": 4.718518518518519e-05, "loss": 0.4066, "step": 380 }, { "epoch": 0.17, "learning_rate": 4.711111111111111e-05, "loss": 0.4892, "step": 390 }, { "epoch": 0.18, "learning_rate": 4.703703703703704e-05, "loss": 0.4584, "step": 400 }, { "epoch": 0.18, "eval_loss": 0.4388324022293091, "eval_runtime": 7.2341, "eval_samples_per_second": 138.372, "eval_steps_per_second": 34.697, "step": 400 }, { "epoch": 0.18, "learning_rate": 4.6962962962962966e-05, "loss": 0.432, "step": 410 }, { "epoch": 0.19, "learning_rate": 4.6888888888888895e-05, "loss": 0.4717, "step": 420 }, { "epoch": 0.19, "eval_loss": 0.42829033732414246, "eval_runtime": 7.261, "eval_samples_per_second": 137.86, "eval_steps_per_second": 34.568, "step": 425 }, { "epoch": 0.19, "learning_rate": 4.681481481481482e-05, "loss": 0.399, "step": 430 }, { "epoch": 0.2, "learning_rate": 4.674074074074074e-05, "loss": 0.4021, "step": 440 }, { "epoch": 0.2, "learning_rate": 4.666666666666667e-05, "loss": 0.4122, "step": 450 }, { "epoch": 0.2, "eval_loss": 0.4233491122722626, "eval_runtime": 7.2994, "eval_samples_per_second": 137.134, "eval_steps_per_second": 34.386, "step": 450 }, { "epoch": 0.2, "learning_rate": 4.6592592592592595e-05, "loss": 0.4539, "step": 460 }, { "epoch": 0.21, "learning_rate": 4.6518518518518525e-05, "loss": 0.5041, "step": 470 }, { "epoch": 0.21, "eval_loss": 0.4170687794685364, "eval_runtime": 7.1967, "eval_samples_per_second": 139.092, "eval_steps_per_second": 34.877, "step": 475 }, { "epoch": 0.21, "learning_rate": 4.644444444444445e-05, "loss": 0.5674, "step": 480 }, { "epoch": 0.22, "learning_rate": 4.637037037037038e-05, "loss": 0.4226, "step": 490 }, { "epoch": 0.22, "learning_rate": 4.62962962962963e-05, "loss": 0.4153, "step": 500 }, { "epoch": 0.22, "eval_loss": 0.41686439514160156, "eval_runtime": 7.2526, "eval_samples_per_second": 138.019, "eval_steps_per_second": 34.608, "step": 500 }, { "epoch": 0.23, "learning_rate": 4.6222222222222224e-05, "loss": 0.3956, "step": 510 }, { "epoch": 0.23, "learning_rate": 4.6148148148148154e-05, "loss": 0.4524, "step": 520 }, { "epoch": 0.23, "eval_loss": 0.41372087597846985, "eval_runtime": 7.2599, "eval_samples_per_second": 137.88, "eval_steps_per_second": 34.573, "step": 525 }, { "epoch": 0.24, "learning_rate": 4.607407407407408e-05, "loss": 0.4319, "step": 530 }, { "epoch": 0.24, "learning_rate": 4.600000000000001e-05, "loss": 0.4098, "step": 540 }, { "epoch": 0.24, "learning_rate": 4.592592592592593e-05, "loss": 0.511, "step": 550 }, { "epoch": 0.24, "eval_loss": 0.406745046377182, "eval_runtime": 7.4218, "eval_samples_per_second": 134.872, "eval_steps_per_second": 33.819, "step": 550 }, { "epoch": 0.25, "learning_rate": 4.585185185185185e-05, "loss": 0.5086, "step": 560 }, { "epoch": 0.25, "learning_rate": 4.577777777777778e-05, "loss": 0.4096, "step": 570 }, { "epoch": 0.26, "eval_loss": 0.404786080121994, "eval_runtime": 7.39, "eval_samples_per_second": 135.453, "eval_steps_per_second": 33.965, "step": 575 }, { "epoch": 0.26, "learning_rate": 4.5703703703703706e-05, "loss": 0.4281, "step": 580 }, { "epoch": 0.26, "learning_rate": 4.5629629629629636e-05, "loss": 0.4779, "step": 590 }, { "epoch": 0.27, "learning_rate": 4.555555555555556e-05, "loss": 0.405, "step": 600 }, { "epoch": 0.27, "eval_loss": 0.40296873450279236, "eval_runtime": 7.4985, "eval_samples_per_second": 133.493, "eval_steps_per_second": 33.473, "step": 600 }, { "epoch": 0.27, "learning_rate": 4.548148148148149e-05, "loss": 0.4144, "step": 610 }, { "epoch": 0.28, "learning_rate": 4.540740740740741e-05, "loss": 0.3645, "step": 620 }, { "epoch": 0.28, "eval_loss": 0.39786386489868164, "eval_runtime": 7.5248, "eval_samples_per_second": 133.027, "eval_steps_per_second": 33.356, "step": 625 }, { "epoch": 0.28, "learning_rate": 4.5333333333333335e-05, "loss": 0.3679, "step": 630 }, { "epoch": 0.28, "learning_rate": 4.5259259259259265e-05, "loss": 0.3724, "step": 640 }, { "epoch": 0.29, "learning_rate": 4.518518518518519e-05, "loss": 0.4452, "step": 650 }, { "epoch": 0.29, "eval_loss": 0.39415648579597473, "eval_runtime": 7.2333, "eval_samples_per_second": 138.388, "eval_steps_per_second": 34.701, "step": 650 }, { "epoch": 0.29, "learning_rate": 4.511111111111112e-05, "loss": 0.325, "step": 660 }, { "epoch": 0.3, "learning_rate": 4.503703703703704e-05, "loss": 0.4001, "step": 670 }, { "epoch": 0.3, "eval_loss": 0.38874566555023193, "eval_runtime": 7.2318, "eval_samples_per_second": 138.417, "eval_steps_per_second": 34.708, "step": 675 }, { "epoch": 0.3, "learning_rate": 4.496296296296297e-05, "loss": 0.3631, "step": 680 }, { "epoch": 0.31, "learning_rate": 4.4888888888888894e-05, "loss": 0.4206, "step": 690 }, { "epoch": 0.31, "learning_rate": 4.481481481481482e-05, "loss": 0.4616, "step": 700 }, { "epoch": 0.31, "eval_loss": 0.38908329606056213, "eval_runtime": 7.2894, "eval_samples_per_second": 137.323, "eval_steps_per_second": 34.434, "step": 700 }, { "epoch": 0.32, "learning_rate": 4.474074074074075e-05, "loss": 0.4723, "step": 710 }, { "epoch": 0.32, "learning_rate": 4.466666666666667e-05, "loss": 0.3344, "step": 720 }, { "epoch": 0.32, "eval_loss": 0.38453349471092224, "eval_runtime": 7.2247, "eval_samples_per_second": 138.552, "eval_steps_per_second": 34.742, "step": 725 }, { "epoch": 0.32, "learning_rate": 4.4592592592592594e-05, "loss": 0.4402, "step": 730 }, { "epoch": 0.33, "learning_rate": 4.4518518518518523e-05, "loss": 0.389, "step": 740 }, { "epoch": 0.33, "learning_rate": 4.4444444444444447e-05, "loss": 0.3899, "step": 750 }, { "epoch": 0.33, "eval_loss": 0.3842224180698395, "eval_runtime": 7.3669, "eval_samples_per_second": 135.879, "eval_steps_per_second": 34.071, "step": 750 }, { "epoch": 0.34, "learning_rate": 4.4370370370370376e-05, "loss": 0.3759, "step": 760 }, { "epoch": 0.34, "learning_rate": 4.42962962962963e-05, "loss": 0.3662, "step": 770 }, { "epoch": 0.34, "eval_loss": 0.3829396665096283, "eval_runtime": 7.3321, "eval_samples_per_second": 136.523, "eval_steps_per_second": 34.233, "step": 775 }, { "epoch": 0.35, "learning_rate": 4.422222222222222e-05, "loss": 0.4342, "step": 780 }, { "epoch": 0.35, "learning_rate": 4.414814814814815e-05, "loss": 0.4007, "step": 790 }, { "epoch": 0.36, "learning_rate": 4.4074074074074076e-05, "loss": 0.3931, "step": 800 }, { "epoch": 0.36, "eval_loss": 0.37909042835235596, "eval_runtime": 7.3121, "eval_samples_per_second": 136.896, "eval_steps_per_second": 34.327, "step": 800 }, { "epoch": 0.36, "learning_rate": 4.4000000000000006e-05, "loss": 0.3937, "step": 810 }, { "epoch": 0.36, "learning_rate": 4.392592592592593e-05, "loss": 0.3655, "step": 820 }, { "epoch": 0.37, "eval_loss": 0.37680765986442566, "eval_runtime": 7.2101, "eval_samples_per_second": 138.832, "eval_steps_per_second": 34.812, "step": 825 }, { "epoch": 0.37, "learning_rate": 4.385185185185185e-05, "loss": 0.3454, "step": 830 }, { "epoch": 0.37, "learning_rate": 4.377777777777778e-05, "loss": 0.3279, "step": 840 }, { "epoch": 0.38, "learning_rate": 4.3703703703703705e-05, "loss": 0.4125, "step": 850 }, { "epoch": 0.38, "eval_loss": 0.3775666058063507, "eval_runtime": 7.2081, "eval_samples_per_second": 138.872, "eval_steps_per_second": 34.822, "step": 850 }, { "epoch": 0.38, "learning_rate": 4.3629629629629635e-05, "loss": 0.3324, "step": 860 }, { "epoch": 0.39, "learning_rate": 4.355555555555556e-05, "loss": 0.3848, "step": 870 }, { "epoch": 0.39, "eval_loss": 0.373475044965744, "eval_runtime": 7.1966, "eval_samples_per_second": 139.093, "eval_steps_per_second": 34.877, "step": 875 }, { "epoch": 0.39, "learning_rate": 4.348148148148148e-05, "loss": 0.5328, "step": 880 }, { "epoch": 0.4, "learning_rate": 4.340740740740741e-05, "loss": 0.3791, "step": 890 }, { "epoch": 0.4, "learning_rate": 4.3333333333333334e-05, "loss": 0.3907, "step": 900 }, { "epoch": 0.4, "eval_loss": 0.3697744607925415, "eval_runtime": 7.2557, "eval_samples_per_second": 137.961, "eval_steps_per_second": 34.594, "step": 900 }, { "epoch": 0.4, "learning_rate": 4.325925925925926e-05, "loss": 0.3595, "step": 910 }, { "epoch": 0.41, "learning_rate": 4.318518518518519e-05, "loss": 0.4298, "step": 920 }, { "epoch": 0.41, "eval_loss": 0.3690047562122345, "eval_runtime": 7.1998, "eval_samples_per_second": 139.032, "eval_steps_per_second": 34.862, "step": 925 }, { "epoch": 0.41, "learning_rate": 4.311111111111111e-05, "loss": 0.368, "step": 930 }, { "epoch": 0.42, "learning_rate": 4.303703703703704e-05, "loss": 0.384, "step": 940 }, { "epoch": 0.42, "learning_rate": 4.296296296296296e-05, "loss": 0.4369, "step": 950 }, { "epoch": 0.42, "eval_loss": 0.3619600534439087, "eval_runtime": 7.2818, "eval_samples_per_second": 137.467, "eval_steps_per_second": 34.47, "step": 950 }, { "epoch": 0.43, "learning_rate": 4.2888888888888886e-05, "loss": 0.3569, "step": 960 }, { "epoch": 0.43, "learning_rate": 4.2814814814814816e-05, "loss": 0.3332, "step": 970 }, { "epoch": 0.43, "eval_loss": 0.3631249666213989, "eval_runtime": 7.2547, "eval_samples_per_second": 137.98, "eval_steps_per_second": 34.598, "step": 975 }, { "epoch": 0.44, "learning_rate": 4.274074074074074e-05, "loss": 0.3758, "step": 980 }, { "epoch": 0.44, "learning_rate": 4.266666666666667e-05, "loss": 0.385, "step": 990 }, { "epoch": 0.44, "learning_rate": 4.259259259259259e-05, "loss": 0.4328, "step": 1000 }, { "epoch": 0.44, "eval_loss": 0.35931944847106934, "eval_runtime": 7.3342, "eval_samples_per_second": 136.484, "eval_steps_per_second": 34.223, "step": 1000 }, { "epoch": 0.45, "learning_rate": 4.2518518518518515e-05, "loss": 0.4093, "step": 1010 }, { "epoch": 0.45, "learning_rate": 4.2444444444444445e-05, "loss": 0.3752, "step": 1020 }, { "epoch": 0.46, "eval_loss": 0.3549325466156006, "eval_runtime": 7.1796, "eval_samples_per_second": 139.424, "eval_steps_per_second": 34.96, "step": 1025 }, { "epoch": 0.46, "learning_rate": 4.237037037037037e-05, "loss": 0.3473, "step": 1030 }, { "epoch": 0.46, "learning_rate": 4.22962962962963e-05, "loss": 0.3823, "step": 1040 }, { "epoch": 0.47, "learning_rate": 4.222222222222222e-05, "loss": 0.4004, "step": 1050 }, { "epoch": 0.47, "eval_loss": 0.3550175428390503, "eval_runtime": 7.3, "eval_samples_per_second": 137.122, "eval_steps_per_second": 34.383, "step": 1050 }, { "epoch": 0.47, "learning_rate": 4.2148148148148145e-05, "loss": 0.3484, "step": 1060 }, { "epoch": 0.48, "learning_rate": 4.2074074074074075e-05, "loss": 0.3552, "step": 1070 }, { "epoch": 0.48, "eval_loss": 0.3549079895019531, "eval_runtime": 7.6065, "eval_samples_per_second": 131.599, "eval_steps_per_second": 32.998, "step": 1075 }, { "epoch": 0.48, "learning_rate": 4.2e-05, "loss": 0.3847, "step": 1080 }, { "epoch": 0.48, "learning_rate": 4.192592592592593e-05, "loss": 0.3861, "step": 1090 }, { "epoch": 0.49, "learning_rate": 4.185185185185185e-05, "loss": 0.3719, "step": 1100 }, { "epoch": 0.49, "eval_loss": 0.3503241539001465, "eval_runtime": 7.7017, "eval_samples_per_second": 129.971, "eval_steps_per_second": 32.59, "step": 1100 }, { "epoch": 0.49, "learning_rate": 4.177777777777778e-05, "loss": 0.3661, "step": 1110 }, { "epoch": 0.5, "learning_rate": 4.1703703703703704e-05, "loss": 0.3762, "step": 1120 }, { "epoch": 0.5, "eval_loss": 0.3482719659805298, "eval_runtime": 7.5987, "eval_samples_per_second": 131.732, "eval_steps_per_second": 33.032, "step": 1125 }, { "epoch": 0.5, "learning_rate": 4.162962962962963e-05, "loss": 0.3803, "step": 1130 }, { "epoch": 0.51, "learning_rate": 4.155555555555556e-05, "loss": 0.2975, "step": 1140 }, { "epoch": 0.51, "learning_rate": 4.148148148148148e-05, "loss": 0.3397, "step": 1150 }, { "epoch": 0.51, "eval_loss": 0.34838753938674927, "eval_runtime": 7.5913, "eval_samples_per_second": 131.861, "eval_steps_per_second": 33.064, "step": 1150 }, { "epoch": 0.52, "learning_rate": 4.140740740740741e-05, "loss": 0.3218, "step": 1160 }, { "epoch": 0.52, "learning_rate": 4.133333333333333e-05, "loss": 0.345, "step": 1170 }, { "epoch": 0.52, "eval_loss": 0.3447699248790741, "eval_runtime": 7.6124, "eval_samples_per_second": 131.496, "eval_steps_per_second": 32.973, "step": 1175 }, { "epoch": 0.52, "learning_rate": 4.1259259259259256e-05, "loss": 0.3635, "step": 1180 }, { "epoch": 0.53, "learning_rate": 4.1185185185185186e-05, "loss": 0.3337, "step": 1190 }, { "epoch": 0.53, "learning_rate": 4.111111111111111e-05, "loss": 0.3892, "step": 1200 }, { "epoch": 0.53, "eval_loss": 0.3436849117279053, "eval_runtime": 7.6409, "eval_samples_per_second": 131.006, "eval_steps_per_second": 32.85, "step": 1200 }, { "epoch": 0.54, "learning_rate": 4.103703703703704e-05, "loss": 0.3792, "step": 1210 }, { "epoch": 0.54, "learning_rate": 4.096296296296296e-05, "loss": 0.3062, "step": 1220 }, { "epoch": 0.54, "eval_loss": 0.34092003107070923, "eval_runtime": 7.6331, "eval_samples_per_second": 131.14, "eval_steps_per_second": 32.883, "step": 1225 }, { "epoch": 0.55, "learning_rate": 4.088888888888889e-05, "loss": 0.3764, "step": 1230 }, { "epoch": 0.55, "learning_rate": 4.0814814814814815e-05, "loss": 0.3541, "step": 1240 }, { "epoch": 0.56, "learning_rate": 4.074074074074074e-05, "loss": 0.3728, "step": 1250 }, { "epoch": 0.56, "eval_loss": 0.3405691683292389, "eval_runtime": 7.5692, "eval_samples_per_second": 132.247, "eval_steps_per_second": 33.161, "step": 1250 }, { "epoch": 0.56, "learning_rate": 4.066666666666667e-05, "loss": 0.3013, "step": 1260 }, { "epoch": 0.56, "learning_rate": 4.059259259259259e-05, "loss": 0.321, "step": 1270 }, { "epoch": 0.57, "eval_loss": 0.3399699926376343, "eval_runtime": 7.588, "eval_samples_per_second": 131.919, "eval_steps_per_second": 33.079, "step": 1275 }, { "epoch": 0.57, "learning_rate": 4.051851851851852e-05, "loss": 0.3404, "step": 1280 }, { "epoch": 0.57, "learning_rate": 4.0444444444444444e-05, "loss": 0.381, "step": 1290 }, { "epoch": 0.58, "learning_rate": 4.0370370370370374e-05, "loss": 0.314, "step": 1300 }, { "epoch": 0.58, "eval_loss": 0.3377434313297272, "eval_runtime": 7.5595, "eval_samples_per_second": 132.417, "eval_steps_per_second": 33.203, "step": 1300 }, { "epoch": 0.58, "learning_rate": 4.02962962962963e-05, "loss": 0.3016, "step": 1310 }, { "epoch": 0.59, "learning_rate": 4.022222222222222e-05, "loss": 0.4416, "step": 1320 }, { "epoch": 0.59, "eval_loss": 0.33633822202682495, "eval_runtime": 7.5093, "eval_samples_per_second": 133.302, "eval_steps_per_second": 33.425, "step": 1325 }, { "epoch": 0.59, "learning_rate": 4.014814814814815e-05, "loss": 0.348, "step": 1330 }, { "epoch": 0.6, "learning_rate": 4.007407407407407e-05, "loss": 0.3685, "step": 1340 }, { "epoch": 0.6, "learning_rate": 4e-05, "loss": 0.362, "step": 1350 }, { "epoch": 0.6, "eval_loss": 0.33278197050094604, "eval_runtime": 7.5483, "eval_samples_per_second": 132.612, "eval_steps_per_second": 33.252, "step": 1350 }, { "epoch": 0.6, "learning_rate": 3.9925925925925926e-05, "loss": 0.3961, "step": 1360 }, { "epoch": 0.61, "learning_rate": 3.985185185185185e-05, "loss": 0.2903, "step": 1370 }, { "epoch": 0.61, "eval_loss": 0.33177217841148376, "eval_runtime": 7.574, "eval_samples_per_second": 132.162, "eval_steps_per_second": 33.14, "step": 1375 }, { "epoch": 0.61, "learning_rate": 3.977777777777778e-05, "loss": 0.3527, "step": 1380 }, { "epoch": 0.62, "learning_rate": 3.97037037037037e-05, "loss": 0.3297, "step": 1390 }, { "epoch": 0.62, "learning_rate": 3.962962962962963e-05, "loss": 0.3362, "step": 1400 }, { "epoch": 0.62, "eval_loss": 0.3336848020553589, "eval_runtime": 7.5409, "eval_samples_per_second": 132.743, "eval_steps_per_second": 33.285, "step": 1400 }, { "epoch": 0.63, "learning_rate": 3.9555555555555556e-05, "loss": 0.3714, "step": 1410 }, { "epoch": 0.63, "learning_rate": 3.9481481481481485e-05, "loss": 0.3278, "step": 1420 }, { "epoch": 0.63, "eval_loss": 0.32720068097114563, "eval_runtime": 7.5879, "eval_samples_per_second": 131.921, "eval_steps_per_second": 33.079, "step": 1425 }, { "epoch": 0.64, "learning_rate": 3.940740740740741e-05, "loss": 0.3328, "step": 1430 }, { "epoch": 0.64, "learning_rate": 3.933333333333333e-05, "loss": 0.2908, "step": 1440 }, { "epoch": 0.64, "learning_rate": 3.925925925925926e-05, "loss": 0.358, "step": 1450 }, { "epoch": 0.64, "eval_loss": 0.3258882761001587, "eval_runtime": 7.6673, "eval_samples_per_second": 130.554, "eval_steps_per_second": 32.736, "step": 1450 }, { "epoch": 0.65, "learning_rate": 3.9185185185185185e-05, "loss": 0.4164, "step": 1460 }, { "epoch": 0.65, "learning_rate": 3.9111111111111115e-05, "loss": 0.3103, "step": 1470 }, { "epoch": 0.66, "eval_loss": 0.3261792063713074, "eval_runtime": 7.6042, "eval_samples_per_second": 131.637, "eval_steps_per_second": 33.008, "step": 1475 }, { "epoch": 0.66, "learning_rate": 3.903703703703704e-05, "loss": 0.4174, "step": 1480 }, { "epoch": 0.66, "learning_rate": 3.896296296296296e-05, "loss": 0.3545, "step": 1490 }, { "epoch": 0.67, "learning_rate": 3.888888888888889e-05, "loss": 0.3297, "step": 1500 }, { "epoch": 0.67, "eval_loss": 0.32423877716064453, "eval_runtime": 7.6588, "eval_samples_per_second": 130.7, "eval_steps_per_second": 32.773, "step": 1500 }, { "epoch": 0.67, "learning_rate": 3.8814814814814814e-05, "loss": 0.3856, "step": 1510 }, { "epoch": 0.68, "learning_rate": 3.8740740740740744e-05, "loss": 0.3598, "step": 1520 }, { "epoch": 0.68, "eval_loss": 0.3228550851345062, "eval_runtime": 7.6093, "eval_samples_per_second": 131.55, "eval_steps_per_second": 32.986, "step": 1525 }, { "epoch": 0.68, "learning_rate": 3.866666666666667e-05, "loss": 0.3218, "step": 1530 }, { "epoch": 0.68, "learning_rate": 3.85925925925926e-05, "loss": 0.3577, "step": 1540 }, { "epoch": 0.69, "learning_rate": 3.851851851851852e-05, "loss": 0.3343, "step": 1550 }, { "epoch": 0.69, "eval_loss": 0.32196611166000366, "eval_runtime": 7.6896, "eval_samples_per_second": 130.175, "eval_steps_per_second": 32.641, "step": 1550 }, { "epoch": 0.69, "learning_rate": 3.844444444444444e-05, "loss": 0.2857, "step": 1560 }, { "epoch": 0.7, "learning_rate": 3.837037037037037e-05, "loss": 0.3324, "step": 1570 }, { "epoch": 0.7, "eval_loss": 0.32020455598831177, "eval_runtime": 7.512, "eval_samples_per_second": 133.254, "eval_steps_per_second": 33.413, "step": 1575 }, { "epoch": 0.7, "learning_rate": 3.8296296296296296e-05, "loss": 0.3791, "step": 1580 }, { "epoch": 0.71, "learning_rate": 3.8222222222222226e-05, "loss": 0.3563, "step": 1590 }, { "epoch": 0.71, "learning_rate": 3.814814814814815e-05, "loss": 0.3744, "step": 1600 }, { "epoch": 0.71, "eval_loss": 0.31763964891433716, "eval_runtime": 7.5475, "eval_samples_per_second": 132.626, "eval_steps_per_second": 33.256, "step": 1600 }, { "epoch": 0.72, "learning_rate": 3.807407407407408e-05, "loss": 0.3689, "step": 1610 }, { "epoch": 0.72, "learning_rate": 3.8e-05, "loss": 0.3385, "step": 1620 }, { "epoch": 0.72, "eval_loss": 0.3172718584537506, "eval_runtime": 7.4165, "eval_samples_per_second": 134.969, "eval_steps_per_second": 33.843, "step": 1625 }, { "epoch": 0.72, "learning_rate": 3.7925925925925925e-05, "loss": 0.3142, "step": 1630 }, { "epoch": 0.73, "learning_rate": 3.7851851851851855e-05, "loss": 0.369, "step": 1640 }, { "epoch": 0.73, "learning_rate": 3.777777777777778e-05, "loss": 0.3264, "step": 1650 }, { "epoch": 0.73, "eval_loss": 0.31627902388572693, "eval_runtime": 7.4047, "eval_samples_per_second": 135.184, "eval_steps_per_second": 33.897, "step": 1650 }, { "epoch": 0.74, "learning_rate": 3.770370370370371e-05, "loss": 0.3481, "step": 1660 }, { "epoch": 0.74, "learning_rate": 3.762962962962963e-05, "loss": 0.3162, "step": 1670 }, { "epoch": 0.74, "eval_loss": 0.31443050503730774, "eval_runtime": 7.2703, "eval_samples_per_second": 137.683, "eval_steps_per_second": 34.524, "step": 1675 }, { "epoch": 0.75, "learning_rate": 3.7555555555555554e-05, "loss": 0.3458, "step": 1680 }, { "epoch": 0.75, "learning_rate": 3.7481481481481484e-05, "loss": 0.3641, "step": 1690 }, { "epoch": 0.76, "learning_rate": 3.740740740740741e-05, "loss": 0.3399, "step": 1700 }, { "epoch": 0.76, "eval_loss": 0.3139602839946747, "eval_runtime": 7.5549, "eval_samples_per_second": 132.496, "eval_steps_per_second": 33.223, "step": 1700 }, { "epoch": 0.76, "learning_rate": 3.733333333333334e-05, "loss": 0.3082, "step": 1710 }, { "epoch": 0.76, "learning_rate": 3.725925925925926e-05, "loss": 0.3544, "step": 1720 }, { "epoch": 0.77, "eval_loss": 0.3116389811038971, "eval_runtime": 7.3628, "eval_samples_per_second": 135.955, "eval_steps_per_second": 34.091, "step": 1725 }, { "epoch": 0.77, "learning_rate": 3.718518518518519e-05, "loss": 0.3356, "step": 1730 }, { "epoch": 0.77, "learning_rate": 3.7111111111111113e-05, "loss": 0.383, "step": 1740 }, { "epoch": 0.78, "learning_rate": 3.7037037037037037e-05, "loss": 0.3839, "step": 1750 }, { "epoch": 0.78, "eval_loss": 0.3124999403953552, "eval_runtime": 7.3086, "eval_samples_per_second": 136.962, "eval_steps_per_second": 34.343, "step": 1750 }, { "epoch": 0.78, "learning_rate": 3.6962962962962966e-05, "loss": 0.3378, "step": 1760 }, { "epoch": 0.79, "learning_rate": 3.688888888888889e-05, "loss": 0.3034, "step": 1770 }, { "epoch": 0.79, "eval_loss": 0.31177276372909546, "eval_runtime": 7.2443, "eval_samples_per_second": 138.177, "eval_steps_per_second": 34.648, "step": 1775 }, { "epoch": 0.79, "learning_rate": 3.681481481481482e-05, "loss": 0.3384, "step": 1780 }, { "epoch": 0.8, "learning_rate": 3.674074074074074e-05, "loss": 0.3401, "step": 1790 }, { "epoch": 0.8, "learning_rate": 3.6666666666666666e-05, "loss": 0.2989, "step": 1800 }, { "epoch": 0.8, "eval_loss": 0.3089355528354645, "eval_runtime": 7.1728, "eval_samples_per_second": 139.556, "eval_steps_per_second": 34.993, "step": 1800 }, { "epoch": 0.8, "learning_rate": 3.6592592592592596e-05, "loss": 0.3398, "step": 1810 }, { "epoch": 0.81, "learning_rate": 3.651851851851852e-05, "loss": 0.3055, "step": 1820 }, { "epoch": 0.81, "eval_loss": 0.30923065543174744, "eval_runtime": 7.208, "eval_samples_per_second": 138.874, "eval_steps_per_second": 34.822, "step": 1825 }, { "epoch": 0.81, "learning_rate": 3.644444444444445e-05, "loss": 0.3088, "step": 1830 }, { "epoch": 0.82, "learning_rate": 3.637037037037037e-05, "loss": 0.3016, "step": 1840 }, { "epoch": 0.82, "learning_rate": 3.62962962962963e-05, "loss": 0.3143, "step": 1850 }, { "epoch": 0.82, "eval_loss": 0.3072822690010071, "eval_runtime": 7.2053, "eval_samples_per_second": 138.926, "eval_steps_per_second": 34.836, "step": 1850 }, { "epoch": 0.83, "learning_rate": 3.6222222222222225e-05, "loss": 0.2985, "step": 1860 }, { "epoch": 0.83, "learning_rate": 3.614814814814815e-05, "loss": 0.3805, "step": 1870 }, { "epoch": 0.83, "eval_loss": 0.3062894940376282, "eval_runtime": 7.2009, "eval_samples_per_second": 139.01, "eval_steps_per_second": 34.857, "step": 1875 }, { "epoch": 0.84, "learning_rate": 3.607407407407408e-05, "loss": 0.2831, "step": 1880 }, { "epoch": 0.84, "learning_rate": 3.6e-05, "loss": 0.3258, "step": 1890 }, { "epoch": 0.84, "learning_rate": 3.592592592592593e-05, "loss": 0.2625, "step": 1900 }, { "epoch": 0.84, "eval_loss": 0.30678972601890564, "eval_runtime": 7.1985, "eval_samples_per_second": 139.057, "eval_steps_per_second": 34.869, "step": 1900 }, { "epoch": 0.85, "learning_rate": 3.5851851851851854e-05, "loss": 0.2923, "step": 1910 }, { "epoch": 0.85, "learning_rate": 3.577777777777778e-05, "loss": 0.3276, "step": 1920 }, { "epoch": 0.86, "eval_loss": 0.30508390069007874, "eval_runtime": 7.2926, "eval_samples_per_second": 137.262, "eval_steps_per_second": 34.418, "step": 1925 }, { "epoch": 0.86, "learning_rate": 3.570370370370371e-05, "loss": 0.3521, "step": 1930 }, { "epoch": 0.86, "learning_rate": 3.562962962962963e-05, "loss": 0.283, "step": 1940 }, { "epoch": 0.87, "learning_rate": 3.555555555555556e-05, "loss": 0.3364, "step": 1950 }, { "epoch": 0.87, "eval_loss": 0.3016437888145447, "eval_runtime": 7.3704, "eval_samples_per_second": 135.814, "eval_steps_per_second": 34.055, "step": 1950 }, { "epoch": 0.87, "learning_rate": 3.548148148148148e-05, "loss": 0.2882, "step": 1960 }, { "epoch": 0.88, "learning_rate": 3.540740740740741e-05, "loss": 0.3353, "step": 1970 }, { "epoch": 0.88, "eval_loss": 0.30082637071609497, "eval_runtime": 7.6161, "eval_samples_per_second": 131.432, "eval_steps_per_second": 32.957, "step": 1975 }, { "epoch": 0.88, "learning_rate": 3.5333333333333336e-05, "loss": 0.2675, "step": 1980 }, { "epoch": 0.88, "learning_rate": 3.525925925925926e-05, "loss": 0.3182, "step": 1990 }, { "epoch": 0.89, "learning_rate": 3.518518518518519e-05, "loss": 0.2932, "step": 2000 }, { "epoch": 0.89, "eval_loss": 0.2999679744243622, "eval_runtime": 7.5872, "eval_samples_per_second": 131.933, "eval_steps_per_second": 33.082, "step": 2000 }, { "epoch": 0.89, "learning_rate": 3.511111111111111e-05, "loss": 0.3029, "step": 2010 }, { "epoch": 0.9, "learning_rate": 3.503703703703704e-05, "loss": 0.2434, "step": 2020 }, { "epoch": 0.9, "eval_loss": 0.2994518280029297, "eval_runtime": 7.5416, "eval_samples_per_second": 132.731, "eval_steps_per_second": 33.282, "step": 2025 }, { "epoch": 0.9, "learning_rate": 3.4962962962962965e-05, "loss": 0.3236, "step": 2030 }, { "epoch": 0.91, "learning_rate": 3.4888888888888895e-05, "loss": 0.2967, "step": 2040 }, { "epoch": 0.91, "learning_rate": 3.481481481481482e-05, "loss": 0.3572, "step": 2050 }, { "epoch": 0.91, "eval_loss": 0.29705023765563965, "eval_runtime": 7.5563, "eval_samples_per_second": 132.473, "eval_steps_per_second": 33.218, "step": 2050 }, { "epoch": 0.92, "learning_rate": 3.474074074074074e-05, "loss": 0.3384, "step": 2060 }, { "epoch": 0.92, "learning_rate": 3.466666666666667e-05, "loss": 0.2816, "step": 2070 }, { "epoch": 0.92, "eval_loss": 0.29650408029556274, "eval_runtime": 7.3924, "eval_samples_per_second": 135.409, "eval_steps_per_second": 33.954, "step": 2075 }, { "epoch": 0.92, "learning_rate": 3.4592592592592594e-05, "loss": 0.3361, "step": 2080 }, { "epoch": 0.93, "learning_rate": 3.4518518518518524e-05, "loss": 0.3125, "step": 2090 }, { "epoch": 0.93, "learning_rate": 3.444444444444445e-05, "loss": 0.3801, "step": 2100 }, { "epoch": 0.93, "eval_loss": 0.2950206696987152, "eval_runtime": 7.3773, "eval_samples_per_second": 135.687, "eval_steps_per_second": 34.023, "step": 2100 }, { "epoch": 0.94, "learning_rate": 3.437037037037037e-05, "loss": 0.2684, "step": 2110 }, { "epoch": 0.94, "learning_rate": 3.42962962962963e-05, "loss": 0.3387, "step": 2120 }, { "epoch": 0.94, "eval_loss": 0.29483914375305176, "eval_runtime": 7.5911, "eval_samples_per_second": 131.865, "eval_steps_per_second": 33.065, "step": 2125 }, { "epoch": 0.95, "learning_rate": 3.4222222222222224e-05, "loss": 0.3394, "step": 2130 }, { "epoch": 0.95, "learning_rate": 3.4148148148148153e-05, "loss": 0.2852, "step": 2140 }, { "epoch": 0.96, "learning_rate": 3.4074074074074077e-05, "loss": 0.2903, "step": 2150 }, { "epoch": 0.96, "eval_loss": 0.2979108989238739, "eval_runtime": 7.666, "eval_samples_per_second": 130.576, "eval_steps_per_second": 32.742, "step": 2150 }, { "epoch": 0.96, "learning_rate": 3.4000000000000007e-05, "loss": 0.3144, "step": 2160 }, { "epoch": 0.96, "learning_rate": 3.392592592592593e-05, "loss": 0.3238, "step": 2170 }, { "epoch": 0.97, "eval_loss": 0.2956538200378418, "eval_runtime": 7.6479, "eval_samples_per_second": 130.886, "eval_steps_per_second": 32.82, "step": 2175 }, { "epoch": 0.97, "learning_rate": 3.385185185185185e-05, "loss": 0.2848, "step": 2180 }, { "epoch": 0.97, "learning_rate": 3.377777777777778e-05, "loss": 0.2742, "step": 2190 }, { "epoch": 0.98, "learning_rate": 3.3703703703703706e-05, "loss": 0.3392, "step": 2200 }, { "epoch": 0.98, "eval_loss": 0.29324308037757874, "eval_runtime": 7.6322, "eval_samples_per_second": 131.155, "eval_steps_per_second": 32.887, "step": 2200 }, { "epoch": 0.98, "learning_rate": 3.3629629629629636e-05, "loss": 0.2332, "step": 2210 }, { "epoch": 0.99, "learning_rate": 3.355555555555556e-05, "loss": 0.2754, "step": 2220 }, { "epoch": 0.99, "eval_loss": 0.29224133491516113, "eval_runtime": 7.5764, "eval_samples_per_second": 132.12, "eval_steps_per_second": 33.129, "step": 2225 }, { "epoch": 0.99, "learning_rate": 3.348148148148148e-05, "loss": 0.2778, "step": 2230 }, { "epoch": 1.0, "learning_rate": 3.340740740740741e-05, "loss": 0.2646, "step": 2240 }, { "epoch": 1.0, "learning_rate": 3.3333333333333335e-05, "loss": 0.3542, "step": 2250 }, { "epoch": 1.0, "eval_loss": 0.2917466461658478, "eval_runtime": 7.5812, "eval_samples_per_second": 132.038, "eval_steps_per_second": 33.108, "step": 2250 }, { "epoch": 1.0, "learning_rate": 3.3259259259259265e-05, "loss": 0.3091, "step": 2260 }, { "epoch": 1.01, "learning_rate": 3.318518518518519e-05, "loss": 0.314, "step": 2270 }, { "epoch": 1.01, "eval_loss": 0.29272007942199707, "eval_runtime": 7.5219, "eval_samples_per_second": 133.079, "eval_steps_per_second": 33.369, "step": 2275 }, { "epoch": 1.01, "learning_rate": 3.311111111111112e-05, "loss": 0.269, "step": 2280 }, { "epoch": 1.02, "learning_rate": 3.303703703703704e-05, "loss": 0.2504, "step": 2290 }, { "epoch": 1.02, "learning_rate": 3.2962962962962964e-05, "loss": 0.3201, "step": 2300 }, { "epoch": 1.02, "eval_loss": 0.29086679220199585, "eval_runtime": 7.5125, "eval_samples_per_second": 133.245, "eval_steps_per_second": 33.411, "step": 2300 }, { "epoch": 1.03, "learning_rate": 3.2888888888888894e-05, "loss": 0.2967, "step": 2310 }, { "epoch": 1.03, "learning_rate": 3.281481481481482e-05, "loss": 0.2799, "step": 2320 }, { "epoch": 1.03, "eval_loss": 0.28955137729644775, "eval_runtime": 7.3165, "eval_samples_per_second": 136.814, "eval_steps_per_second": 34.306, "step": 2325 }, { "epoch": 1.04, "learning_rate": 3.274074074074075e-05, "loss": 0.2535, "step": 2330 }, { "epoch": 1.04, "learning_rate": 3.266666666666667e-05, "loss": 0.2763, "step": 2340 }, { "epoch": 1.04, "learning_rate": 3.25925925925926e-05, "loss": 0.2356, "step": 2350 }, { "epoch": 1.04, "eval_loss": 0.2900914251804352, "eval_runtime": 7.3036, "eval_samples_per_second": 137.055, "eval_steps_per_second": 34.366, "step": 2350 }, { "epoch": 1.05, "learning_rate": 3.251851851851852e-05, "loss": 0.3099, "step": 2360 }, { "epoch": 1.05, "learning_rate": 3.2444444444444446e-05, "loss": 0.271, "step": 2370 }, { "epoch": 1.06, "eval_loss": 0.28874123096466064, "eval_runtime": 7.2862, "eval_samples_per_second": 137.382, "eval_steps_per_second": 34.448, "step": 2375 }, { "epoch": 1.06, "learning_rate": 3.2370370370370376e-05, "loss": 0.32, "step": 2380 }, { "epoch": 1.06, "learning_rate": 3.22962962962963e-05, "loss": 0.28, "step": 2390 }, { "epoch": 1.07, "learning_rate": 3.222222222222223e-05, "loss": 0.2246, "step": 2400 }, { "epoch": 1.07, "eval_loss": 0.2897484004497528, "eval_runtime": 7.2105, "eval_samples_per_second": 138.825, "eval_steps_per_second": 34.81, "step": 2400 }, { "epoch": 1.07, "learning_rate": 3.214814814814815e-05, "loss": 0.3076, "step": 2410 }, { "epoch": 1.08, "learning_rate": 3.2074074074074075e-05, "loss": 0.2824, "step": 2420 }, { "epoch": 1.08, "eval_loss": 0.2881883680820465, "eval_runtime": 7.5403, "eval_samples_per_second": 132.752, "eval_steps_per_second": 33.288, "step": 2425 }, { "epoch": 1.08, "learning_rate": 3.2000000000000005e-05, "loss": 0.3322, "step": 2430 }, { "epoch": 1.08, "learning_rate": 3.192592592592593e-05, "loss": 0.2299, "step": 2440 }, { "epoch": 1.09, "learning_rate": 3.185185185185185e-05, "loss": 0.2516, "step": 2450 }, { "epoch": 1.09, "eval_loss": 0.287548303604126, "eval_runtime": 7.3579, "eval_samples_per_second": 136.045, "eval_steps_per_second": 34.113, "step": 2450 }, { "epoch": 1.09, "learning_rate": 3.177777777777778e-05, "loss": 0.2652, "step": 2460 }, { "epoch": 1.1, "learning_rate": 3.1703703703703705e-05, "loss": 0.2465, "step": 2470 }, { "epoch": 1.1, "eval_loss": 0.2856321632862091, "eval_runtime": 7.7527, "eval_samples_per_second": 129.116, "eval_steps_per_second": 32.376, "step": 2475 }, { "epoch": 1.1, "learning_rate": 3.1629629629629634e-05, "loss": 0.301, "step": 2480 }, { "epoch": 1.11, "learning_rate": 3.155555555555556e-05, "loss": 0.1987, "step": 2490 }, { "epoch": 1.11, "learning_rate": 3.148148148148148e-05, "loss": 0.3417, "step": 2500 }, { "epoch": 1.11, "eval_loss": 0.2860187292098999, "eval_runtime": 7.5819, "eval_samples_per_second": 132.024, "eval_steps_per_second": 33.105, "step": 2500 }, { "epoch": 1.12, "learning_rate": 3.140740740740741e-05, "loss": 0.2464, "step": 2510 }, { "epoch": 1.12, "learning_rate": 3.1333333333333334e-05, "loss": 0.2418, "step": 2520 }, { "epoch": 1.12, "eval_loss": 0.28458064794540405, "eval_runtime": 7.684, "eval_samples_per_second": 130.271, "eval_steps_per_second": 32.665, "step": 2525 }, { "epoch": 1.12, "learning_rate": 3.1259259259259264e-05, "loss": 0.2884, "step": 2530 }, { "epoch": 1.13, "learning_rate": 3.118518518518519e-05, "loss": 0.2396, "step": 2540 }, { "epoch": 1.13, "learning_rate": 3.111111111111111e-05, "loss": 0.2625, "step": 2550 }, { "epoch": 1.13, "eval_loss": 0.28442564606666565, "eval_runtime": 7.6251, "eval_samples_per_second": 131.277, "eval_steps_per_second": 32.918, "step": 2550 }, { "epoch": 1.14, "learning_rate": 3.103703703703704e-05, "loss": 0.2286, "step": 2560 }, { "epoch": 1.14, "learning_rate": 3.096296296296296e-05, "loss": 0.3023, "step": 2570 }, { "epoch": 1.14, "eval_loss": 0.28357136249542236, "eval_runtime": 7.6709, "eval_samples_per_second": 130.493, "eval_steps_per_second": 32.721, "step": 2575 }, { "epoch": 1.15, "learning_rate": 3.088888888888889e-05, "loss": 0.2631, "step": 2580 }, { "epoch": 1.15, "learning_rate": 3.0814814814814816e-05, "loss": 0.2656, "step": 2590 }, { "epoch": 1.16, "learning_rate": 3.074074074074074e-05, "loss": 0.2301, "step": 2600 }, { "epoch": 1.16, "eval_loss": 0.2838016450405121, "eval_runtime": 7.4972, "eval_samples_per_second": 133.516, "eval_steps_per_second": 33.479, "step": 2600 }, { "epoch": 1.16, "learning_rate": 3.066666666666667e-05, "loss": 0.2708, "step": 2610 }, { "epoch": 1.16, "learning_rate": 3.059259259259259e-05, "loss": 0.2638, "step": 2620 }, { "epoch": 1.17, "eval_loss": 0.2819526791572571, "eval_runtime": 7.5414, "eval_samples_per_second": 132.734, "eval_steps_per_second": 33.283, "step": 2625 }, { "epoch": 1.17, "learning_rate": 3.0518518518518515e-05, "loss": 0.2786, "step": 2630 }, { "epoch": 1.17, "learning_rate": 3.044444444444445e-05, "loss": 0.3328, "step": 2640 }, { "epoch": 1.18, "learning_rate": 3.037037037037037e-05, "loss": 0.2835, "step": 2650 }, { "epoch": 1.18, "eval_loss": 0.2829004228115082, "eval_runtime": 7.3311, "eval_samples_per_second": 136.541, "eval_steps_per_second": 34.238, "step": 2650 }, { "epoch": 1.18, "learning_rate": 3.02962962962963e-05, "loss": 0.3523, "step": 2660 }, { "epoch": 1.19, "learning_rate": 3.0222222222222225e-05, "loss": 0.2512, "step": 2670 }, { "epoch": 1.19, "eval_loss": 0.28273966908454895, "eval_runtime": 7.3473, "eval_samples_per_second": 136.24, "eval_steps_per_second": 34.162, "step": 2675 }, { "epoch": 1.19, "learning_rate": 3.0148148148148148e-05, "loss": 0.2382, "step": 2680 }, { "epoch": 1.2, "learning_rate": 3.0074074074074078e-05, "loss": 0.2668, "step": 2690 }, { "epoch": 1.2, "learning_rate": 3e-05, "loss": 0.2472, "step": 2700 }, { "epoch": 1.2, "eval_loss": 0.28208473324775696, "eval_runtime": 7.2579, "eval_samples_per_second": 137.92, "eval_steps_per_second": 34.583, "step": 2700 }, { "epoch": 1.2, "learning_rate": 2.992592592592593e-05, "loss": 0.2332, "step": 2710 }, { "epoch": 1.21, "learning_rate": 2.9851851851851854e-05, "loss": 0.266, "step": 2720 }, { "epoch": 1.21, "eval_loss": 0.281426340341568, "eval_runtime": 7.2593, "eval_samples_per_second": 137.892, "eval_steps_per_second": 34.576, "step": 2725 }, { "epoch": 1.21, "learning_rate": 2.9777777777777777e-05, "loss": 0.2908, "step": 2730 }, { "epoch": 1.22, "learning_rate": 2.9703703703703707e-05, "loss": 0.2299, "step": 2740 }, { "epoch": 1.22, "learning_rate": 2.962962962962963e-05, "loss": 0.2284, "step": 2750 }, { "epoch": 1.22, "eval_loss": 0.2816203832626343, "eval_runtime": 7.2174, "eval_samples_per_second": 138.694, "eval_steps_per_second": 34.777, "step": 2750 }, { "epoch": 1.23, "learning_rate": 2.955555555555556e-05, "loss": 0.2821, "step": 2760 }, { "epoch": 1.23, "learning_rate": 2.9481481481481483e-05, "loss": 0.3079, "step": 2770 }, { "epoch": 1.23, "eval_loss": 0.2785053551197052, "eval_runtime": 7.3, "eval_samples_per_second": 137.124, "eval_steps_per_second": 34.384, "step": 2775 }, { "epoch": 1.24, "learning_rate": 2.9407407407407413e-05, "loss": 0.1826, "step": 2780 }, { "epoch": 1.24, "learning_rate": 2.9333333333333336e-05, "loss": 0.2438, "step": 2790 }, { "epoch": 1.24, "learning_rate": 2.925925925925926e-05, "loss": 0.2014, "step": 2800 }, { "epoch": 1.24, "eval_loss": 0.2803582549095154, "eval_runtime": 7.196, "eval_samples_per_second": 139.106, "eval_steps_per_second": 34.881, "step": 2800 }, { "epoch": 1.25, "learning_rate": 2.918518518518519e-05, "loss": 0.2979, "step": 2810 }, { "epoch": 1.25, "learning_rate": 2.9111111111111112e-05, "loss": 0.2223, "step": 2820 }, { "epoch": 1.26, "eval_loss": 0.27921053767204285, "eval_runtime": 7.2996, "eval_samples_per_second": 137.132, "eval_steps_per_second": 34.386, "step": 2825 }, { "epoch": 1.26, "learning_rate": 2.9037037037037042e-05, "loss": 0.2515, "step": 2830 }, { "epoch": 1.26, "learning_rate": 2.8962962962962965e-05, "loss": 0.2771, "step": 2840 }, { "epoch": 1.27, "learning_rate": 2.8888888888888888e-05, "loss": 0.2611, "step": 2850 }, { "epoch": 1.27, "eval_loss": 0.2792360186576843, "eval_runtime": 7.1956, "eval_samples_per_second": 139.112, "eval_steps_per_second": 34.882, "step": 2850 }, { "epoch": 1.27, "learning_rate": 2.8814814814814818e-05, "loss": 0.2867, "step": 2860 }, { "epoch": 1.28, "learning_rate": 2.874074074074074e-05, "loss": 0.2492, "step": 2870 }, { "epoch": 1.28, "eval_loss": 0.2788338363170624, "eval_runtime": 7.2868, "eval_samples_per_second": 137.371, "eval_steps_per_second": 34.446, "step": 2875 }, { "epoch": 1.28, "learning_rate": 2.8666666666666668e-05, "loss": 0.26, "step": 2880 }, { "epoch": 1.28, "learning_rate": 2.8592592592592594e-05, "loss": 0.3693, "step": 2890 }, { "epoch": 1.29, "learning_rate": 2.851851851851852e-05, "loss": 0.2686, "step": 2900 }, { "epoch": 1.29, "eval_loss": 0.27813464403152466, "eval_runtime": 7.1901, "eval_samples_per_second": 139.219, "eval_steps_per_second": 34.909, "step": 2900 }, { "epoch": 1.29, "learning_rate": 2.8444444444444447e-05, "loss": 0.2091, "step": 2910 }, { "epoch": 1.3, "learning_rate": 2.837037037037037e-05, "loss": 0.2701, "step": 2920 }, { "epoch": 1.3, "eval_loss": 0.27804136276245117, "eval_runtime": 7.2533, "eval_samples_per_second": 138.006, "eval_steps_per_second": 34.605, "step": 2925 }, { "epoch": 1.3, "learning_rate": 2.8296296296296297e-05, "loss": 0.2795, "step": 2930 }, { "epoch": 1.31, "learning_rate": 2.8222222222222223e-05, "loss": 0.3346, "step": 2940 }, { "epoch": 1.31, "learning_rate": 2.814814814814815e-05, "loss": 0.2254, "step": 2950 }, { "epoch": 1.31, "eval_loss": 0.27507713437080383, "eval_runtime": 7.2159, "eval_samples_per_second": 138.721, "eval_steps_per_second": 34.784, "step": 2950 }, { "epoch": 1.32, "learning_rate": 2.8074074074074076e-05, "loss": 0.2507, "step": 2960 }, { "epoch": 1.32, "learning_rate": 2.8000000000000003e-05, "loss": 0.2942, "step": 2970 }, { "epoch": 1.32, "eval_loss": 0.2757761776447296, "eval_runtime": 7.2423, "eval_samples_per_second": 138.216, "eval_steps_per_second": 34.658, "step": 2975 }, { "epoch": 1.32, "learning_rate": 2.7925925925925926e-05, "loss": 0.2678, "step": 2980 }, { "epoch": 1.33, "learning_rate": 2.7851851851851853e-05, "loss": 0.2459, "step": 2990 }, { "epoch": 1.33, "learning_rate": 2.777777777777778e-05, "loss": 0.2446, "step": 3000 }, { "epoch": 1.33, "eval_loss": 0.2748652994632721, "eval_runtime": 7.2844, "eval_samples_per_second": 137.416, "eval_steps_per_second": 34.457, "step": 3000 }, { "epoch": 1.34, "learning_rate": 2.7703703703703706e-05, "loss": 0.2278, "step": 3010 }, { "epoch": 1.34, "learning_rate": 2.7629629629629632e-05, "loss": 0.2337, "step": 3020 }, { "epoch": 1.34, "eval_loss": 0.2740112841129303, "eval_runtime": 7.3832, "eval_samples_per_second": 135.578, "eval_steps_per_second": 33.996, "step": 3025 }, { "epoch": 1.35, "learning_rate": 2.7555555555555555e-05, "loss": 0.2335, "step": 3030 }, { "epoch": 1.35, "learning_rate": 2.7481481481481482e-05, "loss": 0.2023, "step": 3040 }, { "epoch": 1.36, "learning_rate": 2.7407407407407408e-05, "loss": 0.2166, "step": 3050 }, { "epoch": 1.36, "eval_loss": 0.27378755807876587, "eval_runtime": 7.4184, "eval_samples_per_second": 134.934, "eval_steps_per_second": 33.835, "step": 3050 }, { "epoch": 1.36, "learning_rate": 2.733333333333333e-05, "loss": 0.2161, "step": 3060 }, { "epoch": 1.36, "learning_rate": 2.725925925925926e-05, "loss": 0.3003, "step": 3070 }, { "epoch": 1.37, "eval_loss": 0.2731979191303253, "eval_runtime": 13.3786, "eval_samples_per_second": 74.821, "eval_steps_per_second": 18.761, "step": 3075 }, { "epoch": 1.37, "learning_rate": 2.7185185185185184e-05, "loss": 0.2498, "step": 3080 }, { "epoch": 1.37, "learning_rate": 2.7111111111111114e-05, "loss": 0.2749, "step": 3090 }, { "epoch": 1.38, "learning_rate": 2.7037037037037037e-05, "loss": 0.2422, "step": 3100 }, { "epoch": 1.38, "eval_loss": 0.27196845412254333, "eval_runtime": 7.5816, "eval_samples_per_second": 132.03, "eval_steps_per_second": 33.107, "step": 3100 }, { "epoch": 1.38, "learning_rate": 2.696296296296296e-05, "loss": 0.2468, "step": 3110 }, { "epoch": 1.39, "learning_rate": 2.688888888888889e-05, "loss": 0.263, "step": 3120 }, { "epoch": 1.39, "eval_loss": 0.2727610766887665, "eval_runtime": 7.6161, "eval_samples_per_second": 131.432, "eval_steps_per_second": 32.956, "step": 3125 }, { "epoch": 1.39, "learning_rate": 2.6814814814814814e-05, "loss": 0.2436, "step": 3130 }, { "epoch": 1.4, "learning_rate": 2.6740740740740743e-05, "loss": 0.27, "step": 3140 }, { "epoch": 1.4, "learning_rate": 2.6666666666666667e-05, "loss": 0.2462, "step": 3150 }, { "epoch": 1.4, "eval_loss": 0.2711770832538605, "eval_runtime": 7.4596, "eval_samples_per_second": 134.19, "eval_steps_per_second": 33.648, "step": 3150 }, { "epoch": 1.4, "learning_rate": 2.659259259259259e-05, "loss": 0.3066, "step": 3160 }, { "epoch": 1.41, "learning_rate": 2.651851851851852e-05, "loss": 0.2262, "step": 3170 }, { "epoch": 1.41, "eval_loss": 0.27057451009750366, "eval_runtime": 7.4675, "eval_samples_per_second": 134.047, "eval_steps_per_second": 33.612, "step": 3175 }, { "epoch": 1.41, "learning_rate": 2.6444444444444443e-05, "loss": 0.2187, "step": 3180 }, { "epoch": 1.42, "learning_rate": 2.6370370370370373e-05, "loss": 0.2992, "step": 3190 }, { "epoch": 1.42, "learning_rate": 2.6296296296296296e-05, "loss": 0.3181, "step": 3200 }, { "epoch": 1.42, "eval_loss": 0.27069294452667236, "eval_runtime": 7.2871, "eval_samples_per_second": 137.367, "eval_steps_per_second": 34.445, "step": 3200 }, { "epoch": 1.43, "learning_rate": 2.6222222222222226e-05, "loss": 0.275, "step": 3210 }, { "epoch": 1.43, "learning_rate": 2.614814814814815e-05, "loss": 0.2337, "step": 3220 }, { "epoch": 1.43, "eval_loss": 0.2696068286895752, "eval_runtime": 7.3083, "eval_samples_per_second": 136.967, "eval_steps_per_second": 34.344, "step": 3225 }, { "epoch": 1.44, "learning_rate": 2.6074074074074072e-05, "loss": 0.2596, "step": 3230 }, { "epoch": 1.44, "learning_rate": 2.6000000000000002e-05, "loss": 0.2511, "step": 3240 }, { "epoch": 1.44, "learning_rate": 2.5925925925925925e-05, "loss": 0.2223, "step": 3250 }, { "epoch": 1.44, "eval_loss": 0.26948779821395874, "eval_runtime": 7.2594, "eval_samples_per_second": 137.89, "eval_steps_per_second": 34.576, "step": 3250 }, { "epoch": 1.45, "learning_rate": 2.5851851851851855e-05, "loss": 0.266, "step": 3260 }, { "epoch": 1.45, "learning_rate": 2.5777777777777778e-05, "loss": 0.2594, "step": 3270 }, { "epoch": 1.46, "eval_loss": 0.2680164873600006, "eval_runtime": 7.3879, "eval_samples_per_second": 135.492, "eval_steps_per_second": 33.975, "step": 3275 }, { "epoch": 1.46, "learning_rate": 2.5703703703703708e-05, "loss": 0.2623, "step": 3280 }, { "epoch": 1.46, "learning_rate": 2.562962962962963e-05, "loss": 0.2028, "step": 3290 }, { "epoch": 1.47, "learning_rate": 2.5555555555555554e-05, "loss": 0.2226, "step": 3300 }, { "epoch": 1.47, "eval_loss": 0.27027028799057007, "eval_runtime": 7.2794, "eval_samples_per_second": 137.512, "eval_steps_per_second": 34.481, "step": 3300 }, { "epoch": 1.47, "learning_rate": 2.5481481481481484e-05, "loss": 0.211, "step": 3310 }, { "epoch": 1.48, "learning_rate": 2.5407407407407407e-05, "loss": 0.2037, "step": 3320 }, { "epoch": 1.48, "eval_loss": 0.2683195471763611, "eval_runtime": 7.3164, "eval_samples_per_second": 136.815, "eval_steps_per_second": 34.306, "step": 3325 }, { "epoch": 1.48, "learning_rate": 2.5333333333333337e-05, "loss": 0.1988, "step": 3330 }, { "epoch": 1.48, "learning_rate": 2.525925925925926e-05, "loss": 0.2262, "step": 3340 }, { "epoch": 1.49, "learning_rate": 2.5185185185185183e-05, "loss": 0.2538, "step": 3350 }, { "epoch": 1.49, "eval_loss": 0.26811686158180237, "eval_runtime": 7.2097, "eval_samples_per_second": 138.841, "eval_steps_per_second": 34.814, "step": 3350 }, { "epoch": 1.49, "learning_rate": 2.5111111111111113e-05, "loss": 0.2029, "step": 3360 }, { "epoch": 1.5, "learning_rate": 2.5037037037037036e-05, "loss": 0.2094, "step": 3370 }, { "epoch": 1.5, "eval_loss": 0.269042432308197, "eval_runtime": 7.2591, "eval_samples_per_second": 137.896, "eval_steps_per_second": 34.577, "step": 3375 }, { "epoch": 1.5, "learning_rate": 2.4962962962962963e-05, "loss": 0.195, "step": 3380 }, { "epoch": 1.51, "learning_rate": 2.488888888888889e-05, "loss": 0.2639, "step": 3390 }, { "epoch": 1.51, "learning_rate": 2.4814814814814816e-05, "loss": 0.2591, "step": 3400 }, { "epoch": 1.51, "eval_loss": 0.26725488901138306, "eval_runtime": 7.2092, "eval_samples_per_second": 138.851, "eval_steps_per_second": 34.817, "step": 3400 }, { "epoch": 1.52, "learning_rate": 2.4740740740740742e-05, "loss": 0.2043, "step": 3410 }, { "epoch": 1.52, "learning_rate": 2.466666666666667e-05, "loss": 0.2274, "step": 3420 }, { "epoch": 1.52, "eval_loss": 0.2678578794002533, "eval_runtime": 7.2753, "eval_samples_per_second": 137.588, "eval_steps_per_second": 34.5, "step": 3425 }, { "epoch": 1.52, "learning_rate": 2.4592592592592595e-05, "loss": 0.2352, "step": 3430 }, { "epoch": 1.53, "learning_rate": 2.451851851851852e-05, "loss": 0.2365, "step": 3440 }, { "epoch": 1.53, "learning_rate": 2.4444444444444445e-05, "loss": 0.2526, "step": 3450 }, { "epoch": 1.53, "eval_loss": 0.266437292098999, "eval_runtime": 7.2076, "eval_samples_per_second": 138.88, "eval_steps_per_second": 34.824, "step": 3450 }, { "epoch": 1.54, "learning_rate": 2.437037037037037e-05, "loss": 0.1861, "step": 3460 }, { "epoch": 1.54, "learning_rate": 2.4296296296296298e-05, "loss": 0.2659, "step": 3470 }, { "epoch": 1.54, "eval_loss": 0.265876829624176, "eval_runtime": 7.5122, "eval_samples_per_second": 133.25, "eval_steps_per_second": 33.412, "step": 3475 }, { "epoch": 1.55, "learning_rate": 2.4222222222222224e-05, "loss": 0.2159, "step": 3480 }, { "epoch": 1.55, "learning_rate": 2.414814814814815e-05, "loss": 0.2809, "step": 3490 }, { "epoch": 1.56, "learning_rate": 2.4074074074074074e-05, "loss": 0.2144, "step": 3500 }, { "epoch": 1.56, "eval_loss": 0.2642614245414734, "eval_runtime": 7.468, "eval_samples_per_second": 134.039, "eval_steps_per_second": 33.61, "step": 3500 }, { "epoch": 1.56, "learning_rate": 2.4e-05, "loss": 0.2122, "step": 3510 }, { "epoch": 1.56, "learning_rate": 2.3925925925925927e-05, "loss": 0.2432, "step": 3520 }, { "epoch": 1.57, "eval_loss": 0.2640276551246643, "eval_runtime": 7.5832, "eval_samples_per_second": 132.003, "eval_steps_per_second": 33.1, "step": 3525 }, { "epoch": 1.57, "learning_rate": 2.3851851851851854e-05, "loss": 0.2441, "step": 3530 }, { "epoch": 1.57, "learning_rate": 2.377777777777778e-05, "loss": 0.2647, "step": 3540 }, { "epoch": 1.58, "learning_rate": 2.3703703703703707e-05, "loss": 0.2852, "step": 3550 }, { "epoch": 1.58, "eval_loss": 0.26369205117225647, "eval_runtime": 7.3907, "eval_samples_per_second": 135.44, "eval_steps_per_second": 33.962, "step": 3550 }, { "epoch": 1.58, "learning_rate": 2.3629629629629633e-05, "loss": 0.2321, "step": 3560 }, { "epoch": 1.59, "learning_rate": 2.3555555555555556e-05, "loss": 0.3375, "step": 3570 }, { "epoch": 1.59, "eval_loss": 0.2633427381515503, "eval_runtime": 7.3881, "eval_samples_per_second": 135.489, "eval_steps_per_second": 33.974, "step": 3575 }, { "epoch": 1.59, "learning_rate": 2.3481481481481483e-05, "loss": 0.2219, "step": 3580 }, { "epoch": 1.6, "learning_rate": 2.340740740740741e-05, "loss": 0.2004, "step": 3590 }, { "epoch": 1.6, "learning_rate": 2.3333333333333336e-05, "loss": 0.272, "step": 3600 }, { "epoch": 1.6, "eval_loss": 0.26376578211784363, "eval_runtime": 7.2236, "eval_samples_per_second": 138.574, "eval_steps_per_second": 34.747, "step": 3600 }, { "epoch": 1.6, "learning_rate": 2.3259259259259262e-05, "loss": 0.2887, "step": 3610 }, { "epoch": 1.61, "learning_rate": 2.318518518518519e-05, "loss": 0.2502, "step": 3620 }, { "epoch": 1.61, "eval_loss": 0.2631487250328064, "eval_runtime": 7.2851, "eval_samples_per_second": 137.403, "eval_steps_per_second": 34.454, "step": 3625 }, { "epoch": 1.61, "learning_rate": 2.3111111111111112e-05, "loss": 0.206, "step": 3630 }, { "epoch": 1.62, "learning_rate": 2.303703703703704e-05, "loss": 0.2368, "step": 3640 }, { "epoch": 1.62, "learning_rate": 2.2962962962962965e-05, "loss": 0.2131, "step": 3650 }, { "epoch": 1.62, "eval_loss": 0.2649364173412323, "eval_runtime": 7.1872, "eval_samples_per_second": 139.275, "eval_steps_per_second": 34.923, "step": 3650 }, { "epoch": 1.63, "learning_rate": 2.288888888888889e-05, "loss": 0.2137, "step": 3660 }, { "epoch": 1.63, "learning_rate": 2.2814814814814818e-05, "loss": 0.2313, "step": 3670 }, { "epoch": 1.63, "eval_loss": 0.2636246979236603, "eval_runtime": 7.2744, "eval_samples_per_second": 137.606, "eval_steps_per_second": 34.505, "step": 3675 }, { "epoch": 1.64, "learning_rate": 2.2740740740740744e-05, "loss": 0.239, "step": 3680 }, { "epoch": 1.64, "learning_rate": 2.2666666666666668e-05, "loss": 0.2354, "step": 3690 }, { "epoch": 1.64, "learning_rate": 2.2592592592592594e-05, "loss": 0.1919, "step": 3700 }, { "epoch": 1.64, "eval_loss": 0.262579083442688, "eval_runtime": 7.2981, "eval_samples_per_second": 137.159, "eval_steps_per_second": 34.392, "step": 3700 }, { "epoch": 1.65, "learning_rate": 2.251851851851852e-05, "loss": 0.2836, "step": 3710 }, { "epoch": 1.65, "learning_rate": 2.2444444444444447e-05, "loss": 0.2761, "step": 3720 }, { "epoch": 1.66, "eval_loss": 0.26134076714515686, "eval_runtime": 7.3981, "eval_samples_per_second": 135.305, "eval_steps_per_second": 33.928, "step": 3725 }, { "epoch": 1.66, "learning_rate": 2.2370370370370374e-05, "loss": 0.1932, "step": 3730 }, { "epoch": 1.66, "learning_rate": 2.2296296296296297e-05, "loss": 0.2248, "step": 3740 }, { "epoch": 1.67, "learning_rate": 2.2222222222222223e-05, "loss": 0.2151, "step": 3750 }, { "epoch": 1.67, "eval_loss": 0.26156380772590637, "eval_runtime": 7.2684, "eval_samples_per_second": 137.72, "eval_steps_per_second": 34.533, "step": 3750 }, { "epoch": 1.67, "learning_rate": 2.214814814814815e-05, "loss": 0.2614, "step": 3760 }, { "epoch": 1.68, "learning_rate": 2.2074074074074076e-05, "loss": 0.2725, "step": 3770 }, { "epoch": 1.68, "eval_loss": 0.26058655977249146, "eval_runtime": 7.2639, "eval_samples_per_second": 137.804, "eval_steps_per_second": 34.554, "step": 3775 }, { "epoch": 1.68, "learning_rate": 2.2000000000000003e-05, "loss": 0.2402, "step": 3780 }, { "epoch": 1.68, "learning_rate": 2.1925925925925926e-05, "loss": 0.2283, "step": 3790 }, { "epoch": 1.69, "learning_rate": 2.1851851851851852e-05, "loss": 0.2552, "step": 3800 }, { "epoch": 1.69, "eval_loss": 0.26024872064590454, "eval_runtime": 7.191, "eval_samples_per_second": 139.201, "eval_steps_per_second": 34.905, "step": 3800 }, { "epoch": 1.69, "learning_rate": 2.177777777777778e-05, "loss": 0.2615, "step": 3810 }, { "epoch": 1.7, "learning_rate": 2.1703703703703705e-05, "loss": 0.2094, "step": 3820 }, { "epoch": 1.7, "eval_loss": 0.2611374258995056, "eval_runtime": 7.1864, "eval_samples_per_second": 139.291, "eval_steps_per_second": 34.927, "step": 3825 }, { "epoch": 1.7, "learning_rate": 2.162962962962963e-05, "loss": 0.2267, "step": 3830 }, { "epoch": 1.71, "learning_rate": 2.1555555555555555e-05, "loss": 0.2807, "step": 3840 }, { "epoch": 1.71, "learning_rate": 2.148148148148148e-05, "loss": 0.2948, "step": 3850 }, { "epoch": 1.71, "eval_loss": 0.25876420736312866, "eval_runtime": 7.2079, "eval_samples_per_second": 138.875, "eval_steps_per_second": 34.823, "step": 3850 }, { "epoch": 1.72, "learning_rate": 2.1407407407407408e-05, "loss": 0.261, "step": 3860 }, { "epoch": 1.72, "learning_rate": 2.1333333333333335e-05, "loss": 0.3131, "step": 3870 }, { "epoch": 1.72, "eval_loss": 0.25907066464424133, "eval_runtime": 7.2187, "eval_samples_per_second": 138.668, "eval_steps_per_second": 34.771, "step": 3875 }, { "epoch": 1.72, "learning_rate": 2.1259259259259258e-05, "loss": 0.2667, "step": 3880 }, { "epoch": 1.73, "learning_rate": 2.1185185185185184e-05, "loss": 0.216, "step": 3890 }, { "epoch": 1.73, "learning_rate": 2.111111111111111e-05, "loss": 0.237, "step": 3900 }, { "epoch": 1.73, "eval_loss": 0.2582587003707886, "eval_runtime": 8.5503, "eval_samples_per_second": 117.072, "eval_steps_per_second": 29.356, "step": 3900 }, { "epoch": 1.74, "learning_rate": 2.1037037037037037e-05, "loss": 0.3102, "step": 3910 }, { "epoch": 1.74, "learning_rate": 2.0962962962962964e-05, "loss": 0.2385, "step": 3920 }, { "epoch": 1.74, "eval_loss": 0.2583344876766205, "eval_runtime": 7.2821, "eval_samples_per_second": 137.461, "eval_steps_per_second": 34.468, "step": 3925 }, { "epoch": 1.75, "learning_rate": 2.088888888888889e-05, "loss": 0.3235, "step": 3930 }, { "epoch": 1.75, "learning_rate": 2.0814814814814813e-05, "loss": 0.2154, "step": 3940 }, { "epoch": 1.76, "learning_rate": 2.074074074074074e-05, "loss": 0.1842, "step": 3950 }, { "epoch": 1.76, "eval_loss": 0.2580818831920624, "eval_runtime": 7.213, "eval_samples_per_second": 138.777, "eval_steps_per_second": 34.798, "step": 3950 }, { "epoch": 1.76, "learning_rate": 2.0666666666666666e-05, "loss": 0.2051, "step": 3960 }, { "epoch": 1.76, "learning_rate": 2.0592592592592593e-05, "loss": 0.2291, "step": 3970 }, { "epoch": 1.77, "eval_loss": 0.25752562284469604, "eval_runtime": 7.2125, "eval_samples_per_second": 138.787, "eval_steps_per_second": 34.801, "step": 3975 }, { "epoch": 1.77, "learning_rate": 2.051851851851852e-05, "loss": 0.1973, "step": 3980 }, { "epoch": 1.77, "learning_rate": 2.0444444444444446e-05, "loss": 0.2864, "step": 3990 }, { "epoch": 1.78, "learning_rate": 2.037037037037037e-05, "loss": 0.2124, "step": 4000 }, { "epoch": 1.78, "eval_loss": 0.2576400637626648, "eval_runtime": 7.2072, "eval_samples_per_second": 138.89, "eval_steps_per_second": 34.827, "step": 4000 }, { "epoch": 1.78, "learning_rate": 2.0296296296296296e-05, "loss": 0.3026, "step": 4010 }, { "epoch": 1.79, "learning_rate": 2.0222222222222222e-05, "loss": 0.2713, "step": 4020 }, { "epoch": 1.79, "eval_loss": 0.256246954202652, "eval_runtime": 7.2163, "eval_samples_per_second": 138.714, "eval_steps_per_second": 34.782, "step": 4025 }, { "epoch": 1.79, "learning_rate": 2.014814814814815e-05, "loss": 0.2531, "step": 4030 }, { "epoch": 1.8, "learning_rate": 2.0074074074074075e-05, "loss": 0.2241, "step": 4040 }, { "epoch": 1.8, "learning_rate": 2e-05, "loss": 0.2111, "step": 4050 }, { "epoch": 1.8, "eval_loss": 0.25542324781417847, "eval_runtime": 7.2106, "eval_samples_per_second": 138.823, "eval_steps_per_second": 34.81, "step": 4050 }, { "epoch": 1.8, "learning_rate": 1.9925925925925925e-05, "loss": 0.2535, "step": 4060 }, { "epoch": 1.81, "learning_rate": 1.985185185185185e-05, "loss": 0.2385, "step": 4070 }, { "epoch": 1.81, "eval_loss": 0.2562381625175476, "eval_runtime": 7.1846, "eval_samples_per_second": 139.325, "eval_steps_per_second": 34.936, "step": 4075 }, { "epoch": 1.81, "learning_rate": 1.9777777777777778e-05, "loss": 0.212, "step": 4080 }, { "epoch": 1.82, "learning_rate": 1.9703703703703704e-05, "loss": 0.2094, "step": 4090 }, { "epoch": 1.82, "learning_rate": 1.962962962962963e-05, "loss": 0.2499, "step": 4100 }, { "epoch": 1.82, "eval_loss": 0.2557750344276428, "eval_runtime": 7.2707, "eval_samples_per_second": 137.676, "eval_steps_per_second": 34.522, "step": 4100 }, { "epoch": 1.83, "learning_rate": 1.9555555555555557e-05, "loss": 0.2507, "step": 4110 }, { "epoch": 1.83, "learning_rate": 1.948148148148148e-05, "loss": 0.233, "step": 4120 }, { "epoch": 1.83, "eval_loss": 0.25592681765556335, "eval_runtime": 7.3242, "eval_samples_per_second": 136.671, "eval_steps_per_second": 34.27, "step": 4125 }, { "epoch": 1.84, "learning_rate": 1.9407407407407407e-05, "loss": 0.2772, "step": 4130 }, { "epoch": 1.84, "learning_rate": 1.9333333333333333e-05, "loss": 0.2487, "step": 4140 }, { "epoch": 1.84, "learning_rate": 1.925925925925926e-05, "loss": 0.2538, "step": 4150 }, { "epoch": 1.84, "eval_loss": 0.25490960478782654, "eval_runtime": 7.3569, "eval_samples_per_second": 136.062, "eval_steps_per_second": 34.117, "step": 4150 }, { "epoch": 1.85, "learning_rate": 1.9185185185185186e-05, "loss": 0.2681, "step": 4160 }, { "epoch": 1.85, "learning_rate": 1.9111111111111113e-05, "loss": 0.3042, "step": 4170 }, { "epoch": 1.86, "eval_loss": 0.2556719183921814, "eval_runtime": 7.4839, "eval_samples_per_second": 133.753, "eval_steps_per_second": 33.539, "step": 4175 }, { "epoch": 1.86, "learning_rate": 1.903703703703704e-05, "loss": 0.2017, "step": 4180 }, { "epoch": 1.86, "learning_rate": 1.8962962962962963e-05, "loss": 0.2162, "step": 4190 }, { "epoch": 1.87, "learning_rate": 1.888888888888889e-05, "loss": 0.1989, "step": 4200 }, { "epoch": 1.87, "eval_loss": 0.25471413135528564, "eval_runtime": 7.4955, "eval_samples_per_second": 133.547, "eval_steps_per_second": 33.487, "step": 4200 }, { "epoch": 1.87, "learning_rate": 1.8814814814814816e-05, "loss": 0.2227, "step": 4210 }, { "epoch": 1.88, "learning_rate": 1.8740740740740742e-05, "loss": 0.198, "step": 4220 }, { "epoch": 1.88, "eval_loss": 0.2542085349559784, "eval_runtime": 7.5713, "eval_samples_per_second": 132.211, "eval_steps_per_second": 33.152, "step": 4225 }, { "epoch": 1.88, "learning_rate": 1.866666666666667e-05, "loss": 0.2839, "step": 4230 }, { "epoch": 1.88, "learning_rate": 1.8592592592592595e-05, "loss": 0.2969, "step": 4240 }, { "epoch": 1.89, "learning_rate": 1.8518518518518518e-05, "loss": 0.2634, "step": 4250 }, { "epoch": 1.89, "eval_loss": 0.25297510623931885, "eval_runtime": 7.6201, "eval_samples_per_second": 131.363, "eval_steps_per_second": 32.939, "step": 4250 }, { "epoch": 1.89, "learning_rate": 1.8444444444444445e-05, "loss": 0.2182, "step": 4260 }, { "epoch": 1.9, "learning_rate": 1.837037037037037e-05, "loss": 0.2684, "step": 4270 }, { "epoch": 1.9, "eval_loss": 0.2529478371143341, "eval_runtime": 7.6013, "eval_samples_per_second": 131.689, "eval_steps_per_second": 33.021, "step": 4275 }, { "epoch": 1.9, "learning_rate": 1.8296296296296298e-05, "loss": 0.302, "step": 4280 }, { "epoch": 1.91, "learning_rate": 1.8222222222222224e-05, "loss": 0.195, "step": 4290 }, { "epoch": 1.91, "learning_rate": 1.814814814814815e-05, "loss": 0.1844, "step": 4300 }, { "epoch": 1.91, "eval_loss": 0.2522367835044861, "eval_runtime": 7.3231, "eval_samples_per_second": 136.692, "eval_steps_per_second": 34.275, "step": 4300 }, { "epoch": 1.92, "learning_rate": 1.8074074074074074e-05, "loss": 0.2901, "step": 4310 }, { "epoch": 1.92, "learning_rate": 1.8e-05, "loss": 0.2303, "step": 4320 }, { "epoch": 1.92, "eval_loss": 0.252410352230072, "eval_runtime": 7.3587, "eval_samples_per_second": 136.029, "eval_steps_per_second": 34.109, "step": 4325 }, { "epoch": 1.92, "learning_rate": 1.7925925925925927e-05, "loss": 0.3308, "step": 4330 }, { "epoch": 1.93, "learning_rate": 1.7851851851851853e-05, "loss": 0.2441, "step": 4340 }, { "epoch": 1.93, "learning_rate": 1.777777777777778e-05, "loss": 0.2126, "step": 4350 }, { "epoch": 1.93, "eval_loss": 0.25146955251693726, "eval_runtime": 7.3139, "eval_samples_per_second": 136.863, "eval_steps_per_second": 34.318, "step": 4350 }, { "epoch": 1.94, "learning_rate": 1.7703703703703706e-05, "loss": 0.2338, "step": 4360 }, { "epoch": 1.94, "learning_rate": 1.762962962962963e-05, "loss": 0.2278, "step": 4370 }, { "epoch": 1.94, "eval_loss": 0.2519494593143463, "eval_runtime": 7.2536, "eval_samples_per_second": 138.0, "eval_steps_per_second": 34.604, "step": 4375 }, { "epoch": 1.95, "learning_rate": 1.7555555555555556e-05, "loss": 0.2033, "step": 4380 }, { "epoch": 1.95, "learning_rate": 1.7481481481481483e-05, "loss": 0.2524, "step": 4390 }, { "epoch": 1.96, "learning_rate": 1.740740740740741e-05, "loss": 0.2692, "step": 4400 }, { "epoch": 1.96, "eval_loss": 0.25108450651168823, "eval_runtime": 7.1825, "eval_samples_per_second": 139.366, "eval_steps_per_second": 34.946, "step": 4400 }, { "epoch": 1.96, "learning_rate": 1.7333333333333336e-05, "loss": 0.2222, "step": 4410 }, { "epoch": 1.96, "learning_rate": 1.7259259259259262e-05, "loss": 0.2398, "step": 4420 }, { "epoch": 1.97, "eval_loss": 0.25103849172592163, "eval_runtime": 7.1987, "eval_samples_per_second": 139.053, "eval_steps_per_second": 34.867, "step": 4425 }, { "epoch": 1.97, "learning_rate": 1.7185185185185185e-05, "loss": 0.2295, "step": 4430 }, { "epoch": 1.97, "learning_rate": 1.7111111111111112e-05, "loss": 0.2153, "step": 4440 }, { "epoch": 1.98, "learning_rate": 1.7037037037037038e-05, "loss": 0.2799, "step": 4450 }, { "epoch": 1.98, "eval_loss": 0.25079530477523804, "eval_runtime": 7.4048, "eval_samples_per_second": 135.183, "eval_steps_per_second": 33.897, "step": 4450 }, { "epoch": 1.98, "learning_rate": 1.6962962962962965e-05, "loss": 0.2143, "step": 4460 }, { "epoch": 1.99, "learning_rate": 1.688888888888889e-05, "loss": 0.2492, "step": 4470 }, { "epoch": 1.99, "eval_loss": 0.25007081031799316, "eval_runtime": 7.3873, "eval_samples_per_second": 135.504, "eval_steps_per_second": 33.977, "step": 4475 }, { "epoch": 1.99, "learning_rate": 1.6814814814814818e-05, "loss": 0.2657, "step": 4480 }, { "epoch": 2.0, "learning_rate": 1.674074074074074e-05, "loss": 0.2207, "step": 4490 }, { "epoch": 2.0, "learning_rate": 1.6666666666666667e-05, "loss": 0.2202, "step": 4500 }, { "epoch": 2.0, "eval_loss": 0.24966831505298615, "eval_runtime": 7.3428, "eval_samples_per_second": 136.324, "eval_steps_per_second": 34.183, "step": 4500 }, { "epoch": 2.0, "learning_rate": 1.6592592592592594e-05, "loss": 0.2467, "step": 4510 }, { "epoch": 2.01, "learning_rate": 1.651851851851852e-05, "loss": 0.1736, "step": 4520 }, { "epoch": 2.01, "eval_loss": 0.25051918625831604, "eval_runtime": 7.2975, "eval_samples_per_second": 137.171, "eval_steps_per_second": 34.395, "step": 4525 }, { "epoch": 2.01, "learning_rate": 1.6444444444444447e-05, "loss": 0.1836, "step": 4530 }, { "epoch": 2.02, "learning_rate": 1.6370370370370374e-05, "loss": 0.1811, "step": 4540 }, { "epoch": 2.02, "learning_rate": 1.62962962962963e-05, "loss": 0.204, "step": 4550 }, { "epoch": 2.02, "eval_loss": 0.25177687406539917, "eval_runtime": 7.1922, "eval_samples_per_second": 139.179, "eval_steps_per_second": 34.899, "step": 4550 }, { "epoch": 2.03, "learning_rate": 1.6222222222222223e-05, "loss": 0.22, "step": 4560 }, { "epoch": 2.03, "learning_rate": 1.614814814814815e-05, "loss": 0.1853, "step": 4570 }, { "epoch": 2.03, "eval_loss": 0.25192761421203613, "eval_runtime": 7.1908, "eval_samples_per_second": 139.205, "eval_steps_per_second": 34.906, "step": 4575 }, { "epoch": 2.04, "learning_rate": 1.6074074074074076e-05, "loss": 0.1916, "step": 4580 }, { "epoch": 2.04, "learning_rate": 1.6000000000000003e-05, "loss": 0.2385, "step": 4590 }, { "epoch": 2.04, "learning_rate": 1.5925925925925926e-05, "loss": 0.1834, "step": 4600 }, { "epoch": 2.04, "eval_loss": 0.25096383690834045, "eval_runtime": 7.214, "eval_samples_per_second": 138.757, "eval_steps_per_second": 34.793, "step": 4600 }, { "epoch": 2.05, "learning_rate": 1.5851851851851852e-05, "loss": 0.3046, "step": 4610 }, { "epoch": 2.05, "learning_rate": 1.577777777777778e-05, "loss": 0.2285, "step": 4620 }, { "epoch": 2.06, "eval_loss": 0.251299113035202, "eval_runtime": 7.2235, "eval_samples_per_second": 138.575, "eval_steps_per_second": 34.748, "step": 4625 }, { "epoch": 2.06, "learning_rate": 1.5703703703703705e-05, "loss": 0.213, "step": 4630 }, { "epoch": 2.06, "learning_rate": 1.5629629629629632e-05, "loss": 0.1222, "step": 4640 }, { "epoch": 2.07, "learning_rate": 1.5555555555555555e-05, "loss": 0.179, "step": 4650 }, { "epoch": 2.07, "eval_loss": 0.25189557671546936, "eval_runtime": 7.2744, "eval_samples_per_second": 137.607, "eval_steps_per_second": 34.505, "step": 4650 }, { "epoch": 2.07, "learning_rate": 1.548148148148148e-05, "loss": 0.1749, "step": 4660 }, { "epoch": 2.08, "learning_rate": 1.5407407407407408e-05, "loss": 0.2163, "step": 4670 }, { "epoch": 2.08, "eval_loss": 0.25133654475212097, "eval_runtime": 7.2066, "eval_samples_per_second": 138.901, "eval_steps_per_second": 34.829, "step": 4675 }, { "epoch": 2.08, "learning_rate": 1.5333333333333334e-05, "loss": 0.1791, "step": 4680 }, { "epoch": 2.08, "learning_rate": 1.5259259259259258e-05, "loss": 0.1868, "step": 4690 }, { "epoch": 2.09, "learning_rate": 1.5185185185185186e-05, "loss": 0.2034, "step": 4700 }, { "epoch": 2.09, "eval_loss": 0.251429945230484, "eval_runtime": 7.2815, "eval_samples_per_second": 137.471, "eval_steps_per_second": 34.471, "step": 4700 }, { "epoch": 2.09, "learning_rate": 1.5111111111111112e-05, "loss": 0.1877, "step": 4710 }, { "epoch": 2.1, "learning_rate": 1.5037037037037039e-05, "loss": 0.2275, "step": 4720 }, { "epoch": 2.1, "eval_loss": 0.25081896781921387, "eval_runtime": 7.1988, "eval_samples_per_second": 139.051, "eval_steps_per_second": 34.867, "step": 4725 }, { "epoch": 2.1, "learning_rate": 1.4962962962962965e-05, "loss": 0.2512, "step": 4730 }, { "epoch": 2.11, "learning_rate": 1.4888888888888888e-05, "loss": 0.1883, "step": 4740 }, { "epoch": 2.11, "learning_rate": 1.4814814814814815e-05, "loss": 0.2328, "step": 4750 }, { "epoch": 2.11, "eval_loss": 0.2509821653366089, "eval_runtime": 9.4138, "eval_samples_per_second": 106.334, "eval_steps_per_second": 26.663, "step": 4750 }, { "epoch": 2.12, "learning_rate": 1.4740740740740741e-05, "loss": 0.2146, "step": 4760 }, { "epoch": 2.12, "learning_rate": 1.4666666666666668e-05, "loss": 0.1935, "step": 4770 }, { "epoch": 2.12, "eval_loss": 0.2518753111362457, "eval_runtime": 7.4318, "eval_samples_per_second": 134.691, "eval_steps_per_second": 33.774, "step": 4775 }, { "epoch": 2.12, "learning_rate": 1.4592592592592594e-05, "loss": 0.2174, "step": 4780 }, { "epoch": 2.13, "learning_rate": 1.4518518518518521e-05, "loss": 0.1887, "step": 4790 }, { "epoch": 2.13, "learning_rate": 1.4444444444444444e-05, "loss": 0.1613, "step": 4800 }, { "epoch": 2.13, "eval_loss": 0.251521497964859, "eval_runtime": 7.26, "eval_samples_per_second": 137.88, "eval_steps_per_second": 34.573, "step": 4800 }, { "epoch": 2.14, "learning_rate": 1.437037037037037e-05, "loss": 0.2027, "step": 4810 }, { "epoch": 2.14, "learning_rate": 1.4296296296296297e-05, "loss": 0.2451, "step": 4820 }, { "epoch": 2.14, "eval_loss": 0.2503267526626587, "eval_runtime": 7.228, "eval_samples_per_second": 138.488, "eval_steps_per_second": 34.726, "step": 4825 }, { "epoch": 2.15, "learning_rate": 1.4222222222222224e-05, "loss": 0.199, "step": 4830 }, { "epoch": 2.15, "learning_rate": 1.4148148148148148e-05, "loss": 0.1797, "step": 4840 }, { "epoch": 2.16, "learning_rate": 1.4074074074074075e-05, "loss": 0.199, "step": 4850 }, { "epoch": 2.16, "eval_loss": 0.2507803440093994, "eval_runtime": 7.2959, "eval_samples_per_second": 137.2, "eval_steps_per_second": 34.403, "step": 4850 }, { "epoch": 2.16, "learning_rate": 1.4000000000000001e-05, "loss": 0.2138, "step": 4860 }, { "epoch": 2.16, "learning_rate": 1.3925925925925926e-05, "loss": 0.2486, "step": 4870 }, { "epoch": 2.17, "eval_loss": 0.25041449069976807, "eval_runtime": 7.2127, "eval_samples_per_second": 138.783, "eval_steps_per_second": 34.8, "step": 4875 }, { "epoch": 2.17, "learning_rate": 1.3851851851851853e-05, "loss": 0.1906, "step": 4880 }, { "epoch": 2.17, "learning_rate": 1.3777777777777778e-05, "loss": 0.2375, "step": 4890 }, { "epoch": 2.18, "learning_rate": 1.3703703703703704e-05, "loss": 0.2124, "step": 4900 }, { "epoch": 2.18, "eval_loss": 0.2496342658996582, "eval_runtime": 7.2877, "eval_samples_per_second": 137.354, "eval_steps_per_second": 34.441, "step": 4900 }, { "epoch": 2.18, "learning_rate": 1.362962962962963e-05, "loss": 0.1827, "step": 4910 }, { "epoch": 2.19, "learning_rate": 1.3555555555555557e-05, "loss": 0.2032, "step": 4920 }, { "epoch": 2.19, "eval_loss": 0.2495114803314209, "eval_runtime": 7.1954, "eval_samples_per_second": 139.117, "eval_steps_per_second": 34.883, "step": 4925 }, { "epoch": 2.19, "learning_rate": 1.348148148148148e-05, "loss": 0.133, "step": 4930 }, { "epoch": 2.2, "learning_rate": 1.3407407407407407e-05, "loss": 0.2068, "step": 4940 }, { "epoch": 2.2, "learning_rate": 1.3333333333333333e-05, "loss": 0.1772, "step": 4950 }, { "epoch": 2.2, "eval_loss": 0.25019142031669617, "eval_runtime": 7.291, "eval_samples_per_second": 137.293, "eval_steps_per_second": 34.426, "step": 4950 }, { "epoch": 2.2, "learning_rate": 1.325925925925926e-05, "loss": 0.213, "step": 4960 }, { "epoch": 2.21, "learning_rate": 1.3185185185185186e-05, "loss": 0.1879, "step": 4970 }, { "epoch": 2.21, "eval_loss": 0.24890127778053284, "eval_runtime": 7.2295, "eval_samples_per_second": 138.46, "eval_steps_per_second": 34.719, "step": 4975 }, { "epoch": 2.21, "learning_rate": 1.3111111111111113e-05, "loss": 0.1829, "step": 4980 }, { "epoch": 2.22, "learning_rate": 1.3037037037037036e-05, "loss": 0.211, "step": 4990 }, { "epoch": 2.22, "learning_rate": 1.2962962962962962e-05, "loss": 0.1554, "step": 5000 }, { "epoch": 2.22, "eval_loss": 0.24963602423667908, "eval_runtime": 7.2892, "eval_samples_per_second": 137.326, "eval_steps_per_second": 34.434, "step": 5000 }, { "epoch": 2.23, "learning_rate": 1.2888888888888889e-05, "loss": 0.1922, "step": 5010 }, { "epoch": 2.23, "learning_rate": 1.2814814814814815e-05, "loss": 0.2178, "step": 5020 }, { "epoch": 2.23, "eval_loss": 0.24953505396842957, "eval_runtime": 7.1973, "eval_samples_per_second": 139.08, "eval_steps_per_second": 34.874, "step": 5025 }, { "epoch": 2.24, "learning_rate": 1.2740740740740742e-05, "loss": 0.1909, "step": 5030 }, { "epoch": 2.24, "learning_rate": 1.2666666666666668e-05, "loss": 0.2111, "step": 5040 }, { "epoch": 2.24, "learning_rate": 1.2592592592592592e-05, "loss": 0.1936, "step": 5050 }, { "epoch": 2.24, "eval_loss": 0.249254047870636, "eval_runtime": 7.2689, "eval_samples_per_second": 137.711, "eval_steps_per_second": 34.531, "step": 5050 }, { "epoch": 2.25, "learning_rate": 1.2518518518518518e-05, "loss": 0.1913, "step": 5060 }, { "epoch": 2.25, "learning_rate": 1.2444444444444445e-05, "loss": 0.2, "step": 5070 }, { "epoch": 2.26, "eval_loss": 0.2488778978586197, "eval_runtime": 7.2184, "eval_samples_per_second": 138.674, "eval_steps_per_second": 34.772, "step": 5075 }, { "epoch": 2.26, "learning_rate": 1.2370370370370371e-05, "loss": 0.1723, "step": 5080 }, { "epoch": 2.26, "learning_rate": 1.2296296296296298e-05, "loss": 0.1875, "step": 5090 }, { "epoch": 2.27, "learning_rate": 1.2222222222222222e-05, "loss": 0.185, "step": 5100 }, { "epoch": 2.27, "eval_loss": 0.24892283976078033, "eval_runtime": 7.3812, "eval_samples_per_second": 135.615, "eval_steps_per_second": 34.005, "step": 5100 }, { "epoch": 2.27, "learning_rate": 1.2148148148148149e-05, "loss": 0.2168, "step": 5110 }, { "epoch": 2.28, "learning_rate": 1.2074074074074075e-05, "loss": 0.1783, "step": 5120 }, { "epoch": 2.28, "eval_loss": 0.2482217699289322, "eval_runtime": 7.3518, "eval_samples_per_second": 136.157, "eval_steps_per_second": 34.141, "step": 5125 }, { "epoch": 2.28, "learning_rate": 1.2e-05, "loss": 0.1643, "step": 5130 }, { "epoch": 2.28, "learning_rate": 1.1925925925925927e-05, "loss": 0.2211, "step": 5140 }, { "epoch": 2.29, "learning_rate": 1.1851851851851853e-05, "loss": 0.2276, "step": 5150 }, { "epoch": 2.29, "eval_loss": 0.2477390617132187, "eval_runtime": 7.5029, "eval_samples_per_second": 133.414, "eval_steps_per_second": 33.454, "step": 5150 }, { "epoch": 2.29, "learning_rate": 1.1777777777777778e-05, "loss": 0.1951, "step": 5160 }, { "epoch": 2.3, "learning_rate": 1.1703703703703705e-05, "loss": 0.2134, "step": 5170 }, { "epoch": 2.3, "eval_loss": 0.24737687408924103, "eval_runtime": 7.4753, "eval_samples_per_second": 133.907, "eval_steps_per_second": 33.577, "step": 5175 }, { "epoch": 2.3, "learning_rate": 1.1629629629629631e-05, "loss": 0.2208, "step": 5180 }, { "epoch": 2.31, "learning_rate": 1.1555555555555556e-05, "loss": 0.1714, "step": 5190 }, { "epoch": 2.31, "learning_rate": 1.1481481481481482e-05, "loss": 0.1747, "step": 5200 }, { "epoch": 2.31, "eval_loss": 0.2470199465751648, "eval_runtime": 7.6413, "eval_samples_per_second": 130.999, "eval_steps_per_second": 32.848, "step": 5200 }, { "epoch": 2.32, "learning_rate": 1.1407407407407409e-05, "loss": 0.209, "step": 5210 }, { "epoch": 2.32, "learning_rate": 1.1333333333333334e-05, "loss": 0.2121, "step": 5220 }, { "epoch": 2.32, "eval_loss": 0.2458607703447342, "eval_runtime": 7.515, "eval_samples_per_second": 133.2, "eval_steps_per_second": 33.4, "step": 5225 }, { "epoch": 2.32, "learning_rate": 1.125925925925926e-05, "loss": 0.2266, "step": 5230 }, { "epoch": 2.33, "learning_rate": 1.1185185185185187e-05, "loss": 0.1754, "step": 5240 }, { "epoch": 2.33, "learning_rate": 1.1111111111111112e-05, "loss": 0.2199, "step": 5250 }, { "epoch": 2.33, "eval_loss": 0.2461249679327011, "eval_runtime": 7.5406, "eval_samples_per_second": 132.748, "eval_steps_per_second": 33.286, "step": 5250 }, { "epoch": 2.34, "learning_rate": 1.1037037037037038e-05, "loss": 0.2609, "step": 5260 }, { "epoch": 2.34, "learning_rate": 1.0962962962962963e-05, "loss": 0.2708, "step": 5270 }, { "epoch": 2.34, "eval_loss": 0.24557095766067505, "eval_runtime": 7.3992, "eval_samples_per_second": 135.286, "eval_steps_per_second": 33.923, "step": 5275 }, { "epoch": 2.35, "learning_rate": 1.088888888888889e-05, "loss": 0.1986, "step": 5280 }, { "epoch": 2.35, "learning_rate": 1.0814814814814814e-05, "loss": 0.1937, "step": 5290 }, { "epoch": 2.36, "learning_rate": 1.074074074074074e-05, "loss": 0.2252, "step": 5300 }, { "epoch": 2.36, "eval_loss": 0.24577473104000092, "eval_runtime": 7.388, "eval_samples_per_second": 135.49, "eval_steps_per_second": 33.974, "step": 5300 }, { "epoch": 2.36, "learning_rate": 1.0666666666666667e-05, "loss": 0.1642, "step": 5310 }, { "epoch": 2.36, "learning_rate": 1.0592592592592592e-05, "loss": 0.1921, "step": 5320 }, { "epoch": 2.37, "eval_loss": 0.24629971385002136, "eval_runtime": 7.2504, "eval_samples_per_second": 138.061, "eval_steps_per_second": 34.619, "step": 5325 }, { "epoch": 2.37, "learning_rate": 1.0518518518518519e-05, "loss": 0.1795, "step": 5330 }, { "epoch": 2.37, "learning_rate": 1.0444444444444445e-05, "loss": 0.2051, "step": 5340 }, { "epoch": 2.38, "learning_rate": 1.037037037037037e-05, "loss": 0.1627, "step": 5350 }, { "epoch": 2.38, "eval_loss": 0.24658331274986267, "eval_runtime": 7.6185, "eval_samples_per_second": 131.39, "eval_steps_per_second": 32.946, "step": 5350 }, { "epoch": 2.38, "learning_rate": 1.0296296296296296e-05, "loss": 0.1746, "step": 5360 }, { "epoch": 2.39, "learning_rate": 1.0222222222222223e-05, "loss": 0.1988, "step": 5370 }, { "epoch": 2.39, "eval_loss": 0.24601979553699493, "eval_runtime": 7.5137, "eval_samples_per_second": 133.223, "eval_steps_per_second": 33.406, "step": 5375 }, { "epoch": 2.39, "learning_rate": 1.0148148148148148e-05, "loss": 0.1994, "step": 5380 }, { "epoch": 2.4, "learning_rate": 1.0074074074074074e-05, "loss": 0.191, "step": 5390 }, { "epoch": 2.4, "learning_rate": 1e-05, "loss": 0.2308, "step": 5400 }, { "epoch": 2.4, "eval_loss": 0.24538284540176392, "eval_runtime": 7.6411, "eval_samples_per_second": 131.002, "eval_steps_per_second": 32.849, "step": 5400 }, { "epoch": 2.4, "learning_rate": 9.925925925925926e-06, "loss": 0.2331, "step": 5410 }, { "epoch": 2.41, "learning_rate": 9.851851851851852e-06, "loss": 0.138, "step": 5420 }, { "epoch": 2.41, "eval_loss": 0.24550172686576843, "eval_runtime": 7.6397, "eval_samples_per_second": 131.026, "eval_steps_per_second": 32.855, "step": 5425 }, { "epoch": 2.41, "learning_rate": 9.777777777777779e-06, "loss": 0.2266, "step": 5430 }, { "epoch": 2.42, "learning_rate": 9.703703703703703e-06, "loss": 0.1693, "step": 5440 }, { "epoch": 2.42, "learning_rate": 9.62962962962963e-06, "loss": 0.1745, "step": 5450 }, { "epoch": 2.42, "eval_loss": 0.2455550730228424, "eval_runtime": 7.6397, "eval_samples_per_second": 131.025, "eval_steps_per_second": 32.855, "step": 5450 }, { "epoch": 2.43, "learning_rate": 9.555555555555556e-06, "loss": 0.1472, "step": 5460 }, { "epoch": 2.43, "learning_rate": 9.481481481481481e-06, "loss": 0.1712, "step": 5470 }, { "epoch": 2.43, "eval_loss": 0.24605458974838257, "eval_runtime": 8.1344, "eval_samples_per_second": 123.058, "eval_steps_per_second": 30.857, "step": 5475 }, { "epoch": 2.44, "learning_rate": 9.407407407407408e-06, "loss": 0.1755, "step": 5480 }, { "epoch": 2.44, "learning_rate": 9.333333333333334e-06, "loss": 0.1984, "step": 5490 }, { "epoch": 2.44, "learning_rate": 9.259259259259259e-06, "loss": 0.1763, "step": 5500 }, { "epoch": 2.44, "eval_loss": 0.24583406746387482, "eval_runtime": 10.3194, "eval_samples_per_second": 97.002, "eval_steps_per_second": 24.323, "step": 5500 }, { "epoch": 2.45, "learning_rate": 9.185185185185186e-06, "loss": 0.2261, "step": 5510 }, { "epoch": 2.45, "learning_rate": 9.111111111111112e-06, "loss": 0.2068, "step": 5520 }, { "epoch": 2.46, "eval_loss": 0.24605007469654083, "eval_runtime": 7.6128, "eval_samples_per_second": 131.489, "eval_steps_per_second": 32.971, "step": 5525 }, { "epoch": 2.46, "learning_rate": 9.037037037037037e-06, "loss": 0.2097, "step": 5530 }, { "epoch": 2.46, "learning_rate": 8.962962962962963e-06, "loss": 0.1733, "step": 5540 }, { "epoch": 2.47, "learning_rate": 8.88888888888889e-06, "loss": 0.2097, "step": 5550 }, { "epoch": 2.47, "eval_loss": 0.24533824622631073, "eval_runtime": 7.6087, "eval_samples_per_second": 131.559, "eval_steps_per_second": 32.988, "step": 5550 }, { "epoch": 2.47, "learning_rate": 8.814814814814815e-06, "loss": 0.1896, "step": 5560 }, { "epoch": 2.48, "learning_rate": 8.740740740740741e-06, "loss": 0.1945, "step": 5570 }, { "epoch": 2.48, "eval_loss": 0.24509920179843903, "eval_runtime": 7.6048, "eval_samples_per_second": 131.627, "eval_steps_per_second": 33.005, "step": 5575 }, { "epoch": 2.48, "learning_rate": 8.666666666666668e-06, "loss": 0.2036, "step": 5580 }, { "epoch": 2.48, "learning_rate": 8.592592592592593e-06, "loss": 0.1933, "step": 5590 }, { "epoch": 2.49, "learning_rate": 8.518518518518519e-06, "loss": 0.1906, "step": 5600 }, { "epoch": 2.49, "eval_loss": 0.24449166655540466, "eval_runtime": 7.5807, "eval_samples_per_second": 132.047, "eval_steps_per_second": 33.111, "step": 5600 }, { "epoch": 2.49, "learning_rate": 8.444444444444446e-06, "loss": 0.1655, "step": 5610 }, { "epoch": 2.5, "learning_rate": 8.37037037037037e-06, "loss": 0.1681, "step": 5620 }, { "epoch": 2.5, "eval_loss": 0.24523867666721344, "eval_runtime": 7.5355, "eval_samples_per_second": 132.839, "eval_steps_per_second": 33.309, "step": 5625 }, { "epoch": 2.5, "learning_rate": 8.296296296296297e-06, "loss": 0.202, "step": 5630 }, { "epoch": 2.51, "learning_rate": 8.222222222222223e-06, "loss": 0.1795, "step": 5640 }, { "epoch": 2.51, "learning_rate": 8.14814814814815e-06, "loss": 0.1732, "step": 5650 }, { "epoch": 2.51, "eval_loss": 0.24490933120250702, "eval_runtime": 7.4917, "eval_samples_per_second": 133.614, "eval_steps_per_second": 33.504, "step": 5650 }, { "epoch": 2.52, "learning_rate": 8.074074074074075e-06, "loss": 0.1765, "step": 5660 }, { "epoch": 2.52, "learning_rate": 8.000000000000001e-06, "loss": 0.2129, "step": 5670 }, { "epoch": 2.52, "eval_loss": 0.2450021654367447, "eval_runtime": 7.5776, "eval_samples_per_second": 132.099, "eval_steps_per_second": 33.124, "step": 5675 }, { "epoch": 2.52, "learning_rate": 7.925925925925926e-06, "loss": 0.239, "step": 5680 }, { "epoch": 2.53, "learning_rate": 7.851851851851853e-06, "loss": 0.153, "step": 5690 }, { "epoch": 2.53, "learning_rate": 7.777777777777777e-06, "loss": 0.1749, "step": 5700 }, { "epoch": 2.53, "eval_loss": 0.244710773229599, "eval_runtime": 7.2355, "eval_samples_per_second": 138.346, "eval_steps_per_second": 34.69, "step": 5700 }, { "epoch": 2.54, "learning_rate": 7.703703703703704e-06, "loss": 0.2436, "step": 5710 }, { "epoch": 2.54, "learning_rate": 7.629629629629629e-06, "loss": 0.217, "step": 5720 }, { "epoch": 2.54, "eval_loss": 0.24449826776981354, "eval_runtime": 7.2691, "eval_samples_per_second": 137.706, "eval_steps_per_second": 34.53, "step": 5725 }, { "epoch": 2.55, "learning_rate": 7.555555555555556e-06, "loss": 0.1999, "step": 5730 }, { "epoch": 2.55, "learning_rate": 7.481481481481483e-06, "loss": 0.1981, "step": 5740 }, { "epoch": 2.56, "learning_rate": 7.4074074074074075e-06, "loss": 0.2359, "step": 5750 }, { "epoch": 2.56, "eval_loss": 0.24463185667991638, "eval_runtime": 7.1779, "eval_samples_per_second": 139.456, "eval_steps_per_second": 34.968, "step": 5750 }, { "epoch": 2.56, "learning_rate": 7.333333333333334e-06, "loss": 0.1824, "step": 5760 }, { "epoch": 2.56, "learning_rate": 7.2592592592592605e-06, "loss": 0.1813, "step": 5770 }, { "epoch": 2.57, "eval_loss": 0.24394886195659637, "eval_runtime": 7.21, "eval_samples_per_second": 138.835, "eval_steps_per_second": 34.813, "step": 5775 }, { "epoch": 2.57, "learning_rate": 7.185185185185185e-06, "loss": 0.2108, "step": 5780 }, { "epoch": 2.57, "learning_rate": 7.111111111111112e-06, "loss": 0.2403, "step": 5790 }, { "epoch": 2.58, "learning_rate": 7.0370370370370375e-06, "loss": 0.1771, "step": 5800 }, { "epoch": 2.58, "eval_loss": 0.2437148541212082, "eval_runtime": 7.3132, "eval_samples_per_second": 136.876, "eval_steps_per_second": 34.322, "step": 5800 }, { "epoch": 2.58, "learning_rate": 6.962962962962963e-06, "loss": 0.2451, "step": 5810 }, { "epoch": 2.59, "learning_rate": 6.888888888888889e-06, "loss": 0.2009, "step": 5820 }, { "epoch": 2.59, "eval_loss": 0.2434249371290207, "eval_runtime": 7.2848, "eval_samples_per_second": 137.41, "eval_steps_per_second": 34.455, "step": 5825 }, { "epoch": 2.59, "learning_rate": 6.814814814814815e-06, "loss": 0.1734, "step": 5830 }, { "epoch": 2.6, "learning_rate": 6.74074074074074e-06, "loss": 0.2106, "step": 5840 }, { "epoch": 2.6, "learning_rate": 6.666666666666667e-06, "loss": 0.2255, "step": 5850 }, { "epoch": 2.6, "eval_loss": 0.24379944801330566, "eval_runtime": 7.3588, "eval_samples_per_second": 136.028, "eval_steps_per_second": 34.109, "step": 5850 }, { "epoch": 2.6, "learning_rate": 6.592592592592593e-06, "loss": 0.2042, "step": 5860 }, { "epoch": 2.61, "learning_rate": 6.518518518518518e-06, "loss": 0.1645, "step": 5870 }, { "epoch": 2.61, "eval_loss": 0.24349051713943481, "eval_runtime": 7.3319, "eval_samples_per_second": 136.527, "eval_steps_per_second": 34.234, "step": 5875 }, { "epoch": 2.61, "learning_rate": 6.4444444444444445e-06, "loss": 0.1971, "step": 5880 }, { "epoch": 2.62, "learning_rate": 6.370370370370371e-06, "loss": 0.1939, "step": 5890 }, { "epoch": 2.62, "learning_rate": 6.296296296296296e-06, "loss": 0.2087, "step": 5900 }, { "epoch": 2.62, "eval_loss": 0.24342414736747742, "eval_runtime": 7.3018, "eval_samples_per_second": 137.09, "eval_steps_per_second": 34.375, "step": 5900 }, { "epoch": 2.63, "learning_rate": 6.222222222222222e-06, "loss": 0.2373, "step": 5910 }, { "epoch": 2.63, "learning_rate": 6.148148148148149e-06, "loss": 0.2125, "step": 5920 }, { "epoch": 2.63, "eval_loss": 0.24332143366336823, "eval_runtime": 7.221, "eval_samples_per_second": 138.623, "eval_steps_per_second": 34.76, "step": 5925 }, { "epoch": 2.64, "learning_rate": 6.0740740740740745e-06, "loss": 0.2188, "step": 5930 }, { "epoch": 2.64, "learning_rate": 6e-06, "loss": 0.1756, "step": 5940 }, { "epoch": 2.64, "learning_rate": 5.925925925925927e-06, "loss": 0.2484, "step": 5950 }, { "epoch": 2.64, "eval_loss": 0.24283772706985474, "eval_runtime": 7.2081, "eval_samples_per_second": 138.871, "eval_steps_per_second": 34.822, "step": 5950 }, { "epoch": 2.65, "learning_rate": 5.851851851851852e-06, "loss": 0.201, "step": 5960 }, { "epoch": 2.65, "learning_rate": 5.777777777777778e-06, "loss": 0.1724, "step": 5970 }, { "epoch": 2.66, "eval_loss": 0.24253526329994202, "eval_runtime": 7.2139, "eval_samples_per_second": 138.76, "eval_steps_per_second": 34.794, "step": 5975 }, { "epoch": 2.66, "learning_rate": 5.7037037037037045e-06, "loss": 0.2071, "step": 5980 }, { "epoch": 2.66, "learning_rate": 5.62962962962963e-06, "loss": 0.2026, "step": 5990 }, { "epoch": 2.67, "learning_rate": 5.555555555555556e-06, "loss": 0.18, "step": 6000 }, { "epoch": 2.67, "eval_loss": 0.2427457869052887, "eval_runtime": 7.24, "eval_samples_per_second": 138.259, "eval_steps_per_second": 34.668, "step": 6000 }, { "epoch": 2.67, "learning_rate": 5.4814814814814815e-06, "loss": 0.2491, "step": 6010 }, { "epoch": 2.68, "learning_rate": 5.407407407407407e-06, "loss": 0.1878, "step": 6020 }, { "epoch": 2.68, "eval_loss": 0.2427491545677185, "eval_runtime": 7.2356, "eval_samples_per_second": 138.343, "eval_steps_per_second": 34.689, "step": 6025 }, { "epoch": 2.68, "learning_rate": 5.333333333333334e-06, "loss": 0.196, "step": 6030 }, { "epoch": 2.68, "learning_rate": 5.259259259259259e-06, "loss": 0.1861, "step": 6040 }, { "epoch": 2.69, "learning_rate": 5.185185185185185e-06, "loss": 0.2131, "step": 6050 }, { "epoch": 2.69, "eval_loss": 0.24262972176074982, "eval_runtime": 7.2135, "eval_samples_per_second": 138.768, "eval_steps_per_second": 34.796, "step": 6050 }, { "epoch": 2.69, "learning_rate": 5.1111111111111115e-06, "loss": 0.1904, "step": 6060 }, { "epoch": 2.7, "learning_rate": 5.037037037037037e-06, "loss": 0.1677, "step": 6070 }, { "epoch": 2.7, "eval_loss": 0.24258430302143097, "eval_runtime": 7.38, "eval_samples_per_second": 135.638, "eval_steps_per_second": 34.011, "step": 6075 }, { "epoch": 2.7, "learning_rate": 4.962962962962963e-06, "loss": 0.1728, "step": 6080 }, { "epoch": 2.71, "learning_rate": 4.888888888888889e-06, "loss": 0.2153, "step": 6090 }, { "epoch": 2.71, "learning_rate": 4.814814814814815e-06, "loss": 0.2097, "step": 6100 }, { "epoch": 2.71, "eval_loss": 0.2426908016204834, "eval_runtime": 7.3472, "eval_samples_per_second": 136.243, "eval_steps_per_second": 34.163, "step": 6100 }, { "epoch": 2.72, "learning_rate": 4.740740740740741e-06, "loss": 0.1584, "step": 6110 }, { "epoch": 2.72, "learning_rate": 4.666666666666667e-06, "loss": 0.2014, "step": 6120 }, { "epoch": 2.72, "eval_loss": 0.24265140295028687, "eval_runtime": 7.3411, "eval_samples_per_second": 136.355, "eval_steps_per_second": 34.191, "step": 6125 }, { "epoch": 2.72, "learning_rate": 4.592592592592593e-06, "loss": 0.2504, "step": 6130 }, { "epoch": 2.73, "learning_rate": 4.5185185185185185e-06, "loss": 0.207, "step": 6140 }, { "epoch": 2.73, "learning_rate": 4.444444444444445e-06, "loss": 0.1777, "step": 6150 }, { "epoch": 2.73, "eval_loss": 0.24221929907798767, "eval_runtime": 7.2165, "eval_samples_per_second": 138.709, "eval_steps_per_second": 34.781, "step": 6150 }, { "epoch": 2.74, "learning_rate": 4.370370370370371e-06, "loss": 0.1982, "step": 6160 }, { "epoch": 2.74, "learning_rate": 4.296296296296296e-06, "loss": 0.189, "step": 6170 }, { "epoch": 2.74, "eval_loss": 0.24199625849723816, "eval_runtime": 7.2328, "eval_samples_per_second": 138.397, "eval_steps_per_second": 34.703, "step": 6175 }, { "epoch": 2.75, "learning_rate": 4.222222222222223e-06, "loss": 0.1895, "step": 6180 }, { "epoch": 2.75, "learning_rate": 4.1481481481481485e-06, "loss": 0.2303, "step": 6190 }, { "epoch": 2.76, "learning_rate": 4.074074074074075e-06, "loss": 0.2499, "step": 6200 }, { "epoch": 2.76, "eval_loss": 0.24183855950832367, "eval_runtime": 7.2222, "eval_samples_per_second": 138.6, "eval_steps_per_second": 34.754, "step": 6200 }, { "epoch": 2.76, "learning_rate": 4.000000000000001e-06, "loss": 0.2105, "step": 6210 }, { "epoch": 2.76, "learning_rate": 3.925925925925926e-06, "loss": 0.1827, "step": 6220 }, { "epoch": 2.77, "eval_loss": 0.24157124757766724, "eval_runtime": 7.2228, "eval_samples_per_second": 138.589, "eval_steps_per_second": 34.751, "step": 6225 }, { "epoch": 2.77, "learning_rate": 3.851851851851852e-06, "loss": 0.1625, "step": 6230 }, { "epoch": 2.77, "learning_rate": 3.777777777777778e-06, "loss": 0.2046, "step": 6240 }, { "epoch": 2.78, "learning_rate": 3.7037037037037037e-06, "loss": 0.1872, "step": 6250 }, { "epoch": 2.78, "eval_loss": 0.24152511358261108, "eval_runtime": 7.2189, "eval_samples_per_second": 138.665, "eval_steps_per_second": 34.77, "step": 6250 }, { "epoch": 2.78, "learning_rate": 3.6296296296296302e-06, "loss": 0.1476, "step": 6260 }, { "epoch": 2.79, "learning_rate": 3.555555555555556e-06, "loss": 0.1948, "step": 6270 }, { "epoch": 2.79, "eval_loss": 0.24153129756450653, "eval_runtime": 7.2338, "eval_samples_per_second": 138.379, "eval_steps_per_second": 34.698, "step": 6275 }, { "epoch": 2.79, "learning_rate": 3.4814814814814816e-06, "loss": 0.184, "step": 6280 }, { "epoch": 2.8, "learning_rate": 3.4074074074074077e-06, "loss": 0.2911, "step": 6290 }, { "epoch": 2.8, "learning_rate": 3.3333333333333333e-06, "loss": 0.2382, "step": 6300 }, { "epoch": 2.8, "eval_loss": 0.24131891131401062, "eval_runtime": 7.204, "eval_samples_per_second": 138.95, "eval_steps_per_second": 34.842, "step": 6300 }, { "epoch": 2.8, "learning_rate": 3.259259259259259e-06, "loss": 0.2274, "step": 6310 }, { "epoch": 2.81, "learning_rate": 3.1851851851851855e-06, "loss": 0.2495, "step": 6320 }, { "epoch": 2.81, "eval_loss": 0.24129566550254822, "eval_runtime": 7.2128, "eval_samples_per_second": 138.78, "eval_steps_per_second": 34.799, "step": 6325 }, { "epoch": 2.81, "learning_rate": 3.111111111111111e-06, "loss": 0.1895, "step": 6330 }, { "epoch": 2.82, "learning_rate": 3.0370370370370372e-06, "loss": 0.2067, "step": 6340 }, { "epoch": 2.82, "learning_rate": 2.9629629629629633e-06, "loss": 0.2258, "step": 6350 }, { "epoch": 2.82, "eval_loss": 0.24125269055366516, "eval_runtime": 7.2125, "eval_samples_per_second": 138.787, "eval_steps_per_second": 34.801, "step": 6350 }, { "epoch": 2.83, "learning_rate": 2.888888888888889e-06, "loss": 0.1759, "step": 6360 }, { "epoch": 2.83, "learning_rate": 2.814814814814815e-06, "loss": 0.1618, "step": 6370 }, { "epoch": 2.83, "eval_loss": 0.24155820906162262, "eval_runtime": 7.2268, "eval_samples_per_second": 138.511, "eval_steps_per_second": 34.732, "step": 6375 }, { "epoch": 2.84, "learning_rate": 2.7407407407407407e-06, "loss": 0.1782, "step": 6380 }, { "epoch": 2.84, "learning_rate": 2.666666666666667e-06, "loss": 0.1813, "step": 6390 }, { "epoch": 2.84, "learning_rate": 2.5925925925925925e-06, "loss": 0.1605, "step": 6400 }, { "epoch": 2.84, "eval_loss": 0.24137118458747864, "eval_runtime": 7.2587, "eval_samples_per_second": 137.903, "eval_steps_per_second": 34.579, "step": 6400 }, { "epoch": 2.85, "learning_rate": 2.5185185185185186e-06, "loss": 0.1751, "step": 6410 }, { "epoch": 2.85, "learning_rate": 2.4444444444444447e-06, "loss": 0.1712, "step": 6420 }, { "epoch": 2.86, "eval_loss": 0.24129529297351837, "eval_runtime": 7.385, "eval_samples_per_second": 135.545, "eval_steps_per_second": 33.988, "step": 6425 }, { "epoch": 2.86, "learning_rate": 2.3703703703703703e-06, "loss": 0.2067, "step": 6430 }, { "epoch": 2.86, "learning_rate": 2.2962962962962964e-06, "loss": 0.177, "step": 6440 }, { "epoch": 2.87, "learning_rate": 2.2222222222222225e-06, "loss": 0.2021, "step": 6450 }, { "epoch": 2.87, "eval_loss": 0.24113218486309052, "eval_runtime": 7.1916, "eval_samples_per_second": 139.191, "eval_steps_per_second": 34.902, "step": 6450 }, { "epoch": 2.87, "learning_rate": 2.148148148148148e-06, "loss": 0.1895, "step": 6460 }, { "epoch": 2.88, "learning_rate": 2.0740740740740742e-06, "loss": 0.1985, "step": 6470 }, { "epoch": 2.88, "eval_loss": 0.24104492366313934, "eval_runtime": 7.2162, "eval_samples_per_second": 138.715, "eval_steps_per_second": 34.783, "step": 6475 }, { "epoch": 2.88, "learning_rate": 2.0000000000000003e-06, "loss": 0.1742, "step": 6480 }, { "epoch": 2.88, "learning_rate": 1.925925925925926e-06, "loss": 0.2035, "step": 6490 }, { "epoch": 2.89, "learning_rate": 1.8518518518518519e-06, "loss": 0.1773, "step": 6500 }, { "epoch": 2.89, "eval_loss": 0.24099375307559967, "eval_runtime": 7.2337, "eval_samples_per_second": 138.38, "eval_steps_per_second": 34.699, "step": 6500 }, { "epoch": 2.89, "learning_rate": 1.777777777777778e-06, "loss": 0.1872, "step": 6510 }, { "epoch": 2.9, "learning_rate": 1.7037037037037038e-06, "loss": 0.2152, "step": 6520 }, { "epoch": 2.9, "eval_loss": 0.24097038805484772, "eval_runtime": 7.2148, "eval_samples_per_second": 138.743, "eval_steps_per_second": 34.79, "step": 6525 }, { "epoch": 2.9, "learning_rate": 1.6296296296296295e-06, "loss": 0.2588, "step": 6530 }, { "epoch": 2.91, "learning_rate": 1.5555555555555556e-06, "loss": 0.2228, "step": 6540 }, { "epoch": 2.91, "learning_rate": 1.4814814814814817e-06, "loss": 0.2225, "step": 6550 }, { "epoch": 2.91, "eval_loss": 0.24102109670639038, "eval_runtime": 7.2724, "eval_samples_per_second": 137.644, "eval_steps_per_second": 34.514, "step": 6550 }, { "epoch": 2.92, "learning_rate": 1.4074074074074075e-06, "loss": 0.2303, "step": 6560 }, { "epoch": 2.92, "learning_rate": 1.3333333333333334e-06, "loss": 0.1393, "step": 6570 }, { "epoch": 2.92, "eval_loss": 0.24098366498947144, "eval_runtime": 7.2615, "eval_samples_per_second": 137.849, "eval_steps_per_second": 34.566, "step": 6575 }, { "epoch": 2.92, "learning_rate": 1.2592592592592593e-06, "loss": 0.1581, "step": 6580 }, { "epoch": 2.93, "learning_rate": 1.1851851851851852e-06, "loss": 0.2093, "step": 6590 }, { "epoch": 2.93, "learning_rate": 1.1111111111111112e-06, "loss": 0.1631, "step": 6600 }, { "epoch": 2.93, "eval_loss": 0.24084864556789398, "eval_runtime": 7.2231, "eval_samples_per_second": 138.583, "eval_steps_per_second": 34.75, "step": 6600 }, { "epoch": 2.94, "learning_rate": 1.0370370370370371e-06, "loss": 0.2028, "step": 6610 }, { "epoch": 2.94, "learning_rate": 9.62962962962963e-07, "loss": 0.2158, "step": 6620 }, { "epoch": 2.94, "eval_loss": 0.24080170691013336, "eval_runtime": 7.2315, "eval_samples_per_second": 138.423, "eval_steps_per_second": 34.709, "step": 6625 }, { "epoch": 2.95, "learning_rate": 8.88888888888889e-07, "loss": 0.1775, "step": 6630 }, { "epoch": 2.95, "learning_rate": 8.148148148148147e-07, "loss": 0.1827, "step": 6640 }, { "epoch": 2.96, "learning_rate": 7.407407407407408e-07, "loss": 0.218, "step": 6650 }, { "epoch": 2.96, "eval_loss": 0.24078905582427979, "eval_runtime": 7.2318, "eval_samples_per_second": 138.417, "eval_steps_per_second": 34.708, "step": 6650 }, { "epoch": 2.96, "learning_rate": 6.666666666666667e-07, "loss": 0.1736, "step": 6660 }, { "epoch": 2.96, "learning_rate": 5.925925925925926e-07, "loss": 0.1654, "step": 6670 }, { "epoch": 2.97, "eval_loss": 0.24077662825584412, "eval_runtime": 7.2025, "eval_samples_per_second": 138.98, "eval_steps_per_second": 34.849, "step": 6675 }, { "epoch": 2.97, "learning_rate": 5.185185185185186e-07, "loss": 0.1993, "step": 6680 }, { "epoch": 2.97, "learning_rate": 4.444444444444445e-07, "loss": 0.1852, "step": 6690 }, { "epoch": 2.98, "learning_rate": 3.703703703703704e-07, "loss": 0.1955, "step": 6700 }, { "epoch": 2.98, "eval_loss": 0.24076078832149506, "eval_runtime": 7.1929, "eval_samples_per_second": 139.165, "eval_steps_per_second": 34.895, "step": 6700 }, { "epoch": 2.98, "learning_rate": 2.962962962962963e-07, "loss": 0.1923, "step": 6710 }, { "epoch": 2.99, "learning_rate": 2.2222222222222224e-07, "loss": 0.2142, "step": 6720 }, { "epoch": 2.99, "eval_loss": 0.24076122045516968, "eval_runtime": 7.1938, "eval_samples_per_second": 139.147, "eval_steps_per_second": 34.891, "step": 6725 }, { "epoch": 2.99, "learning_rate": 1.4814814814814815e-07, "loss": 0.2464, "step": 6730 }, { "epoch": 3.0, "learning_rate": 7.407407407407407e-08, "loss": 0.2322, "step": 6740 }, { "epoch": 3.0, "learning_rate": 0.0, "loss": 0.1781, "step": 6750 }, { "epoch": 3.0, "eval_loss": 0.24075740575790405, "eval_runtime": 7.2037, "eval_samples_per_second": 138.956, "eval_steps_per_second": 34.843, "step": 6750 } ], "max_steps": 6750, "num_train_epochs": 3, "total_flos": 915255853056000.0, "trial_name": null, "trial_params": null }