{ "best_global_step": 4300, "best_metric": 2.432278633117676, "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-4000", "epoch": 0.18, "eval_steps": 100, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 39.75564521032967, "learning_rate": 4.8e-08, "loss": 3.6517, "step": 25 }, { "epoch": 0.001, "grad_norm": 28.937531835097435, "learning_rate": 9.8e-08, "loss": 3.5931, "step": 50 }, { "epoch": 0.0015, "grad_norm": 21.922720332659644, "learning_rate": 1.4800000000000003e-07, "loss": 3.3397, "step": 75 }, { "epoch": 0.002, "grad_norm": 8.739610199908325, "learning_rate": 1.9800000000000003e-07, "loss": 3.1289, "step": 100 }, { "epoch": 0.002, "eval_loss": 2.9243295192718506, "eval_runtime": 264.3302, "eval_samples_per_second": 3.11, "eval_steps_per_second": 1.555, "step": 100 }, { "epoch": 0.0025, "grad_norm": 4.433912600039677, "learning_rate": 2.48e-07, "loss": 2.8957, "step": 125 }, { "epoch": 0.003, "grad_norm": 3.2874790066620303, "learning_rate": 2.9800000000000005e-07, "loss": 2.763, "step": 150 }, { "epoch": 0.0035, "grad_norm": 1.5203472215469231, "learning_rate": 3.48e-07, "loss": 2.676, "step": 175 }, { "epoch": 0.004, "grad_norm": 1.1945541683905954, "learning_rate": 3.9800000000000004e-07, "loss": 2.635, "step": 200 }, { "epoch": 0.004, "eval_loss": 2.6094932556152344, "eval_runtime": 265.7702, "eval_samples_per_second": 3.093, "eval_steps_per_second": 1.546, "step": 200 }, { "epoch": 0.0045, "grad_norm": 1.0852713304633745, "learning_rate": 4.4800000000000004e-07, "loss": 2.6016, "step": 225 }, { "epoch": 0.005, "grad_norm": 1.0733940346699529, "learning_rate": 4.98e-07, "loss": 2.5797, "step": 250 }, { "epoch": 0.0055, "grad_norm": 0.9273949035031271, "learning_rate": 5.480000000000001e-07, "loss": 2.5607, "step": 275 }, { "epoch": 0.006, "grad_norm": 0.9289300678591714, "learning_rate": 5.98e-07, "loss": 2.552, "step": 300 }, { "epoch": 0.006, "eval_loss": 2.541522264480591, "eval_runtime": 266.7478, "eval_samples_per_second": 3.082, "eval_steps_per_second": 1.541, "step": 300 }, { "epoch": 0.0065, "grad_norm": 1.1328584507449984, "learning_rate": 6.48e-07, "loss": 2.5402, "step": 325 }, { "epoch": 0.007, "grad_norm": 0.8593307029257858, "learning_rate": 6.98e-07, "loss": 2.5286, "step": 350 }, { "epoch": 0.0075, "grad_norm": 0.895615604067586, "learning_rate": 7.480000000000001e-07, "loss": 2.5311, "step": 375 }, { "epoch": 0.008, "grad_norm": 0.912306580242149, "learning_rate": 7.98e-07, "loss": 2.5037, "step": 400 }, { "epoch": 0.008, "eval_loss": 2.514389991760254, "eval_runtime": 266.4899, "eval_samples_per_second": 3.085, "eval_steps_per_second": 1.542, "step": 400 }, { "epoch": 0.0085, "grad_norm": 1.1866535514670034, "learning_rate": 8.480000000000001e-07, "loss": 2.5011, "step": 425 }, { "epoch": 0.009, "grad_norm": 1.211342504193914, "learning_rate": 8.980000000000001e-07, "loss": 2.503, "step": 450 }, { "epoch": 0.0095, "grad_norm": 1.113763817383069, "learning_rate": 9.480000000000001e-07, "loss": 2.4999, "step": 475 }, { "epoch": 0.01, "grad_norm": 1.2585585589647226, "learning_rate": 9.98e-07, "loss": 2.4872, "step": 500 }, { "epoch": 0.01, "eval_loss": 2.497868061065674, "eval_runtime": 265.7962, "eval_samples_per_second": 3.093, "eval_steps_per_second": 1.546, "step": 500 }, { "epoch": 0.0105, "grad_norm": 1.2585825718084245, "learning_rate": 1.0480000000000002e-06, "loss": 2.4852, "step": 525 }, { "epoch": 0.011, "grad_norm": 1.4101257437846046, "learning_rate": 1.0980000000000001e-06, "loss": 2.4892, "step": 550 }, { "epoch": 0.0115, "grad_norm": 1.1975234150707363, "learning_rate": 1.148e-06, "loss": 2.4861, "step": 575 }, { "epoch": 0.012, "grad_norm": 1.3662769225582332, "learning_rate": 1.1980000000000002e-06, "loss": 2.4882, "step": 600 }, { "epoch": 0.012, "eval_loss": 2.4879231452941895, "eval_runtime": 267.0005, "eval_samples_per_second": 3.079, "eval_steps_per_second": 1.539, "step": 600 }, { "epoch": 0.0125, "grad_norm": 1.3086724275194024, "learning_rate": 1.248e-06, "loss": 2.4745, "step": 625 }, { "epoch": 0.013, "grad_norm": 1.317023206802888, "learning_rate": 1.2980000000000001e-06, "loss": 2.4727, "step": 650 }, { "epoch": 0.0135, "grad_norm": 1.5284967544483212, "learning_rate": 1.348e-06, "loss": 2.469, "step": 675 }, { "epoch": 0.014, "grad_norm": 1.1047595217316941, "learning_rate": 1.3980000000000002e-06, "loss": 2.4695, "step": 700 }, { "epoch": 0.014, "eval_loss": 2.480103015899658, "eval_runtime": 263.5022, "eval_samples_per_second": 3.12, "eval_steps_per_second": 1.56, "step": 700 }, { "epoch": 0.0145, "grad_norm": 1.2077328209863791, "learning_rate": 1.4480000000000002e-06, "loss": 2.4654, "step": 725 }, { "epoch": 0.015, "grad_norm": 1.209220841771836, "learning_rate": 1.498e-06, "loss": 2.4663, "step": 750 }, { "epoch": 0.0155, "grad_norm": 1.3063169829879686, "learning_rate": 1.548e-06, "loss": 2.4704, "step": 775 }, { "epoch": 0.016, "grad_norm": 1.3180183352683195, "learning_rate": 1.5980000000000002e-06, "loss": 2.4583, "step": 800 }, { "epoch": 0.016, "eval_loss": 2.473590850830078, "eval_runtime": 305.9875, "eval_samples_per_second": 2.686, "eval_steps_per_second": 1.343, "step": 800 }, { "epoch": 0.0165, "grad_norm": 1.1674852380778837, "learning_rate": 1.6480000000000001e-06, "loss": 2.467, "step": 825 }, { "epoch": 0.017, "grad_norm": 1.2497656349941002, "learning_rate": 1.6980000000000003e-06, "loss": 2.4612, "step": 850 }, { "epoch": 0.0175, "grad_norm": 1.3358614980967494, "learning_rate": 1.7480000000000002e-06, "loss": 2.4636, "step": 875 }, { "epoch": 0.018, "grad_norm": 1.252489857653356, "learning_rate": 1.798e-06, "loss": 2.454, "step": 900 }, { "epoch": 0.018, "eval_loss": 2.4681763648986816, "eval_runtime": 264.702, "eval_samples_per_second": 3.105, "eval_steps_per_second": 1.553, "step": 900 }, { "epoch": 0.0185, "grad_norm": 1.2815437998994337, "learning_rate": 1.8480000000000001e-06, "loss": 2.4571, "step": 925 }, { "epoch": 0.019, "grad_norm": 1.0902475329451575, "learning_rate": 1.898e-06, "loss": 2.451, "step": 950 }, { "epoch": 0.0195, "grad_norm": 1.1502696024965324, "learning_rate": 1.9480000000000002e-06, "loss": 2.4527, "step": 975 }, { "epoch": 0.02, "grad_norm": 1.2336661855806117, "learning_rate": 1.998e-06, "loss": 2.4496, "step": 1000 }, { "epoch": 0.02, "eval_loss": 2.463880777359009, "eval_runtime": 275.7426, "eval_samples_per_second": 2.981, "eval_steps_per_second": 1.491, "step": 1000 }, { "epoch": 0.0205, "grad_norm": 1.2680742209094296, "learning_rate": 2.048e-06, "loss": 2.4494, "step": 1025 }, { "epoch": 0.021, "grad_norm": 1.0341778808278126, "learning_rate": 2.098e-06, "loss": 2.4467, "step": 1050 }, { "epoch": 0.0215, "grad_norm": 0.9860490736001175, "learning_rate": 2.148e-06, "loss": 2.4473, "step": 1075 }, { "epoch": 0.022, "grad_norm": 0.9419267295275278, "learning_rate": 2.198e-06, "loss": 2.443, "step": 1100 }, { "epoch": 0.022, "eval_loss": 2.4598941802978516, "eval_runtime": 265.0502, "eval_samples_per_second": 3.101, "eval_steps_per_second": 1.551, "step": 1100 }, { "epoch": 0.0225, "grad_norm": 1.3280720471027394, "learning_rate": 2.2480000000000003e-06, "loss": 2.4515, "step": 1125 }, { "epoch": 0.023, "grad_norm": 1.053570785582915, "learning_rate": 2.2980000000000003e-06, "loss": 2.4396, "step": 1150 }, { "epoch": 0.0235, "grad_norm": 0.9108119839585552, "learning_rate": 2.3480000000000002e-06, "loss": 2.4442, "step": 1175 }, { "epoch": 0.024, "grad_norm": 1.0062346367900277, "learning_rate": 2.398e-06, "loss": 2.4443, "step": 1200 }, { "epoch": 0.024, "eval_loss": 2.456455945968628, "eval_runtime": 264.5888, "eval_samples_per_second": 3.107, "eval_steps_per_second": 1.553, "step": 1200 }, { "epoch": 0.0245, "grad_norm": 1.0264127705426926, "learning_rate": 2.448e-06, "loss": 2.4351, "step": 1225 }, { "epoch": 0.025, "grad_norm": 0.8015249588347212, "learning_rate": 2.498e-06, "loss": 2.4406, "step": 1250 }, { "epoch": 0.0255, "grad_norm": 1.1105649485540114, "learning_rate": 2.5480000000000004e-06, "loss": 2.4377, "step": 1275 }, { "epoch": 0.026, "grad_norm": 0.9701758426012801, "learning_rate": 2.598e-06, "loss": 2.4341, "step": 1300 }, { "epoch": 0.026, "eval_loss": 2.453026056289673, "eval_runtime": 264.7653, "eval_samples_per_second": 3.105, "eval_steps_per_second": 1.552, "step": 1300 }, { "epoch": 0.0265, "grad_norm": 0.9587254891845429, "learning_rate": 2.648e-06, "loss": 2.4303, "step": 1325 }, { "epoch": 0.027, "grad_norm": 0.8135883960763247, "learning_rate": 2.6980000000000003e-06, "loss": 2.4363, "step": 1350 }, { "epoch": 0.0275, "grad_norm": 0.9192860127847176, "learning_rate": 2.748e-06, "loss": 2.4257, "step": 1375 }, { "epoch": 0.028, "grad_norm": 0.947465928893444, "learning_rate": 2.798e-06, "loss": 2.4353, "step": 1400 }, { "epoch": 0.028, "eval_loss": 2.450345993041992, "eval_runtime": 265.6266, "eval_samples_per_second": 3.095, "eval_steps_per_second": 1.547, "step": 1400 }, { "epoch": 0.0285, "grad_norm": 0.9270137901066681, "learning_rate": 2.848e-06, "loss": 2.4347, "step": 1425 }, { "epoch": 0.029, "grad_norm": 0.8839980710491563, "learning_rate": 2.8980000000000005e-06, "loss": 2.4213, "step": 1450 }, { "epoch": 0.0295, "grad_norm": 0.913196005454606, "learning_rate": 2.9480000000000004e-06, "loss": 2.4232, "step": 1475 }, { "epoch": 0.03, "grad_norm": 0.8139623858623861, "learning_rate": 2.9980000000000003e-06, "loss": 2.4254, "step": 1500 }, { "epoch": 0.03, "eval_loss": 2.447662830352783, "eval_runtime": 263.4353, "eval_samples_per_second": 3.12, "eval_steps_per_second": 1.56, "step": 1500 }, { "epoch": 0.0305, "grad_norm": 0.8422198221554755, "learning_rate": 3.0480000000000003e-06, "loss": 2.4196, "step": 1525 }, { "epoch": 0.031, "grad_norm": 0.8542957579365906, "learning_rate": 3.0980000000000007e-06, "loss": 2.4294, "step": 1550 }, { "epoch": 0.0315, "grad_norm": 1.149263137594797, "learning_rate": 3.1480000000000006e-06, "loss": 2.4265, "step": 1575 }, { "epoch": 0.032, "grad_norm": 0.811470126240392, "learning_rate": 3.198e-06, "loss": 2.4105, "step": 1600 }, { "epoch": 0.032, "eval_loss": 2.4456679821014404, "eval_runtime": 264.056, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.556, "step": 1600 }, { "epoch": 0.0325, "grad_norm": 2.3928975221881434, "learning_rate": 3.248e-06, "loss": 2.4208, "step": 1625 }, { "epoch": 0.033, "grad_norm": 0.8031315125360012, "learning_rate": 3.298e-06, "loss": 2.4224, "step": 1650 }, { "epoch": 0.0335, "grad_norm": 0.835567276692195, "learning_rate": 3.348e-06, "loss": 2.4188, "step": 1675 }, { "epoch": 0.034, "grad_norm": 0.8894325175719718, "learning_rate": 3.3980000000000003e-06, "loss": 2.4206, "step": 1700 }, { "epoch": 0.034, "eval_loss": 2.4437851905822754, "eval_runtime": 264.6455, "eval_samples_per_second": 3.106, "eval_steps_per_second": 1.553, "step": 1700 }, { "epoch": 0.0345, "grad_norm": 0.802724390649243, "learning_rate": 3.4480000000000003e-06, "loss": 2.4241, "step": 1725 }, { "epoch": 0.035, "grad_norm": 0.8206312612014312, "learning_rate": 3.4980000000000002e-06, "loss": 2.4157, "step": 1750 }, { "epoch": 0.0355, "grad_norm": 0.8653789917535344, "learning_rate": 3.548e-06, "loss": 2.412, "step": 1775 }, { "epoch": 0.036, "grad_norm": 0.7816319078215015, "learning_rate": 3.5980000000000005e-06, "loss": 2.4179, "step": 1800 }, { "epoch": 0.036, "eval_loss": 2.4423036575317383, "eval_runtime": 264.5578, "eval_samples_per_second": 3.107, "eval_steps_per_second": 1.554, "step": 1800 }, { "epoch": 0.0365, "grad_norm": 0.707594544466941, "learning_rate": 3.6480000000000005e-06, "loss": 2.416, "step": 1825 }, { "epoch": 0.037, "grad_norm": 0.7481066913011816, "learning_rate": 3.6980000000000004e-06, "loss": 2.4242, "step": 1850 }, { "epoch": 0.0375, "grad_norm": 0.7612014979445353, "learning_rate": 3.7480000000000004e-06, "loss": 2.4173, "step": 1875 }, { "epoch": 0.038, "grad_norm": 0.772750918048857, "learning_rate": 3.7980000000000007e-06, "loss": 2.4134, "step": 1900 }, { "epoch": 0.038, "eval_loss": 2.440969228744507, "eval_runtime": 274.3624, "eval_samples_per_second": 2.996, "eval_steps_per_second": 1.498, "step": 1900 }, { "epoch": 0.0385, "grad_norm": 0.7927966042188935, "learning_rate": 3.848e-06, "loss": 2.4131, "step": 1925 }, { "epoch": 0.039, "grad_norm": 0.7664274167276341, "learning_rate": 3.898e-06, "loss": 2.4133, "step": 1950 }, { "epoch": 0.0395, "grad_norm": 0.7038638213491795, "learning_rate": 3.948e-06, "loss": 2.4135, "step": 1975 }, { "epoch": 0.04, "grad_norm": 0.7231696877425319, "learning_rate": 3.9980000000000005e-06, "loss": 2.4169, "step": 2000 }, { "epoch": 0.04, "eval_loss": 2.439641237258911, "eval_runtime": 282.4449, "eval_samples_per_second": 2.91, "eval_steps_per_second": 1.455, "step": 2000 }, { "epoch": 0.0405, "grad_norm": 0.7184393791203537, "learning_rate": 4.048e-06, "loss": 2.4071, "step": 2025 }, { "epoch": 0.041, "grad_norm": 0.7366813467336683, "learning_rate": 4.098e-06, "loss": 2.4113, "step": 2050 }, { "epoch": 0.0415, "grad_norm": 0.7081408763220511, "learning_rate": 4.148000000000001e-06, "loss": 2.4168, "step": 2075 }, { "epoch": 0.042, "grad_norm": 0.6912835983850483, "learning_rate": 4.198e-06, "loss": 2.4105, "step": 2100 }, { "epoch": 0.042, "eval_loss": 2.438904047012329, "eval_runtime": 277.7481, "eval_samples_per_second": 2.96, "eval_steps_per_second": 1.48, "step": 2100 }, { "epoch": 0.0425, "grad_norm": 0.7745538733736145, "learning_rate": 4.248000000000001e-06, "loss": 2.4131, "step": 2125 }, { "epoch": 0.043, "grad_norm": 0.6897576190091962, "learning_rate": 4.298e-06, "loss": 2.4084, "step": 2150 }, { "epoch": 0.0435, "grad_norm": 0.7020994032566351, "learning_rate": 4.3480000000000006e-06, "loss": 2.4125, "step": 2175 }, { "epoch": 0.044, "grad_norm": 0.6668651869738377, "learning_rate": 4.398000000000001e-06, "loss": 2.4034, "step": 2200 }, { "epoch": 0.044, "eval_loss": 2.4380908012390137, "eval_runtime": 268.2252, "eval_samples_per_second": 3.065, "eval_steps_per_second": 1.532, "step": 2200 }, { "epoch": 0.0445, "grad_norm": 0.6547759047620061, "learning_rate": 4.4480000000000004e-06, "loss": 2.4099, "step": 2225 }, { "epoch": 0.045, "grad_norm": 0.6865815945777785, "learning_rate": 4.498e-06, "loss": 2.412, "step": 2250 }, { "epoch": 0.0455, "grad_norm": 0.6878267781655092, "learning_rate": 4.548e-06, "loss": 2.4137, "step": 2275 }, { "epoch": 0.046, "grad_norm": 0.8314813616644483, "learning_rate": 4.598e-06, "loss": 2.4097, "step": 2300 }, { "epoch": 0.046, "eval_loss": 2.4374496936798096, "eval_runtime": 263.1701, "eval_samples_per_second": 3.123, "eval_steps_per_second": 1.562, "step": 2300 }, { "epoch": 0.0465, "grad_norm": 0.6723966792931375, "learning_rate": 4.648e-06, "loss": 2.4051, "step": 2325 }, { "epoch": 0.047, "grad_norm": 0.7003756914046538, "learning_rate": 4.698000000000001e-06, "loss": 2.4032, "step": 2350 }, { "epoch": 0.0475, "grad_norm": 0.6747085415631567, "learning_rate": 4.748e-06, "loss": 2.4096, "step": 2375 }, { "epoch": 0.048, "grad_norm": 0.6571218540079207, "learning_rate": 4.7980000000000005e-06, "loss": 2.4165, "step": 2400 }, { "epoch": 0.048, "eval_loss": 2.4365923404693604, "eval_runtime": 264.2268, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.555, "step": 2400 }, { "epoch": 0.0485, "grad_norm": 0.7464314980483315, "learning_rate": 4.848000000000001e-06, "loss": 2.4098, "step": 2425 }, { "epoch": 0.049, "grad_norm": 0.6267266619200393, "learning_rate": 4.898e-06, "loss": 2.4019, "step": 2450 }, { "epoch": 0.0495, "grad_norm": 0.6650772680412506, "learning_rate": 4.948000000000001e-06, "loss": 2.405, "step": 2475 }, { "epoch": 0.05, "grad_norm": 0.7197173899674899, "learning_rate": 4.998e-06, "loss": 2.4095, "step": 2500 }, { "epoch": 0.05, "eval_loss": 2.4358348846435547, "eval_runtime": 266.7682, "eval_samples_per_second": 3.081, "eval_steps_per_second": 1.541, "step": 2500 }, { "epoch": 0.0505, "grad_norm": 0.6249572472256157, "learning_rate": 5.048000000000001e-06, "loss": 2.4058, "step": 2525 }, { "epoch": 0.051, "grad_norm": 0.7429228032719255, "learning_rate": 5.098000000000001e-06, "loss": 2.4084, "step": 2550 }, { "epoch": 0.0515, "grad_norm": 0.6320325962693778, "learning_rate": 5.1480000000000005e-06, "loss": 2.4015, "step": 2575 }, { "epoch": 0.052, "grad_norm": 0.672581755106835, "learning_rate": 5.198000000000001e-06, "loss": 2.4051, "step": 2600 }, { "epoch": 0.052, "eval_loss": 2.4351842403411865, "eval_runtime": 264.9149, "eval_samples_per_second": 3.103, "eval_steps_per_second": 1.551, "step": 2600 }, { "epoch": 0.0525, "grad_norm": 0.7086480776921088, "learning_rate": 5.248000000000001e-06, "loss": 2.3988, "step": 2625 }, { "epoch": 0.053, "grad_norm": 0.6774201154936552, "learning_rate": 5.298000000000001e-06, "loss": 2.394, "step": 2650 }, { "epoch": 0.0535, "grad_norm": 0.6661104910300973, "learning_rate": 5.348000000000001e-06, "loss": 2.4034, "step": 2675 }, { "epoch": 0.054, "grad_norm": 0.6224421593448741, "learning_rate": 5.398e-06, "loss": 2.3939, "step": 2700 }, { "epoch": 0.054, "eval_loss": 2.434826374053955, "eval_runtime": 264.1641, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.556, "step": 2700 }, { "epoch": 0.0545, "grad_norm": 0.6944661408419767, "learning_rate": 5.448e-06, "loss": 2.4064, "step": 2725 }, { "epoch": 0.055, "grad_norm": 0.6597297955298902, "learning_rate": 5.498e-06, "loss": 2.4051, "step": 2750 }, { "epoch": 0.0555, "grad_norm": 0.6526109506522182, "learning_rate": 5.548e-06, "loss": 2.4124, "step": 2775 }, { "epoch": 0.056, "grad_norm": 0.6528041780055424, "learning_rate": 5.5980000000000004e-06, "loss": 2.3979, "step": 2800 }, { "epoch": 0.056, "eval_loss": 2.4344167709350586, "eval_runtime": 264.2924, "eval_samples_per_second": 3.11, "eval_steps_per_second": 1.555, "step": 2800 }, { "epoch": 0.0565, "grad_norm": 0.7067565611523313, "learning_rate": 5.648e-06, "loss": 2.398, "step": 2825 }, { "epoch": 0.057, "grad_norm": 0.6416666495903947, "learning_rate": 5.698e-06, "loss": 2.3991, "step": 2850 }, { "epoch": 0.0575, "grad_norm": 0.6605105424774851, "learning_rate": 5.748e-06, "loss": 2.3962, "step": 2875 }, { "epoch": 0.058, "grad_norm": 0.6308761264530915, "learning_rate": 5.798e-06, "loss": 2.4058, "step": 2900 }, { "epoch": 0.058, "eval_loss": 2.434436082839966, "eval_runtime": 265.0112, "eval_samples_per_second": 3.102, "eval_steps_per_second": 1.551, "step": 2900 }, { "epoch": 0.0585, "grad_norm": 0.6363649329289001, "learning_rate": 5.848000000000001e-06, "loss": 2.3943, "step": 2925 }, { "epoch": 0.059, "grad_norm": 0.6147983139117156, "learning_rate": 5.898e-06, "loss": 2.3982, "step": 2950 }, { "epoch": 0.0595, "grad_norm": 0.611354772141602, "learning_rate": 5.9480000000000005e-06, "loss": 2.3921, "step": 2975 }, { "epoch": 0.06, "grad_norm": 0.6269054680170398, "learning_rate": 5.998000000000001e-06, "loss": 2.392, "step": 3000 }, { "epoch": 0.06, "eval_loss": 2.433990955352783, "eval_runtime": 264.2169, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.556, "step": 3000 }, { "epoch": 0.0605, "grad_norm": 0.6248207448228328, "learning_rate": 6.048e-06, "loss": 2.3858, "step": 3025 }, { "epoch": 0.061, "grad_norm": 0.6275258656299642, "learning_rate": 6.098000000000001e-06, "loss": 2.4015, "step": 3050 }, { "epoch": 0.0615, "grad_norm": 1.0457401571274152, "learning_rate": 6.148e-06, "loss": 2.3909, "step": 3075 }, { "epoch": 0.062, "grad_norm": 0.6551230863319748, "learning_rate": 6.198000000000001e-06, "loss": 2.3983, "step": 3100 }, { "epoch": 0.062, "eval_loss": 2.433279275894165, "eval_runtime": 264.1521, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.556, "step": 3100 }, { "epoch": 0.0625, "grad_norm": 0.6306746226297937, "learning_rate": 6.248000000000001e-06, "loss": 2.397, "step": 3125 }, { "epoch": 0.063, "grad_norm": 0.6299802316587856, "learning_rate": 6.2980000000000005e-06, "loss": 2.4018, "step": 3150 }, { "epoch": 0.0635, "grad_norm": 0.6265424590222634, "learning_rate": 6.348000000000001e-06, "loss": 2.4065, "step": 3175 }, { "epoch": 0.064, "grad_norm": 0.6717273211615455, "learning_rate": 6.398000000000001e-06, "loss": 2.3906, "step": 3200 }, { "epoch": 0.064, "eval_loss": 2.4333276748657227, "eval_runtime": 263.9592, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.557, "step": 3200 }, { "epoch": 0.0645, "grad_norm": 0.6159924635031793, "learning_rate": 6.448000000000001e-06, "loss": 2.3947, "step": 3225 }, { "epoch": 0.065, "grad_norm": 0.6124462043712093, "learning_rate": 6.498000000000001e-06, "loss": 2.3963, "step": 3250 }, { "epoch": 0.0655, "grad_norm": 0.6144378183602921, "learning_rate": 6.548000000000001e-06, "loss": 2.402, "step": 3275 }, { "epoch": 0.066, "grad_norm": 0.6295732934678283, "learning_rate": 6.598000000000001e-06, "loss": 2.3877, "step": 3300 }, { "epoch": 0.066, "eval_loss": 2.4331116676330566, "eval_runtime": 263.4524, "eval_samples_per_second": 3.12, "eval_steps_per_second": 1.56, "step": 3300 }, { "epoch": 0.0665, "grad_norm": 0.5938287129149346, "learning_rate": 6.648e-06, "loss": 2.389, "step": 3325 }, { "epoch": 0.067, "grad_norm": 0.6194783667871923, "learning_rate": 6.698e-06, "loss": 2.39, "step": 3350 }, { "epoch": 0.0675, "grad_norm": 0.60927231594853, "learning_rate": 6.7480000000000004e-06, "loss": 2.3968, "step": 3375 }, { "epoch": 0.068, "grad_norm": 0.6386175333576501, "learning_rate": 6.798e-06, "loss": 2.3861, "step": 3400 }, { "epoch": 0.068, "eval_loss": 2.4328911304473877, "eval_runtime": 264.2923, "eval_samples_per_second": 3.11, "eval_steps_per_second": 1.555, "step": 3400 }, { "epoch": 0.0685, "grad_norm": 0.6092295027577579, "learning_rate": 6.848e-06, "loss": 2.3827, "step": 3425 }, { "epoch": 0.069, "grad_norm": 0.5914846449422462, "learning_rate": 6.898e-06, "loss": 2.3894, "step": 3450 }, { "epoch": 0.0695, "grad_norm": 0.5927461214526666, "learning_rate": 6.948e-06, "loss": 2.3858, "step": 3475 }, { "epoch": 0.07, "grad_norm": 0.5992194088197265, "learning_rate": 6.998000000000001e-06, "loss": 2.3941, "step": 3500 }, { "epoch": 0.07, "eval_loss": 2.432774543762207, "eval_runtime": 263.8546, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.558, "step": 3500 }, { "epoch": 0.0705, "grad_norm": 0.6119297158568089, "learning_rate": 7.048e-06, "loss": 2.3897, "step": 3525 }, { "epoch": 0.071, "grad_norm": 0.6040666217758901, "learning_rate": 7.0980000000000005e-06, "loss": 2.3966, "step": 3550 }, { "epoch": 0.0715, "grad_norm": 0.6142925813030266, "learning_rate": 7.148000000000001e-06, "loss": 2.3953, "step": 3575 }, { "epoch": 0.072, "grad_norm": 0.5857079248330344, "learning_rate": 7.198e-06, "loss": 2.3854, "step": 3600 }, { "epoch": 0.072, "eval_loss": 2.432868719100952, "eval_runtime": 264.1849, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.556, "step": 3600 }, { "epoch": 0.0725, "grad_norm": 0.6075613052530382, "learning_rate": 7.248000000000001e-06, "loss": 2.3798, "step": 3625 }, { "epoch": 0.073, "grad_norm": 0.6146043204282547, "learning_rate": 7.298e-06, "loss": 2.3894, "step": 3650 }, { "epoch": 0.0735, "grad_norm": 0.613284002341936, "learning_rate": 7.348000000000001e-06, "loss": 2.3897, "step": 3675 }, { "epoch": 0.074, "grad_norm": 0.6694404263159593, "learning_rate": 7.398000000000001e-06, "loss": 2.3925, "step": 3700 }, { "epoch": 0.074, "eval_loss": 2.4324021339416504, "eval_runtime": 263.3107, "eval_samples_per_second": 3.122, "eval_steps_per_second": 1.561, "step": 3700 }, { "epoch": 0.0745, "grad_norm": 0.5756401973694445, "learning_rate": 7.4480000000000005e-06, "loss": 2.3894, "step": 3725 }, { "epoch": 0.075, "grad_norm": 0.5945783703417461, "learning_rate": 7.498000000000001e-06, "loss": 2.3928, "step": 3750 }, { "epoch": 0.0755, "grad_norm": 0.5935750222986942, "learning_rate": 7.548000000000001e-06, "loss": 2.3774, "step": 3775 }, { "epoch": 0.076, "grad_norm": 0.5938734543073783, "learning_rate": 7.598000000000001e-06, "loss": 2.3776, "step": 3800 }, { "epoch": 0.076, "eval_loss": 2.432751178741455, "eval_runtime": 263.8929, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.557, "step": 3800 }, { "epoch": 0.0765, "grad_norm": 0.595820899700728, "learning_rate": 7.648e-06, "loss": 2.3804, "step": 3825 }, { "epoch": 0.077, "grad_norm": 0.6079304106413467, "learning_rate": 7.698000000000002e-06, "loss": 2.3917, "step": 3850 }, { "epoch": 0.0775, "grad_norm": 0.6083448146618482, "learning_rate": 7.748000000000001e-06, "loss": 2.3842, "step": 3875 }, { "epoch": 0.078, "grad_norm": 0.6128893415605828, "learning_rate": 7.798e-06, "loss": 2.3806, "step": 3900 }, { "epoch": 0.078, "eval_loss": 2.4325239658355713, "eval_runtime": 263.6693, "eval_samples_per_second": 3.118, "eval_steps_per_second": 1.559, "step": 3900 }, { "epoch": 0.0785, "grad_norm": 0.6079041195191952, "learning_rate": 7.848000000000002e-06, "loss": 2.3801, "step": 3925 }, { "epoch": 0.079, "grad_norm": 0.6075689821557235, "learning_rate": 7.898e-06, "loss": 2.3797, "step": 3950 }, { "epoch": 0.0795, "grad_norm": 0.5882326737716994, "learning_rate": 7.948e-06, "loss": 2.3905, "step": 3975 }, { "epoch": 0.08, "grad_norm": 0.5828476462223788, "learning_rate": 7.998e-06, "loss": 2.3806, "step": 4000 }, { "epoch": 0.08, "eval_loss": 2.4323527812957764, "eval_runtime": 263.9786, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.557, "step": 4000 }, { "epoch": 0.0805, "grad_norm": 0.5907927035367586, "learning_rate": 8.048e-06, "loss": 2.3739, "step": 4025 }, { "epoch": 0.081, "grad_norm": 0.608189189988593, "learning_rate": 8.098000000000001e-06, "loss": 2.3837, "step": 4050 }, { "epoch": 0.0815, "grad_norm": 0.5933025642280234, "learning_rate": 8.148e-06, "loss": 2.3814, "step": 4075 }, { "epoch": 0.082, "grad_norm": 0.5898305070270532, "learning_rate": 8.198e-06, "loss": 2.3854, "step": 4100 }, { "epoch": 0.082, "eval_loss": 2.432577610015869, "eval_runtime": 264.0972, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.556, "step": 4100 }, { "epoch": 0.0825, "grad_norm": 0.5673002921483621, "learning_rate": 8.248e-06, "loss": 2.3827, "step": 4125 }, { "epoch": 0.083, "grad_norm": 0.5859186364996516, "learning_rate": 8.298000000000001e-06, "loss": 2.3859, "step": 4150 }, { "epoch": 0.0835, "grad_norm": 0.5852893491639726, "learning_rate": 8.348e-06, "loss": 2.3711, "step": 4175 }, { "epoch": 0.084, "grad_norm": 0.5704807601233864, "learning_rate": 8.398e-06, "loss": 2.3682, "step": 4200 }, { "epoch": 0.084, "eval_loss": 2.4325780868530273, "eval_runtime": 264.0677, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.556, "step": 4200 }, { "epoch": 0.0845, "grad_norm": 0.565873049775094, "learning_rate": 8.448000000000001e-06, "loss": 2.3894, "step": 4225 }, { "epoch": 0.085, "grad_norm": 0.6594348238393681, "learning_rate": 8.498e-06, "loss": 2.3736, "step": 4250 }, { "epoch": 0.0855, "grad_norm": 0.6114416993962639, "learning_rate": 8.548e-06, "loss": 2.3768, "step": 4275 }, { "epoch": 0.086, "grad_norm": 0.613007148558132, "learning_rate": 8.598000000000001e-06, "loss": 2.3841, "step": 4300 }, { "epoch": 0.086, "eval_loss": 2.432278633117676, "eval_runtime": 264.5455, "eval_samples_per_second": 3.107, "eval_steps_per_second": 1.554, "step": 4300 }, { "epoch": 0.0865, "grad_norm": 0.6316113111159283, "learning_rate": 8.648000000000001e-06, "loss": 2.3853, "step": 4325 }, { "epoch": 0.087, "grad_norm": 0.578758909498954, "learning_rate": 8.698e-06, "loss": 2.3838, "step": 4350 }, { "epoch": 0.0875, "grad_norm": 0.5663796780744771, "learning_rate": 8.748000000000002e-06, "loss": 2.3744, "step": 4375 }, { "epoch": 0.088, "grad_norm": 0.5996723194508057, "learning_rate": 8.798000000000001e-06, "loss": 2.3741, "step": 4400 }, { "epoch": 0.088, "eval_loss": 2.4327504634857178, "eval_runtime": 264.3839, "eval_samples_per_second": 3.109, "eval_steps_per_second": 1.555, "step": 4400 }, { "epoch": 0.0885, "grad_norm": 0.5903185672805589, "learning_rate": 8.848e-06, "loss": 2.3789, "step": 4425 }, { "epoch": 0.089, "grad_norm": 0.5683354037993711, "learning_rate": 8.898000000000002e-06, "loss": 2.3739, "step": 4450 }, { "epoch": 0.0895, "grad_norm": 0.5992802333814672, "learning_rate": 8.948000000000001e-06, "loss": 2.3805, "step": 4475 }, { "epoch": 0.09, "grad_norm": 0.5951158771681028, "learning_rate": 8.998000000000001e-06, "loss": 2.3702, "step": 4500 }, { "epoch": 0.09, "eval_loss": 2.432904005050659, "eval_runtime": 264.0927, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.556, "step": 4500 }, { "epoch": 0.0905, "grad_norm": 0.628437176595306, "learning_rate": 9.048e-06, "loss": 2.3705, "step": 4525 }, { "epoch": 0.091, "grad_norm": 0.5852194468933433, "learning_rate": 9.098000000000002e-06, "loss": 2.3726, "step": 4550 }, { "epoch": 0.0915, "grad_norm": 0.5832814461503186, "learning_rate": 9.148e-06, "loss": 2.3709, "step": 4575 }, { "epoch": 0.092, "grad_norm": 0.6235298544634128, "learning_rate": 9.198e-06, "loss": 2.3823, "step": 4600 }, { "epoch": 0.092, "eval_loss": 2.433288335800171, "eval_runtime": 264.0394, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.557, "step": 4600 }, { "epoch": 0.0925, "grad_norm": 0.6097464410099737, "learning_rate": 9.248e-06, "loss": 2.3715, "step": 4625 }, { "epoch": 0.093, "grad_norm": 0.5830918527201829, "learning_rate": 9.298e-06, "loss": 2.3694, "step": 4650 }, { "epoch": 0.0935, "grad_norm": 0.6195865573807103, "learning_rate": 9.348000000000001e-06, "loss": 2.3711, "step": 4675 }, { "epoch": 0.094, "grad_norm": 0.5922485886549429, "learning_rate": 9.398e-06, "loss": 2.3764, "step": 4700 }, { "epoch": 0.094, "eval_loss": 2.4330477714538574, "eval_runtime": 263.7501, "eval_samples_per_second": 3.117, "eval_steps_per_second": 1.558, "step": 4700 }, { "epoch": 0.0945, "grad_norm": 0.5909566806378528, "learning_rate": 9.448e-06, "loss": 2.3799, "step": 4725 }, { "epoch": 0.095, "grad_norm": 0.5872189964007283, "learning_rate": 9.498000000000001e-06, "loss": 2.3737, "step": 4750 }, { "epoch": 0.0955, "grad_norm": 0.6071714619656263, "learning_rate": 9.548e-06, "loss": 2.3789, "step": 4775 }, { "epoch": 0.096, "grad_norm": 0.5631342344537085, "learning_rate": 9.598e-06, "loss": 2.3641, "step": 4800 }, { "epoch": 0.096, "eval_loss": 2.4332797527313232, "eval_runtime": 264.5164, "eval_samples_per_second": 3.108, "eval_steps_per_second": 1.554, "step": 4800 }, { "epoch": 0.0965, "grad_norm": 0.600707218384485, "learning_rate": 9.648000000000001e-06, "loss": 2.3715, "step": 4825 }, { "epoch": 0.097, "grad_norm": 0.5705494762785608, "learning_rate": 9.698000000000001e-06, "loss": 2.3741, "step": 4850 }, { "epoch": 0.0975, "grad_norm": 0.5891811727113021, "learning_rate": 9.748e-06, "loss": 2.3738, "step": 4875 }, { "epoch": 0.098, "grad_norm": 0.5947555260131183, "learning_rate": 9.798e-06, "loss": 2.365, "step": 4900 }, { "epoch": 0.098, "eval_loss": 2.433032751083374, "eval_runtime": 264.6355, "eval_samples_per_second": 3.106, "eval_steps_per_second": 1.553, "step": 4900 }, { "epoch": 0.0985, "grad_norm": 0.6055417663185935, "learning_rate": 9.848000000000001e-06, "loss": 2.3677, "step": 4925 }, { "epoch": 0.099, "grad_norm": 0.5803464068069174, "learning_rate": 9.898e-06, "loss": 2.3699, "step": 4950 }, { "epoch": 0.0995, "grad_norm": 0.5899201870269601, "learning_rate": 9.948e-06, "loss": 2.3685, "step": 4975 }, { "epoch": 0.1, "grad_norm": 0.6226759838202708, "learning_rate": 9.998000000000002e-06, "loss": 2.3599, "step": 5000 }, { "epoch": 0.1, "eval_loss": 2.433412551879883, "eval_runtime": 279.6783, "eval_samples_per_second": 2.939, "eval_steps_per_second": 1.47, "step": 5000 }, { "epoch": 0.1005, "grad_norm": 0.6129345554278736, "learning_rate": 9.994666666666668e-06, "loss": 2.3651, "step": 5025 }, { "epoch": 0.101, "grad_norm": 0.5783687106202524, "learning_rate": 9.989111111111111e-06, "loss": 2.3635, "step": 5050 }, { "epoch": 0.1015, "grad_norm": 0.7886759246703615, "learning_rate": 9.983555555555556e-06, "loss": 2.3688, "step": 5075 }, { "epoch": 0.102, "grad_norm": 0.5496276670344779, "learning_rate": 9.978000000000002e-06, "loss": 2.3718, "step": 5100 }, { "epoch": 0.102, "eval_loss": 2.4336636066436768, "eval_runtime": 264.0531, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.557, "step": 5100 }, { "epoch": 0.1025, "grad_norm": 0.596488402670124, "learning_rate": 9.972444444444445e-06, "loss": 2.3654, "step": 5125 }, { "epoch": 0.103, "grad_norm": 0.5758952191659142, "learning_rate": 9.966888888888889e-06, "loss": 2.3662, "step": 5150 }, { "epoch": 0.1035, "grad_norm": 0.5714325894660194, "learning_rate": 9.961333333333334e-06, "loss": 2.3671, "step": 5175 }, { "epoch": 0.104, "grad_norm": 0.5826964477363549, "learning_rate": 9.95577777777778e-06, "loss": 2.3621, "step": 5200 }, { "epoch": 0.104, "eval_loss": 2.433170795440674, "eval_runtime": 263.4913, "eval_samples_per_second": 3.12, "eval_steps_per_second": 1.56, "step": 5200 }, { "epoch": 0.1045, "grad_norm": 0.5939017286545814, "learning_rate": 9.950222222222223e-06, "loss": 2.3704, "step": 5225 }, { "epoch": 0.105, "grad_norm": 0.5916137818576529, "learning_rate": 9.944666666666668e-06, "loss": 2.3662, "step": 5250 }, { "epoch": 0.1055, "grad_norm": 0.6105360548349205, "learning_rate": 9.939111111111112e-06, "loss": 2.3646, "step": 5275 }, { "epoch": 0.106, "grad_norm": 0.5821955662592928, "learning_rate": 9.933555555555557e-06, "loss": 2.365, "step": 5300 }, { "epoch": 0.106, "eval_loss": 2.4327642917633057, "eval_runtime": 263.745, "eval_samples_per_second": 3.117, "eval_steps_per_second": 1.558, "step": 5300 }, { "epoch": 0.1065, "grad_norm": 0.5805717889494187, "learning_rate": 9.928e-06, "loss": 2.364, "step": 5325 }, { "epoch": 0.107, "grad_norm": 0.5876895049794754, "learning_rate": 9.922444444444446e-06, "loss": 2.362, "step": 5350 }, { "epoch": 0.1075, "grad_norm": 0.6258383766876349, "learning_rate": 9.91688888888889e-06, "loss": 2.3654, "step": 5375 }, { "epoch": 0.108, "grad_norm": 0.5963835367877209, "learning_rate": 9.911333333333335e-06, "loss": 2.3627, "step": 5400 }, { "epoch": 0.108, "eval_loss": 2.4326930046081543, "eval_runtime": 263.2366, "eval_samples_per_second": 3.123, "eval_steps_per_second": 1.561, "step": 5400 }, { "epoch": 0.1085, "grad_norm": 0.5827253994353866, "learning_rate": 9.905777777777778e-06, "loss": 2.3703, "step": 5425 }, { "epoch": 0.109, "grad_norm": 0.571031920084426, "learning_rate": 9.900222222222223e-06, "loss": 2.3671, "step": 5450 }, { "epoch": 0.1095, "grad_norm": 0.599548806743577, "learning_rate": 9.894666666666669e-06, "loss": 2.362, "step": 5475 }, { "epoch": 0.11, "grad_norm": 0.5736311725646083, "learning_rate": 9.889111111111112e-06, "loss": 2.3622, "step": 5500 }, { "epoch": 0.11, "eval_loss": 2.4330084323883057, "eval_runtime": 264.1044, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.556, "step": 5500 }, { "epoch": 0.1105, "grad_norm": 0.6098672058792028, "learning_rate": 9.883555555555556e-06, "loss": 2.3705, "step": 5525 }, { "epoch": 0.111, "grad_norm": 0.5761728375832208, "learning_rate": 9.878000000000001e-06, "loss": 2.3608, "step": 5550 }, { "epoch": 0.1115, "grad_norm": 0.5922504560114277, "learning_rate": 9.872444444444446e-06, "loss": 2.3542, "step": 5575 }, { "epoch": 0.112, "grad_norm": 0.5668795024079605, "learning_rate": 9.86688888888889e-06, "loss": 2.3623, "step": 5600 }, { "epoch": 0.112, "eval_loss": 2.432955503463745, "eval_runtime": 263.8097, "eval_samples_per_second": 3.116, "eval_steps_per_second": 1.558, "step": 5600 }, { "epoch": 0.1125, "grad_norm": 0.5697809034851604, "learning_rate": 9.861333333333333e-06, "loss": 2.3541, "step": 5625 }, { "epoch": 0.113, "grad_norm": 0.5740407982821335, "learning_rate": 9.855777777777779e-06, "loss": 2.3594, "step": 5650 }, { "epoch": 0.1135, "grad_norm": 0.5697372211616294, "learning_rate": 9.850222222222224e-06, "loss": 2.3592, "step": 5675 }, { "epoch": 0.114, "grad_norm": 0.5845230307189324, "learning_rate": 9.844666666666667e-06, "loss": 2.3456, "step": 5700 }, { "epoch": 0.114, "eval_loss": 2.432389974594116, "eval_runtime": 263.8043, "eval_samples_per_second": 3.116, "eval_steps_per_second": 1.558, "step": 5700 }, { "epoch": 0.1145, "grad_norm": 0.5677067211464538, "learning_rate": 9.839111111111111e-06, "loss": 2.3581, "step": 5725 }, { "epoch": 0.115, "grad_norm": 0.6024564908699644, "learning_rate": 9.833555555555556e-06, "loss": 2.359, "step": 5750 }, { "epoch": 0.1155, "grad_norm": 0.5789830837760237, "learning_rate": 9.828000000000001e-06, "loss": 2.36, "step": 5775 }, { "epoch": 0.116, "grad_norm": 0.5912805339254935, "learning_rate": 9.822444444444445e-06, "loss": 2.3588, "step": 5800 }, { "epoch": 0.116, "eval_loss": 2.432565689086914, "eval_runtime": 263.3515, "eval_samples_per_second": 3.121, "eval_steps_per_second": 1.561, "step": 5800 }, { "epoch": 0.1165, "grad_norm": 0.5647440650976697, "learning_rate": 9.81688888888889e-06, "loss": 2.3576, "step": 5825 }, { "epoch": 0.117, "grad_norm": 0.5673458673735715, "learning_rate": 9.811333333333334e-06, "loss": 2.3616, "step": 5850 }, { "epoch": 0.1175, "grad_norm": 0.6030082642745155, "learning_rate": 9.805777777777779e-06, "loss": 2.3556, "step": 5875 }, { "epoch": 0.118, "grad_norm": 0.5571893163840321, "learning_rate": 9.800222222222223e-06, "loss": 2.3557, "step": 5900 }, { "epoch": 0.118, "eval_loss": 2.4327075481414795, "eval_runtime": 263.2657, "eval_samples_per_second": 3.122, "eval_steps_per_second": 1.561, "step": 5900 }, { "epoch": 0.1185, "grad_norm": 0.5716010515949606, "learning_rate": 9.794666666666668e-06, "loss": 2.3616, "step": 5925 }, { "epoch": 0.119, "grad_norm": 0.6245053681878497, "learning_rate": 9.789111111111111e-06, "loss": 2.358, "step": 5950 }, { "epoch": 0.1195, "grad_norm": 0.5896528100704728, "learning_rate": 9.783555555555557e-06, "loss": 2.355, "step": 5975 }, { "epoch": 0.12, "grad_norm": 0.5534590488643797, "learning_rate": 9.778e-06, "loss": 2.3567, "step": 6000 }, { "epoch": 0.12, "eval_loss": 2.4327354431152344, "eval_runtime": 263.9156, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.557, "step": 6000 }, { "epoch": 0.1205, "grad_norm": 0.5779403883996491, "learning_rate": 9.772444444444445e-06, "loss": 2.3487, "step": 6025 }, { "epoch": 0.121, "grad_norm": 0.5693494880188505, "learning_rate": 9.76688888888889e-06, "loss": 2.3506, "step": 6050 }, { "epoch": 0.1215, "grad_norm": 0.5864069751838692, "learning_rate": 9.761333333333334e-06, "loss": 2.3498, "step": 6075 }, { "epoch": 0.122, "grad_norm": 0.5930208676954954, "learning_rate": 9.755777777777778e-06, "loss": 2.3508, "step": 6100 }, { "epoch": 0.122, "eval_loss": 2.432914972305298, "eval_runtime": 263.746, "eval_samples_per_second": 3.117, "eval_steps_per_second": 1.558, "step": 6100 }, { "epoch": 0.1225, "grad_norm": 0.5967532601446782, "learning_rate": 9.750222222222223e-06, "loss": 2.3584, "step": 6125 }, { "epoch": 0.123, "grad_norm": 0.5670429310236035, "learning_rate": 9.744666666666668e-06, "loss": 2.3584, "step": 6150 }, { "epoch": 0.1235, "grad_norm": 0.5744482242457726, "learning_rate": 9.739111111111112e-06, "loss": 2.351, "step": 6175 }, { "epoch": 0.124, "grad_norm": 0.6029007635970692, "learning_rate": 9.733555555555555e-06, "loss": 2.3494, "step": 6200 }, { "epoch": 0.124, "eval_loss": 2.432878255844116, "eval_runtime": 263.5842, "eval_samples_per_second": 3.119, "eval_steps_per_second": 1.559, "step": 6200 }, { "epoch": 0.1245, "grad_norm": 0.564399310279196, "learning_rate": 9.728e-06, "loss": 2.3595, "step": 6225 }, { "epoch": 0.125, "grad_norm": 0.6065670221926927, "learning_rate": 9.722444444444446e-06, "loss": 2.3547, "step": 6250 }, { "epoch": 0.1255, "grad_norm": 0.5659801132085207, "learning_rate": 9.71688888888889e-06, "loss": 2.3511, "step": 6275 }, { "epoch": 0.126, "grad_norm": 0.5837628069797915, "learning_rate": 9.711333333333333e-06, "loss": 2.3575, "step": 6300 }, { "epoch": 0.126, "eval_loss": 2.4329097270965576, "eval_runtime": 264.6192, "eval_samples_per_second": 3.106, "eval_steps_per_second": 1.553, "step": 6300 }, { "epoch": 0.1265, "grad_norm": 0.5760319910919499, "learning_rate": 9.705777777777778e-06, "loss": 2.3488, "step": 6325 }, { "epoch": 0.127, "grad_norm": 0.5761318046315628, "learning_rate": 9.700222222222224e-06, "loss": 2.3435, "step": 6350 }, { "epoch": 0.1275, "grad_norm": 0.5609369346838009, "learning_rate": 9.694666666666667e-06, "loss": 2.347, "step": 6375 }, { "epoch": 0.128, "grad_norm": 0.5954461846572633, "learning_rate": 9.68911111111111e-06, "loss": 2.3485, "step": 6400 }, { "epoch": 0.128, "eval_loss": 2.4333934783935547, "eval_runtime": 263.5903, "eval_samples_per_second": 3.118, "eval_steps_per_second": 1.559, "step": 6400 }, { "epoch": 0.1285, "grad_norm": 0.5524126786458765, "learning_rate": 9.683555555555556e-06, "loss": 2.3514, "step": 6425 }, { "epoch": 0.129, "grad_norm": 0.5590067107241867, "learning_rate": 9.678000000000001e-06, "loss": 2.3477, "step": 6450 }, { "epoch": 0.1295, "grad_norm": 0.5578028236930622, "learning_rate": 9.672444444444445e-06, "loss": 2.3434, "step": 6475 }, { "epoch": 0.13, "grad_norm": 0.6002389478119885, "learning_rate": 9.66688888888889e-06, "loss": 2.3415, "step": 6500 }, { "epoch": 0.13, "eval_loss": 2.433302164077759, "eval_runtime": 263.4334, "eval_samples_per_second": 3.12, "eval_steps_per_second": 1.56, "step": 6500 }, { "epoch": 0.1305, "grad_norm": 0.5868647352323021, "learning_rate": 9.661333333333334e-06, "loss": 2.3532, "step": 6525 }, { "epoch": 0.131, "grad_norm": 0.5525203092071236, "learning_rate": 9.655777777777779e-06, "loss": 2.3439, "step": 6550 }, { "epoch": 0.1315, "grad_norm": 0.642282300647443, "learning_rate": 9.650222222222222e-06, "loss": 2.333, "step": 6575 }, { "epoch": 0.132, "grad_norm": 0.5954691746571129, "learning_rate": 9.644666666666668e-06, "loss": 2.3371, "step": 6600 }, { "epoch": 0.132, "eval_loss": 2.4332070350646973, "eval_runtime": 263.9928, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.557, "step": 6600 }, { "epoch": 0.1325, "grad_norm": 0.5696322215994257, "learning_rate": 9.639111111111113e-06, "loss": 2.3568, "step": 6625 }, { "epoch": 0.133, "grad_norm": 0.569783318316734, "learning_rate": 9.633555555555556e-06, "loss": 2.3468, "step": 6650 }, { "epoch": 0.1335, "grad_norm": 0.5974477984803339, "learning_rate": 9.628e-06, "loss": 2.3369, "step": 6675 }, { "epoch": 0.134, "grad_norm": 0.5850514409957908, "learning_rate": 9.622444444444445e-06, "loss": 2.3328, "step": 6700 }, { "epoch": 0.134, "eval_loss": 2.4336042404174805, "eval_runtime": 264.1653, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.556, "step": 6700 }, { "epoch": 0.1345, "grad_norm": 0.5598567946533984, "learning_rate": 9.61688888888889e-06, "loss": 2.3505, "step": 6725 }, { "epoch": 0.135, "grad_norm": 0.564538169627995, "learning_rate": 9.611333333333334e-06, "loss": 2.3512, "step": 6750 }, { "epoch": 0.1355, "grad_norm": 0.555057205811747, "learning_rate": 9.605777777777778e-06, "loss": 2.3441, "step": 6775 }, { "epoch": 0.136, "grad_norm": 0.5928392878820046, "learning_rate": 9.600222222222223e-06, "loss": 2.342, "step": 6800 }, { "epoch": 0.136, "eval_loss": 2.4332380294799805, "eval_runtime": 263.6981, "eval_samples_per_second": 3.117, "eval_steps_per_second": 1.559, "step": 6800 }, { "epoch": 0.1365, "grad_norm": 0.580747535991996, "learning_rate": 9.594666666666668e-06, "loss": 2.3402, "step": 6825 }, { "epoch": 0.137, "grad_norm": 0.5361093856752921, "learning_rate": 9.589111111111112e-06, "loss": 2.3345, "step": 6850 }, { "epoch": 0.1375, "grad_norm": 0.5764684974648585, "learning_rate": 9.583555555555555e-06, "loss": 2.3434, "step": 6875 }, { "epoch": 0.138, "grad_norm": 0.5695437902803252, "learning_rate": 9.578e-06, "loss": 2.3345, "step": 6900 }, { "epoch": 0.138, "eval_loss": 2.4334897994995117, "eval_runtime": 263.9042, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.557, "step": 6900 }, { "epoch": 0.1385, "grad_norm": 0.5856816810807355, "learning_rate": 9.572444444444446e-06, "loss": 2.3344, "step": 6925 }, { "epoch": 0.139, "grad_norm": 0.5692161417871612, "learning_rate": 9.56688888888889e-06, "loss": 2.3492, "step": 6950 }, { "epoch": 0.1395, "grad_norm": 0.5782790626699041, "learning_rate": 9.561333333333333e-06, "loss": 2.3343, "step": 6975 }, { "epoch": 0.14, "grad_norm": 0.5592348825440727, "learning_rate": 9.555777777777778e-06, "loss": 2.3361, "step": 7000 }, { "epoch": 0.14, "eval_loss": 2.4338128566741943, "eval_runtime": 264.0278, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.557, "step": 7000 }, { "epoch": 0.1405, "grad_norm": 0.5810855929853301, "learning_rate": 9.550222222222223e-06, "loss": 2.3397, "step": 7025 }, { "epoch": 0.141, "grad_norm": 0.5672444444354668, "learning_rate": 9.544666666666667e-06, "loss": 2.3384, "step": 7050 }, { "epoch": 0.1415, "grad_norm": 0.649461804794621, "learning_rate": 9.539111111111112e-06, "loss": 2.3384, "step": 7075 }, { "epoch": 0.142, "grad_norm": 0.5697893925017475, "learning_rate": 9.533555555555556e-06, "loss": 2.3415, "step": 7100 }, { "epoch": 0.142, "eval_loss": 2.4329330921173096, "eval_runtime": 263.8408, "eval_samples_per_second": 3.116, "eval_steps_per_second": 1.558, "step": 7100 }, { "epoch": 0.1425, "grad_norm": 0.562192662676289, "learning_rate": 9.528000000000001e-06, "loss": 2.3381, "step": 7125 }, { "epoch": 0.143, "grad_norm": 0.5782927675061864, "learning_rate": 9.522444444444444e-06, "loss": 2.3316, "step": 7150 }, { "epoch": 0.1435, "grad_norm": 0.5470889439002048, "learning_rate": 9.51688888888889e-06, "loss": 2.3336, "step": 7175 }, { "epoch": 0.144, "grad_norm": 0.5732687375919955, "learning_rate": 9.511333333333335e-06, "loss": 2.3302, "step": 7200 }, { "epoch": 0.144, "eval_loss": 2.4339091777801514, "eval_runtime": 265.4685, "eval_samples_per_second": 3.096, "eval_steps_per_second": 1.548, "step": 7200 }, { "epoch": 0.1445, "grad_norm": 0.5552677779418167, "learning_rate": 9.505777777777779e-06, "loss": 2.3382, "step": 7225 }, { "epoch": 0.145, "grad_norm": 0.5597695533114173, "learning_rate": 9.500222222222222e-06, "loss": 2.3281, "step": 7250 }, { "epoch": 0.1455, "grad_norm": 0.586047229250587, "learning_rate": 9.494666666666667e-06, "loss": 2.3365, "step": 7275 }, { "epoch": 0.146, "grad_norm": 0.5631697021330876, "learning_rate": 9.489111111111113e-06, "loss": 2.3434, "step": 7300 }, { "epoch": 0.146, "eval_loss": 2.4337289333343506, "eval_runtime": 264.0121, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.557, "step": 7300 }, { "epoch": 0.1465, "grad_norm": 0.5787283610065107, "learning_rate": 9.483555555555556e-06, "loss": 2.3385, "step": 7325 }, { "epoch": 0.147, "grad_norm": 0.5894250508009748, "learning_rate": 9.478e-06, "loss": 2.3289, "step": 7350 }, { "epoch": 0.1475, "grad_norm": 0.5698558287850775, "learning_rate": 9.472444444444445e-06, "loss": 2.3363, "step": 7375 }, { "epoch": 0.148, "grad_norm": 0.5704695535231787, "learning_rate": 9.46688888888889e-06, "loss": 2.3245, "step": 7400 }, { "epoch": 0.148, "eval_loss": 2.4338371753692627, "eval_runtime": 264.1068, "eval_samples_per_second": 3.112, "eval_steps_per_second": 1.556, "step": 7400 }, { "epoch": 0.1485, "grad_norm": 0.5452782996001769, "learning_rate": 9.461333333333334e-06, "loss": 2.3442, "step": 7425 }, { "epoch": 0.149, "grad_norm": 0.5741037001956839, "learning_rate": 9.455777777777777e-06, "loss": 2.3349, "step": 7450 }, { "epoch": 0.1495, "grad_norm": 0.5570524045425876, "learning_rate": 9.450222222222223e-06, "loss": 2.3324, "step": 7475 }, { "epoch": 0.15, "grad_norm": 0.5701333037498688, "learning_rate": 9.444666666666668e-06, "loss": 2.3268, "step": 7500 }, { "epoch": 0.15, "eval_loss": 2.4347753524780273, "eval_runtime": 264.1822, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.556, "step": 7500 }, { "epoch": 0.1505, "grad_norm": 0.5636194713998469, "learning_rate": 9.439111111111111e-06, "loss": 2.3324, "step": 7525 }, { "epoch": 0.151, "grad_norm": 0.5745462812172999, "learning_rate": 9.433555555555557e-06, "loss": 2.3438, "step": 7550 }, { "epoch": 0.1515, "grad_norm": 0.5658180287749817, "learning_rate": 9.428e-06, "loss": 2.3272, "step": 7575 }, { "epoch": 0.152, "grad_norm": 0.5590021944536283, "learning_rate": 9.422444444444445e-06, "loss": 2.3379, "step": 7600 }, { "epoch": 0.152, "eval_loss": 2.43342924118042, "eval_runtime": 264.6073, "eval_samples_per_second": 3.106, "eval_steps_per_second": 1.553, "step": 7600 }, { "epoch": 0.1525, "grad_norm": 0.5756847823781959, "learning_rate": 9.41688888888889e-06, "loss": 2.3291, "step": 7625 }, { "epoch": 0.153, "grad_norm": 0.5614727649452073, "learning_rate": 9.411333333333334e-06, "loss": 2.3164, "step": 7650 }, { "epoch": 0.1535, "grad_norm": 0.581410678990456, "learning_rate": 9.405777777777778e-06, "loss": 2.3205, "step": 7675 }, { "epoch": 0.154, "grad_norm": 0.6063515370764081, "learning_rate": 9.400222222222223e-06, "loss": 2.3331, "step": 7700 }, { "epoch": 0.154, "eval_loss": 2.435711622238159, "eval_runtime": 283.6724, "eval_samples_per_second": 2.898, "eval_steps_per_second": 1.449, "step": 7700 }, { "epoch": 0.1545, "grad_norm": 0.5535459156675728, "learning_rate": 9.394666666666668e-06, "loss": 2.3312, "step": 7725 }, { "epoch": 0.155, "grad_norm": 0.5550223235337549, "learning_rate": 9.389111111111112e-06, "loss": 2.3222, "step": 7750 }, { "epoch": 0.1555, "grad_norm": 0.5661396564004607, "learning_rate": 9.383555555555557e-06, "loss": 2.329, "step": 7775 }, { "epoch": 0.156, "grad_norm": 0.5754229466302317, "learning_rate": 9.378e-06, "loss": 2.3375, "step": 7800 }, { "epoch": 0.156, "eval_loss": 2.4339263439178467, "eval_runtime": 263.7245, "eval_samples_per_second": 3.117, "eval_steps_per_second": 1.558, "step": 7800 }, { "epoch": 0.1565, "grad_norm": 0.5922113870936093, "learning_rate": 9.372444444444446e-06, "loss": 2.3326, "step": 7825 }, { "epoch": 0.157, "grad_norm": 0.5802231546249389, "learning_rate": 9.36688888888889e-06, "loss": 2.3313, "step": 7850 }, { "epoch": 0.1575, "grad_norm": 0.5613750089293277, "learning_rate": 9.361333333333335e-06, "loss": 2.3306, "step": 7875 }, { "epoch": 0.158, "grad_norm": 0.5554952690049914, "learning_rate": 9.355777777777778e-06, "loss": 2.3307, "step": 7900 }, { "epoch": 0.158, "eval_loss": 2.435500144958496, "eval_runtime": 268.1064, "eval_samples_per_second": 3.066, "eval_steps_per_second": 1.533, "step": 7900 }, { "epoch": 0.1585, "grad_norm": 0.5699743157285643, "learning_rate": 9.350222222222224e-06, "loss": 2.3274, "step": 7925 }, { "epoch": 0.159, "grad_norm": 0.580771514541295, "learning_rate": 9.344666666666667e-06, "loss": 2.3238, "step": 7950 }, { "epoch": 0.1595, "grad_norm": 0.563419791930312, "learning_rate": 9.339111111111112e-06, "loss": 2.3384, "step": 7975 }, { "epoch": 0.16, "grad_norm": 0.5793778749938447, "learning_rate": 9.333555555555558e-06, "loss": 2.3291, "step": 8000 }, { "epoch": 0.16, "eval_loss": 2.4343531131744385, "eval_runtime": 263.9111, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.557, "step": 8000 }, { "epoch": 0.1605, "grad_norm": 0.5748501940226582, "learning_rate": 9.328000000000001e-06, "loss": 2.3272, "step": 8025 }, { "epoch": 0.161, "grad_norm": 0.5776520997935511, "learning_rate": 9.322444444444445e-06, "loss": 2.3232, "step": 8050 }, { "epoch": 0.1615, "grad_norm": 0.5841162716826148, "learning_rate": 9.31688888888889e-06, "loss": 2.3252, "step": 8075 }, { "epoch": 0.162, "grad_norm": 0.5582161918345583, "learning_rate": 9.311333333333335e-06, "loss": 2.3254, "step": 8100 }, { "epoch": 0.162, "eval_loss": 2.4345877170562744, "eval_runtime": 263.9792, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.557, "step": 8100 }, { "epoch": 0.1625, "grad_norm": 0.5744381110572562, "learning_rate": 9.305777777777779e-06, "loss": 2.325, "step": 8125 }, { "epoch": 0.163, "grad_norm": 0.5801402993634438, "learning_rate": 9.300222222222222e-06, "loss": 2.3203, "step": 8150 }, { "epoch": 0.1635, "grad_norm": 0.5644380448766211, "learning_rate": 9.294666666666668e-06, "loss": 2.3179, "step": 8175 }, { "epoch": 0.164, "grad_norm": 0.5747041663572834, "learning_rate": 9.289111111111113e-06, "loss": 2.3241, "step": 8200 }, { "epoch": 0.164, "eval_loss": 2.435701847076416, "eval_runtime": 263.9699, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.557, "step": 8200 }, { "epoch": 0.1645, "grad_norm": 0.5550631701119645, "learning_rate": 9.283555555555556e-06, "loss": 2.3176, "step": 8225 }, { "epoch": 0.165, "grad_norm": 0.5828828542252756, "learning_rate": 9.278e-06, "loss": 2.3213, "step": 8250 }, { "epoch": 0.1655, "grad_norm": 0.5610132600982978, "learning_rate": 9.272444444444445e-06, "loss": 2.3117, "step": 8275 }, { "epoch": 0.166, "grad_norm": 0.5777357931804634, "learning_rate": 9.26688888888889e-06, "loss": 2.3189, "step": 8300 }, { "epoch": 0.166, "eval_loss": 2.43573260307312, "eval_runtime": 264.2018, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.556, "step": 8300 }, { "epoch": 0.1665, "grad_norm": 0.5515402141694353, "learning_rate": 9.261333333333334e-06, "loss": 2.3267, "step": 8325 }, { "epoch": 0.167, "grad_norm": 0.588745393922677, "learning_rate": 9.25577777777778e-06, "loss": 2.3219, "step": 8350 }, { "epoch": 0.1675, "grad_norm": 0.5391388541771018, "learning_rate": 9.250222222222223e-06, "loss": 2.3181, "step": 8375 }, { "epoch": 0.168, "grad_norm": 0.5680296112961243, "learning_rate": 9.244666666666668e-06, "loss": 2.3231, "step": 8400 }, { "epoch": 0.168, "eval_loss": 2.435276985168457, "eval_runtime": 263.8428, "eval_samples_per_second": 3.115, "eval_steps_per_second": 1.558, "step": 8400 }, { "epoch": 0.1685, "grad_norm": 0.5655802530008279, "learning_rate": 9.239111111111112e-06, "loss": 2.3201, "step": 8425 }, { "epoch": 0.169, "grad_norm": 0.5917481613153034, "learning_rate": 9.233555555555557e-06, "loss": 2.3184, "step": 8450 }, { "epoch": 0.1695, "grad_norm": 0.5808853698441179, "learning_rate": 9.228e-06, "loss": 2.3151, "step": 8475 }, { "epoch": 0.17, "grad_norm": 0.5868551530423814, "learning_rate": 9.222444444444446e-06, "loss": 2.3146, "step": 8500 }, { "epoch": 0.17, "eval_loss": 2.435950994491577, "eval_runtime": 264.3586, "eval_samples_per_second": 3.109, "eval_steps_per_second": 1.555, "step": 8500 }, { "epoch": 0.1705, "grad_norm": 0.5638181149272796, "learning_rate": 9.21688888888889e-06, "loss": 2.3155, "step": 8525 }, { "epoch": 0.171, "grad_norm": 0.5740285526813199, "learning_rate": 9.211333333333334e-06, "loss": 2.319, "step": 8550 }, { "epoch": 0.1715, "grad_norm": 0.5695622395648989, "learning_rate": 9.20577777777778e-06, "loss": 2.3206, "step": 8575 }, { "epoch": 0.172, "grad_norm": 0.5747463636735414, "learning_rate": 9.200222222222223e-06, "loss": 2.3111, "step": 8600 }, { "epoch": 0.172, "eval_loss": 2.4367878437042236, "eval_runtime": 264.2061, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.556, "step": 8600 }, { "epoch": 0.1725, "grad_norm": 0.5777631704492084, "learning_rate": 9.194666666666667e-06, "loss": 2.3078, "step": 8625 }, { "epoch": 0.173, "grad_norm": 0.5746886517313039, "learning_rate": 9.189111111111112e-06, "loss": 2.3152, "step": 8650 }, { "epoch": 0.1735, "grad_norm": 0.564580351173264, "learning_rate": 9.183555555555557e-06, "loss": 2.316, "step": 8675 }, { "epoch": 0.174, "grad_norm": 0.6048784393681501, "learning_rate": 9.178000000000001e-06, "loss": 2.3251, "step": 8700 }, { "epoch": 0.174, "eval_loss": 2.435750722885132, "eval_runtime": 264.296, "eval_samples_per_second": 3.11, "eval_steps_per_second": 1.555, "step": 8700 }, { "epoch": 0.1745, "grad_norm": 0.5769443750882641, "learning_rate": 9.172444444444444e-06, "loss": 2.3186, "step": 8725 }, { "epoch": 0.175, "grad_norm": 0.5792202067037501, "learning_rate": 9.16688888888889e-06, "loss": 2.3106, "step": 8750 }, { "epoch": 0.1755, "grad_norm": 0.5819115394572557, "learning_rate": 9.161333333333335e-06, "loss": 2.3118, "step": 8775 }, { "epoch": 0.176, "grad_norm": 0.575657270210696, "learning_rate": 9.155777777777779e-06, "loss": 2.3106, "step": 8800 }, { "epoch": 0.176, "eval_loss": 2.436899185180664, "eval_runtime": 263.9579, "eval_samples_per_second": 3.114, "eval_steps_per_second": 1.557, "step": 8800 }, { "epoch": 0.1765, "grad_norm": 0.572118834452971, "learning_rate": 9.150222222222222e-06, "loss": 2.3139, "step": 8825 }, { "epoch": 0.177, "grad_norm": 0.5812618278818413, "learning_rate": 9.144666666666667e-06, "loss": 2.319, "step": 8850 }, { "epoch": 0.1775, "grad_norm": 0.5527533551295488, "learning_rate": 9.139111111111113e-06, "loss": 2.3152, "step": 8875 }, { "epoch": 0.178, "grad_norm": 0.5749551425231054, "learning_rate": 9.133555555555556e-06, "loss": 2.3065, "step": 8900 }, { "epoch": 0.178, "eval_loss": 2.4364571571350098, "eval_runtime": 264.0259, "eval_samples_per_second": 3.113, "eval_steps_per_second": 1.557, "step": 8900 }, { "epoch": 0.1785, "grad_norm": 0.5758182476998225, "learning_rate": 9.128e-06, "loss": 2.3104, "step": 8925 }, { "epoch": 0.179, "grad_norm": 0.5922756280220078, "learning_rate": 9.122444444444445e-06, "loss": 2.3158, "step": 8950 }, { "epoch": 0.1795, "grad_norm": 0.5943790910117238, "learning_rate": 9.11688888888889e-06, "loss": 2.3167, "step": 8975 }, { "epoch": 0.18, "grad_norm": 0.580613992072982, "learning_rate": 9.111333333333334e-06, "loss": 2.3069, "step": 9000 }, { "epoch": 0.18, "eval_loss": 2.436984062194824, "eval_runtime": 264.2235, "eval_samples_per_second": 3.111, "eval_steps_per_second": 1.556, "step": 9000 } ], "logging_steps": 25, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8648820684944835e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }