diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2533 +1,3253 @@ { - "best_global_step": 2700, - "best_metric": 2.4390792846679688, - "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-2000", - "epoch": 0.14, + "best_global_step": 4300, + "best_metric": 2.432278633117676, + "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-4000", + "epoch": 0.18, "eval_steps": 100, - "global_step": 7000, + "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, - "grad_norm": 50.656509992459746, + "grad_norm": 39.75564521032967, "learning_rate": 4.8e-08, - "loss": 4.0214, + "loss": 3.6517, "step": 25 }, { "epoch": 0.001, - "grad_norm": 38.99252800700454, + "grad_norm": 28.937531835097435, "learning_rate": 9.8e-08, - "loss": 3.935, + "loss": 3.5931, "step": 50 }, { "epoch": 0.0015, - "grad_norm": 21.472407776297636, + "grad_norm": 21.922720332659644, "learning_rate": 1.4800000000000003e-07, - "loss": 3.6524, + "loss": 3.3397, "step": 75 }, { "epoch": 0.002, - "grad_norm": 10.352839141449, + "grad_norm": 8.739610199908325, "learning_rate": 1.9800000000000003e-07, - "loss": 3.3517, + "loss": 3.1289, "step": 100 }, { "epoch": 0.002, - "eval_loss": 3.0817508697509766, - "eval_runtime": 266.4152, - "eval_samples_per_second": 3.085, - "eval_steps_per_second": 1.543, + "eval_loss": 2.9243295192718506, + "eval_runtime": 264.3302, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 1.555, "step": 100 }, { "epoch": 0.0025, - "grad_norm": 4.30904423901761, + "grad_norm": 4.433912600039677, "learning_rate": 2.48e-07, - "loss": 3.036, + "loss": 2.8957, "step": 125 }, { "epoch": 0.003, - "grad_norm": 2.264577383947395, + "grad_norm": 3.2874790066620303, "learning_rate": 2.9800000000000005e-07, - "loss": 2.8341, + "loss": 2.763, "step": 150 }, { "epoch": 0.0035, - "grad_norm": 1.4155513897039314, + "grad_norm": 1.5203472215469231, "learning_rate": 3.48e-07, - "loss": 2.73, + "loss": 2.676, "step": 175 }, { "epoch": 0.004, - "grad_norm": 1.2078696095638184, + "grad_norm": 1.1945541683905954, "learning_rate": 3.9800000000000004e-07, - "loss": 2.6731, + "loss": 2.635, "step": 200 }, { "epoch": 0.004, - "eval_loss": 2.635507345199585, - "eval_runtime": 266.4141, - "eval_samples_per_second": 3.085, - "eval_steps_per_second": 1.543, + "eval_loss": 2.6094932556152344, + "eval_runtime": 265.7702, + "eval_samples_per_second": 3.093, + "eval_steps_per_second": 1.546, "step": 200 }, { "epoch": 0.0045, - "grad_norm": 1.0846849213120675, + "grad_norm": 1.0852713304633745, "learning_rate": 4.4800000000000004e-07, - "loss": 2.6351, + "loss": 2.6016, "step": 225 }, { "epoch": 0.005, - "grad_norm": 0.9559789554521467, + "grad_norm": 1.0733940346699529, "learning_rate": 4.98e-07, - "loss": 2.601, + "loss": 2.5797, "step": 250 }, { "epoch": 0.0055, - "grad_norm": 1.0192810065788216, + "grad_norm": 0.9273949035031271, "learning_rate": 5.480000000000001e-07, - "loss": 2.5828, + "loss": 2.5607, "step": 275 }, { "epoch": 0.006, - "grad_norm": 1.1431990209968765, + "grad_norm": 0.9289300678591714, "learning_rate": 5.98e-07, - "loss": 2.5634, + "loss": 2.552, "step": 300 }, { "epoch": 0.006, - "eval_loss": 2.550046682357788, - "eval_runtime": 267.4638, - "eval_samples_per_second": 3.073, - "eval_steps_per_second": 1.537, + "eval_loss": 2.541522264480591, + "eval_runtime": 266.7478, + "eval_samples_per_second": 3.082, + "eval_steps_per_second": 1.541, "step": 300 }, { "epoch": 0.0065, - "grad_norm": 2.767331212303632, + "grad_norm": 1.1328584507449984, "learning_rate": 6.48e-07, - "loss": 2.5495, + "loss": 2.5402, "step": 325 }, { "epoch": 0.007, - "grad_norm": 2.515653095645461, + "grad_norm": 0.8593307029257858, "learning_rate": 6.98e-07, - "loss": 2.5327, + "loss": 2.5286, "step": 350 }, { "epoch": 0.0075, - "grad_norm": 3.1526237978853615, + "grad_norm": 0.895615604067586, "learning_rate": 7.480000000000001e-07, - "loss": 2.5176, + "loss": 2.5311, "step": 375 }, { "epoch": 0.008, - "grad_norm": 3.62440809116865, + "grad_norm": 0.912306580242149, "learning_rate": 7.98e-07, - "loss": 2.5127, + "loss": 2.5037, "step": 400 }, { "epoch": 0.008, - "eval_loss": 2.5152335166931152, - "eval_runtime": 266.9874, - "eval_samples_per_second": 3.079, - "eval_steps_per_second": 1.539, + "eval_loss": 2.514389991760254, + "eval_runtime": 266.4899, + "eval_samples_per_second": 3.085, + "eval_steps_per_second": 1.542, "step": 400 }, { "epoch": 0.0085, - "grad_norm": 7.135146349888136, + "grad_norm": 1.1866535514670034, "learning_rate": 8.480000000000001e-07, - "loss": 2.5162, + "loss": 2.5011, "step": 425 }, { "epoch": 0.009, - "grad_norm": 2.648120897857043, + "grad_norm": 1.211342504193914, "learning_rate": 8.980000000000001e-07, - "loss": 2.4957, + "loss": 2.503, "step": 450 }, { "epoch": 0.0095, - "grad_norm": 1.8002381671303913, + "grad_norm": 1.113763817383069, "learning_rate": 9.480000000000001e-07, - "loss": 2.4903, + "loss": 2.4999, "step": 475 }, { "epoch": 0.01, - "grad_norm": 4.371053421441641, + "grad_norm": 1.2585585589647226, "learning_rate": 9.98e-07, - "loss": 2.4832, + "loss": 2.4872, "step": 500 }, { "epoch": 0.01, - "eval_loss": 2.497570514678955, - "eval_runtime": 267.4804, - "eval_samples_per_second": 3.073, - "eval_steps_per_second": 1.537, + "eval_loss": 2.497868061065674, + "eval_runtime": 265.7962, + "eval_samples_per_second": 3.093, + "eval_steps_per_second": 1.546, "step": 500 }, { "epoch": 0.0105, - "grad_norm": 2.7367881121941138, + "grad_norm": 1.2585825718084245, "learning_rate": 1.0480000000000002e-06, - "loss": 2.4764, + "loss": 2.4852, "step": 525 }, { "epoch": 0.011, - "grad_norm": 2.6059320858334547, + "grad_norm": 1.4101257437846046, "learning_rate": 1.0980000000000001e-06, - "loss": 2.4817, + "loss": 2.4892, "step": 550 }, { "epoch": 0.0115, - "grad_norm": 1.4561576361319408, + "grad_norm": 1.1975234150707363, "learning_rate": 1.148e-06, - "loss": 2.4699, + "loss": 2.4861, "step": 575 }, { "epoch": 0.012, - "grad_norm": 1.6446545702256075, + "grad_norm": 1.3662769225582332, "learning_rate": 1.1980000000000002e-06, - "loss": 2.4625, + "loss": 2.4882, "step": 600 }, { "epoch": 0.012, - "eval_loss": 2.480875253677368, - "eval_runtime": 267.1762, - "eval_samples_per_second": 3.077, - "eval_steps_per_second": 1.538, + "eval_loss": 2.4879231452941895, + "eval_runtime": 267.0005, + "eval_samples_per_second": 3.079, + "eval_steps_per_second": 1.539, "step": 600 }, { "epoch": 0.0125, - "grad_norm": 1.8219267735301914, + "grad_norm": 1.3086724275194024, "learning_rate": 1.248e-06, - "loss": 2.4611, + "loss": 2.4745, "step": 625 }, { "epoch": 0.013, - "grad_norm": 1.1370020634221731, + "grad_norm": 1.317023206802888, "learning_rate": 1.2980000000000001e-06, - "loss": 2.466, + "loss": 2.4727, "step": 650 }, { "epoch": 0.0135, - "grad_norm": 2.1094427938442104, + "grad_norm": 1.5284967544483212, "learning_rate": 1.348e-06, - "loss": 2.4612, + "loss": 2.469, "step": 675 }, { "epoch": 0.014, - "grad_norm": 2.68194405349932, + "grad_norm": 1.1047595217316941, "learning_rate": 1.3980000000000002e-06, - "loss": 2.4628, + "loss": 2.4695, "step": 700 }, { "epoch": 0.014, - "eval_loss": 2.4722726345062256, - "eval_runtime": 268.4962, - "eval_samples_per_second": 3.061, - "eval_steps_per_second": 1.531, + "eval_loss": 2.480103015899658, + "eval_runtime": 263.5022, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 1.56, "step": 700 }, { "epoch": 0.0145, - "grad_norm": 2.035476406407236, + "grad_norm": 1.2077328209863791, "learning_rate": 1.4480000000000002e-06, - "loss": 2.4472, + "loss": 2.4654, "step": 725 }, { "epoch": 0.015, - "grad_norm": 3.271849266938815, + "grad_norm": 1.209220841771836, "learning_rate": 1.498e-06, - "loss": 2.449, + "loss": 2.4663, "step": 750 }, { "epoch": 0.0155, - "grad_norm": 0.9814118372932493, + "grad_norm": 1.3063169829879686, "learning_rate": 1.548e-06, - "loss": 2.453, + "loss": 2.4704, "step": 775 }, { "epoch": 0.016, - "grad_norm": 2.6810297902336244, + "grad_norm": 1.3180183352683195, "learning_rate": 1.5980000000000002e-06, - "loss": 2.4463, + "loss": 2.4583, "step": 800 }, { "epoch": 0.016, - "eval_loss": 2.4666495323181152, - "eval_runtime": 414.1548, - "eval_samples_per_second": 1.985, - "eval_steps_per_second": 0.992, + "eval_loss": 2.473590850830078, + "eval_runtime": 305.9875, + "eval_samples_per_second": 2.686, + "eval_steps_per_second": 1.343, "step": 800 }, { "epoch": 0.0165, - "grad_norm": 1.2790805248924313, + "grad_norm": 1.1674852380778837, "learning_rate": 1.6480000000000001e-06, - "loss": 2.4472, + "loss": 2.467, "step": 825 }, { "epoch": 0.017, - "grad_norm": 1.0740073529506808, + "grad_norm": 1.2497656349941002, "learning_rate": 1.6980000000000003e-06, - "loss": 2.452, + "loss": 2.4612, "step": 850 }, { "epoch": 0.0175, - "grad_norm": 1.551336448361318, + "grad_norm": 1.3358614980967494, "learning_rate": 1.7480000000000002e-06, - "loss": 2.4468, + "loss": 2.4636, "step": 875 }, { "epoch": 0.018, - "grad_norm": 2.3542378163349933, + "grad_norm": 1.252489857653356, "learning_rate": 1.798e-06, - "loss": 2.4436, + "loss": 2.454, "step": 900 }, { "epoch": 0.018, - "eval_loss": 2.461561441421509, - "eval_runtime": 580.7751, - "eval_samples_per_second": 1.415, - "eval_steps_per_second": 0.708, + "eval_loss": 2.4681763648986816, + "eval_runtime": 264.702, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 1.553, "step": 900 }, { "epoch": 0.0185, - "grad_norm": 2.0862642215782854, + "grad_norm": 1.2815437998994337, "learning_rate": 1.8480000000000001e-06, - "loss": 2.4469, + "loss": 2.4571, "step": 925 }, { "epoch": 0.019, - "grad_norm": 1.1621391054674521, + "grad_norm": 1.0902475329451575, "learning_rate": 1.898e-06, - "loss": 2.4419, + "loss": 2.451, "step": 950 }, { "epoch": 0.0195, - "grad_norm": 1.1717546484062549, + "grad_norm": 1.1502696024965324, "learning_rate": 1.9480000000000002e-06, - "loss": 2.4337, + "loss": 2.4527, "step": 975 }, { "epoch": 0.02, - "grad_norm": 1.7485677588723867, + "grad_norm": 1.2336661855806117, "learning_rate": 1.998e-06, - "loss": 2.4448, + "loss": 2.4496, "step": 1000 }, { "epoch": 0.02, - "eval_loss": 2.4590463638305664, - "eval_runtime": 271.0126, - "eval_samples_per_second": 3.033, - "eval_steps_per_second": 1.517, + "eval_loss": 2.463880777359009, + "eval_runtime": 275.7426, + "eval_samples_per_second": 2.981, + "eval_steps_per_second": 1.491, "step": 1000 }, { "epoch": 0.0205, - "grad_norm": 1.2350950769146356, + "grad_norm": 1.2680742209094296, "learning_rate": 2.048e-06, - "loss": 2.4331, + "loss": 2.4494, "step": 1025 }, { "epoch": 0.021, - "grad_norm": 1.4443797116679313, + "grad_norm": 1.0341778808278126, "learning_rate": 2.098e-06, - "loss": 2.4292, + "loss": 2.4467, "step": 1050 }, { "epoch": 0.0215, - "grad_norm": 1.8443466972633604, + "grad_norm": 0.9860490736001175, "learning_rate": 2.148e-06, - "loss": 2.4354, + "loss": 2.4473, "step": 1075 }, { "epoch": 0.022, - "grad_norm": 0.9668323863754179, + "grad_norm": 0.9419267295275278, "learning_rate": 2.198e-06, - "loss": 2.4351, + "loss": 2.443, "step": 1100 }, { "epoch": 0.022, - "eval_loss": 2.4556779861450195, - "eval_runtime": 272.0439, - "eval_samples_per_second": 3.022, - "eval_steps_per_second": 1.511, + "eval_loss": 2.4598941802978516, + "eval_runtime": 265.0502, + "eval_samples_per_second": 3.101, + "eval_steps_per_second": 1.551, "step": 1100 }, { "epoch": 0.0225, - "grad_norm": 1.7207654971672084, + "grad_norm": 1.3280720471027394, "learning_rate": 2.2480000000000003e-06, - "loss": 2.4313, + "loss": 2.4515, "step": 1125 }, { "epoch": 0.023, - "grad_norm": 1.0609444308653502, + "grad_norm": 1.053570785582915, "learning_rate": 2.2980000000000003e-06, - "loss": 2.4268, + "loss": 2.4396, "step": 1150 }, { "epoch": 0.0235, - "grad_norm": 2.632073425850519, + "grad_norm": 0.9108119839585552, "learning_rate": 2.3480000000000002e-06, - "loss": 2.4279, + "loss": 2.4442, "step": 1175 }, { "epoch": 0.024, - "grad_norm": 0.8736598209263914, + "grad_norm": 1.0062346367900277, "learning_rate": 2.398e-06, - "loss": 2.4224, + "loss": 2.4443, "step": 1200 }, { "epoch": 0.024, - "eval_loss": 2.452965021133423, - "eval_runtime": 279.2551, - "eval_samples_per_second": 2.944, - "eval_steps_per_second": 1.472, + "eval_loss": 2.456455945968628, + "eval_runtime": 264.5888, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 1.553, "step": 1200 }, { "epoch": 0.0245, - "grad_norm": 1.1010146380315569, + "grad_norm": 1.0264127705426926, "learning_rate": 2.448e-06, - "loss": 2.4347, + "loss": 2.4351, "step": 1225 }, { "epoch": 0.025, - "grad_norm": 1.1289837790122792, + "grad_norm": 0.8015249588347212, "learning_rate": 2.498e-06, - "loss": 2.4359, + "loss": 2.4406, "step": 1250 }, { "epoch": 0.0255, - "grad_norm": 1.332600172456898, + "grad_norm": 1.1105649485540114, "learning_rate": 2.5480000000000004e-06, - "loss": 2.4233, + "loss": 2.4377, "step": 1275 }, { "epoch": 0.026, - "grad_norm": 0.918676647582182, + "grad_norm": 0.9701758426012801, "learning_rate": 2.598e-06, - "loss": 2.4257, + "loss": 2.4341, "step": 1300 }, { "epoch": 0.026, - "eval_loss": 2.45108962059021, - "eval_runtime": 273.2611, - "eval_samples_per_second": 3.008, - "eval_steps_per_second": 1.504, + "eval_loss": 2.453026056289673, + "eval_runtime": 264.7653, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 1.552, "step": 1300 }, { "epoch": 0.0265, - "grad_norm": 1.4029272839399918, + "grad_norm": 0.9587254891845429, "learning_rate": 2.648e-06, - "loss": 2.4289, + "loss": 2.4303, "step": 1325 }, { "epoch": 0.027, - "grad_norm": 1.2160130720202662, + "grad_norm": 0.8135883960763247, "learning_rate": 2.6980000000000003e-06, - "loss": 2.4265, + "loss": 2.4363, "step": 1350 }, { "epoch": 0.0275, - "grad_norm": 0.9695113084191667, + "grad_norm": 0.9192860127847176, "learning_rate": 2.748e-06, - "loss": 2.4195, + "loss": 2.4257, "step": 1375 }, { "epoch": 0.028, - "grad_norm": 1.7441288123872434, + "grad_norm": 0.947465928893444, "learning_rate": 2.798e-06, - "loss": 2.4298, + "loss": 2.4353, "step": 1400 }, { "epoch": 0.028, - "eval_loss": 2.4491610527038574, - "eval_runtime": 583.0171, - "eval_samples_per_second": 1.41, - "eval_steps_per_second": 0.705, + "eval_loss": 2.450345993041992, + "eval_runtime": 265.6266, + "eval_samples_per_second": 3.095, + "eval_steps_per_second": 1.547, "step": 1400 }, { "epoch": 0.0285, - "grad_norm": 1.0715504482160458, + "grad_norm": 0.9270137901066681, "learning_rate": 2.848e-06, - "loss": 2.4262, + "loss": 2.4347, "step": 1425 }, { "epoch": 0.029, - "grad_norm": 1.572598515142917, + "grad_norm": 0.8839980710491563, "learning_rate": 2.8980000000000005e-06, - "loss": 2.4247, + "loss": 2.4213, "step": 1450 }, { "epoch": 0.0295, - "grad_norm": 0.8769894814820209, + "grad_norm": 0.913196005454606, "learning_rate": 2.9480000000000004e-06, - "loss": 2.4203, + "loss": 2.4232, "step": 1475 }, { "epoch": 0.03, - "grad_norm": 1.15218078551874, + "grad_norm": 0.8139623858623861, "learning_rate": 2.9980000000000003e-06, - "loss": 2.4132, + "loss": 2.4254, "step": 1500 }, { "epoch": 0.03, - "eval_loss": 2.4474868774414062, - "eval_runtime": 269.8322, - "eval_samples_per_second": 3.046, - "eval_steps_per_second": 1.523, + "eval_loss": 2.447662830352783, + "eval_runtime": 263.4353, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 1.56, "step": 1500 }, { "epoch": 0.0305, - "grad_norm": 0.9189527146104203, + "grad_norm": 0.8422198221554755, "learning_rate": 3.0480000000000003e-06, - "loss": 2.4178, + "loss": 2.4196, "step": 1525 }, { "epoch": 0.031, - "grad_norm": 0.7625325255575713, + "grad_norm": 0.8542957579365906, "learning_rate": 3.0980000000000007e-06, - "loss": 2.4177, + "loss": 2.4294, "step": 1550 }, { "epoch": 0.0315, - "grad_norm": 0.6754417969129354, + "grad_norm": 1.149263137594797, "learning_rate": 3.1480000000000006e-06, - "loss": 2.4231, + "loss": 2.4265, "step": 1575 }, { "epoch": 0.032, - "grad_norm": 0.7801492001481019, + "grad_norm": 0.811470126240392, "learning_rate": 3.198e-06, - "loss": 2.4211, + "loss": 2.4105, "step": 1600 }, { "epoch": 0.032, - "eval_loss": 2.4462578296661377, - "eval_runtime": 284.7909, - "eval_samples_per_second": 2.886, - "eval_steps_per_second": 1.443, + "eval_loss": 2.4456679821014404, + "eval_runtime": 264.056, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.556, "step": 1600 }, { "epoch": 0.0325, - "grad_norm": 1.0689417566940471, + "grad_norm": 2.3928975221881434, "learning_rate": 3.248e-06, - "loss": 2.413, + "loss": 2.4208, "step": 1625 }, { "epoch": 0.033, - "grad_norm": 0.7787232312537056, + "grad_norm": 0.8031315125360012, "learning_rate": 3.298e-06, - "loss": 2.4174, + "loss": 2.4224, "step": 1650 }, { "epoch": 0.0335, - "grad_norm": 0.781184938396322, + "grad_norm": 0.835567276692195, "learning_rate": 3.348e-06, - "loss": 2.4172, + "loss": 2.4188, "step": 1675 }, { "epoch": 0.034, - "grad_norm": 0.7747298764544236, + "grad_norm": 0.8894325175719718, "learning_rate": 3.3980000000000003e-06, - "loss": 2.413, + "loss": 2.4206, "step": 1700 }, { "epoch": 0.034, - "eval_loss": 2.4448819160461426, - "eval_runtime": 271.9561, - "eval_samples_per_second": 3.023, - "eval_steps_per_second": 1.511, + "eval_loss": 2.4437851905822754, + "eval_runtime": 264.6455, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 1.553, "step": 1700 }, { "epoch": 0.0345, - "grad_norm": 0.7757798466912066, + "grad_norm": 0.802724390649243, "learning_rate": 3.4480000000000003e-06, - "loss": 2.4098, + "loss": 2.4241, "step": 1725 }, { "epoch": 0.035, - "grad_norm": 0.7310107987133747, + "grad_norm": 0.8206312612014312, "learning_rate": 3.4980000000000002e-06, - "loss": 2.4128, + "loss": 2.4157, "step": 1750 }, { "epoch": 0.0355, - "grad_norm": 0.7001013929335956, + "grad_norm": 0.8653789917535344, "learning_rate": 3.548e-06, - "loss": 2.4186, + "loss": 2.412, "step": 1775 }, { "epoch": 0.036, - "grad_norm": 0.8450545164626939, + "grad_norm": 0.7816319078215015, "learning_rate": 3.5980000000000005e-06, - "loss": 2.4023, + "loss": 2.4179, "step": 1800 }, { "epoch": 0.036, - "eval_loss": 2.4438183307647705, - "eval_runtime": 270.9746, - "eval_samples_per_second": 3.033, - "eval_steps_per_second": 1.517, + "eval_loss": 2.4423036575317383, + "eval_runtime": 264.5578, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 1.554, "step": 1800 }, { "epoch": 0.0365, - "grad_norm": 0.699119117469249, + "grad_norm": 0.707594544466941, "learning_rate": 3.6480000000000005e-06, - "loss": 2.4147, + "loss": 2.416, "step": 1825 }, { "epoch": 0.037, - "grad_norm": 1.4648900237948448, + "grad_norm": 0.7481066913011816, "learning_rate": 3.6980000000000004e-06, - "loss": 2.4057, + "loss": 2.4242, "step": 1850 }, { "epoch": 0.0375, - "grad_norm": 0.8100762412810776, + "grad_norm": 0.7612014979445353, "learning_rate": 3.7480000000000004e-06, - "loss": 2.4, + "loss": 2.4173, "step": 1875 }, { "epoch": 0.038, - "grad_norm": 1.7107392169339468, + "grad_norm": 0.772750918048857, "learning_rate": 3.7980000000000007e-06, - "loss": 2.4064, + "loss": 2.4134, "step": 1900 }, { "epoch": 0.038, - "eval_loss": 2.4430434703826904, - "eval_runtime": 271.5411, - "eval_samples_per_second": 3.027, - "eval_steps_per_second": 1.514, + "eval_loss": 2.440969228744507, + "eval_runtime": 274.3624, + "eval_samples_per_second": 2.996, + "eval_steps_per_second": 1.498, "step": 1900 }, { "epoch": 0.0385, - "grad_norm": 0.7400885595875731, + "grad_norm": 0.7927966042188935, "learning_rate": 3.848e-06, - "loss": 2.4045, + "loss": 2.4131, "step": 1925 }, { "epoch": 0.039, - "grad_norm": 0.6549639602828458, + "grad_norm": 0.7664274167276341, "learning_rate": 3.898e-06, - "loss": 2.4013, + "loss": 2.4133, "step": 1950 }, { "epoch": 0.0395, - "grad_norm": 0.900275875384622, + "grad_norm": 0.7038638213491795, "learning_rate": 3.948e-06, - "loss": 2.4023, + "loss": 2.4135, "step": 1975 }, { "epoch": 0.04, - "grad_norm": 0.6557480580465296, + "grad_norm": 0.7231696877425319, "learning_rate": 3.9980000000000005e-06, - "loss": 2.4067, + "loss": 2.4169, "step": 2000 }, { "epoch": 0.04, - "eval_loss": 2.4421370029449463, - "eval_runtime": 271.9467, - "eval_samples_per_second": 3.023, - "eval_steps_per_second": 1.511, + "eval_loss": 2.439641237258911, + "eval_runtime": 282.4449, + "eval_samples_per_second": 2.91, + "eval_steps_per_second": 1.455, "step": 2000 }, { "epoch": 0.0405, - "grad_norm": 0.693045116257973, + "grad_norm": 0.7184393791203537, "learning_rate": 4.048e-06, - "loss": 2.4039, + "loss": 2.4071, "step": 2025 }, { "epoch": 0.041, - "grad_norm": 0.7610891579847459, + "grad_norm": 0.7366813467336683, "learning_rate": 4.098e-06, - "loss": 2.4078, + "loss": 2.4113, "step": 2050 }, { "epoch": 0.0415, - "grad_norm": 1.3834425728285105, + "grad_norm": 0.7081408763220511, "learning_rate": 4.148000000000001e-06, - "loss": 2.4147, + "loss": 2.4168, "step": 2075 }, { "epoch": 0.042, - "grad_norm": 1.1248195017181577, + "grad_norm": 0.6912835983850483, "learning_rate": 4.198e-06, - "loss": 2.4081, + "loss": 2.4105, "step": 2100 }, { "epoch": 0.042, - "eval_loss": 2.441387891769409, - "eval_runtime": 273.2847, - "eval_samples_per_second": 3.008, - "eval_steps_per_second": 1.504, + "eval_loss": 2.438904047012329, + "eval_runtime": 277.7481, + "eval_samples_per_second": 2.96, + "eval_steps_per_second": 1.48, "step": 2100 }, { "epoch": 0.0425, - "grad_norm": 0.9505394679390867, + "grad_norm": 0.7745538733736145, "learning_rate": 4.248000000000001e-06, - "loss": 2.3997, + "loss": 2.4131, "step": 2125 }, { "epoch": 0.043, - "grad_norm": 0.6617161262323777, + "grad_norm": 0.6897576190091962, "learning_rate": 4.298e-06, - "loss": 2.4092, + "loss": 2.4084, "step": 2150 }, { "epoch": 0.0435, - "grad_norm": 0.8041142918976389, + "grad_norm": 0.7020994032566351, "learning_rate": 4.3480000000000006e-06, - "loss": 2.3911, + "loss": 2.4125, "step": 2175 }, { "epoch": 0.044, - "grad_norm": 0.6836064943747875, + "grad_norm": 0.6668651869738377, "learning_rate": 4.398000000000001e-06, - "loss": 2.4009, + "loss": 2.4034, "step": 2200 }, { "epoch": 0.044, - "eval_loss": 2.4408862590789795, - "eval_runtime": 272.2045, - "eval_samples_per_second": 3.02, - "eval_steps_per_second": 1.51, + "eval_loss": 2.4380908012390137, + "eval_runtime": 268.2252, + "eval_samples_per_second": 3.065, + "eval_steps_per_second": 1.532, "step": 2200 }, { "epoch": 0.0445, - "grad_norm": 0.8459080804967606, + "grad_norm": 0.6547759047620061, "learning_rate": 4.4480000000000004e-06, - "loss": 2.3956, + "loss": 2.4099, "step": 2225 }, { "epoch": 0.045, - "grad_norm": 0.7291051916367819, + "grad_norm": 0.6865815945777785, "learning_rate": 4.498e-06, - "loss": 2.3987, + "loss": 2.412, "step": 2250 }, { "epoch": 0.0455, - "grad_norm": 0.8931508487954566, + "grad_norm": 0.6878267781655092, "learning_rate": 4.548e-06, - "loss": 2.3998, + "loss": 2.4137, "step": 2275 }, { "epoch": 0.046, - "grad_norm": 0.7182312121808919, + "grad_norm": 0.8314813616644483, "learning_rate": 4.598e-06, - "loss": 2.3986, + "loss": 2.4097, "step": 2300 }, { "epoch": 0.046, - "eval_loss": 2.4408977031707764, - "eval_runtime": 270.8718, - "eval_samples_per_second": 3.035, - "eval_steps_per_second": 1.517, + "eval_loss": 2.4374496936798096, + "eval_runtime": 263.1701, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 1.562, "step": 2300 }, { "epoch": 0.0465, - "grad_norm": 0.6845849871247104, + "grad_norm": 0.6723966792931375, "learning_rate": 4.648e-06, - "loss": 2.3993, + "loss": 2.4051, "step": 2325 }, { "epoch": 0.047, - "grad_norm": 0.6454402382658997, + "grad_norm": 0.7003756914046538, "learning_rate": 4.698000000000001e-06, - "loss": 2.3919, + "loss": 2.4032, "step": 2350 }, { "epoch": 0.0475, - "grad_norm": 0.7059991305508788, + "grad_norm": 0.6747085415631567, "learning_rate": 4.748e-06, - "loss": 2.3932, + "loss": 2.4096, "step": 2375 }, { "epoch": 0.048, - "grad_norm": 0.7912136028957374, + "grad_norm": 0.6571218540079207, "learning_rate": 4.7980000000000005e-06, - "loss": 2.3981, + "loss": 2.4165, "step": 2400 }, { "epoch": 0.048, - "eval_loss": 2.4403159618377686, - "eval_runtime": 268.2333, - "eval_samples_per_second": 3.064, - "eval_steps_per_second": 1.532, + "eval_loss": 2.4365923404693604, + "eval_runtime": 264.2268, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.555, "step": 2400 }, { "epoch": 0.0485, - "grad_norm": 0.8023695737551224, + "grad_norm": 0.7464314980483315, "learning_rate": 4.848000000000001e-06, - "loss": 2.3989, + "loss": 2.4098, "step": 2425 }, { "epoch": 0.049, - "grad_norm": 0.6698221863733745, + "grad_norm": 0.6267266619200393, "learning_rate": 4.898e-06, - "loss": 2.3846, + "loss": 2.4019, "step": 2450 }, { "epoch": 0.0495, - "grad_norm": 0.6888793258344078, + "grad_norm": 0.6650772680412506, "learning_rate": 4.948000000000001e-06, - "loss": 2.3895, + "loss": 2.405, "step": 2475 }, { "epoch": 0.05, - "grad_norm": 1.0427086572633348, + "grad_norm": 0.7197173899674899, "learning_rate": 4.998e-06, - "loss": 2.3909, + "loss": 2.4095, "step": 2500 }, { "epoch": 0.05, - "eval_loss": 2.4396488666534424, - "eval_runtime": 269.5843, - "eval_samples_per_second": 3.049, - "eval_steps_per_second": 1.525, + "eval_loss": 2.4358348846435547, + "eval_runtime": 266.7682, + "eval_samples_per_second": 3.081, + "eval_steps_per_second": 1.541, "step": 2500 }, { "epoch": 0.0505, - "grad_norm": 0.7651432866063412, + "grad_norm": 0.6249572472256157, "learning_rate": 5.048000000000001e-06, - "loss": 2.3868, + "loss": 2.4058, "step": 2525 }, { "epoch": 0.051, - "grad_norm": 0.9011841495487002, + "grad_norm": 0.7429228032719255, "learning_rate": 5.098000000000001e-06, - "loss": 2.3831, + "loss": 2.4084, "step": 2550 }, { "epoch": 0.0515, - "grad_norm": 0.7564083994967769, + "grad_norm": 0.6320325962693778, "learning_rate": 5.1480000000000005e-06, - "loss": 2.3847, + "loss": 2.4015, "step": 2575 }, { "epoch": 0.052, - "grad_norm": 1.2435018934793685, + "grad_norm": 0.672581755106835, "learning_rate": 5.198000000000001e-06, - "loss": 2.3819, + "loss": 2.4051, "step": 2600 }, { "epoch": 0.052, - "eval_loss": 2.439671039581299, - "eval_runtime": 268.9479, - "eval_samples_per_second": 3.056, - "eval_steps_per_second": 1.528, + "eval_loss": 2.4351842403411865, + "eval_runtime": 264.9149, + "eval_samples_per_second": 3.103, + "eval_steps_per_second": 1.551, "step": 2600 }, { "epoch": 0.0525, - "grad_norm": 0.6889931430781852, + "grad_norm": 0.7086480776921088, "learning_rate": 5.248000000000001e-06, - "loss": 2.3866, + "loss": 2.3988, "step": 2625 }, { "epoch": 0.053, - "grad_norm": 0.6691285553697948, + "grad_norm": 0.6774201154936552, "learning_rate": 5.298000000000001e-06, - "loss": 2.3847, + "loss": 2.394, "step": 2650 }, { "epoch": 0.0535, - "grad_norm": 0.763818492118123, + "grad_norm": 0.6661104910300973, "learning_rate": 5.348000000000001e-06, - "loss": 2.3869, + "loss": 2.4034, "step": 2675 }, { "epoch": 0.054, - "grad_norm": 0.751625809368973, + "grad_norm": 0.6224421593448741, "learning_rate": 5.398e-06, - "loss": 2.3849, + "loss": 2.3939, "step": 2700 }, { "epoch": 0.054, - "eval_loss": 2.4390792846679688, - "eval_runtime": 269.1474, - "eval_samples_per_second": 3.054, - "eval_steps_per_second": 1.527, + "eval_loss": 2.434826374053955, + "eval_runtime": 264.1641, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.556, "step": 2700 }, { "epoch": 0.0545, - "grad_norm": 0.7089053263326198, + "grad_norm": 0.6944661408419767, "learning_rate": 5.448e-06, - "loss": 2.3888, + "loss": 2.4064, "step": 2725 }, { "epoch": 0.055, - "grad_norm": 0.7882614052121117, + "grad_norm": 0.6597297955298902, "learning_rate": 5.498e-06, - "loss": 2.3867, + "loss": 2.4051, "step": 2750 }, { "epoch": 0.0555, - "grad_norm": 0.834757676919121, + "grad_norm": 0.6526109506522182, "learning_rate": 5.548e-06, - "loss": 2.3934, + "loss": 2.4124, "step": 2775 }, { "epoch": 0.056, - "grad_norm": 0.8944452545678075, + "grad_norm": 0.6528041780055424, "learning_rate": 5.5980000000000004e-06, - "loss": 2.3806, + "loss": 2.3979, "step": 2800 }, { "epoch": 0.056, - "eval_loss": 2.4394161701202393, - "eval_runtime": 267.9021, - "eval_samples_per_second": 3.068, - "eval_steps_per_second": 1.534, + "eval_loss": 2.4344167709350586, + "eval_runtime": 264.2924, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 1.555, "step": 2800 }, { "epoch": 0.0565, - "grad_norm": 0.7299925105583852, + "grad_norm": 0.7067565611523313, "learning_rate": 5.648e-06, - "loss": 2.3838, + "loss": 2.398, "step": 2825 }, { "epoch": 0.057, - "grad_norm": 0.8737328950825439, + "grad_norm": 0.6416666495903947, "learning_rate": 5.698e-06, - "loss": 2.3845, + "loss": 2.3991, "step": 2850 }, { "epoch": 0.0575, - "grad_norm": 0.7907442941301994, + "grad_norm": 0.6605105424774851, "learning_rate": 5.748e-06, - "loss": 2.3834, + "loss": 2.3962, "step": 2875 }, { "epoch": 0.058, - "grad_norm": 0.6846463994155354, + "grad_norm": 0.6308761264530915, "learning_rate": 5.798e-06, - "loss": 2.3776, + "loss": 2.4058, "step": 2900 }, { "epoch": 0.058, - "eval_loss": 2.4393930435180664, - "eval_runtime": 266.9434, - "eval_samples_per_second": 3.079, - "eval_steps_per_second": 1.54, + "eval_loss": 2.434436082839966, + "eval_runtime": 265.0112, + "eval_samples_per_second": 3.102, + "eval_steps_per_second": 1.551, "step": 2900 }, { "epoch": 0.0585, - "grad_norm": 0.62540280272208, + "grad_norm": 0.6363649329289001, "learning_rate": 5.848000000000001e-06, - "loss": 2.3808, + "loss": 2.3943, "step": 2925 }, { "epoch": 0.059, - "grad_norm": 0.8774049191705039, + "grad_norm": 0.6147983139117156, "learning_rate": 5.898e-06, - "loss": 2.3873, + "loss": 2.3982, "step": 2950 }, { "epoch": 0.0595, - "grad_norm": 0.6485716892578811, + "grad_norm": 0.611354772141602, "learning_rate": 5.9480000000000005e-06, - "loss": 2.3768, + "loss": 2.3921, "step": 2975 }, { "epoch": 0.06, - "grad_norm": 0.730163203087036, + "grad_norm": 0.6269054680170398, "learning_rate": 5.998000000000001e-06, - "loss": 2.3713, + "loss": 2.392, "step": 3000 }, { "epoch": 0.06, - "eval_loss": 2.4393091201782227, - "eval_runtime": 267.1212, - "eval_samples_per_second": 3.077, - "eval_steps_per_second": 1.539, + "eval_loss": 2.433990955352783, + "eval_runtime": 264.2169, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.556, "step": 3000 }, { "epoch": 0.0605, - "grad_norm": 0.6879314967282564, + "grad_norm": 0.6248207448228328, "learning_rate": 6.048e-06, - "loss": 2.3783, + "loss": 2.3858, "step": 3025 }, { "epoch": 0.061, - "grad_norm": 0.8144222671191256, + "grad_norm": 0.6275258656299642, "learning_rate": 6.098000000000001e-06, - "loss": 2.3675, + "loss": 2.4015, "step": 3050 }, { "epoch": 0.0615, - "grad_norm": 0.7831396066479988, + "grad_norm": 1.0457401571274152, "learning_rate": 6.148e-06, - "loss": 2.3749, + "loss": 2.3909, "step": 3075 }, { "epoch": 0.062, - "grad_norm": 0.8428793923955709, + "grad_norm": 0.6551230863319748, "learning_rate": 6.198000000000001e-06, - "loss": 2.3815, + "loss": 2.3983, "step": 3100 }, { "epoch": 0.062, - "eval_loss": 2.4399077892303467, - "eval_runtime": 267.544, - "eval_samples_per_second": 3.072, - "eval_steps_per_second": 1.536, + "eval_loss": 2.433279275894165, + "eval_runtime": 264.1521, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.556, "step": 3100 }, { "epoch": 0.0625, - "grad_norm": 0.6828002417283633, + "grad_norm": 0.6306746226297937, "learning_rate": 6.248000000000001e-06, - "loss": 2.3824, + "loss": 2.397, "step": 3125 }, { "epoch": 0.063, - "grad_norm": 0.6281277470124486, + "grad_norm": 0.6299802316587856, "learning_rate": 6.2980000000000005e-06, - "loss": 2.3721, + "loss": 2.4018, "step": 3150 }, { "epoch": 0.0635, - "grad_norm": 0.6760775775841215, + "grad_norm": 0.6265424590222634, "learning_rate": 6.348000000000001e-06, - "loss": 2.3691, + "loss": 2.4065, "step": 3175 }, { "epoch": 0.064, - "grad_norm": 1.5950387898495975, + "grad_norm": 0.6717273211615455, "learning_rate": 6.398000000000001e-06, - "loss": 2.3752, + "loss": 2.3906, "step": 3200 }, { "epoch": 0.064, - "eval_loss": 2.439392328262329, - "eval_runtime": 266.9401, - "eval_samples_per_second": 3.079, - "eval_steps_per_second": 1.54, + "eval_loss": 2.4333276748657227, + "eval_runtime": 263.9592, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.557, "step": 3200 }, { "epoch": 0.0645, - "grad_norm": 0.8656925185501568, + "grad_norm": 0.6159924635031793, "learning_rate": 6.448000000000001e-06, - "loss": 2.3756, + "loss": 2.3947, "step": 3225 }, { "epoch": 0.065, - "grad_norm": 1.0689072967386066, + "grad_norm": 0.6124462043712093, "learning_rate": 6.498000000000001e-06, - "loss": 2.374, + "loss": 2.3963, "step": 3250 }, { "epoch": 0.0655, - "grad_norm": 0.642689784270949, + "grad_norm": 0.6144378183602921, "learning_rate": 6.548000000000001e-06, - "loss": 2.3675, + "loss": 2.402, "step": 3275 }, { "epoch": 0.066, - "grad_norm": 0.7931015535140951, + "grad_norm": 0.6295732934678283, "learning_rate": 6.598000000000001e-06, - "loss": 2.3723, + "loss": 2.3877, "step": 3300 }, { "epoch": 0.066, - "eval_loss": 2.4396414756774902, - "eval_runtime": 266.3301, - "eval_samples_per_second": 3.086, - "eval_steps_per_second": 1.543, + "eval_loss": 2.4331116676330566, + "eval_runtime": 263.4524, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 1.56, "step": 3300 }, { "epoch": 0.0665, - "grad_norm": 0.7478812929813007, + "grad_norm": 0.5938287129149346, "learning_rate": 6.648e-06, - "loss": 2.3728, + "loss": 2.389, "step": 3325 }, { "epoch": 0.067, - "grad_norm": 0.7286462484545718, + "grad_norm": 0.6194783667871923, "learning_rate": 6.698e-06, - "loss": 2.3718, + "loss": 2.39, "step": 3350 }, { "epoch": 0.0675, - "grad_norm": 0.7188647675429949, + "grad_norm": 0.60927231594853, "learning_rate": 6.7480000000000004e-06, - "loss": 2.3744, + "loss": 2.3968, "step": 3375 }, { "epoch": 0.068, - "grad_norm": 0.6869274776133854, + "grad_norm": 0.6386175333576501, "learning_rate": 6.798e-06, - "loss": 2.3625, + "loss": 2.3861, "step": 3400 }, { "epoch": 0.068, - "eval_loss": 2.43986177444458, - "eval_runtime": 266.556, - "eval_samples_per_second": 3.084, - "eval_steps_per_second": 1.542, + "eval_loss": 2.4328911304473877, + "eval_runtime": 264.2923, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 1.555, "step": 3400 }, { "epoch": 0.0685, - "grad_norm": 0.7666004938241721, + "grad_norm": 0.6092295027577579, "learning_rate": 6.848e-06, - "loss": 2.3686, + "loss": 2.3827, "step": 3425 }, { "epoch": 0.069, - "grad_norm": 0.864757734602374, + "grad_norm": 0.5914846449422462, "learning_rate": 6.898e-06, - "loss": 2.3596, + "loss": 2.3894, "step": 3450 }, { "epoch": 0.0695, - "grad_norm": 0.7715710742116183, + "grad_norm": 0.5927461214526666, "learning_rate": 6.948e-06, - "loss": 2.3576, + "loss": 2.3858, "step": 3475 }, { "epoch": 0.07, - "grad_norm": 0.6372061584106886, + "grad_norm": 0.5992194088197265, "learning_rate": 6.998000000000001e-06, - "loss": 2.3729, + "loss": 2.3941, "step": 3500 }, { "epoch": 0.07, - "eval_loss": 2.4403369426727295, - "eval_runtime": 266.4426, - "eval_samples_per_second": 3.085, - "eval_steps_per_second": 1.543, + "eval_loss": 2.432774543762207, + "eval_runtime": 263.8546, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.558, "step": 3500 }, { "epoch": 0.0705, - "grad_norm": 1.176729441259039, + "grad_norm": 0.6119297158568089, "learning_rate": 7.048e-06, - "loss": 2.3655, + "loss": 2.3897, "step": 3525 }, { "epoch": 0.071, - "grad_norm": 0.6115287728562199, + "grad_norm": 0.6040666217758901, "learning_rate": 7.0980000000000005e-06, - "loss": 2.3625, + "loss": 2.3966, "step": 3550 }, { "epoch": 0.0715, - "grad_norm": 0.6246990242430912, + "grad_norm": 0.6142925813030266, "learning_rate": 7.148000000000001e-06, - "loss": 2.3618, + "loss": 2.3953, "step": 3575 }, { "epoch": 0.072, - "grad_norm": 0.7221703846470798, + "grad_norm": 0.5857079248330344, "learning_rate": 7.198e-06, - "loss": 2.3628, + "loss": 2.3854, "step": 3600 }, { "epoch": 0.072, - "eval_loss": 2.440523862838745, - "eval_runtime": 266.5015, - "eval_samples_per_second": 3.084, - "eval_steps_per_second": 1.542, + "eval_loss": 2.432868719100952, + "eval_runtime": 264.1849, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.556, "step": 3600 }, { "epoch": 0.0725, - "grad_norm": 0.6655580980887033, + "grad_norm": 0.6075613052530382, "learning_rate": 7.248000000000001e-06, - "loss": 2.3606, + "loss": 2.3798, "step": 3625 }, { "epoch": 0.073, - "grad_norm": 0.8105564047038534, + "grad_norm": 0.6146043204282547, "learning_rate": 7.298e-06, - "loss": 2.3633, + "loss": 2.3894, "step": 3650 }, { "epoch": 0.0735, - "grad_norm": 0.6850595144504594, + "grad_norm": 0.613284002341936, "learning_rate": 7.348000000000001e-06, - "loss": 2.3599, + "loss": 2.3897, "step": 3675 }, { "epoch": 0.074, - "grad_norm": 0.6572561289961083, + "grad_norm": 0.6694404263159593, "learning_rate": 7.398000000000001e-06, - "loss": 2.356, + "loss": 2.3925, "step": 3700 }, { "epoch": 0.074, - "eval_loss": 2.440370798110962, - "eval_runtime": 266.1237, - "eval_samples_per_second": 3.089, - "eval_steps_per_second": 1.544, + "eval_loss": 2.4324021339416504, + "eval_runtime": 263.3107, + "eval_samples_per_second": 3.122, + "eval_steps_per_second": 1.561, "step": 3700 }, { "epoch": 0.0745, - "grad_norm": 0.6568528045310038, + "grad_norm": 0.5756401973694445, "learning_rate": 7.4480000000000005e-06, - "loss": 2.364, + "loss": 2.3894, "step": 3725 }, { "epoch": 0.075, - "grad_norm": 0.7767463454261476, + "grad_norm": 0.5945783703417461, "learning_rate": 7.498000000000001e-06, - "loss": 2.3544, + "loss": 2.3928, "step": 3750 }, { "epoch": 0.0755, - "grad_norm": 0.6432402218022537, + "grad_norm": 0.5935750222986942, "learning_rate": 7.548000000000001e-06, - "loss": 2.3667, + "loss": 2.3774, "step": 3775 }, { "epoch": 0.076, - "grad_norm": 0.6601750907381865, + "grad_norm": 0.5938734543073783, "learning_rate": 7.598000000000001e-06, - "loss": 2.3538, + "loss": 2.3776, "step": 3800 }, { "epoch": 0.076, - "eval_loss": 2.440768241882324, - "eval_runtime": 266.5088, - "eval_samples_per_second": 3.084, - "eval_steps_per_second": 1.542, + "eval_loss": 2.432751178741455, + "eval_runtime": 263.8929, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.557, "step": 3800 }, { "epoch": 0.0765, - "grad_norm": 0.6698754690123959, + "grad_norm": 0.595820899700728, "learning_rate": 7.648e-06, - "loss": 2.3606, + "loss": 2.3804, "step": 3825 }, { "epoch": 0.077, - "grad_norm": 0.6469410659344654, + "grad_norm": 0.6079304106413467, "learning_rate": 7.698000000000002e-06, - "loss": 2.362, + "loss": 2.3917, "step": 3850 }, { "epoch": 0.0775, - "grad_norm": 1.0013339216690909, + "grad_norm": 0.6083448146618482, "learning_rate": 7.748000000000001e-06, - "loss": 2.3507, + "loss": 2.3842, "step": 3875 }, { "epoch": 0.078, - "grad_norm": 0.7506371440780338, + "grad_norm": 0.6128893415605828, "learning_rate": 7.798e-06, - "loss": 2.3452, + "loss": 2.3806, "step": 3900 }, { "epoch": 0.078, - "eval_loss": 2.4411609172821045, - "eval_runtime": 266.382, - "eval_samples_per_second": 3.086, - "eval_steps_per_second": 1.543, + "eval_loss": 2.4325239658355713, + "eval_runtime": 263.6693, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 1.559, "step": 3900 }, { "epoch": 0.0785, - "grad_norm": 0.8222668670513549, + "grad_norm": 0.6079041195191952, "learning_rate": 7.848000000000002e-06, - "loss": 2.3492, + "loss": 2.3801, "step": 3925 }, { "epoch": 0.079, - "grad_norm": 0.7348741963305673, + "grad_norm": 0.6075689821557235, "learning_rate": 7.898e-06, - "loss": 2.3514, + "loss": 2.3797, "step": 3950 }, { "epoch": 0.0795, - "grad_norm": 0.6659497394839384, + "grad_norm": 0.5882326737716994, "learning_rate": 7.948e-06, - "loss": 2.3555, + "loss": 2.3905, "step": 3975 }, { "epoch": 0.08, - "grad_norm": 0.6450727740951402, + "grad_norm": 0.5828476462223788, "learning_rate": 7.998e-06, - "loss": 2.3473, + "loss": 2.3806, "step": 4000 }, { "epoch": 0.08, - "eval_loss": 2.4414608478546143, - "eval_runtime": 266.9648, - "eval_samples_per_second": 3.079, - "eval_steps_per_second": 1.54, + "eval_loss": 2.4323527812957764, + "eval_runtime": 263.9786, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.557, "step": 4000 }, { "epoch": 0.0805, - "grad_norm": 0.7546326612640164, + "grad_norm": 0.5907927035367586, "learning_rate": 8.048e-06, - "loss": 2.3539, + "loss": 2.3739, "step": 4025 }, { "epoch": 0.081, - "grad_norm": 0.6761905448705341, + "grad_norm": 0.608189189988593, "learning_rate": 8.098000000000001e-06, - "loss": 2.3556, + "loss": 2.3837, "step": 4050 }, { "epoch": 0.0815, - "grad_norm": 0.7318094477510495, + "grad_norm": 0.5933025642280234, "learning_rate": 8.148e-06, - "loss": 2.3423, + "loss": 2.3814, "step": 4075 }, { "epoch": 0.082, - "grad_norm": 0.6821593855329929, + "grad_norm": 0.5898305070270532, "learning_rate": 8.198e-06, - "loss": 2.3502, + "loss": 2.3854, "step": 4100 }, { "epoch": 0.082, - "eval_loss": 2.4423110485076904, - "eval_runtime": 279.3363, - "eval_samples_per_second": 2.943, - "eval_steps_per_second": 1.471, + "eval_loss": 2.432577610015869, + "eval_runtime": 264.0972, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.556, "step": 4100 }, { "epoch": 0.0825, - "grad_norm": 0.6574698437748202, + "grad_norm": 0.5673002921483621, "learning_rate": 8.248e-06, - "loss": 2.3409, + "loss": 2.3827, "step": 4125 }, { "epoch": 0.083, - "grad_norm": 0.6906095764967443, + "grad_norm": 0.5859186364996516, "learning_rate": 8.298000000000001e-06, - "loss": 2.3492, + "loss": 2.3859, "step": 4150 }, { "epoch": 0.0835, - "grad_norm": 0.6442365789950762, + "grad_norm": 0.5852893491639726, "learning_rate": 8.348e-06, - "loss": 2.3488, + "loss": 2.3711, "step": 4175 }, { "epoch": 0.084, - "grad_norm": 0.6795131481296484, + "grad_norm": 0.5704807601233864, "learning_rate": 8.398e-06, - "loss": 2.3467, + "loss": 2.3682, "step": 4200 }, { "epoch": 0.084, - "eval_loss": 2.442260980606079, - "eval_runtime": 266.2484, - "eval_samples_per_second": 3.087, - "eval_steps_per_second": 1.544, + "eval_loss": 2.4325780868530273, + "eval_runtime": 264.0677, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.556, "step": 4200 }, { "epoch": 0.0845, - "grad_norm": 0.698061792277122, + "grad_norm": 0.565873049775094, "learning_rate": 8.448000000000001e-06, - "loss": 2.3436, + "loss": 2.3894, "step": 4225 }, { "epoch": 0.085, - "grad_norm": 0.7442835229840722, + "grad_norm": 0.6594348238393681, "learning_rate": 8.498e-06, - "loss": 2.3381, + "loss": 2.3736, "step": 4250 }, { "epoch": 0.0855, - "grad_norm": 0.7362824833247893, + "grad_norm": 0.6114416993962639, "learning_rate": 8.548e-06, - "loss": 2.3431, + "loss": 2.3768, "step": 4275 }, { "epoch": 0.086, - "grad_norm": 0.7137715109018318, + "grad_norm": 0.613007148558132, "learning_rate": 8.598000000000001e-06, - "loss": 2.3295, + "loss": 2.3841, "step": 4300 }, { "epoch": 0.086, - "eval_loss": 2.443796157836914, - "eval_runtime": 266.8087, - "eval_samples_per_second": 3.081, - "eval_steps_per_second": 1.54, + "eval_loss": 2.432278633117676, + "eval_runtime": 264.5455, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 1.554, "step": 4300 }, { "epoch": 0.0865, - "grad_norm": 0.7098595503905674, + "grad_norm": 0.6316113111159283, "learning_rate": 8.648000000000001e-06, - "loss": 2.3459, + "loss": 2.3853, "step": 4325 }, { "epoch": 0.087, - "grad_norm": 0.7996135906080958, + "grad_norm": 0.578758909498954, "learning_rate": 8.698e-06, - "loss": 2.3466, + "loss": 2.3838, "step": 4350 }, { "epoch": 0.0875, - "grad_norm": 0.842668515468917, + "grad_norm": 0.5663796780744771, "learning_rate": 8.748000000000002e-06, - "loss": 2.3384, + "loss": 2.3744, "step": 4375 }, { "epoch": 0.088, - "grad_norm": 0.6713693442490306, + "grad_norm": 0.5996723194508057, "learning_rate": 8.798000000000001e-06, - "loss": 2.3421, + "loss": 2.3741, "step": 4400 }, { "epoch": 0.088, - "eval_loss": 2.4439103603363037, - "eval_runtime": 266.7578, - "eval_samples_per_second": 3.081, - "eval_steps_per_second": 1.541, + "eval_loss": 2.4327504634857178, + "eval_runtime": 264.3839, + "eval_samples_per_second": 3.109, + "eval_steps_per_second": 1.555, "step": 4400 }, { "epoch": 0.0885, - "grad_norm": 0.7003512380037604, + "grad_norm": 0.5903185672805589, "learning_rate": 8.848e-06, - "loss": 2.3346, + "loss": 2.3789, "step": 4425 }, { "epoch": 0.089, - "grad_norm": 0.6607605926628429, + "grad_norm": 0.5683354037993711, "learning_rate": 8.898000000000002e-06, - "loss": 2.3406, + "loss": 2.3739, "step": 4450 }, { "epoch": 0.0895, - "grad_norm": 0.6803671624212732, + "grad_norm": 0.5992802333814672, "learning_rate": 8.948000000000001e-06, - "loss": 2.336, + "loss": 2.3805, "step": 4475 }, { "epoch": 0.09, - "grad_norm": 0.8922530007640126, + "grad_norm": 0.5951158771681028, "learning_rate": 8.998000000000001e-06, - "loss": 2.3436, + "loss": 2.3702, "step": 4500 }, { "epoch": 0.09, - "eval_loss": 2.444617509841919, - "eval_runtime": 266.6078, - "eval_samples_per_second": 3.083, - "eval_steps_per_second": 1.542, + "eval_loss": 2.432904005050659, + "eval_runtime": 264.0927, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.556, "step": 4500 }, { "epoch": 0.0905, - "grad_norm": 0.7083372956919765, + "grad_norm": 0.628437176595306, "learning_rate": 9.048e-06, - "loss": 2.3381, + "loss": 2.3705, "step": 4525 }, { "epoch": 0.091, - "grad_norm": 0.7428013956928052, + "grad_norm": 0.5852194468933433, "learning_rate": 9.098000000000002e-06, - "loss": 2.3296, + "loss": 2.3726, "step": 4550 }, { "epoch": 0.0915, - "grad_norm": 0.7273252330071415, + "grad_norm": 0.5832814461503186, "learning_rate": 9.148e-06, - "loss": 2.3316, + "loss": 2.3709, "step": 4575 }, { "epoch": 0.092, - "grad_norm": 0.7554687577503647, + "grad_norm": 0.6235298544634128, "learning_rate": 9.198e-06, - "loss": 2.3319, + "loss": 2.3823, "step": 4600 }, { "epoch": 0.092, - "eval_loss": 2.4464893341064453, - "eval_runtime": 266.7269, - "eval_samples_per_second": 3.082, - "eval_steps_per_second": 1.541, + "eval_loss": 2.433288335800171, + "eval_runtime": 264.0394, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.557, "step": 4600 }, { "epoch": 0.0925, - "grad_norm": 0.6871461093264853, + "grad_norm": 0.6097464410099737, "learning_rate": 9.248e-06, - "loss": 2.3326, + "loss": 2.3715, "step": 4625 }, { "epoch": 0.093, - "grad_norm": 0.7005189512561462, + "grad_norm": 0.5830918527201829, "learning_rate": 9.298e-06, - "loss": 2.3292, + "loss": 2.3694, "step": 4650 }, { "epoch": 0.0935, - "grad_norm": 0.7009511384899544, + "grad_norm": 0.6195865573807103, "learning_rate": 9.348000000000001e-06, - "loss": 2.329, + "loss": 2.3711, "step": 4675 }, { "epoch": 0.094, - "grad_norm": 0.6627629301918981, + "grad_norm": 0.5922485886549429, "learning_rate": 9.398e-06, - "loss": 2.3255, + "loss": 2.3764, "step": 4700 }, { "epoch": 0.094, - "eval_loss": 2.4467787742614746, - "eval_runtime": 266.7555, - "eval_samples_per_second": 3.081, - "eval_steps_per_second": 1.541, + "eval_loss": 2.4330477714538574, + "eval_runtime": 263.7501, + "eval_samples_per_second": 3.117, + "eval_steps_per_second": 1.558, "step": 4700 }, { "epoch": 0.0945, - "grad_norm": 0.6653742483339516, + "grad_norm": 0.5909566806378528, "learning_rate": 9.448e-06, - "loss": 2.3361, + "loss": 2.3799, "step": 4725 }, { "epoch": 0.095, - "grad_norm": 0.6648535177091066, + "grad_norm": 0.5872189964007283, "learning_rate": 9.498000000000001e-06, - "loss": 2.3383, + "loss": 2.3737, "step": 4750 }, { "epoch": 0.0955, - "grad_norm": 0.8353528219476875, + "grad_norm": 0.6071714619656263, "learning_rate": 9.548e-06, - "loss": 2.3232, + "loss": 2.3789, "step": 4775 }, { "epoch": 0.096, - "grad_norm": 0.6908965943260156, + "grad_norm": 0.5631342344537085, "learning_rate": 9.598e-06, - "loss": 2.3281, + "loss": 2.3641, "step": 4800 }, { "epoch": 0.096, - "eval_loss": 2.4486067295074463, - "eval_runtime": 266.8511, - "eval_samples_per_second": 3.08, - "eval_steps_per_second": 1.54, + "eval_loss": 2.4332797527313232, + "eval_runtime": 264.5164, + "eval_samples_per_second": 3.108, + "eval_steps_per_second": 1.554, "step": 4800 }, { "epoch": 0.0965, - "grad_norm": 0.6826328022309806, + "grad_norm": 0.600707218384485, "learning_rate": 9.648000000000001e-06, - "loss": 2.3272, + "loss": 2.3715, "step": 4825 }, { "epoch": 0.097, - "grad_norm": 0.6699367425641755, + "grad_norm": 0.5705494762785608, "learning_rate": 9.698000000000001e-06, - "loss": 2.3259, + "loss": 2.3741, "step": 4850 }, { "epoch": 0.0975, - "grad_norm": 0.7187871648191224, + "grad_norm": 0.5891811727113021, "learning_rate": 9.748e-06, - "loss": 2.3283, + "loss": 2.3738, "step": 4875 }, { "epoch": 0.098, - "grad_norm": 0.7364128547560979, + "grad_norm": 0.5947555260131183, "learning_rate": 9.798e-06, - "loss": 2.3176, + "loss": 2.365, "step": 4900 }, { "epoch": 0.098, - "eval_loss": 2.4474828243255615, - "eval_runtime": 266.4312, - "eval_samples_per_second": 3.085, - "eval_steps_per_second": 1.543, + "eval_loss": 2.433032751083374, + "eval_runtime": 264.6355, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 1.553, "step": 4900 }, { "epoch": 0.0985, - "grad_norm": 1.2246499149220575, + "grad_norm": 0.6055417663185935, "learning_rate": 9.848000000000001e-06, - "loss": 2.327, + "loss": 2.3677, "step": 4925 }, { "epoch": 0.099, - "grad_norm": 0.7443136433045754, + "grad_norm": 0.5803464068069174, "learning_rate": 9.898e-06, - "loss": 2.3258, + "loss": 2.3699, "step": 4950 }, { "epoch": 0.0995, - "grad_norm": 0.7376813348732968, + "grad_norm": 0.5899201870269601, "learning_rate": 9.948e-06, - "loss": 2.3179, + "loss": 2.3685, "step": 4975 }, { "epoch": 0.1, - "grad_norm": 0.6982272728278753, + "grad_norm": 0.6226759838202708, "learning_rate": 9.998000000000002e-06, - "loss": 2.3257, + "loss": 2.3599, "step": 5000 }, { "epoch": 0.1, - "eval_loss": 2.4501407146453857, - "eval_runtime": 266.6865, - "eval_samples_per_second": 3.082, - "eval_steps_per_second": 1.541, + "eval_loss": 2.433412551879883, + "eval_runtime": 279.6783, + "eval_samples_per_second": 2.939, + "eval_steps_per_second": 1.47, "step": 5000 }, { "epoch": 0.1005, - "grad_norm": 0.6681721665421178, + "grad_norm": 0.6129345554278736, "learning_rate": 9.994666666666668e-06, - "loss": 2.3213, + "loss": 2.3651, "step": 5025 }, { "epoch": 0.101, - "grad_norm": 0.6575702354718237, + "grad_norm": 0.5783687106202524, "learning_rate": 9.989111111111111e-06, - "loss": 2.3273, + "loss": 2.3635, "step": 5050 }, { "epoch": 0.1015, - "grad_norm": 0.6971113354828473, + "grad_norm": 0.7886759246703615, "learning_rate": 9.983555555555556e-06, - "loss": 2.3177, + "loss": 2.3688, "step": 5075 }, { "epoch": 0.102, - "grad_norm": 0.6998740128971044, + "grad_norm": 0.5496276670344779, "learning_rate": 9.978000000000002e-06, - "loss": 2.3216, + "loss": 2.3718, "step": 5100 }, { "epoch": 0.102, - "eval_loss": 2.4517312049865723, - "eval_runtime": 266.5502, - "eval_samples_per_second": 3.084, - "eval_steps_per_second": 1.542, + "eval_loss": 2.4336636066436768, + "eval_runtime": 264.0531, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.557, "step": 5100 }, { "epoch": 0.1025, - "grad_norm": 0.7760241850843234, + "grad_norm": 0.596488402670124, "learning_rate": 9.972444444444445e-06, - "loss": 2.3207, + "loss": 2.3654, "step": 5125 }, { "epoch": 0.103, - "grad_norm": 0.6558847315179642, + "grad_norm": 0.5758952191659142, "learning_rate": 9.966888888888889e-06, - "loss": 2.3099, + "loss": 2.3662, "step": 5150 }, { "epoch": 0.1035, - "grad_norm": 0.9064918476165184, + "grad_norm": 0.5714325894660194, "learning_rate": 9.961333333333334e-06, - "loss": 2.311, + "loss": 2.3671, "step": 5175 }, { "epoch": 0.104, - "grad_norm": 0.9360076872893354, + "grad_norm": 0.5826964477363549, "learning_rate": 9.95577777777778e-06, - "loss": 2.3182, + "loss": 2.3621, "step": 5200 }, { "epoch": 0.104, - "eval_loss": 2.4505715370178223, - "eval_runtime": 266.5823, - "eval_samples_per_second": 3.083, - "eval_steps_per_second": 1.542, + "eval_loss": 2.433170795440674, + "eval_runtime": 263.4913, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 1.56, "step": 5200 }, { "epoch": 0.1045, - "grad_norm": 0.674748709590867, + "grad_norm": 0.5939017286545814, "learning_rate": 9.950222222222223e-06, - "loss": 2.3177, + "loss": 2.3704, "step": 5225 }, { "epoch": 0.105, - "grad_norm": 0.6815938545709628, + "grad_norm": 0.5916137818576529, "learning_rate": 9.944666666666668e-06, - "loss": 2.3117, + "loss": 2.3662, "step": 5250 }, { "epoch": 0.1055, - "grad_norm": 0.7208564899692077, + "grad_norm": 0.6105360548349205, "learning_rate": 9.939111111111112e-06, - "loss": 2.3129, + "loss": 2.3646, "step": 5275 }, { "epoch": 0.106, - "grad_norm": 0.6517762207461896, + "grad_norm": 0.5821955662592928, "learning_rate": 9.933555555555557e-06, - "loss": 2.3083, + "loss": 2.365, "step": 5300 }, { "epoch": 0.106, - "eval_loss": 2.451033115386963, - "eval_runtime": 266.739, - "eval_samples_per_second": 3.082, - "eval_steps_per_second": 1.541, + "eval_loss": 2.4327642917633057, + "eval_runtime": 263.745, + "eval_samples_per_second": 3.117, + "eval_steps_per_second": 1.558, "step": 5300 }, { "epoch": 0.1065, - "grad_norm": 0.7209332286785523, + "grad_norm": 0.5805717889494187, "learning_rate": 9.928e-06, - "loss": 2.3215, + "loss": 2.364, "step": 5325 }, { "epoch": 0.107, - "grad_norm": 0.6530480515190894, + "grad_norm": 0.5876895049794754, "learning_rate": 9.922444444444446e-06, - "loss": 2.3087, + "loss": 2.362, "step": 5350 }, { "epoch": 0.1075, - "grad_norm": 0.6611961097877365, + "grad_norm": 0.6258383766876349, "learning_rate": 9.91688888888889e-06, - "loss": 2.3004, + "loss": 2.3654, "step": 5375 }, { "epoch": 0.108, - "grad_norm": 0.6667421121158212, + "grad_norm": 0.5963835367877209, "learning_rate": 9.911333333333335e-06, - "loss": 2.3097, + "loss": 2.3627, "step": 5400 }, { "epoch": 0.108, - "eval_loss": 2.4530234336853027, - "eval_runtime": 266.6052, - "eval_samples_per_second": 3.083, - "eval_steps_per_second": 1.542, + "eval_loss": 2.4326930046081543, + "eval_runtime": 263.2366, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 1.561, "step": 5400 }, { "epoch": 0.1085, - "grad_norm": 0.6941489468595143, + "grad_norm": 0.5827253994353866, "learning_rate": 9.905777777777778e-06, - "loss": 2.3097, + "loss": 2.3703, "step": 5425 }, { "epoch": 0.109, - "grad_norm": 0.6857832561703359, + "grad_norm": 0.571031920084426, "learning_rate": 9.900222222222223e-06, - "loss": 2.3139, + "loss": 2.3671, "step": 5450 }, { "epoch": 0.1095, - "grad_norm": 0.6840488369499558, + "grad_norm": 0.599548806743577, "learning_rate": 9.894666666666669e-06, - "loss": 2.303, + "loss": 2.362, "step": 5475 }, { "epoch": 0.11, - "grad_norm": 0.6802796535790764, + "grad_norm": 0.5736311725646083, "learning_rate": 9.889111111111112e-06, - "loss": 2.3149, + "loss": 2.3622, "step": 5500 }, { "epoch": 0.11, - "eval_loss": 2.452944040298462, - "eval_runtime": 267.9917, - "eval_samples_per_second": 3.067, - "eval_steps_per_second": 1.534, + "eval_loss": 2.4330084323883057, + "eval_runtime": 264.1044, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.556, "step": 5500 }, { "epoch": 0.1105, - "grad_norm": 0.6483920466336737, + "grad_norm": 0.6098672058792028, "learning_rate": 9.883555555555556e-06, - "loss": 2.3111, + "loss": 2.3705, "step": 5525 }, { "epoch": 0.111, - "grad_norm": 0.6695881633643047, + "grad_norm": 0.5761728375832208, "learning_rate": 9.878000000000001e-06, - "loss": 2.31, + "loss": 2.3608, "step": 5550 }, { "epoch": 0.1115, - "grad_norm": 0.6964827034411643, + "grad_norm": 0.5922504560114277, "learning_rate": 9.872444444444446e-06, - "loss": 2.3111, + "loss": 2.3542, "step": 5575 }, { "epoch": 0.112, - "grad_norm": 0.7096404846524412, + "grad_norm": 0.5668795024079605, "learning_rate": 9.86688888888889e-06, - "loss": 2.2995, + "loss": 2.3623, "step": 5600 }, { "epoch": 0.112, - "eval_loss": 2.4539694786071777, - "eval_runtime": 268.5912, - "eval_samples_per_second": 3.06, - "eval_steps_per_second": 1.53, + "eval_loss": 2.432955503463745, + "eval_runtime": 263.8097, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 1.558, "step": 5600 }, { "epoch": 0.1125, - "grad_norm": 0.778866892002705, + "grad_norm": 0.5697809034851604, "learning_rate": 9.861333333333333e-06, - "loss": 2.2951, + "loss": 2.3541, "step": 5625 }, { "epoch": 0.113, - "grad_norm": 0.6534696913167488, + "grad_norm": 0.5740407982821335, "learning_rate": 9.855777777777779e-06, - "loss": 2.2999, + "loss": 2.3594, "step": 5650 }, { "epoch": 0.1135, - "grad_norm": 0.6378284092785932, + "grad_norm": 0.5697372211616294, "learning_rate": 9.850222222222224e-06, - "loss": 2.3005, + "loss": 2.3592, "step": 5675 }, { "epoch": 0.114, - "grad_norm": 0.6797038144836399, + "grad_norm": 0.5845230307189324, "learning_rate": 9.844666666666667e-06, - "loss": 2.3043, + "loss": 2.3456, "step": 5700 }, { "epoch": 0.114, - "eval_loss": 2.4557044506073, - "eval_runtime": 268.6823, - "eval_samples_per_second": 3.059, - "eval_steps_per_second": 1.53, + "eval_loss": 2.432389974594116, + "eval_runtime": 263.8043, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 1.558, "step": 5700 }, { "epoch": 0.1145, - "grad_norm": 1.365902512042536, + "grad_norm": 0.5677067211464538, "learning_rate": 9.839111111111111e-06, - "loss": 2.2916, + "loss": 2.3581, "step": 5725 }, { "epoch": 0.115, - "grad_norm": 0.7361097107736868, + "grad_norm": 0.6024564908699644, "learning_rate": 9.833555555555556e-06, - "loss": 2.3111, + "loss": 2.359, "step": 5750 }, { "epoch": 0.1155, - "grad_norm": 0.7329661017576566, + "grad_norm": 0.5789830837760237, "learning_rate": 9.828000000000001e-06, - "loss": 2.2864, + "loss": 2.36, "step": 5775 }, { "epoch": 0.116, - "grad_norm": 0.7330550696150586, + "grad_norm": 0.5912805339254935, "learning_rate": 9.822444444444445e-06, - "loss": 2.2953, + "loss": 2.3588, "step": 5800 }, { "epoch": 0.116, - "eval_loss": 2.456087589263916, - "eval_runtime": 268.8338, - "eval_samples_per_second": 3.058, - "eval_steps_per_second": 1.529, + "eval_loss": 2.432565689086914, + "eval_runtime": 263.3515, + "eval_samples_per_second": 3.121, + "eval_steps_per_second": 1.561, "step": 5800 }, { "epoch": 0.1165, - "grad_norm": 0.7270812590805177, + "grad_norm": 0.5647440650976697, "learning_rate": 9.81688888888889e-06, - "loss": 2.3084, + "loss": 2.3576, "step": 5825 }, { "epoch": 0.117, - "grad_norm": 0.7391337123301003, + "grad_norm": 0.5673458673735715, "learning_rate": 9.811333333333334e-06, - "loss": 2.2892, + "loss": 2.3616, "step": 5850 }, { "epoch": 0.1175, - "grad_norm": 0.7460062590261749, + "grad_norm": 0.6030082642745155, "learning_rate": 9.805777777777779e-06, - "loss": 2.2902, + "loss": 2.3556, "step": 5875 }, { "epoch": 0.118, - "grad_norm": 0.7111372338674795, + "grad_norm": 0.5571893163840321, "learning_rate": 9.800222222222223e-06, - "loss": 2.3026, + "loss": 2.3557, "step": 5900 }, { "epoch": 0.118, - "eval_loss": 2.4564828872680664, - "eval_runtime": 267.0789, - "eval_samples_per_second": 3.078, - "eval_steps_per_second": 1.539, + "eval_loss": 2.4327075481414795, + "eval_runtime": 263.2657, + "eval_samples_per_second": 3.122, + "eval_steps_per_second": 1.561, "step": 5900 }, { "epoch": 0.1185, - "grad_norm": 0.6928050054331656, + "grad_norm": 0.5716010515949606, "learning_rate": 9.794666666666668e-06, - "loss": 2.2939, + "loss": 2.3616, "step": 5925 }, { "epoch": 0.119, - "grad_norm": 1.1910401129331545, + "grad_norm": 0.6245053681878497, "learning_rate": 9.789111111111111e-06, - "loss": 2.295, + "loss": 2.358, "step": 5950 }, { "epoch": 0.1195, - "grad_norm": 0.7117996271252786, + "grad_norm": 0.5896528100704728, "learning_rate": 9.783555555555557e-06, - "loss": 2.2921, + "loss": 2.355, "step": 5975 }, { "epoch": 0.12, - "grad_norm": 0.7331367166809584, + "grad_norm": 0.5534590488643797, "learning_rate": 9.778e-06, - "loss": 2.288, + "loss": 2.3567, "step": 6000 }, { "epoch": 0.12, - "eval_loss": 2.457658529281616, - "eval_runtime": 279.2078, - "eval_samples_per_second": 2.944, - "eval_steps_per_second": 1.472, + "eval_loss": 2.4327354431152344, + "eval_runtime": 263.9156, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.557, "step": 6000 }, { "epoch": 0.1205, - "grad_norm": 0.7254880982172824, + "grad_norm": 0.5779403883996491, "learning_rate": 9.772444444444445e-06, - "loss": 2.2947, + "loss": 2.3487, "step": 6025 }, { "epoch": 0.121, - "grad_norm": 0.7138587001986453, + "grad_norm": 0.5693494880188505, "learning_rate": 9.76688888888889e-06, - "loss": 2.2925, + "loss": 2.3506, "step": 6050 }, { "epoch": 0.1215, - "grad_norm": 0.7727863613296952, + "grad_norm": 0.5864069751838692, "learning_rate": 9.761333333333334e-06, - "loss": 2.2922, + "loss": 2.3498, "step": 6075 }, { "epoch": 0.122, - "grad_norm": 0.7194518989315374, + "grad_norm": 0.5930208676954954, "learning_rate": 9.755777777777778e-06, - "loss": 2.2885, + "loss": 2.3508, "step": 6100 }, { "epoch": 0.122, - "eval_loss": 2.458888530731201, - "eval_runtime": 266.9076, - "eval_samples_per_second": 3.08, - "eval_steps_per_second": 1.54, + "eval_loss": 2.432914972305298, + "eval_runtime": 263.746, + "eval_samples_per_second": 3.117, + "eval_steps_per_second": 1.558, "step": 6100 }, { "epoch": 0.1225, - "grad_norm": 0.6651864231254107, + "grad_norm": 0.5967532601446782, "learning_rate": 9.750222222222223e-06, - "loss": 2.2894, + "loss": 2.3584, "step": 6125 }, { "epoch": 0.123, - "grad_norm": 0.6725418881387486, + "grad_norm": 0.5670429310236035, "learning_rate": 9.744666666666668e-06, - "loss": 2.2826, + "loss": 2.3584, "step": 6150 }, { "epoch": 0.1235, - "grad_norm": 0.6897551604143717, + "grad_norm": 0.5744482242457726, "learning_rate": 9.739111111111112e-06, - "loss": 2.2884, + "loss": 2.351, "step": 6175 }, { "epoch": 0.124, - "grad_norm": 0.6856370312765101, + "grad_norm": 0.6029007635970692, "learning_rate": 9.733555555555555e-06, - "loss": 2.2947, + "loss": 2.3494, "step": 6200 }, { "epoch": 0.124, - "eval_loss": 2.4586613178253174, - "eval_runtime": 266.769, - "eval_samples_per_second": 3.081, - "eval_steps_per_second": 1.541, + "eval_loss": 2.432878255844116, + "eval_runtime": 263.5842, + "eval_samples_per_second": 3.119, + "eval_steps_per_second": 1.559, "step": 6200 }, { "epoch": 0.1245, - "grad_norm": 0.6754708833947095, + "grad_norm": 0.564399310279196, "learning_rate": 9.728e-06, - "loss": 2.2804, + "loss": 2.3595, "step": 6225 }, { "epoch": 0.125, - "grad_norm": 0.7436157184063352, + "grad_norm": 0.6065670221926927, "learning_rate": 9.722444444444446e-06, - "loss": 2.291, + "loss": 2.3547, "step": 6250 }, { "epoch": 0.1255, - "grad_norm": 1.2289814988726309, + "grad_norm": 0.5659801132085207, "learning_rate": 9.71688888888889e-06, - "loss": 2.2884, + "loss": 2.3511, "step": 6275 }, { "epoch": 0.126, - "grad_norm": 0.6933321663168064, + "grad_norm": 0.5837628069797915, "learning_rate": 9.711333333333333e-06, - "loss": 2.2842, + "loss": 2.3575, "step": 6300 }, { "epoch": 0.126, - "eval_loss": 2.4607903957366943, - "eval_runtime": 267.0225, - "eval_samples_per_second": 3.078, - "eval_steps_per_second": 1.539, + "eval_loss": 2.4329097270965576, + "eval_runtime": 264.6192, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 1.553, "step": 6300 }, { "epoch": 0.1265, - "grad_norm": 0.7046257381734213, + "grad_norm": 0.5760319910919499, "learning_rate": 9.705777777777778e-06, - "loss": 2.2782, + "loss": 2.3488, "step": 6325 }, { "epoch": 0.127, - "grad_norm": 0.8061654046726316, + "grad_norm": 0.5761318046315628, "learning_rate": 9.700222222222224e-06, - "loss": 2.2769, + "loss": 2.3435, "step": 6350 }, { "epoch": 0.1275, - "grad_norm": 1.236389278800273, + "grad_norm": 0.5609369346838009, "learning_rate": 9.694666666666667e-06, - "loss": 2.2785, + "loss": 2.347, "step": 6375 }, { "epoch": 0.128, - "grad_norm": 1.3566480389687148, + "grad_norm": 0.5954461846572633, "learning_rate": 9.68911111111111e-06, - "loss": 2.276, + "loss": 2.3485, "step": 6400 }, { "epoch": 0.128, - "eval_loss": 2.4614193439483643, - "eval_runtime": 267.2774, - "eval_samples_per_second": 3.075, - "eval_steps_per_second": 1.538, + "eval_loss": 2.4333934783935547, + "eval_runtime": 263.5903, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 1.559, "step": 6400 }, { "epoch": 0.1285, - "grad_norm": 0.7076617481097072, + "grad_norm": 0.5524126786458765, "learning_rate": 9.683555555555556e-06, - "loss": 2.2822, + "loss": 2.3514, "step": 6425 }, { "epoch": 0.129, - "grad_norm": 0.7238835326857954, + "grad_norm": 0.5590067107241867, "learning_rate": 9.678000000000001e-06, - "loss": 2.2836, + "loss": 2.3477, "step": 6450 }, { "epoch": 0.1295, - "grad_norm": 0.6956437949619986, + "grad_norm": 0.5578028236930622, "learning_rate": 9.672444444444445e-06, - "loss": 2.2861, + "loss": 2.3434, "step": 6475 }, { "epoch": 0.13, - "grad_norm": 0.6826904740769445, + "grad_norm": 0.6002389478119885, "learning_rate": 9.66688888888889e-06, - "loss": 2.28, + "loss": 2.3415, "step": 6500 }, { "epoch": 0.13, - "eval_loss": 2.461963176727295, - "eval_runtime": 266.9635, - "eval_samples_per_second": 3.079, - "eval_steps_per_second": 1.54, + "eval_loss": 2.433302164077759, + "eval_runtime": 263.4334, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 1.56, "step": 6500 }, { "epoch": 0.1305, - "grad_norm": 0.7354375510465465, + "grad_norm": 0.5868647352323021, "learning_rate": 9.661333333333334e-06, - "loss": 2.2762, + "loss": 2.3532, "step": 6525 }, { "epoch": 0.131, - "grad_norm": 0.7097500898345966, + "grad_norm": 0.5525203092071236, "learning_rate": 9.655777777777779e-06, - "loss": 2.27, + "loss": 2.3439, "step": 6550 }, { "epoch": 0.1315, - "grad_norm": 0.6946565983004674, + "grad_norm": 0.642282300647443, "learning_rate": 9.650222222222222e-06, - "loss": 2.2864, + "loss": 2.333, "step": 6575 }, { "epoch": 0.132, - "grad_norm": 0.7162174051756143, + "grad_norm": 0.5954691746571129, "learning_rate": 9.644666666666668e-06, - "loss": 2.2759, + "loss": 2.3371, "step": 6600 }, { "epoch": 0.132, - "eval_loss": 2.4648051261901855, - "eval_runtime": 267.168, - "eval_samples_per_second": 3.077, - "eval_steps_per_second": 1.538, + "eval_loss": 2.4332070350646973, + "eval_runtime": 263.9928, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.557, "step": 6600 }, { "epoch": 0.1325, - "grad_norm": 0.7076765734841959, + "grad_norm": 0.5696322215994257, "learning_rate": 9.639111111111113e-06, - "loss": 2.27, + "loss": 2.3568, "step": 6625 }, { "epoch": 0.133, - "grad_norm": 0.6927072572875789, + "grad_norm": 0.569783318316734, "learning_rate": 9.633555555555556e-06, - "loss": 2.2634, + "loss": 2.3468, "step": 6650 }, { "epoch": 0.1335, - "grad_norm": 0.7123893564222622, + "grad_norm": 0.5974477984803339, "learning_rate": 9.628e-06, - "loss": 2.2716, + "loss": 2.3369, "step": 6675 }, { "epoch": 0.134, - "grad_norm": 1.793886923890115, + "grad_norm": 0.5850514409957908, "learning_rate": 9.622444444444445e-06, - "loss": 2.2769, + "loss": 2.3328, "step": 6700 }, { "epoch": 0.134, - "eval_loss": 2.463580369949341, - "eval_runtime": 266.9031, - "eval_samples_per_second": 3.08, - "eval_steps_per_second": 1.54, + "eval_loss": 2.4336042404174805, + "eval_runtime": 264.1653, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.556, "step": 6700 }, { "epoch": 0.1345, - "grad_norm": 0.7783341295661176, + "grad_norm": 0.5598567946533984, "learning_rate": 9.61688888888889e-06, - "loss": 2.2732, + "loss": 2.3505, "step": 6725 }, { "epoch": 0.135, - "grad_norm": 0.686324311806464, + "grad_norm": 0.564538169627995, "learning_rate": 9.611333333333334e-06, - "loss": 2.2616, + "loss": 2.3512, "step": 6750 }, { "epoch": 0.1355, - "grad_norm": 0.7017572245338503, + "grad_norm": 0.555057205811747, "learning_rate": 9.605777777777778e-06, - "loss": 2.2727, + "loss": 2.3441, "step": 6775 }, { "epoch": 0.136, - "grad_norm": 0.6704354049818998, + "grad_norm": 0.5928392878820046, "learning_rate": 9.600222222222223e-06, - "loss": 2.265, + "loss": 2.342, "step": 6800 }, { "epoch": 0.136, - "eval_loss": 2.4660842418670654, - "eval_runtime": 267.0219, - "eval_samples_per_second": 3.078, - "eval_steps_per_second": 1.539, + "eval_loss": 2.4332380294799805, + "eval_runtime": 263.6981, + "eval_samples_per_second": 3.117, + "eval_steps_per_second": 1.559, "step": 6800 }, { "epoch": 0.1365, - "grad_norm": 0.6584651224563807, + "grad_norm": 0.580747535991996, "learning_rate": 9.594666666666668e-06, - "loss": 2.2649, + "loss": 2.3402, "step": 6825 }, { "epoch": 0.137, - "grad_norm": 0.7602880717784024, + "grad_norm": 0.5361093856752921, "learning_rate": 9.589111111111112e-06, - "loss": 2.2686, + "loss": 2.3345, "step": 6850 }, { "epoch": 0.1375, - "grad_norm": 0.6853861790224037, + "grad_norm": 0.5764684974648585, "learning_rate": 9.583555555555555e-06, - "loss": 2.2608, + "loss": 2.3434, "step": 6875 }, { "epoch": 0.138, - "grad_norm": 0.6816515996641028, + "grad_norm": 0.5695437902803252, "learning_rate": 9.578e-06, - "loss": 2.2519, + "loss": 2.3345, "step": 6900 }, { "epoch": 0.138, - "eval_loss": 2.4669978618621826, - "eval_runtime": 266.9803, - "eval_samples_per_second": 3.079, - "eval_steps_per_second": 1.539, + "eval_loss": 2.4334897994995117, + "eval_runtime": 263.9042, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.557, "step": 6900 }, { "epoch": 0.1385, - "grad_norm": 0.7860344470255605, + "grad_norm": 0.5856816810807355, "learning_rate": 9.572444444444446e-06, - "loss": 2.272, + "loss": 2.3344, "step": 6925 }, { "epoch": 0.139, - "grad_norm": 0.7032180900006014, + "grad_norm": 0.5692161417871612, "learning_rate": 9.56688888888889e-06, - "loss": 2.266, + "loss": 2.3492, "step": 6950 }, { "epoch": 0.1395, - "grad_norm": 0.8534761769930719, + "grad_norm": 0.5782790626699041, "learning_rate": 9.561333333333333e-06, - "loss": 2.2634, + "loss": 2.3343, "step": 6975 }, { "epoch": 0.14, - "grad_norm": 1.0587470449754097, + "grad_norm": 0.5592348825440727, "learning_rate": 9.555777777777778e-06, - "loss": 2.2648, + "loss": 2.3361, "step": 7000 }, { "epoch": 0.14, - "eval_loss": 2.469151735305786, - "eval_runtime": 267.2244, - "eval_samples_per_second": 3.076, - "eval_steps_per_second": 1.538, + "eval_loss": 2.4338128566741943, + "eval_runtime": 264.0278, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.557, "step": 7000 + }, + { + "epoch": 0.1405, + "grad_norm": 0.5810855929853301, + "learning_rate": 9.550222222222223e-06, + "loss": 2.3397, + "step": 7025 + }, + { + "epoch": 0.141, + "grad_norm": 0.5672444444354668, + "learning_rate": 9.544666666666667e-06, + "loss": 2.3384, + "step": 7050 + }, + { + "epoch": 0.1415, + "grad_norm": 0.649461804794621, + "learning_rate": 9.539111111111112e-06, + "loss": 2.3384, + "step": 7075 + }, + { + "epoch": 0.142, + "grad_norm": 0.5697893925017475, + "learning_rate": 9.533555555555556e-06, + "loss": 2.3415, + "step": 7100 + }, + { + "epoch": 0.142, + "eval_loss": 2.4329330921173096, + "eval_runtime": 263.8408, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 1.558, + "step": 7100 + }, + { + "epoch": 0.1425, + "grad_norm": 0.562192662676289, + "learning_rate": 9.528000000000001e-06, + "loss": 2.3381, + "step": 7125 + }, + { + "epoch": 0.143, + "grad_norm": 0.5782927675061864, + "learning_rate": 9.522444444444444e-06, + "loss": 2.3316, + "step": 7150 + }, + { + "epoch": 0.1435, + "grad_norm": 0.5470889439002048, + "learning_rate": 9.51688888888889e-06, + "loss": 2.3336, + "step": 7175 + }, + { + "epoch": 0.144, + "grad_norm": 0.5732687375919955, + "learning_rate": 9.511333333333335e-06, + "loss": 2.3302, + "step": 7200 + }, + { + "epoch": 0.144, + "eval_loss": 2.4339091777801514, + "eval_runtime": 265.4685, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 1.548, + "step": 7200 + }, + { + "epoch": 0.1445, + "grad_norm": 0.5552677779418167, + "learning_rate": 9.505777777777779e-06, + "loss": 2.3382, + "step": 7225 + }, + { + "epoch": 0.145, + "grad_norm": 0.5597695533114173, + "learning_rate": 9.500222222222222e-06, + "loss": 2.3281, + "step": 7250 + }, + { + "epoch": 0.1455, + "grad_norm": 0.586047229250587, + "learning_rate": 9.494666666666667e-06, + "loss": 2.3365, + "step": 7275 + }, + { + "epoch": 0.146, + "grad_norm": 0.5631697021330876, + "learning_rate": 9.489111111111113e-06, + "loss": 2.3434, + "step": 7300 + }, + { + "epoch": 0.146, + "eval_loss": 2.4337289333343506, + "eval_runtime": 264.0121, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.557, + "step": 7300 + }, + { + "epoch": 0.1465, + "grad_norm": 0.5787283610065107, + "learning_rate": 9.483555555555556e-06, + "loss": 2.3385, + "step": 7325 + }, + { + "epoch": 0.147, + "grad_norm": 0.5894250508009748, + "learning_rate": 9.478e-06, + "loss": 2.3289, + "step": 7350 + }, + { + "epoch": 0.1475, + "grad_norm": 0.5698558287850775, + "learning_rate": 9.472444444444445e-06, + "loss": 2.3363, + "step": 7375 + }, + { + "epoch": 0.148, + "grad_norm": 0.5704695535231787, + "learning_rate": 9.46688888888889e-06, + "loss": 2.3245, + "step": 7400 + }, + { + "epoch": 0.148, + "eval_loss": 2.4338371753692627, + "eval_runtime": 264.1068, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 1.556, + "step": 7400 + }, + { + "epoch": 0.1485, + "grad_norm": 0.5452782996001769, + "learning_rate": 9.461333333333334e-06, + "loss": 2.3442, + "step": 7425 + }, + { + "epoch": 0.149, + "grad_norm": 0.5741037001956839, + "learning_rate": 9.455777777777777e-06, + "loss": 2.3349, + "step": 7450 + }, + { + "epoch": 0.1495, + "grad_norm": 0.5570524045425876, + "learning_rate": 9.450222222222223e-06, + "loss": 2.3324, + "step": 7475 + }, + { + "epoch": 0.15, + "grad_norm": 0.5701333037498688, + "learning_rate": 9.444666666666668e-06, + "loss": 2.3268, + "step": 7500 + }, + { + "epoch": 0.15, + "eval_loss": 2.4347753524780273, + "eval_runtime": 264.1822, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.556, + "step": 7500 + }, + { + "epoch": 0.1505, + "grad_norm": 0.5636194713998469, + "learning_rate": 9.439111111111111e-06, + "loss": 2.3324, + "step": 7525 + }, + { + "epoch": 0.151, + "grad_norm": 0.5745462812172999, + "learning_rate": 9.433555555555557e-06, + "loss": 2.3438, + "step": 7550 + }, + { + "epoch": 0.1515, + "grad_norm": 0.5658180287749817, + "learning_rate": 9.428e-06, + "loss": 2.3272, + "step": 7575 + }, + { + "epoch": 0.152, + "grad_norm": 0.5590021944536283, + "learning_rate": 9.422444444444445e-06, + "loss": 2.3379, + "step": 7600 + }, + { + "epoch": 0.152, + "eval_loss": 2.43342924118042, + "eval_runtime": 264.6073, + "eval_samples_per_second": 3.106, + "eval_steps_per_second": 1.553, + "step": 7600 + }, + { + "epoch": 0.1525, + "grad_norm": 0.5756847823781959, + "learning_rate": 9.41688888888889e-06, + "loss": 2.3291, + "step": 7625 + }, + { + "epoch": 0.153, + "grad_norm": 0.5614727649452073, + "learning_rate": 9.411333333333334e-06, + "loss": 2.3164, + "step": 7650 + }, + { + "epoch": 0.1535, + "grad_norm": 0.581410678990456, + "learning_rate": 9.405777777777778e-06, + "loss": 2.3205, + "step": 7675 + }, + { + "epoch": 0.154, + "grad_norm": 0.6063515370764081, + "learning_rate": 9.400222222222223e-06, + "loss": 2.3331, + "step": 7700 + }, + { + "epoch": 0.154, + "eval_loss": 2.435711622238159, + "eval_runtime": 283.6724, + "eval_samples_per_second": 2.898, + "eval_steps_per_second": 1.449, + "step": 7700 + }, + { + "epoch": 0.1545, + "grad_norm": 0.5535459156675728, + "learning_rate": 9.394666666666668e-06, + "loss": 2.3312, + "step": 7725 + }, + { + "epoch": 0.155, + "grad_norm": 0.5550223235337549, + "learning_rate": 9.389111111111112e-06, + "loss": 2.3222, + "step": 7750 + }, + { + "epoch": 0.1555, + "grad_norm": 0.5661396564004607, + "learning_rate": 9.383555555555557e-06, + "loss": 2.329, + "step": 7775 + }, + { + "epoch": 0.156, + "grad_norm": 0.5754229466302317, + "learning_rate": 9.378e-06, + "loss": 2.3375, + "step": 7800 + }, + { + "epoch": 0.156, + "eval_loss": 2.4339263439178467, + "eval_runtime": 263.7245, + "eval_samples_per_second": 3.117, + "eval_steps_per_second": 1.558, + "step": 7800 + }, + { + "epoch": 0.1565, + "grad_norm": 0.5922113870936093, + "learning_rate": 9.372444444444446e-06, + "loss": 2.3326, + "step": 7825 + }, + { + "epoch": 0.157, + "grad_norm": 0.5802231546249389, + "learning_rate": 9.36688888888889e-06, + "loss": 2.3313, + "step": 7850 + }, + { + "epoch": 0.1575, + "grad_norm": 0.5613750089293277, + "learning_rate": 9.361333333333335e-06, + "loss": 2.3306, + "step": 7875 + }, + { + "epoch": 0.158, + "grad_norm": 0.5554952690049914, + "learning_rate": 9.355777777777778e-06, + "loss": 2.3307, + "step": 7900 + }, + { + "epoch": 0.158, + "eval_loss": 2.435500144958496, + "eval_runtime": 268.1064, + "eval_samples_per_second": 3.066, + "eval_steps_per_second": 1.533, + "step": 7900 + }, + { + "epoch": 0.1585, + "grad_norm": 0.5699743157285643, + "learning_rate": 9.350222222222224e-06, + "loss": 2.3274, + "step": 7925 + }, + { + "epoch": 0.159, + "grad_norm": 0.580771514541295, + "learning_rate": 9.344666666666667e-06, + "loss": 2.3238, + "step": 7950 + }, + { + "epoch": 0.1595, + "grad_norm": 0.563419791930312, + "learning_rate": 9.339111111111112e-06, + "loss": 2.3384, + "step": 7975 + }, + { + "epoch": 0.16, + "grad_norm": 0.5793778749938447, + "learning_rate": 9.333555555555558e-06, + "loss": 2.3291, + "step": 8000 + }, + { + "epoch": 0.16, + "eval_loss": 2.4343531131744385, + "eval_runtime": 263.9111, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.557, + "step": 8000 + }, + { + "epoch": 0.1605, + "grad_norm": 0.5748501940226582, + "learning_rate": 9.328000000000001e-06, + "loss": 2.3272, + "step": 8025 + }, + { + "epoch": 0.161, + "grad_norm": 0.5776520997935511, + "learning_rate": 9.322444444444445e-06, + "loss": 2.3232, + "step": 8050 + }, + { + "epoch": 0.1615, + "grad_norm": 0.5841162716826148, + "learning_rate": 9.31688888888889e-06, + "loss": 2.3252, + "step": 8075 + }, + { + "epoch": 0.162, + "grad_norm": 0.5582161918345583, + "learning_rate": 9.311333333333335e-06, + "loss": 2.3254, + "step": 8100 + }, + { + "epoch": 0.162, + "eval_loss": 2.4345877170562744, + "eval_runtime": 263.9792, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.557, + "step": 8100 + }, + { + "epoch": 0.1625, + "grad_norm": 0.5744381110572562, + "learning_rate": 9.305777777777779e-06, + "loss": 2.325, + "step": 8125 + }, + { + "epoch": 0.163, + "grad_norm": 0.5801402993634438, + "learning_rate": 9.300222222222222e-06, + "loss": 2.3203, + "step": 8150 + }, + { + "epoch": 0.1635, + "grad_norm": 0.5644380448766211, + "learning_rate": 9.294666666666668e-06, + "loss": 2.3179, + "step": 8175 + }, + { + "epoch": 0.164, + "grad_norm": 0.5747041663572834, + "learning_rate": 9.289111111111113e-06, + "loss": 2.3241, + "step": 8200 + }, + { + "epoch": 0.164, + "eval_loss": 2.435701847076416, + "eval_runtime": 263.9699, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.557, + "step": 8200 + }, + { + "epoch": 0.1645, + "grad_norm": 0.5550631701119645, + "learning_rate": 9.283555555555556e-06, + "loss": 2.3176, + "step": 8225 + }, + { + "epoch": 0.165, + "grad_norm": 0.5828828542252756, + "learning_rate": 9.278e-06, + "loss": 2.3213, + "step": 8250 + }, + { + "epoch": 0.1655, + "grad_norm": 0.5610132600982978, + "learning_rate": 9.272444444444445e-06, + "loss": 2.3117, + "step": 8275 + }, + { + "epoch": 0.166, + "grad_norm": 0.5777357931804634, + "learning_rate": 9.26688888888889e-06, + "loss": 2.3189, + "step": 8300 + }, + { + "epoch": 0.166, + "eval_loss": 2.43573260307312, + "eval_runtime": 264.2018, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.556, + "step": 8300 + }, + { + "epoch": 0.1665, + "grad_norm": 0.5515402141694353, + "learning_rate": 9.261333333333334e-06, + "loss": 2.3267, + "step": 8325 + }, + { + "epoch": 0.167, + "grad_norm": 0.588745393922677, + "learning_rate": 9.25577777777778e-06, + "loss": 2.3219, + "step": 8350 + }, + { + "epoch": 0.1675, + "grad_norm": 0.5391388541771018, + "learning_rate": 9.250222222222223e-06, + "loss": 2.3181, + "step": 8375 + }, + { + "epoch": 0.168, + "grad_norm": 0.5680296112961243, + "learning_rate": 9.244666666666668e-06, + "loss": 2.3231, + "step": 8400 + }, + { + "epoch": 0.168, + "eval_loss": 2.435276985168457, + "eval_runtime": 263.8428, + "eval_samples_per_second": 3.115, + "eval_steps_per_second": 1.558, + "step": 8400 + }, + { + "epoch": 0.1685, + "grad_norm": 0.5655802530008279, + "learning_rate": 9.239111111111112e-06, + "loss": 2.3201, + "step": 8425 + }, + { + "epoch": 0.169, + "grad_norm": 0.5917481613153034, + "learning_rate": 9.233555555555557e-06, + "loss": 2.3184, + "step": 8450 + }, + { + "epoch": 0.1695, + "grad_norm": 0.5808853698441179, + "learning_rate": 9.228e-06, + "loss": 2.3151, + "step": 8475 + }, + { + "epoch": 0.17, + "grad_norm": 0.5868551530423814, + "learning_rate": 9.222444444444446e-06, + "loss": 2.3146, + "step": 8500 + }, + { + "epoch": 0.17, + "eval_loss": 2.435950994491577, + "eval_runtime": 264.3586, + "eval_samples_per_second": 3.109, + "eval_steps_per_second": 1.555, + "step": 8500 + }, + { + "epoch": 0.1705, + "grad_norm": 0.5638181149272796, + "learning_rate": 9.21688888888889e-06, + "loss": 2.3155, + "step": 8525 + }, + { + "epoch": 0.171, + "grad_norm": 0.5740285526813199, + "learning_rate": 9.211333333333334e-06, + "loss": 2.319, + "step": 8550 + }, + { + "epoch": 0.1715, + "grad_norm": 0.5695622395648989, + "learning_rate": 9.20577777777778e-06, + "loss": 2.3206, + "step": 8575 + }, + { + "epoch": 0.172, + "grad_norm": 0.5747463636735414, + "learning_rate": 9.200222222222223e-06, + "loss": 2.3111, + "step": 8600 + }, + { + "epoch": 0.172, + "eval_loss": 2.4367878437042236, + "eval_runtime": 264.2061, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.556, + "step": 8600 + }, + { + "epoch": 0.1725, + "grad_norm": 0.5777631704492084, + "learning_rate": 9.194666666666667e-06, + "loss": 2.3078, + "step": 8625 + }, + { + "epoch": 0.173, + "grad_norm": 0.5746886517313039, + "learning_rate": 9.189111111111112e-06, + "loss": 2.3152, + "step": 8650 + }, + { + "epoch": 0.1735, + "grad_norm": 0.564580351173264, + "learning_rate": 9.183555555555557e-06, + "loss": 2.316, + "step": 8675 + }, + { + "epoch": 0.174, + "grad_norm": 0.6048784393681501, + "learning_rate": 9.178000000000001e-06, + "loss": 2.3251, + "step": 8700 + }, + { + "epoch": 0.174, + "eval_loss": 2.435750722885132, + "eval_runtime": 264.296, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 1.555, + "step": 8700 + }, + { + "epoch": 0.1745, + "grad_norm": 0.5769443750882641, + "learning_rate": 9.172444444444444e-06, + "loss": 2.3186, + "step": 8725 + }, + { + "epoch": 0.175, + "grad_norm": 0.5792202067037501, + "learning_rate": 9.16688888888889e-06, + "loss": 2.3106, + "step": 8750 + }, + { + "epoch": 0.1755, + "grad_norm": 0.5819115394572557, + "learning_rate": 9.161333333333335e-06, + "loss": 2.3118, + "step": 8775 + }, + { + "epoch": 0.176, + "grad_norm": 0.575657270210696, + "learning_rate": 9.155777777777779e-06, + "loss": 2.3106, + "step": 8800 + }, + { + "epoch": 0.176, + "eval_loss": 2.436899185180664, + "eval_runtime": 263.9579, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 1.557, + "step": 8800 + }, + { + "epoch": 0.1765, + "grad_norm": 0.572118834452971, + "learning_rate": 9.150222222222222e-06, + "loss": 2.3139, + "step": 8825 + }, + { + "epoch": 0.177, + "grad_norm": 0.5812618278818413, + "learning_rate": 9.144666666666667e-06, + "loss": 2.319, + "step": 8850 + }, + { + "epoch": 0.1775, + "grad_norm": 0.5527533551295488, + "learning_rate": 9.139111111111113e-06, + "loss": 2.3152, + "step": 8875 + }, + { + "epoch": 0.178, + "grad_norm": 0.5749551425231054, + "learning_rate": 9.133555555555556e-06, + "loss": 2.3065, + "step": 8900 + }, + { + "epoch": 0.178, + "eval_loss": 2.4364571571350098, + "eval_runtime": 264.0259, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 1.557, + "step": 8900 + }, + { + "epoch": 0.1785, + "grad_norm": 0.5758182476998225, + "learning_rate": 9.128e-06, + "loss": 2.3104, + "step": 8925 + }, + { + "epoch": 0.179, + "grad_norm": 0.5922756280220078, + "learning_rate": 9.122444444444445e-06, + "loss": 2.3158, + "step": 8950 + }, + { + "epoch": 0.1795, + "grad_norm": 0.5943790910117238, + "learning_rate": 9.11688888888889e-06, + "loss": 2.3167, + "step": 8975 + }, + { + "epoch": 0.18, + "grad_norm": 0.580613992072982, + "learning_rate": 9.111333333333334e-06, + "loss": 2.3069, + "step": 9000 + }, + { + "epoch": 0.18, + "eval_loss": 2.436984062194824, + "eval_runtime": 264.2235, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 1.556, + "step": 9000 } ], "logging_steps": 25, @@ -2547,7 +3267,7 @@ "attributes": {} } }, - "total_flos": 2.2282416088290427e+19, + "total_flos": 2.8648820684944835e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null