| { | |
| "best_global_step": 4300, | |
| "best_metric": 2.432278633117676, | |
| "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-4000", | |
| "epoch": 0.18, | |
| "eval_steps": 100, | |
| "global_step": 9000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005, | |
| "grad_norm": 39.75564521032967, | |
| "learning_rate": 4.8e-08, | |
| "loss": 3.6517, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 28.937531835097435, | |
| "learning_rate": 9.8e-08, | |
| "loss": 3.5931, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0015, | |
| "grad_norm": 21.922720332659644, | |
| "learning_rate": 1.4800000000000003e-07, | |
| "loss": 3.3397, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 8.739610199908325, | |
| "learning_rate": 1.9800000000000003e-07, | |
| "loss": 3.1289, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "eval_loss": 2.9243295192718506, | |
| "eval_runtime": 264.3302, | |
| "eval_samples_per_second": 3.11, | |
| "eval_steps_per_second": 1.555, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 4.433912600039677, | |
| "learning_rate": 2.48e-07, | |
| "loss": 2.8957, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 3.2874790066620303, | |
| "learning_rate": 2.9800000000000005e-07, | |
| "loss": 2.763, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0035, | |
| "grad_norm": 1.5203472215469231, | |
| "learning_rate": 3.48e-07, | |
| "loss": 2.676, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 1.1945541683905954, | |
| "learning_rate": 3.9800000000000004e-07, | |
| "loss": 2.635, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "eval_loss": 2.6094932556152344, | |
| "eval_runtime": 265.7702, | |
| "eval_samples_per_second": 3.093, | |
| "eval_steps_per_second": 1.546, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0045, | |
| "grad_norm": 1.0852713304633745, | |
| "learning_rate": 4.4800000000000004e-07, | |
| "loss": 2.6016, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 1.0733940346699529, | |
| "learning_rate": 4.98e-07, | |
| "loss": 2.5797, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0055, | |
| "grad_norm": 0.9273949035031271, | |
| "learning_rate": 5.480000000000001e-07, | |
| "loss": 2.5607, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 0.9289300678591714, | |
| "learning_rate": 5.98e-07, | |
| "loss": 2.552, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_loss": 2.541522264480591, | |
| "eval_runtime": 266.7478, | |
| "eval_samples_per_second": 3.082, | |
| "eval_steps_per_second": 1.541, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0065, | |
| "grad_norm": 1.1328584507449984, | |
| "learning_rate": 6.48e-07, | |
| "loss": 2.5402, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 0.8593307029257858, | |
| "learning_rate": 6.98e-07, | |
| "loss": 2.5286, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 0.895615604067586, | |
| "learning_rate": 7.480000000000001e-07, | |
| "loss": 2.5311, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 0.912306580242149, | |
| "learning_rate": 7.98e-07, | |
| "loss": 2.5037, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_loss": 2.514389991760254, | |
| "eval_runtime": 266.4899, | |
| "eval_samples_per_second": 3.085, | |
| "eval_steps_per_second": 1.542, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0085, | |
| "grad_norm": 1.1866535514670034, | |
| "learning_rate": 8.480000000000001e-07, | |
| "loss": 2.5011, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 1.211342504193914, | |
| "learning_rate": 8.980000000000001e-07, | |
| "loss": 2.503, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0095, | |
| "grad_norm": 1.113763817383069, | |
| "learning_rate": 9.480000000000001e-07, | |
| "loss": 2.4999, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.2585585589647226, | |
| "learning_rate": 9.98e-07, | |
| "loss": 2.4872, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_loss": 2.497868061065674, | |
| "eval_runtime": 265.7962, | |
| "eval_samples_per_second": 3.093, | |
| "eval_steps_per_second": 1.546, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0105, | |
| "grad_norm": 1.2585825718084245, | |
| "learning_rate": 1.0480000000000002e-06, | |
| "loss": 2.4852, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 1.4101257437846046, | |
| "learning_rate": 1.0980000000000001e-06, | |
| "loss": 2.4892, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0115, | |
| "grad_norm": 1.1975234150707363, | |
| "learning_rate": 1.148e-06, | |
| "loss": 2.4861, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 1.3662769225582332, | |
| "learning_rate": 1.1980000000000002e-06, | |
| "loss": 2.4882, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_loss": 2.4879231452941895, | |
| "eval_runtime": 267.0005, | |
| "eval_samples_per_second": 3.079, | |
| "eval_steps_per_second": 1.539, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 1.3086724275194024, | |
| "learning_rate": 1.248e-06, | |
| "loss": 2.4745, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 1.317023206802888, | |
| "learning_rate": 1.2980000000000001e-06, | |
| "loss": 2.4727, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0135, | |
| "grad_norm": 1.5284967544483212, | |
| "learning_rate": 1.348e-06, | |
| "loss": 2.469, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 1.1047595217316941, | |
| "learning_rate": 1.3980000000000002e-06, | |
| "loss": 2.4695, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_loss": 2.480103015899658, | |
| "eval_runtime": 263.5022, | |
| "eval_samples_per_second": 3.12, | |
| "eval_steps_per_second": 1.56, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0145, | |
| "grad_norm": 1.2077328209863791, | |
| "learning_rate": 1.4480000000000002e-06, | |
| "loss": 2.4654, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 1.209220841771836, | |
| "learning_rate": 1.498e-06, | |
| "loss": 2.4663, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0155, | |
| "grad_norm": 1.3063169829879686, | |
| "learning_rate": 1.548e-06, | |
| "loss": 2.4704, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 1.3180183352683195, | |
| "learning_rate": 1.5980000000000002e-06, | |
| "loss": 2.4583, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_loss": 2.473590850830078, | |
| "eval_runtime": 305.9875, | |
| "eval_samples_per_second": 2.686, | |
| "eval_steps_per_second": 1.343, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.0165, | |
| "grad_norm": 1.1674852380778837, | |
| "learning_rate": 1.6480000000000001e-06, | |
| "loss": 2.467, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 1.2497656349941002, | |
| "learning_rate": 1.6980000000000003e-06, | |
| "loss": 2.4612, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 1.3358614980967494, | |
| "learning_rate": 1.7480000000000002e-06, | |
| "loss": 2.4636, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 1.252489857653356, | |
| "learning_rate": 1.798e-06, | |
| "loss": 2.454, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_loss": 2.4681763648986816, | |
| "eval_runtime": 264.702, | |
| "eval_samples_per_second": 3.105, | |
| "eval_steps_per_second": 1.553, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0185, | |
| "grad_norm": 1.2815437998994337, | |
| "learning_rate": 1.8480000000000001e-06, | |
| "loss": 2.4571, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 1.0902475329451575, | |
| "learning_rate": 1.898e-06, | |
| "loss": 2.451, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0195, | |
| "grad_norm": 1.1502696024965324, | |
| "learning_rate": 1.9480000000000002e-06, | |
| "loss": 2.4527, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.2336661855806117, | |
| "learning_rate": 1.998e-06, | |
| "loss": 2.4496, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 2.463880777359009, | |
| "eval_runtime": 275.7426, | |
| "eval_samples_per_second": 2.981, | |
| "eval_steps_per_second": 1.491, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0205, | |
| "grad_norm": 1.2680742209094296, | |
| "learning_rate": 2.048e-06, | |
| "loss": 2.4494, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 1.0341778808278126, | |
| "learning_rate": 2.098e-06, | |
| "loss": 2.4467, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0215, | |
| "grad_norm": 0.9860490736001175, | |
| "learning_rate": 2.148e-06, | |
| "loss": 2.4473, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 0.9419267295275278, | |
| "learning_rate": 2.198e-06, | |
| "loss": 2.443, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_loss": 2.4598941802978516, | |
| "eval_runtime": 265.0502, | |
| "eval_samples_per_second": 3.101, | |
| "eval_steps_per_second": 1.551, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 1.3280720471027394, | |
| "learning_rate": 2.2480000000000003e-06, | |
| "loss": 2.4515, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 1.053570785582915, | |
| "learning_rate": 2.2980000000000003e-06, | |
| "loss": 2.4396, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0235, | |
| "grad_norm": 0.9108119839585552, | |
| "learning_rate": 2.3480000000000002e-06, | |
| "loss": 2.4442, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 1.0062346367900277, | |
| "learning_rate": 2.398e-06, | |
| "loss": 2.4443, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_loss": 2.456455945968628, | |
| "eval_runtime": 264.5888, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 1.553, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0245, | |
| "grad_norm": 1.0264127705426926, | |
| "learning_rate": 2.448e-06, | |
| "loss": 2.4351, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.8015249588347212, | |
| "learning_rate": 2.498e-06, | |
| "loss": 2.4406, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.0255, | |
| "grad_norm": 1.1105649485540114, | |
| "learning_rate": 2.5480000000000004e-06, | |
| "loss": 2.4377, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.9701758426012801, | |
| "learning_rate": 2.598e-06, | |
| "loss": 2.4341, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_loss": 2.453026056289673, | |
| "eval_runtime": 264.7653, | |
| "eval_samples_per_second": 3.105, | |
| "eval_steps_per_second": 1.552, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.0265, | |
| "grad_norm": 0.9587254891845429, | |
| "learning_rate": 2.648e-06, | |
| "loss": 2.4303, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 0.8135883960763247, | |
| "learning_rate": 2.6980000000000003e-06, | |
| "loss": 2.4363, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.9192860127847176, | |
| "learning_rate": 2.748e-06, | |
| "loss": 2.4257, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 0.947465928893444, | |
| "learning_rate": 2.798e-06, | |
| "loss": 2.4353, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_loss": 2.450345993041992, | |
| "eval_runtime": 265.6266, | |
| "eval_samples_per_second": 3.095, | |
| "eval_steps_per_second": 1.547, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0285, | |
| "grad_norm": 0.9270137901066681, | |
| "learning_rate": 2.848e-06, | |
| "loss": 2.4347, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 0.8839980710491563, | |
| "learning_rate": 2.8980000000000005e-06, | |
| "loss": 2.4213, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0295, | |
| "grad_norm": 0.913196005454606, | |
| "learning_rate": 2.9480000000000004e-06, | |
| "loss": 2.4232, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.8139623858623861, | |
| "learning_rate": 2.9980000000000003e-06, | |
| "loss": 2.4254, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": 2.447662830352783, | |
| "eval_runtime": 263.4353, | |
| "eval_samples_per_second": 3.12, | |
| "eval_steps_per_second": 1.56, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0305, | |
| "grad_norm": 0.8422198221554755, | |
| "learning_rate": 3.0480000000000003e-06, | |
| "loss": 2.4196, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 0.8542957579365906, | |
| "learning_rate": 3.0980000000000007e-06, | |
| "loss": 2.4294, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.0315, | |
| "grad_norm": 1.149263137594797, | |
| "learning_rate": 3.1480000000000006e-06, | |
| "loss": 2.4265, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.811470126240392, | |
| "learning_rate": 3.198e-06, | |
| "loss": 2.4105, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 2.4456679821014404, | |
| "eval_runtime": 264.056, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.556, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 2.3928975221881434, | |
| "learning_rate": 3.248e-06, | |
| "loss": 2.4208, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.8031315125360012, | |
| "learning_rate": 3.298e-06, | |
| "loss": 2.4224, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.0335, | |
| "grad_norm": 0.835567276692195, | |
| "learning_rate": 3.348e-06, | |
| "loss": 2.4188, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 0.8894325175719718, | |
| "learning_rate": 3.3980000000000003e-06, | |
| "loss": 2.4206, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_loss": 2.4437851905822754, | |
| "eval_runtime": 264.6455, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 1.553, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.0345, | |
| "grad_norm": 0.802724390649243, | |
| "learning_rate": 3.4480000000000003e-06, | |
| "loss": 2.4241, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.8206312612014312, | |
| "learning_rate": 3.4980000000000002e-06, | |
| "loss": 2.4157, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.0355, | |
| "grad_norm": 0.8653789917535344, | |
| "learning_rate": 3.548e-06, | |
| "loss": 2.412, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.7816319078215015, | |
| "learning_rate": 3.5980000000000005e-06, | |
| "loss": 2.4179, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_loss": 2.4423036575317383, | |
| "eval_runtime": 264.5578, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 1.554, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0365, | |
| "grad_norm": 0.707594544466941, | |
| "learning_rate": 3.6480000000000005e-06, | |
| "loss": 2.416, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 0.7481066913011816, | |
| "learning_rate": 3.6980000000000004e-06, | |
| "loss": 2.4242, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.7612014979445353, | |
| "learning_rate": 3.7480000000000004e-06, | |
| "loss": 2.4173, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 0.772750918048857, | |
| "learning_rate": 3.7980000000000007e-06, | |
| "loss": 2.4134, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_loss": 2.440969228744507, | |
| "eval_runtime": 274.3624, | |
| "eval_samples_per_second": 2.996, | |
| "eval_steps_per_second": 1.498, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0385, | |
| "grad_norm": 0.7927966042188935, | |
| "learning_rate": 3.848e-06, | |
| "loss": 2.4131, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.7664274167276341, | |
| "learning_rate": 3.898e-06, | |
| "loss": 2.4133, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.0395, | |
| "grad_norm": 0.7038638213491795, | |
| "learning_rate": 3.948e-06, | |
| "loss": 2.4135, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.7231696877425319, | |
| "learning_rate": 3.9980000000000005e-06, | |
| "loss": 2.4169, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 2.439641237258911, | |
| "eval_runtime": 282.4449, | |
| "eval_samples_per_second": 2.91, | |
| "eval_steps_per_second": 1.455, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0405, | |
| "grad_norm": 0.7184393791203537, | |
| "learning_rate": 4.048e-06, | |
| "loss": 2.4071, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 0.7366813467336683, | |
| "learning_rate": 4.098e-06, | |
| "loss": 2.4113, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.0415, | |
| "grad_norm": 0.7081408763220511, | |
| "learning_rate": 4.148000000000001e-06, | |
| "loss": 2.4168, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 0.6912835983850483, | |
| "learning_rate": 4.198e-06, | |
| "loss": 2.4105, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_loss": 2.438904047012329, | |
| "eval_runtime": 277.7481, | |
| "eval_samples_per_second": 2.96, | |
| "eval_steps_per_second": 1.48, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.7745538733736145, | |
| "learning_rate": 4.248000000000001e-06, | |
| "loss": 2.4131, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.6897576190091962, | |
| "learning_rate": 4.298e-06, | |
| "loss": 2.4084, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.0435, | |
| "grad_norm": 0.7020994032566351, | |
| "learning_rate": 4.3480000000000006e-06, | |
| "loss": 2.4125, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 0.6668651869738377, | |
| "learning_rate": 4.398000000000001e-06, | |
| "loss": 2.4034, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_loss": 2.4380908012390137, | |
| "eval_runtime": 268.2252, | |
| "eval_samples_per_second": 3.065, | |
| "eval_steps_per_second": 1.532, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.0445, | |
| "grad_norm": 0.6547759047620061, | |
| "learning_rate": 4.4480000000000004e-06, | |
| "loss": 2.4099, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.6865815945777785, | |
| "learning_rate": 4.498e-06, | |
| "loss": 2.412, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0455, | |
| "grad_norm": 0.6878267781655092, | |
| "learning_rate": 4.548e-06, | |
| "loss": 2.4137, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 0.8314813616644483, | |
| "learning_rate": 4.598e-06, | |
| "loss": 2.4097, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_loss": 2.4374496936798096, | |
| "eval_runtime": 263.1701, | |
| "eval_samples_per_second": 3.123, | |
| "eval_steps_per_second": 1.562, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.0465, | |
| "grad_norm": 0.6723966792931375, | |
| "learning_rate": 4.648e-06, | |
| "loss": 2.4051, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.7003756914046538, | |
| "learning_rate": 4.698000000000001e-06, | |
| "loss": 2.4032, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.6747085415631567, | |
| "learning_rate": 4.748e-06, | |
| "loss": 2.4096, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.6571218540079207, | |
| "learning_rate": 4.7980000000000005e-06, | |
| "loss": 2.4165, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_loss": 2.4365923404693604, | |
| "eval_runtime": 264.2268, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.555, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0485, | |
| "grad_norm": 0.7464314980483315, | |
| "learning_rate": 4.848000000000001e-06, | |
| "loss": 2.4098, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 0.6267266619200393, | |
| "learning_rate": 4.898e-06, | |
| "loss": 2.4019, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.0495, | |
| "grad_norm": 0.6650772680412506, | |
| "learning_rate": 4.948000000000001e-06, | |
| "loss": 2.405, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.7197173899674899, | |
| "learning_rate": 4.998e-06, | |
| "loss": 2.4095, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 2.4358348846435547, | |
| "eval_runtime": 266.7682, | |
| "eval_samples_per_second": 3.081, | |
| "eval_steps_per_second": 1.541, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.0505, | |
| "grad_norm": 0.6249572472256157, | |
| "learning_rate": 5.048000000000001e-06, | |
| "loss": 2.4058, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.7429228032719255, | |
| "learning_rate": 5.098000000000001e-06, | |
| "loss": 2.4084, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.0515, | |
| "grad_norm": 0.6320325962693778, | |
| "learning_rate": 5.1480000000000005e-06, | |
| "loss": 2.4015, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 0.672581755106835, | |
| "learning_rate": 5.198000000000001e-06, | |
| "loss": 2.4051, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_loss": 2.4351842403411865, | |
| "eval_runtime": 264.9149, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 1.551, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.7086480776921088, | |
| "learning_rate": 5.248000000000001e-06, | |
| "loss": 2.3988, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 0.6774201154936552, | |
| "learning_rate": 5.298000000000001e-06, | |
| "loss": 2.394, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.0535, | |
| "grad_norm": 0.6661104910300973, | |
| "learning_rate": 5.348000000000001e-06, | |
| "loss": 2.4034, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.6224421593448741, | |
| "learning_rate": 5.398e-06, | |
| "loss": 2.3939, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_loss": 2.434826374053955, | |
| "eval_runtime": 264.1641, | |
| "eval_samples_per_second": 3.112, | |
| "eval_steps_per_second": 1.556, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.0545, | |
| "grad_norm": 0.6944661408419767, | |
| "learning_rate": 5.448e-06, | |
| "loss": 2.4064, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.6597297955298902, | |
| "learning_rate": 5.498e-06, | |
| "loss": 2.4051, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.0555, | |
| "grad_norm": 0.6526109506522182, | |
| "learning_rate": 5.548e-06, | |
| "loss": 2.4124, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.6528041780055424, | |
| "learning_rate": 5.5980000000000004e-06, | |
| "loss": 2.3979, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_loss": 2.4344167709350586, | |
| "eval_runtime": 264.2924, | |
| "eval_samples_per_second": 3.11, | |
| "eval_steps_per_second": 1.555, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.0565, | |
| "grad_norm": 0.7067565611523313, | |
| "learning_rate": 5.648e-06, | |
| "loss": 2.398, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 0.6416666495903947, | |
| "learning_rate": 5.698e-06, | |
| "loss": 2.3991, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.6605105424774851, | |
| "learning_rate": 5.748e-06, | |
| "loss": 2.3962, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.6308761264530915, | |
| "learning_rate": 5.798e-06, | |
| "loss": 2.4058, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_loss": 2.434436082839966, | |
| "eval_runtime": 265.0112, | |
| "eval_samples_per_second": 3.102, | |
| "eval_steps_per_second": 1.551, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.0585, | |
| "grad_norm": 0.6363649329289001, | |
| "learning_rate": 5.848000000000001e-06, | |
| "loss": 2.3943, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 0.6147983139117156, | |
| "learning_rate": 5.898e-06, | |
| "loss": 2.3982, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.0595, | |
| "grad_norm": 0.611354772141602, | |
| "learning_rate": 5.9480000000000005e-06, | |
| "loss": 2.3921, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.6269054680170398, | |
| "learning_rate": 5.998000000000001e-06, | |
| "loss": 2.392, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 2.433990955352783, | |
| "eval_runtime": 264.2169, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.556, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.0605, | |
| "grad_norm": 0.6248207448228328, | |
| "learning_rate": 6.048e-06, | |
| "loss": 2.3858, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "grad_norm": 0.6275258656299642, | |
| "learning_rate": 6.098000000000001e-06, | |
| "loss": 2.4015, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.0615, | |
| "grad_norm": 1.0457401571274152, | |
| "learning_rate": 6.148e-06, | |
| "loss": 2.3909, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 0.6551230863319748, | |
| "learning_rate": 6.198000000000001e-06, | |
| "loss": 2.3983, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "eval_loss": 2.433279275894165, | |
| "eval_runtime": 264.1521, | |
| "eval_samples_per_second": 3.112, | |
| "eval_steps_per_second": 1.556, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.6306746226297937, | |
| "learning_rate": 6.248000000000001e-06, | |
| "loss": 2.397, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.063, | |
| "grad_norm": 0.6299802316587856, | |
| "learning_rate": 6.2980000000000005e-06, | |
| "loss": 2.4018, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.0635, | |
| "grad_norm": 0.6265424590222634, | |
| "learning_rate": 6.348000000000001e-06, | |
| "loss": 2.4065, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.6717273211615455, | |
| "learning_rate": 6.398000000000001e-06, | |
| "loss": 2.3906, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "eval_loss": 2.4333276748657227, | |
| "eval_runtime": 263.9592, | |
| "eval_samples_per_second": 3.114, | |
| "eval_steps_per_second": 1.557, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.0645, | |
| "grad_norm": 0.6159924635031793, | |
| "learning_rate": 6.448000000000001e-06, | |
| "loss": 2.3947, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.6124462043712093, | |
| "learning_rate": 6.498000000000001e-06, | |
| "loss": 2.3963, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.0655, | |
| "grad_norm": 0.6144378183602921, | |
| "learning_rate": 6.548000000000001e-06, | |
| "loss": 2.402, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "grad_norm": 0.6295732934678283, | |
| "learning_rate": 6.598000000000001e-06, | |
| "loss": 2.3877, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "eval_loss": 2.4331116676330566, | |
| "eval_runtime": 263.4524, | |
| "eval_samples_per_second": 3.12, | |
| "eval_steps_per_second": 1.56, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.0665, | |
| "grad_norm": 0.5938287129149346, | |
| "learning_rate": 6.648e-06, | |
| "loss": 2.389, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.067, | |
| "grad_norm": 0.6194783667871923, | |
| "learning_rate": 6.698e-06, | |
| "loss": 2.39, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 0.60927231594853, | |
| "learning_rate": 6.7480000000000004e-06, | |
| "loss": 2.3968, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 0.6386175333576501, | |
| "learning_rate": 6.798e-06, | |
| "loss": 2.3861, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "eval_loss": 2.4328911304473877, | |
| "eval_runtime": 264.2923, | |
| "eval_samples_per_second": 3.11, | |
| "eval_steps_per_second": 1.555, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.0685, | |
| "grad_norm": 0.6092295027577579, | |
| "learning_rate": 6.848e-06, | |
| "loss": 2.3827, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.069, | |
| "grad_norm": 0.5914846449422462, | |
| "learning_rate": 6.898e-06, | |
| "loss": 2.3894, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.0695, | |
| "grad_norm": 0.5927461214526666, | |
| "learning_rate": 6.948e-06, | |
| "loss": 2.3858, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.5992194088197265, | |
| "learning_rate": 6.998000000000001e-06, | |
| "loss": 2.3941, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_loss": 2.432774543762207, | |
| "eval_runtime": 263.8546, | |
| "eval_samples_per_second": 3.115, | |
| "eval_steps_per_second": 1.558, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0705, | |
| "grad_norm": 0.6119297158568089, | |
| "learning_rate": 7.048e-06, | |
| "loss": 2.3897, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.071, | |
| "grad_norm": 0.6040666217758901, | |
| "learning_rate": 7.0980000000000005e-06, | |
| "loss": 2.3966, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.0715, | |
| "grad_norm": 0.6142925813030266, | |
| "learning_rate": 7.148000000000001e-06, | |
| "loss": 2.3953, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.5857079248330344, | |
| "learning_rate": 7.198e-06, | |
| "loss": 2.3854, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "eval_loss": 2.432868719100952, | |
| "eval_runtime": 264.1849, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.556, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.6075613052530382, | |
| "learning_rate": 7.248000000000001e-06, | |
| "loss": 2.3798, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.073, | |
| "grad_norm": 0.6146043204282547, | |
| "learning_rate": 7.298e-06, | |
| "loss": 2.3894, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.0735, | |
| "grad_norm": 0.613284002341936, | |
| "learning_rate": 7.348000000000001e-06, | |
| "loss": 2.3897, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "grad_norm": 0.6694404263159593, | |
| "learning_rate": 7.398000000000001e-06, | |
| "loss": 2.3925, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "eval_loss": 2.4324021339416504, | |
| "eval_runtime": 263.3107, | |
| "eval_samples_per_second": 3.122, | |
| "eval_steps_per_second": 1.561, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.0745, | |
| "grad_norm": 0.5756401973694445, | |
| "learning_rate": 7.4480000000000005e-06, | |
| "loss": 2.3894, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.5945783703417461, | |
| "learning_rate": 7.498000000000001e-06, | |
| "loss": 2.3928, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.0755, | |
| "grad_norm": 0.5935750222986942, | |
| "learning_rate": 7.548000000000001e-06, | |
| "loss": 2.3774, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 0.5938734543073783, | |
| "learning_rate": 7.598000000000001e-06, | |
| "loss": 2.3776, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "eval_loss": 2.432751178741455, | |
| "eval_runtime": 263.8929, | |
| "eval_samples_per_second": 3.115, | |
| "eval_steps_per_second": 1.557, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.0765, | |
| "grad_norm": 0.595820899700728, | |
| "learning_rate": 7.648e-06, | |
| "loss": 2.3804, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.077, | |
| "grad_norm": 0.6079304106413467, | |
| "learning_rate": 7.698000000000002e-06, | |
| "loss": 2.3917, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 0.6083448146618482, | |
| "learning_rate": 7.748000000000001e-06, | |
| "loss": 2.3842, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "grad_norm": 0.6128893415605828, | |
| "learning_rate": 7.798e-06, | |
| "loss": 2.3806, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "eval_loss": 2.4325239658355713, | |
| "eval_runtime": 263.6693, | |
| "eval_samples_per_second": 3.118, | |
| "eval_steps_per_second": 1.559, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.0785, | |
| "grad_norm": 0.6079041195191952, | |
| "learning_rate": 7.848000000000002e-06, | |
| "loss": 2.3801, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.079, | |
| "grad_norm": 0.6075689821557235, | |
| "learning_rate": 7.898e-06, | |
| "loss": 2.3797, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.0795, | |
| "grad_norm": 0.5882326737716994, | |
| "learning_rate": 7.948e-06, | |
| "loss": 2.3905, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.5828476462223788, | |
| "learning_rate": 7.998e-06, | |
| "loss": 2.3806, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 2.4323527812957764, | |
| "eval_runtime": 263.9786, | |
| "eval_samples_per_second": 3.114, | |
| "eval_steps_per_second": 1.557, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.0805, | |
| "grad_norm": 0.5907927035367586, | |
| "learning_rate": 8.048e-06, | |
| "loss": 2.3739, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.081, | |
| "grad_norm": 0.608189189988593, | |
| "learning_rate": 8.098000000000001e-06, | |
| "loss": 2.3837, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.0815, | |
| "grad_norm": 0.5933025642280234, | |
| "learning_rate": 8.148e-06, | |
| "loss": 2.3814, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "grad_norm": 0.5898305070270532, | |
| "learning_rate": 8.198e-06, | |
| "loss": 2.3854, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "eval_loss": 2.432577610015869, | |
| "eval_runtime": 264.0972, | |
| "eval_samples_per_second": 3.112, | |
| "eval_steps_per_second": 1.556, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 0.5673002921483621, | |
| "learning_rate": 8.248e-06, | |
| "loss": 2.3827, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.083, | |
| "grad_norm": 0.5859186364996516, | |
| "learning_rate": 8.298000000000001e-06, | |
| "loss": 2.3859, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.0835, | |
| "grad_norm": 0.5852893491639726, | |
| "learning_rate": 8.348e-06, | |
| "loss": 2.3711, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 0.5704807601233864, | |
| "learning_rate": 8.398e-06, | |
| "loss": 2.3682, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "eval_loss": 2.4325780868530273, | |
| "eval_runtime": 264.0677, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.556, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.0845, | |
| "grad_norm": 0.565873049775094, | |
| "learning_rate": 8.448000000000001e-06, | |
| "loss": 2.3894, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.6594348238393681, | |
| "learning_rate": 8.498e-06, | |
| "loss": 2.3736, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.0855, | |
| "grad_norm": 0.6114416993962639, | |
| "learning_rate": 8.548e-06, | |
| "loss": 2.3768, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "grad_norm": 0.613007148558132, | |
| "learning_rate": 8.598000000000001e-06, | |
| "loss": 2.3841, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "eval_loss": 2.432278633117676, | |
| "eval_runtime": 264.5455, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 1.554, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.0865, | |
| "grad_norm": 0.6316113111159283, | |
| "learning_rate": 8.648000000000001e-06, | |
| "loss": 2.3853, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.087, | |
| "grad_norm": 0.578758909498954, | |
| "learning_rate": 8.698e-06, | |
| "loss": 2.3838, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.5663796780744771, | |
| "learning_rate": 8.748000000000002e-06, | |
| "loss": 2.3744, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.5996723194508057, | |
| "learning_rate": 8.798000000000001e-06, | |
| "loss": 2.3741, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "eval_loss": 2.4327504634857178, | |
| "eval_runtime": 264.3839, | |
| "eval_samples_per_second": 3.109, | |
| "eval_steps_per_second": 1.555, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.0885, | |
| "grad_norm": 0.5903185672805589, | |
| "learning_rate": 8.848e-06, | |
| "loss": 2.3789, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.089, | |
| "grad_norm": 0.5683354037993711, | |
| "learning_rate": 8.898000000000002e-06, | |
| "loss": 2.3739, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.0895, | |
| "grad_norm": 0.5992802333814672, | |
| "learning_rate": 8.948000000000001e-06, | |
| "loss": 2.3805, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.5951158771681028, | |
| "learning_rate": 8.998000000000001e-06, | |
| "loss": 2.3702, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_loss": 2.432904005050659, | |
| "eval_runtime": 264.0927, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.556, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.0905, | |
| "grad_norm": 0.628437176595306, | |
| "learning_rate": 9.048e-06, | |
| "loss": 2.3705, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.091, | |
| "grad_norm": 0.5852194468933433, | |
| "learning_rate": 9.098000000000002e-06, | |
| "loss": 2.3726, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.0915, | |
| "grad_norm": 0.5832814461503186, | |
| "learning_rate": 9.148e-06, | |
| "loss": 2.3709, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 0.6235298544634128, | |
| "learning_rate": 9.198e-06, | |
| "loss": 2.3823, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "eval_loss": 2.433288335800171, | |
| "eval_runtime": 264.0394, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.557, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 0.6097464410099737, | |
| "learning_rate": 9.248e-06, | |
| "loss": 2.3715, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.093, | |
| "grad_norm": 0.5830918527201829, | |
| "learning_rate": 9.298e-06, | |
| "loss": 2.3694, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.0935, | |
| "grad_norm": 0.6195865573807103, | |
| "learning_rate": 9.348000000000001e-06, | |
| "loss": 2.3711, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "grad_norm": 0.5922485886549429, | |
| "learning_rate": 9.398e-06, | |
| "loss": 2.3764, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "eval_loss": 2.4330477714538574, | |
| "eval_runtime": 263.7501, | |
| "eval_samples_per_second": 3.117, | |
| "eval_steps_per_second": 1.558, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.0945, | |
| "grad_norm": 0.5909566806378528, | |
| "learning_rate": 9.448e-06, | |
| "loss": 2.3799, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.5872189964007283, | |
| "learning_rate": 9.498000000000001e-06, | |
| "loss": 2.3737, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.0955, | |
| "grad_norm": 0.6071714619656263, | |
| "learning_rate": 9.548e-06, | |
| "loss": 2.3789, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.5631342344537085, | |
| "learning_rate": 9.598e-06, | |
| "loss": 2.3641, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "eval_loss": 2.4332797527313232, | |
| "eval_runtime": 264.5164, | |
| "eval_samples_per_second": 3.108, | |
| "eval_steps_per_second": 1.554, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.0965, | |
| "grad_norm": 0.600707218384485, | |
| "learning_rate": 9.648000000000001e-06, | |
| "loss": 2.3715, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.097, | |
| "grad_norm": 0.5705494762785608, | |
| "learning_rate": 9.698000000000001e-06, | |
| "loss": 2.3741, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 0.5891811727113021, | |
| "learning_rate": 9.748e-06, | |
| "loss": 2.3738, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "grad_norm": 0.5947555260131183, | |
| "learning_rate": 9.798e-06, | |
| "loss": 2.365, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "eval_loss": 2.433032751083374, | |
| "eval_runtime": 264.6355, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 1.553, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.0985, | |
| "grad_norm": 0.6055417663185935, | |
| "learning_rate": 9.848000000000001e-06, | |
| "loss": 2.3677, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.099, | |
| "grad_norm": 0.5803464068069174, | |
| "learning_rate": 9.898e-06, | |
| "loss": 2.3699, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.0995, | |
| "grad_norm": 0.5899201870269601, | |
| "learning_rate": 9.948e-06, | |
| "loss": 2.3685, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.6226759838202708, | |
| "learning_rate": 9.998000000000002e-06, | |
| "loss": 2.3599, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 2.433412551879883, | |
| "eval_runtime": 279.6783, | |
| "eval_samples_per_second": 2.939, | |
| "eval_steps_per_second": 1.47, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1005, | |
| "grad_norm": 0.6129345554278736, | |
| "learning_rate": 9.994666666666668e-06, | |
| "loss": 2.3651, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.101, | |
| "grad_norm": 0.5783687106202524, | |
| "learning_rate": 9.989111111111111e-06, | |
| "loss": 2.3635, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.1015, | |
| "grad_norm": 0.7886759246703615, | |
| "learning_rate": 9.983555555555556e-06, | |
| "loss": 2.3688, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "grad_norm": 0.5496276670344779, | |
| "learning_rate": 9.978000000000002e-06, | |
| "loss": 2.3718, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "eval_loss": 2.4336636066436768, | |
| "eval_runtime": 264.0531, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.557, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 0.596488402670124, | |
| "learning_rate": 9.972444444444445e-06, | |
| "loss": 2.3654, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.103, | |
| "grad_norm": 0.5758952191659142, | |
| "learning_rate": 9.966888888888889e-06, | |
| "loss": 2.3662, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.1035, | |
| "grad_norm": 0.5714325894660194, | |
| "learning_rate": 9.961333333333334e-06, | |
| "loss": 2.3671, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.5826964477363549, | |
| "learning_rate": 9.95577777777778e-06, | |
| "loss": 2.3621, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "eval_loss": 2.433170795440674, | |
| "eval_runtime": 263.4913, | |
| "eval_samples_per_second": 3.12, | |
| "eval_steps_per_second": 1.56, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.1045, | |
| "grad_norm": 0.5939017286545814, | |
| "learning_rate": 9.950222222222223e-06, | |
| "loss": 2.3704, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.5916137818576529, | |
| "learning_rate": 9.944666666666668e-06, | |
| "loss": 2.3662, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.1055, | |
| "grad_norm": 0.6105360548349205, | |
| "learning_rate": 9.939111111111112e-06, | |
| "loss": 2.3646, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "grad_norm": 0.5821955662592928, | |
| "learning_rate": 9.933555555555557e-06, | |
| "loss": 2.365, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "eval_loss": 2.4327642917633057, | |
| "eval_runtime": 263.745, | |
| "eval_samples_per_second": 3.117, | |
| "eval_steps_per_second": 1.558, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.1065, | |
| "grad_norm": 0.5805717889494187, | |
| "learning_rate": 9.928e-06, | |
| "loss": 2.364, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.107, | |
| "grad_norm": 0.5876895049794754, | |
| "learning_rate": 9.922444444444446e-06, | |
| "loss": 2.362, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 0.6258383766876349, | |
| "learning_rate": 9.91688888888889e-06, | |
| "loss": 2.3654, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 0.5963835367877209, | |
| "learning_rate": 9.911333333333335e-06, | |
| "loss": 2.3627, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "eval_loss": 2.4326930046081543, | |
| "eval_runtime": 263.2366, | |
| "eval_samples_per_second": 3.123, | |
| "eval_steps_per_second": 1.561, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.1085, | |
| "grad_norm": 0.5827253994353866, | |
| "learning_rate": 9.905777777777778e-06, | |
| "loss": 2.3703, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.109, | |
| "grad_norm": 0.571031920084426, | |
| "learning_rate": 9.900222222222223e-06, | |
| "loss": 2.3671, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.1095, | |
| "grad_norm": 0.599548806743577, | |
| "learning_rate": 9.894666666666669e-06, | |
| "loss": 2.362, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.5736311725646083, | |
| "learning_rate": 9.889111111111112e-06, | |
| "loss": 2.3622, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "eval_loss": 2.4330084323883057, | |
| "eval_runtime": 264.1044, | |
| "eval_samples_per_second": 3.112, | |
| "eval_steps_per_second": 1.556, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.1105, | |
| "grad_norm": 0.6098672058792028, | |
| "learning_rate": 9.883555555555556e-06, | |
| "loss": 2.3705, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 0.111, | |
| "grad_norm": 0.5761728375832208, | |
| "learning_rate": 9.878000000000001e-06, | |
| "loss": 2.3608, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.1115, | |
| "grad_norm": 0.5922504560114277, | |
| "learning_rate": 9.872444444444446e-06, | |
| "loss": 2.3542, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.5668795024079605, | |
| "learning_rate": 9.86688888888889e-06, | |
| "loss": 2.3623, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "eval_loss": 2.432955503463745, | |
| "eval_runtime": 263.8097, | |
| "eval_samples_per_second": 3.116, | |
| "eval_steps_per_second": 1.558, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.5697809034851604, | |
| "learning_rate": 9.861333333333333e-06, | |
| "loss": 2.3541, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 0.113, | |
| "grad_norm": 0.5740407982821335, | |
| "learning_rate": 9.855777777777779e-06, | |
| "loss": 2.3594, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.1135, | |
| "grad_norm": 0.5697372211616294, | |
| "learning_rate": 9.850222222222224e-06, | |
| "loss": 2.3592, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "grad_norm": 0.5845230307189324, | |
| "learning_rate": 9.844666666666667e-06, | |
| "loss": 2.3456, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "eval_loss": 2.432389974594116, | |
| "eval_runtime": 263.8043, | |
| "eval_samples_per_second": 3.116, | |
| "eval_steps_per_second": 1.558, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.1145, | |
| "grad_norm": 0.5677067211464538, | |
| "learning_rate": 9.839111111111111e-06, | |
| "loss": 2.3581, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.6024564908699644, | |
| "learning_rate": 9.833555555555556e-06, | |
| "loss": 2.359, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.1155, | |
| "grad_norm": 0.5789830837760237, | |
| "learning_rate": 9.828000000000001e-06, | |
| "loss": 2.36, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 0.5912805339254935, | |
| "learning_rate": 9.822444444444445e-06, | |
| "loss": 2.3588, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "eval_loss": 2.432565689086914, | |
| "eval_runtime": 263.3515, | |
| "eval_samples_per_second": 3.121, | |
| "eval_steps_per_second": 1.561, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.1165, | |
| "grad_norm": 0.5647440650976697, | |
| "learning_rate": 9.81688888888889e-06, | |
| "loss": 2.3576, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 0.117, | |
| "grad_norm": 0.5673458673735715, | |
| "learning_rate": 9.811333333333334e-06, | |
| "loss": 2.3616, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 0.6030082642745155, | |
| "learning_rate": 9.805777777777779e-06, | |
| "loss": 2.3556, | |
| "step": 5875 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "grad_norm": 0.5571893163840321, | |
| "learning_rate": 9.800222222222223e-06, | |
| "loss": 2.3557, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "eval_loss": 2.4327075481414795, | |
| "eval_runtime": 263.2657, | |
| "eval_samples_per_second": 3.122, | |
| "eval_steps_per_second": 1.561, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.1185, | |
| "grad_norm": 0.5716010515949606, | |
| "learning_rate": 9.794666666666668e-06, | |
| "loss": 2.3616, | |
| "step": 5925 | |
| }, | |
| { | |
| "epoch": 0.119, | |
| "grad_norm": 0.6245053681878497, | |
| "learning_rate": 9.789111111111111e-06, | |
| "loss": 2.358, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.1195, | |
| "grad_norm": 0.5896528100704728, | |
| "learning_rate": 9.783555555555557e-06, | |
| "loss": 2.355, | |
| "step": 5975 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.5534590488643797, | |
| "learning_rate": 9.778e-06, | |
| "loss": 2.3567, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": 2.4327354431152344, | |
| "eval_runtime": 263.9156, | |
| "eval_samples_per_second": 3.115, | |
| "eval_steps_per_second": 1.557, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.1205, | |
| "grad_norm": 0.5779403883996491, | |
| "learning_rate": 9.772444444444445e-06, | |
| "loss": 2.3487, | |
| "step": 6025 | |
| }, | |
| { | |
| "epoch": 0.121, | |
| "grad_norm": 0.5693494880188505, | |
| "learning_rate": 9.76688888888889e-06, | |
| "loss": 2.3506, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.1215, | |
| "grad_norm": 0.5864069751838692, | |
| "learning_rate": 9.761333333333334e-06, | |
| "loss": 2.3498, | |
| "step": 6075 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "grad_norm": 0.5930208676954954, | |
| "learning_rate": 9.755777777777778e-06, | |
| "loss": 2.3508, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "eval_loss": 2.432914972305298, | |
| "eval_runtime": 263.746, | |
| "eval_samples_per_second": 3.117, | |
| "eval_steps_per_second": 1.558, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 0.5967532601446782, | |
| "learning_rate": 9.750222222222223e-06, | |
| "loss": 2.3584, | |
| "step": 6125 | |
| }, | |
| { | |
| "epoch": 0.123, | |
| "grad_norm": 0.5670429310236035, | |
| "learning_rate": 9.744666666666668e-06, | |
| "loss": 2.3584, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.1235, | |
| "grad_norm": 0.5744482242457726, | |
| "learning_rate": 9.739111111111112e-06, | |
| "loss": 2.351, | |
| "step": 6175 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 0.6029007635970692, | |
| "learning_rate": 9.733555555555555e-06, | |
| "loss": 2.3494, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "eval_loss": 2.432878255844116, | |
| "eval_runtime": 263.5842, | |
| "eval_samples_per_second": 3.119, | |
| "eval_steps_per_second": 1.559, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.1245, | |
| "grad_norm": 0.564399310279196, | |
| "learning_rate": 9.728e-06, | |
| "loss": 2.3595, | |
| "step": 6225 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.6065670221926927, | |
| "learning_rate": 9.722444444444446e-06, | |
| "loss": 2.3547, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.1255, | |
| "grad_norm": 0.5659801132085207, | |
| "learning_rate": 9.71688888888889e-06, | |
| "loss": 2.3511, | |
| "step": 6275 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "grad_norm": 0.5837628069797915, | |
| "learning_rate": 9.711333333333333e-06, | |
| "loss": 2.3575, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "eval_loss": 2.4329097270965576, | |
| "eval_runtime": 264.6192, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 1.553, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.1265, | |
| "grad_norm": 0.5760319910919499, | |
| "learning_rate": 9.705777777777778e-06, | |
| "loss": 2.3488, | |
| "step": 6325 | |
| }, | |
| { | |
| "epoch": 0.127, | |
| "grad_norm": 0.5761318046315628, | |
| "learning_rate": 9.700222222222224e-06, | |
| "loss": 2.3435, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 0.5609369346838009, | |
| "learning_rate": 9.694666666666667e-06, | |
| "loss": 2.347, | |
| "step": 6375 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.5954461846572633, | |
| "learning_rate": 9.68911111111111e-06, | |
| "loss": 2.3485, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "eval_loss": 2.4333934783935547, | |
| "eval_runtime": 263.5903, | |
| "eval_samples_per_second": 3.118, | |
| "eval_steps_per_second": 1.559, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.1285, | |
| "grad_norm": 0.5524126786458765, | |
| "learning_rate": 9.683555555555556e-06, | |
| "loss": 2.3514, | |
| "step": 6425 | |
| }, | |
| { | |
| "epoch": 0.129, | |
| "grad_norm": 0.5590067107241867, | |
| "learning_rate": 9.678000000000001e-06, | |
| "loss": 2.3477, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.1295, | |
| "grad_norm": 0.5578028236930622, | |
| "learning_rate": 9.672444444444445e-06, | |
| "loss": 2.3434, | |
| "step": 6475 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.6002389478119885, | |
| "learning_rate": 9.66688888888889e-06, | |
| "loss": 2.3415, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "eval_loss": 2.433302164077759, | |
| "eval_runtime": 263.4334, | |
| "eval_samples_per_second": 3.12, | |
| "eval_steps_per_second": 1.56, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.1305, | |
| "grad_norm": 0.5868647352323021, | |
| "learning_rate": 9.661333333333334e-06, | |
| "loss": 2.3532, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 0.131, | |
| "grad_norm": 0.5525203092071236, | |
| "learning_rate": 9.655777777777779e-06, | |
| "loss": 2.3439, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.1315, | |
| "grad_norm": 0.642282300647443, | |
| "learning_rate": 9.650222222222222e-06, | |
| "loss": 2.333, | |
| "step": 6575 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": 0.5954691746571129, | |
| "learning_rate": 9.644666666666668e-06, | |
| "loss": 2.3371, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "eval_loss": 2.4332070350646973, | |
| "eval_runtime": 263.9928, | |
| "eval_samples_per_second": 3.114, | |
| "eval_steps_per_second": 1.557, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 0.5696322215994257, | |
| "learning_rate": 9.639111111111113e-06, | |
| "loss": 2.3568, | |
| "step": 6625 | |
| }, | |
| { | |
| "epoch": 0.133, | |
| "grad_norm": 0.569783318316734, | |
| "learning_rate": 9.633555555555556e-06, | |
| "loss": 2.3468, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.1335, | |
| "grad_norm": 0.5974477984803339, | |
| "learning_rate": 9.628e-06, | |
| "loss": 2.3369, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 0.134, | |
| "grad_norm": 0.5850514409957908, | |
| "learning_rate": 9.622444444444445e-06, | |
| "loss": 2.3328, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.134, | |
| "eval_loss": 2.4336042404174805, | |
| "eval_runtime": 264.1653, | |
| "eval_samples_per_second": 3.112, | |
| "eval_steps_per_second": 1.556, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.1345, | |
| "grad_norm": 0.5598567946533984, | |
| "learning_rate": 9.61688888888889e-06, | |
| "loss": 2.3505, | |
| "step": 6725 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 0.564538169627995, | |
| "learning_rate": 9.611333333333334e-06, | |
| "loss": 2.3512, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.1355, | |
| "grad_norm": 0.555057205811747, | |
| "learning_rate": 9.605777777777778e-06, | |
| "loss": 2.3441, | |
| "step": 6775 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 0.5928392878820046, | |
| "learning_rate": 9.600222222222223e-06, | |
| "loss": 2.342, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "eval_loss": 2.4332380294799805, | |
| "eval_runtime": 263.6981, | |
| "eval_samples_per_second": 3.117, | |
| "eval_steps_per_second": 1.559, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.1365, | |
| "grad_norm": 0.580747535991996, | |
| "learning_rate": 9.594666666666668e-06, | |
| "loss": 2.3402, | |
| "step": 6825 | |
| }, | |
| { | |
| "epoch": 0.137, | |
| "grad_norm": 0.5361093856752921, | |
| "learning_rate": 9.589111111111112e-06, | |
| "loss": 2.3345, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 0.5764684974648585, | |
| "learning_rate": 9.583555555555555e-06, | |
| "loss": 2.3434, | |
| "step": 6875 | |
| }, | |
| { | |
| "epoch": 0.138, | |
| "grad_norm": 0.5695437902803252, | |
| "learning_rate": 9.578e-06, | |
| "loss": 2.3345, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.138, | |
| "eval_loss": 2.4334897994995117, | |
| "eval_runtime": 263.9042, | |
| "eval_samples_per_second": 3.115, | |
| "eval_steps_per_second": 1.557, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.1385, | |
| "grad_norm": 0.5856816810807355, | |
| "learning_rate": 9.572444444444446e-06, | |
| "loss": 2.3344, | |
| "step": 6925 | |
| }, | |
| { | |
| "epoch": 0.139, | |
| "grad_norm": 0.5692161417871612, | |
| "learning_rate": 9.56688888888889e-06, | |
| "loss": 2.3492, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.1395, | |
| "grad_norm": 0.5782790626699041, | |
| "learning_rate": 9.561333333333333e-06, | |
| "loss": 2.3343, | |
| "step": 6975 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.5592348825440727, | |
| "learning_rate": 9.555777777777778e-06, | |
| "loss": 2.3361, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "eval_loss": 2.4338128566741943, | |
| "eval_runtime": 264.0278, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.557, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.1405, | |
| "grad_norm": 0.5810855929853301, | |
| "learning_rate": 9.550222222222223e-06, | |
| "loss": 2.3397, | |
| "step": 7025 | |
| }, | |
| { | |
| "epoch": 0.141, | |
| "grad_norm": 0.5672444444354668, | |
| "learning_rate": 9.544666666666667e-06, | |
| "loss": 2.3384, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.1415, | |
| "grad_norm": 0.649461804794621, | |
| "learning_rate": 9.539111111111112e-06, | |
| "loss": 2.3384, | |
| "step": 7075 | |
| }, | |
| { | |
| "epoch": 0.142, | |
| "grad_norm": 0.5697893925017475, | |
| "learning_rate": 9.533555555555556e-06, | |
| "loss": 2.3415, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.142, | |
| "eval_loss": 2.4329330921173096, | |
| "eval_runtime": 263.8408, | |
| "eval_samples_per_second": 3.116, | |
| "eval_steps_per_second": 1.558, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 0.562192662676289, | |
| "learning_rate": 9.528000000000001e-06, | |
| "loss": 2.3381, | |
| "step": 7125 | |
| }, | |
| { | |
| "epoch": 0.143, | |
| "grad_norm": 0.5782927675061864, | |
| "learning_rate": 9.522444444444444e-06, | |
| "loss": 2.3316, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.1435, | |
| "grad_norm": 0.5470889439002048, | |
| "learning_rate": 9.51688888888889e-06, | |
| "loss": 2.3336, | |
| "step": 7175 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.5732687375919955, | |
| "learning_rate": 9.511333333333335e-06, | |
| "loss": 2.3302, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "eval_loss": 2.4339091777801514, | |
| "eval_runtime": 265.4685, | |
| "eval_samples_per_second": 3.096, | |
| "eval_steps_per_second": 1.548, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.1445, | |
| "grad_norm": 0.5552677779418167, | |
| "learning_rate": 9.505777777777779e-06, | |
| "loss": 2.3382, | |
| "step": 7225 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 0.5597695533114173, | |
| "learning_rate": 9.500222222222222e-06, | |
| "loss": 2.3281, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.1455, | |
| "grad_norm": 0.586047229250587, | |
| "learning_rate": 9.494666666666667e-06, | |
| "loss": 2.3365, | |
| "step": 7275 | |
| }, | |
| { | |
| "epoch": 0.146, | |
| "grad_norm": 0.5631697021330876, | |
| "learning_rate": 9.489111111111113e-06, | |
| "loss": 2.3434, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.146, | |
| "eval_loss": 2.4337289333343506, | |
| "eval_runtime": 264.0121, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.557, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.1465, | |
| "grad_norm": 0.5787283610065107, | |
| "learning_rate": 9.483555555555556e-06, | |
| "loss": 2.3385, | |
| "step": 7325 | |
| }, | |
| { | |
| "epoch": 0.147, | |
| "grad_norm": 0.5894250508009748, | |
| "learning_rate": 9.478e-06, | |
| "loss": 2.3289, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 0.5698558287850775, | |
| "learning_rate": 9.472444444444445e-06, | |
| "loss": 2.3363, | |
| "step": 7375 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": 0.5704695535231787, | |
| "learning_rate": 9.46688888888889e-06, | |
| "loss": 2.3245, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "eval_loss": 2.4338371753692627, | |
| "eval_runtime": 264.1068, | |
| "eval_samples_per_second": 3.112, | |
| "eval_steps_per_second": 1.556, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.1485, | |
| "grad_norm": 0.5452782996001769, | |
| "learning_rate": 9.461333333333334e-06, | |
| "loss": 2.3442, | |
| "step": 7425 | |
| }, | |
| { | |
| "epoch": 0.149, | |
| "grad_norm": 0.5741037001956839, | |
| "learning_rate": 9.455777777777777e-06, | |
| "loss": 2.3349, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.1495, | |
| "grad_norm": 0.5570524045425876, | |
| "learning_rate": 9.450222222222223e-06, | |
| "loss": 2.3324, | |
| "step": 7475 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.5701333037498688, | |
| "learning_rate": 9.444666666666668e-06, | |
| "loss": 2.3268, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_loss": 2.4347753524780273, | |
| "eval_runtime": 264.1822, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.556, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.1505, | |
| "grad_norm": 0.5636194713998469, | |
| "learning_rate": 9.439111111111111e-06, | |
| "loss": 2.3324, | |
| "step": 7525 | |
| }, | |
| { | |
| "epoch": 0.151, | |
| "grad_norm": 0.5745462812172999, | |
| "learning_rate": 9.433555555555557e-06, | |
| "loss": 2.3438, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.1515, | |
| "grad_norm": 0.5658180287749817, | |
| "learning_rate": 9.428e-06, | |
| "loss": 2.3272, | |
| "step": 7575 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 0.5590021944536283, | |
| "learning_rate": 9.422444444444445e-06, | |
| "loss": 2.3379, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "eval_loss": 2.43342924118042, | |
| "eval_runtime": 264.6073, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 1.553, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 0.5756847823781959, | |
| "learning_rate": 9.41688888888889e-06, | |
| "loss": 2.3291, | |
| "step": 7625 | |
| }, | |
| { | |
| "epoch": 0.153, | |
| "grad_norm": 0.5614727649452073, | |
| "learning_rate": 9.411333333333334e-06, | |
| "loss": 2.3164, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.1535, | |
| "grad_norm": 0.581410678990456, | |
| "learning_rate": 9.405777777777778e-06, | |
| "loss": 2.3205, | |
| "step": 7675 | |
| }, | |
| { | |
| "epoch": 0.154, | |
| "grad_norm": 0.6063515370764081, | |
| "learning_rate": 9.400222222222223e-06, | |
| "loss": 2.3331, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.154, | |
| "eval_loss": 2.435711622238159, | |
| "eval_runtime": 283.6724, | |
| "eval_samples_per_second": 2.898, | |
| "eval_steps_per_second": 1.449, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.1545, | |
| "grad_norm": 0.5535459156675728, | |
| "learning_rate": 9.394666666666668e-06, | |
| "loss": 2.3312, | |
| "step": 7725 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 0.5550223235337549, | |
| "learning_rate": 9.389111111111112e-06, | |
| "loss": 2.3222, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.1555, | |
| "grad_norm": 0.5661396564004607, | |
| "learning_rate": 9.383555555555557e-06, | |
| "loss": 2.329, | |
| "step": 7775 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": 0.5754229466302317, | |
| "learning_rate": 9.378e-06, | |
| "loss": 2.3375, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "eval_loss": 2.4339263439178467, | |
| "eval_runtime": 263.7245, | |
| "eval_samples_per_second": 3.117, | |
| "eval_steps_per_second": 1.558, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.1565, | |
| "grad_norm": 0.5922113870936093, | |
| "learning_rate": 9.372444444444446e-06, | |
| "loss": 2.3326, | |
| "step": 7825 | |
| }, | |
| { | |
| "epoch": 0.157, | |
| "grad_norm": 0.5802231546249389, | |
| "learning_rate": 9.36688888888889e-06, | |
| "loss": 2.3313, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 0.5613750089293277, | |
| "learning_rate": 9.361333333333335e-06, | |
| "loss": 2.3306, | |
| "step": 7875 | |
| }, | |
| { | |
| "epoch": 0.158, | |
| "grad_norm": 0.5554952690049914, | |
| "learning_rate": 9.355777777777778e-06, | |
| "loss": 2.3307, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.158, | |
| "eval_loss": 2.435500144958496, | |
| "eval_runtime": 268.1064, | |
| "eval_samples_per_second": 3.066, | |
| "eval_steps_per_second": 1.533, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.1585, | |
| "grad_norm": 0.5699743157285643, | |
| "learning_rate": 9.350222222222224e-06, | |
| "loss": 2.3274, | |
| "step": 7925 | |
| }, | |
| { | |
| "epoch": 0.159, | |
| "grad_norm": 0.580771514541295, | |
| "learning_rate": 9.344666666666667e-06, | |
| "loss": 2.3238, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.1595, | |
| "grad_norm": 0.563419791930312, | |
| "learning_rate": 9.339111111111112e-06, | |
| "loss": 2.3384, | |
| "step": 7975 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.5793778749938447, | |
| "learning_rate": 9.333555555555558e-06, | |
| "loss": 2.3291, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 2.4343531131744385, | |
| "eval_runtime": 263.9111, | |
| "eval_samples_per_second": 3.115, | |
| "eval_steps_per_second": 1.557, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.1605, | |
| "grad_norm": 0.5748501940226582, | |
| "learning_rate": 9.328000000000001e-06, | |
| "loss": 2.3272, | |
| "step": 8025 | |
| }, | |
| { | |
| "epoch": 0.161, | |
| "grad_norm": 0.5776520997935511, | |
| "learning_rate": 9.322444444444445e-06, | |
| "loss": 2.3232, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.1615, | |
| "grad_norm": 0.5841162716826148, | |
| "learning_rate": 9.31688888888889e-06, | |
| "loss": 2.3252, | |
| "step": 8075 | |
| }, | |
| { | |
| "epoch": 0.162, | |
| "grad_norm": 0.5582161918345583, | |
| "learning_rate": 9.311333333333335e-06, | |
| "loss": 2.3254, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.162, | |
| "eval_loss": 2.4345877170562744, | |
| "eval_runtime": 263.9792, | |
| "eval_samples_per_second": 3.114, | |
| "eval_steps_per_second": 1.557, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.5744381110572562, | |
| "learning_rate": 9.305777777777779e-06, | |
| "loss": 2.325, | |
| "step": 8125 | |
| }, | |
| { | |
| "epoch": 0.163, | |
| "grad_norm": 0.5801402993634438, | |
| "learning_rate": 9.300222222222222e-06, | |
| "loss": 2.3203, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.1635, | |
| "grad_norm": 0.5644380448766211, | |
| "learning_rate": 9.294666666666668e-06, | |
| "loss": 2.3179, | |
| "step": 8175 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": 0.5747041663572834, | |
| "learning_rate": 9.289111111111113e-06, | |
| "loss": 2.3241, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "eval_loss": 2.435701847076416, | |
| "eval_runtime": 263.9699, | |
| "eval_samples_per_second": 3.114, | |
| "eval_steps_per_second": 1.557, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.1645, | |
| "grad_norm": 0.5550631701119645, | |
| "learning_rate": 9.283555555555556e-06, | |
| "loss": 2.3176, | |
| "step": 8225 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.5828828542252756, | |
| "learning_rate": 9.278e-06, | |
| "loss": 2.3213, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.1655, | |
| "grad_norm": 0.5610132600982978, | |
| "learning_rate": 9.272444444444445e-06, | |
| "loss": 2.3117, | |
| "step": 8275 | |
| }, | |
| { | |
| "epoch": 0.166, | |
| "grad_norm": 0.5777357931804634, | |
| "learning_rate": 9.26688888888889e-06, | |
| "loss": 2.3189, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.166, | |
| "eval_loss": 2.43573260307312, | |
| "eval_runtime": 264.2018, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.556, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.1665, | |
| "grad_norm": 0.5515402141694353, | |
| "learning_rate": 9.261333333333334e-06, | |
| "loss": 2.3267, | |
| "step": 8325 | |
| }, | |
| { | |
| "epoch": 0.167, | |
| "grad_norm": 0.588745393922677, | |
| "learning_rate": 9.25577777777778e-06, | |
| "loss": 2.3219, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 0.5391388541771018, | |
| "learning_rate": 9.250222222222223e-06, | |
| "loss": 2.3181, | |
| "step": 8375 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 0.5680296112961243, | |
| "learning_rate": 9.244666666666668e-06, | |
| "loss": 2.3231, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "eval_loss": 2.435276985168457, | |
| "eval_runtime": 263.8428, | |
| "eval_samples_per_second": 3.115, | |
| "eval_steps_per_second": 1.558, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.1685, | |
| "grad_norm": 0.5655802530008279, | |
| "learning_rate": 9.239111111111112e-06, | |
| "loss": 2.3201, | |
| "step": 8425 | |
| }, | |
| { | |
| "epoch": 0.169, | |
| "grad_norm": 0.5917481613153034, | |
| "learning_rate": 9.233555555555557e-06, | |
| "loss": 2.3184, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.1695, | |
| "grad_norm": 0.5808853698441179, | |
| "learning_rate": 9.228e-06, | |
| "loss": 2.3151, | |
| "step": 8475 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.5868551530423814, | |
| "learning_rate": 9.222444444444446e-06, | |
| "loss": 2.3146, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "eval_loss": 2.435950994491577, | |
| "eval_runtime": 264.3586, | |
| "eval_samples_per_second": 3.109, | |
| "eval_steps_per_second": 1.555, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.1705, | |
| "grad_norm": 0.5638181149272796, | |
| "learning_rate": 9.21688888888889e-06, | |
| "loss": 2.3155, | |
| "step": 8525 | |
| }, | |
| { | |
| "epoch": 0.171, | |
| "grad_norm": 0.5740285526813199, | |
| "learning_rate": 9.211333333333334e-06, | |
| "loss": 2.319, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.1715, | |
| "grad_norm": 0.5695622395648989, | |
| "learning_rate": 9.20577777777778e-06, | |
| "loss": 2.3206, | |
| "step": 8575 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": 0.5747463636735414, | |
| "learning_rate": 9.200222222222223e-06, | |
| "loss": 2.3111, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "eval_loss": 2.4367878437042236, | |
| "eval_runtime": 264.2061, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.556, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 0.5777631704492084, | |
| "learning_rate": 9.194666666666667e-06, | |
| "loss": 2.3078, | |
| "step": 8625 | |
| }, | |
| { | |
| "epoch": 0.173, | |
| "grad_norm": 0.5746886517313039, | |
| "learning_rate": 9.189111111111112e-06, | |
| "loss": 2.3152, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.1735, | |
| "grad_norm": 0.564580351173264, | |
| "learning_rate": 9.183555555555557e-06, | |
| "loss": 2.316, | |
| "step": 8675 | |
| }, | |
| { | |
| "epoch": 0.174, | |
| "grad_norm": 0.6048784393681501, | |
| "learning_rate": 9.178000000000001e-06, | |
| "loss": 2.3251, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.174, | |
| "eval_loss": 2.435750722885132, | |
| "eval_runtime": 264.296, | |
| "eval_samples_per_second": 3.11, | |
| "eval_steps_per_second": 1.555, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.1745, | |
| "grad_norm": 0.5769443750882641, | |
| "learning_rate": 9.172444444444444e-06, | |
| "loss": 2.3186, | |
| "step": 8725 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.5792202067037501, | |
| "learning_rate": 9.16688888888889e-06, | |
| "loss": 2.3106, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.1755, | |
| "grad_norm": 0.5819115394572557, | |
| "learning_rate": 9.161333333333335e-06, | |
| "loss": 2.3118, | |
| "step": 8775 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.575657270210696, | |
| "learning_rate": 9.155777777777779e-06, | |
| "loss": 2.3106, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "eval_loss": 2.436899185180664, | |
| "eval_runtime": 263.9579, | |
| "eval_samples_per_second": 3.114, | |
| "eval_steps_per_second": 1.557, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.1765, | |
| "grad_norm": 0.572118834452971, | |
| "learning_rate": 9.150222222222222e-06, | |
| "loss": 2.3139, | |
| "step": 8825 | |
| }, | |
| { | |
| "epoch": 0.177, | |
| "grad_norm": 0.5812618278818413, | |
| "learning_rate": 9.144666666666667e-06, | |
| "loss": 2.319, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 0.5527533551295488, | |
| "learning_rate": 9.139111111111113e-06, | |
| "loss": 2.3152, | |
| "step": 8875 | |
| }, | |
| { | |
| "epoch": 0.178, | |
| "grad_norm": 0.5749551425231054, | |
| "learning_rate": 9.133555555555556e-06, | |
| "loss": 2.3065, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.178, | |
| "eval_loss": 2.4364571571350098, | |
| "eval_runtime": 264.0259, | |
| "eval_samples_per_second": 3.113, | |
| "eval_steps_per_second": 1.557, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.1785, | |
| "grad_norm": 0.5758182476998225, | |
| "learning_rate": 9.128e-06, | |
| "loss": 2.3104, | |
| "step": 8925 | |
| }, | |
| { | |
| "epoch": 0.179, | |
| "grad_norm": 0.5922756280220078, | |
| "learning_rate": 9.122444444444445e-06, | |
| "loss": 2.3158, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.1795, | |
| "grad_norm": 0.5943790910117238, | |
| "learning_rate": 9.11688888888889e-06, | |
| "loss": 2.3167, | |
| "step": 8975 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.580613992072982, | |
| "learning_rate": 9.111333333333334e-06, | |
| "loss": 2.3069, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "eval_loss": 2.436984062194824, | |
| "eval_runtime": 264.2235, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 1.556, | |
| "step": 9000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 50000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.8648820684944835e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |