diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.07998080460689434, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.998080460689434e-05, + "grad_norm": 1.038775086402893, + "learning_rate": 0.0, + "loss": 1.6057, + "memory/device_reserved (GiB)": 69.76, + "memory/max_active (GiB)": 65.79, + "memory/max_allocated (GiB)": 65.79, + "step": 1, + "tokens_per_second_per_gpu": 1277.91, + "total_tokens": 25611 + }, + { + "epoch": 0.00015996160921378868, + "grad_norm": 1.098587155342102, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.6297, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 2, + "tokens_per_second_per_gpu": 16977.66, + "total_tokens": 50942 + }, + { + "epoch": 0.00023994241382068303, + "grad_norm": 0.975591242313385, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.5663, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 3, + "tokens_per_second_per_gpu": 17162.2, + "total_tokens": 76956 + }, + { + "epoch": 0.00031992321842757736, + "grad_norm": 1.0335264205932617, + "learning_rate": 6.000000000000001e-07, + "loss": 1.5648, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 4, + "tokens_per_second_per_gpu": 17404.84, + "total_tokens": 102664 + }, + { + "epoch": 0.0003999040230344717, + "grad_norm": 1.0145632028579712, + "learning_rate": 8.000000000000001e-07, + "loss": 1.5456, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 5, + "tokens_per_second_per_gpu": 16790.34, + "total_tokens": 128081 + }, + { + "epoch": 0.00047988482764136606, + "grad_norm": 1.065081000328064, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.6122, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 6, + "tokens_per_second_per_gpu": 16943.09, + "total_tokens": 153326 + }, + { + "epoch": 0.0005598656322482605, + "grad_norm": 1.0195869207382202, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.5448, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 7, + "tokens_per_second_per_gpu": 16990.25, + "total_tokens": 179126 + }, + { + "epoch": 0.0006398464368551547, + "grad_norm": 1.109100580215454, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.5649, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 8, + "tokens_per_second_per_gpu": 16858.03, + "total_tokens": 204206 + }, + { + "epoch": 0.0007198272414620491, + "grad_norm": 1.0833709239959717, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.6006, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 9, + "tokens_per_second_per_gpu": 16988.61, + "total_tokens": 229672 + }, + { + "epoch": 0.0007998080460689434, + "grad_norm": 1.0193920135498047, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.6112, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 10, + "tokens_per_second_per_gpu": 16988.44, + "total_tokens": 255388 + }, + { + "epoch": 0.0008797888506758378, + "grad_norm": 0.9576646685600281, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5652, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 11, + "tokens_per_second_per_gpu": 17429.3, + "total_tokens": 281853 + }, + { + "epoch": 0.0009597696552827321, + "grad_norm": 1.0531549453735352, + "learning_rate": 2.2e-06, + "loss": 1.561, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 12, + "tokens_per_second_per_gpu": 17523.77, + "total_tokens": 307711 + }, + { + "epoch": 0.0010397504598896265, + "grad_norm": 0.9403714537620544, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.4986, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 13, + "tokens_per_second_per_gpu": 17242.09, + "total_tokens": 333825 + }, + { + "epoch": 0.001119731264496521, + "grad_norm": 1.0354647636413574, + "learning_rate": 2.6e-06, + "loss": 1.503, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 14, + "tokens_per_second_per_gpu": 17361.34, + "total_tokens": 359895 + }, + { + "epoch": 0.0011997120691034152, + "grad_norm": 1.0876542329788208, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.5442, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 15, + "tokens_per_second_per_gpu": 16605.41, + "total_tokens": 384593 + }, + { + "epoch": 0.0012796928737103094, + "grad_norm": 1.1391915082931519, + "learning_rate": 3e-06, + "loss": 1.5995, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 16, + "tokens_per_second_per_gpu": 16636.62, + "total_tokens": 409373 + }, + { + "epoch": 0.0013596736783172039, + "grad_norm": 0.9413732290267944, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.5109, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 17, + "tokens_per_second_per_gpu": 17460.95, + "total_tokens": 435918 + }, + { + "epoch": 0.0014396544829240981, + "grad_norm": 1.0454152822494507, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.5649, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 18, + "tokens_per_second_per_gpu": 16992.1, + "total_tokens": 461473 + }, + { + "epoch": 0.0015196352875309926, + "grad_norm": 1.039425253868103, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.5598, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 19, + "tokens_per_second_per_gpu": 17610.58, + "total_tokens": 487930 + }, + { + "epoch": 0.0015996160921378868, + "grad_norm": 1.0049670934677124, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.5424, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 20, + "tokens_per_second_per_gpu": 17135.21, + "total_tokens": 513747 + }, + { + "epoch": 0.0016795968967447813, + "grad_norm": 1.077114462852478, + "learning_rate": 4.000000000000001e-06, + "loss": 1.5031, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 21, + "tokens_per_second_per_gpu": 16893.74, + "total_tokens": 538891 + }, + { + "epoch": 0.0017595777013516755, + "grad_norm": 1.0136423110961914, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.5642, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 22, + "tokens_per_second_per_gpu": 17259.47, + "total_tokens": 565110 + }, + { + "epoch": 0.00183955850595857, + "grad_norm": 1.0550577640533447, + "learning_rate": 4.4e-06, + "loss": 1.5313, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 23, + "tokens_per_second_per_gpu": 16961.68, + "total_tokens": 590711 + }, + { + "epoch": 0.0019195393105654642, + "grad_norm": 1.0451573133468628, + "learning_rate": 4.600000000000001e-06, + "loss": 1.543, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 24, + "tokens_per_second_per_gpu": 16893.58, + "total_tokens": 616331 + }, + { + "epoch": 0.0019995201151723585, + "grad_norm": 1.1151784658432007, + "learning_rate": 4.800000000000001e-06, + "loss": 1.5585, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 25, + "tokens_per_second_per_gpu": 16911.88, + "total_tokens": 641401 + }, + { + "epoch": 0.002079500919779253, + "grad_norm": 1.0715839862823486, + "learning_rate": 5e-06, + "loss": 1.5898, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 26, + "tokens_per_second_per_gpu": 16957.69, + "total_tokens": 667117 + }, + { + "epoch": 0.0021594817243861474, + "grad_norm": 1.049048900604248, + "learning_rate": 5.2e-06, + "loss": 1.5229, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 27, + "tokens_per_second_per_gpu": 17196.54, + "total_tokens": 693040 + }, + { + "epoch": 0.002239462528993042, + "grad_norm": 1.128364086151123, + "learning_rate": 5.400000000000001e-06, + "loss": 1.5577, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 28, + "tokens_per_second_per_gpu": 16501.39, + "total_tokens": 717589 + }, + { + "epoch": 0.002319443333599936, + "grad_norm": 1.0650986433029175, + "learning_rate": 5.600000000000001e-06, + "loss": 1.5394, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 29, + "tokens_per_second_per_gpu": 16990.67, + "total_tokens": 743423 + }, + { + "epoch": 0.0023994241382068304, + "grad_norm": 1.0103224515914917, + "learning_rate": 5.8e-06, + "loss": 1.4581, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 30, + "tokens_per_second_per_gpu": 17264.37, + "total_tokens": 769708 + }, + { + "epoch": 0.002479404942813725, + "grad_norm": 1.074812650680542, + "learning_rate": 6e-06, + "loss": 1.5418, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 31, + "tokens_per_second_per_gpu": 16900.72, + "total_tokens": 794946 + }, + { + "epoch": 0.002559385747420619, + "grad_norm": 1.1037012338638306, + "learning_rate": 6.200000000000001e-06, + "loss": 1.554, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 32, + "tokens_per_second_per_gpu": 17437.47, + "total_tokens": 821006 + }, + { + "epoch": 0.0026393665520275133, + "grad_norm": 1.056754469871521, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.5, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 33, + "tokens_per_second_per_gpu": 16929.32, + "total_tokens": 846649 + }, + { + "epoch": 0.0027193473566344078, + "grad_norm": 0.9937567710876465, + "learning_rate": 6.600000000000001e-06, + "loss": 1.5208, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 34, + "tokens_per_second_per_gpu": 17016.45, + "total_tokens": 872946 + }, + { + "epoch": 0.0027993281612413022, + "grad_norm": 0.997081458568573, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4191, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 35, + "tokens_per_second_per_gpu": 17202.72, + "total_tokens": 899114 + }, + { + "epoch": 0.0028793089658481963, + "grad_norm": 1.0784165859222412, + "learning_rate": 7e-06, + "loss": 1.4787, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 36, + "tokens_per_second_per_gpu": 17030.42, + "total_tokens": 924446 + }, + { + "epoch": 0.0029592897704550907, + "grad_norm": 1.1074408292770386, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5245, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 37, + "tokens_per_second_per_gpu": 17583.74, + "total_tokens": 950398 + }, + { + "epoch": 0.003039270575061985, + "grad_norm": 1.0375193357467651, + "learning_rate": 7.4e-06, + "loss": 1.4235, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 38, + "tokens_per_second_per_gpu": 16790.68, + "total_tokens": 975601 + }, + { + "epoch": 0.0031192513796688796, + "grad_norm": 1.0000883340835571, + "learning_rate": 7.600000000000001e-06, + "loss": 1.4206, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 39, + "tokens_per_second_per_gpu": 16934.89, + "total_tokens": 1000972 + }, + { + "epoch": 0.0031992321842757737, + "grad_norm": 1.0457230806350708, + "learning_rate": 7.800000000000002e-06, + "loss": 1.4411, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 40, + "tokens_per_second_per_gpu": 14105.13, + "total_tokens": 1026771 + }, + { + "epoch": 0.003279212988882668, + "grad_norm": 0.9556184411048889, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4376, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 41, + "tokens_per_second_per_gpu": 17975.2, + "total_tokens": 1053835 + }, + { + "epoch": 0.0033591937934895626, + "grad_norm": 1.1289631128311157, + "learning_rate": 8.2e-06, + "loss": 1.4106, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 42, + "tokens_per_second_per_gpu": 16485.8, + "total_tokens": 1077561 + }, + { + "epoch": 0.003439174598096457, + "grad_norm": 0.9723970293998718, + "learning_rate": 8.400000000000001e-06, + "loss": 1.4078, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 43, + "tokens_per_second_per_gpu": 16537.68, + "total_tokens": 1102268 + }, + { + "epoch": 0.003519155402703351, + "grad_norm": 1.06087327003479, + "learning_rate": 8.6e-06, + "loss": 1.4605, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 44, + "tokens_per_second_per_gpu": 16487.1, + "total_tokens": 1126547 + }, + { + "epoch": 0.0035991362073102455, + "grad_norm": 0.8268716931343079, + "learning_rate": 8.8e-06, + "loss": 1.3048, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 45, + "tokens_per_second_per_gpu": 17013.09, + "total_tokens": 1152320 + }, + { + "epoch": 0.00367911701191714, + "grad_norm": 0.923682451248169, + "learning_rate": 9e-06, + "loss": 1.3843, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 46, + "tokens_per_second_per_gpu": 16652.7, + "total_tokens": 1177368 + }, + { + "epoch": 0.003759097816524034, + "grad_norm": 0.8878368139266968, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3663, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 47, + "tokens_per_second_per_gpu": 16533.02, + "total_tokens": 1202165 + }, + { + "epoch": 0.0038390786211309285, + "grad_norm": 0.9188768267631531, + "learning_rate": 9.4e-06, + "loss": 1.4352, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 48, + "tokens_per_second_per_gpu": 17016.3, + "total_tokens": 1226827 + }, + { + "epoch": 0.0039190594257378225, + "grad_norm": 0.8016843795776367, + "learning_rate": 9.600000000000001e-06, + "loss": 1.3324, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 49, + "tokens_per_second_per_gpu": 17175.21, + "total_tokens": 1252015 + }, + { + "epoch": 0.003999040230344717, + "grad_norm": 0.8222874999046326, + "learning_rate": 9.800000000000001e-06, + "loss": 1.3224, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 50, + "tokens_per_second_per_gpu": 16548.61, + "total_tokens": 1276515 + }, + { + "epoch": 0.004079021034951611, + "grad_norm": 0.6688214540481567, + "learning_rate": 1e-05, + "loss": 1.2844, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 51, + "tokens_per_second_per_gpu": 17318.33, + "total_tokens": 1302842 + }, + { + "epoch": 0.004159001839558506, + "grad_norm": 0.6350716352462769, + "learning_rate": 1.02e-05, + "loss": 1.2481, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 52, + "tokens_per_second_per_gpu": 17270.15, + "total_tokens": 1329409 + }, + { + "epoch": 0.0042389826441654, + "grad_norm": 0.7399108409881592, + "learning_rate": 1.04e-05, + "loss": 1.3018, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 53, + "tokens_per_second_per_gpu": 16473.95, + "total_tokens": 1353912 + }, + { + "epoch": 0.004318963448772295, + "grad_norm": 0.6630793809890747, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.1977, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 54, + "tokens_per_second_per_gpu": 16858.54, + "total_tokens": 1379028 + }, + { + "epoch": 0.004398944253379189, + "grad_norm": 0.58243727684021, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.3015, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 55, + "tokens_per_second_per_gpu": 17335.57, + "total_tokens": 1405227 + }, + { + "epoch": 0.004478925057986084, + "grad_norm": 0.5659134984016418, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.2674, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 56, + "tokens_per_second_per_gpu": 17241.25, + "total_tokens": 1430933 + }, + { + "epoch": 0.004558905862592977, + "grad_norm": 0.5408620238304138, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.2374, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 57, + "tokens_per_second_per_gpu": 17259.68, + "total_tokens": 1456689 + }, + { + "epoch": 0.004638886667199872, + "grad_norm": 0.49004459381103516, + "learning_rate": 1.14e-05, + "loss": 1.25, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 58, + "tokens_per_second_per_gpu": 16924.62, + "total_tokens": 1482624 + }, + { + "epoch": 0.004718867471806766, + "grad_norm": 0.5233814716339111, + "learning_rate": 1.16e-05, + "loss": 1.1618, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 59, + "tokens_per_second_per_gpu": 16820.35, + "total_tokens": 1507637 + }, + { + "epoch": 0.004798848276413661, + "grad_norm": 0.4358421266078949, + "learning_rate": 1.18e-05, + "loss": 1.2003, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 60, + "tokens_per_second_per_gpu": 17670.45, + "total_tokens": 1534597 + }, + { + "epoch": 0.004878829081020555, + "grad_norm": 0.44443076848983765, + "learning_rate": 1.2e-05, + "loss": 1.2287, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 61, + "tokens_per_second_per_gpu": 16846.22, + "total_tokens": 1559749 + }, + { + "epoch": 0.00495880988562745, + "grad_norm": 0.39861562848091125, + "learning_rate": 1.22e-05, + "loss": 1.1572, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 62, + "tokens_per_second_per_gpu": 17121.54, + "total_tokens": 1585882 + }, + { + "epoch": 0.005038790690234344, + "grad_norm": 0.4339846968650818, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.2128, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 63, + "tokens_per_second_per_gpu": 16583.88, + "total_tokens": 1610180 + }, + { + "epoch": 0.005118771494841238, + "grad_norm": 0.35104724764823914, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.0974, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 64, + "tokens_per_second_per_gpu": 17028.21, + "total_tokens": 1636105 + }, + { + "epoch": 0.005198752299448132, + "grad_norm": 0.3317544162273407, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.1286, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 65, + "tokens_per_second_per_gpu": 17131.02, + "total_tokens": 1661802 + }, + { + "epoch": 0.005278733104055027, + "grad_norm": 0.32120752334594727, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.1658, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 66, + "tokens_per_second_per_gpu": 17810.85, + "total_tokens": 1688553 + }, + { + "epoch": 0.005358713908661921, + "grad_norm": 0.30699560046195984, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.1215, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 67, + "tokens_per_second_per_gpu": 17086.6, + "total_tokens": 1714293 + }, + { + "epoch": 0.0054386947132688155, + "grad_norm": 0.31427061557769775, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.2197, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 68, + "tokens_per_second_per_gpu": 17286.05, + "total_tokens": 1740245 + }, + { + "epoch": 0.00551867551787571, + "grad_norm": 0.3120593726634979, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.1437, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 69, + "tokens_per_second_per_gpu": 16364.4, + "total_tokens": 1764643 + }, + { + "epoch": 0.0055986563224826045, + "grad_norm": 0.27037009596824646, + "learning_rate": 1.38e-05, + "loss": 1.1268, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 70, + "tokens_per_second_per_gpu": 16857.39, + "total_tokens": 1790305 + }, + { + "epoch": 0.005678637127089499, + "grad_norm": 0.30426427721977234, + "learning_rate": 1.4e-05, + "loss": 1.1735, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 71, + "tokens_per_second_per_gpu": 16563.03, + "total_tokens": 1814880 + }, + { + "epoch": 0.0057586179316963925, + "grad_norm": 0.2649443745613098, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.1177, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 72, + "tokens_per_second_per_gpu": 17367.91, + "total_tokens": 1840810 + }, + { + "epoch": 0.005838598736303287, + "grad_norm": 0.253825843334198, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.144, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 73, + "tokens_per_second_per_gpu": 16918.97, + "total_tokens": 1866252 + }, + { + "epoch": 0.005918579540910181, + "grad_norm": 0.2598889172077179, + "learning_rate": 1.46e-05, + "loss": 1.1142, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 74, + "tokens_per_second_per_gpu": 17104.53, + "total_tokens": 1892027 + }, + { + "epoch": 0.005998560345517076, + "grad_norm": 0.2475835680961609, + "learning_rate": 1.48e-05, + "loss": 1.0584, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 75, + "tokens_per_second_per_gpu": 16580.5, + "total_tokens": 1916988 + }, + { + "epoch": 0.00607854115012397, + "grad_norm": 0.25065016746520996, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.1627, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 76, + "tokens_per_second_per_gpu": 17326.24, + "total_tokens": 1943188 + }, + { + "epoch": 0.006158521954730865, + "grad_norm": 0.23243308067321777, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.1042, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 77, + "tokens_per_second_per_gpu": 17705.83, + "total_tokens": 1970033 + }, + { + "epoch": 0.006238502759337759, + "grad_norm": 0.22412195801734924, + "learning_rate": 1.54e-05, + "loss": 1.1211, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 78, + "tokens_per_second_per_gpu": 17159.33, + "total_tokens": 1996140 + }, + { + "epoch": 0.006318483563944653, + "grad_norm": 0.23629942536354065, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.1115, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 79, + "tokens_per_second_per_gpu": 17352.1, + "total_tokens": 2022475 + }, + { + "epoch": 0.006398464368551547, + "grad_norm": 0.24803169071674347, + "learning_rate": 1.58e-05, + "loss": 1.0912, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 80, + "tokens_per_second_per_gpu": 17005.04, + "total_tokens": 2048082 + }, + { + "epoch": 0.006478445173158442, + "grad_norm": 0.24923603236675262, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1203, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 81, + "tokens_per_second_per_gpu": 17025.63, + "total_tokens": 2073687 + }, + { + "epoch": 0.006558425977765336, + "grad_norm": 0.22304023802280426, + "learning_rate": 1.62e-05, + "loss": 1.083, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 82, + "tokens_per_second_per_gpu": 17159.24, + "total_tokens": 2098948 + }, + { + "epoch": 0.006638406782372231, + "grad_norm": 0.21665704250335693, + "learning_rate": 1.64e-05, + "loss": 1.091, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 83, + "tokens_per_second_per_gpu": 16997.08, + "total_tokens": 2125027 + }, + { + "epoch": 0.006718387586979125, + "grad_norm": 0.24451886117458344, + "learning_rate": 1.66e-05, + "loss": 1.1218, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 84, + "tokens_per_second_per_gpu": 17150.21, + "total_tokens": 2150829 + }, + { + "epoch": 0.00679836839158602, + "grad_norm": 0.23331356048583984, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.1073, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 85, + "tokens_per_second_per_gpu": 16603.59, + "total_tokens": 2175685 + }, + { + "epoch": 0.006878349196192914, + "grad_norm": 0.21449171006679535, + "learning_rate": 1.7e-05, + "loss": 1.026, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 86, + "tokens_per_second_per_gpu": 16826.79, + "total_tokens": 2200963 + }, + { + "epoch": 0.006958330000799808, + "grad_norm": 0.22198700904846191, + "learning_rate": 1.72e-05, + "loss": 1.0516, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 87, + "tokens_per_second_per_gpu": 16907.11, + "total_tokens": 2226251 + }, + { + "epoch": 0.007038310805406702, + "grad_norm": 0.21258434653282166, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.0373, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 88, + "tokens_per_second_per_gpu": 16928.44, + "total_tokens": 2251330 + }, + { + "epoch": 0.007118291610013597, + "grad_norm": 0.21518750488758087, + "learning_rate": 1.76e-05, + "loss": 1.0988, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 89, + "tokens_per_second_per_gpu": 16589.13, + "total_tokens": 2276301 + }, + { + "epoch": 0.007198272414620491, + "grad_norm": 0.20537728071212769, + "learning_rate": 1.7800000000000002e-05, + "loss": 1.0329, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 90, + "tokens_per_second_per_gpu": 17318.88, + "total_tokens": 2303225 + }, + { + "epoch": 0.0072782532192273855, + "grad_norm": 0.20714648067951202, + "learning_rate": 1.8e-05, + "loss": 1.0602, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 91, + "tokens_per_second_per_gpu": 16719.92, + "total_tokens": 2328769 + }, + { + "epoch": 0.00735823402383428, + "grad_norm": 0.1941813975572586, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.9607, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 92, + "tokens_per_second_per_gpu": 17066.42, + "total_tokens": 2354588 + }, + { + "epoch": 0.0074382148284411745, + "grad_norm": 0.21113121509552002, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.0515, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 93, + "tokens_per_second_per_gpu": 16788.91, + "total_tokens": 2379900 + }, + { + "epoch": 0.007518195633048068, + "grad_norm": 0.21530379354953766, + "learning_rate": 1.86e-05, + "loss": 1.0066, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 94, + "tokens_per_second_per_gpu": 17013.31, + "total_tokens": 2405023 + }, + { + "epoch": 0.0075981764376549625, + "grad_norm": 0.19766011834144592, + "learning_rate": 1.88e-05, + "loss": 1.059, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 95, + "tokens_per_second_per_gpu": 17033.56, + "total_tokens": 2430914 + }, + { + "epoch": 0.007678157242261857, + "grad_norm": 0.21232014894485474, + "learning_rate": 1.9e-05, + "loss": 1.0619, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 96, + "tokens_per_second_per_gpu": 17229.84, + "total_tokens": 2456462 + }, + { + "epoch": 0.007758138046868751, + "grad_norm": 0.20775918662548065, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.0754, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 97, + "tokens_per_second_per_gpu": 17206.74, + "total_tokens": 2482593 + }, + { + "epoch": 0.007838118851475645, + "grad_norm": 0.19987605512142181, + "learning_rate": 1.94e-05, + "loss": 0.9953, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 98, + "tokens_per_second_per_gpu": 16420.21, + "total_tokens": 2507351 + }, + { + "epoch": 0.00791809965608254, + "grad_norm": 0.2022673338651657, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.0519, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 99, + "tokens_per_second_per_gpu": 17355.34, + "total_tokens": 2533599 + }, + { + "epoch": 0.007998080460689434, + "grad_norm": 0.20371320843696594, + "learning_rate": 1.98e-05, + "loss": 1.0564, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 100, + "tokens_per_second_per_gpu": 16927.47, + "total_tokens": 2558539 + }, + { + "epoch": 0.00807806126529633, + "grad_norm": 0.200734481215477, + "learning_rate": 2e-05, + "loss": 0.9858, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 101, + "tokens_per_second_per_gpu": 16636.02, + "total_tokens": 2583353 + }, + { + "epoch": 0.008158042069903223, + "grad_norm": 0.20651081204414368, + "learning_rate": 1.9999939076577906e-05, + "loss": 1.0509, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 102, + "tokens_per_second_per_gpu": 16987.8, + "total_tokens": 2608864 + }, + { + "epoch": 0.008238022874510118, + "grad_norm": 0.1920926421880722, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.9654, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 103, + "tokens_per_second_per_gpu": 16783.27, + "total_tokens": 2634375 + }, + { + "epoch": 0.008318003679117012, + "grad_norm": 0.19755157828330994, + "learning_rate": 1.9999451693655125e-05, + "loss": 0.9866, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 104, + "tokens_per_second_per_gpu": 17391.35, + "total_tokens": 2659835 + }, + { + "epoch": 0.008397984483723907, + "grad_norm": 0.19291236996650696, + "learning_rate": 1.9999025240093045e-05, + "loss": 1.0565, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 105, + "tokens_per_second_per_gpu": 17339.57, + "total_tokens": 2686150 + }, + { + "epoch": 0.0084779652883308, + "grad_norm": 0.18459810316562653, + "learning_rate": 1.9998476951563914e-05, + "loss": 0.993, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 106, + "tokens_per_second_per_gpu": 17021.35, + "total_tokens": 2711966 + }, + { + "epoch": 0.008557946092937694, + "grad_norm": 0.2000616043806076, + "learning_rate": 1.9997806834748455e-05, + "loss": 1.0645, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 107, + "tokens_per_second_per_gpu": 16954.79, + "total_tokens": 2738061 + }, + { + "epoch": 0.00863792689754459, + "grad_norm": 0.19253303110599518, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.9867, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 108, + "tokens_per_second_per_gpu": 16678.06, + "total_tokens": 2763035 + }, + { + "epoch": 0.008717907702151483, + "grad_norm": 0.22710327804088593, + "learning_rate": 1.9996101150403543e-05, + "loss": 1.0623, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 109, + "tokens_per_second_per_gpu": 16594.39, + "total_tokens": 2788014 + }, + { + "epoch": 0.008797888506758379, + "grad_norm": 0.18572771549224854, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.9652, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 110, + "tokens_per_second_per_gpu": 16515.36, + "total_tokens": 2812864 + }, + { + "epoch": 0.008877869311365272, + "grad_norm": 0.20359967648983002, + "learning_rate": 1.999390827019096e-05, + "loss": 1.0123, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 111, + "tokens_per_second_per_gpu": 17242.77, + "total_tokens": 2838892 + }, + { + "epoch": 0.008957850115972167, + "grad_norm": 0.19035907089710236, + "learning_rate": 1.999262916410621e-05, + "loss": 0.9459, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 112, + "tokens_per_second_per_gpu": 17095.24, + "total_tokens": 2864893 + }, + { + "epoch": 0.009037830920579061, + "grad_norm": 0.19774137437343597, + "learning_rate": 1.9991228300988586e-05, + "loss": 1.0056, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 113, + "tokens_per_second_per_gpu": 17034.09, + "total_tokens": 2890624 + }, + { + "epoch": 0.009117811725185955, + "grad_norm": 0.19346508383750916, + "learning_rate": 1.998970569790715e-05, + "loss": 0.9834, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 114, + "tokens_per_second_per_gpu": 17264.27, + "total_tokens": 2916880 + }, + { + "epoch": 0.00919779252979285, + "grad_norm": 0.19959688186645508, + "learning_rate": 1.9988061373414342e-05, + "loss": 1.0041, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 115, + "tokens_per_second_per_gpu": 16800.76, + "total_tokens": 2941856 + }, + { + "epoch": 0.009277773334399744, + "grad_norm": 0.19120177626609802, + "learning_rate": 1.9986295347545738e-05, + "loss": 0.9453, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 116, + "tokens_per_second_per_gpu": 17213.91, + "total_tokens": 2967939 + }, + { + "epoch": 0.009357754139006639, + "grad_norm": 0.19319495558738708, + "learning_rate": 1.9984407641819812e-05, + "loss": 1.0185, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 117, + "tokens_per_second_per_gpu": 17607.03, + "total_tokens": 2995350 + }, + { + "epoch": 0.009437734943613533, + "grad_norm": 0.19155430793762207, + "learning_rate": 1.9982398279237657e-05, + "loss": 1.0413, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 118, + "tokens_per_second_per_gpu": 17920.03, + "total_tokens": 3022424 + }, + { + "epoch": 0.009517715748220428, + "grad_norm": 0.19833408296108246, + "learning_rate": 1.9980267284282718e-05, + "loss": 1.006, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 119, + "tokens_per_second_per_gpu": 17428.58, + "total_tokens": 3048402 + }, + { + "epoch": 0.009597696552827321, + "grad_norm": 0.19430740177631378, + "learning_rate": 1.9978014682920503e-05, + "loss": 0.9806, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 120, + "tokens_per_second_per_gpu": 16750.28, + "total_tokens": 3073297 + }, + { + "epoch": 0.009677677357434215, + "grad_norm": 0.19468539953231812, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.9751, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 121, + "tokens_per_second_per_gpu": 16610.64, + "total_tokens": 3098335 + }, + { + "epoch": 0.00975765816204111, + "grad_norm": 0.19551995396614075, + "learning_rate": 1.997314477224458e-05, + "loss": 0.9821, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 122, + "tokens_per_second_per_gpu": 17140.77, + "total_tokens": 3123841 + }, + { + "epoch": 0.009837638966648004, + "grad_norm": 0.19409964978694916, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.9219, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 123, + "tokens_per_second_per_gpu": 16856.43, + "total_tokens": 3149208 + }, + { + "epoch": 0.0099176197712549, + "grad_norm": 0.19458907842636108, + "learning_rate": 1.9967788784562474e-05, + "loss": 1.0324, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 124, + "tokens_per_second_per_gpu": 16793.17, + "total_tokens": 3173736 + }, + { + "epoch": 0.009997600575861793, + "grad_norm": 0.19394950568675995, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.9587, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 125, + "tokens_per_second_per_gpu": 16811.72, + "total_tokens": 3198890 + }, + { + "epoch": 0.010077581380468688, + "grad_norm": 0.1940041482448578, + "learning_rate": 1.9961946980917457e-05, + "loss": 1.0112, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 126, + "tokens_per_second_per_gpu": 16986.16, + "total_tokens": 3224595 + }, + { + "epoch": 0.010157562185075582, + "grad_norm": 0.1928212195634842, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.9699, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 127, + "tokens_per_second_per_gpu": 17096.16, + "total_tokens": 3250659 + }, + { + "epoch": 0.010237542989682475, + "grad_norm": 0.19149477779865265, + "learning_rate": 1.99556196460308e-05, + "loss": 0.9941, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 128, + "tokens_per_second_per_gpu": 16975.27, + "total_tokens": 3275998 + }, + { + "epoch": 0.01031752379428937, + "grad_norm": 0.19466781616210938, + "learning_rate": 1.9952273999818312e-05, + "loss": 1.0126, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 129, + "tokens_per_second_per_gpu": 17100.87, + "total_tokens": 3301550 + }, + { + "epoch": 0.010397504598896264, + "grad_norm": 0.19384890794754028, + "learning_rate": 1.9948807088287884e-05, + "loss": 1.0062, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 130, + "tokens_per_second_per_gpu": 17397.69, + "total_tokens": 3327697 + }, + { + "epoch": 0.01047748540350316, + "grad_norm": 0.19235117733478546, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.9573, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 131, + "tokens_per_second_per_gpu": 17408.0, + "total_tokens": 3354295 + }, + { + "epoch": 0.010557466208110053, + "grad_norm": 0.19668954610824585, + "learning_rate": 1.9941509639723155e-05, + "loss": 0.9378, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 132, + "tokens_per_second_per_gpu": 17367.53, + "total_tokens": 3380587 + }, + { + "epoch": 0.010637447012716949, + "grad_norm": 0.19954292476177216, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.9432, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 133, + "tokens_per_second_per_gpu": 16674.36, + "total_tokens": 3405485 + }, + { + "epoch": 0.010717427817323842, + "grad_norm": 0.2069808393716812, + "learning_rate": 1.9933727656003964e-05, + "loss": 0.9526, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 134, + "tokens_per_second_per_gpu": 17416.45, + "total_tokens": 3431579 + }, + { + "epoch": 0.010797408621930737, + "grad_norm": 0.20890875160694122, + "learning_rate": 1.992965508106537e-05, + "loss": 0.9696, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 135, + "tokens_per_second_per_gpu": 17080.6, + "total_tokens": 3457239 + }, + { + "epoch": 0.010877389426537631, + "grad_norm": 0.20000465214252472, + "learning_rate": 1.9925461516413224e-05, + "loss": 0.9747, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 136, + "tokens_per_second_per_gpu": 16945.48, + "total_tokens": 3483044 + }, + { + "epoch": 0.010957370231144525, + "grad_norm": 0.19973015785217285, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.9557, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 137, + "tokens_per_second_per_gpu": 17186.34, + "total_tokens": 3509145 + }, + { + "epoch": 0.01103735103575142, + "grad_norm": 0.206997811794281, + "learning_rate": 1.9916711623830904e-05, + "loss": 0.9155, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 138, + "tokens_per_second_per_gpu": 17008.31, + "total_tokens": 3534434 + }, + { + "epoch": 0.011117331840358314, + "grad_norm": 0.2097865790128708, + "learning_rate": 1.991215540251542e-05, + "loss": 0.9312, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 139, + "tokens_per_second_per_gpu": 16713.43, + "total_tokens": 3558430 + }, + { + "epoch": 0.011197312644965209, + "grad_norm": 0.19931592047214508, + "learning_rate": 1.9907478404714438e-05, + "loss": 0.9547, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 140, + "tokens_per_second_per_gpu": 16953.09, + "total_tokens": 3584023 + }, + { + "epoch": 0.011277293449572103, + "grad_norm": 0.2059127390384674, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.9094, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 141, + "tokens_per_second_per_gpu": 16721.38, + "total_tokens": 3609030 + }, + { + "epoch": 0.011357274254178998, + "grad_norm": 0.20056259632110596, + "learning_rate": 1.989776230907789e-05, + "loss": 0.906, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 142, + "tokens_per_second_per_gpu": 16294.34, + "total_tokens": 3633273 + }, + { + "epoch": 0.011437255058785891, + "grad_norm": 0.2079566866159439, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.9815, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 143, + "tokens_per_second_per_gpu": 17511.65, + "total_tokens": 3659737 + }, + { + "epoch": 0.011517235863392785, + "grad_norm": 0.20305806398391724, + "learning_rate": 1.988756381047006e-05, + "loss": 0.9088, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 144, + "tokens_per_second_per_gpu": 16641.28, + "total_tokens": 3684810 + }, + { + "epoch": 0.01159721666799968, + "grad_norm": 0.19826866686344147, + "learning_rate": 1.988228381446553e-05, + "loss": 0.9416, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 145, + "tokens_per_second_per_gpu": 17656.98, + "total_tokens": 3712176 + }, + { + "epoch": 0.011677197472606574, + "grad_norm": 0.21241246163845062, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.9926, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 146, + "tokens_per_second_per_gpu": 17247.13, + "total_tokens": 3737518 + }, + { + "epoch": 0.01175717827721347, + "grad_norm": 0.21107642352581024, + "learning_rate": 1.987136265072988e-05, + "loss": 0.9405, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 147, + "tokens_per_second_per_gpu": 17433.04, + "total_tokens": 3763871 + }, + { + "epoch": 0.011837159081820363, + "grad_norm": 0.19587242603302002, + "learning_rate": 1.9865721616069695e-05, + "loss": 0.8517, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 148, + "tokens_per_second_per_gpu": 17093.93, + "total_tokens": 3790220 + }, + { + "epoch": 0.011917139886427258, + "grad_norm": 0.20216360688209534, + "learning_rate": 1.985996037070505e-05, + "loss": 0.8985, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 149, + "tokens_per_second_per_gpu": 17259.66, + "total_tokens": 3815740 + }, + { + "epoch": 0.011997120691034152, + "grad_norm": 0.2222292274236679, + "learning_rate": 1.9854078984834904e-05, + "loss": 0.9209, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 150, + "tokens_per_second_per_gpu": 17035.62, + "total_tokens": 3841798 + }, + { + "epoch": 0.012077101495641045, + "grad_norm": 0.21165066957473755, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.9174, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 151, + "tokens_per_second_per_gpu": 16628.36, + "total_tokens": 3866966 + }, + { + "epoch": 0.01215708230024794, + "grad_norm": 0.23638273775577545, + "learning_rate": 1.984195607969242e-05, + "loss": 0.9104, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 152, + "tokens_per_second_per_gpu": 16175.3, + "total_tokens": 3891476 + }, + { + "epoch": 0.012237063104854834, + "grad_norm": 0.20337818562984467, + "learning_rate": 1.983571470813386e-05, + "loss": 0.9093, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 153, + "tokens_per_second_per_gpu": 16886.31, + "total_tokens": 3917465 + }, + { + "epoch": 0.01231704390946173, + "grad_norm": 0.2119511514902115, + "learning_rate": 1.9829353491495545e-05, + "loss": 0.8815, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 154, + "tokens_per_second_per_gpu": 16518.37, + "total_tokens": 3942537 + }, + { + "epoch": 0.012397024714068623, + "grad_norm": 0.205114483833313, + "learning_rate": 1.982287250728689e-05, + "loss": 0.9035, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 155, + "tokens_per_second_per_gpu": 16966.41, + "total_tokens": 3968215 + }, + { + "epoch": 0.012477005518675519, + "grad_norm": 0.21471446752548218, + "learning_rate": 1.9816271834476642e-05, + "loss": 0.9305, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 156, + "tokens_per_second_per_gpu": 16764.47, + "total_tokens": 3993218 + }, + { + "epoch": 0.012556986323282412, + "grad_norm": 0.208131805062294, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.8548, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 157, + "tokens_per_second_per_gpu": 16967.42, + "total_tokens": 4018747 + }, + { + "epoch": 0.012636967127889306, + "grad_norm": 0.2270553857088089, + "learning_rate": 1.9802711746217222e-05, + "loss": 0.9206, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 158, + "tokens_per_second_per_gpu": 17194.18, + "total_tokens": 4044793 + }, + { + "epoch": 0.012716947932496201, + "grad_norm": 0.2232825756072998, + "learning_rate": 1.979575249599344e-05, + "loss": 0.9185, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 159, + "tokens_per_second_per_gpu": 16239.69, + "total_tokens": 4069372 + }, + { + "epoch": 0.012796928737103095, + "grad_norm": 0.22711730003356934, + "learning_rate": 1.9788673887616852e-05, + "loss": 0.8979, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 160, + "tokens_per_second_per_gpu": 17230.88, + "total_tokens": 4094803 + }, + { + "epoch": 0.01287690954170999, + "grad_norm": 0.23493967950344086, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.9238, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 161, + "tokens_per_second_per_gpu": 17095.3, + "total_tokens": 4119931 + }, + { + "epoch": 0.012956890346316884, + "grad_norm": 0.24412371218204498, + "learning_rate": 1.9774158942860962e-05, + "loss": 0.8189, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 162, + "tokens_per_second_per_gpu": 16862.59, + "total_tokens": 4145075 + }, + { + "epoch": 0.013036871150923779, + "grad_norm": 0.228457972407341, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.9137, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 163, + "tokens_per_second_per_gpu": 16955.72, + "total_tokens": 4170710 + }, + { + "epoch": 0.013116851955530672, + "grad_norm": 0.23934195935726166, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.9302, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 164, + "tokens_per_second_per_gpu": 16683.88, + "total_tokens": 4195279 + }, + { + "epoch": 0.013196832760137568, + "grad_norm": 0.23014573752880096, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.9042, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 165, + "tokens_per_second_per_gpu": 16489.02, + "total_tokens": 4219744 + }, + { + "epoch": 0.013276813564744461, + "grad_norm": 0.230689138174057, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.9264, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 166, + "tokens_per_second_per_gpu": 17158.8, + "total_tokens": 4245795 + }, + { + "epoch": 0.013356794369351355, + "grad_norm": 0.23245392739772797, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.8792, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 167, + "tokens_per_second_per_gpu": 16252.68, + "total_tokens": 4270401 + }, + { + "epoch": 0.01343677517395825, + "grad_norm": 0.43715667724609375, + "learning_rate": 1.972775878209397e-05, + "loss": 0.8955, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 168, + "tokens_per_second_per_gpu": 16531.81, + "total_tokens": 4295082 + }, + { + "epoch": 0.013516755978565144, + "grad_norm": 0.2350732386112213, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.8973, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 169, + "tokens_per_second_per_gpu": 16782.69, + "total_tokens": 4319861 + }, + { + "epoch": 0.01359673678317204, + "grad_norm": 0.23498980700969696, + "learning_rate": 1.971134279909636e-05, + "loss": 0.949, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 170, + "tokens_per_second_per_gpu": 17103.87, + "total_tokens": 4345416 + }, + { + "epoch": 0.013676717587778933, + "grad_norm": 0.22032824158668518, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.89, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 171, + "tokens_per_second_per_gpu": 16754.18, + "total_tokens": 4370945 + }, + { + "epoch": 0.013756698392385828, + "grad_norm": 0.22958935797214508, + "learning_rate": 1.9694453498951392e-05, + "loss": 0.9186, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 172, + "tokens_per_second_per_gpu": 17127.91, + "total_tokens": 4396853 + }, + { + "epoch": 0.013836679196992722, + "grad_norm": 0.24071195721626282, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.962, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 173, + "tokens_per_second_per_gpu": 17406.78, + "total_tokens": 4422628 + }, + { + "epoch": 0.013916660001599615, + "grad_norm": 0.2658619284629822, + "learning_rate": 1.9677091704819714e-05, + "loss": 0.9132, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 174, + "tokens_per_second_per_gpu": 16846.42, + "total_tokens": 4448035 + }, + { + "epoch": 0.01399664080620651, + "grad_norm": 0.23434384167194366, + "learning_rate": 1.9668233886044597e-05, + "loss": 0.8774, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 175, + "tokens_per_second_per_gpu": 17076.79, + "total_tokens": 4473972 + }, + { + "epoch": 0.014076621610813404, + "grad_norm": 0.24713198840618134, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.905, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 176, + "tokens_per_second_per_gpu": 16949.62, + "total_tokens": 4498614 + }, + { + "epoch": 0.0141566024154203, + "grad_norm": 0.2283277064561844, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.8451, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 177, + "tokens_per_second_per_gpu": 16328.89, + "total_tokens": 4522729 + }, + { + "epoch": 0.014236583220027193, + "grad_norm": 0.23061935603618622, + "learning_rate": 1.96409540423411e-05, + "loss": 0.9068, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 178, + "tokens_per_second_per_gpu": 17338.63, + "total_tokens": 4549252 + }, + { + "epoch": 0.014316564024634089, + "grad_norm": 0.24155394732952118, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.9077, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 179, + "tokens_per_second_per_gpu": 17231.25, + "total_tokens": 4575502 + }, + { + "epoch": 0.014396544829240982, + "grad_norm": 0.24086996912956238, + "learning_rate": 1.9622179935292855e-05, + "loss": 0.8863, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 180, + "tokens_per_second_per_gpu": 17401.86, + "total_tokens": 4601736 + }, + { + "epoch": 0.014476525633847876, + "grad_norm": 0.2347906082868576, + "learning_rate": 1.961261695938319e-05, + "loss": 0.8696, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 181, + "tokens_per_second_per_gpu": 17379.79, + "total_tokens": 4628714 + }, + { + "epoch": 0.014556506438454771, + "grad_norm": 0.24834582209587097, + "learning_rate": 1.9602936856769432e-05, + "loss": 0.866, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 182, + "tokens_per_second_per_gpu": 17082.86, + "total_tokens": 4654025 + }, + { + "epoch": 0.014636487243061665, + "grad_norm": 0.23946715891361237, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.8721, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 183, + "tokens_per_second_per_gpu": 17164.87, + "total_tokens": 4679462 + }, + { + "epoch": 0.01471646804766856, + "grad_norm": 0.2574214041233063, + "learning_rate": 1.9583225744651334e-05, + "loss": 0.852, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 184, + "tokens_per_second_per_gpu": 16882.9, + "total_tokens": 4705235 + }, + { + "epoch": 0.014796448852275454, + "grad_norm": 0.2426890730857849, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.8865, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 185, + "tokens_per_second_per_gpu": 17466.12, + "total_tokens": 4731568 + }, + { + "epoch": 0.014876429656882349, + "grad_norm": 0.24403586983680725, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.8622, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 186, + "tokens_per_second_per_gpu": 16780.87, + "total_tokens": 4756596 + }, + { + "epoch": 0.014956410461489242, + "grad_norm": 0.24118457734584808, + "learning_rate": 1.9552783621223437e-05, + "loss": 0.8634, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 187, + "tokens_per_second_per_gpu": 16944.71, + "total_tokens": 4781534 + }, + { + "epoch": 0.015036391266096136, + "grad_norm": 0.23634915053844452, + "learning_rate": 1.954240328516277e-05, + "loss": 0.8703, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 188, + "tokens_per_second_per_gpu": 17084.81, + "total_tokens": 4807256 + }, + { + "epoch": 0.015116372070703031, + "grad_norm": 0.2323237955570221, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.8458, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 189, + "tokens_per_second_per_gpu": 16775.18, + "total_tokens": 4832696 + }, + { + "epoch": 0.015196352875309925, + "grad_norm": 0.24547705054283142, + "learning_rate": 1.9521293927421388e-05, + "loss": 0.87, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 190, + "tokens_per_second_per_gpu": 17118.63, + "total_tokens": 4858403 + }, + { + "epoch": 0.01527633367991682, + "grad_norm": 0.252999484539032, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8938, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 191, + "tokens_per_second_per_gpu": 17209.42, + "total_tokens": 4884814 + }, + { + "epoch": 0.015356314484523714, + "grad_norm": 0.26226651668548584, + "learning_rate": 1.9499720515246524e-05, + "loss": 0.9225, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 192, + "tokens_per_second_per_gpu": 17211.08, + "total_tokens": 4910205 + }, + { + "epoch": 0.01543629528913061, + "grad_norm": 0.25136351585388184, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.8292, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 193, + "tokens_per_second_per_gpu": 16452.56, + "total_tokens": 4934218 + }, + { + "epoch": 0.015516276093737503, + "grad_norm": 0.25001877546310425, + "learning_rate": 1.947768410009586e-05, + "loss": 0.8845, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 194, + "tokens_per_second_per_gpu": 17134.82, + "total_tokens": 4960601 + }, + { + "epoch": 0.015596256898344398, + "grad_norm": 0.24449992179870605, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.8761, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 195, + "tokens_per_second_per_gpu": 17246.7, + "total_tokens": 4986559 + }, + { + "epoch": 0.01567623770295129, + "grad_norm": 0.24022875726222992, + "learning_rate": 1.945518575599317e-05, + "loss": 0.8345, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 196, + "tokens_per_second_per_gpu": 16785.14, + "total_tokens": 5011413 + }, + { + "epoch": 0.015756218507558185, + "grad_norm": 0.2391171008348465, + "learning_rate": 1.944376370237481e-05, + "loss": 0.8405, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 197, + "tokens_per_second_per_gpu": 17283.57, + "total_tokens": 5037700 + }, + { + "epoch": 0.01583619931216508, + "grad_norm": 0.2588050961494446, + "learning_rate": 1.943222657947601e-05, + "loss": 0.8114, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 198, + "tokens_per_second_per_gpu": 16778.59, + "total_tokens": 5062904 + }, + { + "epoch": 0.015916180116771976, + "grad_norm": 0.26281964778900146, + "learning_rate": 1.942057452787297e-05, + "loss": 0.8992, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 199, + "tokens_per_second_per_gpu": 16845.87, + "total_tokens": 5087758 + }, + { + "epoch": 0.015996160921378868, + "grad_norm": 0.2588569223880768, + "learning_rate": 1.9408807689542257e-05, + "loss": 0.7411, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 200, + "tokens_per_second_per_gpu": 16066.45, + "total_tokens": 5111019 + }, + { + "epoch": 0.016076141725985763, + "grad_norm": 0.2594797611236572, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.8375, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 201, + "tokens_per_second_per_gpu": 17125.37, + "total_tokens": 5136688 + }, + { + "epoch": 0.01615612253059266, + "grad_norm": 0.2549116015434265, + "learning_rate": 1.938493022759556e-05, + "loss": 0.8919, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 202, + "tokens_per_second_per_gpu": 17181.47, + "total_tokens": 5162141 + }, + { + "epoch": 0.01623610333519955, + "grad_norm": 0.258368581533432, + "learning_rate": 1.937281989491892e-05, + "loss": 0.9158, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 203, + "tokens_per_second_per_gpu": 17312.04, + "total_tokens": 5188261 + }, + { + "epoch": 0.016316084139806446, + "grad_norm": 0.31648266315460205, + "learning_rate": 1.9360595357389735e-05, + "loss": 0.8818, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 204, + "tokens_per_second_per_gpu": 17703.45, + "total_tokens": 5214925 + }, + { + "epoch": 0.01639606494441334, + "grad_norm": 0.2698972523212433, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.943, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 205, + "tokens_per_second_per_gpu": 17321.57, + "total_tokens": 5240212 + }, + { + "epoch": 0.016476045749020236, + "grad_norm": 0.2627377212047577, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.8122, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 206, + "tokens_per_second_per_gpu": 17099.13, + "total_tokens": 5265405 + }, + { + "epoch": 0.016556026553627128, + "grad_norm": 0.2688179016113281, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.8562, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 207, + "tokens_per_second_per_gpu": 16059.88, + "total_tokens": 5289288 + }, + { + "epoch": 0.016636007358234024, + "grad_norm": 0.2609153985977173, + "learning_rate": 1.9310558158625286e-05, + "loss": 0.8241, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 208, + "tokens_per_second_per_gpu": 17391.96, + "total_tokens": 5315382 + }, + { + "epoch": 0.01671598816284092, + "grad_norm": 0.26036036014556885, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.8422, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 209, + "tokens_per_second_per_gpu": 16765.8, + "total_tokens": 5340531 + }, + { + "epoch": 0.016795968967447814, + "grad_norm": 0.25738534331321716, + "learning_rate": 1.9284858268809135e-05, + "loss": 0.8039, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 210, + "tokens_per_second_per_gpu": 16745.69, + "total_tokens": 5365785 + }, + { + "epoch": 0.016875949772054706, + "grad_norm": 0.2648962438106537, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.8752, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 211, + "tokens_per_second_per_gpu": 17666.5, + "total_tokens": 5392095 + }, + { + "epoch": 0.0169559305766616, + "grad_norm": 0.2604057192802429, + "learning_rate": 1.925870584809995e-05, + "loss": 0.8895, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 212, + "tokens_per_second_per_gpu": 17065.99, + "total_tokens": 5417896 + }, + { + "epoch": 0.017035911381268497, + "grad_norm": 0.39727583527565, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.8353, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 213, + "tokens_per_second_per_gpu": 17509.61, + "total_tokens": 5443736 + }, + { + "epoch": 0.01711589218587539, + "grad_norm": 0.27699121832847595, + "learning_rate": 1.923210217112981e-05, + "loss": 0.8254, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 214, + "tokens_per_second_per_gpu": 16573.45, + "total_tokens": 5468669 + }, + { + "epoch": 0.017195872990482284, + "grad_norm": 0.2744996249675751, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.8669, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 215, + "tokens_per_second_per_gpu": 17213.09, + "total_tokens": 5493788 + }, + { + "epoch": 0.01727585379508918, + "grad_norm": 0.27408525347709656, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.8753, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 216, + "tokens_per_second_per_gpu": 17308.66, + "total_tokens": 5520168 + }, + { + "epoch": 0.017355834599696075, + "grad_norm": 0.279653400182724, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.8309, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 217, + "tokens_per_second_per_gpu": 16264.72, + "total_tokens": 5544200 + }, + { + "epoch": 0.017435815404302966, + "grad_norm": 0.2667289078235626, + "learning_rate": 1.9177546256839814e-05, + "loss": 0.8341, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 218, + "tokens_per_second_per_gpu": 17247.1, + "total_tokens": 5570605 + }, + { + "epoch": 0.017515796208909862, + "grad_norm": 0.2734803557395935, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.8676, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 219, + "tokens_per_second_per_gpu": 17464.35, + "total_tokens": 5596671 + }, + { + "epoch": 0.017595777013516757, + "grad_norm": 0.2817804217338562, + "learning_rate": 1.914959667849825e-05, + "loss": 0.7697, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 220, + "tokens_per_second_per_gpu": 16843.97, + "total_tokens": 5621627 + }, + { + "epoch": 0.01767575781812365, + "grad_norm": 0.27030467987060547, + "learning_rate": 1.913545457642601e-05, + "loss": 0.9119, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 221, + "tokens_per_second_per_gpu": 17478.61, + "total_tokens": 5648136 + }, + { + "epoch": 0.017755738622730544, + "grad_norm": 0.27667850255966187, + "learning_rate": 1.9121201161722732e-05, + "loss": 0.879, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 222, + "tokens_per_second_per_gpu": 17271.95, + "total_tokens": 5674496 + }, + { + "epoch": 0.01783571942733744, + "grad_norm": 0.2836981415748596, + "learning_rate": 1.910683660806177e-05, + "loss": 0.8441, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 223, + "tokens_per_second_per_gpu": 17159.55, + "total_tokens": 5700058 + }, + { + "epoch": 0.017915700231944335, + "grad_norm": 0.2744138538837433, + "learning_rate": 1.9092361090470688e-05, + "loss": 0.847, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 224, + "tokens_per_second_per_gpu": 17264.31, + "total_tokens": 5725619 + }, + { + "epoch": 0.017995681036551227, + "grad_norm": 0.26166272163391113, + "learning_rate": 1.907777478532909e-05, + "loss": 0.8067, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 225, + "tokens_per_second_per_gpu": 17051.11, + "total_tokens": 5750839 + }, + { + "epoch": 0.018075661841158122, + "grad_norm": 0.2761372923851013, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.8297, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 226, + "tokens_per_second_per_gpu": 16956.47, + "total_tokens": 5776313 + }, + { + "epoch": 0.018155642645765017, + "grad_norm": 0.27935782074928284, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.8442, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 227, + "tokens_per_second_per_gpu": 16999.99, + "total_tokens": 5802194 + }, + { + "epoch": 0.01823562345037191, + "grad_norm": 0.2994026839733124, + "learning_rate": 1.903335292863301e-05, + "loss": 0.8445, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 228, + "tokens_per_second_per_gpu": 16475.4, + "total_tokens": 5826348 + }, + { + "epoch": 0.018315604254978805, + "grad_norm": 0.2798149883747101, + "learning_rate": 1.901832526405114e-05, + "loss": 0.8692, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 229, + "tokens_per_second_per_gpu": 17101.79, + "total_tokens": 5851978 + }, + { + "epoch": 0.0183955850595857, + "grad_norm": 0.2609909474849701, + "learning_rate": 1.9003187714021936e-05, + "loss": 0.7482, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 230, + "tokens_per_second_per_gpu": 16552.15, + "total_tokens": 5877167 + }, + { + "epoch": 0.018475565864192595, + "grad_norm": 0.29680418968200684, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.861, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 231, + "tokens_per_second_per_gpu": 16968.37, + "total_tokens": 5902589 + }, + { + "epoch": 0.018555546668799487, + "grad_norm": 0.2876088321208954, + "learning_rate": 1.8972583696743284e-05, + "loss": 0.8511, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 232, + "tokens_per_second_per_gpu": 16522.68, + "total_tokens": 5927666 + }, + { + "epoch": 0.018635527473406382, + "grad_norm": 0.2777324318885803, + "learning_rate": 1.895711760239413e-05, + "loss": 0.7771, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 233, + "tokens_per_second_per_gpu": 16851.59, + "total_tokens": 5953221 + }, + { + "epoch": 0.018715508278013278, + "grad_norm": 0.29070353507995605, + "learning_rate": 1.8941542368393683e-05, + "loss": 0.8033, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 234, + "tokens_per_second_per_gpu": 16504.59, + "total_tokens": 5978512 + }, + { + "epoch": 0.01879548908262017, + "grad_norm": 0.29157114028930664, + "learning_rate": 1.892585818452126e-05, + "loss": 0.8529, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 235, + "tokens_per_second_per_gpu": 16711.56, + "total_tokens": 6003185 + }, + { + "epoch": 0.018875469887227065, + "grad_norm": 0.30835041403770447, + "learning_rate": 1.891006524188368e-05, + "loss": 0.8518, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 236, + "tokens_per_second_per_gpu": 16536.32, + "total_tokens": 6027878 + }, + { + "epoch": 0.01895545069183396, + "grad_norm": 0.2955070436000824, + "learning_rate": 1.889416373291298e-05, + "loss": 0.8672, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 237, + "tokens_per_second_per_gpu": 16792.68, + "total_tokens": 6053557 + }, + { + "epoch": 0.019035431496440856, + "grad_norm": 0.2786145806312561, + "learning_rate": 1.8878153851364013e-05, + "loss": 0.8302, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 238, + "tokens_per_second_per_gpu": 17428.54, + "total_tokens": 6080795 + }, + { + "epoch": 0.019115412301047748, + "grad_norm": 0.2858044505119324, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.8413, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 239, + "tokens_per_second_per_gpu": 17091.79, + "total_tokens": 6106593 + }, + { + "epoch": 0.019195393105654643, + "grad_norm": 0.29661673307418823, + "learning_rate": 1.884580975215084e-05, + "loss": 0.8058, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 240, + "tokens_per_second_per_gpu": 16379.78, + "total_tokens": 6130856 + }, + { + "epoch": 0.019275373910261538, + "grad_norm": 0.2872996926307678, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.8136, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 241, + "tokens_per_second_per_gpu": 16644.37, + "total_tokens": 6156050 + }, + { + "epoch": 0.01935535471486843, + "grad_norm": 0.29381078481674194, + "learning_rate": 1.8813034520649923e-05, + "loss": 0.8415, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 242, + "tokens_per_second_per_gpu": 16654.68, + "total_tokens": 6181435 + }, + { + "epoch": 0.019435335519475325, + "grad_norm": 0.28002533316612244, + "learning_rate": 1.879648572866617e-05, + "loss": 0.7861, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 243, + "tokens_per_second_per_gpu": 16498.47, + "total_tokens": 6206084 + }, + { + "epoch": 0.01951531632408222, + "grad_norm": 0.30103883147239685, + "learning_rate": 1.8779829754279806e-05, + "loss": 0.8378, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 244, + "tokens_per_second_per_gpu": 17231.92, + "total_tokens": 6232452 + }, + { + "epoch": 0.019595297128689116, + "grad_norm": 0.3162606358528137, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.8362, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 245, + "tokens_per_second_per_gpu": 17419.8, + "total_tokens": 6258179 + }, + { + "epoch": 0.019675277933296008, + "grad_norm": 0.30273863673210144, + "learning_rate": 1.874619707139396e-05, + "loss": 0.8654, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 246, + "tokens_per_second_per_gpu": 16774.8, + "total_tokens": 6283069 + }, + { + "epoch": 0.019755258737902903, + "grad_norm": 0.2920013666152954, + "learning_rate": 1.8729220772698096e-05, + "loss": 0.799, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 247, + "tokens_per_second_per_gpu": 16609.12, + "total_tokens": 6308439 + }, + { + "epoch": 0.0198352395425098, + "grad_norm": 0.28597742319107056, + "learning_rate": 1.8712138111201898e-05, + "loss": 0.7502, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 248, + "tokens_per_second_per_gpu": 16656.5, + "total_tokens": 6333609 + }, + { + "epoch": 0.01991522034711669, + "grad_norm": 0.3035345673561096, + "learning_rate": 1.869494929505219e-05, + "loss": 0.8001, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 249, + "tokens_per_second_per_gpu": 16881.32, + "total_tokens": 6358608 + }, + { + "epoch": 0.019995201151723586, + "grad_norm": 0.2953839600086212, + "learning_rate": 1.8677654533689287e-05, + "loss": 0.7813, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 250, + "tokens_per_second_per_gpu": 16977.53, + "total_tokens": 6383721 + }, + { + "epoch": 0.02007518195633048, + "grad_norm": 0.3125785291194916, + "learning_rate": 1.866025403784439e-05, + "loss": 0.8195, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 251, + "tokens_per_second_per_gpu": 16863.17, + "total_tokens": 6408806 + }, + { + "epoch": 0.020155162760937376, + "grad_norm": 0.2873575985431671, + "learning_rate": 1.864274801953705e-05, + "loss": 0.8268, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 252, + "tokens_per_second_per_gpu": 17229.79, + "total_tokens": 6434899 + }, + { + "epoch": 0.020235143565544268, + "grad_norm": 0.29636356234550476, + "learning_rate": 1.8625136692072577e-05, + "loss": 0.8041, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 253, + "tokens_per_second_per_gpu": 17474.25, + "total_tokens": 6462145 + }, + { + "epoch": 0.020315124370151164, + "grad_norm": 0.29690074920654297, + "learning_rate": 1.860742027003944e-05, + "loss": 0.8282, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 254, + "tokens_per_second_per_gpu": 17097.68, + "total_tokens": 6488193 + }, + { + "epoch": 0.02039510517475806, + "grad_norm": 0.4087201654911041, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.7644, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 255, + "tokens_per_second_per_gpu": 16154.98, + "total_tokens": 6512510 + }, + { + "epoch": 0.02047508597936495, + "grad_norm": 0.2895331084728241, + "learning_rate": 1.8571673007021124e-05, + "loss": 0.8014, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 256, + "tokens_per_second_per_gpu": 17165.77, + "total_tokens": 6538276 + }, + { + "epoch": 0.020555066783971846, + "grad_norm": 0.3026330769062042, + "learning_rate": 1.855364260160507e-05, + "loss": 0.7987, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 257, + "tokens_per_second_per_gpu": 16744.28, + "total_tokens": 6563490 + }, + { + "epoch": 0.02063504758857874, + "grad_norm": 0.32229679822921753, + "learning_rate": 1.8535507972753275e-05, + "loss": 0.8214, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 258, + "tokens_per_second_per_gpu": 17172.36, + "total_tokens": 6589271 + }, + { + "epoch": 0.020715028393185637, + "grad_norm": 0.3137056231498718, + "learning_rate": 1.851726934143048e-05, + "loss": 0.7672, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 259, + "tokens_per_second_per_gpu": 16452.74, + "total_tokens": 6613284 + }, + { + "epoch": 0.02079500919779253, + "grad_norm": 0.28917086124420166, + "learning_rate": 1.849892692986864e-05, + "loss": 0.7599, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 260, + "tokens_per_second_per_gpu": 16640.08, + "total_tokens": 6637930 + }, + { + "epoch": 0.020874990002399424, + "grad_norm": 0.2955164313316345, + "learning_rate": 1.848048096156426e-05, + "loss": 0.8276, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 261, + "tokens_per_second_per_gpu": 17161.26, + "total_tokens": 6663889 + }, + { + "epoch": 0.02095497080700632, + "grad_norm": 0.3261178731918335, + "learning_rate": 1.8461931661275642e-05, + "loss": 0.8166, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 262, + "tokens_per_second_per_gpu": 17195.28, + "total_tokens": 6689227 + }, + { + "epoch": 0.02103495161161321, + "grad_norm": 0.33998236060142517, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.8438, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 263, + "tokens_per_second_per_gpu": 17061.4, + "total_tokens": 6714449 + }, + { + "epoch": 0.021114932416220106, + "grad_norm": 0.2960314154624939, + "learning_rate": 1.842452397007148e-05, + "loss": 0.7353, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 264, + "tokens_per_second_per_gpu": 16816.51, + "total_tokens": 6739649 + }, + { + "epoch": 0.021194913220827002, + "grad_norm": 0.3163682520389557, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.7903, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 265, + "tokens_per_second_per_gpu": 16833.27, + "total_tokens": 6764236 + }, + { + "epoch": 0.021274894025433897, + "grad_norm": 0.3097144365310669, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.7922, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 266, + "tokens_per_second_per_gpu": 17249.38, + "total_tokens": 6790713 + }, + { + "epoch": 0.02135487483004079, + "grad_norm": 0.30585765838623047, + "learning_rate": 1.836764313458962e-05, + "loss": 0.7718, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 267, + "tokens_per_second_per_gpu": 17056.67, + "total_tokens": 6816552 + }, + { + "epoch": 0.021434855634647684, + "grad_norm": 0.31744128465652466, + "learning_rate": 1.8348478632634067e-05, + "loss": 0.8042, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 268, + "tokens_per_second_per_gpu": 16660.22, + "total_tokens": 6842118 + }, + { + "epoch": 0.02151483643925458, + "grad_norm": 0.32907190918922424, + "learning_rate": 1.8329212407100996e-05, + "loss": 0.7517, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 269, + "tokens_per_second_per_gpu": 16136.63, + "total_tokens": 6865763 + }, + { + "epoch": 0.021594817243861475, + "grad_norm": 0.30771222710609436, + "learning_rate": 1.8309844692743283e-05, + "loss": 0.7819, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 270, + "tokens_per_second_per_gpu": 17209.72, + "total_tokens": 6891634 + }, + { + "epoch": 0.021674798048468367, + "grad_norm": 0.31215161085128784, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.7773, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 271, + "tokens_per_second_per_gpu": 16681.19, + "total_tokens": 6917029 + }, + { + "epoch": 0.021754778853075262, + "grad_norm": 0.31891316175460815, + "learning_rate": 1.827080574274562e-05, + "loss": 0.8129, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 272, + "tokens_per_second_per_gpu": 17075.42, + "total_tokens": 6942625 + }, + { + "epoch": 0.021834759657682157, + "grad_norm": 0.33234041929244995, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.8021, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 273, + "tokens_per_second_per_gpu": 16993.73, + "total_tokens": 6968068 + }, + { + "epoch": 0.02191474046228905, + "grad_norm": 0.3127538859844208, + "learning_rate": 1.8231363685344422e-05, + "loss": 0.8295, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 274, + "tokens_per_second_per_gpu": 16858.57, + "total_tokens": 6993212 + }, + { + "epoch": 0.021994721266895945, + "grad_norm": 0.3256042003631592, + "learning_rate": 1.821149209133704e-05, + "loss": 0.7551, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 275, + "tokens_per_second_per_gpu": 16664.19, + "total_tokens": 7018151 + }, + { + "epoch": 0.02207470207150284, + "grad_norm": 0.33830273151397705, + "learning_rate": 1.819152044288992e-05, + "loss": 0.7927, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 276, + "tokens_per_second_per_gpu": 16526.59, + "total_tokens": 7042282 + }, + { + "epoch": 0.022154682876109735, + "grad_norm": 0.32764095067977905, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.8133, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 277, + "tokens_per_second_per_gpu": 16885.71, + "total_tokens": 7068020 + }, + { + "epoch": 0.022234663680716627, + "grad_norm": 0.3218875527381897, + "learning_rate": 1.815127795728554e-05, + "loss": 0.8018, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 278, + "tokens_per_second_per_gpu": 17189.97, + "total_tokens": 7093302 + }, + { + "epoch": 0.022314644485323522, + "grad_norm": 0.304941326379776, + "learning_rate": 1.8131007610470278e-05, + "loss": 0.7814, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 279, + "tokens_per_second_per_gpu": 17206.92, + "total_tokens": 7119187 + }, + { + "epoch": 0.022394625289930418, + "grad_norm": 0.340358704328537, + "learning_rate": 1.8110638189893267e-05, + "loss": 0.8054, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 280, + "tokens_per_second_per_gpu": 16899.54, + "total_tokens": 7144790 + }, + { + "epoch": 0.02247460609453731, + "grad_norm": 0.3224817216396332, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.808, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 281, + "tokens_per_second_per_gpu": 16848.86, + "total_tokens": 7170086 + }, + { + "epoch": 0.022554586899144205, + "grad_norm": 0.3096613585948944, + "learning_rate": 1.806960312143802e-05, + "loss": 0.7462, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 282, + "tokens_per_second_per_gpu": 17102.35, + "total_tokens": 7196108 + }, + { + "epoch": 0.0226345677037511, + "grad_norm": 0.3089353144168854, + "learning_rate": 1.804893797355914e-05, + "loss": 0.7468, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 283, + "tokens_per_second_per_gpu": 16643.16, + "total_tokens": 7221381 + }, + { + "epoch": 0.022714548508357996, + "grad_norm": 0.3287941813468933, + "learning_rate": 1.8028174751911147e-05, + "loss": 0.7332, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 284, + "tokens_per_second_per_gpu": 16230.46, + "total_tokens": 7245664 + }, + { + "epoch": 0.022794529312964888, + "grad_norm": 0.33320385217666626, + "learning_rate": 1.8007313709487334e-05, + "loss": 0.769, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 285, + "tokens_per_second_per_gpu": 17250.49, + "total_tokens": 7271281 + }, + { + "epoch": 0.022874510117571783, + "grad_norm": 0.33193832635879517, + "learning_rate": 1.798635510047293e-05, + "loss": 0.7806, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 286, + "tokens_per_second_per_gpu": 17234.16, + "total_tokens": 7297650 + }, + { + "epoch": 0.022954490922178678, + "grad_norm": 0.3023802638053894, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.727, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 287, + "tokens_per_second_per_gpu": 16947.96, + "total_tokens": 7323634 + }, + { + "epoch": 0.02303447172678557, + "grad_norm": 0.3405572772026062, + "learning_rate": 1.7944146205354182e-05, + "loss": 0.7677, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 288, + "tokens_per_second_per_gpu": 17075.52, + "total_tokens": 7348943 + }, + { + "epoch": 0.023114452531392465, + "grad_norm": 0.33041706681251526, + "learning_rate": 1.792289643355191e-05, + "loss": 0.771, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 289, + "tokens_per_second_per_gpu": 17190.81, + "total_tokens": 7374829 + }, + { + "epoch": 0.02319443333599936, + "grad_norm": 0.3304063677787781, + "learning_rate": 1.7901550123756906e-05, + "loss": 0.7701, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 290, + "tokens_per_second_per_gpu": 17388.73, + "total_tokens": 7401339 + }, + { + "epoch": 0.023274414140606256, + "grad_norm": 0.3571583032608032, + "learning_rate": 1.788010753606722e-05, + "loss": 0.7701, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 291, + "tokens_per_second_per_gpu": 16986.08, + "total_tokens": 7426830 + }, + { + "epoch": 0.023354394945213148, + "grad_norm": 0.3259941339492798, + "learning_rate": 1.785856893175402e-05, + "loss": 0.7765, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 292, + "tokens_per_second_per_gpu": 17349.81, + "total_tokens": 7452984 + }, + { + "epoch": 0.023434375749820043, + "grad_norm": 0.3239382803440094, + "learning_rate": 1.78369345732584e-05, + "loss": 0.7691, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 293, + "tokens_per_second_per_gpu": 17073.92, + "total_tokens": 7477969 + }, + { + "epoch": 0.02351435655442694, + "grad_norm": 0.3326447010040283, + "learning_rate": 1.781520472418819e-05, + "loss": 0.7332, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 294, + "tokens_per_second_per_gpu": 16657.1, + "total_tokens": 7502565 + }, + { + "epoch": 0.02359433735903383, + "grad_norm": 0.34120991826057434, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.7993, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 295, + "tokens_per_second_per_gpu": 16742.23, + "total_tokens": 7527568 + }, + { + "epoch": 0.023674318163640726, + "grad_norm": 0.33794164657592773, + "learning_rate": 1.777145961456971e-05, + "loss": 0.8072, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 296, + "tokens_per_second_per_gpu": 17036.53, + "total_tokens": 7552596 + }, + { + "epoch": 0.02375429896824762, + "grad_norm": 0.3645365834236145, + "learning_rate": 1.7749444887041797e-05, + "loss": 0.7621, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 297, + "tokens_per_second_per_gpu": 16764.81, + "total_tokens": 7577121 + }, + { + "epoch": 0.023834279772854516, + "grad_norm": 0.35922765731811523, + "learning_rate": 1.7727335734973512e-05, + "loss": 0.7771, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 298, + "tokens_per_second_per_gpu": 17504.58, + "total_tokens": 7602943 + }, + { + "epoch": 0.023914260577461408, + "grad_norm": 0.3424239456653595, + "learning_rate": 1.7705132427757895e-05, + "loss": 0.729, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 299, + "tokens_per_second_per_gpu": 16983.33, + "total_tokens": 7627654 + }, + { + "epoch": 0.023994241382068304, + "grad_norm": 0.34089338779449463, + "learning_rate": 1.7682835235935236e-05, + "loss": 0.7803, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 300, + "tokens_per_second_per_gpu": 16880.41, + "total_tokens": 7653306 + }, + { + "epoch": 0.0240742221866752, + "grad_norm": 0.3372519016265869, + "learning_rate": 1.766044443118978e-05, + "loss": 0.7079, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 301, + "tokens_per_second_per_gpu": 16821.65, + "total_tokens": 7678769 + }, + { + "epoch": 0.02415420299128209, + "grad_norm": 0.33545535802841187, + "learning_rate": 1.7637960286346423e-05, + "loss": 0.7465, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 302, + "tokens_per_second_per_gpu": 16997.07, + "total_tokens": 7704035 + }, + { + "epoch": 0.024234183795888986, + "grad_norm": 0.35364168882369995, + "learning_rate": 1.761538307536737e-05, + "loss": 0.8609, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 303, + "tokens_per_second_per_gpu": 17088.38, + "total_tokens": 7729313 + }, + { + "epoch": 0.02431416460049588, + "grad_norm": 0.3543623089790344, + "learning_rate": 1.759271307334881e-05, + "loss": 0.7496, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 304, + "tokens_per_second_per_gpu": 16831.61, + "total_tokens": 7754311 + }, + { + "epoch": 0.024394145405102777, + "grad_norm": 0.35020682215690613, + "learning_rate": 1.7569950556517566e-05, + "loss": 0.8171, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 305, + "tokens_per_second_per_gpu": 17372.62, + "total_tokens": 7780114 + }, + { + "epoch": 0.02447412620970967, + "grad_norm": 0.3287740647792816, + "learning_rate": 1.7547095802227723e-05, + "loss": 0.756, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 306, + "tokens_per_second_per_gpu": 17061.4, + "total_tokens": 7806162 + }, + { + "epoch": 0.024554107014316564, + "grad_norm": 0.347204327583313, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.7294, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 307, + "tokens_per_second_per_gpu": 16759.71, + "total_tokens": 7831007 + }, + { + "epoch": 0.02463408781892346, + "grad_norm": 0.36060085892677307, + "learning_rate": 1.7501110696304598e-05, + "loss": 0.7522, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 308, + "tokens_per_second_per_gpu": 16761.57, + "total_tokens": 7855632 + }, + { + "epoch": 0.02471406862353035, + "grad_norm": 0.3645978569984436, + "learning_rate": 1.747798090498532e-05, + "loss": 0.7788, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 309, + "tokens_per_second_per_gpu": 17255.8, + "total_tokens": 7881203 + }, + { + "epoch": 0.024794049428137246, + "grad_norm": 0.37542036175727844, + "learning_rate": 1.7454759996828622e-05, + "loss": 0.7185, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 310, + "tokens_per_second_per_gpu": 16261.57, + "total_tokens": 7905495 + }, + { + "epoch": 0.024874030232744142, + "grad_norm": 0.34638047218322754, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.7835, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 311, + "tokens_per_second_per_gpu": 17285.95, + "total_tokens": 7932504 + }, + { + "epoch": 0.024954011037351037, + "grad_norm": 0.342735230922699, + "learning_rate": 1.74080459628675e-05, + "loss": 0.777, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 312, + "tokens_per_second_per_gpu": 17247.27, + "total_tokens": 7958943 + }, + { + "epoch": 0.02503399184195793, + "grad_norm": 0.3410895764827728, + "learning_rate": 1.7384553406258842e-05, + "loss": 0.7335, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 313, + "tokens_per_second_per_gpu": 16698.56, + "total_tokens": 7984134 + }, + { + "epoch": 0.025113972646564824, + "grad_norm": 0.3840852379798889, + "learning_rate": 1.7360970871197347e-05, + "loss": 0.7869, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 314, + "tokens_per_second_per_gpu": 17022.06, + "total_tokens": 8009259 + }, + { + "epoch": 0.02519395345117172, + "grad_norm": 0.36912381649017334, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.7762, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 315, + "tokens_per_second_per_gpu": 17332.18, + "total_tokens": 8035054 + }, + { + "epoch": 0.02527393425577861, + "grad_norm": 0.3521462082862854, + "learning_rate": 1.7313537016191706e-05, + "loss": 0.8045, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 316, + "tokens_per_second_per_gpu": 17373.03, + "total_tokens": 8061906 + }, + { + "epoch": 0.025353915060385507, + "grad_norm": 0.33142420649528503, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.7184, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 317, + "tokens_per_second_per_gpu": 17144.27, + "total_tokens": 8087938 + }, + { + "epoch": 0.025433895864992402, + "grad_norm": 0.3533654808998108, + "learning_rate": 1.7265746709709762e-05, + "loss": 0.7285, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 318, + "tokens_per_second_per_gpu": 16330.33, + "total_tokens": 8112163 + }, + { + "epoch": 0.025513876669599297, + "grad_norm": 0.37131303548812866, + "learning_rate": 1.7241718614374678e-05, + "loss": 0.7473, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 319, + "tokens_per_second_per_gpu": 16981.6, + "total_tokens": 8137330 + }, + { + "epoch": 0.02559385747420619, + "grad_norm": 0.3532845675945282, + "learning_rate": 1.7217602280983622e-05, + "loss": 0.7588, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 320, + "tokens_per_second_per_gpu": 16989.56, + "total_tokens": 8163328 + }, + { + "epoch": 0.025673838278813085, + "grad_norm": 0.3767626881599426, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.7187, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 321, + "tokens_per_second_per_gpu": 16326.84, + "total_tokens": 8187352 + }, + { + "epoch": 0.02575381908341998, + "grad_norm": 0.35990527272224426, + "learning_rate": 1.716910607650483e-05, + "loss": 0.7561, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 322, + "tokens_per_second_per_gpu": 16850.09, + "total_tokens": 8212291 + }, + { + "epoch": 0.025833799888026872, + "grad_norm": 0.3629964590072632, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.7782, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 323, + "tokens_per_second_per_gpu": 17002.36, + "total_tokens": 8237584 + }, + { + "epoch": 0.025913780692633767, + "grad_norm": 0.3404940068721771, + "learning_rate": 1.712026045990997e-05, + "loss": 0.7393, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 324, + "tokens_per_second_per_gpu": 17033.24, + "total_tokens": 8263414 + }, + { + "epoch": 0.025993761497240662, + "grad_norm": 0.3736456334590912, + "learning_rate": 1.709570736536521e-05, + "loss": 0.7916, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 325, + "tokens_per_second_per_gpu": 17195.68, + "total_tokens": 8289808 + }, + { + "epoch": 0.026073742301847558, + "grad_norm": 0.3524475693702698, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.6945, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 326, + "tokens_per_second_per_gpu": 16341.29, + "total_tokens": 8314490 + }, + { + "epoch": 0.02615372310645445, + "grad_norm": 0.3585701286792755, + "learning_rate": 1.7046342099635948e-05, + "loss": 0.7295, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 327, + "tokens_per_second_per_gpu": 16935.65, + "total_tokens": 8339442 + }, + { + "epoch": 0.026233703911061345, + "grad_norm": 0.3640107810497284, + "learning_rate": 1.7021530529951627e-05, + "loss": 0.7135, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 328, + "tokens_per_second_per_gpu": 16629.7, + "total_tokens": 8364542 + }, + { + "epoch": 0.02631368471566824, + "grad_norm": 0.3694165050983429, + "learning_rate": 1.6996633405133656e-05, + "loss": 0.7346, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 329, + "tokens_per_second_per_gpu": 16903.87, + "total_tokens": 8389153 + }, + { + "epoch": 0.026393665520275136, + "grad_norm": 0.42035412788391113, + "learning_rate": 1.697165102854565e-05, + "loss": 0.7501, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 330, + "tokens_per_second_per_gpu": 16961.37, + "total_tokens": 8414362 + }, + { + "epoch": 0.026473646324882028, + "grad_norm": 0.36393973231315613, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.7965, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 331, + "tokens_per_second_per_gpu": 16662.47, + "total_tokens": 8439456 + }, + { + "epoch": 0.026553627129488923, + "grad_norm": 0.36519739031791687, + "learning_rate": 1.692143173870407e-05, + "loss": 0.6917, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 332, + "tokens_per_second_per_gpu": 16982.28, + "total_tokens": 8465047 + }, + { + "epoch": 0.026633607934095818, + "grad_norm": 0.36028608679771423, + "learning_rate": 1.68961954373567e-05, + "loss": 0.7372, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 333, + "tokens_per_second_per_gpu": 17268.51, + "total_tokens": 8491108 + }, + { + "epoch": 0.02671358873870271, + "grad_norm": 0.3669857382774353, + "learning_rate": 1.6870875108044233e-05, + "loss": 0.7399, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 334, + "tokens_per_second_per_gpu": 17302.25, + "total_tokens": 8517165 + }, + { + "epoch": 0.026793569543309605, + "grad_norm": 0.3491288721561432, + "learning_rate": 1.684547105928689e-05, + "loss": 0.7207, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 335, + "tokens_per_second_per_gpu": 16568.06, + "total_tokens": 8541827 + }, + { + "epoch": 0.0268735503479165, + "grad_norm": 0.3872898817062378, + "learning_rate": 1.6819983600624986e-05, + "loss": 0.7689, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 336, + "tokens_per_second_per_gpu": 16928.56, + "total_tokens": 8566999 + }, + { + "epoch": 0.026953531152523396, + "grad_norm": 0.3505984842777252, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.6918, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 337, + "tokens_per_second_per_gpu": 16909.05, + "total_tokens": 8591876 + }, + { + "epoch": 0.027033511957130288, + "grad_norm": 0.37660378217697144, + "learning_rate": 1.6768759696826608e-05, + "loss": 0.7235, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 338, + "tokens_per_second_per_gpu": 17084.11, + "total_tokens": 8617647 + }, + { + "epoch": 0.027113492761737183, + "grad_norm": 0.38223305344581604, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.7838, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 339, + "tokens_per_second_per_gpu": 17355.85, + "total_tokens": 8643982 + }, + { + "epoch": 0.02719347356634408, + "grad_norm": 0.3753760755062103, + "learning_rate": 1.6717205893229904e-05, + "loss": 0.7303, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 340, + "tokens_per_second_per_gpu": 16722.16, + "total_tokens": 8669112 + }, + { + "epoch": 0.02727345437095097, + "grad_norm": 0.3831718862056732, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.7441, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 341, + "tokens_per_second_per_gpu": 16783.28, + "total_tokens": 8694589 + }, + { + "epoch": 0.027353435175557866, + "grad_norm": 0.38198089599609375, + "learning_rate": 1.6665324702494524e-05, + "loss": 0.7216, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 342, + "tokens_per_second_per_gpu": 16233.97, + "total_tokens": 8719038 + }, + { + "epoch": 0.02743341598016476, + "grad_norm": 0.37571123242378235, + "learning_rate": 1.6639262126522417e-05, + "loss": 0.8385, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 343, + "tokens_per_second_per_gpu": 17524.97, + "total_tokens": 8745289 + }, + { + "epoch": 0.027513396784771656, + "grad_norm": 0.3696345388889313, + "learning_rate": 1.661311865323652e-05, + "loss": 0.7894, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 344, + "tokens_per_second_per_gpu": 16521.06, + "total_tokens": 8770700 + }, + { + "epoch": 0.027593377589378548, + "grad_norm": 0.3620677590370178, + "learning_rate": 1.6586894601186804e-05, + "loss": 0.7883, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 345, + "tokens_per_second_per_gpu": 17402.15, + "total_tokens": 8797084 + }, + { + "epoch": 0.027673358393985444, + "grad_norm": 0.372738242149353, + "learning_rate": 1.6560590289905074e-05, + "loss": 0.7291, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 346, + "tokens_per_second_per_gpu": 17215.48, + "total_tokens": 8822494 + }, + { + "epoch": 0.02775333919859234, + "grad_norm": 0.3729492425918579, + "learning_rate": 1.6534206039901057e-05, + "loss": 0.7847, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 347, + "tokens_per_second_per_gpu": 16917.18, + "total_tokens": 8847694 + }, + { + "epoch": 0.02783332000319923, + "grad_norm": 0.3795606791973114, + "learning_rate": 1.650774217265851e-05, + "loss": 0.7178, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 348, + "tokens_per_second_per_gpu": 16262.38, + "total_tokens": 8871607 + }, + { + "epoch": 0.027913300807806126, + "grad_norm": 0.38951990008354187, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.732, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 349, + "tokens_per_second_per_gpu": 16713.27, + "total_tokens": 8896607 + }, + { + "epoch": 0.02799328161241302, + "grad_norm": 0.37609028816223145, + "learning_rate": 1.645457687723951e-05, + "loss": 0.7056, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 350, + "tokens_per_second_per_gpu": 17036.12, + "total_tokens": 8921918 + }, + { + "epoch": 0.028073262417019917, + "grad_norm": 0.354303240776062, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.7704, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 351, + "tokens_per_second_per_gpu": 17543.34, + "total_tokens": 8948802 + }, + { + "epoch": 0.02815324322162681, + "grad_norm": 0.367156445980072, + "learning_rate": 1.6401096994849558e-05, + "loss": 0.7856, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 352, + "tokens_per_second_per_gpu": 17447.98, + "total_tokens": 8975725 + }, + { + "epoch": 0.028233224026233704, + "grad_norm": 0.3801327645778656, + "learning_rate": 1.63742398974869e-05, + "loss": 0.7162, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 353, + "tokens_per_second_per_gpu": 16838.46, + "total_tokens": 9000553 + }, + { + "epoch": 0.0283132048308406, + "grad_norm": 0.3771909773349762, + "learning_rate": 1.6347305132022677e-05, + "loss": 0.7503, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 354, + "tokens_per_second_per_gpu": 17019.93, + "total_tokens": 9026503 + }, + { + "epoch": 0.02839318563544749, + "grad_norm": 0.3548984229564667, + "learning_rate": 1.632029302664851e-05, + "loss": 0.7121, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 355, + "tokens_per_second_per_gpu": 17243.63, + "total_tokens": 9052692 + }, + { + "epoch": 0.028473166440054386, + "grad_norm": 0.38791143894195557, + "learning_rate": 1.6293203910498375e-05, + "loss": 0.7143, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 356, + "tokens_per_second_per_gpu": 16719.42, + "total_tokens": 9077149 + }, + { + "epoch": 0.028553147244661282, + "grad_norm": 0.37814652919769287, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.7185, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 357, + "tokens_per_second_per_gpu": 16720.19, + "total_tokens": 9102344 + }, + { + "epoch": 0.028633128049268177, + "grad_norm": 0.39943739771842957, + "learning_rate": 1.6238795967093865e-05, + "loss": 0.7723, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 358, + "tokens_per_second_per_gpu": 17517.32, + "total_tokens": 9128929 + }, + { + "epoch": 0.02871310885387507, + "grad_norm": 0.3772953748703003, + "learning_rate": 1.6211477802783105e-05, + "loss": 0.728, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 359, + "tokens_per_second_per_gpu": 16422.02, + "total_tokens": 9153580 + }, + { + "epoch": 0.028793089658481964, + "grad_norm": 0.38691309094429016, + "learning_rate": 1.6184083953575543e-05, + "loss": 0.7345, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 360, + "tokens_per_second_per_gpu": 17226.57, + "total_tokens": 9179977 + }, + { + "epoch": 0.02887307046308886, + "grad_norm": 0.38146907091140747, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.7257, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 361, + "tokens_per_second_per_gpu": 16847.61, + "total_tokens": 9205256 + }, + { + "epoch": 0.02895305126769575, + "grad_norm": 0.3648886978626251, + "learning_rate": 1.6129070536529767e-05, + "loss": 0.7108, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 362, + "tokens_per_second_per_gpu": 16919.86, + "total_tokens": 9230795 + }, + { + "epoch": 0.029033032072302647, + "grad_norm": 0.39110928773880005, + "learning_rate": 1.610145163901268e-05, + "loss": 0.7205, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 363, + "tokens_per_second_per_gpu": 16929.94, + "total_tokens": 9256324 + }, + { + "epoch": 0.029113012876909542, + "grad_norm": 0.3927913308143616, + "learning_rate": 1.607375839723287e-05, + "loss": 0.7325, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 364, + "tokens_per_second_per_gpu": 17259.35, + "total_tokens": 9282305 + }, + { + "epoch": 0.029192993681516437, + "grad_norm": 0.4146783947944641, + "learning_rate": 1.6045991148623752e-05, + "loss": 0.7032, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 365, + "tokens_per_second_per_gpu": 17299.87, + "total_tokens": 9307760 + }, + { + "epoch": 0.02927297448612333, + "grad_norm": 0.38273462653160095, + "learning_rate": 1.6018150231520486e-05, + "loss": 0.7482, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 366, + "tokens_per_second_per_gpu": 16898.79, + "total_tokens": 9333318 + }, + { + "epoch": 0.029352955290730225, + "grad_norm": 0.37070807814598083, + "learning_rate": 1.599023598515586e-05, + "loss": 0.7562, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 367, + "tokens_per_second_per_gpu": 17749.01, + "total_tokens": 9360564 + }, + { + "epoch": 0.02943293609533712, + "grad_norm": 0.3885659873485565, + "learning_rate": 1.5962248749656158e-05, + "loss": 0.7191, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 368, + "tokens_per_second_per_gpu": 17134.11, + "total_tokens": 9386204 + }, + { + "epoch": 0.029512916899944012, + "grad_norm": 0.40251046419143677, + "learning_rate": 1.5934188866037017e-05, + "loss": 0.7055, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 369, + "tokens_per_second_per_gpu": 17219.31, + "total_tokens": 9412581 + }, + { + "epoch": 0.029592897704550907, + "grad_norm": 0.40094780921936035, + "learning_rate": 1.5906056676199256e-05, + "loss": 0.6937, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 370, + "tokens_per_second_per_gpu": 16162.93, + "total_tokens": 9437163 + }, + { + "epoch": 0.029672878509157802, + "grad_norm": 0.41726741194725037, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.7203, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 371, + "tokens_per_second_per_gpu": 16507.8, + "total_tokens": 9461430 + }, + { + "epoch": 0.029752859313764698, + "grad_norm": 0.4103233218193054, + "learning_rate": 1.584957674987216e-05, + "loss": 0.6705, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 372, + "tokens_per_second_per_gpu": 16712.62, + "total_tokens": 9485701 + }, + { + "epoch": 0.02983284011837159, + "grad_norm": 0.4164546728134155, + "learning_rate": 1.5821229701572897e-05, + "loss": 0.741, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 373, + "tokens_per_second_per_gpu": 16965.21, + "total_tokens": 9511120 + }, + { + "epoch": 0.029912820922978485, + "grad_norm": 0.3924483358860016, + "learning_rate": 1.5792811723426787e-05, + "loss": 0.7683, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 374, + "tokens_per_second_per_gpu": 17353.54, + "total_tokens": 9537186 + }, + { + "epoch": 0.02999280172758538, + "grad_norm": 0.4054587781429291, + "learning_rate": 1.5764323161697933e-05, + "loss": 0.7073, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 375, + "tokens_per_second_per_gpu": 17264.54, + "total_tokens": 9563703 + }, + { + "epoch": 0.030072782532192272, + "grad_norm": 0.3829587996006012, + "learning_rate": 1.573576436351046e-05, + "loss": 0.73, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 376, + "tokens_per_second_per_gpu": 17249.09, + "total_tokens": 9589768 + }, + { + "epoch": 0.030152763336799168, + "grad_norm": 0.4045129418373108, + "learning_rate": 1.570713567684432e-05, + "loss": 0.6873, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 377, + "tokens_per_second_per_gpu": 16495.57, + "total_tokens": 9614554 + }, + { + "epoch": 0.030232744141406063, + "grad_norm": 0.42311742901802063, + "learning_rate": 1.5678437450531014e-05, + "loss": 0.7036, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 378, + "tokens_per_second_per_gpu": 17035.79, + "total_tokens": 9639664 + }, + { + "epoch": 0.030312724946012958, + "grad_norm": 0.40890172123908997, + "learning_rate": 1.564967003424938e-05, + "loss": 0.7218, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 379, + "tokens_per_second_per_gpu": 16421.78, + "total_tokens": 9663997 + }, + { + "epoch": 0.03039270575061985, + "grad_norm": 0.37312084436416626, + "learning_rate": 1.5620833778521306e-05, + "loss": 0.6829, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 380, + "tokens_per_second_per_gpu": 17273.67, + "total_tokens": 9690145 + }, + { + "epoch": 0.030472686555226745, + "grad_norm": 0.40423229336738586, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.7155, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 381, + "tokens_per_second_per_gpu": 17179.5, + "total_tokens": 9715856 + }, + { + "epoch": 0.03055266735983364, + "grad_norm": 0.3965972363948822, + "learning_rate": 1.556295615500305e-05, + "loss": 0.7335, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 382, + "tokens_per_second_per_gpu": 16914.33, + "total_tokens": 9740705 + }, + { + "epoch": 0.030632648164440533, + "grad_norm": 0.39814358949661255, + "learning_rate": 1.553391549243344e-05, + "loss": 0.6777, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 383, + "tokens_per_second_per_gpu": 16778.95, + "total_tokens": 9765413 + }, + { + "epoch": 0.030712628969047428, + "grad_norm": 0.41755273938179016, + "learning_rate": 1.5504807400849957e-05, + "loss": 0.7882, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 384, + "tokens_per_second_per_gpu": 16864.26, + "total_tokens": 9790820 + }, + { + "epoch": 0.030792609773654323, + "grad_norm": 0.4052574932575226, + "learning_rate": 1.5475632234925505e-05, + "loss": 0.7715, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 385, + "tokens_per_second_per_gpu": 17356.01, + "total_tokens": 9817140 + }, + { + "epoch": 0.03087259057826122, + "grad_norm": 0.3887154757976532, + "learning_rate": 1.5446390350150272e-05, + "loss": 0.6877, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 386, + "tokens_per_second_per_gpu": 16417.7, + "total_tokens": 9841945 + }, + { + "epoch": 0.03095257138286811, + "grad_norm": 0.39953020215034485, + "learning_rate": 1.54170821028274e-05, + "loss": 0.7477, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 387, + "tokens_per_second_per_gpu": 17048.84, + "total_tokens": 9867366 + }, + { + "epoch": 0.031032552187475006, + "grad_norm": 0.3856733441352844, + "learning_rate": 1.5387707850068633e-05, + "loss": 0.654, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 388, + "tokens_per_second_per_gpu": 16843.48, + "total_tokens": 9892604 + }, + { + "epoch": 0.0311125329920819, + "grad_norm": 0.3791309595108032, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.7138, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 389, + "tokens_per_second_per_gpu": 17779.4, + "total_tokens": 9919047 + }, + { + "epoch": 0.031192513796688796, + "grad_norm": 0.4217212498188019, + "learning_rate": 1.53287627607073e-05, + "loss": 0.7381, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 390, + "tokens_per_second_per_gpu": 17139.02, + "total_tokens": 9944164 + }, + { + "epoch": 0.03127249460129569, + "grad_norm": 0.3937268853187561, + "learning_rate": 1.529919264233205e-05, + "loss": 0.6793, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 391, + "tokens_per_second_per_gpu": 16873.44, + "total_tokens": 9969559 + }, + { + "epoch": 0.03135247540590258, + "grad_norm": 0.39358460903167725, + "learning_rate": 1.5269557954966777e-05, + "loss": 0.7156, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 392, + "tokens_per_second_per_gpu": 16668.99, + "total_tokens": 9994082 + }, + { + "epoch": 0.031432456210509475, + "grad_norm": 0.41820088028907776, + "learning_rate": 1.5239859059700794e-05, + "loss": 0.7444, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 393, + "tokens_per_second_per_gpu": 16729.16, + "total_tokens": 10019253 + }, + { + "epoch": 0.03151243701511637, + "grad_norm": 0.40098121762275696, + "learning_rate": 1.5210096318405768e-05, + "loss": 0.7275, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 394, + "tokens_per_second_per_gpu": 17146.85, + "total_tokens": 10044970 + }, + { + "epoch": 0.031592417819723266, + "grad_norm": 0.3832881450653076, + "learning_rate": 1.5180270093731305e-05, + "loss": 0.7174, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 395, + "tokens_per_second_per_gpu": 16825.22, + "total_tokens": 10069975 + }, + { + "epoch": 0.03167239862433016, + "grad_norm": 0.5176158547401428, + "learning_rate": 1.5150380749100545e-05, + "loss": 0.7295, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 396, + "tokens_per_second_per_gpu": 17251.5, + "total_tokens": 10095923 + }, + { + "epoch": 0.03175237942893706, + "grad_norm": 0.3928660452365875, + "learning_rate": 1.5120428648705716e-05, + "loss": 0.6951, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 397, + "tokens_per_second_per_gpu": 17162.01, + "total_tokens": 10122361 + }, + { + "epoch": 0.03183236023354395, + "grad_norm": 0.3940604627132416, + "learning_rate": 1.5090414157503715e-05, + "loss": 0.7341, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 398, + "tokens_per_second_per_gpu": 16976.56, + "total_tokens": 10148210 + }, + { + "epoch": 0.03191234103815084, + "grad_norm": 0.4209328591823578, + "learning_rate": 1.5060337641211637e-05, + "loss": 0.7186, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 399, + "tokens_per_second_per_gpu": 16919.75, + "total_tokens": 10173572 + }, + { + "epoch": 0.031992321842757736, + "grad_norm": 0.40747904777526855, + "learning_rate": 1.5030199466302354e-05, + "loss": 0.7456, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 400, + "tokens_per_second_per_gpu": 17805.14, + "total_tokens": 10200493 + }, + { + "epoch": 0.03207230264736463, + "grad_norm": 0.46691301465034485, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.726, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 401, + "tokens_per_second_per_gpu": 16555.25, + "total_tokens": 10224698 + }, + { + "epoch": 0.032152283451971526, + "grad_norm": 0.3882039487361908, + "learning_rate": 1.4969739610275556e-05, + "loss": 0.6911, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 402, + "tokens_per_second_per_gpu": 17460.29, + "total_tokens": 10250839 + }, + { + "epoch": 0.03223226425657842, + "grad_norm": 0.41841983795166016, + "learning_rate": 1.493941866584231e-05, + "loss": 0.7002, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 403, + "tokens_per_second_per_gpu": 17185.88, + "total_tokens": 10276595 + }, + { + "epoch": 0.03231224506118532, + "grad_norm": 0.4183862805366516, + "learning_rate": 1.490903753615141e-05, + "loss": 0.6808, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 404, + "tokens_per_second_per_gpu": 16490.44, + "total_tokens": 10301334 + }, + { + "epoch": 0.03239222586579221, + "grad_norm": 0.426186740398407, + "learning_rate": 1.4878596591387329e-05, + "loss": 0.7433, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 405, + "tokens_per_second_per_gpu": 17491.41, + "total_tokens": 10326588 + }, + { + "epoch": 0.0324722066703991, + "grad_norm": 0.4127671718597412, + "learning_rate": 1.4848096202463373e-05, + "loss": 0.6778, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 406, + "tokens_per_second_per_gpu": 16889.42, + "total_tokens": 10351330 + }, + { + "epoch": 0.032552187475005996, + "grad_norm": 0.3885892629623413, + "learning_rate": 1.4817536741017153e-05, + "loss": 0.6335, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 407, + "tokens_per_second_per_gpu": 17271.18, + "total_tokens": 10376865 + }, + { + "epoch": 0.03263216827961289, + "grad_norm": 0.4392751157283783, + "learning_rate": 1.478691857940607e-05, + "loss": 0.6889, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 408, + "tokens_per_second_per_gpu": 17086.59, + "total_tokens": 10401715 + }, + { + "epoch": 0.03271214908421979, + "grad_norm": 0.4046195149421692, + "learning_rate": 1.4756242090702756e-05, + "loss": 0.6995, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 409, + "tokens_per_second_per_gpu": 17387.64, + "total_tokens": 10427512 + }, + { + "epoch": 0.03279212988882668, + "grad_norm": 0.42296287417411804, + "learning_rate": 1.4725507648690542e-05, + "loss": 0.6922, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 410, + "tokens_per_second_per_gpu": 16433.12, + "total_tokens": 10452185 + }, + { + "epoch": 0.03287211069343358, + "grad_norm": 0.41615429520606995, + "learning_rate": 1.469471562785891e-05, + "loss": 0.6738, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 411, + "tokens_per_second_per_gpu": 17170.46, + "total_tokens": 10477581 + }, + { + "epoch": 0.03295209149804047, + "grad_norm": 0.4219436049461365, + "learning_rate": 1.4663866403398915e-05, + "loss": 0.6897, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 412, + "tokens_per_second_per_gpu": 16646.71, + "total_tokens": 10502411 + }, + { + "epoch": 0.03303207230264737, + "grad_norm": 0.42644554376602173, + "learning_rate": 1.463296035119862e-05, + "loss": 0.7273, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 413, + "tokens_per_second_per_gpu": 16944.04, + "total_tokens": 10527755 + }, + { + "epoch": 0.033112053107254256, + "grad_norm": 0.39926496148109436, + "learning_rate": 1.4601997847838518e-05, + "loss": 0.7163, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 414, + "tokens_per_second_per_gpu": 17372.82, + "total_tokens": 10554332 + }, + { + "epoch": 0.03319203391186115, + "grad_norm": 0.40787941217422485, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.6688, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 415, + "tokens_per_second_per_gpu": 17193.02, + "total_tokens": 10580110 + }, + { + "epoch": 0.03327201471646805, + "grad_norm": 0.42348116636276245, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.655, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 416, + "tokens_per_second_per_gpu": 17180.72, + "total_tokens": 10605425 + }, + { + "epoch": 0.03335199552107494, + "grad_norm": 0.44330260157585144, + "learning_rate": 1.4508775406894308e-05, + "loss": 0.7509, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 417, + "tokens_per_second_per_gpu": 17173.37, + "total_tokens": 10631736 + }, + { + "epoch": 0.03343197632568184, + "grad_norm": 0.44089949131011963, + "learning_rate": 1.4477590878387697e-05, + "loss": 0.7204, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 418, + "tokens_per_second_per_gpu": 17112.95, + "total_tokens": 10657439 + }, + { + "epoch": 0.03351195713028873, + "grad_norm": 0.45663735270500183, + "learning_rate": 1.4446351791849276e-05, + "loss": 0.7088, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 419, + "tokens_per_second_per_gpu": 16506.33, + "total_tokens": 10681868 + }, + { + "epoch": 0.03359193793489563, + "grad_norm": 0.422953724861145, + "learning_rate": 1.4415058527917454e-05, + "loss": 0.7334, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 420, + "tokens_per_second_per_gpu": 17558.74, + "total_tokens": 10708474 + }, + { + "epoch": 0.03367191873950252, + "grad_norm": 0.4254125654697418, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.6822, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 421, + "tokens_per_second_per_gpu": 17224.81, + "total_tokens": 10733585 + }, + { + "epoch": 0.03375189954410941, + "grad_norm": 0.4303964674472809, + "learning_rate": 1.4352310993723277e-05, + "loss": 0.7347, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 422, + "tokens_per_second_per_gpu": 17144.03, + "total_tokens": 10759349 + }, + { + "epoch": 0.03383188034871631, + "grad_norm": 0.422776997089386, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.7005, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 423, + "tokens_per_second_per_gpu": 16822.49, + "total_tokens": 10785174 + }, + { + "epoch": 0.0339118611533232, + "grad_norm": 0.4445240795612335, + "learning_rate": 1.4289351334031461e-05, + "loss": 0.6952, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 424, + "tokens_per_second_per_gpu": 17287.26, + "total_tokens": 10810894 + }, + { + "epoch": 0.0339918419579301, + "grad_norm": 0.402654767036438, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.7211, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 425, + "tokens_per_second_per_gpu": 14675.92, + "total_tokens": 10836666 + }, + { + "epoch": 0.034071822762536993, + "grad_norm": 0.4416694939136505, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.7003, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 426, + "tokens_per_second_per_gpu": 16632.17, + "total_tokens": 10861229 + }, + { + "epoch": 0.03415180356714389, + "grad_norm": 0.41705960035324097, + "learning_rate": 1.4194520824461773e-05, + "loss": 0.7096, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 427, + "tokens_per_second_per_gpu": 17211.97, + "total_tokens": 10887118 + }, + { + "epoch": 0.03423178437175078, + "grad_norm": 0.4063047170639038, + "learning_rate": 1.4162807922604014e-05, + "loss": 0.6899, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 428, + "tokens_per_second_per_gpu": 16943.74, + "total_tokens": 10912629 + }, + { + "epoch": 0.03431176517635767, + "grad_norm": 0.46809977293014526, + "learning_rate": 1.413104429824542e-05, + "loss": 0.7048, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 429, + "tokens_per_second_per_gpu": 16626.15, + "total_tokens": 10937453 + }, + { + "epoch": 0.03439174598096457, + "grad_norm": 0.4412693977355957, + "learning_rate": 1.4099230338415728e-05, + "loss": 0.6755, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 430, + "tokens_per_second_per_gpu": 16983.01, + "total_tokens": 10962482 + }, + { + "epoch": 0.03447172678557146, + "grad_norm": 0.43178603053092957, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.6916, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 431, + "tokens_per_second_per_gpu": 16939.86, + "total_tokens": 10987805 + }, + { + "epoch": 0.03455170759017836, + "grad_norm": 0.443692147731781, + "learning_rate": 1.4035452963523903e-05, + "loss": 0.7305, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 432, + "tokens_per_second_per_gpu": 16964.69, + "total_tokens": 11013753 + }, + { + "epoch": 0.034631688394785254, + "grad_norm": 0.4076201915740967, + "learning_rate": 1.4003490325568953e-05, + "loss": 0.6864, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 433, + "tokens_per_second_per_gpu": 17153.51, + "total_tokens": 11039930 + }, + { + "epoch": 0.03471166919939215, + "grad_norm": 0.44919684529304504, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.6828, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 434, + "tokens_per_second_per_gpu": 16898.36, + "total_tokens": 11065066 + }, + { + "epoch": 0.03479165000399904, + "grad_norm": 0.4365704655647278, + "learning_rate": 1.3939419095909513e-05, + "loss": 0.7284, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 435, + "tokens_per_second_per_gpu": 16954.82, + "total_tokens": 11090213 + }, + { + "epoch": 0.03487163080860593, + "grad_norm": 0.4258210062980652, + "learning_rate": 1.3907311284892737e-05, + "loss": 0.7079, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 436, + "tokens_per_second_per_gpu": 17515.26, + "total_tokens": 11116316 + }, + { + "epoch": 0.03495161161321283, + "grad_norm": 0.4155106544494629, + "learning_rate": 1.3875155864521031e-05, + "loss": 0.7349, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 437, + "tokens_per_second_per_gpu": 17376.63, + "total_tokens": 11143315 + }, + { + "epoch": 0.035031592417819724, + "grad_norm": 0.45664307475090027, + "learning_rate": 1.3842953226598036e-05, + "loss": 0.6599, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 438, + "tokens_per_second_per_gpu": 16759.03, + "total_tokens": 11167988 + }, + { + "epoch": 0.03511157322242662, + "grad_norm": 0.4296400249004364, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.7074, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 439, + "tokens_per_second_per_gpu": 17066.98, + "total_tokens": 11193249 + }, + { + "epoch": 0.035191554027033514, + "grad_norm": 0.4324433207511902, + "learning_rate": 1.3778407868184674e-05, + "loss": 0.7137, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 440, + "tokens_per_second_per_gpu": 16678.98, + "total_tokens": 11217984 + }, + { + "epoch": 0.03527153483164041, + "grad_norm": 0.4287432134151459, + "learning_rate": 1.3746065934159123e-05, + "loss": 0.6642, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 441, + "tokens_per_second_per_gpu": 16398.96, + "total_tokens": 11242621 + }, + { + "epoch": 0.0353515156362473, + "grad_norm": 0.4307049810886383, + "learning_rate": 1.371367835550235e-05, + "loss": 0.7475, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 442, + "tokens_per_second_per_gpu": 17000.86, + "total_tokens": 11268097 + }, + { + "epoch": 0.03543149644085419, + "grad_norm": 0.42402443289756775, + "learning_rate": 1.3681245526846782e-05, + "loss": 0.6431, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 443, + "tokens_per_second_per_gpu": 17005.71, + "total_tokens": 11293129 + }, + { + "epoch": 0.03551147724546109, + "grad_norm": 0.4233229458332062, + "learning_rate": 1.3648767843376196e-05, + "loss": 0.6949, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 444, + "tokens_per_second_per_gpu": 17142.72, + "total_tokens": 11318904 + }, + { + "epoch": 0.035591458050067984, + "grad_norm": 0.441266268491745, + "learning_rate": 1.3616245700820922e-05, + "loss": 0.7124, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 445, + "tokens_per_second_per_gpu": 16907.05, + "total_tokens": 11344667 + }, + { + "epoch": 0.03567143885467488, + "grad_norm": 0.45229724049568176, + "learning_rate": 1.3583679495453e-05, + "loss": 0.6818, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 446, + "tokens_per_second_per_gpu": 16779.4, + "total_tokens": 11370012 + }, + { + "epoch": 0.035751419659281775, + "grad_norm": 0.4272010326385498, + "learning_rate": 1.3551069624081372e-05, + "loss": 0.6735, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 447, + "tokens_per_second_per_gpu": 16302.12, + "total_tokens": 11394710 + }, + { + "epoch": 0.03583140046388867, + "grad_norm": 0.4327336251735687, + "learning_rate": 1.3518416484047018e-05, + "loss": 0.6747, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 448, + "tokens_per_second_per_gpu": 16728.59, + "total_tokens": 11419699 + }, + { + "epoch": 0.03591138126849556, + "grad_norm": 0.4202955961227417, + "learning_rate": 1.3485720473218153e-05, + "loss": 0.6721, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 449, + "tokens_per_second_per_gpu": 17303.71, + "total_tokens": 11445057 + }, + { + "epoch": 0.035991362073102454, + "grad_norm": 0.4030447006225586, + "learning_rate": 1.3452981989985347e-05, + "loss": 0.6492, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 450, + "tokens_per_second_per_gpu": 17036.83, + "total_tokens": 11470311 + }, + { + "epoch": 0.03607134287770935, + "grad_norm": 0.4464939534664154, + "learning_rate": 1.342020143325669e-05, + "loss": 0.6813, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 451, + "tokens_per_second_per_gpu": 16636.45, + "total_tokens": 11495272 + }, + { + "epoch": 0.036151323682316244, + "grad_norm": 0.41173145174980164, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.6944, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 452, + "tokens_per_second_per_gpu": 17471.58, + "total_tokens": 11521756 + }, + { + "epoch": 0.03623130448692314, + "grad_norm": 0.43435975909233093, + "learning_rate": 1.3354515697502552e-05, + "loss": 0.6224, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 453, + "tokens_per_second_per_gpu": 16937.9, + "total_tokens": 11547286 + }, + { + "epoch": 0.036311285291530035, + "grad_norm": 0.442965030670166, + "learning_rate": 1.3321611318837033e-05, + "loss": 0.6622, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 454, + "tokens_per_second_per_gpu": 16823.21, + "total_tokens": 11572126 + }, + { + "epoch": 0.03639126609613693, + "grad_norm": 0.4620346128940582, + "learning_rate": 1.3288666467385834e-05, + "loss": 0.7231, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 455, + "tokens_per_second_per_gpu": 17179.18, + "total_tokens": 11597500 + }, + { + "epoch": 0.03647124690074382, + "grad_norm": 0.4446198642253876, + "learning_rate": 1.3255681544571568e-05, + "loss": 0.6583, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 456, + "tokens_per_second_per_gpu": 16641.55, + "total_tokens": 11621978 + }, + { + "epoch": 0.036551227705350714, + "grad_norm": 0.44696947932243347, + "learning_rate": 1.3222656952305113e-05, + "loss": 0.6597, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 457, + "tokens_per_second_per_gpu": 17202.52, + "total_tokens": 11647603 + }, + { + "epoch": 0.03663120850995761, + "grad_norm": 0.446732759475708, + "learning_rate": 1.3189593092980701e-05, + "loss": 0.7131, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 458, + "tokens_per_second_per_gpu": 17469.88, + "total_tokens": 11673603 + }, + { + "epoch": 0.036711189314564505, + "grad_norm": 0.44011181592941284, + "learning_rate": 1.3156490369471026e-05, + "loss": 0.6603, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 459, + "tokens_per_second_per_gpu": 16763.75, + "total_tokens": 11698721 + }, + { + "epoch": 0.0367911701191714, + "grad_norm": 0.47020354866981506, + "learning_rate": 1.3123349185122328e-05, + "loss": 0.6767, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 460, + "tokens_per_second_per_gpu": 16818.73, + "total_tokens": 11723824 + }, + { + "epoch": 0.036871150923778295, + "grad_norm": 0.4808385670185089, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.7207, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 461, + "tokens_per_second_per_gpu": 16326.55, + "total_tokens": 11747939 + }, + { + "epoch": 0.03695113172838519, + "grad_norm": 0.4525218904018402, + "learning_rate": 1.3056953049631059e-05, + "loss": 0.6545, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 462, + "tokens_per_second_per_gpu": 17413.36, + "total_tokens": 11774028 + }, + { + "epoch": 0.03703111253299208, + "grad_norm": 0.4264589250087738, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.6375, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 463, + "tokens_per_second_per_gpu": 16640.79, + "total_tokens": 11798824 + }, + { + "epoch": 0.037111093337598974, + "grad_norm": 0.43030428886413574, + "learning_rate": 1.2990407922560869e-05, + "loss": 0.656, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 464, + "tokens_per_second_per_gpu": 16456.8, + "total_tokens": 11822904 + }, + { + "epoch": 0.03719107414220587, + "grad_norm": 0.43640056252479553, + "learning_rate": 1.2957080500440469e-05, + "loss": 0.6872, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 465, + "tokens_per_second_per_gpu": 17006.93, + "total_tokens": 11848488 + }, + { + "epoch": 0.037271054946812765, + "grad_norm": 0.44964516162872314, + "learning_rate": 1.2923717047227368e-05, + "loss": 0.6901, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 466, + "tokens_per_second_per_gpu": 17548.04, + "total_tokens": 11874583 + }, + { + "epoch": 0.03735103575141966, + "grad_norm": 0.4395027160644531, + "learning_rate": 1.2890317969444716e-05, + "loss": 0.6383, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 467, + "tokens_per_second_per_gpu": 16482.1, + "total_tokens": 11899197 + }, + { + "epoch": 0.037431016556026556, + "grad_norm": 0.42954379320144653, + "learning_rate": 1.2856883674049736e-05, + "loss": 0.6412, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 468, + "tokens_per_second_per_gpu": 17318.43, + "total_tokens": 11924491 + }, + { + "epoch": 0.03751099736063345, + "grad_norm": 0.4213207960128784, + "learning_rate": 1.2823414568428767e-05, + "loss": 0.6353, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 469, + "tokens_per_second_per_gpu": 16459.46, + "total_tokens": 11948982 + }, + { + "epoch": 0.03759097816524034, + "grad_norm": 0.43104055523872375, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.7354, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 470, + "tokens_per_second_per_gpu": 17537.58, + "total_tokens": 11976515 + }, + { + "epoch": 0.037670958969847235, + "grad_norm": 0.4502396881580353, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.675, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 471, + "tokens_per_second_per_gpu": 17392.8, + "total_tokens": 12002880 + }, + { + "epoch": 0.03775093977445413, + "grad_norm": 0.45354557037353516, + "learning_rate": 1.2722802470405744e-05, + "loss": 0.6707, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 472, + "tokens_per_second_per_gpu": 16881.98, + "total_tokens": 12028090 + }, + { + "epoch": 0.037830920579061025, + "grad_norm": 0.43540510535240173, + "learning_rate": 1.2689198206152657e-05, + "loss": 0.6882, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 473, + "tokens_per_second_per_gpu": 17161.61, + "total_tokens": 12053924 + }, + { + "epoch": 0.03791090138366792, + "grad_norm": 0.4614422917366028, + "learning_rate": 1.265556117486809e-05, + "loss": 0.7256, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 474, + "tokens_per_second_per_gpu": 17023.83, + "total_tokens": 12079474 + }, + { + "epoch": 0.037990882188274816, + "grad_norm": 0.44551095366477966, + "learning_rate": 1.2621891786408648e-05, + "loss": 0.7414, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 475, + "tokens_per_second_per_gpu": 17029.63, + "total_tokens": 12105739 + }, + { + "epoch": 0.03807086299288171, + "grad_norm": 0.45504751801490784, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.717, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 476, + "tokens_per_second_per_gpu": 17089.58, + "total_tokens": 12131563 + }, + { + "epoch": 0.0381508437974886, + "grad_norm": 0.4884074628353119, + "learning_rate": 1.2554457579357906e-05, + "loss": 0.7327, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 477, + "tokens_per_second_per_gpu": 17239.01, + "total_tokens": 12156874 + }, + { + "epoch": 0.038230824602095495, + "grad_norm": 0.4748455882072449, + "learning_rate": 1.252069358243114e-05, + "loss": 0.6711, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 478, + "tokens_per_second_per_gpu": 17469.29, + "total_tokens": 12183330 + }, + { + "epoch": 0.03831080540670239, + "grad_norm": 0.4526073634624481, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.6717, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 479, + "tokens_per_second_per_gpu": 16705.93, + "total_tokens": 12207907 + }, + { + "epoch": 0.038390786211309286, + "grad_norm": 0.4595562517642975, + "learning_rate": 1.2453073858788027e-05, + "loss": 0.6678, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 480, + "tokens_per_second_per_gpu": 16815.36, + "total_tokens": 12233006 + }, + { + "epoch": 0.03847076701591618, + "grad_norm": 0.4445168673992157, + "learning_rate": 1.2419218955996677e-05, + "loss": 0.6174, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 481, + "tokens_per_second_per_gpu": 16899.74, + "total_tokens": 12258436 + }, + { + "epoch": 0.038550747820523076, + "grad_norm": 0.4498426914215088, + "learning_rate": 1.238533457578581e-05, + "loss": 0.6202, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 482, + "tokens_per_second_per_gpu": 16656.0, + "total_tokens": 12283200 + }, + { + "epoch": 0.03863072862512997, + "grad_norm": 0.48890069127082825, + "learning_rate": 1.23514211310259e-05, + "loss": 0.7179, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 483, + "tokens_per_second_per_gpu": 17130.89, + "total_tokens": 12309222 + }, + { + "epoch": 0.03871070942973686, + "grad_norm": 0.4737612307071686, + "learning_rate": 1.2317479034941572e-05, + "loss": 0.711, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 484, + "tokens_per_second_per_gpu": 16815.6, + "total_tokens": 12334335 + }, + { + "epoch": 0.038790690234343755, + "grad_norm": 0.4556877315044403, + "learning_rate": 1.2283508701106559e-05, + "loss": 0.7006, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 485, + "tokens_per_second_per_gpu": 17323.63, + "total_tokens": 12360225 + }, + { + "epoch": 0.03887067103895065, + "grad_norm": 0.4712156057357788, + "learning_rate": 1.2249510543438652e-05, + "loss": 0.6762, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 486, + "tokens_per_second_per_gpu": 16818.52, + "total_tokens": 12385730 + }, + { + "epoch": 0.038950651843557546, + "grad_norm": 0.45326972007751465, + "learning_rate": 1.2215484976194675e-05, + "loss": 0.6396, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 487, + "tokens_per_second_per_gpu": 16917.67, + "total_tokens": 12410731 + }, + { + "epoch": 0.03903063264816444, + "grad_norm": 0.4285866916179657, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.6759, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 488, + "tokens_per_second_per_gpu": 17554.28, + "total_tokens": 12437737 + }, + { + "epoch": 0.03911061345277134, + "grad_norm": 0.4505816400051117, + "learning_rate": 1.2147353271670634e-05, + "loss": 0.7116, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 489, + "tokens_per_second_per_gpu": 17837.91, + "total_tokens": 12465159 + }, + { + "epoch": 0.03919059425737823, + "grad_norm": 0.4805770814418793, + "learning_rate": 1.211324796455389e-05, + "loss": 0.6968, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 490, + "tokens_per_second_per_gpu": 16656.0, + "total_tokens": 12490064 + }, + { + "epoch": 0.03927057506198512, + "grad_norm": 0.45226889848709106, + "learning_rate": 1.2079116908177592e-05, + "loss": 0.6759, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 491, + "tokens_per_second_per_gpu": 17115.44, + "total_tokens": 12516094 + }, + { + "epoch": 0.039350555866592016, + "grad_norm": 0.4620254635810852, + "learning_rate": 1.2044960518417902e-05, + "loss": 0.7178, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 492, + "tokens_per_second_per_gpu": 16855.59, + "total_tokens": 12541699 + }, + { + "epoch": 0.03943053667119891, + "grad_norm": 0.44682419300079346, + "learning_rate": 1.2010779211459649e-05, + "loss": 0.6887, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 493, + "tokens_per_second_per_gpu": 16927.33, + "total_tokens": 12567064 + }, + { + "epoch": 0.039510517475805806, + "grad_norm": 0.4683786928653717, + "learning_rate": 1.1976573403791263e-05, + "loss": 0.658, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 494, + "tokens_per_second_per_gpu": 16693.06, + "total_tokens": 12591658 + }, + { + "epoch": 0.0395904982804127, + "grad_norm": 0.4709741771221161, + "learning_rate": 1.194234351219972e-05, + "loss": 0.6681, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 495, + "tokens_per_second_per_gpu": 16319.58, + "total_tokens": 12615802 + }, + { + "epoch": 0.0396704790850196, + "grad_norm": 0.7030223608016968, + "learning_rate": 1.190808995376545e-05, + "loss": 0.6886, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 496, + "tokens_per_second_per_gpu": 16896.03, + "total_tokens": 12640099 + }, + { + "epoch": 0.03975045988962649, + "grad_norm": 0.4555974304676056, + "learning_rate": 1.187381314585725e-05, + "loss": 0.6925, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 497, + "tokens_per_second_per_gpu": 17402.31, + "total_tokens": 12666288 + }, + { + "epoch": 0.03983044069423338, + "grad_norm": 0.4940910041332245, + "learning_rate": 1.1839513506127202e-05, + "loss": 0.681, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 498, + "tokens_per_second_per_gpu": 16930.11, + "total_tokens": 12691469 + }, + { + "epoch": 0.039910421498840276, + "grad_norm": 0.4535921812057495, + "learning_rate": 1.1805191452505602e-05, + "loss": 0.6589, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 499, + "tokens_per_second_per_gpu": 16906.81, + "total_tokens": 12716852 + }, + { + "epoch": 0.03999040230344717, + "grad_norm": 0.46495068073272705, + "learning_rate": 1.1770847403195836e-05, + "loss": 0.7064, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 500, + "tokens_per_second_per_gpu": 17272.14, + "total_tokens": 12742985 + }, + { + "epoch": 0.04007038310805407, + "grad_norm": 0.46297863125801086, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.7218, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 501, + "tokens_per_second_per_gpu": 16950.12, + "total_tokens": 12768836 + }, + { + "epoch": 0.04015036391266096, + "grad_norm": 0.4618571698665619, + "learning_rate": 1.1702094991660326e-05, + "loss": 0.674, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 502, + "tokens_per_second_per_gpu": 16882.33, + "total_tokens": 12794066 + }, + { + "epoch": 0.04023034471726786, + "grad_norm": 0.44983258843421936, + "learning_rate": 1.1667687467161025e-05, + "loss": 0.6893, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 503, + "tokens_per_second_per_gpu": 17461.33, + "total_tokens": 12820375 + }, + { + "epoch": 0.04031032552187475, + "grad_norm": 0.46179690957069397, + "learning_rate": 1.1633259622416224e-05, + "loss": 0.6698, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 504, + "tokens_per_second_per_gpu": 16827.05, + "total_tokens": 12845726 + }, + { + "epoch": 0.04039030632648164, + "grad_norm": 0.4472286105155945, + "learning_rate": 1.159881187691835e-05, + "loss": 0.7078, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 505, + "tokens_per_second_per_gpu": 17124.23, + "total_tokens": 12872001 + }, + { + "epoch": 0.040470287131088536, + "grad_norm": 0.4627981185913086, + "learning_rate": 1.156434465040231e-05, + "loss": 0.6686, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 506, + "tokens_per_second_per_gpu": 16904.21, + "total_tokens": 12897321 + }, + { + "epoch": 0.04055026793569543, + "grad_norm": 0.44518762826919556, + "learning_rate": 1.1529858362840383e-05, + "loss": 0.6474, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 507, + "tokens_per_second_per_gpu": 17281.89, + "total_tokens": 12923660 + }, + { + "epoch": 0.04063024874030233, + "grad_norm": 0.4409578740596771, + "learning_rate": 1.1495353434437098e-05, + "loss": 0.6399, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 508, + "tokens_per_second_per_gpu": 17102.59, + "total_tokens": 12949277 + }, + { + "epoch": 0.04071022954490922, + "grad_norm": 0.44398391246795654, + "learning_rate": 1.1460830285624119e-05, + "loss": 0.6753, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 509, + "tokens_per_second_per_gpu": 17247.23, + "total_tokens": 12975352 + }, + { + "epoch": 0.04079021034951612, + "grad_norm": 0.4975646436214447, + "learning_rate": 1.1426289337055119e-05, + "loss": 0.6131, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 510, + "tokens_per_second_per_gpu": 15943.69, + "total_tokens": 12998651 + }, + { + "epoch": 0.04087019115412301, + "grad_norm": 0.48738542199134827, + "learning_rate": 1.1391731009600655e-05, + "loss": 0.6322, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 511, + "tokens_per_second_per_gpu": 16154.38, + "total_tokens": 13022893 + }, + { + "epoch": 0.0409501719587299, + "grad_norm": 0.4914393723011017, + "learning_rate": 1.1357155724343046e-05, + "loss": 0.6633, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 512, + "tokens_per_second_per_gpu": 16945.85, + "total_tokens": 13047605 + }, + { + "epoch": 0.0410301527633368, + "grad_norm": 0.45245736837387085, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.6315, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 513, + "tokens_per_second_per_gpu": 17274.69, + "total_tokens": 13073795 + }, + { + "epoch": 0.04111013356794369, + "grad_norm": 0.4842854142189026, + "learning_rate": 1.128795596577563e-05, + "loss": 0.6499, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 514, + "tokens_per_second_per_gpu": 17081.96, + "total_tokens": 13099940 + }, + { + "epoch": 0.04119011437255059, + "grad_norm": 0.5243505835533142, + "learning_rate": 1.1253332335643043e-05, + "loss": 0.675, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 515, + "tokens_per_second_per_gpu": 16642.09, + "total_tokens": 13124720 + }, + { + "epoch": 0.04127009517715748, + "grad_norm": 0.46914488077163696, + "learning_rate": 1.1218693434051475e-05, + "loss": 0.6719, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 516, + "tokens_per_second_per_gpu": 16966.49, + "total_tokens": 13150010 + }, + { + "epoch": 0.04135007598176438, + "grad_norm": 0.44769319891929626, + "learning_rate": 1.1184039683065014e-05, + "loss": 0.6736, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 517, + "tokens_per_second_per_gpu": 17236.72, + "total_tokens": 13176461 + }, + { + "epoch": 0.041430056786371273, + "grad_norm": 0.4807461202144623, + "learning_rate": 1.1149371504928667e-05, + "loss": 0.7321, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 518, + "tokens_per_second_per_gpu": 16848.78, + "total_tokens": 13202015 + }, + { + "epoch": 0.04151003759097816, + "grad_norm": 0.4664666950702667, + "learning_rate": 1.1114689322063255e-05, + "loss": 0.6415, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 519, + "tokens_per_second_per_gpu": 17132.94, + "total_tokens": 13228010 + }, + { + "epoch": 0.04159001839558506, + "grad_norm": 0.48780035972595215, + "learning_rate": 1.1079993557060228e-05, + "loss": 0.6729, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 520, + "tokens_per_second_per_gpu": 16422.78, + "total_tokens": 13252575 + }, + { + "epoch": 0.04166999920019195, + "grad_norm": 0.4693656861782074, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.6891, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 521, + "tokens_per_second_per_gpu": 17555.32, + "total_tokens": 13278801 + }, + { + "epoch": 0.04174998000479885, + "grad_norm": 0.458926796913147, + "learning_rate": 1.1010562971829464e-05, + "loss": 0.6527, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 522, + "tokens_per_second_per_gpu": 16075.34, + "total_tokens": 13302766 + }, + { + "epoch": 0.04182996080940574, + "grad_norm": 0.462158739566803, + "learning_rate": 1.0975828997591496e-05, + "loss": 0.6799, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 523, + "tokens_per_second_per_gpu": 17398.83, + "total_tokens": 13329089 + }, + { + "epoch": 0.04190994161401264, + "grad_norm": 0.4593111276626587, + "learning_rate": 1.0941083133185146e-05, + "loss": 0.7031, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 524, + "tokens_per_second_per_gpu": 17006.54, + "total_tokens": 13355144 + }, + { + "epoch": 0.041989922418619534, + "grad_norm": 0.46989020705223083, + "learning_rate": 1.0906325801977804e-05, + "loss": 0.7105, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 525, + "tokens_per_second_per_gpu": 17221.48, + "total_tokens": 13381015 + }, + { + "epoch": 0.04206990322322642, + "grad_norm": 0.46403929591178894, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.6809, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 526, + "tokens_per_second_per_gpu": 17428.79, + "total_tokens": 13407634 + }, + { + "epoch": 0.04214988402783332, + "grad_norm": 0.44122979044914246, + "learning_rate": 1.083677843332316e-05, + "loss": 0.661, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 527, + "tokens_per_second_per_gpu": 17050.17, + "total_tokens": 13434044 + }, + { + "epoch": 0.04222986483244021, + "grad_norm": 0.46086767315864563, + "learning_rate": 1.0801989243288588e-05, + "loss": 0.692, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 528, + "tokens_per_second_per_gpu": 17260.71, + "total_tokens": 13460422 + }, + { + "epoch": 0.04230984563704711, + "grad_norm": 0.4788115918636322, + "learning_rate": 1.0767190281268187e-05, + "loss": 0.6774, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 529, + "tokens_per_second_per_gpu": 16919.47, + "total_tokens": 13485959 + }, + { + "epoch": 0.042389826441654004, + "grad_norm": 0.46982550621032715, + "learning_rate": 1.0732381971276318e-05, + "loss": 0.6199, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 530, + "tokens_per_second_per_gpu": 16852.02, + "total_tokens": 13510769 + }, + { + "epoch": 0.0424698072462609, + "grad_norm": 0.4891279339790344, + "learning_rate": 1.0697564737441254e-05, + "loss": 0.6276, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 531, + "tokens_per_second_per_gpu": 16554.57, + "total_tokens": 13535563 + }, + { + "epoch": 0.042549788050867794, + "grad_norm": 0.4954123795032501, + "learning_rate": 1.0662739004000005e-05, + "loss": 0.6915, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 532, + "tokens_per_second_per_gpu": 17455.25, + "total_tokens": 13561469 + }, + { + "epoch": 0.04262976885547469, + "grad_norm": 0.49951866269111633, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.6864, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 533, + "tokens_per_second_per_gpu": 17206.21, + "total_tokens": 13587381 + }, + { + "epoch": 0.04270974966008158, + "grad_norm": 0.4457262456417084, + "learning_rate": 1.0593063735759619e-05, + "loss": 0.6785, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 534, + "tokens_per_second_per_gpu": 17794.38, + "total_tokens": 13614365 + }, + { + "epoch": 0.04278973046468847, + "grad_norm": 0.475887656211853, + "learning_rate": 1.055821504993164e-05, + "loss": 0.634, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 535, + "tokens_per_second_per_gpu": 16492.61, + "total_tokens": 13639374 + }, + { + "epoch": 0.04286971126929537, + "grad_norm": 0.48933205008506775, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.6554, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 536, + "tokens_per_second_per_gpu": 17192.3, + "total_tokens": 13665700 + }, + { + "epoch": 0.042949692073902264, + "grad_norm": 0.5178970098495483, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.7028, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 537, + "tokens_per_second_per_gpu": 16638.15, + "total_tokens": 13690118 + }, + { + "epoch": 0.04302967287850916, + "grad_norm": 0.4723743498325348, + "learning_rate": 1.0453629881292537e-05, + "loss": 0.689, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 538, + "tokens_per_second_per_gpu": 17283.06, + "total_tokens": 13716392 + }, + { + "epoch": 0.043109653683116055, + "grad_norm": 0.5020018219947815, + "learning_rate": 1.0418756537291996e-05, + "loss": 0.6389, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 539, + "tokens_per_second_per_gpu": 17027.19, + "total_tokens": 13741612 + }, + { + "epoch": 0.04318963448772295, + "grad_norm": 0.5196510553359985, + "learning_rate": 1.03838780908752e-05, + "loss": 0.6469, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 540, + "tokens_per_second_per_gpu": 16934.53, + "total_tokens": 13767073 + }, + { + "epoch": 0.04326961529232984, + "grad_norm": 0.4690985083580017, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.6625, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 541, + "tokens_per_second_per_gpu": 16886.26, + "total_tokens": 13792114 + }, + { + "epoch": 0.043349596096936734, + "grad_norm": 0.5237311124801636, + "learning_rate": 1.0314107590781284e-05, + "loss": 0.6957, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 542, + "tokens_per_second_per_gpu": 16878.17, + "total_tokens": 13817155 + }, + { + "epoch": 0.04342957690154363, + "grad_norm": 0.47251269221305847, + "learning_rate": 1.0279216387235691e-05, + "loss": 0.6607, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 543, + "tokens_per_second_per_gpu": 17364.53, + "total_tokens": 13843641 + }, + { + "epoch": 0.043509557706150524, + "grad_norm": 0.49236616492271423, + "learning_rate": 1.0244321781526533e-05, + "loss": 0.6878, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 544, + "tokens_per_second_per_gpu": 17090.74, + "total_tokens": 13869705 + }, + { + "epoch": 0.04358953851075742, + "grad_norm": 0.4944368898868561, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.6828, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 545, + "tokens_per_second_per_gpu": 17123.06, + "total_tokens": 13895270 + }, + { + "epoch": 0.043669519315364315, + "grad_norm": 0.4860251545906067, + "learning_rate": 1.0174524064372837e-05, + "loss": 0.6804, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 546, + "tokens_per_second_per_gpu": 16587.1, + "total_tokens": 13920244 + }, + { + "epoch": 0.04374950011997121, + "grad_norm": 0.48462778329849243, + "learning_rate": 1.0139621803391454e-05, + "loss": 0.6694, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 547, + "tokens_per_second_per_gpu": 16738.82, + "total_tokens": 13945409 + }, + { + "epoch": 0.0438294809245781, + "grad_norm": 0.4959378242492676, + "learning_rate": 1.010471784116246e-05, + "loss": 0.6386, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 548, + "tokens_per_second_per_gpu": 16547.11, + "total_tokens": 13970587 + }, + { + "epoch": 0.043909461729184994, + "grad_norm": 0.4693349301815033, + "learning_rate": 1.0069812602979617e-05, + "loss": 0.6432, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 549, + "tokens_per_second_per_gpu": 17222.81, + "total_tokens": 13996670 + }, + { + "epoch": 0.04398944253379189, + "grad_norm": 0.4579184055328369, + "learning_rate": 1.0034906514152239e-05, + "loss": 0.6737, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 550, + "tokens_per_second_per_gpu": 17389.32, + "total_tokens": 14022626 + }, + { + "epoch": 0.044069423338398785, + "grad_norm": 0.46185624599456787, + "learning_rate": 1e-05, + "loss": 0.7294, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 551, + "tokens_per_second_per_gpu": 17336.46, + "total_tokens": 14049035 + }, + { + "epoch": 0.04414940414300568, + "grad_norm": 0.4870699644088745, + "learning_rate": 9.965093485847766e-06, + "loss": 0.6866, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 552, + "tokens_per_second_per_gpu": 17535.72, + "total_tokens": 14075514 + }, + { + "epoch": 0.044229384947612575, + "grad_norm": 0.4829731285572052, + "learning_rate": 9.930187397020385e-06, + "loss": 0.6143, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 553, + "tokens_per_second_per_gpu": 16359.15, + "total_tokens": 14099952 + }, + { + "epoch": 0.04430936575221947, + "grad_norm": 0.4855392575263977, + "learning_rate": 9.895282158837545e-06, + "loss": 0.6524, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 554, + "tokens_per_second_per_gpu": 16914.54, + "total_tokens": 14125208 + }, + { + "epoch": 0.04438934655682636, + "grad_norm": 0.5001446604728699, + "learning_rate": 9.860378196608549e-06, + "loss": 0.6716, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 555, + "tokens_per_second_per_gpu": 16896.62, + "total_tokens": 14150850 + }, + { + "epoch": 0.044469327361433254, + "grad_norm": 0.45474767684936523, + "learning_rate": 9.825475935627165e-06, + "loss": 0.6442, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 556, + "tokens_per_second_per_gpu": 16980.77, + "total_tokens": 14176498 + }, + { + "epoch": 0.04454930816604015, + "grad_norm": 0.4773014485836029, + "learning_rate": 9.790575801166432e-06, + "loss": 0.6755, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 557, + "tokens_per_second_per_gpu": 16906.61, + "total_tokens": 14202548 + }, + { + "epoch": 0.044629288970647045, + "grad_norm": 0.4736998379230499, + "learning_rate": 9.75567821847347e-06, + "loss": 0.6523, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 558, + "tokens_per_second_per_gpu": 17269.9, + "total_tokens": 14228353 + }, + { + "epoch": 0.04470926977525394, + "grad_norm": 0.47355714440345764, + "learning_rate": 9.720783612764314e-06, + "loss": 0.5922, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 559, + "tokens_per_second_per_gpu": 16300.57, + "total_tokens": 14252682 + }, + { + "epoch": 0.044789250579860836, + "grad_norm": 0.47529494762420654, + "learning_rate": 9.685892409218718e-06, + "loss": 0.6587, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 560, + "tokens_per_second_per_gpu": 16787.12, + "total_tokens": 14278093 + }, + { + "epoch": 0.04486923138446773, + "grad_norm": 0.5045996308326721, + "learning_rate": 9.651005032974994e-06, + "loss": 0.6459, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 561, + "tokens_per_second_per_gpu": 16965.31, + "total_tokens": 14302754 + }, + { + "epoch": 0.04494921218907462, + "grad_norm": 0.47231438755989075, + "learning_rate": 9.616121909124801e-06, + "loss": 0.7112, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 562, + "tokens_per_second_per_gpu": 17578.07, + "total_tokens": 14329323 + }, + { + "epoch": 0.045029192993681515, + "grad_norm": 0.49113765358924866, + "learning_rate": 9.581243462708007e-06, + "loss": 0.6435, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 563, + "tokens_per_second_per_gpu": 16616.31, + "total_tokens": 14353713 + }, + { + "epoch": 0.04510917379828841, + "grad_norm": 0.44610634446144104, + "learning_rate": 9.546370118707463e-06, + "loss": 0.6374, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 564, + "tokens_per_second_per_gpu": 17000.59, + "total_tokens": 14379865 + }, + { + "epoch": 0.045189154602895305, + "grad_norm": 0.4994834065437317, + "learning_rate": 9.511502302043867e-06, + "loss": 0.6428, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 565, + "tokens_per_second_per_gpu": 16745.1, + "total_tokens": 14404914 + }, + { + "epoch": 0.0452691354075022, + "grad_norm": 0.47246044874191284, + "learning_rate": 9.476640437570562e-06, + "loss": 0.665, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 566, + "tokens_per_second_per_gpu": 16875.25, + "total_tokens": 14431100 + }, + { + "epoch": 0.045349116212109096, + "grad_norm": 0.5038020014762878, + "learning_rate": 9.441784950068362e-06, + "loss": 0.6742, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 567, + "tokens_per_second_per_gpu": 16680.55, + "total_tokens": 14456004 + }, + { + "epoch": 0.04542909701671599, + "grad_norm": 0.4679954946041107, + "learning_rate": 9.406936264240386e-06, + "loss": 0.6609, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 568, + "tokens_per_second_per_gpu": 17120.18, + "total_tokens": 14482147 + }, + { + "epoch": 0.04550907782132288, + "grad_norm": 0.47112342715263367, + "learning_rate": 9.372094804706867e-06, + "loss": 0.6283, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 569, + "tokens_per_second_per_gpu": 17221.54, + "total_tokens": 14507518 + }, + { + "epoch": 0.045589058625929775, + "grad_norm": 0.4823978543281555, + "learning_rate": 9.337260996000002e-06, + "loss": 0.6006, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 570, + "tokens_per_second_per_gpu": 16988.76, + "total_tokens": 14532760 + }, + { + "epoch": 0.04566903943053667, + "grad_norm": 0.5082917809486389, + "learning_rate": 9.302435262558748e-06, + "loss": 0.6403, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 571, + "tokens_per_second_per_gpu": 17407.94, + "total_tokens": 14559212 + }, + { + "epoch": 0.045749020235143566, + "grad_norm": 0.5025095343589783, + "learning_rate": 9.267618028723687e-06, + "loss": 0.6438, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 572, + "tokens_per_second_per_gpu": 17173.37, + "total_tokens": 14585315 + }, + { + "epoch": 0.04582900103975046, + "grad_norm": 0.4819313883781433, + "learning_rate": 9.232809718731815e-06, + "loss": 0.6649, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 573, + "tokens_per_second_per_gpu": 17488.01, + "total_tokens": 14611725 + }, + { + "epoch": 0.045908981844357356, + "grad_norm": 0.4713301360607147, + "learning_rate": 9.198010756711413e-06, + "loss": 0.6653, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 574, + "tokens_per_second_per_gpu": 16638.54, + "total_tokens": 14636622 + }, + { + "epoch": 0.04598896264896425, + "grad_norm": 0.4914127588272095, + "learning_rate": 9.163221566676847e-06, + "loss": 0.6229, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 575, + "tokens_per_second_per_gpu": 16381.64, + "total_tokens": 14660439 + }, + { + "epoch": 0.04606894345357114, + "grad_norm": 0.4962431788444519, + "learning_rate": 9.128442572523418e-06, + "loss": 0.6263, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 576, + "tokens_per_second_per_gpu": 16798.67, + "total_tokens": 14685550 + }, + { + "epoch": 0.046148924258178035, + "grad_norm": 0.46047908067703247, + "learning_rate": 9.093674198022201e-06, + "loss": 0.601, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 577, + "tokens_per_second_per_gpu": 16901.2, + "total_tokens": 14710705 + }, + { + "epoch": 0.04622890506278493, + "grad_norm": 0.4952366054058075, + "learning_rate": 9.058916866814857e-06, + "loss": 0.6774, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 578, + "tokens_per_second_per_gpu": 17149.77, + "total_tokens": 14736165 + }, + { + "epoch": 0.046308885867391826, + "grad_norm": 0.5213083624839783, + "learning_rate": 9.024171002408507e-06, + "loss": 0.6587, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 579, + "tokens_per_second_per_gpu": 16984.69, + "total_tokens": 14761504 + }, + { + "epoch": 0.04638886667199872, + "grad_norm": 0.4909270703792572, + "learning_rate": 8.989437028170537e-06, + "loss": 0.656, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 580, + "tokens_per_second_per_gpu": 17165.09, + "total_tokens": 14787035 + }, + { + "epoch": 0.04646884747660562, + "grad_norm": 0.4714226722717285, + "learning_rate": 8.954715367323468e-06, + "loss": 0.646, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 581, + "tokens_per_second_per_gpu": 17370.32, + "total_tokens": 14813116 + }, + { + "epoch": 0.04654882828121251, + "grad_norm": 0.459878534078598, + "learning_rate": 8.920006442939772e-06, + "loss": 0.6484, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 582, + "tokens_per_second_per_gpu": 17366.54, + "total_tokens": 14839467 + }, + { + "epoch": 0.0466288090858194, + "grad_norm": 0.4947279989719391, + "learning_rate": 8.885310677936746e-06, + "loss": 0.656, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 583, + "tokens_per_second_per_gpu": 17204.88, + "total_tokens": 14865234 + }, + { + "epoch": 0.046708789890426296, + "grad_norm": 0.49737077951431274, + "learning_rate": 8.850628495071336e-06, + "loss": 0.6808, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 584, + "tokens_per_second_per_gpu": 17365.74, + "total_tokens": 14891556 + }, + { + "epoch": 0.04678877069503319, + "grad_norm": 0.49995678663253784, + "learning_rate": 8.815960316934991e-06, + "loss": 0.6392, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 585, + "tokens_per_second_per_gpu": 17080.84, + "total_tokens": 14917230 + }, + { + "epoch": 0.046868751499640086, + "grad_norm": 0.5089588165283203, + "learning_rate": 8.781306565948528e-06, + "loss": 0.6864, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 586, + "tokens_per_second_per_gpu": 16838.83, + "total_tokens": 14942605 + }, + { + "epoch": 0.04694873230424698, + "grad_norm": 0.4909396767616272, + "learning_rate": 8.746667664356957e-06, + "loss": 0.6111, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 587, + "tokens_per_second_per_gpu": 17205.48, + "total_tokens": 14968659 + }, + { + "epoch": 0.04702871310885388, + "grad_norm": 0.463184118270874, + "learning_rate": 8.712044034224374e-06, + "loss": 0.595, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 588, + "tokens_per_second_per_gpu": 17060.7, + "total_tokens": 14994885 + }, + { + "epoch": 0.04710869391346077, + "grad_norm": 0.905055820941925, + "learning_rate": 8.677436097428775e-06, + "loss": 0.6458, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 589, + "tokens_per_second_per_gpu": 16621.93, + "total_tokens": 15019934 + }, + { + "epoch": 0.04718867471806766, + "grad_norm": 0.4729231894016266, + "learning_rate": 8.642844275656957e-06, + "loss": 0.6957, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 590, + "tokens_per_second_per_gpu": 17788.01, + "total_tokens": 15046933 + }, + { + "epoch": 0.047268655522674556, + "grad_norm": 0.5098869204521179, + "learning_rate": 8.60826899039935e-06, + "loss": 0.6265, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 591, + "tokens_per_second_per_gpu": 16950.24, + "total_tokens": 15072162 + }, + { + "epoch": 0.04734863632728145, + "grad_norm": 0.49714773893356323, + "learning_rate": 8.573710662944884e-06, + "loss": 0.6777, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 592, + "tokens_per_second_per_gpu": 16566.85, + "total_tokens": 15097098 + }, + { + "epoch": 0.04742861713188835, + "grad_norm": 0.4808761477470398, + "learning_rate": 8.539169714375885e-06, + "loss": 0.6586, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 593, + "tokens_per_second_per_gpu": 17276.4, + "total_tokens": 15123131 + }, + { + "epoch": 0.04750859793649524, + "grad_norm": 0.5037384033203125, + "learning_rate": 8.504646565562907e-06, + "loss": 0.6783, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 594, + "tokens_per_second_per_gpu": 17387.82, + "total_tokens": 15149443 + }, + { + "epoch": 0.04758857874110214, + "grad_norm": 0.49192243814468384, + "learning_rate": 8.47014163715962e-06, + "loss": 0.6142, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 595, + "tokens_per_second_per_gpu": 16517.04, + "total_tokens": 15173258 + }, + { + "epoch": 0.04766855954570903, + "grad_norm": 0.5216419696807861, + "learning_rate": 8.43565534959769e-06, + "loss": 0.6364, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 596, + "tokens_per_second_per_gpu": 16922.88, + "total_tokens": 15198315 + }, + { + "epoch": 0.04774854035031592, + "grad_norm": 0.48781725764274597, + "learning_rate": 8.401188123081653e-06, + "loss": 0.6312, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 597, + "tokens_per_second_per_gpu": 17166.4, + "total_tokens": 15224223 + }, + { + "epoch": 0.047828521154922816, + "grad_norm": 0.48654705286026, + "learning_rate": 8.366740377583781e-06, + "loss": 0.6459, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 598, + "tokens_per_second_per_gpu": 17255.25, + "total_tokens": 15250664 + }, + { + "epoch": 0.04790850195952971, + "grad_norm": 0.48847904801368713, + "learning_rate": 8.332312532838978e-06, + "loss": 0.6484, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 599, + "tokens_per_second_per_gpu": 17191.9, + "total_tokens": 15276565 + }, + { + "epoch": 0.04798848276413661, + "grad_norm": 0.4727404713630676, + "learning_rate": 8.297905008339677e-06, + "loss": 0.6467, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 600, + "tokens_per_second_per_gpu": 17521.18, + "total_tokens": 15302942 + }, + { + "epoch": 0.0480684635687435, + "grad_norm": 0.49052244424819946, + "learning_rate": 8.263518223330698e-06, + "loss": 0.6492, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 601, + "tokens_per_second_per_gpu": 17039.69, + "total_tokens": 15329013 + }, + { + "epoch": 0.0481484443733504, + "grad_norm": 0.4712292551994324, + "learning_rate": 8.22915259680417e-06, + "loss": 0.6232, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 602, + "tokens_per_second_per_gpu": 16841.81, + "total_tokens": 15354575 + }, + { + "epoch": 0.04822842517795729, + "grad_norm": 0.4877064526081085, + "learning_rate": 8.194808547494401e-06, + "loss": 0.6617, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 603, + "tokens_per_second_per_gpu": 17087.94, + "total_tokens": 15379668 + }, + { + "epoch": 0.04830840598256418, + "grad_norm": 0.5102121829986572, + "learning_rate": 8.1604864938728e-06, + "loss": 0.6315, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 604, + "tokens_per_second_per_gpu": 16581.29, + "total_tokens": 15404809 + }, + { + "epoch": 0.04838838678717108, + "grad_norm": 0.4876486361026764, + "learning_rate": 8.126186854142752e-06, + "loss": 0.5826, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 605, + "tokens_per_second_per_gpu": 16902.6, + "total_tokens": 15430201 + }, + { + "epoch": 0.04846836759177797, + "grad_norm": 0.510290265083313, + "learning_rate": 8.091910046234552e-06, + "loss": 0.6742, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 606, + "tokens_per_second_per_gpu": 17105.66, + "total_tokens": 15455793 + }, + { + "epoch": 0.04854834839638487, + "grad_norm": 0.4743480980396271, + "learning_rate": 8.057656487800283e-06, + "loss": 0.6673, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 607, + "tokens_per_second_per_gpu": 17214.78, + "total_tokens": 15482472 + }, + { + "epoch": 0.04862832920099176, + "grad_norm": 0.48495572805404663, + "learning_rate": 8.023426596208739e-06, + "loss": 0.6654, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 608, + "tokens_per_second_per_gpu": 17224.85, + "total_tokens": 15508453 + }, + { + "epoch": 0.04870831000559866, + "grad_norm": 0.48911020159721375, + "learning_rate": 7.989220788540356e-06, + "loss": 0.6215, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 609, + "tokens_per_second_per_gpu": 16851.65, + "total_tokens": 15533469 + }, + { + "epoch": 0.048788290810205553, + "grad_norm": 0.46720772981643677, + "learning_rate": 7.955039481582098e-06, + "loss": 0.6018, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 610, + "tokens_per_second_per_gpu": 16750.14, + "total_tokens": 15558559 + }, + { + "epoch": 0.04886827161481244, + "grad_norm": 0.5051571130752563, + "learning_rate": 7.92088309182241e-06, + "loss": 0.6471, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 611, + "tokens_per_second_per_gpu": 16870.87, + "total_tokens": 15583697 + }, + { + "epoch": 0.04894825241941934, + "grad_norm": 0.49818551540374756, + "learning_rate": 7.886752035446116e-06, + "loss": 0.663, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 612, + "tokens_per_second_per_gpu": 17041.92, + "total_tokens": 15609568 + }, + { + "epoch": 0.04902823322402623, + "grad_norm": 0.47889798879623413, + "learning_rate": 7.852646728329368e-06, + "loss": 0.6533, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 613, + "tokens_per_second_per_gpu": 17201.08, + "total_tokens": 15635573 + }, + { + "epoch": 0.04910821402863313, + "grad_norm": 0.4940686523914337, + "learning_rate": 7.818567586034578e-06, + "loss": 0.6428, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 614, + "tokens_per_second_per_gpu": 16846.51, + "total_tokens": 15660689 + }, + { + "epoch": 0.04918819483324002, + "grad_norm": 0.4960979223251343, + "learning_rate": 7.784515023805328e-06, + "loss": 0.6548, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 615, + "tokens_per_second_per_gpu": 16855.99, + "total_tokens": 15685730 + }, + { + "epoch": 0.04926817563784692, + "grad_norm": 0.5047521591186523, + "learning_rate": 7.750489456561351e-06, + "loss": 0.609, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 616, + "tokens_per_second_per_gpu": 16865.43, + "total_tokens": 15711363 + }, + { + "epoch": 0.049348156442453814, + "grad_norm": 0.538982629776001, + "learning_rate": 7.716491298893443e-06, + "loss": 0.6671, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 617, + "tokens_per_second_per_gpu": 16915.95, + "total_tokens": 15736639 + }, + { + "epoch": 0.0494281372470607, + "grad_norm": 0.5692036151885986, + "learning_rate": 7.68252096505843e-06, + "loss": 0.6733, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 618, + "tokens_per_second_per_gpu": 16922.84, + "total_tokens": 15761816 + }, + { + "epoch": 0.0495081180516676, + "grad_norm": 0.4885812997817993, + "learning_rate": 7.6485788689741e-06, + "loss": 0.6583, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 619, + "tokens_per_second_per_gpu": 17492.93, + "total_tokens": 15787918 + }, + { + "epoch": 0.04958809885627449, + "grad_norm": 0.5453019738197327, + "learning_rate": 7.6146654242141935e-06, + "loss": 0.7266, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 620, + "tokens_per_second_per_gpu": 16796.92, + "total_tokens": 15812967 + }, + { + "epoch": 0.04966807966088139, + "grad_norm": 0.5044118165969849, + "learning_rate": 7.580781044003324e-06, + "loss": 0.6738, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 621, + "tokens_per_second_per_gpu": 16983.61, + "total_tokens": 15838154 + }, + { + "epoch": 0.049748060465488284, + "grad_norm": 0.49475517868995667, + "learning_rate": 7.546926141211975e-06, + "loss": 0.6235, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 622, + "tokens_per_second_per_gpu": 16762.51, + "total_tokens": 15863293 + }, + { + "epoch": 0.04982804127009518, + "grad_norm": 0.5201805830001831, + "learning_rate": 7.513101128351454e-06, + "loss": 0.6033, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 623, + "tokens_per_second_per_gpu": 16768.92, + "total_tokens": 15888215 + }, + { + "epoch": 0.049908022074702074, + "grad_norm": 0.5115200877189636, + "learning_rate": 7.4793064175688635e-06, + "loss": 0.6793, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 624, + "tokens_per_second_per_gpu": 17648.67, + "total_tokens": 15914592 + }, + { + "epoch": 0.04998800287930896, + "grad_norm": 0.5228220820426941, + "learning_rate": 7.445542420642097e-06, + "loss": 0.6296, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 625, + "tokens_per_second_per_gpu": 16790.6, + "total_tokens": 15940125 + }, + { + "epoch": 0.05006798368391586, + "grad_norm": 0.4957731068134308, + "learning_rate": 7.411809548974792e-06, + "loss": 0.6431, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 626, + "tokens_per_second_per_gpu": 17142.31, + "total_tokens": 15965805 + }, + { + "epoch": 0.05014796448852275, + "grad_norm": 0.48672324419021606, + "learning_rate": 7.378108213591355e-06, + "loss": 0.6589, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 627, + "tokens_per_second_per_gpu": 16842.27, + "total_tokens": 15990871 + }, + { + "epoch": 0.05022794529312965, + "grad_norm": 0.5299752950668335, + "learning_rate": 7.344438825131912e-06, + "loss": 0.6362, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 628, + "tokens_per_second_per_gpu": 16874.41, + "total_tokens": 16016112 + }, + { + "epoch": 0.050307926097736544, + "grad_norm": 0.4939616918563843, + "learning_rate": 7.310801793847344e-06, + "loss": 0.6658, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 629, + "tokens_per_second_per_gpu": 17161.05, + "total_tokens": 16041667 + }, + { + "epoch": 0.05038790690234344, + "grad_norm": 0.5360363125801086, + "learning_rate": 7.277197529594257e-06, + "loss": 0.6419, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 630, + "tokens_per_second_per_gpu": 16983.54, + "total_tokens": 16066562 + }, + { + "epoch": 0.050467887706950335, + "grad_norm": 0.4936983287334442, + "learning_rate": 7.243626441830009e-06, + "loss": 0.6341, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 631, + "tokens_per_second_per_gpu": 16644.31, + "total_tokens": 16091903 + }, + { + "epoch": 0.05054786851155722, + "grad_norm": 0.5046349763870239, + "learning_rate": 7.210088939607709e-06, + "loss": 0.7089, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 632, + "tokens_per_second_per_gpu": 17592.12, + "total_tokens": 16118442 + }, + { + "epoch": 0.05062784931616412, + "grad_norm": 0.4913012385368347, + "learning_rate": 7.176585431571235e-06, + "loss": 0.7011, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 633, + "tokens_per_second_per_gpu": 17143.68, + "total_tokens": 16144725 + }, + { + "epoch": 0.050707830120771014, + "grad_norm": 0.5462119579315186, + "learning_rate": 7.143116325950266e-06, + "loss": 0.6766, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 634, + "tokens_per_second_per_gpu": 17247.03, + "total_tokens": 16170369 + }, + { + "epoch": 0.05078781092537791, + "grad_norm": 0.5056242346763611, + "learning_rate": 7.109682030555283e-06, + "loss": 0.6201, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 635, + "tokens_per_second_per_gpu": 16855.99, + "total_tokens": 16195646 + }, + { + "epoch": 0.050867791729984804, + "grad_norm": 0.47949331998825073, + "learning_rate": 7.076282952772634e-06, + "loss": 0.6441, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 636, + "tokens_per_second_per_gpu": 16613.54, + "total_tokens": 16220540 + }, + { + "epoch": 0.0509477725345917, + "grad_norm": 0.48914220929145813, + "learning_rate": 7.042919499559538e-06, + "loss": 0.6101, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 637, + "tokens_per_second_per_gpu": 16800.29, + "total_tokens": 16245408 + }, + { + "epoch": 0.051027753339198595, + "grad_norm": 0.5196214318275452, + "learning_rate": 7.009592077439135e-06, + "loss": 0.6946, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 638, + "tokens_per_second_per_gpu": 17131.28, + "total_tokens": 16271244 + }, + { + "epoch": 0.05110773414380548, + "grad_norm": 0.5333957076072693, + "learning_rate": 6.976301092495556e-06, + "loss": 0.6489, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 639, + "tokens_per_second_per_gpu": 17256.94, + "total_tokens": 16297194 + }, + { + "epoch": 0.05118771494841238, + "grad_norm": 0.4803604781627655, + "learning_rate": 6.943046950368944e-06, + "loss": 0.6063, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 640, + "tokens_per_second_per_gpu": 16761.74, + "total_tokens": 16322047 + }, + { + "epoch": 0.051267695753019274, + "grad_norm": 0.5199413299560547, + "learning_rate": 6.909830056250527e-06, + "loss": 0.6978, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 641, + "tokens_per_second_per_gpu": 16905.31, + "total_tokens": 16347028 + }, + { + "epoch": 0.05134767655762617, + "grad_norm": 0.5130301117897034, + "learning_rate": 6.876650814877675e-06, + "loss": 0.6378, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 642, + "tokens_per_second_per_gpu": 16899.38, + "total_tokens": 16372392 + }, + { + "epoch": 0.051427657362233065, + "grad_norm": 0.5086696743965149, + "learning_rate": 6.843509630528977e-06, + "loss": 0.6444, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 643, + "tokens_per_second_per_gpu": 17210.14, + "total_tokens": 16398587 + }, + { + "epoch": 0.05150763816683996, + "grad_norm": 0.4915199875831604, + "learning_rate": 6.8104069070193e-06, + "loss": 0.6514, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 644, + "tokens_per_second_per_gpu": 16820.48, + "total_tokens": 16423176 + }, + { + "epoch": 0.051587618971446855, + "grad_norm": 0.49876171350479126, + "learning_rate": 6.777343047694891e-06, + "loss": 0.6849, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 645, + "tokens_per_second_per_gpu": 17426.66, + "total_tokens": 16450009 + }, + { + "epoch": 0.051667599776053744, + "grad_norm": 0.5137947201728821, + "learning_rate": 6.744318455428436e-06, + "loss": 0.6763, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 646, + "tokens_per_second_per_gpu": 16806.41, + "total_tokens": 16475334 + }, + { + "epoch": 0.05174758058066064, + "grad_norm": 0.5228657126426697, + "learning_rate": 6.711333532614168e-06, + "loss": 0.6699, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 647, + "tokens_per_second_per_gpu": 17221.27, + "total_tokens": 16501203 + }, + { + "epoch": 0.051827561385267534, + "grad_norm": 0.5308648943901062, + "learning_rate": 6.67838868116297e-06, + "loss": 0.668, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 648, + "tokens_per_second_per_gpu": 17008.86, + "total_tokens": 16526728 + }, + { + "epoch": 0.05190754218987443, + "grad_norm": 0.5293684005737305, + "learning_rate": 6.645484302497452e-06, + "loss": 0.6544, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 649, + "tokens_per_second_per_gpu": 16986.29, + "total_tokens": 16551898 + }, + { + "epoch": 0.051987522994481325, + "grad_norm": 0.5115300416946411, + "learning_rate": 6.612620797547087e-06, + "loss": 0.6249, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 650, + "tokens_per_second_per_gpu": 17339.71, + "total_tokens": 16577920 + }, + { + "epoch": 0.05206750379908822, + "grad_norm": 0.5213042497634888, + "learning_rate": 6.579798566743314e-06, + "loss": 0.6496, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 651, + "tokens_per_second_per_gpu": 17219.75, + "total_tokens": 16604021 + }, + { + "epoch": 0.052147484603695116, + "grad_norm": 0.5389010310173035, + "learning_rate": 6.547018010014654e-06, + "loss": 0.6647, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 652, + "tokens_per_second_per_gpu": 17230.96, + "total_tokens": 16629696 + }, + { + "epoch": 0.05222746540830201, + "grad_norm": 0.5159024000167847, + "learning_rate": 6.5142795267818505e-06, + "loss": 0.6325, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 653, + "tokens_per_second_per_gpu": 17017.35, + "total_tokens": 16655351 + }, + { + "epoch": 0.0523074462129089, + "grad_norm": 0.4998682141304016, + "learning_rate": 6.481583515952983e-06, + "loss": 0.6439, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 654, + "tokens_per_second_per_gpu": 17015.79, + "total_tokens": 16680965 + }, + { + "epoch": 0.052387427017515795, + "grad_norm": 0.5311859250068665, + "learning_rate": 6.448930375918632e-06, + "loss": 0.6561, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 655, + "tokens_per_second_per_gpu": 16236.56, + "total_tokens": 16705336 + }, + { + "epoch": 0.05246740782212269, + "grad_norm": 0.5103529691696167, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.6153, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 656, + "tokens_per_second_per_gpu": 16413.1, + "total_tokens": 16729896 + }, + { + "epoch": 0.052547388626729585, + "grad_norm": 0.5097713470458984, + "learning_rate": 6.383754299179079e-06, + "loss": 0.6412, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 657, + "tokens_per_second_per_gpu": 17055.16, + "total_tokens": 16755420 + }, + { + "epoch": 0.05262736943133648, + "grad_norm": 0.4635200500488281, + "learning_rate": 6.351232156623803e-06, + "loss": 0.5744, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 658, + "tokens_per_second_per_gpu": 17132.79, + "total_tokens": 16781693 + }, + { + "epoch": 0.052707350235943376, + "grad_norm": 0.5747168660163879, + "learning_rate": 6.318754473153221e-06, + "loss": 0.6812, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 659, + "tokens_per_second_per_gpu": 16736.57, + "total_tokens": 16806316 + }, + { + "epoch": 0.05278733104055027, + "grad_norm": 0.4747006595134735, + "learning_rate": 6.286321644497655e-06, + "loss": 0.6251, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 660, + "tokens_per_second_per_gpu": 17304.87, + "total_tokens": 16832772 + }, + { + "epoch": 0.05286731184515716, + "grad_norm": 0.5061115026473999, + "learning_rate": 6.25393406584088e-06, + "loss": 0.6581, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 661, + "tokens_per_second_per_gpu": 17742.51, + "total_tokens": 16859124 + }, + { + "epoch": 0.052947292649764055, + "grad_norm": 0.4995548725128174, + "learning_rate": 6.22159213181533e-06, + "loss": 0.6492, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 662, + "tokens_per_second_per_gpu": 16850.9, + "total_tokens": 16884185 + }, + { + "epoch": 0.05302727345437095, + "grad_norm": 0.5568655729293823, + "learning_rate": 6.18929623649726e-06, + "loss": 0.5819, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 663, + "tokens_per_second_per_gpu": 16745.3, + "total_tokens": 16909485 + }, + { + "epoch": 0.053107254258977846, + "grad_norm": 0.502731204032898, + "learning_rate": 6.157046773401964e-06, + "loss": 0.6288, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 664, + "tokens_per_second_per_gpu": 17083.54, + "total_tokens": 16935162 + }, + { + "epoch": 0.05318723506358474, + "grad_norm": 0.517120361328125, + "learning_rate": 6.124844135478971e-06, + "loss": 0.6518, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 665, + "tokens_per_second_per_gpu": 17008.29, + "total_tokens": 16960109 + }, + { + "epoch": 0.053267215868191636, + "grad_norm": 0.5138611793518066, + "learning_rate": 6.092688715107265e-06, + "loss": 0.643, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 666, + "tokens_per_second_per_gpu": 16915.46, + "total_tokens": 16985762 + }, + { + "epoch": 0.05334719667279853, + "grad_norm": 0.5278694033622742, + "learning_rate": 6.06058090409049e-06, + "loss": 0.6474, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 667, + "tokens_per_second_per_gpu": 17094.87, + "total_tokens": 17011822 + }, + { + "epoch": 0.05342717747740542, + "grad_norm": 0.4872185289859772, + "learning_rate": 6.028521093652195e-06, + "loss": 0.6303, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 668, + "tokens_per_second_per_gpu": 17232.09, + "total_tokens": 17038442 + }, + { + "epoch": 0.053507158282012315, + "grad_norm": 0.5109195113182068, + "learning_rate": 5.996509674431053e-06, + "loss": 0.6477, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 669, + "tokens_per_second_per_gpu": 16715.13, + "total_tokens": 17063529 + }, + { + "epoch": 0.05358713908661921, + "grad_norm": 0.5262460708618164, + "learning_rate": 5.9645470364761e-06, + "loss": 0.6393, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 670, + "tokens_per_second_per_gpu": 16640.95, + "total_tokens": 17088499 + }, + { + "epoch": 0.053667119891226106, + "grad_norm": 0.4987565875053406, + "learning_rate": 5.932633569242e-06, + "loss": 0.6176, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 671, + "tokens_per_second_per_gpu": 16846.36, + "total_tokens": 17113810 + }, + { + "epoch": 0.053747100695833, + "grad_norm": 0.5298067927360535, + "learning_rate": 5.900769661584273e-06, + "loss": 0.7042, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 672, + "tokens_per_second_per_gpu": 17058.35, + "total_tokens": 17139525 + }, + { + "epoch": 0.0538270815004399, + "grad_norm": 0.4801011085510254, + "learning_rate": 5.868955701754584e-06, + "loss": 0.5934, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 673, + "tokens_per_second_per_gpu": 16728.94, + "total_tokens": 17165170 + }, + { + "epoch": 0.05390706230504679, + "grad_norm": 0.5165581107139587, + "learning_rate": 5.83719207739599e-06, + "loss": 0.5992, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 674, + "tokens_per_second_per_gpu": 16374.51, + "total_tokens": 17189646 + }, + { + "epoch": 0.05398704310965368, + "grad_norm": 0.5193620920181274, + "learning_rate": 5.8054791755382286e-06, + "loss": 0.6553, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 675, + "tokens_per_second_per_gpu": 16535.5, + "total_tokens": 17214531 + }, + { + "epoch": 0.054067023914260576, + "grad_norm": 0.48226460814476013, + "learning_rate": 5.773817382593008e-06, + "loss": 0.6172, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 676, + "tokens_per_second_per_gpu": 17723.66, + "total_tokens": 17241548 + }, + { + "epoch": 0.05414700471886747, + "grad_norm": 0.5453917980194092, + "learning_rate": 5.742207084349274e-06, + "loss": 0.6006, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 677, + "tokens_per_second_per_gpu": 16531.5, + "total_tokens": 17265863 + }, + { + "epoch": 0.054226985523474366, + "grad_norm": 0.5053668022155762, + "learning_rate": 5.710648665968543e-06, + "loss": 0.639, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 678, + "tokens_per_second_per_gpu": 16542.28, + "total_tokens": 17290327 + }, + { + "epoch": 0.05430696632808126, + "grad_norm": 0.5109512209892273, + "learning_rate": 5.679142511980176e-06, + "loss": 0.6027, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 679, + "tokens_per_second_per_gpu": 16971.18, + "total_tokens": 17315480 + }, + { + "epoch": 0.05438694713268816, + "grad_norm": 0.5201040506362915, + "learning_rate": 5.647689006276727e-06, + "loss": 0.6206, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 680, + "tokens_per_second_per_gpu": 16478.13, + "total_tokens": 17340212 + }, + { + "epoch": 0.05446692793729505, + "grad_norm": 0.502582848072052, + "learning_rate": 5.616288532109225e-06, + "loss": 0.6805, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 681, + "tokens_per_second_per_gpu": 17185.1, + "total_tokens": 17366028 + }, + { + "epoch": 0.05454690874190194, + "grad_norm": 0.5199177861213684, + "learning_rate": 5.584941472082549e-06, + "loss": 0.6613, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 682, + "tokens_per_second_per_gpu": 16955.39, + "total_tokens": 17390977 + }, + { + "epoch": 0.054626889546508836, + "grad_norm": 0.5209512114524841, + "learning_rate": 5.553648208150728e-06, + "loss": 0.6395, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 683, + "tokens_per_second_per_gpu": 16364.97, + "total_tokens": 17415065 + }, + { + "epoch": 0.05470687035111573, + "grad_norm": 0.5158247947692871, + "learning_rate": 5.522409121612304e-06, + "loss": 0.6239, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 684, + "tokens_per_second_per_gpu": 17041.4, + "total_tokens": 17440462 + }, + { + "epoch": 0.05478685115572263, + "grad_norm": 0.5076451897621155, + "learning_rate": 5.491224593105695e-06, + "loss": 0.6193, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 685, + "tokens_per_second_per_gpu": 16489.47, + "total_tokens": 17465299 + }, + { + "epoch": 0.05486683196032952, + "grad_norm": 0.4743523895740509, + "learning_rate": 5.460095002604533e-06, + "loss": 0.6283, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 686, + "tokens_per_second_per_gpu": 17522.65, + "total_tokens": 17491830 + }, + { + "epoch": 0.05494681276493642, + "grad_norm": 0.5121709108352661, + "learning_rate": 5.429020729413062e-06, + "loss": 0.6348, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 687, + "tokens_per_second_per_gpu": 16463.08, + "total_tokens": 17516529 + }, + { + "epoch": 0.05502679356954331, + "grad_norm": 0.510275661945343, + "learning_rate": 5.398002152161484e-06, + "loss": 0.6229, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 688, + "tokens_per_second_per_gpu": 16654.05, + "total_tokens": 17541653 + }, + { + "epoch": 0.0551067743741502, + "grad_norm": 0.49475717544555664, + "learning_rate": 5.367039648801386e-06, + "loss": 0.6389, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 689, + "tokens_per_second_per_gpu": 16899.12, + "total_tokens": 17567106 + }, + { + "epoch": 0.055186755178757096, + "grad_norm": 0.5166232585906982, + "learning_rate": 5.336133596601089e-06, + "loss": 0.669, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 690, + "tokens_per_second_per_gpu": 17002.49, + "total_tokens": 17592976 + }, + { + "epoch": 0.05526673598336399, + "grad_norm": 0.4955079257488251, + "learning_rate": 5.305284372141095e-06, + "loss": 0.5659, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 691, + "tokens_per_second_per_gpu": 16119.99, + "total_tokens": 17616788 + }, + { + "epoch": 0.05534671678797089, + "grad_norm": 0.49480971693992615, + "learning_rate": 5.274492351309462e-06, + "loss": 0.6458, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 692, + "tokens_per_second_per_gpu": 17306.47, + "total_tokens": 17642818 + }, + { + "epoch": 0.05542669759257778, + "grad_norm": 0.5008161067962646, + "learning_rate": 5.243757909297247e-06, + "loss": 0.6161, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 693, + "tokens_per_second_per_gpu": 17152.18, + "total_tokens": 17668640 + }, + { + "epoch": 0.05550667839718468, + "grad_norm": 0.5221447348594666, + "learning_rate": 5.213081420593933e-06, + "loss": 0.616, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 694, + "tokens_per_second_per_gpu": 16822.73, + "total_tokens": 17693424 + }, + { + "epoch": 0.05558665920179157, + "grad_norm": 0.5296872854232788, + "learning_rate": 5.1824632589828465e-06, + "loss": 0.6246, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 695, + "tokens_per_second_per_gpu": 16770.43, + "total_tokens": 17718572 + }, + { + "epoch": 0.05566664000639846, + "grad_norm": 0.5189606547355652, + "learning_rate": 5.151903797536631e-06, + "loss": 0.6366, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 696, + "tokens_per_second_per_gpu": 16921.39, + "total_tokens": 17743965 + }, + { + "epoch": 0.05574662081100536, + "grad_norm": 0.5203530788421631, + "learning_rate": 5.121403408612672e-06, + "loss": 0.7065, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 697, + "tokens_per_second_per_gpu": 17426.51, + "total_tokens": 17770644 + }, + { + "epoch": 0.05582660161561225, + "grad_norm": 0.51515132188797, + "learning_rate": 5.090962463848592e-06, + "loss": 0.6459, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 698, + "tokens_per_second_per_gpu": 17112.57, + "total_tokens": 17796697 + }, + { + "epoch": 0.05590658242021915, + "grad_norm": 0.5101720094680786, + "learning_rate": 5.060581334157693e-06, + "loss": 0.6448, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 699, + "tokens_per_second_per_gpu": 16878.94, + "total_tokens": 17821904 + }, + { + "epoch": 0.05598656322482604, + "grad_norm": 0.5070253610610962, + "learning_rate": 5.030260389724447e-06, + "loss": 0.6271, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 700, + "tokens_per_second_per_gpu": 17086.03, + "total_tokens": 17847477 + }, + { + "epoch": 0.05606654402943294, + "grad_norm": 0.5088156461715698, + "learning_rate": 5.000000000000003e-06, + "loss": 0.6259, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 701, + "tokens_per_second_per_gpu": 17014.29, + "total_tokens": 17873042 + }, + { + "epoch": 0.05614652483403983, + "grad_norm": 0.4844730496406555, + "learning_rate": 4.96980053369765e-06, + "loss": 0.6019, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 702, + "tokens_per_second_per_gpu": 16950.47, + "total_tokens": 17898577 + }, + { + "epoch": 0.05622650563864672, + "grad_norm": 0.5203348994255066, + "learning_rate": 4.939662358788364e-06, + "loss": 0.6317, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 703, + "tokens_per_second_per_gpu": 16871.59, + "total_tokens": 17923826 + }, + { + "epoch": 0.05630648644325362, + "grad_norm": 0.5411732196807861, + "learning_rate": 4.909585842496287e-06, + "loss": 0.6407, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 704, + "tokens_per_second_per_gpu": 17433.86, + "total_tokens": 17950148 + }, + { + "epoch": 0.05638646724786051, + "grad_norm": 0.5115430951118469, + "learning_rate": 4.879571351294287e-06, + "loss": 0.6517, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 705, + "tokens_per_second_per_gpu": 17553.94, + "total_tokens": 17976281 + }, + { + "epoch": 0.05646644805246741, + "grad_norm": 0.5059305429458618, + "learning_rate": 4.849619250899458e-06, + "loss": 0.6271, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 706, + "tokens_per_second_per_gpu": 16888.3, + "total_tokens": 18001831 + }, + { + "epoch": 0.0565464288570743, + "grad_norm": 0.4909086525440216, + "learning_rate": 4.8197299062687e-06, + "loss": 0.656, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 707, + "tokens_per_second_per_gpu": 17176.8, + "total_tokens": 18028323 + }, + { + "epoch": 0.0566264096616812, + "grad_norm": 0.5115363597869873, + "learning_rate": 4.78990368159424e-06, + "loss": 0.6764, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 708, + "tokens_per_second_per_gpu": 17153.77, + "total_tokens": 18054022 + }, + { + "epoch": 0.056706390466288094, + "grad_norm": 0.5104652643203735, + "learning_rate": 4.76014094029921e-06, + "loss": 0.648, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 709, + "tokens_per_second_per_gpu": 17076.04, + "total_tokens": 18080235 + }, + { + "epoch": 0.05678637127089498, + "grad_norm": 0.5099148154258728, + "learning_rate": 4.7304420450332244e-06, + "loss": 0.6074, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 710, + "tokens_per_second_per_gpu": 16836.58, + "total_tokens": 18105272 + }, + { + "epoch": 0.05686635207550188, + "grad_norm": 0.5084642171859741, + "learning_rate": 4.700807357667953e-06, + "loss": 0.6519, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 711, + "tokens_per_second_per_gpu": 17614.67, + "total_tokens": 18131849 + }, + { + "epoch": 0.05694633288010877, + "grad_norm": 0.5015023946762085, + "learning_rate": 4.671237239292699e-06, + "loss": 0.5743, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 712, + "tokens_per_second_per_gpu": 16473.36, + "total_tokens": 18156262 + }, + { + "epoch": 0.05702631368471567, + "grad_norm": 0.5393797159194946, + "learning_rate": 4.641732050210032e-06, + "loss": 0.667, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 713, + "tokens_per_second_per_gpu": 17115.49, + "total_tokens": 18181782 + }, + { + "epoch": 0.057106294489322564, + "grad_norm": 0.5561414361000061, + "learning_rate": 4.612292149931369e-06, + "loss": 0.6896, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 714, + "tokens_per_second_per_gpu": 17365.17, + "total_tokens": 18208558 + }, + { + "epoch": 0.05718627529392946, + "grad_norm": 0.5471202731132507, + "learning_rate": 4.582917897172603e-06, + "loss": 0.6506, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 715, + "tokens_per_second_per_gpu": 17052.18, + "total_tokens": 18234567 + }, + { + "epoch": 0.057266256098536354, + "grad_norm": 0.4913035035133362, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.6357, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 716, + "tokens_per_second_per_gpu": 17184.7, + "total_tokens": 18261123 + }, + { + "epoch": 0.05734623690314324, + "grad_norm": 0.49759647250175476, + "learning_rate": 4.524367765074499e-06, + "loss": 0.6172, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 717, + "tokens_per_second_per_gpu": 17501.25, + "total_tokens": 18287293 + }, + { + "epoch": 0.05742621770775014, + "grad_norm": 0.5413016080856323, + "learning_rate": 4.495192599150045e-06, + "loss": 0.6359, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 718, + "tokens_per_second_per_gpu": 16512.11, + "total_tokens": 18312199 + }, + { + "epoch": 0.05750619851235703, + "grad_norm": 0.5255224108695984, + "learning_rate": 4.46608450756656e-06, + "loss": 0.638, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 719, + "tokens_per_second_per_gpu": 17083.09, + "total_tokens": 18337670 + }, + { + "epoch": 0.05758617931696393, + "grad_norm": 0.5278708338737488, + "learning_rate": 4.437043844996952e-06, + "loss": 0.6669, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 720, + "tokens_per_second_per_gpu": 17489.34, + "total_tokens": 18364339 + }, + { + "epoch": 0.057666160121570824, + "grad_norm": 0.5288352370262146, + "learning_rate": 4.408070965292534e-06, + "loss": 0.6484, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 721, + "tokens_per_second_per_gpu": 17045.79, + "total_tokens": 18389586 + }, + { + "epoch": 0.05774614092617772, + "grad_norm": 0.4860366880893707, + "learning_rate": 4.379166221478697e-06, + "loss": 0.6261, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 722, + "tokens_per_second_per_gpu": 17383.61, + "total_tokens": 18416280 + }, + { + "epoch": 0.057826121730784615, + "grad_norm": 0.5295699834823608, + "learning_rate": 4.350329965750622e-06, + "loss": 0.6549, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 723, + "tokens_per_second_per_gpu": 16710.45, + "total_tokens": 18441370 + }, + { + "epoch": 0.0579061025353915, + "grad_norm": 0.4987591505050659, + "learning_rate": 4.321562549468991e-06, + "loss": 0.6431, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 724, + "tokens_per_second_per_gpu": 17326.31, + "total_tokens": 18468281 + }, + { + "epoch": 0.0579860833399984, + "grad_norm": 0.5106927752494812, + "learning_rate": 4.292864323155684e-06, + "loss": 0.6309, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 725, + "tokens_per_second_per_gpu": 17342.4, + "total_tokens": 18494509 + }, + { + "epoch": 0.058066064144605294, + "grad_norm": 0.4820137023925781, + "learning_rate": 4.264235636489542e-06, + "loss": 0.6057, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 726, + "tokens_per_second_per_gpu": 16684.63, + "total_tokens": 18519945 + }, + { + "epoch": 0.05814604494921219, + "grad_norm": 0.5269767642021179, + "learning_rate": 4.235676838302069e-06, + "loss": 0.6297, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 727, + "tokens_per_second_per_gpu": 17668.63, + "total_tokens": 18546042 + }, + { + "epoch": 0.058226025753819084, + "grad_norm": 0.46701568365097046, + "learning_rate": 4.207188276573214e-06, + "loss": 0.6421, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 728, + "tokens_per_second_per_gpu": 17517.42, + "total_tokens": 18572973 + }, + { + "epoch": 0.05830600655842598, + "grad_norm": 0.5261129140853882, + "learning_rate": 4.178770298427107e-06, + "loss": 0.659, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 729, + "tokens_per_second_per_gpu": 16871.14, + "total_tokens": 18598790 + }, + { + "epoch": 0.058385987363032875, + "grad_norm": 0.5487871170043945, + "learning_rate": 4.150423250127846e-06, + "loss": 0.6549, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 730, + "tokens_per_second_per_gpu": 16979.14, + "total_tokens": 18624176 + }, + { + "epoch": 0.05846596816763976, + "grad_norm": 0.4980189800262451, + "learning_rate": 4.12214747707527e-06, + "loss": 0.6359, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 731, + "tokens_per_second_per_gpu": 17217.12, + "total_tokens": 18649963 + }, + { + "epoch": 0.05854594897224666, + "grad_norm": 0.588450014591217, + "learning_rate": 4.093943323800746e-06, + "loss": 0.6685, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 732, + "tokens_per_second_per_gpu": 17620.46, + "total_tokens": 18676309 + }, + { + "epoch": 0.058625929776853554, + "grad_norm": 0.5422666668891907, + "learning_rate": 4.065811133962987e-06, + "loss": 0.6858, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 733, + "tokens_per_second_per_gpu": 17164.56, + "total_tokens": 18701937 + }, + { + "epoch": 0.05870591058146045, + "grad_norm": 0.49481356143951416, + "learning_rate": 4.037751250343841e-06, + "loss": 0.6455, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 734, + "tokens_per_second_per_gpu": 17268.18, + "total_tokens": 18728402 + }, + { + "epoch": 0.058785891386067345, + "grad_norm": 0.4996780753135681, + "learning_rate": 4.009764014844143e-06, + "loss": 0.6418, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 735, + "tokens_per_second_per_gpu": 16964.57, + "total_tokens": 18754341 + }, + { + "epoch": 0.05886587219067424, + "grad_norm": 0.5555558204650879, + "learning_rate": 3.981849768479516e-06, + "loss": 0.6603, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 736, + "tokens_per_second_per_gpu": 16663.17, + "total_tokens": 18778777 + }, + { + "epoch": 0.058945852995281135, + "grad_norm": 0.5153935551643372, + "learning_rate": 3.954008851376252e-06, + "loss": 0.6305, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 737, + "tokens_per_second_per_gpu": 16853.63, + "total_tokens": 18804000 + }, + { + "epoch": 0.059025833799888024, + "grad_norm": 0.5119479298591614, + "learning_rate": 3.9262416027671354e-06, + "loss": 0.622, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 738, + "tokens_per_second_per_gpu": 16758.19, + "total_tokens": 18829052 + }, + { + "epoch": 0.05910581460449492, + "grad_norm": 0.5353497862815857, + "learning_rate": 3.898548360987325e-06, + "loss": 0.6104, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 739, + "tokens_per_second_per_gpu": 16663.69, + "total_tokens": 18854098 + }, + { + "epoch": 0.059185795409101814, + "grad_norm": 0.5033715963363647, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.6282, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 740, + "tokens_per_second_per_gpu": 16812.07, + "total_tokens": 18879460 + }, + { + "epoch": 0.05926577621370871, + "grad_norm": 0.5525617599487305, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.6933, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 741, + "tokens_per_second_per_gpu": 16728.85, + "total_tokens": 18905035 + }, + { + "epoch": 0.059345757018315605, + "grad_norm": 0.5698568820953369, + "learning_rate": 3.81591604642446e-06, + "loss": 0.6629, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 742, + "tokens_per_second_per_gpu": 17133.82, + "total_tokens": 18930728 + }, + { + "epoch": 0.0594257378229225, + "grad_norm": 0.5329509973526001, + "learning_rate": 3.7885221972168974e-06, + "loss": 0.6, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 743, + "tokens_per_second_per_gpu": 16556.46, + "total_tokens": 18955130 + }, + { + "epoch": 0.059505718627529396, + "grad_norm": 0.5058096647262573, + "learning_rate": 3.7612040329061405e-06, + "loss": 0.6008, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 744, + "tokens_per_second_per_gpu": 17059.72, + "total_tokens": 18980442 + }, + { + "epoch": 0.059585699432136284, + "grad_norm": 0.5127116441726685, + "learning_rate": 3.7339618863553983e-06, + "loss": 0.5898, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 745, + "tokens_per_second_per_gpu": 16644.72, + "total_tokens": 19005603 + }, + { + "epoch": 0.05966568023674318, + "grad_norm": 0.5248084664344788, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.6807, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 746, + "tokens_per_second_per_gpu": 17100.45, + "total_tokens": 19031447 + }, + { + "epoch": 0.059745661041350075, + "grad_norm": 0.5084623694419861, + "learning_rate": 3.679706973351491e-06, + "loss": 0.6247, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 747, + "tokens_per_second_per_gpu": 16938.92, + "total_tokens": 19057263 + }, + { + "epoch": 0.05982564184595697, + "grad_norm": 0.5016053915023804, + "learning_rate": 3.6526948679773256e-06, + "loss": 0.6035, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 748, + "tokens_per_second_per_gpu": 17456.58, + "total_tokens": 19082873 + }, + { + "epoch": 0.059905622650563865, + "grad_norm": 0.5239890813827515, + "learning_rate": 3.625760102513103e-06, + "loss": 0.6351, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 749, + "tokens_per_second_per_gpu": 16711.43, + "total_tokens": 19108031 + }, + { + "epoch": 0.05998560345517076, + "grad_norm": 0.5268155336380005, + "learning_rate": 3.598903005150444e-06, + "loss": 0.6499, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 750, + "tokens_per_second_per_gpu": 17401.95, + "total_tokens": 19133855 + }, + { + "epoch": 0.060065584259777656, + "grad_norm": 0.5115627646446228, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.5869, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 751, + "tokens_per_second_per_gpu": 17175.88, + "total_tokens": 19159287 + }, + { + "epoch": 0.060145565064384544, + "grad_norm": 0.5156000852584839, + "learning_rate": 3.545423122760493e-06, + "loss": 0.6222, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 752, + "tokens_per_second_per_gpu": 16409.08, + "total_tokens": 19184023 + }, + { + "epoch": 0.06022554586899144, + "grad_norm": 0.5103474259376526, + "learning_rate": 3.5188009893686916e-06, + "loss": 0.6848, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 753, + "tokens_per_second_per_gpu": 17179.22, + "total_tokens": 19209757 + }, + { + "epoch": 0.060305526673598335, + "grad_norm": 0.5245898365974426, + "learning_rate": 3.492257827341492e-06, + "loss": 0.6132, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 754, + "tokens_per_second_per_gpu": 16744.73, + "total_tokens": 19235219 + }, + { + "epoch": 0.06038550747820523, + "grad_norm": 0.5107713937759399, + "learning_rate": 3.4657939600989453e-06, + "loss": 0.6396, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 755, + "tokens_per_second_per_gpu": 17234.96, + "total_tokens": 19261421 + }, + { + "epoch": 0.060465488282812126, + "grad_norm": 0.5091108679771423, + "learning_rate": 3.4394097100949286e-06, + "loss": 0.6414, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 756, + "tokens_per_second_per_gpu": 16886.75, + "total_tokens": 19287079 + }, + { + "epoch": 0.06054546908741902, + "grad_norm": 0.5734265446662903, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.6478, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 757, + "tokens_per_second_per_gpu": 16271.26, + "total_tokens": 19311542 + }, + { + "epoch": 0.060625449892025916, + "grad_norm": 0.5568541884422302, + "learning_rate": 3.3868813467634833e-06, + "loss": 0.6899, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 758, + "tokens_per_second_per_gpu": 16956.37, + "total_tokens": 19336639 + }, + { + "epoch": 0.060705430696632805, + "grad_norm": 0.513871967792511, + "learning_rate": 3.360737873477584e-06, + "loss": 0.6322, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 759, + "tokens_per_second_per_gpu": 17458.5, + "total_tokens": 19362877 + }, + { + "epoch": 0.0607854115012397, + "grad_norm": 0.9253639578819275, + "learning_rate": 3.3346752975054763e-06, + "loss": 0.6365, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 760, + "tokens_per_second_per_gpu": 16756.47, + "total_tokens": 19387787 + }, + { + "epoch": 0.060865392305846595, + "grad_norm": 0.5255782604217529, + "learning_rate": 3.308693936411421e-06, + "loss": 0.6198, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 761, + "tokens_per_second_per_gpu": 16632.82, + "total_tokens": 19412823 + }, + { + "epoch": 0.06094537311045349, + "grad_norm": 0.5765253901481628, + "learning_rate": 3.2827941067700996e-06, + "loss": 0.683, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 762, + "tokens_per_second_per_gpu": 17055.63, + "total_tokens": 19438474 + }, + { + "epoch": 0.061025353915060386, + "grad_norm": 0.5258163809776306, + "learning_rate": 3.2569761241627694e-06, + "loss": 0.6129, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 763, + "tokens_per_second_per_gpu": 16890.04, + "total_tokens": 19463735 + }, + { + "epoch": 0.06110533471966728, + "grad_norm": 0.5253279209136963, + "learning_rate": 3.2312403031733943e-06, + "loss": 0.6451, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 764, + "tokens_per_second_per_gpu": 16498.84, + "total_tokens": 19488908 + }, + { + "epoch": 0.06118531552427418, + "grad_norm": 0.5541175603866577, + "learning_rate": 3.2055869573848374e-06, + "loss": 0.6668, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 765, + "tokens_per_second_per_gpu": 16835.26, + "total_tokens": 19514039 + }, + { + "epoch": 0.061265296328881065, + "grad_norm": 0.5414297580718994, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.6561, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 766, + "tokens_per_second_per_gpu": 16740.75, + "total_tokens": 19539620 + }, + { + "epoch": 0.06134527713348796, + "grad_norm": 0.5167970657348633, + "learning_rate": 3.1545289407131128e-06, + "loss": 0.6536, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 767, + "tokens_per_second_per_gpu": 17410.67, + "total_tokens": 19565987 + }, + { + "epoch": 0.061425257938094856, + "grad_norm": 0.5267289280891418, + "learning_rate": 3.1291248919557717e-06, + "loss": 0.6601, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 768, + "tokens_per_second_per_gpu": 16764.95, + "total_tokens": 19591059 + }, + { + "epoch": 0.06150523874270175, + "grad_norm": 0.5405831336975098, + "learning_rate": 3.103804562643302e-06, + "loss": 0.6209, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 769, + "tokens_per_second_per_gpu": 16595.39, + "total_tokens": 19615537 + }, + { + "epoch": 0.061585219547308646, + "grad_norm": 0.5549702048301697, + "learning_rate": 3.0785682612959334e-06, + "loss": 0.7085, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 770, + "tokens_per_second_per_gpu": 17500.29, + "total_tokens": 19642176 + }, + { + "epoch": 0.06166520035191554, + "grad_norm": 0.526394784450531, + "learning_rate": 3.0534162954100264e-06, + "loss": 0.6627, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 771, + "tokens_per_second_per_gpu": 16967.6, + "total_tokens": 19668033 + }, + { + "epoch": 0.06174518115652244, + "grad_norm": 0.5020858645439148, + "learning_rate": 3.028348971454356e-06, + "loss": 0.6248, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 772, + "tokens_per_second_per_gpu": 16806.61, + "total_tokens": 19693414 + }, + { + "epoch": 0.06182516196112933, + "grad_norm": 0.5282226204872131, + "learning_rate": 3.003366594866345e-06, + "loss": 0.5409, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 773, + "tokens_per_second_per_gpu": 15666.25, + "total_tokens": 19716633 + }, + { + "epoch": 0.06190514276573622, + "grad_norm": 0.5440317988395691, + "learning_rate": 2.978469470048376e-06, + "loss": 0.6455, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 774, + "tokens_per_second_per_gpu": 16783.48, + "total_tokens": 19741484 + }, + { + "epoch": 0.061985123570343116, + "grad_norm": 0.5348433256149292, + "learning_rate": 2.953657900364053e-06, + "loss": 0.6522, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 775, + "tokens_per_second_per_gpu": 17053.01, + "total_tokens": 19767834 + }, + { + "epoch": 0.06206510437495001, + "grad_norm": 0.543786883354187, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.6371, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 776, + "tokens_per_second_per_gpu": 16892.92, + "total_tokens": 19793247 + }, + { + "epoch": 0.06214508517955691, + "grad_norm": 0.5311694741249084, + "learning_rate": 2.9042926346347932e-06, + "loss": 0.649, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 777, + "tokens_per_second_per_gpu": 16817.82, + "total_tokens": 19818294 + }, + { + "epoch": 0.0622250659841638, + "grad_norm": 0.5300283432006836, + "learning_rate": 2.8797395400900362e-06, + "loss": 0.641, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 778, + "tokens_per_second_per_gpu": 16882.54, + "total_tokens": 19843333 + }, + { + "epoch": 0.0623050467887707, + "grad_norm": 0.5160916447639465, + "learning_rate": 2.855273203671969e-06, + "loss": 0.6468, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 779, + "tokens_per_second_per_gpu": 17472.3, + "total_tokens": 19869580 + }, + { + "epoch": 0.06238502759337759, + "grad_norm": 0.5387117266654968, + "learning_rate": 2.830893923495173e-06, + "loss": 0.6213, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 780, + "tokens_per_second_per_gpu": 17099.02, + "total_tokens": 19895160 + }, + { + "epoch": 0.06246500839798448, + "grad_norm": 0.5398359894752502, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.6452, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 781, + "tokens_per_second_per_gpu": 17350.46, + "total_tokens": 19920688 + }, + { + "epoch": 0.06254498920259138, + "grad_norm": 0.5316033363342285, + "learning_rate": 2.7823977190163788e-06, + "loss": 0.6397, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 782, + "tokens_per_second_per_gpu": 16772.7, + "total_tokens": 19945648 + }, + { + "epoch": 0.06262497000719827, + "grad_norm": 0.5031722187995911, + "learning_rate": 2.7582813856253276e-06, + "loss": 0.6356, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 783, + "tokens_per_second_per_gpu": 17215.15, + "total_tokens": 19971748 + }, + { + "epoch": 0.06270495081180516, + "grad_norm": 0.5138970017433167, + "learning_rate": 2.7342532902902418e-06, + "loss": 0.6307, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 784, + "tokens_per_second_per_gpu": 17223.48, + "total_tokens": 19998237 + }, + { + "epoch": 0.06278493161641206, + "grad_norm": 0.4964427053928375, + "learning_rate": 2.7103137257858867e-06, + "loss": 0.5311, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 785, + "tokens_per_second_per_gpu": 16260.69, + "total_tokens": 20022768 + }, + { + "epoch": 0.06286491242101895, + "grad_norm": 0.5650128126144409, + "learning_rate": 2.6864629838082957e-06, + "loss": 0.5814, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 786, + "tokens_per_second_per_gpu": 16690.38, + "total_tokens": 20047748 + }, + { + "epoch": 0.06294489322562585, + "grad_norm": 0.5131796598434448, + "learning_rate": 2.6627013549712355e-06, + "loss": 0.6104, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 787, + "tokens_per_second_per_gpu": 16507.9, + "total_tokens": 20072431 + }, + { + "epoch": 0.06302487403023274, + "grad_norm": 0.5409046411514282, + "learning_rate": 2.639029128802657e-06, + "loss": 0.6366, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 788, + "tokens_per_second_per_gpu": 17400.52, + "total_tokens": 20098484 + }, + { + "epoch": 0.06310485483483964, + "grad_norm": 0.5458826422691345, + "learning_rate": 2.615446593741161e-06, + "loss": 0.6474, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 789, + "tokens_per_second_per_gpu": 17471.76, + "total_tokens": 20124504 + }, + { + "epoch": 0.06318483563944653, + "grad_norm": 0.5500627160072327, + "learning_rate": 2.5919540371325005e-06, + "loss": 0.6645, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 790, + "tokens_per_second_per_gpu": 16887.62, + "total_tokens": 20149719 + }, + { + "epoch": 0.06326481644405342, + "grad_norm": 0.7003596425056458, + "learning_rate": 2.5685517452260566e-06, + "loss": 0.6207, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 791, + "tokens_per_second_per_gpu": 17039.19, + "total_tokens": 20174892 + }, + { + "epoch": 0.06334479724866032, + "grad_norm": 0.5149163007736206, + "learning_rate": 2.5452400031713786e-06, + "loss": 0.5908, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 792, + "tokens_per_second_per_gpu": 16747.71, + "total_tokens": 20199527 + }, + { + "epoch": 0.06342477805326721, + "grad_norm": 0.5385146141052246, + "learning_rate": 2.522019095014683e-06, + "loss": 0.6026, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 793, + "tokens_per_second_per_gpu": 16726.82, + "total_tokens": 20224267 + }, + { + "epoch": 0.06350475885787411, + "grad_norm": 0.5230799317359924, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.6515, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 794, + "tokens_per_second_per_gpu": 17114.37, + "total_tokens": 20250327 + }, + { + "epoch": 0.063584739662481, + "grad_norm": 0.5139489769935608, + "learning_rate": 2.4758509110427576e-06, + "loss": 0.6482, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 795, + "tokens_per_second_per_gpu": 16803.05, + "total_tokens": 20275583 + }, + { + "epoch": 0.0636647204670879, + "grad_norm": 0.523923933506012, + "learning_rate": 2.45290419777228e-06, + "loss": 0.6579, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 796, + "tokens_per_second_per_gpu": 17032.63, + "total_tokens": 20301068 + }, + { + "epoch": 0.06374470127169479, + "grad_norm": 0.7031800746917725, + "learning_rate": 2.4300494434824373e-06, + "loss": 0.6209, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 797, + "tokens_per_second_per_gpu": 16706.86, + "total_tokens": 20326831 + }, + { + "epoch": 0.06382468207630168, + "grad_norm": 0.5347440838813782, + "learning_rate": 2.407286926651192e-06, + "loss": 0.6361, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 798, + "tokens_per_second_per_gpu": 16973.33, + "total_tokens": 20351944 + }, + { + "epoch": 0.06390466288090858, + "grad_norm": 0.5046122074127197, + "learning_rate": 2.3846169246326345e-06, + "loss": 0.6284, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 799, + "tokens_per_second_per_gpu": 17113.48, + "total_tokens": 20377650 + }, + { + "epoch": 0.06398464368551547, + "grad_norm": 0.5298998355865479, + "learning_rate": 2.362039713653581e-06, + "loss": 0.6105, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 800, + "tokens_per_second_per_gpu": 16898.19, + "total_tokens": 20403133 + }, + { + "epoch": 0.06406462449012237, + "grad_norm": 0.5419802665710449, + "learning_rate": 2.339555568810221e-06, + "loss": 0.6345, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 801, + "tokens_per_second_per_gpu": 16951.5, + "total_tokens": 20428474 + }, + { + "epoch": 0.06414460529472926, + "grad_norm": 0.5342543125152588, + "learning_rate": 2.317164764064769e-06, + "loss": 0.6599, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 802, + "tokens_per_second_per_gpu": 17249.38, + "total_tokens": 20454441 + }, + { + "epoch": 0.06422458609933616, + "grad_norm": 0.5269400477409363, + "learning_rate": 2.2948675722421086e-06, + "loss": 0.5818, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 803, + "tokens_per_second_per_gpu": 16686.12, + "total_tokens": 20478852 + }, + { + "epoch": 0.06430456690394305, + "grad_norm": 0.5437107086181641, + "learning_rate": 2.27266426502649e-06, + "loss": 0.6462, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 804, + "tokens_per_second_per_gpu": 16684.85, + "total_tokens": 20503980 + }, + { + "epoch": 0.06438454770854994, + "grad_norm": 0.5457687973976135, + "learning_rate": 2.2505551129582047e-06, + "loss": 0.6608, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 805, + "tokens_per_second_per_gpu": 16383.96, + "total_tokens": 20528619 + }, + { + "epoch": 0.06446452851315684, + "grad_norm": 0.5263291001319885, + "learning_rate": 2.2285403854302912e-06, + "loss": 0.6213, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 806, + "tokens_per_second_per_gpu": 17247.48, + "total_tokens": 20554387 + }, + { + "epoch": 0.06454450931776373, + "grad_norm": 0.5361708402633667, + "learning_rate": 2.206620350685257e-06, + "loss": 0.6427, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 807, + "tokens_per_second_per_gpu": 17243.59, + "total_tokens": 20579876 + }, + { + "epoch": 0.06462449012237063, + "grad_norm": 0.5289268493652344, + "learning_rate": 2.1847952758118118e-06, + "loss": 0.6201, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 808, + "tokens_per_second_per_gpu": 16553.91, + "total_tokens": 20604719 + }, + { + "epoch": 0.06470447092697752, + "grad_norm": 0.544245183467865, + "learning_rate": 2.163065426741603e-06, + "loss": 0.6662, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 809, + "tokens_per_second_per_gpu": 17452.98, + "total_tokens": 20630589 + }, + { + "epoch": 0.06478445173158442, + "grad_norm": 0.5488360524177551, + "learning_rate": 2.1414310682459805e-06, + "loss": 0.6423, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 810, + "tokens_per_second_per_gpu": 16514.2, + "total_tokens": 20655490 + }, + { + "epoch": 0.06486443253619131, + "grad_norm": 0.5205331444740295, + "learning_rate": 2.119892463932781e-06, + "loss": 0.5988, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 811, + "tokens_per_second_per_gpu": 17129.86, + "total_tokens": 20681611 + }, + { + "epoch": 0.0649444133407982, + "grad_norm": 0.5465454459190369, + "learning_rate": 2.098449876243096e-06, + "loss": 0.6205, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 812, + "tokens_per_second_per_gpu": 14461.03, + "total_tokens": 20706354 + }, + { + "epoch": 0.0650243941454051, + "grad_norm": 0.5238451361656189, + "learning_rate": 2.0771035664480944e-06, + "loss": 0.6657, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 813, + "tokens_per_second_per_gpu": 17103.66, + "total_tokens": 20732674 + }, + { + "epoch": 0.06510437495001199, + "grad_norm": 0.5532448291778564, + "learning_rate": 2.0558537946458177e-06, + "loss": 0.7047, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 814, + "tokens_per_second_per_gpu": 17073.99, + "total_tokens": 20758451 + }, + { + "epoch": 0.0651843557546189, + "grad_norm": 0.5232256054878235, + "learning_rate": 2.0347008197580376e-06, + "loss": 0.6141, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 815, + "tokens_per_second_per_gpu": 17314.13, + "total_tokens": 20784400 + }, + { + "epoch": 0.06526433655922578, + "grad_norm": 0.537419855594635, + "learning_rate": 2.013644899527074e-06, + "loss": 0.6804, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 816, + "tokens_per_second_per_gpu": 17404.52, + "total_tokens": 20811146 + }, + { + "epoch": 0.06534431736383269, + "grad_norm": 0.5733768343925476, + "learning_rate": 1.9926862905126663e-06, + "loss": 0.6745, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 817, + "tokens_per_second_per_gpu": 16847.29, + "total_tokens": 20836415 + }, + { + "epoch": 0.06542429816843957, + "grad_norm": 0.5123438239097595, + "learning_rate": 1.9718252480888567e-06, + "loss": 0.6181, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 818, + "tokens_per_second_per_gpu": 17343.21, + "total_tokens": 20862400 + }, + { + "epoch": 0.06550427897304646, + "grad_norm": 0.5344765782356262, + "learning_rate": 1.95106202644086e-06, + "loss": 0.6232, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 819, + "tokens_per_second_per_gpu": 16935.96, + "total_tokens": 20887638 + }, + { + "epoch": 0.06558425977765336, + "grad_norm": 0.5133531093597412, + "learning_rate": 1.930396878561983e-06, + "loss": 0.6081, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 820, + "tokens_per_second_per_gpu": 17029.02, + "total_tokens": 20913351 + }, + { + "epoch": 0.06566424058226025, + "grad_norm": 0.5093186497688293, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.5818, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 821, + "tokens_per_second_per_gpu": 17013.0, + "total_tokens": 20938940 + }, + { + "epoch": 0.06574422138686715, + "grad_norm": 0.5392187833786011, + "learning_rate": 1.8893618101067357e-06, + "loss": 0.6245, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 822, + "tokens_per_second_per_gpu": 17008.86, + "total_tokens": 20964899 + }, + { + "epoch": 0.06582420219147404, + "grad_norm": 0.5658339858055115, + "learning_rate": 1.8689923895297247e-06, + "loss": 0.6505, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 823, + "tokens_per_second_per_gpu": 16746.45, + "total_tokens": 20990024 + }, + { + "epoch": 0.06590418299608095, + "grad_norm": 0.5425558090209961, + "learning_rate": 1.848722042714457e-06, + "loss": 0.6539, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 824, + "tokens_per_second_per_gpu": 16762.28, + "total_tokens": 21015394 + }, + { + "epoch": 0.06598416380068783, + "grad_norm": 0.49881938099861145, + "learning_rate": 1.8285510166487154e-06, + "loss": 0.6495, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 825, + "tokens_per_second_per_gpu": 17344.43, + "total_tokens": 21041742 + }, + { + "epoch": 0.06606414460529474, + "grad_norm": 0.5435726642608643, + "learning_rate": 1.808479557110081e-06, + "loss": 0.6394, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 826, + "tokens_per_second_per_gpu": 16958.22, + "total_tokens": 21066554 + }, + { + "epoch": 0.06614412540990162, + "grad_norm": 0.5088474750518799, + "learning_rate": 1.7885079086629598e-06, + "loss": 0.6283, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 827, + "tokens_per_second_per_gpu": 16846.33, + "total_tokens": 21091431 + }, + { + "epoch": 0.06622410621450851, + "grad_norm": 0.5184812545776367, + "learning_rate": 1.7686363146555807e-06, + "loss": 0.6048, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 828, + "tokens_per_second_per_gpu": 16807.75, + "total_tokens": 21116183 + }, + { + "epoch": 0.06630408701911542, + "grad_norm": 0.5359786748886108, + "learning_rate": 1.7488650172170496e-06, + "loss": 0.6232, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 829, + "tokens_per_second_per_gpu": 16758.36, + "total_tokens": 21141076 + }, + { + "epoch": 0.0663840678237223, + "grad_norm": 0.5581966638565063, + "learning_rate": 1.7291942572543806e-06, + "loss": 0.6722, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 830, + "tokens_per_second_per_gpu": 16923.34, + "total_tokens": 21166455 + }, + { + "epoch": 0.0664640486283292, + "grad_norm": 0.5573216080665588, + "learning_rate": 1.709624274449584e-06, + "loss": 0.6268, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 831, + "tokens_per_second_per_gpu": 16440.4, + "total_tokens": 21190565 + }, + { + "epoch": 0.0665440294329361, + "grad_norm": 0.5541191697120667, + "learning_rate": 1.6901553072567189e-06, + "loss": 0.6127, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 832, + "tokens_per_second_per_gpu": 16602.29, + "total_tokens": 21214401 + }, + { + "epoch": 0.066624010237543, + "grad_norm": 0.5143559575080872, + "learning_rate": 1.6707875928990059e-06, + "loss": 0.6113, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 833, + "tokens_per_second_per_gpu": 16831.5, + "total_tokens": 21239453 + }, + { + "epoch": 0.06670399104214988, + "grad_norm": 0.5253430604934692, + "learning_rate": 1.651521367365936e-06, + "loss": 0.6203, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 834, + "tokens_per_second_per_gpu": 17026.44, + "total_tokens": 21265048 + }, + { + "epoch": 0.06678397184675677, + "grad_norm": 0.5263636708259583, + "learning_rate": 1.6323568654103838e-06, + "loss": 0.6411, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 835, + "tokens_per_second_per_gpu": 17001.65, + "total_tokens": 21290537 + }, + { + "epoch": 0.06686395265136368, + "grad_norm": 0.5092071890830994, + "learning_rate": 1.6132943205457607e-06, + "loss": 0.6245, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 836, + "tokens_per_second_per_gpu": 17222.45, + "total_tokens": 21316714 + }, + { + "epoch": 0.06694393345597056, + "grad_norm": 0.48893386125564575, + "learning_rate": 1.5943339650431578e-06, + "loss": 0.598, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 837, + "tokens_per_second_per_gpu": 17556.4, + "total_tokens": 21343578 + }, + { + "epoch": 0.06702391426057747, + "grad_norm": 0.5376018285751343, + "learning_rate": 1.5754760299285255e-06, + "loss": 0.6301, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 838, + "tokens_per_second_per_gpu": 16566.21, + "total_tokens": 21367631 + }, + { + "epoch": 0.06710389506518435, + "grad_norm": 0.5278213024139404, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.613, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 839, + "tokens_per_second_per_gpu": 16833.56, + "total_tokens": 21393005 + }, + { + "epoch": 0.06718387586979126, + "grad_norm": 0.5237742066383362, + "learning_rate": 1.538068338724361e-06, + "loss": 0.6129, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 840, + "tokens_per_second_per_gpu": 16785.39, + "total_tokens": 21418322 + }, + { + "epoch": 0.06726385667439815, + "grad_norm": 0.5155054926872253, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.6684, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 841, + "tokens_per_second_per_gpu": 16989.1, + "total_tokens": 21444279 + }, + { + "epoch": 0.06734383747900503, + "grad_norm": 0.509067714214325, + "learning_rate": 1.5010730701313626e-06, + "loss": 0.5873, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 842, + "tokens_per_second_per_gpu": 16848.11, + "total_tokens": 21469610 + }, + { + "epoch": 0.06742381828361194, + "grad_norm": 0.5532313585281372, + "learning_rate": 1.4827306585695234e-06, + "loss": 0.7063, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 843, + "tokens_per_second_per_gpu": 17019.37, + "total_tokens": 21495536 + }, + { + "epoch": 0.06750379908821882, + "grad_norm": 0.5344107747077942, + "learning_rate": 1.4644920272467245e-06, + "loss": 0.649, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 844, + "tokens_per_second_per_gpu": 17240.12, + "total_tokens": 21521566 + }, + { + "epoch": 0.06758377989282573, + "grad_norm": 0.5316765904426575, + "learning_rate": 1.446357398394934e-06, + "loss": 0.6567, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 845, + "tokens_per_second_per_gpu": 16519.7, + "total_tokens": 21546818 + }, + { + "epoch": 0.06766376069743262, + "grad_norm": 0.5471608638763428, + "learning_rate": 1.4283269929788779e-06, + "loss": 0.719, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 846, + "tokens_per_second_per_gpu": 17558.19, + "total_tokens": 21573813 + }, + { + "epoch": 0.06774374150203952, + "grad_norm": 0.5042493343353271, + "learning_rate": 1.4104010306933558e-06, + "loss": 0.5862, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 847, + "tokens_per_second_per_gpu": 16810.13, + "total_tokens": 21598898 + }, + { + "epoch": 0.0678237223066464, + "grad_norm": 0.5342798233032227, + "learning_rate": 1.3925797299605649e-06, + "loss": 0.629, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 848, + "tokens_per_second_per_gpu": 16624.04, + "total_tokens": 21624135 + }, + { + "epoch": 0.0679037031112533, + "grad_norm": 0.4844224750995636, + "learning_rate": 1.3748633079274254e-06, + "loss": 0.6284, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 849, + "tokens_per_second_per_gpu": 17643.78, + "total_tokens": 21651187 + }, + { + "epoch": 0.0679836839158602, + "grad_norm": 0.5576471090316772, + "learning_rate": 1.3572519804629537e-06, + "loss": 0.6663, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 850, + "tokens_per_second_per_gpu": 16739.95, + "total_tokens": 21676719 + }, + { + "epoch": 0.06806366472046708, + "grad_norm": 0.561067521572113, + "learning_rate": 1.339745962155613e-06, + "loss": 0.6616, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 851, + "tokens_per_second_per_gpu": 17580.06, + "total_tokens": 21703252 + }, + { + "epoch": 0.06814364552507399, + "grad_norm": 0.4975440204143524, + "learning_rate": 1.322345466310717e-06, + "loss": 0.6046, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 852, + "tokens_per_second_per_gpu": 17651.26, + "total_tokens": 21729685 + }, + { + "epoch": 0.06822362632968088, + "grad_norm": 0.5225350856781006, + "learning_rate": 1.30505070494781e-06, + "loss": 0.6014, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 853, + "tokens_per_second_per_gpu": 16744.83, + "total_tokens": 21754924 + }, + { + "epoch": 0.06830360713428778, + "grad_norm": 0.5175594687461853, + "learning_rate": 1.2878618887981064e-06, + "loss": 0.6292, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 854, + "tokens_per_second_per_gpu": 17130.6, + "total_tokens": 21780725 + }, + { + "epoch": 0.06838358793889467, + "grad_norm": 0.5212383270263672, + "learning_rate": 1.2707792273019049e-06, + "loss": 0.6432, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 855, + "tokens_per_second_per_gpu": 16877.97, + "total_tokens": 21806402 + }, + { + "epoch": 0.06846356874350155, + "grad_norm": 0.5338414907455444, + "learning_rate": 1.2538029286060428e-06, + "loss": 0.6623, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 856, + "tokens_per_second_per_gpu": 17129.27, + "total_tokens": 21832107 + }, + { + "epoch": 0.06854354954810846, + "grad_norm": 0.5500073432922363, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.5917, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 857, + "tokens_per_second_per_gpu": 17256.77, + "total_tokens": 21857407 + }, + { + "epoch": 0.06862353035271535, + "grad_norm": 0.5338061451911926, + "learning_rate": 1.2201702457201948e-06, + "loss": 0.5952, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 858, + "tokens_per_second_per_gpu": 16673.84, + "total_tokens": 21882039 + }, + { + "epoch": 0.06870351115732225, + "grad_norm": 0.5579566955566406, + "learning_rate": 1.2035142713338366e-06, + "loss": 0.6569, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 859, + "tokens_per_second_per_gpu": 16671.62, + "total_tokens": 21907057 + }, + { + "epoch": 0.06878349196192914, + "grad_norm": 0.5408582091331482, + "learning_rate": 1.1869654793500784e-06, + "loss": 0.6789, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 860, + "tokens_per_second_per_gpu": 17620.35, + "total_tokens": 21933442 + }, + { + "epoch": 0.06886347276653604, + "grad_norm": 0.5381220579147339, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.6369, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 861, + "tokens_per_second_per_gpu": 16727.68, + "total_tokens": 21958329 + }, + { + "epoch": 0.06894345357114293, + "grad_norm": 0.6026800274848938, + "learning_rate": 1.1541902478491607e-06, + "loss": 0.6602, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 862, + "tokens_per_second_per_gpu": 17363.28, + "total_tokens": 21984511 + }, + { + "epoch": 0.06902343437574981, + "grad_norm": 0.5562117099761963, + "learning_rate": 1.1379642076878528e-06, + "loss": 0.6528, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 863, + "tokens_per_second_per_gpu": 16971.05, + "total_tokens": 22009918 + }, + { + "epoch": 0.06910341518035672, + "grad_norm": 0.5344293713569641, + "learning_rate": 1.1218461486359878e-06, + "loss": 0.5938, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 864, + "tokens_per_second_per_gpu": 17145.65, + "total_tokens": 22035209 + }, + { + "epoch": 0.0691833959849636, + "grad_norm": 0.5310640931129456, + "learning_rate": 1.1058362670870248e-06, + "loss": 0.6437, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 865, + "tokens_per_second_per_gpu": 17253.19, + "total_tokens": 22060530 + }, + { + "epoch": 0.06926337678957051, + "grad_norm": 0.5512663125991821, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.6598, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 866, + "tokens_per_second_per_gpu": 16895.55, + "total_tokens": 22085916 + }, + { + "epoch": 0.0693433575941774, + "grad_norm": 0.5096883773803711, + "learning_rate": 1.0741418154787443e-06, + "loss": 0.5948, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 867, + "tokens_per_second_per_gpu": 17271.65, + "total_tokens": 22111625 + }, + { + "epoch": 0.0694233383987843, + "grad_norm": 0.564525306224823, + "learning_rate": 1.058457631606319e-06, + "loss": 0.6606, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 868, + "tokens_per_second_per_gpu": 16660.76, + "total_tokens": 22136753 + }, + { + "epoch": 0.06950331920339119, + "grad_norm": 0.5231457352638245, + "learning_rate": 1.042882397605871e-06, + "loss": 0.6307, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 869, + "tokens_per_second_per_gpu": 17528.22, + "total_tokens": 22163028 + }, + { + "epoch": 0.06958330000799808, + "grad_norm": 0.5321533679962158, + "learning_rate": 1.0274163032567165e-06, + "loss": 0.61, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 870, + "tokens_per_second_per_gpu": 16565.4, + "total_tokens": 22187415 + }, + { + "epoch": 0.06966328081260498, + "grad_norm": 0.5243551135063171, + "learning_rate": 1.012059537008332e-06, + "loss": 0.6004, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 871, + "tokens_per_second_per_gpu": 17433.64, + "total_tokens": 22213442 + }, + { + "epoch": 0.06974326161721187, + "grad_norm": 0.5377528667449951, + "learning_rate": 9.968122859780648e-07, + "loss": 0.6148, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 872, + "tokens_per_second_per_gpu": 16785.52, + "total_tokens": 22238355 + }, + { + "epoch": 0.06982324242181877, + "grad_norm": 0.5466241240501404, + "learning_rate": 9.816747359488632e-07, + "loss": 0.6273, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 873, + "tokens_per_second_per_gpu": 16524.06, + "total_tokens": 22263013 + }, + { + "epoch": 0.06990322322642566, + "grad_norm": 0.5036591291427612, + "learning_rate": 9.666470713669918e-07, + "loss": 0.5697, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 874, + "tokens_per_second_per_gpu": 16487.97, + "total_tokens": 22287505 + }, + { + "epoch": 0.06998320403103256, + "grad_norm": 0.5080577731132507, + "learning_rate": 9.517294753398066e-07, + "loss": 0.629, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 875, + "tokens_per_second_per_gpu": 17594.93, + "total_tokens": 22314169 + }, + { + "epoch": 0.07006318483563945, + "grad_norm": 0.48881781101226807, + "learning_rate": 9.369221296335007e-07, + "loss": 0.5497, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 876, + "tokens_per_second_per_gpu": 16353.97, + "total_tokens": 22338437 + }, + { + "epoch": 0.07014316564024634, + "grad_norm": 0.6482576727867126, + "learning_rate": 9.222252146709143e-07, + "loss": 0.6313, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 877, + "tokens_per_second_per_gpu": 18105.32, + "total_tokens": 22365579 + }, + { + "epoch": 0.07022314644485324, + "grad_norm": 0.5619848966598511, + "learning_rate": 9.076389095293148e-07, + "loss": 0.6667, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 878, + "tokens_per_second_per_gpu": 17285.05, + "total_tokens": 22391841 + }, + { + "epoch": 0.07030312724946013, + "grad_norm": 0.5285101532936096, + "learning_rate": 8.931633919382299e-07, + "loss": 0.7238, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 879, + "tokens_per_second_per_gpu": 17521.55, + "total_tokens": 22418478 + }, + { + "epoch": 0.07038310805406703, + "grad_norm": 0.5396612286567688, + "learning_rate": 8.787988382772705e-07, + "loss": 0.5924, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 880, + "tokens_per_second_per_gpu": 16798.7, + "total_tokens": 22443336 + }, + { + "epoch": 0.07046308885867392, + "grad_norm": 0.5362244248390198, + "learning_rate": 8.645454235739903e-07, + "loss": 0.6344, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 881, + "tokens_per_second_per_gpu": 17292.72, + "total_tokens": 22469492 + }, + { + "epoch": 0.07054306966328082, + "grad_norm": 0.5551726222038269, + "learning_rate": 8.504033215017527e-07, + "loss": 0.6013, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 882, + "tokens_per_second_per_gpu": 16799.15, + "total_tokens": 22494816 + }, + { + "epoch": 0.07062305046788771, + "grad_norm": 0.5338584184646606, + "learning_rate": 8.363727043776037e-07, + "loss": 0.5833, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 883, + "tokens_per_second_per_gpu": 16136.84, + "total_tokens": 22518177 + }, + { + "epoch": 0.0707030312724946, + "grad_norm": 0.551880419254303, + "learning_rate": 8.224537431601886e-07, + "loss": 0.655, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 884, + "tokens_per_second_per_gpu": 16505.32, + "total_tokens": 22543491 + }, + { + "epoch": 0.0707830120771015, + "grad_norm": 0.5214104056358337, + "learning_rate": 8.086466074476562e-07, + "loss": 0.659, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 885, + "tokens_per_second_per_gpu": 17549.53, + "total_tokens": 22569806 + }, + { + "epoch": 0.07086299288170839, + "grad_norm": 0.542853057384491, + "learning_rate": 7.949514654755963e-07, + "loss": 0.6105, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 886, + "tokens_per_second_per_gpu": 16930.88, + "total_tokens": 22594691 + }, + { + "epoch": 0.07094297368631529, + "grad_norm": 0.5428863167762756, + "learning_rate": 7.81368484114996e-07, + "loss": 0.6104, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 887, + "tokens_per_second_per_gpu": 17283.71, + "total_tokens": 22620948 + }, + { + "epoch": 0.07102295449092218, + "grad_norm": 0.5423809289932251, + "learning_rate": 7.678978288701911e-07, + "loss": 0.636, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 888, + "tokens_per_second_per_gpu": 17444.73, + "total_tokens": 22647130 + }, + { + "epoch": 0.07110293529552908, + "grad_norm": 0.5242039561271667, + "learning_rate": 7.545396638768698e-07, + "loss": 0.6312, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 889, + "tokens_per_second_per_gpu": 16473.27, + "total_tokens": 22671849 + }, + { + "epoch": 0.07118291610013597, + "grad_norm": 0.5590543746948242, + "learning_rate": 7.412941519000527e-07, + "loss": 0.6512, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 890, + "tokens_per_second_per_gpu": 16881.91, + "total_tokens": 22697023 + }, + { + "epoch": 0.07126289690474286, + "grad_norm": 0.5263185501098633, + "learning_rate": 7.281614543321269e-07, + "loss": 0.6223, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 891, + "tokens_per_second_per_gpu": 16850.16, + "total_tokens": 22722217 + }, + { + "epoch": 0.07134287770934976, + "grad_norm": 0.5166627764701843, + "learning_rate": 7.151417311908648e-07, + "loss": 0.6212, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 892, + "tokens_per_second_per_gpu": 16842.39, + "total_tokens": 22747378 + }, + { + "epoch": 0.07142285851395665, + "grad_norm": 0.5226914286613464, + "learning_rate": 7.022351411174866e-07, + "loss": 0.6251, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 893, + "tokens_per_second_per_gpu": 17338.77, + "total_tokens": 22773619 + }, + { + "epoch": 0.07150283931856355, + "grad_norm": 0.5193334221839905, + "learning_rate": 6.894418413747183e-07, + "loss": 0.6043, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 894, + "tokens_per_second_per_gpu": 17232.17, + "total_tokens": 22799126 + }, + { + "epoch": 0.07158282012317044, + "grad_norm": 0.5356144905090332, + "learning_rate": 6.767619878448783e-07, + "loss": 0.6715, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 895, + "tokens_per_second_per_gpu": 17571.32, + "total_tokens": 22825099 + }, + { + "epoch": 0.07166280092777734, + "grad_norm": 0.5201844573020935, + "learning_rate": 6.641957350279838e-07, + "loss": 0.637, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 896, + "tokens_per_second_per_gpu": 17330.69, + "total_tokens": 22851265 + }, + { + "epoch": 0.07174278173238423, + "grad_norm": 0.5070226192474365, + "learning_rate": 6.517432360398556e-07, + "loss": 0.6164, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 897, + "tokens_per_second_per_gpu": 17725.81, + "total_tokens": 22877914 + }, + { + "epoch": 0.07182276253699112, + "grad_norm": 0.5197336673736572, + "learning_rate": 6.394046426102673e-07, + "loss": 0.6104, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 898, + "tokens_per_second_per_gpu": 17001.09, + "total_tokens": 22903502 + }, + { + "epoch": 0.07190274334159802, + "grad_norm": 0.5605859756469727, + "learning_rate": 6.271801050810856e-07, + "loss": 0.7018, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 899, + "tokens_per_second_per_gpu": 16955.61, + "total_tokens": 22929177 + }, + { + "epoch": 0.07198272414620491, + "grad_norm": 0.4978031516075134, + "learning_rate": 6.150697724044407e-07, + "loss": 0.6308, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 900, + "tokens_per_second_per_gpu": 17524.78, + "total_tokens": 22956098 + }, + { + "epoch": 0.07206270495081181, + "grad_norm": 0.5414256453514099, + "learning_rate": 6.030737921409169e-07, + "loss": 0.6352, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 901, + "tokens_per_second_per_gpu": 16727.63, + "total_tokens": 22981145 + }, + { + "epoch": 0.0721426857554187, + "grad_norm": 0.5312582850456238, + "learning_rate": 5.911923104577455e-07, + "loss": 0.6422, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 902, + "tokens_per_second_per_gpu": 16767.2, + "total_tokens": 23006386 + }, + { + "epoch": 0.0722226665600256, + "grad_norm": 0.5257201194763184, + "learning_rate": 5.794254721270331e-07, + "loss": 0.6095, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 903, + "tokens_per_second_per_gpu": 16540.78, + "total_tokens": 23031228 + }, + { + "epoch": 0.07230264736463249, + "grad_norm": 0.5407463908195496, + "learning_rate": 5.677734205239904e-07, + "loss": 0.6093, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 904, + "tokens_per_second_per_gpu": 17082.27, + "total_tokens": 23056715 + }, + { + "epoch": 0.07238262816923938, + "grad_norm": 0.5344308018684387, + "learning_rate": 5.562362976251901e-07, + "loss": 0.655, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 905, + "tokens_per_second_per_gpu": 17119.96, + "total_tokens": 23082109 + }, + { + "epoch": 0.07246260897384628, + "grad_norm": 0.5326920747756958, + "learning_rate": 5.448142440068316e-07, + "loss": 0.643, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 906, + "tokens_per_second_per_gpu": 17190.5, + "total_tokens": 23107313 + }, + { + "epoch": 0.07254258977845317, + "grad_norm": 0.5334905385971069, + "learning_rate": 5.335073988430373e-07, + "loss": 0.5988, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 907, + "tokens_per_second_per_gpu": 16700.52, + "total_tokens": 23131980 + }, + { + "epoch": 0.07262257058306007, + "grad_norm": 0.49993833899497986, + "learning_rate": 5.223158999041444e-07, + "loss": 0.6369, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 908, + "tokens_per_second_per_gpu": 16759.27, + "total_tokens": 23157639 + }, + { + "epoch": 0.07270255138766696, + "grad_norm": 0.5197968482971191, + "learning_rate": 5.112398835550348e-07, + "loss": 0.6437, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 909, + "tokens_per_second_per_gpu": 17090.58, + "total_tokens": 23183670 + }, + { + "epoch": 0.07278253219227386, + "grad_norm": 0.5189102292060852, + "learning_rate": 5.002794847534765e-07, + "loss": 0.5972, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 910, + "tokens_per_second_per_gpu": 16958.96, + "total_tokens": 23208974 + }, + { + "epoch": 0.07286251299688075, + "grad_norm": 0.5296539068222046, + "learning_rate": 4.894348370484648e-07, + "loss": 0.6398, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 911, + "tokens_per_second_per_gpu": 17188.2, + "total_tokens": 23234688 + }, + { + "epoch": 0.07294249380148764, + "grad_norm": 0.5202224850654602, + "learning_rate": 4.787060725786141e-07, + "loss": 0.6371, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 912, + "tokens_per_second_per_gpu": 17395.29, + "total_tokens": 23260424 + }, + { + "epoch": 0.07302247460609454, + "grad_norm": 0.5297356843948364, + "learning_rate": 4.6809332207053083e-07, + "loss": 0.5857, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 913, + "tokens_per_second_per_gpu": 16700.69, + "total_tokens": 23285195 + }, + { + "epoch": 0.07310245541070143, + "grad_norm": 0.5384907126426697, + "learning_rate": 4.575967148372318e-07, + "loss": 0.6212, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 914, + "tokens_per_second_per_gpu": 17063.84, + "total_tokens": 23310815 + }, + { + "epoch": 0.07318243621530833, + "grad_norm": 0.5373451709747314, + "learning_rate": 4.4721637877656377e-07, + "loss": 0.6259, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 915, + "tokens_per_second_per_gpu": 17316.89, + "total_tokens": 23336669 + }, + { + "epoch": 0.07326241701991522, + "grad_norm": 0.5419356226921082, + "learning_rate": 4.3695244036964567e-07, + "loss": 0.6562, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 916, + "tokens_per_second_per_gpu": 17299.46, + "total_tokens": 23362881 + }, + { + "epoch": 0.07334239782452212, + "grad_norm": 0.54438316822052, + "learning_rate": 4.268050246793276e-07, + "loss": 0.612, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 917, + "tokens_per_second_per_gpu": 16889.02, + "total_tokens": 23388504 + }, + { + "epoch": 0.07342237862912901, + "grad_norm": 0.5277045369148254, + "learning_rate": 4.167742553486676e-07, + "loss": 0.6303, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 918, + "tokens_per_second_per_gpu": 16606.08, + "total_tokens": 23413492 + }, + { + "epoch": 0.0735023594337359, + "grad_norm": 0.5115805864334106, + "learning_rate": 4.068602545994249e-07, + "loss": 0.6335, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 919, + "tokens_per_second_per_gpu": 17113.19, + "total_tokens": 23439554 + }, + { + "epoch": 0.0735823402383428, + "grad_norm": 0.5200874209403992, + "learning_rate": 3.9706314323056936e-07, + "loss": 0.6528, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 920, + "tokens_per_second_per_gpu": 17124.8, + "total_tokens": 23465496 + }, + { + "epoch": 0.07366232104294969, + "grad_norm": 0.535873293876648, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.6929, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 921, + "tokens_per_second_per_gpu": 17206.32, + "total_tokens": 23492042 + }, + { + "epoch": 0.07374230184755659, + "grad_norm": 0.5385628938674927, + "learning_rate": 3.7782006470714614e-07, + "loss": 0.6276, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 922, + "tokens_per_second_per_gpu": 16672.51, + "total_tokens": 23516992 + }, + { + "epoch": 0.07382228265216348, + "grad_norm": 0.5313562750816345, + "learning_rate": 3.68374332023419e-07, + "loss": 0.5886, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 923, + "tokens_per_second_per_gpu": 16765.04, + "total_tokens": 23541526 + }, + { + "epoch": 0.07390226345677038, + "grad_norm": 0.5149674415588379, + "learning_rate": 3.590459576589e-07, + "loss": 0.6305, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 924, + "tokens_per_second_per_gpu": 17117.27, + "total_tokens": 23567238 + }, + { + "epoch": 0.07398224426137727, + "grad_norm": 0.5549934506416321, + "learning_rate": 3.498350552768859e-07, + "loss": 0.6459, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 925, + "tokens_per_second_per_gpu": 16924.56, + "total_tokens": 23592535 + }, + { + "epoch": 0.07406222506598416, + "grad_norm": 0.5501406788825989, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.6181, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 926, + "tokens_per_second_per_gpu": 17134.37, + "total_tokens": 23618080 + }, + { + "epoch": 0.07414220587059106, + "grad_norm": 0.5153313875198364, + "learning_rate": 3.3176611395540625e-07, + "loss": 0.6321, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 927, + "tokens_per_second_per_gpu": 17355.28, + "total_tokens": 23644485 + }, + { + "epoch": 0.07422218667519795, + "grad_norm": 0.5083333849906921, + "learning_rate": 3.2290829518028867e-07, + "loss": 0.627, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 928, + "tokens_per_second_per_gpu": 17407.43, + "total_tokens": 23670424 + }, + { + "epoch": 0.07430216747980485, + "grad_norm": 0.544588565826416, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.6258, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 929, + "tokens_per_second_per_gpu": 17224.15, + "total_tokens": 23696336 + }, + { + "epoch": 0.07438214828441174, + "grad_norm": 0.5177492499351501, + "learning_rate": 3.0554650104861137e-07, + "loss": 0.6479, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 930, + "tokens_per_second_per_gpu": 17223.45, + "total_tokens": 23722109 + }, + { + "epoch": 0.07446212908901864, + "grad_norm": 0.5235950350761414, + "learning_rate": 2.970427372400353e-07, + "loss": 0.5707, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 931, + "tokens_per_second_per_gpu": 16511.77, + "total_tokens": 23746933 + }, + { + "epoch": 0.07454210989362553, + "grad_norm": 0.4983116090297699, + "learning_rate": 2.8865720090364037e-07, + "loss": 0.6071, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 932, + "tokens_per_second_per_gpu": 17590.81, + "total_tokens": 23774001 + }, + { + "epoch": 0.07462209069823242, + "grad_norm": 0.5138797760009766, + "learning_rate": 2.8038999421453827e-07, + "loss": 0.6404, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 933, + "tokens_per_second_per_gpu": 17531.8, + "total_tokens": 23800499 + }, + { + "epoch": 0.07470207150283932, + "grad_norm": 0.5279234051704407, + "learning_rate": 2.7224121790603517e-07, + "loss": 0.6455, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 934, + "tokens_per_second_per_gpu": 17253.47, + "total_tokens": 23826516 + }, + { + "epoch": 0.07478205230744621, + "grad_norm": 0.5243753790855408, + "learning_rate": 2.6421097126839714e-07, + "loss": 0.6143, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 935, + "tokens_per_second_per_gpu": 17104.36, + "total_tokens": 23852173 + }, + { + "epoch": 0.07486203311205311, + "grad_norm": 0.5066297054290771, + "learning_rate": 2.5629935214764866e-07, + "loss": 0.5797, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 936, + "tokens_per_second_per_gpu": 16824.53, + "total_tokens": 23877312 + }, + { + "epoch": 0.07494201391666, + "grad_norm": 0.5267778635025024, + "learning_rate": 2.4850645694436736e-07, + "loss": 0.6155, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 937, + "tokens_per_second_per_gpu": 16797.05, + "total_tokens": 23902155 + }, + { + "epoch": 0.0750219947212669, + "grad_norm": 0.5988689661026001, + "learning_rate": 2.4083238061252565e-07, + "loss": 0.6415, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 938, + "tokens_per_second_per_gpu": 17265.67, + "total_tokens": 23928370 + }, + { + "epoch": 0.07510197552587379, + "grad_norm": 0.49074527621269226, + "learning_rate": 2.332772166583208e-07, + "loss": 0.595, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 939, + "tokens_per_second_per_gpu": 17273.07, + "total_tokens": 23954278 + }, + { + "epoch": 0.07518195633048068, + "grad_norm": 0.5439552068710327, + "learning_rate": 2.2584105713904126e-07, + "loss": 0.6362, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 940, + "tokens_per_second_per_gpu": 16440.73, + "total_tokens": 23978844 + }, + { + "epoch": 0.07526193713508758, + "grad_norm": 0.5561083555221558, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.6293, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 941, + "tokens_per_second_per_gpu": 16716.95, + "total_tokens": 24003689 + }, + { + "epoch": 0.07534191793969447, + "grad_norm": 0.5233296155929565, + "learning_rate": 2.1132611238315004e-07, + "loss": 0.6433, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 942, + "tokens_per_second_per_gpu": 17137.28, + "total_tokens": 24029244 + }, + { + "epoch": 0.07542189874430137, + "grad_norm": 0.5398736596107483, + "learning_rate": 2.0424750400655947e-07, + "loss": 0.6613, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 943, + "tokens_per_second_per_gpu": 17276.11, + "total_tokens": 24055284 + }, + { + "epoch": 0.07550187954890826, + "grad_norm": 0.5166030526161194, + "learning_rate": 1.9728825378278248e-07, + "loss": 0.6078, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 944, + "tokens_per_second_per_gpu": 17373.57, + "total_tokens": 24081384 + }, + { + "epoch": 0.07558186035351516, + "grad_norm": 0.5131341218948364, + "learning_rate": 1.9044844650808468e-07, + "loss": 0.6308, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 945, + "tokens_per_second_per_gpu": 17527.39, + "total_tokens": 24107343 + }, + { + "epoch": 0.07566184115812205, + "grad_norm": 0.5063515901565552, + "learning_rate": 1.8372816552336025e-07, + "loss": 0.57, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 946, + "tokens_per_second_per_gpu": 17410.89, + "total_tokens": 24133150 + }, + { + "epoch": 0.07574182196272894, + "grad_norm": 0.5385065078735352, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.6102, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 947, + "tokens_per_second_per_gpu": 17121.83, + "total_tokens": 24158646 + }, + { + "epoch": 0.07582180276733584, + "grad_norm": 0.5242645740509033, + "learning_rate": 1.706465085044584e-07, + "loss": 0.6648, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 948, + "tokens_per_second_per_gpu": 17418.14, + "total_tokens": 24185422 + }, + { + "epoch": 0.07590178357194273, + "grad_norm": 0.5127405524253845, + "learning_rate": 1.6428529186614195e-07, + "loss": 0.5975, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 949, + "tokens_per_second_per_gpu": 16750.54, + "total_tokens": 24210681 + }, + { + "epoch": 0.07598176437654963, + "grad_norm": 0.4975850582122803, + "learning_rate": 1.580439203075812e-07, + "loss": 0.6102, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 950, + "tokens_per_second_per_gpu": 17360.01, + "total_tokens": 24237288 + }, + { + "epoch": 0.07606174518115652, + "grad_norm": 0.5240901112556458, + "learning_rate": 1.519224698779198e-07, + "loss": 0.6304, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 951, + "tokens_per_second_per_gpu": 16648.45, + "total_tokens": 24262146 + }, + { + "epoch": 0.07614172598576342, + "grad_norm": 0.5453839898109436, + "learning_rate": 1.4592101516509916e-07, + "loss": 0.6353, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 952, + "tokens_per_second_per_gpu": 17196.04, + "total_tokens": 24288304 + }, + { + "epoch": 0.07622170679037031, + "grad_norm": 0.5186492204666138, + "learning_rate": 1.400396292949513e-07, + "loss": 0.6199, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 953, + "tokens_per_second_per_gpu": 16644.24, + "total_tokens": 24313146 + }, + { + "epoch": 0.0763016875949772, + "grad_norm": 0.49842411279678345, + "learning_rate": 1.3427838393030634e-07, + "loss": 0.6134, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 954, + "tokens_per_second_per_gpu": 17154.09, + "total_tokens": 24339160 + }, + { + "epoch": 0.0763816683995841, + "grad_norm": 0.5567916035652161, + "learning_rate": 1.2863734927012094e-07, + "loss": 0.6414, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 955, + "tokens_per_second_per_gpu": 16807.04, + "total_tokens": 24364306 + }, + { + "epoch": 0.07646164920419099, + "grad_norm": 0.5145589709281921, + "learning_rate": 1.231165940486234e-07, + "loss": 0.5819, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 956, + "tokens_per_second_per_gpu": 16993.98, + "total_tokens": 24389531 + }, + { + "epoch": 0.07654163000879789, + "grad_norm": 0.543738842010498, + "learning_rate": 1.1771618553447217e-07, + "loss": 0.651, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 957, + "tokens_per_second_per_gpu": 17372.18, + "total_tokens": 24415520 + }, + { + "epoch": 0.07662161081340478, + "grad_norm": 0.49085065722465515, + "learning_rate": 1.1243618952994195e-07, + "loss": 0.6086, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 958, + "tokens_per_second_per_gpu": 17246.99, + "total_tokens": 24441786 + }, + { + "epoch": 0.07670159161801168, + "grad_norm": 0.5156171917915344, + "learning_rate": 1.0727667037011668e-07, + "loss": 0.5842, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 959, + "tokens_per_second_per_gpu": 17115.42, + "total_tokens": 24467022 + }, + { + "epoch": 0.07678157242261857, + "grad_norm": 0.5145148634910583, + "learning_rate": 1.0223769092211012e-07, + "loss": 0.5551, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 960, + "tokens_per_second_per_gpu": 16428.63, + "total_tokens": 24490929 + }, + { + "epoch": 0.07686155322722546, + "grad_norm": 0.5537049174308777, + "learning_rate": 9.731931258429638e-08, + "loss": 0.6454, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 961, + "tokens_per_second_per_gpu": 17267.19, + "total_tokens": 24516248 + }, + { + "epoch": 0.07694153403183236, + "grad_norm": 0.543889045715332, + "learning_rate": 9.252159528556404e-08, + "loss": 0.594, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 962, + "tokens_per_second_per_gpu": 16855.69, + "total_tokens": 24541430 + }, + { + "epoch": 0.07702151483643925, + "grad_norm": 0.5286483764648438, + "learning_rate": 8.784459748458318e-08, + "loss": 0.6115, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 963, + "tokens_per_second_per_gpu": 17319.91, + "total_tokens": 24567628 + }, + { + "epoch": 0.07710149564104615, + "grad_norm": 0.5454973578453064, + "learning_rate": 8.328837616909612e-08, + "loss": 0.6497, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 964, + "tokens_per_second_per_gpu": 17356.21, + "total_tokens": 24594157 + }, + { + "epoch": 0.07718147644565304, + "grad_norm": 0.5202831029891968, + "learning_rate": 7.885298685522235e-08, + "loss": 0.6047, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 965, + "tokens_per_second_per_gpu": 17130.28, + "total_tokens": 24619949 + }, + { + "epoch": 0.07726145725025994, + "grad_norm": 0.5220912098884583, + "learning_rate": 7.453848358678018e-08, + "loss": 0.6013, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 966, + "tokens_per_second_per_gpu": 17179.24, + "total_tokens": 24645528 + }, + { + "epoch": 0.07734143805486683, + "grad_norm": 0.5430648326873779, + "learning_rate": 7.034491893463059e-08, + "loss": 0.6401, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 967, + "tokens_per_second_per_gpu": 17248.77, + "total_tokens": 24671124 + }, + { + "epoch": 0.07742141885947372, + "grad_norm": 0.5340640544891357, + "learning_rate": 6.627234399603554e-08, + "loss": 0.6728, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 968, + "tokens_per_second_per_gpu": 17379.29, + "total_tokens": 24697233 + }, + { + "epoch": 0.07750139966408062, + "grad_norm": 0.541018545627594, + "learning_rate": 6.232080839403631e-08, + "loss": 0.6661, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 969, + "tokens_per_second_per_gpu": 17330.58, + "total_tokens": 24723473 + }, + { + "epoch": 0.07758138046868751, + "grad_norm": 0.5246049761772156, + "learning_rate": 5.849036027684607e-08, + "loss": 0.6333, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 970, + "tokens_per_second_per_gpu": 16577.2, + "total_tokens": 24749018 + }, + { + "epoch": 0.07766136127329441, + "grad_norm": 0.5491915941238403, + "learning_rate": 5.4781046317267103e-08, + "loss": 0.6413, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 971, + "tokens_per_second_per_gpu": 17163.06, + "total_tokens": 24774116 + }, + { + "epoch": 0.0777413420779013, + "grad_norm": 0.5680013298988342, + "learning_rate": 5.119291171211793e-08, + "loss": 0.6917, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 972, + "tokens_per_second_per_gpu": 17074.63, + "total_tokens": 24799793 + }, + { + "epoch": 0.0778213228825082, + "grad_norm": 0.5304521322250366, + "learning_rate": 4.772600018168816e-08, + "loss": 0.622, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 973, + "tokens_per_second_per_gpu": 16766.62, + "total_tokens": 24825100 + }, + { + "epoch": 0.07790130368711509, + "grad_norm": 0.5385372042655945, + "learning_rate": 4.438035396920004e-08, + "loss": 0.6456, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 974, + "tokens_per_second_per_gpu": 17139.0, + "total_tokens": 24851690 + }, + { + "epoch": 0.07798128449172198, + "grad_norm": 0.549923300743103, + "learning_rate": 4.115601384029666e-08, + "loss": 0.6265, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 975, + "tokens_per_second_per_gpu": 16505.25, + "total_tokens": 24875789 + }, + { + "epoch": 0.07806126529632888, + "grad_norm": 0.5306264758110046, + "learning_rate": 3.805301908254455e-08, + "loss": 0.5789, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 976, + "tokens_per_second_per_gpu": 17103.72, + "total_tokens": 24900703 + }, + { + "epoch": 0.07814124610093577, + "grad_norm": 0.6005721688270569, + "learning_rate": 3.50714075049563e-08, + "loss": 0.5928, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 977, + "tokens_per_second_per_gpu": 16680.56, + "total_tokens": 24926020 + }, + { + "epoch": 0.07822122690554267, + "grad_norm": 0.5427029728889465, + "learning_rate": 3.22112154375287e-08, + "loss": 0.6533, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 978, + "tokens_per_second_per_gpu": 16310.65, + "total_tokens": 24950374 + }, + { + "epoch": 0.07830120771014956, + "grad_norm": 0.5275732278823853, + "learning_rate": 2.947247773079753e-08, + "loss": 0.6329, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 979, + "tokens_per_second_per_gpu": 16975.33, + "total_tokens": 24975610 + }, + { + "epoch": 0.07838118851475646, + "grad_norm": 0.5323116183280945, + "learning_rate": 2.6855227755419046e-08, + "loss": 0.6537, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 980, + "tokens_per_second_per_gpu": 17462.53, + "total_tokens": 25001687 + }, + { + "epoch": 0.07846116931936335, + "grad_norm": 0.5180700421333313, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.6228, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 981, + "tokens_per_second_per_gpu": 17019.02, + "total_tokens": 25026807 + }, + { + "epoch": 0.07854115012397024, + "grad_norm": 0.5496319532394409, + "learning_rate": 2.1985317079500358e-08, + "loss": 0.6294, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 982, + "tokens_per_second_per_gpu": 16897.37, + "total_tokens": 25051565 + }, + { + "epoch": 0.07862113092857714, + "grad_norm": 0.5391293168067932, + "learning_rate": 1.973271571728441e-08, + "loss": 0.6272, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 983, + "tokens_per_second_per_gpu": 16772.23, + "total_tokens": 25076522 + }, + { + "epoch": 0.07870111173318403, + "grad_norm": 0.5196576714515686, + "learning_rate": 1.7601720762346895e-08, + "loss": 0.6276, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 984, + "tokens_per_second_per_gpu": 16931.06, + "total_tokens": 25102258 + }, + { + "epoch": 0.07878109253779093, + "grad_norm": 0.5071877241134644, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.6424, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 985, + "tokens_per_second_per_gpu": 17018.31, + "total_tokens": 25128366 + }, + { + "epoch": 0.07886107334239782, + "grad_norm": 0.538322925567627, + "learning_rate": 1.370465245426167e-08, + "loss": 0.6304, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 986, + "tokens_per_second_per_gpu": 16589.6, + "total_tokens": 25152870 + }, + { + "epoch": 0.07894105414700472, + "grad_norm": 0.49802151322364807, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.5973, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 987, + "tokens_per_second_per_gpu": 17044.83, + "total_tokens": 25178205 + }, + { + "epoch": 0.07902103495161161, + "grad_norm": 0.5292408466339111, + "learning_rate": 1.0294302092853647e-08, + "loss": 0.6749, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 988, + "tokens_per_second_per_gpu": 17062.32, + "total_tokens": 25203764 + }, + { + "epoch": 0.0791010157562185, + "grad_norm": 0.5256536602973938, + "learning_rate": 8.771699011416169e-09, + "loss": 0.6082, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 989, + "tokens_per_second_per_gpu": 17511.87, + "total_tokens": 25229634 + }, + { + "epoch": 0.0791809965608254, + "grad_norm": 0.5201627016067505, + "learning_rate": 7.370835893788508e-09, + "loss": 0.6419, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 990, + "tokens_per_second_per_gpu": 17090.54, + "total_tokens": 25255664 + }, + { + "epoch": 0.07926097736543229, + "grad_norm": 0.56233149766922, + "learning_rate": 6.091729809042379e-09, + "loss": 0.6216, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 991, + "tokens_per_second_per_gpu": 17406.21, + "total_tokens": 25281827 + }, + { + "epoch": 0.0793409581700392, + "grad_norm": 0.5474256277084351, + "learning_rate": 4.9343963426840006e-09, + "loss": 0.6036, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 992, + "tokens_per_second_per_gpu": 16687.8, + "total_tokens": 25306499 + }, + { + "epoch": 0.07942093897464608, + "grad_norm": 0.4903515577316284, + "learning_rate": 3.898849596456477e-09, + "loss": 0.5721, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 993, + "tokens_per_second_per_gpu": 17239.93, + "total_tokens": 25332595 + }, + { + "epoch": 0.07950091977925298, + "grad_norm": 0.5319344997406006, + "learning_rate": 2.9851021881688314e-09, + "loss": 0.6523, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 994, + "tokens_per_second_per_gpu": 17089.9, + "total_tokens": 25358509 + }, + { + "epoch": 0.07958090058385987, + "grad_norm": 0.5517778396606445, + "learning_rate": 2.193165251545004e-09, + "loss": 0.6289, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 995, + "tokens_per_second_per_gpu": 17147.96, + "total_tokens": 25384428 + }, + { + "epoch": 0.07966088138846676, + "grad_norm": 0.5497578978538513, + "learning_rate": 1.5230484360873043e-09, + "loss": 0.6752, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 996, + "tokens_per_second_per_gpu": 17469.68, + "total_tokens": 25411007 + }, + { + "epoch": 0.07974086219307366, + "grad_norm": 0.5634413361549377, + "learning_rate": 9.74759906957612e-10, + "loss": 0.6161, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 997, + "tokens_per_second_per_gpu": 16112.77, + "total_tokens": 25435004 + }, + { + "epoch": 0.07982084299768055, + "grad_norm": 0.5436570048332214, + "learning_rate": 5.483063448785686e-10, + "loss": 0.636, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 998, + "tokens_per_second_per_gpu": 16917.97, + "total_tokens": 25460910 + }, + { + "epoch": 0.07990082380228745, + "grad_norm": 0.5557326078414917, + "learning_rate": 2.436929460525317e-10, + "loss": 0.6001, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 999, + "tokens_per_second_per_gpu": 17178.6, + "total_tokens": 25486316 + }, + { + "epoch": 0.07998080460689434, + "grad_norm": 0.5365746021270752, + "learning_rate": 6.092342209607083e-11, + "loss": 0.616, + "memory/device_reserved (GiB)": 69.96, + "memory/max_active (GiB)": 66.03, + "memory/max_allocated (GiB)": 66.03, + "step": 1000, + "tokens_per_second_per_gpu": 16970.21, + "total_tokens": 25511597 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.2087150526464e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}