{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07998080460689434, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.998080460689434e-05, "grad_norm": 1.038775086402893, "learning_rate": 0.0, "loss": 1.6057, "memory/device_reserved (GiB)": 69.76, "memory/max_active (GiB)": 65.79, "memory/max_allocated (GiB)": 65.79, "step": 1, "tokens_per_second_per_gpu": 1277.91, "total_tokens": 25611 }, { "epoch": 0.00015996160921378868, "grad_norm": 1.098587155342102, "learning_rate": 2.0000000000000002e-07, "loss": 1.6297, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 2, "tokens_per_second_per_gpu": 16977.66, "total_tokens": 50942 }, { "epoch": 0.00023994241382068303, "grad_norm": 0.975591242313385, "learning_rate": 4.0000000000000003e-07, "loss": 1.5663, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 3, "tokens_per_second_per_gpu": 17162.2, "total_tokens": 76956 }, { "epoch": 0.00031992321842757736, "grad_norm": 1.0335264205932617, "learning_rate": 6.000000000000001e-07, "loss": 1.5648, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 4, "tokens_per_second_per_gpu": 17404.84, "total_tokens": 102664 }, { "epoch": 0.0003999040230344717, "grad_norm": 1.0145632028579712, "learning_rate": 8.000000000000001e-07, "loss": 1.5456, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 5, "tokens_per_second_per_gpu": 16790.34, "total_tokens": 128081 }, { "epoch": 0.00047988482764136606, "grad_norm": 1.065081000328064, "learning_rate": 1.0000000000000002e-06, "loss": 1.6122, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 6, "tokens_per_second_per_gpu": 16943.09, "total_tokens": 153326 }, { "epoch": 0.0005598656322482605, "grad_norm": 1.0195869207382202, "learning_rate": 1.2000000000000002e-06, "loss": 1.5448, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 7, "tokens_per_second_per_gpu": 16990.25, "total_tokens": 179126 }, { "epoch": 0.0006398464368551547, "grad_norm": 1.109100580215454, "learning_rate": 1.4000000000000001e-06, "loss": 1.5649, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 8, "tokens_per_second_per_gpu": 16858.03, "total_tokens": 204206 }, { "epoch": 0.0007198272414620491, "grad_norm": 1.0833709239959717, "learning_rate": 1.6000000000000001e-06, "loss": 1.6006, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 9, "tokens_per_second_per_gpu": 16988.61, "total_tokens": 229672 }, { "epoch": 0.0007998080460689434, "grad_norm": 1.0193920135498047, "learning_rate": 1.8000000000000001e-06, "loss": 1.6112, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 10, "tokens_per_second_per_gpu": 16988.44, "total_tokens": 255388 }, { "epoch": 0.0008797888506758378, "grad_norm": 0.9576646685600281, "learning_rate": 2.0000000000000003e-06, "loss": 1.5652, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 11, "tokens_per_second_per_gpu": 17429.3, "total_tokens": 281853 }, { "epoch": 0.0009597696552827321, "grad_norm": 1.0531549453735352, "learning_rate": 2.2e-06, "loss": 1.561, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 12, "tokens_per_second_per_gpu": 17523.77, "total_tokens": 307711 }, { "epoch": 0.0010397504598896265, "grad_norm": 0.9403714537620544, "learning_rate": 2.4000000000000003e-06, "loss": 1.4986, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 13, "tokens_per_second_per_gpu": 17242.09, "total_tokens": 333825 }, { "epoch": 0.001119731264496521, "grad_norm": 1.0354647636413574, "learning_rate": 2.6e-06, "loss": 1.503, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 14, "tokens_per_second_per_gpu": 17361.34, "total_tokens": 359895 }, { "epoch": 0.0011997120691034152, "grad_norm": 1.0876542329788208, "learning_rate": 2.8000000000000003e-06, "loss": 1.5442, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 15, "tokens_per_second_per_gpu": 16605.41, "total_tokens": 384593 }, { "epoch": 0.0012796928737103094, "grad_norm": 1.1391915082931519, "learning_rate": 3e-06, "loss": 1.5995, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 16, "tokens_per_second_per_gpu": 16636.62, "total_tokens": 409373 }, { "epoch": 0.0013596736783172039, "grad_norm": 0.9413732290267944, "learning_rate": 3.2000000000000003e-06, "loss": 1.5109, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 17, "tokens_per_second_per_gpu": 17460.95, "total_tokens": 435918 }, { "epoch": 0.0014396544829240981, "grad_norm": 1.0454152822494507, "learning_rate": 3.4000000000000005e-06, "loss": 1.5649, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 18, "tokens_per_second_per_gpu": 16992.1, "total_tokens": 461473 }, { "epoch": 0.0015196352875309926, "grad_norm": 1.039425253868103, "learning_rate": 3.6000000000000003e-06, "loss": 1.5598, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 19, "tokens_per_second_per_gpu": 17610.58, "total_tokens": 487930 }, { "epoch": 0.0015996160921378868, "grad_norm": 1.0049670934677124, "learning_rate": 3.8000000000000005e-06, "loss": 1.5424, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 20, "tokens_per_second_per_gpu": 17135.21, "total_tokens": 513747 }, { "epoch": 0.0016795968967447813, "grad_norm": 1.077114462852478, "learning_rate": 4.000000000000001e-06, "loss": 1.5031, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 21, "tokens_per_second_per_gpu": 16893.74, "total_tokens": 538891 }, { "epoch": 0.0017595777013516755, "grad_norm": 1.0136423110961914, "learning_rate": 4.2000000000000004e-06, "loss": 1.5642, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 22, "tokens_per_second_per_gpu": 17259.47, "total_tokens": 565110 }, { "epoch": 0.00183955850595857, "grad_norm": 1.0550577640533447, "learning_rate": 4.4e-06, "loss": 1.5313, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 23, "tokens_per_second_per_gpu": 16961.68, "total_tokens": 590711 }, { "epoch": 0.0019195393105654642, "grad_norm": 1.0451573133468628, "learning_rate": 4.600000000000001e-06, "loss": 1.543, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 24, "tokens_per_second_per_gpu": 16893.58, "total_tokens": 616331 }, { "epoch": 0.0019995201151723585, "grad_norm": 1.1151784658432007, "learning_rate": 4.800000000000001e-06, "loss": 1.5585, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 25, "tokens_per_second_per_gpu": 16911.88, "total_tokens": 641401 }, { "epoch": 0.002079500919779253, "grad_norm": 1.0715839862823486, "learning_rate": 5e-06, "loss": 1.5898, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 26, "tokens_per_second_per_gpu": 16957.69, "total_tokens": 667117 }, { "epoch": 0.0021594817243861474, "grad_norm": 1.049048900604248, "learning_rate": 5.2e-06, "loss": 1.5229, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 27, "tokens_per_second_per_gpu": 17196.54, "total_tokens": 693040 }, { "epoch": 0.002239462528993042, "grad_norm": 1.128364086151123, "learning_rate": 5.400000000000001e-06, "loss": 1.5577, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 28, "tokens_per_second_per_gpu": 16501.39, "total_tokens": 717589 }, { "epoch": 0.002319443333599936, "grad_norm": 1.0650986433029175, "learning_rate": 5.600000000000001e-06, "loss": 1.5394, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 29, "tokens_per_second_per_gpu": 16990.67, "total_tokens": 743423 }, { "epoch": 0.0023994241382068304, "grad_norm": 1.0103224515914917, "learning_rate": 5.8e-06, "loss": 1.4581, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 30, "tokens_per_second_per_gpu": 17264.37, "total_tokens": 769708 }, { "epoch": 0.002479404942813725, "grad_norm": 1.074812650680542, "learning_rate": 6e-06, "loss": 1.5418, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 31, "tokens_per_second_per_gpu": 16900.72, "total_tokens": 794946 }, { "epoch": 0.002559385747420619, "grad_norm": 1.1037012338638306, "learning_rate": 6.200000000000001e-06, "loss": 1.554, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 32, "tokens_per_second_per_gpu": 17437.47, "total_tokens": 821006 }, { "epoch": 0.0026393665520275133, "grad_norm": 1.056754469871521, "learning_rate": 6.4000000000000006e-06, "loss": 1.5, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 33, "tokens_per_second_per_gpu": 16929.32, "total_tokens": 846649 }, { "epoch": 0.0027193473566344078, "grad_norm": 0.9937567710876465, "learning_rate": 6.600000000000001e-06, "loss": 1.5208, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 34, "tokens_per_second_per_gpu": 17016.45, "total_tokens": 872946 }, { "epoch": 0.0027993281612413022, "grad_norm": 0.997081458568573, "learning_rate": 6.800000000000001e-06, "loss": 1.4191, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 35, "tokens_per_second_per_gpu": 17202.72, "total_tokens": 899114 }, { "epoch": 0.0028793089658481963, "grad_norm": 1.0784165859222412, "learning_rate": 7e-06, "loss": 1.4787, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 36, "tokens_per_second_per_gpu": 17030.42, "total_tokens": 924446 }, { "epoch": 0.0029592897704550907, "grad_norm": 1.1074408292770386, "learning_rate": 7.2000000000000005e-06, "loss": 1.5245, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 37, "tokens_per_second_per_gpu": 17583.74, "total_tokens": 950398 }, { "epoch": 0.003039270575061985, "grad_norm": 1.0375193357467651, "learning_rate": 7.4e-06, "loss": 1.4235, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 38, "tokens_per_second_per_gpu": 16790.68, "total_tokens": 975601 }, { "epoch": 0.0031192513796688796, "grad_norm": 1.0000883340835571, "learning_rate": 7.600000000000001e-06, "loss": 1.4206, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 39, "tokens_per_second_per_gpu": 16934.89, "total_tokens": 1000972 }, { "epoch": 0.0031992321842757737, "grad_norm": 1.0457230806350708, "learning_rate": 7.800000000000002e-06, "loss": 1.4411, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 40, "tokens_per_second_per_gpu": 14105.13, "total_tokens": 1026771 }, { "epoch": 0.003279212988882668, "grad_norm": 0.9556184411048889, "learning_rate": 8.000000000000001e-06, "loss": 1.4376, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 41, "tokens_per_second_per_gpu": 17975.2, "total_tokens": 1053835 }, { "epoch": 0.0033591937934895626, "grad_norm": 1.1289631128311157, "learning_rate": 8.2e-06, "loss": 1.4106, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 42, "tokens_per_second_per_gpu": 16485.8, "total_tokens": 1077561 }, { "epoch": 0.003439174598096457, "grad_norm": 0.9723970293998718, "learning_rate": 8.400000000000001e-06, "loss": 1.4078, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 43, "tokens_per_second_per_gpu": 16537.68, "total_tokens": 1102268 }, { "epoch": 0.003519155402703351, "grad_norm": 1.06087327003479, "learning_rate": 8.6e-06, "loss": 1.4605, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 44, "tokens_per_second_per_gpu": 16487.1, "total_tokens": 1126547 }, { "epoch": 0.0035991362073102455, "grad_norm": 0.8268716931343079, "learning_rate": 8.8e-06, "loss": 1.3048, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 45, "tokens_per_second_per_gpu": 17013.09, "total_tokens": 1152320 }, { "epoch": 0.00367911701191714, "grad_norm": 0.923682451248169, "learning_rate": 9e-06, "loss": 1.3843, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 46, "tokens_per_second_per_gpu": 16652.7, "total_tokens": 1177368 }, { "epoch": 0.003759097816524034, "grad_norm": 0.8878368139266968, "learning_rate": 9.200000000000002e-06, "loss": 1.3663, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 47, "tokens_per_second_per_gpu": 16533.02, "total_tokens": 1202165 }, { "epoch": 0.0038390786211309285, "grad_norm": 0.9188768267631531, "learning_rate": 9.4e-06, "loss": 1.4352, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 48, "tokens_per_second_per_gpu": 17016.3, "total_tokens": 1226827 }, { "epoch": 0.0039190594257378225, "grad_norm": 0.8016843795776367, "learning_rate": 9.600000000000001e-06, "loss": 1.3324, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 49, "tokens_per_second_per_gpu": 17175.21, "total_tokens": 1252015 }, { "epoch": 0.003999040230344717, "grad_norm": 0.8222874999046326, "learning_rate": 9.800000000000001e-06, "loss": 1.3224, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 50, "tokens_per_second_per_gpu": 16548.61, "total_tokens": 1276515 }, { "epoch": 0.004079021034951611, "grad_norm": 0.6688214540481567, "learning_rate": 1e-05, "loss": 1.2844, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 51, "tokens_per_second_per_gpu": 17318.33, "total_tokens": 1302842 }, { "epoch": 0.004159001839558506, "grad_norm": 0.6350716352462769, "learning_rate": 1.02e-05, "loss": 1.2481, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 52, "tokens_per_second_per_gpu": 17270.15, "total_tokens": 1329409 }, { "epoch": 0.0042389826441654, "grad_norm": 0.7399108409881592, "learning_rate": 1.04e-05, "loss": 1.3018, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 53, "tokens_per_second_per_gpu": 16473.95, "total_tokens": 1353912 }, { "epoch": 0.004318963448772295, "grad_norm": 0.6630793809890747, "learning_rate": 1.0600000000000002e-05, "loss": 1.1977, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 54, "tokens_per_second_per_gpu": 16858.54, "total_tokens": 1379028 }, { "epoch": 0.004398944253379189, "grad_norm": 0.58243727684021, "learning_rate": 1.0800000000000002e-05, "loss": 1.3015, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 55, "tokens_per_second_per_gpu": 17335.57, "total_tokens": 1405227 }, { "epoch": 0.004478925057986084, "grad_norm": 0.5659134984016418, "learning_rate": 1.1000000000000001e-05, "loss": 1.2674, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 56, "tokens_per_second_per_gpu": 17241.25, "total_tokens": 1430933 }, { "epoch": 0.004558905862592977, "grad_norm": 0.5408620238304138, "learning_rate": 1.1200000000000001e-05, "loss": 1.2374, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 57, "tokens_per_second_per_gpu": 17259.68, "total_tokens": 1456689 }, { "epoch": 0.004638886667199872, "grad_norm": 0.49004459381103516, "learning_rate": 1.14e-05, "loss": 1.25, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 58, "tokens_per_second_per_gpu": 16924.62, "total_tokens": 1482624 }, { "epoch": 0.004718867471806766, "grad_norm": 0.5233814716339111, "learning_rate": 1.16e-05, "loss": 1.1618, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 59, "tokens_per_second_per_gpu": 16820.35, "total_tokens": 1507637 }, { "epoch": 0.004798848276413661, "grad_norm": 0.4358421266078949, "learning_rate": 1.18e-05, "loss": 1.2003, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 60, "tokens_per_second_per_gpu": 17670.45, "total_tokens": 1534597 }, { "epoch": 0.004878829081020555, "grad_norm": 0.44443076848983765, "learning_rate": 1.2e-05, "loss": 1.2287, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 61, "tokens_per_second_per_gpu": 16846.22, "total_tokens": 1559749 }, { "epoch": 0.00495880988562745, "grad_norm": 0.39861562848091125, "learning_rate": 1.22e-05, "loss": 1.1572, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 62, "tokens_per_second_per_gpu": 17121.54, "total_tokens": 1585882 }, { "epoch": 0.005038790690234344, "grad_norm": 0.4339846968650818, "learning_rate": 1.2400000000000002e-05, "loss": 1.2128, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 63, "tokens_per_second_per_gpu": 16583.88, "total_tokens": 1610180 }, { "epoch": 0.005118771494841238, "grad_norm": 0.35104724764823914, "learning_rate": 1.2600000000000001e-05, "loss": 1.0974, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 64, "tokens_per_second_per_gpu": 17028.21, "total_tokens": 1636105 }, { "epoch": 0.005198752299448132, "grad_norm": 0.3317544162273407, "learning_rate": 1.2800000000000001e-05, "loss": 1.1286, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 65, "tokens_per_second_per_gpu": 17131.02, "total_tokens": 1661802 }, { "epoch": 0.005278733104055027, "grad_norm": 0.32120752334594727, "learning_rate": 1.3000000000000001e-05, "loss": 1.1658, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 66, "tokens_per_second_per_gpu": 17810.85, "total_tokens": 1688553 }, { "epoch": 0.005358713908661921, "grad_norm": 0.30699560046195984, "learning_rate": 1.3200000000000002e-05, "loss": 1.1215, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 67, "tokens_per_second_per_gpu": 17086.6, "total_tokens": 1714293 }, { "epoch": 0.0054386947132688155, "grad_norm": 0.31427061557769775, "learning_rate": 1.3400000000000002e-05, "loss": 1.2197, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 68, "tokens_per_second_per_gpu": 17286.05, "total_tokens": 1740245 }, { "epoch": 0.00551867551787571, "grad_norm": 0.3120593726634979, "learning_rate": 1.3600000000000002e-05, "loss": 1.1437, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 69, "tokens_per_second_per_gpu": 16364.4, "total_tokens": 1764643 }, { "epoch": 0.0055986563224826045, "grad_norm": 0.27037009596824646, "learning_rate": 1.38e-05, "loss": 1.1268, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 70, "tokens_per_second_per_gpu": 16857.39, "total_tokens": 1790305 }, { "epoch": 0.005678637127089499, "grad_norm": 0.30426427721977234, "learning_rate": 1.4e-05, "loss": 1.1735, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 71, "tokens_per_second_per_gpu": 16563.03, "total_tokens": 1814880 }, { "epoch": 0.0057586179316963925, "grad_norm": 0.2649443745613098, "learning_rate": 1.4200000000000001e-05, "loss": 1.1177, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 72, "tokens_per_second_per_gpu": 17367.91, "total_tokens": 1840810 }, { "epoch": 0.005838598736303287, "grad_norm": 0.253825843334198, "learning_rate": 1.4400000000000001e-05, "loss": 1.144, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 73, "tokens_per_second_per_gpu": 16918.97, "total_tokens": 1866252 }, { "epoch": 0.005918579540910181, "grad_norm": 0.2598889172077179, "learning_rate": 1.46e-05, "loss": 1.1142, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 74, "tokens_per_second_per_gpu": 17104.53, "total_tokens": 1892027 }, { "epoch": 0.005998560345517076, "grad_norm": 0.2475835680961609, "learning_rate": 1.48e-05, "loss": 1.0584, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 75, "tokens_per_second_per_gpu": 16580.5, "total_tokens": 1916988 }, { "epoch": 0.00607854115012397, "grad_norm": 0.25065016746520996, "learning_rate": 1.5000000000000002e-05, "loss": 1.1627, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 76, "tokens_per_second_per_gpu": 17326.24, "total_tokens": 1943188 }, { "epoch": 0.006158521954730865, "grad_norm": 0.23243308067321777, "learning_rate": 1.5200000000000002e-05, "loss": 1.1042, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 77, "tokens_per_second_per_gpu": 17705.83, "total_tokens": 1970033 }, { "epoch": 0.006238502759337759, "grad_norm": 0.22412195801734924, "learning_rate": 1.54e-05, "loss": 1.1211, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 78, "tokens_per_second_per_gpu": 17159.33, "total_tokens": 1996140 }, { "epoch": 0.006318483563944653, "grad_norm": 0.23629942536354065, "learning_rate": 1.5600000000000003e-05, "loss": 1.1115, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 79, "tokens_per_second_per_gpu": 17352.1, "total_tokens": 2022475 }, { "epoch": 0.006398464368551547, "grad_norm": 0.24803169071674347, "learning_rate": 1.58e-05, "loss": 1.0912, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 80, "tokens_per_second_per_gpu": 17005.04, "total_tokens": 2048082 }, { "epoch": 0.006478445173158442, "grad_norm": 0.24923603236675262, "learning_rate": 1.6000000000000003e-05, "loss": 1.1203, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 81, "tokens_per_second_per_gpu": 17025.63, "total_tokens": 2073687 }, { "epoch": 0.006558425977765336, "grad_norm": 0.22304023802280426, "learning_rate": 1.62e-05, "loss": 1.083, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 82, "tokens_per_second_per_gpu": 17159.24, "total_tokens": 2098948 }, { "epoch": 0.006638406782372231, "grad_norm": 0.21665704250335693, "learning_rate": 1.64e-05, "loss": 1.091, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 83, "tokens_per_second_per_gpu": 16997.08, "total_tokens": 2125027 }, { "epoch": 0.006718387586979125, "grad_norm": 0.24451886117458344, "learning_rate": 1.66e-05, "loss": 1.1218, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 84, "tokens_per_second_per_gpu": 17150.21, "total_tokens": 2150829 }, { "epoch": 0.00679836839158602, "grad_norm": 0.23331356048583984, "learning_rate": 1.6800000000000002e-05, "loss": 1.1073, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 85, "tokens_per_second_per_gpu": 16603.59, "total_tokens": 2175685 }, { "epoch": 0.006878349196192914, "grad_norm": 0.21449171006679535, "learning_rate": 1.7e-05, "loss": 1.026, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 86, "tokens_per_second_per_gpu": 16826.79, "total_tokens": 2200963 }, { "epoch": 0.006958330000799808, "grad_norm": 0.22198700904846191, "learning_rate": 1.72e-05, "loss": 1.0516, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 87, "tokens_per_second_per_gpu": 16907.11, "total_tokens": 2226251 }, { "epoch": 0.007038310805406702, "grad_norm": 0.21258434653282166, "learning_rate": 1.7400000000000003e-05, "loss": 1.0373, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 88, "tokens_per_second_per_gpu": 16928.44, "total_tokens": 2251330 }, { "epoch": 0.007118291610013597, "grad_norm": 0.21518750488758087, "learning_rate": 1.76e-05, "loss": 1.0988, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 89, "tokens_per_second_per_gpu": 16589.13, "total_tokens": 2276301 }, { "epoch": 0.007198272414620491, "grad_norm": 0.20537728071212769, "learning_rate": 1.7800000000000002e-05, "loss": 1.0329, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 90, "tokens_per_second_per_gpu": 17318.88, "total_tokens": 2303225 }, { "epoch": 0.0072782532192273855, "grad_norm": 0.20714648067951202, "learning_rate": 1.8e-05, "loss": 1.0602, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 91, "tokens_per_second_per_gpu": 16719.92, "total_tokens": 2328769 }, { "epoch": 0.00735823402383428, "grad_norm": 0.1941813975572586, "learning_rate": 1.8200000000000002e-05, "loss": 0.9607, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 92, "tokens_per_second_per_gpu": 17066.42, "total_tokens": 2354588 }, { "epoch": 0.0074382148284411745, "grad_norm": 0.21113121509552002, "learning_rate": 1.8400000000000003e-05, "loss": 1.0515, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 93, "tokens_per_second_per_gpu": 16788.91, "total_tokens": 2379900 }, { "epoch": 0.007518195633048068, "grad_norm": 0.21530379354953766, "learning_rate": 1.86e-05, "loss": 1.0066, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 94, "tokens_per_second_per_gpu": 17013.31, "total_tokens": 2405023 }, { "epoch": 0.0075981764376549625, "grad_norm": 0.19766011834144592, "learning_rate": 1.88e-05, "loss": 1.059, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 95, "tokens_per_second_per_gpu": 17033.56, "total_tokens": 2430914 }, { "epoch": 0.007678157242261857, "grad_norm": 0.21232014894485474, "learning_rate": 1.9e-05, "loss": 1.0619, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 96, "tokens_per_second_per_gpu": 17229.84, "total_tokens": 2456462 }, { "epoch": 0.007758138046868751, "grad_norm": 0.20775918662548065, "learning_rate": 1.9200000000000003e-05, "loss": 1.0754, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 97, "tokens_per_second_per_gpu": 17206.74, "total_tokens": 2482593 }, { "epoch": 0.007838118851475645, "grad_norm": 0.19987605512142181, "learning_rate": 1.94e-05, "loss": 0.9953, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 98, "tokens_per_second_per_gpu": 16420.21, "total_tokens": 2507351 }, { "epoch": 0.00791809965608254, "grad_norm": 0.2022673338651657, "learning_rate": 1.9600000000000002e-05, "loss": 1.0519, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 99, "tokens_per_second_per_gpu": 17355.34, "total_tokens": 2533599 }, { "epoch": 0.007998080460689434, "grad_norm": 0.20371320843696594, "learning_rate": 1.98e-05, "loss": 1.0564, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 100, "tokens_per_second_per_gpu": 16927.47, "total_tokens": 2558539 }, { "epoch": 0.00807806126529633, "grad_norm": 0.200734481215477, "learning_rate": 2e-05, "loss": 0.9858, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 101, "tokens_per_second_per_gpu": 16636.02, "total_tokens": 2583353 }, { "epoch": 0.008158042069903223, "grad_norm": 0.20651081204414368, "learning_rate": 1.9999939076577906e-05, "loss": 1.0509, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 102, "tokens_per_second_per_gpu": 16987.8, "total_tokens": 2608864 }, { "epoch": 0.008238022874510118, "grad_norm": 0.1920926421880722, "learning_rate": 1.9999756307053947e-05, "loss": 0.9654, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 103, "tokens_per_second_per_gpu": 16783.27, "total_tokens": 2634375 }, { "epoch": 0.008318003679117012, "grad_norm": 0.19755157828330994, "learning_rate": 1.9999451693655125e-05, "loss": 0.9866, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 104, "tokens_per_second_per_gpu": 17391.35, "total_tokens": 2659835 }, { "epoch": 0.008397984483723907, "grad_norm": 0.19291236996650696, "learning_rate": 1.9999025240093045e-05, "loss": 1.0565, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 105, "tokens_per_second_per_gpu": 17339.57, "total_tokens": 2686150 }, { "epoch": 0.0084779652883308, "grad_norm": 0.18459810316562653, "learning_rate": 1.9998476951563914e-05, "loss": 0.993, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 106, "tokens_per_second_per_gpu": 17021.35, "total_tokens": 2711966 }, { "epoch": 0.008557946092937694, "grad_norm": 0.2000616043806076, "learning_rate": 1.9997806834748455e-05, "loss": 1.0645, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 107, "tokens_per_second_per_gpu": 16954.79, "total_tokens": 2738061 }, { "epoch": 0.00863792689754459, "grad_norm": 0.19253303110599518, "learning_rate": 1.9997014897811834e-05, "loss": 0.9867, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 108, "tokens_per_second_per_gpu": 16678.06, "total_tokens": 2763035 }, { "epoch": 0.008717907702151483, "grad_norm": 0.22710327804088593, "learning_rate": 1.9996101150403543e-05, "loss": 1.0623, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 109, "tokens_per_second_per_gpu": 16594.39, "total_tokens": 2788014 }, { "epoch": 0.008797888506758379, "grad_norm": 0.18572771549224854, "learning_rate": 1.9995065603657317e-05, "loss": 0.9652, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 110, "tokens_per_second_per_gpu": 16515.36, "total_tokens": 2812864 }, { "epoch": 0.008877869311365272, "grad_norm": 0.20359967648983002, "learning_rate": 1.999390827019096e-05, "loss": 1.0123, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 111, "tokens_per_second_per_gpu": 17242.77, "total_tokens": 2838892 }, { "epoch": 0.008957850115972167, "grad_norm": 0.19035907089710236, "learning_rate": 1.999262916410621e-05, "loss": 0.9459, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 112, "tokens_per_second_per_gpu": 17095.24, "total_tokens": 2864893 }, { "epoch": 0.009037830920579061, "grad_norm": 0.19774137437343597, "learning_rate": 1.9991228300988586e-05, "loss": 1.0056, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 113, "tokens_per_second_per_gpu": 17034.09, "total_tokens": 2890624 }, { "epoch": 0.009117811725185955, "grad_norm": 0.19346508383750916, "learning_rate": 1.998970569790715e-05, "loss": 0.9834, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 114, "tokens_per_second_per_gpu": 17264.27, "total_tokens": 2916880 }, { "epoch": 0.00919779252979285, "grad_norm": 0.19959688186645508, "learning_rate": 1.9988061373414342e-05, "loss": 1.0041, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 115, "tokens_per_second_per_gpu": 16800.76, "total_tokens": 2941856 }, { "epoch": 0.009277773334399744, "grad_norm": 0.19120177626609802, "learning_rate": 1.9986295347545738e-05, "loss": 0.9453, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 116, "tokens_per_second_per_gpu": 17213.91, "total_tokens": 2967939 }, { "epoch": 0.009357754139006639, "grad_norm": 0.19319495558738708, "learning_rate": 1.9984407641819812e-05, "loss": 1.0185, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 117, "tokens_per_second_per_gpu": 17607.03, "total_tokens": 2995350 }, { "epoch": 0.009437734943613533, "grad_norm": 0.19155430793762207, "learning_rate": 1.9982398279237657e-05, "loss": 1.0413, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 118, "tokens_per_second_per_gpu": 17920.03, "total_tokens": 3022424 }, { "epoch": 0.009517715748220428, "grad_norm": 0.19833408296108246, "learning_rate": 1.9980267284282718e-05, "loss": 1.006, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 119, "tokens_per_second_per_gpu": 17428.58, "total_tokens": 3048402 }, { "epoch": 0.009597696552827321, "grad_norm": 0.19430740177631378, "learning_rate": 1.9978014682920503e-05, "loss": 0.9806, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 120, "tokens_per_second_per_gpu": 16750.28, "total_tokens": 3073297 }, { "epoch": 0.009677677357434215, "grad_norm": 0.19468539953231812, "learning_rate": 1.9975640502598243e-05, "loss": 0.9751, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 121, "tokens_per_second_per_gpu": 16610.64, "total_tokens": 3098335 }, { "epoch": 0.00975765816204111, "grad_norm": 0.19551995396614075, "learning_rate": 1.997314477224458e-05, "loss": 0.9821, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 122, "tokens_per_second_per_gpu": 17140.77, "total_tokens": 3123841 }, { "epoch": 0.009837638966648004, "grad_norm": 0.19409964978694916, "learning_rate": 1.9970527522269204e-05, "loss": 0.9219, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 123, "tokens_per_second_per_gpu": 16856.43, "total_tokens": 3149208 }, { "epoch": 0.0099176197712549, "grad_norm": 0.19458907842636108, "learning_rate": 1.9967788784562474e-05, "loss": 1.0324, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 124, "tokens_per_second_per_gpu": 16793.17, "total_tokens": 3173736 }, { "epoch": 0.009997600575861793, "grad_norm": 0.19394950568675995, "learning_rate": 1.9964928592495046e-05, "loss": 0.9587, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 125, "tokens_per_second_per_gpu": 16811.72, "total_tokens": 3198890 }, { "epoch": 0.010077581380468688, "grad_norm": 0.1940041482448578, "learning_rate": 1.9961946980917457e-05, "loss": 1.0112, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 126, "tokens_per_second_per_gpu": 16986.16, "total_tokens": 3224595 }, { "epoch": 0.010157562185075582, "grad_norm": 0.1928212195634842, "learning_rate": 1.9958843986159705e-05, "loss": 0.9699, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 127, "tokens_per_second_per_gpu": 17096.16, "total_tokens": 3250659 }, { "epoch": 0.010237542989682475, "grad_norm": 0.19149477779865265, "learning_rate": 1.99556196460308e-05, "loss": 0.9941, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 128, "tokens_per_second_per_gpu": 16975.27, "total_tokens": 3275998 }, { "epoch": 0.01031752379428937, "grad_norm": 0.19466781616210938, "learning_rate": 1.9952273999818312e-05, "loss": 1.0126, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 129, "tokens_per_second_per_gpu": 17100.87, "total_tokens": 3301550 }, { "epoch": 0.010397504598896264, "grad_norm": 0.19384890794754028, "learning_rate": 1.9948807088287884e-05, "loss": 1.0062, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 130, "tokens_per_second_per_gpu": 17397.69, "total_tokens": 3327697 }, { "epoch": 0.01047748540350316, "grad_norm": 0.19235117733478546, "learning_rate": 1.9945218953682736e-05, "loss": 0.9573, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 131, "tokens_per_second_per_gpu": 17408.0, "total_tokens": 3354295 }, { "epoch": 0.010557466208110053, "grad_norm": 0.19668954610824585, "learning_rate": 1.9941509639723155e-05, "loss": 0.9378, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 132, "tokens_per_second_per_gpu": 17367.53, "total_tokens": 3380587 }, { "epoch": 0.010637447012716949, "grad_norm": 0.19954292476177216, "learning_rate": 1.9937679191605964e-05, "loss": 0.9432, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 133, "tokens_per_second_per_gpu": 16674.36, "total_tokens": 3405485 }, { "epoch": 0.010717427817323842, "grad_norm": 0.2069808393716812, "learning_rate": 1.9933727656003964e-05, "loss": 0.9526, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 134, "tokens_per_second_per_gpu": 17416.45, "total_tokens": 3431579 }, { "epoch": 0.010797408621930737, "grad_norm": 0.20890875160694122, "learning_rate": 1.992965508106537e-05, "loss": 0.9696, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 135, "tokens_per_second_per_gpu": 17080.6, "total_tokens": 3457239 }, { "epoch": 0.010877389426537631, "grad_norm": 0.20000465214252472, "learning_rate": 1.9925461516413224e-05, "loss": 0.9747, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 136, "tokens_per_second_per_gpu": 16945.48, "total_tokens": 3483044 }, { "epoch": 0.010957370231144525, "grad_norm": 0.19973015785217285, "learning_rate": 1.9921147013144782e-05, "loss": 0.9557, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 137, "tokens_per_second_per_gpu": 17186.34, "total_tokens": 3509145 }, { "epoch": 0.01103735103575142, "grad_norm": 0.206997811794281, "learning_rate": 1.9916711623830904e-05, "loss": 0.9155, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 138, "tokens_per_second_per_gpu": 17008.31, "total_tokens": 3534434 }, { "epoch": 0.011117331840358314, "grad_norm": 0.2097865790128708, "learning_rate": 1.991215540251542e-05, "loss": 0.9312, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 139, "tokens_per_second_per_gpu": 16713.43, "total_tokens": 3558430 }, { "epoch": 0.011197312644965209, "grad_norm": 0.19931592047214508, "learning_rate": 1.9907478404714438e-05, "loss": 0.9547, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 140, "tokens_per_second_per_gpu": 16953.09, "total_tokens": 3584023 }, { "epoch": 0.011277293449572103, "grad_norm": 0.2059127390384674, "learning_rate": 1.9902680687415704e-05, "loss": 0.9094, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 141, "tokens_per_second_per_gpu": 16721.38, "total_tokens": 3609030 }, { "epoch": 0.011357274254178998, "grad_norm": 0.20056259632110596, "learning_rate": 1.989776230907789e-05, "loss": 0.906, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 142, "tokens_per_second_per_gpu": 16294.34, "total_tokens": 3633273 }, { "epoch": 0.011437255058785891, "grad_norm": 0.2079566866159439, "learning_rate": 1.9892723329629885e-05, "loss": 0.9815, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 143, "tokens_per_second_per_gpu": 17511.65, "total_tokens": 3659737 }, { "epoch": 0.011517235863392785, "grad_norm": 0.20305806398391724, "learning_rate": 1.988756381047006e-05, "loss": 0.9088, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 144, "tokens_per_second_per_gpu": 16641.28, "total_tokens": 3684810 }, { "epoch": 0.01159721666799968, "grad_norm": 0.19826866686344147, "learning_rate": 1.988228381446553e-05, "loss": 0.9416, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 145, "tokens_per_second_per_gpu": 17656.98, "total_tokens": 3712176 }, { "epoch": 0.011677197472606574, "grad_norm": 0.21241246163845062, "learning_rate": 1.9876883405951378e-05, "loss": 0.9926, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 146, "tokens_per_second_per_gpu": 17247.13, "total_tokens": 3737518 }, { "epoch": 0.01175717827721347, "grad_norm": 0.21107642352581024, "learning_rate": 1.987136265072988e-05, "loss": 0.9405, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 147, "tokens_per_second_per_gpu": 17433.04, "total_tokens": 3763871 }, { "epoch": 0.011837159081820363, "grad_norm": 0.19587242603302002, "learning_rate": 1.9865721616069695e-05, "loss": 0.8517, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 148, "tokens_per_second_per_gpu": 17093.93, "total_tokens": 3790220 }, { "epoch": 0.011917139886427258, "grad_norm": 0.20216360688209534, "learning_rate": 1.985996037070505e-05, "loss": 0.8985, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 149, "tokens_per_second_per_gpu": 17259.66, "total_tokens": 3815740 }, { "epoch": 0.011997120691034152, "grad_norm": 0.2222292274236679, "learning_rate": 1.9854078984834904e-05, "loss": 0.9209, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 150, "tokens_per_second_per_gpu": 17035.62, "total_tokens": 3841798 }, { "epoch": 0.012077101495641045, "grad_norm": 0.21165066957473755, "learning_rate": 1.9848077530122083e-05, "loss": 0.9174, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 151, "tokens_per_second_per_gpu": 16628.36, "total_tokens": 3866966 }, { "epoch": 0.01215708230024794, "grad_norm": 0.23638273775577545, "learning_rate": 1.984195607969242e-05, "loss": 0.9104, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 152, "tokens_per_second_per_gpu": 16175.3, "total_tokens": 3891476 }, { "epoch": 0.012237063104854834, "grad_norm": 0.20337818562984467, "learning_rate": 1.983571470813386e-05, "loss": 0.9093, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 153, "tokens_per_second_per_gpu": 16886.31, "total_tokens": 3917465 }, { "epoch": 0.01231704390946173, "grad_norm": 0.2119511514902115, "learning_rate": 1.9829353491495545e-05, "loss": 0.8815, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 154, "tokens_per_second_per_gpu": 16518.37, "total_tokens": 3942537 }, { "epoch": 0.012397024714068623, "grad_norm": 0.205114483833313, "learning_rate": 1.982287250728689e-05, "loss": 0.9035, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 155, "tokens_per_second_per_gpu": 16966.41, "total_tokens": 3968215 }, { "epoch": 0.012477005518675519, "grad_norm": 0.21471446752548218, "learning_rate": 1.9816271834476642e-05, "loss": 0.9305, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 156, "tokens_per_second_per_gpu": 16764.47, "total_tokens": 3993218 }, { "epoch": 0.012556986323282412, "grad_norm": 0.208131805062294, "learning_rate": 1.9809551553491918e-05, "loss": 0.8548, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 157, "tokens_per_second_per_gpu": 16967.42, "total_tokens": 4018747 }, { "epoch": 0.012636967127889306, "grad_norm": 0.2270553857088089, "learning_rate": 1.9802711746217222e-05, "loss": 0.9206, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 158, "tokens_per_second_per_gpu": 17194.18, "total_tokens": 4044793 }, { "epoch": 0.012716947932496201, "grad_norm": 0.2232825756072998, "learning_rate": 1.979575249599344e-05, "loss": 0.9185, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 159, "tokens_per_second_per_gpu": 16239.69, "total_tokens": 4069372 }, { "epoch": 0.012796928737103095, "grad_norm": 0.22711730003356934, "learning_rate": 1.9788673887616852e-05, "loss": 0.8979, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 160, "tokens_per_second_per_gpu": 17230.88, "total_tokens": 4094803 }, { "epoch": 0.01287690954170999, "grad_norm": 0.23493967950344086, "learning_rate": 1.9781476007338058e-05, "loss": 0.9238, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 161, "tokens_per_second_per_gpu": 17095.3, "total_tokens": 4119931 }, { "epoch": 0.012956890346316884, "grad_norm": 0.24412371218204498, "learning_rate": 1.9774158942860962e-05, "loss": 0.8189, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 162, "tokens_per_second_per_gpu": 16862.59, "total_tokens": 4145075 }, { "epoch": 0.013036871150923779, "grad_norm": 0.228457972407341, "learning_rate": 1.9766722783341682e-05, "loss": 0.9137, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 163, "tokens_per_second_per_gpu": 16955.72, "total_tokens": 4170710 }, { "epoch": 0.013116851955530672, "grad_norm": 0.23934195935726166, "learning_rate": 1.9759167619387474e-05, "loss": 0.9302, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 164, "tokens_per_second_per_gpu": 16683.88, "total_tokens": 4195279 }, { "epoch": 0.013196832760137568, "grad_norm": 0.23014573752880096, "learning_rate": 1.9751493543055634e-05, "loss": 0.9042, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 165, "tokens_per_second_per_gpu": 16489.02, "total_tokens": 4219744 }, { "epoch": 0.013276813564744461, "grad_norm": 0.230689138174057, "learning_rate": 1.9743700647852356e-05, "loss": 0.9264, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 166, "tokens_per_second_per_gpu": 17158.8, "total_tokens": 4245795 }, { "epoch": 0.013356794369351355, "grad_norm": 0.23245392739772797, "learning_rate": 1.9735789028731603e-05, "loss": 0.8792, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 167, "tokens_per_second_per_gpu": 16252.68, "total_tokens": 4270401 }, { "epoch": 0.01343677517395825, "grad_norm": 0.43715667724609375, "learning_rate": 1.972775878209397e-05, "loss": 0.8955, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 168, "tokens_per_second_per_gpu": 16531.81, "total_tokens": 4295082 }, { "epoch": 0.013516755978565144, "grad_norm": 0.2350732386112213, "learning_rate": 1.9719610005785466e-05, "loss": 0.8973, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 169, "tokens_per_second_per_gpu": 16782.69, "total_tokens": 4319861 }, { "epoch": 0.01359673678317204, "grad_norm": 0.23498980700969696, "learning_rate": 1.971134279909636e-05, "loss": 0.949, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 170, "tokens_per_second_per_gpu": 17103.87, "total_tokens": 4345416 }, { "epoch": 0.013676717587778933, "grad_norm": 0.22032824158668518, "learning_rate": 1.9702957262759964e-05, "loss": 0.89, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 171, "tokens_per_second_per_gpu": 16754.18, "total_tokens": 4370945 }, { "epoch": 0.013756698392385828, "grad_norm": 0.22958935797214508, "learning_rate": 1.9694453498951392e-05, "loss": 0.9186, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 172, "tokens_per_second_per_gpu": 17127.91, "total_tokens": 4396853 }, { "epoch": 0.013836679196992722, "grad_norm": 0.24071195721626282, "learning_rate": 1.9685831611286312e-05, "loss": 0.962, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 173, "tokens_per_second_per_gpu": 17406.78, "total_tokens": 4422628 }, { "epoch": 0.013916660001599615, "grad_norm": 0.2658619284629822, "learning_rate": 1.9677091704819714e-05, "loss": 0.9132, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 174, "tokens_per_second_per_gpu": 16846.42, "total_tokens": 4448035 }, { "epoch": 0.01399664080620651, "grad_norm": 0.23434384167194366, "learning_rate": 1.9668233886044597e-05, "loss": 0.8774, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 175, "tokens_per_second_per_gpu": 17076.79, "total_tokens": 4473972 }, { "epoch": 0.014076621610813404, "grad_norm": 0.24713198840618134, "learning_rate": 1.9659258262890683e-05, "loss": 0.905, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 176, "tokens_per_second_per_gpu": 16949.62, "total_tokens": 4498614 }, { "epoch": 0.0141566024154203, "grad_norm": 0.2283277064561844, "learning_rate": 1.9650164944723116e-05, "loss": 0.8451, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 177, "tokens_per_second_per_gpu": 16328.89, "total_tokens": 4522729 }, { "epoch": 0.014236583220027193, "grad_norm": 0.23061935603618622, "learning_rate": 1.96409540423411e-05, "loss": 0.9068, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 178, "tokens_per_second_per_gpu": 17338.63, "total_tokens": 4549252 }, { "epoch": 0.014316564024634089, "grad_norm": 0.24155394732952118, "learning_rate": 1.9631625667976584e-05, "loss": 0.9077, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 179, "tokens_per_second_per_gpu": 17231.25, "total_tokens": 4575502 }, { "epoch": 0.014396544829240982, "grad_norm": 0.24086996912956238, "learning_rate": 1.9622179935292855e-05, "loss": 0.8863, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 180, "tokens_per_second_per_gpu": 17401.86, "total_tokens": 4601736 }, { "epoch": 0.014476525633847876, "grad_norm": 0.2347906082868576, "learning_rate": 1.961261695938319e-05, "loss": 0.8696, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 181, "tokens_per_second_per_gpu": 17379.79, "total_tokens": 4628714 }, { "epoch": 0.014556506438454771, "grad_norm": 0.24834582209587097, "learning_rate": 1.9602936856769432e-05, "loss": 0.866, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 182, "tokens_per_second_per_gpu": 17082.86, "total_tokens": 4654025 }, { "epoch": 0.014636487243061665, "grad_norm": 0.23946715891361237, "learning_rate": 1.9593139745400575e-05, "loss": 0.8721, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 183, "tokens_per_second_per_gpu": 17164.87, "total_tokens": 4679462 }, { "epoch": 0.01471646804766856, "grad_norm": 0.2574214041233063, "learning_rate": 1.9583225744651334e-05, "loss": 0.852, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 184, "tokens_per_second_per_gpu": 16882.9, "total_tokens": 4705235 }, { "epoch": 0.014796448852275454, "grad_norm": 0.2426890730857849, "learning_rate": 1.9573194975320672e-05, "loss": 0.8865, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 185, "tokens_per_second_per_gpu": 17466.12, "total_tokens": 4731568 }, { "epoch": 0.014876429656882349, "grad_norm": 0.24403586983680725, "learning_rate": 1.9563047559630356e-05, "loss": 0.8622, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 186, "tokens_per_second_per_gpu": 16780.87, "total_tokens": 4756596 }, { "epoch": 0.014956410461489242, "grad_norm": 0.24118457734584808, "learning_rate": 1.9552783621223437e-05, "loss": 0.8634, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 187, "tokens_per_second_per_gpu": 16944.71, "total_tokens": 4781534 }, { "epoch": 0.015036391266096136, "grad_norm": 0.23634915053844452, "learning_rate": 1.954240328516277e-05, "loss": 0.8703, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 188, "tokens_per_second_per_gpu": 17084.81, "total_tokens": 4807256 }, { "epoch": 0.015116372070703031, "grad_norm": 0.2323237955570221, "learning_rate": 1.9531906677929472e-05, "loss": 0.8458, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 189, "tokens_per_second_per_gpu": 16775.18, "total_tokens": 4832696 }, { "epoch": 0.015196352875309925, "grad_norm": 0.24547705054283142, "learning_rate": 1.9521293927421388e-05, "loss": 0.87, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 190, "tokens_per_second_per_gpu": 17118.63, "total_tokens": 4858403 }, { "epoch": 0.01527633367991682, "grad_norm": 0.252999484539032, "learning_rate": 1.9510565162951538e-05, "loss": 0.8938, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 191, "tokens_per_second_per_gpu": 17209.42, "total_tokens": 4884814 }, { "epoch": 0.015356314484523714, "grad_norm": 0.26226651668548584, "learning_rate": 1.9499720515246524e-05, "loss": 0.9225, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 192, "tokens_per_second_per_gpu": 17211.08, "total_tokens": 4910205 }, { "epoch": 0.01543629528913061, "grad_norm": 0.25136351585388184, "learning_rate": 1.9488760116444966e-05, "loss": 0.8292, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 193, "tokens_per_second_per_gpu": 16452.56, "total_tokens": 4934218 }, { "epoch": 0.015516276093737503, "grad_norm": 0.25001877546310425, "learning_rate": 1.947768410009586e-05, "loss": 0.8845, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 194, "tokens_per_second_per_gpu": 17134.82, "total_tokens": 4960601 }, { "epoch": 0.015596256898344398, "grad_norm": 0.24449992179870605, "learning_rate": 1.9466492601156964e-05, "loss": 0.8761, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 195, "tokens_per_second_per_gpu": 17246.7, "total_tokens": 4986559 }, { "epoch": 0.01567623770295129, "grad_norm": 0.24022875726222992, "learning_rate": 1.945518575599317e-05, "loss": 0.8345, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 196, "tokens_per_second_per_gpu": 16785.14, "total_tokens": 5011413 }, { "epoch": 0.015756218507558185, "grad_norm": 0.2391171008348465, "learning_rate": 1.944376370237481e-05, "loss": 0.8405, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 197, "tokens_per_second_per_gpu": 17283.57, "total_tokens": 5037700 }, { "epoch": 0.01583619931216508, "grad_norm": 0.2588050961494446, "learning_rate": 1.943222657947601e-05, "loss": 0.8114, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 198, "tokens_per_second_per_gpu": 16778.59, "total_tokens": 5062904 }, { "epoch": 0.015916180116771976, "grad_norm": 0.26281964778900146, "learning_rate": 1.942057452787297e-05, "loss": 0.8992, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 199, "tokens_per_second_per_gpu": 16845.87, "total_tokens": 5087758 }, { "epoch": 0.015996160921378868, "grad_norm": 0.2588569223880768, "learning_rate": 1.9408807689542257e-05, "loss": 0.7411, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 200, "tokens_per_second_per_gpu": 16066.45, "total_tokens": 5111019 }, { "epoch": 0.016076141725985763, "grad_norm": 0.2594797611236572, "learning_rate": 1.9396926207859085e-05, "loss": 0.8375, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 201, "tokens_per_second_per_gpu": 17125.37, "total_tokens": 5136688 }, { "epoch": 0.01615612253059266, "grad_norm": 0.2549116015434265, "learning_rate": 1.938493022759556e-05, "loss": 0.8919, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 202, "tokens_per_second_per_gpu": 17181.47, "total_tokens": 5162141 }, { "epoch": 0.01623610333519955, "grad_norm": 0.258368581533432, "learning_rate": 1.937281989491892e-05, "loss": 0.9158, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 203, "tokens_per_second_per_gpu": 17312.04, "total_tokens": 5188261 }, { "epoch": 0.016316084139806446, "grad_norm": 0.31648266315460205, "learning_rate": 1.9360595357389735e-05, "loss": 0.8818, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 204, "tokens_per_second_per_gpu": 17703.45, "total_tokens": 5214925 }, { "epoch": 0.01639606494441334, "grad_norm": 0.2698972523212433, "learning_rate": 1.9348256763960146e-05, "loss": 0.943, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 205, "tokens_per_second_per_gpu": 17321.57, "total_tokens": 5240212 }, { "epoch": 0.016476045749020236, "grad_norm": 0.2627377212047577, "learning_rate": 1.9335804264972018e-05, "loss": 0.8122, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 206, "tokens_per_second_per_gpu": 17099.13, "total_tokens": 5265405 }, { "epoch": 0.016556026553627128, "grad_norm": 0.2688179016113281, "learning_rate": 1.9323238012155125e-05, "loss": 0.8562, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 207, "tokens_per_second_per_gpu": 16059.88, "total_tokens": 5289288 }, { "epoch": 0.016636007358234024, "grad_norm": 0.2609153985977173, "learning_rate": 1.9310558158625286e-05, "loss": 0.8241, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 208, "tokens_per_second_per_gpu": 17391.96, "total_tokens": 5315382 }, { "epoch": 0.01671598816284092, "grad_norm": 0.26036036014556885, "learning_rate": 1.9297764858882516e-05, "loss": 0.8422, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 209, "tokens_per_second_per_gpu": 16765.8, "total_tokens": 5340531 }, { "epoch": 0.016795968967447814, "grad_norm": 0.25738534331321716, "learning_rate": 1.9284858268809135e-05, "loss": 0.8039, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 210, "tokens_per_second_per_gpu": 16745.69, "total_tokens": 5365785 }, { "epoch": 0.016875949772054706, "grad_norm": 0.2648962438106537, "learning_rate": 1.9271838545667876e-05, "loss": 0.8752, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 211, "tokens_per_second_per_gpu": 17666.5, "total_tokens": 5392095 }, { "epoch": 0.0169559305766616, "grad_norm": 0.2604057192802429, "learning_rate": 1.925870584809995e-05, "loss": 0.8895, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 212, "tokens_per_second_per_gpu": 17065.99, "total_tokens": 5417896 }, { "epoch": 0.017035911381268497, "grad_norm": 0.39727583527565, "learning_rate": 1.9245460336123136e-05, "loss": 0.8353, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 213, "tokens_per_second_per_gpu": 17509.61, "total_tokens": 5443736 }, { "epoch": 0.01711589218587539, "grad_norm": 0.27699121832847595, "learning_rate": 1.923210217112981e-05, "loss": 0.8254, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 214, "tokens_per_second_per_gpu": 16573.45, "total_tokens": 5468669 }, { "epoch": 0.017195872990482284, "grad_norm": 0.2744996249675751, "learning_rate": 1.9218631515885007e-05, "loss": 0.8669, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 215, "tokens_per_second_per_gpu": 17213.09, "total_tokens": 5493788 }, { "epoch": 0.01727585379508918, "grad_norm": 0.27408525347709656, "learning_rate": 1.9205048534524405e-05, "loss": 0.8753, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 216, "tokens_per_second_per_gpu": 17308.66, "total_tokens": 5520168 }, { "epoch": 0.017355834599696075, "grad_norm": 0.279653400182724, "learning_rate": 1.9191353392552346e-05, "loss": 0.8309, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 217, "tokens_per_second_per_gpu": 16264.72, "total_tokens": 5544200 }, { "epoch": 0.017435815404302966, "grad_norm": 0.2667289078235626, "learning_rate": 1.9177546256839814e-05, "loss": 0.8341, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 218, "tokens_per_second_per_gpu": 17247.1, "total_tokens": 5570605 }, { "epoch": 0.017515796208909862, "grad_norm": 0.2734803557395935, "learning_rate": 1.9163627295622397e-05, "loss": 0.8676, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 219, "tokens_per_second_per_gpu": 17464.35, "total_tokens": 5596671 }, { "epoch": 0.017595777013516757, "grad_norm": 0.2817804217338562, "learning_rate": 1.914959667849825e-05, "loss": 0.7697, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 220, "tokens_per_second_per_gpu": 16843.97, "total_tokens": 5621627 }, { "epoch": 0.01767575781812365, "grad_norm": 0.27030467987060547, "learning_rate": 1.913545457642601e-05, "loss": 0.9119, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 221, "tokens_per_second_per_gpu": 17478.61, "total_tokens": 5648136 }, { "epoch": 0.017755738622730544, "grad_norm": 0.27667850255966187, "learning_rate": 1.9121201161722732e-05, "loss": 0.879, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 222, "tokens_per_second_per_gpu": 17271.95, "total_tokens": 5674496 }, { "epoch": 0.01783571942733744, "grad_norm": 0.2836981415748596, "learning_rate": 1.910683660806177e-05, "loss": 0.8441, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 223, "tokens_per_second_per_gpu": 17159.55, "total_tokens": 5700058 }, { "epoch": 0.017915700231944335, "grad_norm": 0.2744138538837433, "learning_rate": 1.9092361090470688e-05, "loss": 0.847, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 224, "tokens_per_second_per_gpu": 17264.31, "total_tokens": 5725619 }, { "epoch": 0.017995681036551227, "grad_norm": 0.26166272163391113, "learning_rate": 1.907777478532909e-05, "loss": 0.8067, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 225, "tokens_per_second_per_gpu": 17051.11, "total_tokens": 5750839 }, { "epoch": 0.018075661841158122, "grad_norm": 0.2761372923851013, "learning_rate": 1.9063077870366504e-05, "loss": 0.8297, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 226, "tokens_per_second_per_gpu": 16956.47, "total_tokens": 5776313 }, { "epoch": 0.018155642645765017, "grad_norm": 0.27935782074928284, "learning_rate": 1.9048270524660197e-05, "loss": 0.8442, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 227, "tokens_per_second_per_gpu": 16999.99, "total_tokens": 5802194 }, { "epoch": 0.01823562345037191, "grad_norm": 0.2994026839733124, "learning_rate": 1.903335292863301e-05, "loss": 0.8445, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 228, "tokens_per_second_per_gpu": 16475.4, "total_tokens": 5826348 }, { "epoch": 0.018315604254978805, "grad_norm": 0.2798149883747101, "learning_rate": 1.901832526405114e-05, "loss": 0.8692, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 229, "tokens_per_second_per_gpu": 17101.79, "total_tokens": 5851978 }, { "epoch": 0.0183955850595857, "grad_norm": 0.2609909474849701, "learning_rate": 1.9003187714021936e-05, "loss": 0.7482, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 230, "tokens_per_second_per_gpu": 16552.15, "total_tokens": 5877167 }, { "epoch": 0.018475565864192595, "grad_norm": 0.29680418968200684, "learning_rate": 1.8987940462991673e-05, "loss": 0.861, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 231, "tokens_per_second_per_gpu": 16968.37, "total_tokens": 5902589 }, { "epoch": 0.018555546668799487, "grad_norm": 0.2876088321208954, "learning_rate": 1.8972583696743284e-05, "loss": 0.8511, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 232, "tokens_per_second_per_gpu": 16522.68, "total_tokens": 5927666 }, { "epoch": 0.018635527473406382, "grad_norm": 0.2777324318885803, "learning_rate": 1.895711760239413e-05, "loss": 0.7771, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 233, "tokens_per_second_per_gpu": 16851.59, "total_tokens": 5953221 }, { "epoch": 0.018715508278013278, "grad_norm": 0.29070353507995605, "learning_rate": 1.8941542368393683e-05, "loss": 0.8033, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 234, "tokens_per_second_per_gpu": 16504.59, "total_tokens": 5978512 }, { "epoch": 0.01879548908262017, "grad_norm": 0.29157114028930664, "learning_rate": 1.892585818452126e-05, "loss": 0.8529, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 235, "tokens_per_second_per_gpu": 16711.56, "total_tokens": 6003185 }, { "epoch": 0.018875469887227065, "grad_norm": 0.30835041403770447, "learning_rate": 1.891006524188368e-05, "loss": 0.8518, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 236, "tokens_per_second_per_gpu": 16536.32, "total_tokens": 6027878 }, { "epoch": 0.01895545069183396, "grad_norm": 0.2955070436000824, "learning_rate": 1.889416373291298e-05, "loss": 0.8672, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 237, "tokens_per_second_per_gpu": 16792.68, "total_tokens": 6053557 }, { "epoch": 0.019035431496440856, "grad_norm": 0.2786145806312561, "learning_rate": 1.8878153851364013e-05, "loss": 0.8302, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 238, "tokens_per_second_per_gpu": 17428.54, "total_tokens": 6080795 }, { "epoch": 0.019115412301047748, "grad_norm": 0.2858044505119324, "learning_rate": 1.8862035792312148e-05, "loss": 0.8413, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 239, "tokens_per_second_per_gpu": 17091.79, "total_tokens": 6106593 }, { "epoch": 0.019195393105654643, "grad_norm": 0.29661673307418823, "learning_rate": 1.884580975215084e-05, "loss": 0.8058, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 240, "tokens_per_second_per_gpu": 16379.78, "total_tokens": 6130856 }, { "epoch": 0.019275373910261538, "grad_norm": 0.2872996926307678, "learning_rate": 1.8829475928589272e-05, "loss": 0.8136, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 241, "tokens_per_second_per_gpu": 16644.37, "total_tokens": 6156050 }, { "epoch": 0.01935535471486843, "grad_norm": 0.29381078481674194, "learning_rate": 1.8813034520649923e-05, "loss": 0.8415, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 242, "tokens_per_second_per_gpu": 16654.68, "total_tokens": 6181435 }, { "epoch": 0.019435335519475325, "grad_norm": 0.28002533316612244, "learning_rate": 1.879648572866617e-05, "loss": 0.7861, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 243, "tokens_per_second_per_gpu": 16498.47, "total_tokens": 6206084 }, { "epoch": 0.01951531632408222, "grad_norm": 0.30103883147239685, "learning_rate": 1.8779829754279806e-05, "loss": 0.8378, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 244, "tokens_per_second_per_gpu": 17231.92, "total_tokens": 6232452 }, { "epoch": 0.019595297128689116, "grad_norm": 0.3162606358528137, "learning_rate": 1.8763066800438638e-05, "loss": 0.8362, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 245, "tokens_per_second_per_gpu": 17419.8, "total_tokens": 6258179 }, { "epoch": 0.019675277933296008, "grad_norm": 0.30273863673210144, "learning_rate": 1.874619707139396e-05, "loss": 0.8654, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 246, "tokens_per_second_per_gpu": 16774.8, "total_tokens": 6283069 }, { "epoch": 0.019755258737902903, "grad_norm": 0.2920013666152954, "learning_rate": 1.8729220772698096e-05, "loss": 0.799, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 247, "tokens_per_second_per_gpu": 16609.12, "total_tokens": 6308439 }, { "epoch": 0.0198352395425098, "grad_norm": 0.28597742319107056, "learning_rate": 1.8712138111201898e-05, "loss": 0.7502, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 248, "tokens_per_second_per_gpu": 16656.5, "total_tokens": 6333609 }, { "epoch": 0.01991522034711669, "grad_norm": 0.3035345673561096, "learning_rate": 1.869494929505219e-05, "loss": 0.8001, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 249, "tokens_per_second_per_gpu": 16881.32, "total_tokens": 6358608 }, { "epoch": 0.019995201151723586, "grad_norm": 0.2953839600086212, "learning_rate": 1.8677654533689287e-05, "loss": 0.7813, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 250, "tokens_per_second_per_gpu": 16977.53, "total_tokens": 6383721 }, { "epoch": 0.02007518195633048, "grad_norm": 0.3125785291194916, "learning_rate": 1.866025403784439e-05, "loss": 0.8195, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 251, "tokens_per_second_per_gpu": 16863.17, "total_tokens": 6408806 }, { "epoch": 0.020155162760937376, "grad_norm": 0.2873575985431671, "learning_rate": 1.864274801953705e-05, "loss": 0.8268, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 252, "tokens_per_second_per_gpu": 17229.79, "total_tokens": 6434899 }, { "epoch": 0.020235143565544268, "grad_norm": 0.29636356234550476, "learning_rate": 1.8625136692072577e-05, "loss": 0.8041, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 253, "tokens_per_second_per_gpu": 17474.25, "total_tokens": 6462145 }, { "epoch": 0.020315124370151164, "grad_norm": 0.29690074920654297, "learning_rate": 1.860742027003944e-05, "loss": 0.8282, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 254, "tokens_per_second_per_gpu": 17097.68, "total_tokens": 6488193 }, { "epoch": 0.02039510517475806, "grad_norm": 0.4087201654911041, "learning_rate": 1.8589598969306646e-05, "loss": 0.7644, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 255, "tokens_per_second_per_gpu": 16154.98, "total_tokens": 6512510 }, { "epoch": 0.02047508597936495, "grad_norm": 0.2895331084728241, "learning_rate": 1.8571673007021124e-05, "loss": 0.8014, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 256, "tokens_per_second_per_gpu": 17165.77, "total_tokens": 6538276 }, { "epoch": 0.020555066783971846, "grad_norm": 0.3026330769062042, "learning_rate": 1.855364260160507e-05, "loss": 0.7987, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 257, "tokens_per_second_per_gpu": 16744.28, "total_tokens": 6563490 }, { "epoch": 0.02063504758857874, "grad_norm": 0.32229679822921753, "learning_rate": 1.8535507972753275e-05, "loss": 0.8214, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 258, "tokens_per_second_per_gpu": 17172.36, "total_tokens": 6589271 }, { "epoch": 0.020715028393185637, "grad_norm": 0.3137056231498718, "learning_rate": 1.851726934143048e-05, "loss": 0.7672, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 259, "tokens_per_second_per_gpu": 16452.74, "total_tokens": 6613284 }, { "epoch": 0.02079500919779253, "grad_norm": 0.28917086124420166, "learning_rate": 1.849892692986864e-05, "loss": 0.7599, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 260, "tokens_per_second_per_gpu": 16640.08, "total_tokens": 6637930 }, { "epoch": 0.020874990002399424, "grad_norm": 0.2955164313316345, "learning_rate": 1.848048096156426e-05, "loss": 0.8276, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 261, "tokens_per_second_per_gpu": 17161.26, "total_tokens": 6663889 }, { "epoch": 0.02095497080700632, "grad_norm": 0.3261178731918335, "learning_rate": 1.8461931661275642e-05, "loss": 0.8166, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 262, "tokens_per_second_per_gpu": 17195.28, "total_tokens": 6689227 }, { "epoch": 0.02103495161161321, "grad_norm": 0.33998236060142517, "learning_rate": 1.8443279255020153e-05, "loss": 0.8438, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 263, "tokens_per_second_per_gpu": 17061.4, "total_tokens": 6714449 }, { "epoch": 0.021114932416220106, "grad_norm": 0.2960314154624939, "learning_rate": 1.842452397007148e-05, "loss": 0.7353, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 264, "tokens_per_second_per_gpu": 16816.51, "total_tokens": 6739649 }, { "epoch": 0.021194913220827002, "grad_norm": 0.3163682520389557, "learning_rate": 1.8405666034956842e-05, "loss": 0.7903, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 265, "tokens_per_second_per_gpu": 16833.27, "total_tokens": 6764236 }, { "epoch": 0.021274894025433897, "grad_norm": 0.3097144365310669, "learning_rate": 1.8386705679454243e-05, "loss": 0.7922, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 266, "tokens_per_second_per_gpu": 17249.38, "total_tokens": 6790713 }, { "epoch": 0.02135487483004079, "grad_norm": 0.30585765838623047, "learning_rate": 1.836764313458962e-05, "loss": 0.7718, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 267, "tokens_per_second_per_gpu": 17056.67, "total_tokens": 6816552 }, { "epoch": 0.021434855634647684, "grad_norm": 0.31744128465652466, "learning_rate": 1.8348478632634067e-05, "loss": 0.8042, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 268, "tokens_per_second_per_gpu": 16660.22, "total_tokens": 6842118 }, { "epoch": 0.02151483643925458, "grad_norm": 0.32907190918922424, "learning_rate": 1.8329212407100996e-05, "loss": 0.7517, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 269, "tokens_per_second_per_gpu": 16136.63, "total_tokens": 6865763 }, { "epoch": 0.021594817243861475, "grad_norm": 0.30771222710609436, "learning_rate": 1.8309844692743283e-05, "loss": 0.7819, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 270, "tokens_per_second_per_gpu": 17209.72, "total_tokens": 6891634 }, { "epoch": 0.021674798048468367, "grad_norm": 0.31215161085128784, "learning_rate": 1.8290375725550417e-05, "loss": 0.7773, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 271, "tokens_per_second_per_gpu": 16681.19, "total_tokens": 6917029 }, { "epoch": 0.021754778853075262, "grad_norm": 0.31891316175460815, "learning_rate": 1.827080574274562e-05, "loss": 0.8129, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 272, "tokens_per_second_per_gpu": 17075.42, "total_tokens": 6942625 }, { "epoch": 0.021834759657682157, "grad_norm": 0.33234041929244995, "learning_rate": 1.8251134982782952e-05, "loss": 0.8021, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 273, "tokens_per_second_per_gpu": 16993.73, "total_tokens": 6968068 }, { "epoch": 0.02191474046228905, "grad_norm": 0.3127538859844208, "learning_rate": 1.8231363685344422e-05, "loss": 0.8295, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 274, "tokens_per_second_per_gpu": 16858.57, "total_tokens": 6993212 }, { "epoch": 0.021994721266895945, "grad_norm": 0.3256042003631592, "learning_rate": 1.821149209133704e-05, "loss": 0.7551, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 275, "tokens_per_second_per_gpu": 16664.19, "total_tokens": 7018151 }, { "epoch": 0.02207470207150284, "grad_norm": 0.33830273151397705, "learning_rate": 1.819152044288992e-05, "loss": 0.7927, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 276, "tokens_per_second_per_gpu": 16526.59, "total_tokens": 7042282 }, { "epoch": 0.022154682876109735, "grad_norm": 0.32764095067977905, "learning_rate": 1.8171448983351284e-05, "loss": 0.8133, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 277, "tokens_per_second_per_gpu": 16885.71, "total_tokens": 7068020 }, { "epoch": 0.022234663680716627, "grad_norm": 0.3218875527381897, "learning_rate": 1.815127795728554e-05, "loss": 0.8018, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 278, "tokens_per_second_per_gpu": 17189.97, "total_tokens": 7093302 }, { "epoch": 0.022314644485323522, "grad_norm": 0.304941326379776, "learning_rate": 1.8131007610470278e-05, "loss": 0.7814, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 279, "tokens_per_second_per_gpu": 17206.92, "total_tokens": 7119187 }, { "epoch": 0.022394625289930418, "grad_norm": 0.340358704328537, "learning_rate": 1.8110638189893267e-05, "loss": 0.8054, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 280, "tokens_per_second_per_gpu": 16899.54, "total_tokens": 7144790 }, { "epoch": 0.02247460609453731, "grad_norm": 0.3224817216396332, "learning_rate": 1.8090169943749477e-05, "loss": 0.808, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 281, "tokens_per_second_per_gpu": 16848.86, "total_tokens": 7170086 }, { "epoch": 0.022554586899144205, "grad_norm": 0.3096613585948944, "learning_rate": 1.806960312143802e-05, "loss": 0.7462, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 282, "tokens_per_second_per_gpu": 17102.35, "total_tokens": 7196108 }, { "epoch": 0.0226345677037511, "grad_norm": 0.3089353144168854, "learning_rate": 1.804893797355914e-05, "loss": 0.7468, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 283, "tokens_per_second_per_gpu": 16643.16, "total_tokens": 7221381 }, { "epoch": 0.022714548508357996, "grad_norm": 0.3287941813468933, "learning_rate": 1.8028174751911147e-05, "loss": 0.7332, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 284, "tokens_per_second_per_gpu": 16230.46, "total_tokens": 7245664 }, { "epoch": 0.022794529312964888, "grad_norm": 0.33320385217666626, "learning_rate": 1.8007313709487334e-05, "loss": 0.769, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 285, "tokens_per_second_per_gpu": 17250.49, "total_tokens": 7271281 }, { "epoch": 0.022874510117571783, "grad_norm": 0.33193832635879517, "learning_rate": 1.798635510047293e-05, "loss": 0.7806, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 286, "tokens_per_second_per_gpu": 17234.16, "total_tokens": 7297650 }, { "epoch": 0.022954490922178678, "grad_norm": 0.3023802638053894, "learning_rate": 1.7965299180241963e-05, "loss": 0.727, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 287, "tokens_per_second_per_gpu": 16947.96, "total_tokens": 7323634 }, { "epoch": 0.02303447172678557, "grad_norm": 0.3405572772026062, "learning_rate": 1.7944146205354182e-05, "loss": 0.7677, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 288, "tokens_per_second_per_gpu": 17075.52, "total_tokens": 7348943 }, { "epoch": 0.023114452531392465, "grad_norm": 0.33041706681251526, "learning_rate": 1.792289643355191e-05, "loss": 0.771, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 289, "tokens_per_second_per_gpu": 17190.81, "total_tokens": 7374829 }, { "epoch": 0.02319443333599936, "grad_norm": 0.3304063677787781, "learning_rate": 1.7901550123756906e-05, "loss": 0.7701, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 290, "tokens_per_second_per_gpu": 17388.73, "total_tokens": 7401339 }, { "epoch": 0.023274414140606256, "grad_norm": 0.3571583032608032, "learning_rate": 1.788010753606722e-05, "loss": 0.7701, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 291, "tokens_per_second_per_gpu": 16986.08, "total_tokens": 7426830 }, { "epoch": 0.023354394945213148, "grad_norm": 0.3259941339492798, "learning_rate": 1.785856893175402e-05, "loss": 0.7765, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 292, "tokens_per_second_per_gpu": 17349.81, "total_tokens": 7452984 }, { "epoch": 0.023434375749820043, "grad_norm": 0.3239382803440094, "learning_rate": 1.78369345732584e-05, "loss": 0.7691, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 293, "tokens_per_second_per_gpu": 17073.92, "total_tokens": 7477969 }, { "epoch": 0.02351435655442694, "grad_norm": 0.3326447010040283, "learning_rate": 1.781520472418819e-05, "loss": 0.7332, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 294, "tokens_per_second_per_gpu": 16657.1, "total_tokens": 7502565 }, { "epoch": 0.02359433735903383, "grad_norm": 0.34120991826057434, "learning_rate": 1.7793379649314743e-05, "loss": 0.7993, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 295, "tokens_per_second_per_gpu": 16742.23, "total_tokens": 7527568 }, { "epoch": 0.023674318163640726, "grad_norm": 0.33794164657592773, "learning_rate": 1.777145961456971e-05, "loss": 0.8072, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 296, "tokens_per_second_per_gpu": 17036.53, "total_tokens": 7552596 }, { "epoch": 0.02375429896824762, "grad_norm": 0.3645365834236145, "learning_rate": 1.7749444887041797e-05, "loss": 0.7621, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 297, "tokens_per_second_per_gpu": 16764.81, "total_tokens": 7577121 }, { "epoch": 0.023834279772854516, "grad_norm": 0.35922765731811523, "learning_rate": 1.7727335734973512e-05, "loss": 0.7771, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 298, "tokens_per_second_per_gpu": 17504.58, "total_tokens": 7602943 }, { "epoch": 0.023914260577461408, "grad_norm": 0.3424239456653595, "learning_rate": 1.7705132427757895e-05, "loss": 0.729, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 299, "tokens_per_second_per_gpu": 16983.33, "total_tokens": 7627654 }, { "epoch": 0.023994241382068304, "grad_norm": 0.34089338779449463, "learning_rate": 1.7682835235935236e-05, "loss": 0.7803, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 300, "tokens_per_second_per_gpu": 16880.41, "total_tokens": 7653306 }, { "epoch": 0.0240742221866752, "grad_norm": 0.3372519016265869, "learning_rate": 1.766044443118978e-05, "loss": 0.7079, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 301, "tokens_per_second_per_gpu": 16821.65, "total_tokens": 7678769 }, { "epoch": 0.02415420299128209, "grad_norm": 0.33545535802841187, "learning_rate": 1.7637960286346423e-05, "loss": 0.7465, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 302, "tokens_per_second_per_gpu": 16997.07, "total_tokens": 7704035 }, { "epoch": 0.024234183795888986, "grad_norm": 0.35364168882369995, "learning_rate": 1.761538307536737e-05, "loss": 0.8609, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 303, "tokens_per_second_per_gpu": 17088.38, "total_tokens": 7729313 }, { "epoch": 0.02431416460049588, "grad_norm": 0.3543623089790344, "learning_rate": 1.759271307334881e-05, "loss": 0.7496, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 304, "tokens_per_second_per_gpu": 16831.61, "total_tokens": 7754311 }, { "epoch": 0.024394145405102777, "grad_norm": 0.35020682215690613, "learning_rate": 1.7569950556517566e-05, "loss": 0.8171, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 305, "tokens_per_second_per_gpu": 17372.62, "total_tokens": 7780114 }, { "epoch": 0.02447412620970967, "grad_norm": 0.3287740647792816, "learning_rate": 1.7547095802227723e-05, "loss": 0.756, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 306, "tokens_per_second_per_gpu": 17061.4, "total_tokens": 7806162 }, { "epoch": 0.024554107014316564, "grad_norm": 0.347204327583313, "learning_rate": 1.7524149088957244e-05, "loss": 0.7294, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 307, "tokens_per_second_per_gpu": 16759.71, "total_tokens": 7831007 }, { "epoch": 0.02463408781892346, "grad_norm": 0.36060085892677307, "learning_rate": 1.7501110696304598e-05, "loss": 0.7522, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 308, "tokens_per_second_per_gpu": 16761.57, "total_tokens": 7855632 }, { "epoch": 0.02471406862353035, "grad_norm": 0.3645978569984436, "learning_rate": 1.747798090498532e-05, "loss": 0.7788, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 309, "tokens_per_second_per_gpu": 17255.8, "total_tokens": 7881203 }, { "epoch": 0.024794049428137246, "grad_norm": 0.37542036175727844, "learning_rate": 1.7454759996828622e-05, "loss": 0.7185, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 310, "tokens_per_second_per_gpu": 16261.57, "total_tokens": 7905495 }, { "epoch": 0.024874030232744142, "grad_norm": 0.34638047218322754, "learning_rate": 1.7431448254773943e-05, "loss": 0.7835, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 311, "tokens_per_second_per_gpu": 17285.95, "total_tokens": 7932504 }, { "epoch": 0.024954011037351037, "grad_norm": 0.342735230922699, "learning_rate": 1.74080459628675e-05, "loss": 0.777, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 312, "tokens_per_second_per_gpu": 17247.27, "total_tokens": 7958943 }, { "epoch": 0.02503399184195793, "grad_norm": 0.3410895764827728, "learning_rate": 1.7384553406258842e-05, "loss": 0.7335, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 313, "tokens_per_second_per_gpu": 16698.56, "total_tokens": 7984134 }, { "epoch": 0.025113972646564824, "grad_norm": 0.3840852379798889, "learning_rate": 1.7360970871197347e-05, "loss": 0.7869, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 314, "tokens_per_second_per_gpu": 17022.06, "total_tokens": 8009259 }, { "epoch": 0.02519395345117172, "grad_norm": 0.36912381649017334, "learning_rate": 1.7337298645028764e-05, "loss": 0.7762, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 315, "tokens_per_second_per_gpu": 17332.18, "total_tokens": 8035054 }, { "epoch": 0.02527393425577861, "grad_norm": 0.3521462082862854, "learning_rate": 1.7313537016191706e-05, "loss": 0.8045, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 316, "tokens_per_second_per_gpu": 17373.03, "total_tokens": 8061906 }, { "epoch": 0.025353915060385507, "grad_norm": 0.33142420649528503, "learning_rate": 1.7289686274214116e-05, "loss": 0.7184, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 317, "tokens_per_second_per_gpu": 17144.27, "total_tokens": 8087938 }, { "epoch": 0.025433895864992402, "grad_norm": 0.3533654808998108, "learning_rate": 1.7265746709709762e-05, "loss": 0.7285, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 318, "tokens_per_second_per_gpu": 16330.33, "total_tokens": 8112163 }, { "epoch": 0.025513876669599297, "grad_norm": 0.37131303548812866, "learning_rate": 1.7241718614374678e-05, "loss": 0.7473, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 319, "tokens_per_second_per_gpu": 16981.6, "total_tokens": 8137330 }, { "epoch": 0.02559385747420619, "grad_norm": 0.3532845675945282, "learning_rate": 1.7217602280983622e-05, "loss": 0.7588, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 320, "tokens_per_second_per_gpu": 16989.56, "total_tokens": 8163328 }, { "epoch": 0.025673838278813085, "grad_norm": 0.3767626881599426, "learning_rate": 1.7193398003386514e-05, "loss": 0.7187, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 321, "tokens_per_second_per_gpu": 16326.84, "total_tokens": 8187352 }, { "epoch": 0.02575381908341998, "grad_norm": 0.35990527272224426, "learning_rate": 1.716910607650483e-05, "loss": 0.7561, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 322, "tokens_per_second_per_gpu": 16850.09, "total_tokens": 8212291 }, { "epoch": 0.025833799888026872, "grad_norm": 0.3629964590072632, "learning_rate": 1.7144726796328034e-05, "loss": 0.7782, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 323, "tokens_per_second_per_gpu": 17002.36, "total_tokens": 8237584 }, { "epoch": 0.025913780692633767, "grad_norm": 0.3404940068721771, "learning_rate": 1.712026045990997e-05, "loss": 0.7393, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 324, "tokens_per_second_per_gpu": 17033.24, "total_tokens": 8263414 }, { "epoch": 0.025993761497240662, "grad_norm": 0.3736456334590912, "learning_rate": 1.709570736536521e-05, "loss": 0.7916, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 325, "tokens_per_second_per_gpu": 17195.68, "total_tokens": 8289808 }, { "epoch": 0.026073742301847558, "grad_norm": 0.3524475693702698, "learning_rate": 1.7071067811865477e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 326, "tokens_per_second_per_gpu": 16341.29, "total_tokens": 8314490 }, { "epoch": 0.02615372310645445, "grad_norm": 0.3585701286792755, "learning_rate": 1.7046342099635948e-05, "loss": 0.7295, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 327, "tokens_per_second_per_gpu": 16935.65, "total_tokens": 8339442 }, { "epoch": 0.026233703911061345, "grad_norm": 0.3640107810497284, "learning_rate": 1.7021530529951627e-05, "loss": 0.7135, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 328, "tokens_per_second_per_gpu": 16629.7, "total_tokens": 8364542 }, { "epoch": 0.02631368471566824, "grad_norm": 0.3694165050983429, "learning_rate": 1.6996633405133656e-05, "loss": 0.7346, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 329, "tokens_per_second_per_gpu": 16903.87, "total_tokens": 8389153 }, { "epoch": 0.026393665520275136, "grad_norm": 0.42035412788391113, "learning_rate": 1.697165102854565e-05, "loss": 0.7501, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 330, "tokens_per_second_per_gpu": 16961.37, "total_tokens": 8414362 }, { "epoch": 0.026473646324882028, "grad_norm": 0.36393973231315613, "learning_rate": 1.6946583704589973e-05, "loss": 0.7965, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 331, "tokens_per_second_per_gpu": 16662.47, "total_tokens": 8439456 }, { "epoch": 0.026553627129488923, "grad_norm": 0.36519739031791687, "learning_rate": 1.692143173870407e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 332, "tokens_per_second_per_gpu": 16982.28, "total_tokens": 8465047 }, { "epoch": 0.026633607934095818, "grad_norm": 0.36028608679771423, "learning_rate": 1.68961954373567e-05, "loss": 0.7372, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 333, "tokens_per_second_per_gpu": 17268.51, "total_tokens": 8491108 }, { "epoch": 0.02671358873870271, "grad_norm": 0.3669857382774353, "learning_rate": 1.6870875108044233e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 334, "tokens_per_second_per_gpu": 17302.25, "total_tokens": 8517165 }, { "epoch": 0.026793569543309605, "grad_norm": 0.3491288721561432, "learning_rate": 1.684547105928689e-05, "loss": 0.7207, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 335, "tokens_per_second_per_gpu": 16568.06, "total_tokens": 8541827 }, { "epoch": 0.0268735503479165, "grad_norm": 0.3872898817062378, "learning_rate": 1.6819983600624986e-05, "loss": 0.7689, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 336, "tokens_per_second_per_gpu": 16928.56, "total_tokens": 8566999 }, { "epoch": 0.026953531152523396, "grad_norm": 0.3505984842777252, "learning_rate": 1.6794413042615168e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 337, "tokens_per_second_per_gpu": 16909.05, "total_tokens": 8591876 }, { "epoch": 0.027033511957130288, "grad_norm": 0.37660378217697144, "learning_rate": 1.6768759696826608e-05, "loss": 0.7235, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 338, "tokens_per_second_per_gpu": 17084.11, "total_tokens": 8617647 }, { "epoch": 0.027113492761737183, "grad_norm": 0.38223305344581604, "learning_rate": 1.6743023875837233e-05, "loss": 0.7838, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 339, "tokens_per_second_per_gpu": 17355.85, "total_tokens": 8643982 }, { "epoch": 0.02719347356634408, "grad_norm": 0.3753760755062103, "learning_rate": 1.6717205893229904e-05, "loss": 0.7303, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 340, "tokens_per_second_per_gpu": 16722.16, "total_tokens": 8669112 }, { "epoch": 0.02727345437095097, "grad_norm": 0.3831718862056732, "learning_rate": 1.6691306063588583e-05, "loss": 0.7441, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 341, "tokens_per_second_per_gpu": 16783.28, "total_tokens": 8694589 }, { "epoch": 0.027353435175557866, "grad_norm": 0.38198089599609375, "learning_rate": 1.6665324702494524e-05, "loss": 0.7216, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 342, "tokens_per_second_per_gpu": 16233.97, "total_tokens": 8719038 }, { "epoch": 0.02743341598016476, "grad_norm": 0.37571123242378235, "learning_rate": 1.6639262126522417e-05, "loss": 0.8385, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 343, "tokens_per_second_per_gpu": 17524.97, "total_tokens": 8745289 }, { "epoch": 0.027513396784771656, "grad_norm": 0.3696345388889313, "learning_rate": 1.661311865323652e-05, "loss": 0.7894, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 344, "tokens_per_second_per_gpu": 16521.06, "total_tokens": 8770700 }, { "epoch": 0.027593377589378548, "grad_norm": 0.3620677590370178, "learning_rate": 1.6586894601186804e-05, "loss": 0.7883, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 345, "tokens_per_second_per_gpu": 17402.15, "total_tokens": 8797084 }, { "epoch": 0.027673358393985444, "grad_norm": 0.372738242149353, "learning_rate": 1.6560590289905074e-05, "loss": 0.7291, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 346, "tokens_per_second_per_gpu": 17215.48, "total_tokens": 8822494 }, { "epoch": 0.02775333919859234, "grad_norm": 0.3729492425918579, "learning_rate": 1.6534206039901057e-05, "loss": 0.7847, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 347, "tokens_per_second_per_gpu": 16917.18, "total_tokens": 8847694 }, { "epoch": 0.02783332000319923, "grad_norm": 0.3795606791973114, "learning_rate": 1.650774217265851e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 348, "tokens_per_second_per_gpu": 16262.38, "total_tokens": 8871607 }, { "epoch": 0.027913300807806126, "grad_norm": 0.38951990008354187, "learning_rate": 1.6481199010631312e-05, "loss": 0.732, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 349, "tokens_per_second_per_gpu": 16713.27, "total_tokens": 8896607 }, { "epoch": 0.02799328161241302, "grad_norm": 0.37609028816223145, "learning_rate": 1.645457687723951e-05, "loss": 0.7056, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 350, "tokens_per_second_per_gpu": 17036.12, "total_tokens": 8921918 }, { "epoch": 0.028073262417019917, "grad_norm": 0.354303240776062, "learning_rate": 1.6427876096865394e-05, "loss": 0.7704, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 351, "tokens_per_second_per_gpu": 17543.34, "total_tokens": 8948802 }, { "epoch": 0.02815324322162681, "grad_norm": 0.367156445980072, "learning_rate": 1.6401096994849558e-05, "loss": 0.7856, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 352, "tokens_per_second_per_gpu": 17447.98, "total_tokens": 8975725 }, { "epoch": 0.028233224026233704, "grad_norm": 0.3801327645778656, "learning_rate": 1.63742398974869e-05, "loss": 0.7162, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 353, "tokens_per_second_per_gpu": 16838.46, "total_tokens": 9000553 }, { "epoch": 0.0283132048308406, "grad_norm": 0.3771909773349762, "learning_rate": 1.6347305132022677e-05, "loss": 0.7503, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 354, "tokens_per_second_per_gpu": 17019.93, "total_tokens": 9026503 }, { "epoch": 0.02839318563544749, "grad_norm": 0.3548984229564667, "learning_rate": 1.632029302664851e-05, "loss": 0.7121, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 355, "tokens_per_second_per_gpu": 17243.63, "total_tokens": 9052692 }, { "epoch": 0.028473166440054386, "grad_norm": 0.38791143894195557, "learning_rate": 1.6293203910498375e-05, "loss": 0.7143, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 356, "tokens_per_second_per_gpu": 16719.42, "total_tokens": 9077149 }, { "epoch": 0.028553147244661282, "grad_norm": 0.37814652919769287, "learning_rate": 1.6266038113644605e-05, "loss": 0.7185, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 357, "tokens_per_second_per_gpu": 16720.19, "total_tokens": 9102344 }, { "epoch": 0.028633128049268177, "grad_norm": 0.39943739771842957, "learning_rate": 1.6238795967093865e-05, "loss": 0.7723, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 358, "tokens_per_second_per_gpu": 17517.32, "total_tokens": 9128929 }, { "epoch": 0.02871310885387507, "grad_norm": 0.3772953748703003, "learning_rate": 1.6211477802783105e-05, "loss": 0.728, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 359, "tokens_per_second_per_gpu": 16422.02, "total_tokens": 9153580 }, { "epoch": 0.028793089658481964, "grad_norm": 0.38691309094429016, "learning_rate": 1.6184083953575543e-05, "loss": 0.7345, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 360, "tokens_per_second_per_gpu": 17226.57, "total_tokens": 9179977 }, { "epoch": 0.02887307046308886, "grad_norm": 0.38146907091140747, "learning_rate": 1.6156614753256583e-05, "loss": 0.7257, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 361, "tokens_per_second_per_gpu": 16847.61, "total_tokens": 9205256 }, { "epoch": 0.02895305126769575, "grad_norm": 0.3648886978626251, "learning_rate": 1.6129070536529767e-05, "loss": 0.7108, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 362, "tokens_per_second_per_gpu": 16919.86, "total_tokens": 9230795 }, { "epoch": 0.029033032072302647, "grad_norm": 0.39110928773880005, "learning_rate": 1.610145163901268e-05, "loss": 0.7205, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 363, "tokens_per_second_per_gpu": 16929.94, "total_tokens": 9256324 }, { "epoch": 0.029113012876909542, "grad_norm": 0.3927913308143616, "learning_rate": 1.607375839723287e-05, "loss": 0.7325, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 364, "tokens_per_second_per_gpu": 17259.35, "total_tokens": 9282305 }, { "epoch": 0.029192993681516437, "grad_norm": 0.4146783947944641, "learning_rate": 1.6045991148623752e-05, "loss": 0.7032, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 365, "tokens_per_second_per_gpu": 17299.87, "total_tokens": 9307760 }, { "epoch": 0.02927297448612333, "grad_norm": 0.38273462653160095, "learning_rate": 1.6018150231520486e-05, "loss": 0.7482, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 366, "tokens_per_second_per_gpu": 16898.79, "total_tokens": 9333318 }, { "epoch": 0.029352955290730225, "grad_norm": 0.37070807814598083, "learning_rate": 1.599023598515586e-05, "loss": 0.7562, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 367, "tokens_per_second_per_gpu": 17749.01, "total_tokens": 9360564 }, { "epoch": 0.02943293609533712, "grad_norm": 0.3885659873485565, "learning_rate": 1.5962248749656158e-05, "loss": 0.7191, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 368, "tokens_per_second_per_gpu": 17134.11, "total_tokens": 9386204 }, { "epoch": 0.029512916899944012, "grad_norm": 0.40251046419143677, "learning_rate": 1.5934188866037017e-05, "loss": 0.7055, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 369, "tokens_per_second_per_gpu": 17219.31, "total_tokens": 9412581 }, { "epoch": 0.029592897704550907, "grad_norm": 0.40094780921936035, "learning_rate": 1.5906056676199256e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 370, "tokens_per_second_per_gpu": 16162.93, "total_tokens": 9437163 }, { "epoch": 0.029672878509157802, "grad_norm": 0.41726741194725037, "learning_rate": 1.5877852522924733e-05, "loss": 0.7203, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 371, "tokens_per_second_per_gpu": 16507.8, "total_tokens": 9461430 }, { "epoch": 0.029752859313764698, "grad_norm": 0.4103233218193054, "learning_rate": 1.584957674987216e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 372, "tokens_per_second_per_gpu": 16712.62, "total_tokens": 9485701 }, { "epoch": 0.02983284011837159, "grad_norm": 0.4164546728134155, "learning_rate": 1.5821229701572897e-05, "loss": 0.741, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 373, "tokens_per_second_per_gpu": 16965.21, "total_tokens": 9511120 }, { "epoch": 0.029912820922978485, "grad_norm": 0.3924483358860016, "learning_rate": 1.5792811723426787e-05, "loss": 0.7683, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 374, "tokens_per_second_per_gpu": 17353.54, "total_tokens": 9537186 }, { "epoch": 0.02999280172758538, "grad_norm": 0.4054587781429291, "learning_rate": 1.5764323161697933e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 375, "tokens_per_second_per_gpu": 17264.54, "total_tokens": 9563703 }, { "epoch": 0.030072782532192272, "grad_norm": 0.3829587996006012, "learning_rate": 1.573576436351046e-05, "loss": 0.73, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 376, "tokens_per_second_per_gpu": 17249.09, "total_tokens": 9589768 }, { "epoch": 0.030152763336799168, "grad_norm": 0.4045129418373108, "learning_rate": 1.570713567684432e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 377, "tokens_per_second_per_gpu": 16495.57, "total_tokens": 9614554 }, { "epoch": 0.030232744141406063, "grad_norm": 0.42311742901802063, "learning_rate": 1.5678437450531014e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 378, "tokens_per_second_per_gpu": 17035.79, "total_tokens": 9639664 }, { "epoch": 0.030312724946012958, "grad_norm": 0.40890172123908997, "learning_rate": 1.564967003424938e-05, "loss": 0.7218, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 379, "tokens_per_second_per_gpu": 16421.78, "total_tokens": 9663997 }, { "epoch": 0.03039270575061985, "grad_norm": 0.37312084436416626, "learning_rate": 1.5620833778521306e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 380, "tokens_per_second_per_gpu": 17273.67, "total_tokens": 9690145 }, { "epoch": 0.030472686555226745, "grad_norm": 0.40423229336738586, "learning_rate": 1.5591929034707468e-05, "loss": 0.7155, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 381, "tokens_per_second_per_gpu": 17179.5, "total_tokens": 9715856 }, { "epoch": 0.03055266735983364, "grad_norm": 0.3965972363948822, "learning_rate": 1.556295615500305e-05, "loss": 0.7335, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 382, "tokens_per_second_per_gpu": 16914.33, "total_tokens": 9740705 }, { "epoch": 0.030632648164440533, "grad_norm": 0.39814358949661255, "learning_rate": 1.553391549243344e-05, "loss": 0.6777, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 383, "tokens_per_second_per_gpu": 16778.95, "total_tokens": 9765413 }, { "epoch": 0.030712628969047428, "grad_norm": 0.41755273938179016, "learning_rate": 1.5504807400849957e-05, "loss": 0.7882, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 384, "tokens_per_second_per_gpu": 16864.26, "total_tokens": 9790820 }, { "epoch": 0.030792609773654323, "grad_norm": 0.4052574932575226, "learning_rate": 1.5475632234925505e-05, "loss": 0.7715, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 385, "tokens_per_second_per_gpu": 17356.01, "total_tokens": 9817140 }, { "epoch": 0.03087259057826122, "grad_norm": 0.3887154757976532, "learning_rate": 1.5446390350150272e-05, "loss": 0.6877, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 386, "tokens_per_second_per_gpu": 16417.7, "total_tokens": 9841945 }, { "epoch": 0.03095257138286811, "grad_norm": 0.39953020215034485, "learning_rate": 1.54170821028274e-05, "loss": 0.7477, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 387, "tokens_per_second_per_gpu": 17048.84, "total_tokens": 9867366 }, { "epoch": 0.031032552187475006, "grad_norm": 0.3856733441352844, "learning_rate": 1.5387707850068633e-05, "loss": 0.654, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 388, "tokens_per_second_per_gpu": 16843.48, "total_tokens": 9892604 }, { "epoch": 0.0311125329920819, "grad_norm": 0.3791309595108032, "learning_rate": 1.5358267949789968e-05, "loss": 0.7138, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 389, "tokens_per_second_per_gpu": 17779.4, "total_tokens": 9919047 }, { "epoch": 0.031192513796688796, "grad_norm": 0.4217212498188019, "learning_rate": 1.53287627607073e-05, "loss": 0.7381, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 390, "tokens_per_second_per_gpu": 17139.02, "total_tokens": 9944164 }, { "epoch": 0.03127249460129569, "grad_norm": 0.3937268853187561, "learning_rate": 1.529919264233205e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 391, "tokens_per_second_per_gpu": 16873.44, "total_tokens": 9969559 }, { "epoch": 0.03135247540590258, "grad_norm": 0.39358460903167725, "learning_rate": 1.5269557954966777e-05, "loss": 0.7156, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 392, "tokens_per_second_per_gpu": 16668.99, "total_tokens": 9994082 }, { "epoch": 0.031432456210509475, "grad_norm": 0.41820088028907776, "learning_rate": 1.5239859059700794e-05, "loss": 0.7444, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 393, "tokens_per_second_per_gpu": 16729.16, "total_tokens": 10019253 }, { "epoch": 0.03151243701511637, "grad_norm": 0.40098121762275696, "learning_rate": 1.5210096318405768e-05, "loss": 0.7275, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 394, "tokens_per_second_per_gpu": 17146.85, "total_tokens": 10044970 }, { "epoch": 0.031592417819723266, "grad_norm": 0.3832881450653076, "learning_rate": 1.5180270093731305e-05, "loss": 0.7174, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 395, "tokens_per_second_per_gpu": 16825.22, "total_tokens": 10069975 }, { "epoch": 0.03167239862433016, "grad_norm": 0.5176158547401428, "learning_rate": 1.5150380749100545e-05, "loss": 0.7295, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 396, "tokens_per_second_per_gpu": 17251.5, "total_tokens": 10095923 }, { "epoch": 0.03175237942893706, "grad_norm": 0.3928660452365875, "learning_rate": 1.5120428648705716e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 397, "tokens_per_second_per_gpu": 17162.01, "total_tokens": 10122361 }, { "epoch": 0.03183236023354395, "grad_norm": 0.3940604627132416, "learning_rate": 1.5090414157503715e-05, "loss": 0.7341, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 398, "tokens_per_second_per_gpu": 16976.56, "total_tokens": 10148210 }, { "epoch": 0.03191234103815084, "grad_norm": 0.4209328591823578, "learning_rate": 1.5060337641211637e-05, "loss": 0.7186, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 399, "tokens_per_second_per_gpu": 16919.75, "total_tokens": 10173572 }, { "epoch": 0.031992321842757736, "grad_norm": 0.40747904777526855, "learning_rate": 1.5030199466302354e-05, "loss": 0.7456, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 400, "tokens_per_second_per_gpu": 17805.14, "total_tokens": 10200493 }, { "epoch": 0.03207230264736463, "grad_norm": 0.46691301465034485, "learning_rate": 1.5000000000000002e-05, "loss": 0.726, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 401, "tokens_per_second_per_gpu": 16555.25, "total_tokens": 10224698 }, { "epoch": 0.032152283451971526, "grad_norm": 0.3882039487361908, "learning_rate": 1.4969739610275556e-05, "loss": 0.6911, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 402, "tokens_per_second_per_gpu": 17460.29, "total_tokens": 10250839 }, { "epoch": 0.03223226425657842, "grad_norm": 0.41841983795166016, "learning_rate": 1.493941866584231e-05, "loss": 0.7002, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 403, "tokens_per_second_per_gpu": 17185.88, "total_tokens": 10276595 }, { "epoch": 0.03231224506118532, "grad_norm": 0.4183862805366516, "learning_rate": 1.490903753615141e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 404, "tokens_per_second_per_gpu": 16490.44, "total_tokens": 10301334 }, { "epoch": 0.03239222586579221, "grad_norm": 0.426186740398407, "learning_rate": 1.4878596591387329e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 405, "tokens_per_second_per_gpu": 17491.41, "total_tokens": 10326588 }, { "epoch": 0.0324722066703991, "grad_norm": 0.4127671718597412, "learning_rate": 1.4848096202463373e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 406, "tokens_per_second_per_gpu": 16889.42, "total_tokens": 10351330 }, { "epoch": 0.032552187475005996, "grad_norm": 0.3885892629623413, "learning_rate": 1.4817536741017153e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 407, "tokens_per_second_per_gpu": 17271.18, "total_tokens": 10376865 }, { "epoch": 0.03263216827961289, "grad_norm": 0.4392751157283783, "learning_rate": 1.478691857940607e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 408, "tokens_per_second_per_gpu": 17086.59, "total_tokens": 10401715 }, { "epoch": 0.03271214908421979, "grad_norm": 0.4046195149421692, "learning_rate": 1.4756242090702756e-05, "loss": 0.6995, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 409, "tokens_per_second_per_gpu": 17387.64, "total_tokens": 10427512 }, { "epoch": 0.03279212988882668, "grad_norm": 0.42296287417411804, "learning_rate": 1.4725507648690542e-05, "loss": 0.6922, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 410, "tokens_per_second_per_gpu": 16433.12, "total_tokens": 10452185 }, { "epoch": 0.03287211069343358, "grad_norm": 0.41615429520606995, "learning_rate": 1.469471562785891e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 411, "tokens_per_second_per_gpu": 17170.46, "total_tokens": 10477581 }, { "epoch": 0.03295209149804047, "grad_norm": 0.4219436049461365, "learning_rate": 1.4663866403398915e-05, "loss": 0.6897, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 412, "tokens_per_second_per_gpu": 16646.71, "total_tokens": 10502411 }, { "epoch": 0.03303207230264737, "grad_norm": 0.42644554376602173, "learning_rate": 1.463296035119862e-05, "loss": 0.7273, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 413, "tokens_per_second_per_gpu": 16944.04, "total_tokens": 10527755 }, { "epoch": 0.033112053107254256, "grad_norm": 0.39926496148109436, "learning_rate": 1.4601997847838518e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 414, "tokens_per_second_per_gpu": 17372.82, "total_tokens": 10554332 }, { "epoch": 0.03319203391186115, "grad_norm": 0.40787941217422485, "learning_rate": 1.4570979270586944e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 415, "tokens_per_second_per_gpu": 17193.02, "total_tokens": 10580110 }, { "epoch": 0.03327201471646805, "grad_norm": 0.42348116636276245, "learning_rate": 1.4539904997395468e-05, "loss": 0.655, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 416, "tokens_per_second_per_gpu": 17180.72, "total_tokens": 10605425 }, { "epoch": 0.03335199552107494, "grad_norm": 0.44330260157585144, "learning_rate": 1.4508775406894308e-05, "loss": 0.7509, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 417, "tokens_per_second_per_gpu": 17173.37, "total_tokens": 10631736 }, { "epoch": 0.03343197632568184, "grad_norm": 0.44089949131011963, "learning_rate": 1.4477590878387697e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 418, "tokens_per_second_per_gpu": 17112.95, "total_tokens": 10657439 }, { "epoch": 0.03351195713028873, "grad_norm": 0.45663735270500183, "learning_rate": 1.4446351791849276e-05, "loss": 0.7088, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 419, "tokens_per_second_per_gpu": 16506.33, "total_tokens": 10681868 }, { "epoch": 0.03359193793489563, "grad_norm": 0.422953724861145, "learning_rate": 1.4415058527917454e-05, "loss": 0.7334, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 420, "tokens_per_second_per_gpu": 17558.74, "total_tokens": 10708474 }, { "epoch": 0.03367191873950252, "grad_norm": 0.4254125654697418, "learning_rate": 1.4383711467890776e-05, "loss": 0.6822, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 421, "tokens_per_second_per_gpu": 17224.81, "total_tokens": 10733585 }, { "epoch": 0.03375189954410941, "grad_norm": 0.4303964674472809, "learning_rate": 1.4352310993723277e-05, "loss": 0.7347, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 422, "tokens_per_second_per_gpu": 17144.03, "total_tokens": 10759349 }, { "epoch": 0.03383188034871631, "grad_norm": 0.422776997089386, "learning_rate": 1.4320857488019826e-05, "loss": 0.7005, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 423, "tokens_per_second_per_gpu": 16822.49, "total_tokens": 10785174 }, { "epoch": 0.0339118611533232, "grad_norm": 0.4445240795612335, "learning_rate": 1.4289351334031461e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 424, "tokens_per_second_per_gpu": 17287.26, "total_tokens": 10810894 }, { "epoch": 0.0339918419579301, "grad_norm": 0.402654767036438, "learning_rate": 1.4257792915650728e-05, "loss": 0.7211, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 425, "tokens_per_second_per_gpu": 14675.92, "total_tokens": 10836666 }, { "epoch": 0.034071822762536993, "grad_norm": 0.4416694939136505, "learning_rate": 1.4226182617406996e-05, "loss": 0.7003, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 426, "tokens_per_second_per_gpu": 16632.17, "total_tokens": 10861229 }, { "epoch": 0.03415180356714389, "grad_norm": 0.41705960035324097, "learning_rate": 1.4194520824461773e-05, "loss": 0.7096, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 427, "tokens_per_second_per_gpu": 17211.97, "total_tokens": 10887118 }, { "epoch": 0.03423178437175078, "grad_norm": 0.4063047170639038, "learning_rate": 1.4162807922604014e-05, "loss": 0.6899, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 428, "tokens_per_second_per_gpu": 16943.74, "total_tokens": 10912629 }, { "epoch": 0.03431176517635767, "grad_norm": 0.46809977293014526, "learning_rate": 1.413104429824542e-05, "loss": 0.7048, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 429, "tokens_per_second_per_gpu": 16626.15, "total_tokens": 10937453 }, { "epoch": 0.03439174598096457, "grad_norm": 0.4412693977355957, "learning_rate": 1.4099230338415728e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 430, "tokens_per_second_per_gpu": 16983.01, "total_tokens": 10962482 }, { "epoch": 0.03447172678557146, "grad_norm": 0.43178603053092957, "learning_rate": 1.4067366430758004e-05, "loss": 0.6916, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 431, "tokens_per_second_per_gpu": 16939.86, "total_tokens": 10987805 }, { "epoch": 0.03455170759017836, "grad_norm": 0.443692147731781, "learning_rate": 1.4035452963523903e-05, "loss": 0.7305, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 432, "tokens_per_second_per_gpu": 16964.69, "total_tokens": 11013753 }, { "epoch": 0.034631688394785254, "grad_norm": 0.4076201915740967, "learning_rate": 1.4003490325568953e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 433, "tokens_per_second_per_gpu": 17153.51, "total_tokens": 11039930 }, { "epoch": 0.03471166919939215, "grad_norm": 0.44919684529304504, "learning_rate": 1.3971478906347806e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 434, "tokens_per_second_per_gpu": 16898.36, "total_tokens": 11065066 }, { "epoch": 0.03479165000399904, "grad_norm": 0.4365704655647278, "learning_rate": 1.3939419095909513e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 435, "tokens_per_second_per_gpu": 16954.82, "total_tokens": 11090213 }, { "epoch": 0.03487163080860593, "grad_norm": 0.4258210062980652, "learning_rate": 1.3907311284892737e-05, "loss": 0.7079, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 436, "tokens_per_second_per_gpu": 17515.26, "total_tokens": 11116316 }, { "epoch": 0.03495161161321283, "grad_norm": 0.4155106544494629, "learning_rate": 1.3875155864521031e-05, "loss": 0.7349, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 437, "tokens_per_second_per_gpu": 17376.63, "total_tokens": 11143315 }, { "epoch": 0.035031592417819724, "grad_norm": 0.45664307475090027, "learning_rate": 1.3842953226598036e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 438, "tokens_per_second_per_gpu": 16759.03, "total_tokens": 11167988 }, { "epoch": 0.03511157322242662, "grad_norm": 0.4296400249004364, "learning_rate": 1.3810703763502744e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 439, "tokens_per_second_per_gpu": 17066.98, "total_tokens": 11193249 }, { "epoch": 0.035191554027033514, "grad_norm": 0.4324433207511902, "learning_rate": 1.3778407868184674e-05, "loss": 0.7137, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 440, "tokens_per_second_per_gpu": 16678.98, "total_tokens": 11217984 }, { "epoch": 0.03527153483164041, "grad_norm": 0.4287432134151459, "learning_rate": 1.3746065934159123e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 441, "tokens_per_second_per_gpu": 16398.96, "total_tokens": 11242621 }, { "epoch": 0.0353515156362473, "grad_norm": 0.4307049810886383, "learning_rate": 1.371367835550235e-05, "loss": 0.7475, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 442, "tokens_per_second_per_gpu": 17000.86, "total_tokens": 11268097 }, { "epoch": 0.03543149644085419, "grad_norm": 0.42402443289756775, "learning_rate": 1.3681245526846782e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 443, "tokens_per_second_per_gpu": 17005.71, "total_tokens": 11293129 }, { "epoch": 0.03551147724546109, "grad_norm": 0.4233229458332062, "learning_rate": 1.3648767843376196e-05, "loss": 0.6949, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 444, "tokens_per_second_per_gpu": 17142.72, "total_tokens": 11318904 }, { "epoch": 0.035591458050067984, "grad_norm": 0.441266268491745, "learning_rate": 1.3616245700820922e-05, "loss": 0.7124, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 445, "tokens_per_second_per_gpu": 16907.05, "total_tokens": 11344667 }, { "epoch": 0.03567143885467488, "grad_norm": 0.45229724049568176, "learning_rate": 1.3583679495453e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 446, "tokens_per_second_per_gpu": 16779.4, "total_tokens": 11370012 }, { "epoch": 0.035751419659281775, "grad_norm": 0.4272010326385498, "learning_rate": 1.3551069624081372e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 447, "tokens_per_second_per_gpu": 16302.12, "total_tokens": 11394710 }, { "epoch": 0.03583140046388867, "grad_norm": 0.4327336251735687, "learning_rate": 1.3518416484047018e-05, "loss": 0.6747, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 448, "tokens_per_second_per_gpu": 16728.59, "total_tokens": 11419699 }, { "epoch": 0.03591138126849556, "grad_norm": 0.4202955961227417, "learning_rate": 1.3485720473218153e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 449, "tokens_per_second_per_gpu": 17303.71, "total_tokens": 11445057 }, { "epoch": 0.035991362073102454, "grad_norm": 0.4030447006225586, "learning_rate": 1.3452981989985347e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 450, "tokens_per_second_per_gpu": 17036.83, "total_tokens": 11470311 }, { "epoch": 0.03607134287770935, "grad_norm": 0.4464939534664154, "learning_rate": 1.342020143325669e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 451, "tokens_per_second_per_gpu": 16636.45, "total_tokens": 11495272 }, { "epoch": 0.036151323682316244, "grad_norm": 0.41173145174980164, "learning_rate": 1.3387379202452917e-05, "loss": 0.6944, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 452, "tokens_per_second_per_gpu": 17471.58, "total_tokens": 11521756 }, { "epoch": 0.03623130448692314, "grad_norm": 0.43435975909233093, "learning_rate": 1.3354515697502552e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 453, "tokens_per_second_per_gpu": 16937.9, "total_tokens": 11547286 }, { "epoch": 0.036311285291530035, "grad_norm": 0.442965030670166, "learning_rate": 1.3321611318837033e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 454, "tokens_per_second_per_gpu": 16823.21, "total_tokens": 11572126 }, { "epoch": 0.03639126609613693, "grad_norm": 0.4620346128940582, "learning_rate": 1.3288666467385834e-05, "loss": 0.7231, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 455, "tokens_per_second_per_gpu": 17179.18, "total_tokens": 11597500 }, { "epoch": 0.03647124690074382, "grad_norm": 0.4446198642253876, "learning_rate": 1.3255681544571568e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 456, "tokens_per_second_per_gpu": 16641.55, "total_tokens": 11621978 }, { "epoch": 0.036551227705350714, "grad_norm": 0.44696947932243347, "learning_rate": 1.3222656952305113e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 457, "tokens_per_second_per_gpu": 17202.52, "total_tokens": 11647603 }, { "epoch": 0.03663120850995761, "grad_norm": 0.446732759475708, "learning_rate": 1.3189593092980701e-05, "loss": 0.7131, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 458, "tokens_per_second_per_gpu": 17469.88, "total_tokens": 11673603 }, { "epoch": 0.036711189314564505, "grad_norm": 0.44011181592941284, "learning_rate": 1.3156490369471026e-05, "loss": 0.6603, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 459, "tokens_per_second_per_gpu": 16763.75, "total_tokens": 11698721 }, { "epoch": 0.0367911701191714, "grad_norm": 0.47020354866981506, "learning_rate": 1.3123349185122328e-05, "loss": 0.6767, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 460, "tokens_per_second_per_gpu": 16818.73, "total_tokens": 11723824 }, { "epoch": 0.036871150923778295, "grad_norm": 0.4808385670185089, "learning_rate": 1.3090169943749475e-05, "loss": 0.7207, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 461, "tokens_per_second_per_gpu": 16326.55, "total_tokens": 11747939 }, { "epoch": 0.03695113172838519, "grad_norm": 0.4525218904018402, "learning_rate": 1.3056953049631059e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 462, "tokens_per_second_per_gpu": 17413.36, "total_tokens": 11774028 }, { "epoch": 0.03703111253299208, "grad_norm": 0.4264589250087738, "learning_rate": 1.3023698907504447e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 463, "tokens_per_second_per_gpu": 16640.79, "total_tokens": 11798824 }, { "epoch": 0.037111093337598974, "grad_norm": 0.43030428886413574, "learning_rate": 1.2990407922560869e-05, "loss": 0.656, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 464, "tokens_per_second_per_gpu": 16456.8, "total_tokens": 11822904 }, { "epoch": 0.03719107414220587, "grad_norm": 0.43640056252479553, "learning_rate": 1.2957080500440469e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 465, "tokens_per_second_per_gpu": 17006.93, "total_tokens": 11848488 }, { "epoch": 0.037271054946812765, "grad_norm": 0.44964516162872314, "learning_rate": 1.2923717047227368e-05, "loss": 0.6901, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 466, "tokens_per_second_per_gpu": 17548.04, "total_tokens": 11874583 }, { "epoch": 0.03735103575141966, "grad_norm": 0.4395027160644531, "learning_rate": 1.2890317969444716e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 467, "tokens_per_second_per_gpu": 16482.1, "total_tokens": 11899197 }, { "epoch": 0.037431016556026556, "grad_norm": 0.42954379320144653, "learning_rate": 1.2856883674049736e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 468, "tokens_per_second_per_gpu": 17318.43, "total_tokens": 11924491 }, { "epoch": 0.03751099736063345, "grad_norm": 0.4213207960128784, "learning_rate": 1.2823414568428767e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 469, "tokens_per_second_per_gpu": 16459.46, "total_tokens": 11948982 }, { "epoch": 0.03759097816524034, "grad_norm": 0.43104055523872375, "learning_rate": 1.2789911060392295e-05, "loss": 0.7354, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 470, "tokens_per_second_per_gpu": 17537.58, "total_tokens": 11976515 }, { "epoch": 0.037670958969847235, "grad_norm": 0.4502396881580353, "learning_rate": 1.2756373558169992e-05, "loss": 0.675, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 471, "tokens_per_second_per_gpu": 17392.8, "total_tokens": 12002880 }, { "epoch": 0.03775093977445413, "grad_norm": 0.45354557037353516, "learning_rate": 1.2722802470405744e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 472, "tokens_per_second_per_gpu": 16881.98, "total_tokens": 12028090 }, { "epoch": 0.037830920579061025, "grad_norm": 0.43540510535240173, "learning_rate": 1.2689198206152657e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 473, "tokens_per_second_per_gpu": 17161.61, "total_tokens": 12053924 }, { "epoch": 0.03791090138366792, "grad_norm": 0.4614422917366028, "learning_rate": 1.265556117486809e-05, "loss": 0.7256, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 474, "tokens_per_second_per_gpu": 17023.83, "total_tokens": 12079474 }, { "epoch": 0.037990882188274816, "grad_norm": 0.44551095366477966, "learning_rate": 1.2621891786408648e-05, "loss": 0.7414, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 475, "tokens_per_second_per_gpu": 17029.63, "total_tokens": 12105739 }, { "epoch": 0.03807086299288171, "grad_norm": 0.45504751801490784, "learning_rate": 1.2588190451025209e-05, "loss": 0.717, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 476, "tokens_per_second_per_gpu": 17089.58, "total_tokens": 12131563 }, { "epoch": 0.0381508437974886, "grad_norm": 0.4884074628353119, "learning_rate": 1.2554457579357906e-05, "loss": 0.7327, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 477, "tokens_per_second_per_gpu": 17239.01, "total_tokens": 12156874 }, { "epoch": 0.038230824602095495, "grad_norm": 0.4748455882072449, "learning_rate": 1.252069358243114e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 478, "tokens_per_second_per_gpu": 17469.29, "total_tokens": 12183330 }, { "epoch": 0.03831080540670239, "grad_norm": 0.4526073634624481, "learning_rate": 1.2486898871648552e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 479, "tokens_per_second_per_gpu": 16705.93, "total_tokens": 12207907 }, { "epoch": 0.038390786211309286, "grad_norm": 0.4595562517642975, "learning_rate": 1.2453073858788027e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 480, "tokens_per_second_per_gpu": 16815.36, "total_tokens": 12233006 }, { "epoch": 0.03847076701591618, "grad_norm": 0.4445168673992157, "learning_rate": 1.2419218955996677e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 481, "tokens_per_second_per_gpu": 16899.74, "total_tokens": 12258436 }, { "epoch": 0.038550747820523076, "grad_norm": 0.4498426914215088, "learning_rate": 1.238533457578581e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 482, "tokens_per_second_per_gpu": 16656.0, "total_tokens": 12283200 }, { "epoch": 0.03863072862512997, "grad_norm": 0.48890069127082825, "learning_rate": 1.23514211310259e-05, "loss": 0.7179, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 483, "tokens_per_second_per_gpu": 17130.89, "total_tokens": 12309222 }, { "epoch": 0.03871070942973686, "grad_norm": 0.4737612307071686, "learning_rate": 1.2317479034941572e-05, "loss": 0.711, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 484, "tokens_per_second_per_gpu": 16815.6, "total_tokens": 12334335 }, { "epoch": 0.038790690234343755, "grad_norm": 0.4556877315044403, "learning_rate": 1.2283508701106559e-05, "loss": 0.7006, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 485, "tokens_per_second_per_gpu": 17323.63, "total_tokens": 12360225 }, { "epoch": 0.03887067103895065, "grad_norm": 0.4712156057357788, "learning_rate": 1.2249510543438652e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 486, "tokens_per_second_per_gpu": 16818.52, "total_tokens": 12385730 }, { "epoch": 0.038950651843557546, "grad_norm": 0.45326972007751465, "learning_rate": 1.2215484976194675e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 487, "tokens_per_second_per_gpu": 16917.67, "total_tokens": 12410731 }, { "epoch": 0.03903063264816444, "grad_norm": 0.4285866916179657, "learning_rate": 1.2181432413965428e-05, "loss": 0.6759, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 488, "tokens_per_second_per_gpu": 17554.28, "total_tokens": 12437737 }, { "epoch": 0.03911061345277134, "grad_norm": 0.4505816400051117, "learning_rate": 1.2147353271670634e-05, "loss": 0.7116, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 489, "tokens_per_second_per_gpu": 17837.91, "total_tokens": 12465159 }, { "epoch": 0.03919059425737823, "grad_norm": 0.4805770814418793, "learning_rate": 1.211324796455389e-05, "loss": 0.6968, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 490, "tokens_per_second_per_gpu": 16656.0, "total_tokens": 12490064 }, { "epoch": 0.03927057506198512, "grad_norm": 0.45226889848709106, "learning_rate": 1.2079116908177592e-05, "loss": 0.6759, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 491, "tokens_per_second_per_gpu": 17115.44, "total_tokens": 12516094 }, { "epoch": 0.039350555866592016, "grad_norm": 0.4620254635810852, "learning_rate": 1.2044960518417902e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 492, "tokens_per_second_per_gpu": 16855.59, "total_tokens": 12541699 }, { "epoch": 0.03943053667119891, "grad_norm": 0.44682419300079346, "learning_rate": 1.2010779211459649e-05, "loss": 0.6887, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 493, "tokens_per_second_per_gpu": 16927.33, "total_tokens": 12567064 }, { "epoch": 0.039510517475805806, "grad_norm": 0.4683786928653717, "learning_rate": 1.1976573403791263e-05, "loss": 0.658, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 494, "tokens_per_second_per_gpu": 16693.06, "total_tokens": 12591658 }, { "epoch": 0.0395904982804127, "grad_norm": 0.4709741771221161, "learning_rate": 1.194234351219972e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 495, "tokens_per_second_per_gpu": 16319.58, "total_tokens": 12615802 }, { "epoch": 0.0396704790850196, "grad_norm": 0.7030223608016968, "learning_rate": 1.190808995376545e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 496, "tokens_per_second_per_gpu": 16896.03, "total_tokens": 12640099 }, { "epoch": 0.03975045988962649, "grad_norm": 0.4555974304676056, "learning_rate": 1.187381314585725e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 497, "tokens_per_second_per_gpu": 17402.31, "total_tokens": 12666288 }, { "epoch": 0.03983044069423338, "grad_norm": 0.4940910041332245, "learning_rate": 1.1839513506127202e-05, "loss": 0.681, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 498, "tokens_per_second_per_gpu": 16930.11, "total_tokens": 12691469 }, { "epoch": 0.039910421498840276, "grad_norm": 0.4535921812057495, "learning_rate": 1.1805191452505602e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 499, "tokens_per_second_per_gpu": 16906.81, "total_tokens": 12716852 }, { "epoch": 0.03999040230344717, "grad_norm": 0.46495068073272705, "learning_rate": 1.1770847403195836e-05, "loss": 0.7064, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 500, "tokens_per_second_per_gpu": 17272.14, "total_tokens": 12742985 }, { "epoch": 0.04007038310805407, "grad_norm": 0.46297863125801086, "learning_rate": 1.1736481776669307e-05, "loss": 0.7218, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 501, "tokens_per_second_per_gpu": 16950.12, "total_tokens": 12768836 }, { "epoch": 0.04015036391266096, "grad_norm": 0.4618571698665619, "learning_rate": 1.1702094991660326e-05, "loss": 0.674, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 502, "tokens_per_second_per_gpu": 16882.33, "total_tokens": 12794066 }, { "epoch": 0.04023034471726786, "grad_norm": 0.44983258843421936, "learning_rate": 1.1667687467161025e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 503, "tokens_per_second_per_gpu": 17461.33, "total_tokens": 12820375 }, { "epoch": 0.04031032552187475, "grad_norm": 0.46179690957069397, "learning_rate": 1.1633259622416224e-05, "loss": 0.6698, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 504, "tokens_per_second_per_gpu": 16827.05, "total_tokens": 12845726 }, { "epoch": 0.04039030632648164, "grad_norm": 0.4472286105155945, "learning_rate": 1.159881187691835e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 505, "tokens_per_second_per_gpu": 17124.23, "total_tokens": 12872001 }, { "epoch": 0.040470287131088536, "grad_norm": 0.4627981185913086, "learning_rate": 1.156434465040231e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 506, "tokens_per_second_per_gpu": 16904.21, "total_tokens": 12897321 }, { "epoch": 0.04055026793569543, "grad_norm": 0.44518762826919556, "learning_rate": 1.1529858362840383e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 507, "tokens_per_second_per_gpu": 17281.89, "total_tokens": 12923660 }, { "epoch": 0.04063024874030233, "grad_norm": 0.4409578740596771, "learning_rate": 1.1495353434437098e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 508, "tokens_per_second_per_gpu": 17102.59, "total_tokens": 12949277 }, { "epoch": 0.04071022954490922, "grad_norm": 0.44398391246795654, "learning_rate": 1.1460830285624119e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 509, "tokens_per_second_per_gpu": 17247.23, "total_tokens": 12975352 }, { "epoch": 0.04079021034951612, "grad_norm": 0.4975646436214447, "learning_rate": 1.1426289337055119e-05, "loss": 0.6131, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 510, "tokens_per_second_per_gpu": 15943.69, "total_tokens": 12998651 }, { "epoch": 0.04087019115412301, "grad_norm": 0.48738542199134827, "learning_rate": 1.1391731009600655e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 511, "tokens_per_second_per_gpu": 16154.38, "total_tokens": 13022893 }, { "epoch": 0.0409501719587299, "grad_norm": 0.4914393723011017, "learning_rate": 1.1357155724343046e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 512, "tokens_per_second_per_gpu": 16945.85, "total_tokens": 13047605 }, { "epoch": 0.0410301527633368, "grad_norm": 0.45245736837387085, "learning_rate": 1.1322563902571227e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 513, "tokens_per_second_per_gpu": 17274.69, "total_tokens": 13073795 }, { "epoch": 0.04111013356794369, "grad_norm": 0.4842854142189026, "learning_rate": 1.128795596577563e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 514, "tokens_per_second_per_gpu": 17081.96, "total_tokens": 13099940 }, { "epoch": 0.04119011437255059, "grad_norm": 0.5243505835533142, "learning_rate": 1.1253332335643043e-05, "loss": 0.675, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 515, "tokens_per_second_per_gpu": 16642.09, "total_tokens": 13124720 }, { "epoch": 0.04127009517715748, "grad_norm": 0.46914488077163696, "learning_rate": 1.1218693434051475e-05, "loss": 0.6719, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 516, "tokens_per_second_per_gpu": 16966.49, "total_tokens": 13150010 }, { "epoch": 0.04135007598176438, "grad_norm": 0.44769319891929626, "learning_rate": 1.1184039683065014e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 517, "tokens_per_second_per_gpu": 17236.72, "total_tokens": 13176461 }, { "epoch": 0.041430056786371273, "grad_norm": 0.4807461202144623, "learning_rate": 1.1149371504928667e-05, "loss": 0.7321, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 518, "tokens_per_second_per_gpu": 16848.78, "total_tokens": 13202015 }, { "epoch": 0.04151003759097816, "grad_norm": 0.4664666950702667, "learning_rate": 1.1114689322063255e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 519, "tokens_per_second_per_gpu": 17132.94, "total_tokens": 13228010 }, { "epoch": 0.04159001839558506, "grad_norm": 0.48780035972595215, "learning_rate": 1.1079993557060228e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 520, "tokens_per_second_per_gpu": 16422.78, "total_tokens": 13252575 }, { "epoch": 0.04166999920019195, "grad_norm": 0.4693656861782074, "learning_rate": 1.1045284632676535e-05, "loss": 0.6891, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 521, "tokens_per_second_per_gpu": 17555.32, "total_tokens": 13278801 }, { "epoch": 0.04174998000479885, "grad_norm": 0.458926796913147, "learning_rate": 1.1010562971829464e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 522, "tokens_per_second_per_gpu": 16075.34, "total_tokens": 13302766 }, { "epoch": 0.04182996080940574, "grad_norm": 0.462158739566803, "learning_rate": 1.0975828997591496e-05, "loss": 0.6799, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 523, "tokens_per_second_per_gpu": 17398.83, "total_tokens": 13329089 }, { "epoch": 0.04190994161401264, "grad_norm": 0.4593111276626587, "learning_rate": 1.0941083133185146e-05, "loss": 0.7031, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 524, "tokens_per_second_per_gpu": 17006.54, "total_tokens": 13355144 }, { "epoch": 0.041989922418619534, "grad_norm": 0.46989020705223083, "learning_rate": 1.0906325801977804e-05, "loss": 0.7105, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 525, "tokens_per_second_per_gpu": 17221.48, "total_tokens": 13381015 }, { "epoch": 0.04206990322322642, "grad_norm": 0.46403929591178894, "learning_rate": 1.0871557427476585e-05, "loss": 0.6809, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 526, "tokens_per_second_per_gpu": 17428.79, "total_tokens": 13407634 }, { "epoch": 0.04214988402783332, "grad_norm": 0.44122979044914246, "learning_rate": 1.083677843332316e-05, "loss": 0.661, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 527, "tokens_per_second_per_gpu": 17050.17, "total_tokens": 13434044 }, { "epoch": 0.04222986483244021, "grad_norm": 0.46086767315864563, "learning_rate": 1.0801989243288588e-05, "loss": 0.692, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 528, "tokens_per_second_per_gpu": 17260.71, "total_tokens": 13460422 }, { "epoch": 0.04230984563704711, "grad_norm": 0.4788115918636322, "learning_rate": 1.0767190281268187e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 529, "tokens_per_second_per_gpu": 16919.47, "total_tokens": 13485959 }, { "epoch": 0.042389826441654004, "grad_norm": 0.46982550621032715, "learning_rate": 1.0732381971276318e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 530, "tokens_per_second_per_gpu": 16852.02, "total_tokens": 13510769 }, { "epoch": 0.0424698072462609, "grad_norm": 0.4891279339790344, "learning_rate": 1.0697564737441254e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 531, "tokens_per_second_per_gpu": 16554.57, "total_tokens": 13535563 }, { "epoch": 0.042549788050867794, "grad_norm": 0.4954123795032501, "learning_rate": 1.0662739004000005e-05, "loss": 0.6915, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 532, "tokens_per_second_per_gpu": 17455.25, "total_tokens": 13561469 }, { "epoch": 0.04262976885547469, "grad_norm": 0.49951866269111633, "learning_rate": 1.0627905195293135e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 533, "tokens_per_second_per_gpu": 17206.21, "total_tokens": 13587381 }, { "epoch": 0.04270974966008158, "grad_norm": 0.4457262456417084, "learning_rate": 1.0593063735759619e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 534, "tokens_per_second_per_gpu": 17794.38, "total_tokens": 13614365 }, { "epoch": 0.04278973046468847, "grad_norm": 0.475887656211853, "learning_rate": 1.055821504993164e-05, "loss": 0.634, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 535, "tokens_per_second_per_gpu": 16492.61, "total_tokens": 13639374 }, { "epoch": 0.04286971126929537, "grad_norm": 0.48933205008506775, "learning_rate": 1.0523359562429441e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 536, "tokens_per_second_per_gpu": 17192.3, "total_tokens": 13665700 }, { "epoch": 0.042949692073902264, "grad_norm": 0.5178970098495483, "learning_rate": 1.0488497697956134e-05, "loss": 0.7028, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 537, "tokens_per_second_per_gpu": 16638.15, "total_tokens": 13690118 }, { "epoch": 0.04302967287850916, "grad_norm": 0.4723743498325348, "learning_rate": 1.0453629881292537e-05, "loss": 0.689, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 538, "tokens_per_second_per_gpu": 17283.06, "total_tokens": 13716392 }, { "epoch": 0.043109653683116055, "grad_norm": 0.5020018219947815, "learning_rate": 1.0418756537291996e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 539, "tokens_per_second_per_gpu": 17027.19, "total_tokens": 13741612 }, { "epoch": 0.04318963448772295, "grad_norm": 0.5196510553359985, "learning_rate": 1.03838780908752e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 540, "tokens_per_second_per_gpu": 16934.53, "total_tokens": 13767073 }, { "epoch": 0.04326961529232984, "grad_norm": 0.4690985083580017, "learning_rate": 1.0348994967025012e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 541, "tokens_per_second_per_gpu": 16886.26, "total_tokens": 13792114 }, { "epoch": 0.043349596096936734, "grad_norm": 0.5237311124801636, "learning_rate": 1.0314107590781284e-05, "loss": 0.6957, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 542, "tokens_per_second_per_gpu": 16878.17, "total_tokens": 13817155 }, { "epoch": 0.04342957690154363, "grad_norm": 0.47251269221305847, "learning_rate": 1.0279216387235691e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 543, "tokens_per_second_per_gpu": 17364.53, "total_tokens": 13843641 }, { "epoch": 0.043509557706150524, "grad_norm": 0.49236616492271423, "learning_rate": 1.0244321781526533e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 544, "tokens_per_second_per_gpu": 17090.74, "total_tokens": 13869705 }, { "epoch": 0.04358953851075742, "grad_norm": 0.4944368898868561, "learning_rate": 1.0209424198833571e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 545, "tokens_per_second_per_gpu": 17123.06, "total_tokens": 13895270 }, { "epoch": 0.043669519315364315, "grad_norm": 0.4860251545906067, "learning_rate": 1.0174524064372837e-05, "loss": 0.6804, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 546, "tokens_per_second_per_gpu": 16587.1, "total_tokens": 13920244 }, { "epoch": 0.04374950011997121, "grad_norm": 0.48462778329849243, "learning_rate": 1.0139621803391454e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 547, "tokens_per_second_per_gpu": 16738.82, "total_tokens": 13945409 }, { "epoch": 0.0438294809245781, "grad_norm": 0.4959378242492676, "learning_rate": 1.010471784116246e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 548, "tokens_per_second_per_gpu": 16547.11, "total_tokens": 13970587 }, { "epoch": 0.043909461729184994, "grad_norm": 0.4693349301815033, "learning_rate": 1.0069812602979617e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 549, "tokens_per_second_per_gpu": 17222.81, "total_tokens": 13996670 }, { "epoch": 0.04398944253379189, "grad_norm": 0.4579184055328369, "learning_rate": 1.0034906514152239e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 550, "tokens_per_second_per_gpu": 17389.32, "total_tokens": 14022626 }, { "epoch": 0.044069423338398785, "grad_norm": 0.46185624599456787, "learning_rate": 1e-05, "loss": 0.7294, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 551, "tokens_per_second_per_gpu": 17336.46, "total_tokens": 14049035 }, { "epoch": 0.04414940414300568, "grad_norm": 0.4870699644088745, "learning_rate": 9.965093485847766e-06, "loss": 0.6866, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 552, "tokens_per_second_per_gpu": 17535.72, "total_tokens": 14075514 }, { "epoch": 0.044229384947612575, "grad_norm": 0.4829731285572052, "learning_rate": 9.930187397020385e-06, "loss": 0.6143, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 553, "tokens_per_second_per_gpu": 16359.15, "total_tokens": 14099952 }, { "epoch": 0.04430936575221947, "grad_norm": 0.4855392575263977, "learning_rate": 9.895282158837545e-06, "loss": 0.6524, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 554, "tokens_per_second_per_gpu": 16914.54, "total_tokens": 14125208 }, { "epoch": 0.04438934655682636, "grad_norm": 0.5001446604728699, "learning_rate": 9.860378196608549e-06, "loss": 0.6716, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 555, "tokens_per_second_per_gpu": 16896.62, "total_tokens": 14150850 }, { "epoch": 0.044469327361433254, "grad_norm": 0.45474767684936523, "learning_rate": 9.825475935627165e-06, "loss": 0.6442, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 556, "tokens_per_second_per_gpu": 16980.77, "total_tokens": 14176498 }, { "epoch": 0.04454930816604015, "grad_norm": 0.4773014485836029, "learning_rate": 9.790575801166432e-06, "loss": 0.6755, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 557, "tokens_per_second_per_gpu": 16906.61, "total_tokens": 14202548 }, { "epoch": 0.044629288970647045, "grad_norm": 0.4736998379230499, "learning_rate": 9.75567821847347e-06, "loss": 0.6523, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 558, "tokens_per_second_per_gpu": 17269.9, "total_tokens": 14228353 }, { "epoch": 0.04470926977525394, "grad_norm": 0.47355714440345764, "learning_rate": 9.720783612764314e-06, "loss": 0.5922, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 559, "tokens_per_second_per_gpu": 16300.57, "total_tokens": 14252682 }, { "epoch": 0.044789250579860836, "grad_norm": 0.47529494762420654, "learning_rate": 9.685892409218718e-06, "loss": 0.6587, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 560, "tokens_per_second_per_gpu": 16787.12, "total_tokens": 14278093 }, { "epoch": 0.04486923138446773, "grad_norm": 0.5045996308326721, "learning_rate": 9.651005032974994e-06, "loss": 0.6459, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 561, "tokens_per_second_per_gpu": 16965.31, "total_tokens": 14302754 }, { "epoch": 0.04494921218907462, "grad_norm": 0.47231438755989075, "learning_rate": 9.616121909124801e-06, "loss": 0.7112, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 562, "tokens_per_second_per_gpu": 17578.07, "total_tokens": 14329323 }, { "epoch": 0.045029192993681515, "grad_norm": 0.49113765358924866, "learning_rate": 9.581243462708007e-06, "loss": 0.6435, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 563, "tokens_per_second_per_gpu": 16616.31, "total_tokens": 14353713 }, { "epoch": 0.04510917379828841, "grad_norm": 0.44610634446144104, "learning_rate": 9.546370118707463e-06, "loss": 0.6374, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 564, "tokens_per_second_per_gpu": 17000.59, "total_tokens": 14379865 }, { "epoch": 0.045189154602895305, "grad_norm": 0.4994834065437317, "learning_rate": 9.511502302043867e-06, "loss": 0.6428, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 565, "tokens_per_second_per_gpu": 16745.1, "total_tokens": 14404914 }, { "epoch": 0.0452691354075022, "grad_norm": 0.47246044874191284, "learning_rate": 9.476640437570562e-06, "loss": 0.665, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 566, "tokens_per_second_per_gpu": 16875.25, "total_tokens": 14431100 }, { "epoch": 0.045349116212109096, "grad_norm": 0.5038020014762878, "learning_rate": 9.441784950068362e-06, "loss": 0.6742, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 567, "tokens_per_second_per_gpu": 16680.55, "total_tokens": 14456004 }, { "epoch": 0.04542909701671599, "grad_norm": 0.4679954946041107, "learning_rate": 9.406936264240386e-06, "loss": 0.6609, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 568, "tokens_per_second_per_gpu": 17120.18, "total_tokens": 14482147 }, { "epoch": 0.04550907782132288, "grad_norm": 0.47112342715263367, "learning_rate": 9.372094804706867e-06, "loss": 0.6283, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 569, "tokens_per_second_per_gpu": 17221.54, "total_tokens": 14507518 }, { "epoch": 0.045589058625929775, "grad_norm": 0.4823978543281555, "learning_rate": 9.337260996000002e-06, "loss": 0.6006, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 570, "tokens_per_second_per_gpu": 16988.76, "total_tokens": 14532760 }, { "epoch": 0.04566903943053667, "grad_norm": 0.5082917809486389, "learning_rate": 9.302435262558748e-06, "loss": 0.6403, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 571, "tokens_per_second_per_gpu": 17407.94, "total_tokens": 14559212 }, { "epoch": 0.045749020235143566, "grad_norm": 0.5025095343589783, "learning_rate": 9.267618028723687e-06, "loss": 0.6438, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 572, "tokens_per_second_per_gpu": 17173.37, "total_tokens": 14585315 }, { "epoch": 0.04582900103975046, "grad_norm": 0.4819313883781433, "learning_rate": 9.232809718731815e-06, "loss": 0.6649, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 573, "tokens_per_second_per_gpu": 17488.01, "total_tokens": 14611725 }, { "epoch": 0.045908981844357356, "grad_norm": 0.4713301360607147, "learning_rate": 9.198010756711413e-06, "loss": 0.6653, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 574, "tokens_per_second_per_gpu": 16638.54, "total_tokens": 14636622 }, { "epoch": 0.04598896264896425, "grad_norm": 0.4914127588272095, "learning_rate": 9.163221566676847e-06, "loss": 0.6229, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 575, "tokens_per_second_per_gpu": 16381.64, "total_tokens": 14660439 }, { "epoch": 0.04606894345357114, "grad_norm": 0.4962431788444519, "learning_rate": 9.128442572523418e-06, "loss": 0.6263, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 576, "tokens_per_second_per_gpu": 16798.67, "total_tokens": 14685550 }, { "epoch": 0.046148924258178035, "grad_norm": 0.46047908067703247, "learning_rate": 9.093674198022201e-06, "loss": 0.601, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 577, "tokens_per_second_per_gpu": 16901.2, "total_tokens": 14710705 }, { "epoch": 0.04622890506278493, "grad_norm": 0.4952366054058075, "learning_rate": 9.058916866814857e-06, "loss": 0.6774, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 578, "tokens_per_second_per_gpu": 17149.77, "total_tokens": 14736165 }, { "epoch": 0.046308885867391826, "grad_norm": 0.5213083624839783, "learning_rate": 9.024171002408507e-06, "loss": 0.6587, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 579, "tokens_per_second_per_gpu": 16984.69, "total_tokens": 14761504 }, { "epoch": 0.04638886667199872, "grad_norm": 0.4909270703792572, "learning_rate": 8.989437028170537e-06, "loss": 0.656, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 580, "tokens_per_second_per_gpu": 17165.09, "total_tokens": 14787035 }, { "epoch": 0.04646884747660562, "grad_norm": 0.4714226722717285, "learning_rate": 8.954715367323468e-06, "loss": 0.646, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 581, "tokens_per_second_per_gpu": 17370.32, "total_tokens": 14813116 }, { "epoch": 0.04654882828121251, "grad_norm": 0.459878534078598, "learning_rate": 8.920006442939772e-06, "loss": 0.6484, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 582, "tokens_per_second_per_gpu": 17366.54, "total_tokens": 14839467 }, { "epoch": 0.0466288090858194, "grad_norm": 0.4947279989719391, "learning_rate": 8.885310677936746e-06, "loss": 0.656, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 583, "tokens_per_second_per_gpu": 17204.88, "total_tokens": 14865234 }, { "epoch": 0.046708789890426296, "grad_norm": 0.49737077951431274, "learning_rate": 8.850628495071336e-06, "loss": 0.6808, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 584, "tokens_per_second_per_gpu": 17365.74, "total_tokens": 14891556 }, { "epoch": 0.04678877069503319, "grad_norm": 0.49995678663253784, "learning_rate": 8.815960316934991e-06, "loss": 0.6392, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 585, "tokens_per_second_per_gpu": 17080.84, "total_tokens": 14917230 }, { "epoch": 0.046868751499640086, "grad_norm": 0.5089588165283203, "learning_rate": 8.781306565948528e-06, "loss": 0.6864, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 586, "tokens_per_second_per_gpu": 16838.83, "total_tokens": 14942605 }, { "epoch": 0.04694873230424698, "grad_norm": 0.4909396767616272, "learning_rate": 8.746667664356957e-06, "loss": 0.6111, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 587, "tokens_per_second_per_gpu": 17205.48, "total_tokens": 14968659 }, { "epoch": 0.04702871310885388, "grad_norm": 0.463184118270874, "learning_rate": 8.712044034224374e-06, "loss": 0.595, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 588, "tokens_per_second_per_gpu": 17060.7, "total_tokens": 14994885 }, { "epoch": 0.04710869391346077, "grad_norm": 0.905055820941925, "learning_rate": 8.677436097428775e-06, "loss": 0.6458, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 589, "tokens_per_second_per_gpu": 16621.93, "total_tokens": 15019934 }, { "epoch": 0.04718867471806766, "grad_norm": 0.4729231894016266, "learning_rate": 8.642844275656957e-06, "loss": 0.6957, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 590, "tokens_per_second_per_gpu": 17788.01, "total_tokens": 15046933 }, { "epoch": 0.047268655522674556, "grad_norm": 0.5098869204521179, "learning_rate": 8.60826899039935e-06, "loss": 0.6265, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 591, "tokens_per_second_per_gpu": 16950.24, "total_tokens": 15072162 }, { "epoch": 0.04734863632728145, "grad_norm": 0.49714773893356323, "learning_rate": 8.573710662944884e-06, "loss": 0.6777, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 592, "tokens_per_second_per_gpu": 16566.85, "total_tokens": 15097098 }, { "epoch": 0.04742861713188835, "grad_norm": 0.4808761477470398, "learning_rate": 8.539169714375885e-06, "loss": 0.6586, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 593, "tokens_per_second_per_gpu": 17276.4, "total_tokens": 15123131 }, { "epoch": 0.04750859793649524, "grad_norm": 0.5037384033203125, "learning_rate": 8.504646565562907e-06, "loss": 0.6783, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 594, "tokens_per_second_per_gpu": 17387.82, "total_tokens": 15149443 }, { "epoch": 0.04758857874110214, "grad_norm": 0.49192243814468384, "learning_rate": 8.47014163715962e-06, "loss": 0.6142, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 595, "tokens_per_second_per_gpu": 16517.04, "total_tokens": 15173258 }, { "epoch": 0.04766855954570903, "grad_norm": 0.5216419696807861, "learning_rate": 8.43565534959769e-06, "loss": 0.6364, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 596, "tokens_per_second_per_gpu": 16922.88, "total_tokens": 15198315 }, { "epoch": 0.04774854035031592, "grad_norm": 0.48781725764274597, "learning_rate": 8.401188123081653e-06, "loss": 0.6312, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 597, "tokens_per_second_per_gpu": 17166.4, "total_tokens": 15224223 }, { "epoch": 0.047828521154922816, "grad_norm": 0.48654705286026, "learning_rate": 8.366740377583781e-06, "loss": 0.6459, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 598, "tokens_per_second_per_gpu": 17255.25, "total_tokens": 15250664 }, { "epoch": 0.04790850195952971, "grad_norm": 0.48847904801368713, "learning_rate": 8.332312532838978e-06, "loss": 0.6484, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 599, "tokens_per_second_per_gpu": 17191.9, "total_tokens": 15276565 }, { "epoch": 0.04798848276413661, "grad_norm": 0.4727404713630676, "learning_rate": 8.297905008339677e-06, "loss": 0.6467, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 600, "tokens_per_second_per_gpu": 17521.18, "total_tokens": 15302942 }, { "epoch": 0.0480684635687435, "grad_norm": 0.49052244424819946, "learning_rate": 8.263518223330698e-06, "loss": 0.6492, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 601, "tokens_per_second_per_gpu": 17039.69, "total_tokens": 15329013 }, { "epoch": 0.0481484443733504, "grad_norm": 0.4712292551994324, "learning_rate": 8.22915259680417e-06, "loss": 0.6232, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 602, "tokens_per_second_per_gpu": 16841.81, "total_tokens": 15354575 }, { "epoch": 0.04822842517795729, "grad_norm": 0.4877064526081085, "learning_rate": 8.194808547494401e-06, "loss": 0.6617, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 603, "tokens_per_second_per_gpu": 17087.94, "total_tokens": 15379668 }, { "epoch": 0.04830840598256418, "grad_norm": 0.5102121829986572, "learning_rate": 8.1604864938728e-06, "loss": 0.6315, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 604, "tokens_per_second_per_gpu": 16581.29, "total_tokens": 15404809 }, { "epoch": 0.04838838678717108, "grad_norm": 0.4876486361026764, "learning_rate": 8.126186854142752e-06, "loss": 0.5826, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 605, "tokens_per_second_per_gpu": 16902.6, "total_tokens": 15430201 }, { "epoch": 0.04846836759177797, "grad_norm": 0.510290265083313, "learning_rate": 8.091910046234552e-06, "loss": 0.6742, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 606, "tokens_per_second_per_gpu": 17105.66, "total_tokens": 15455793 }, { "epoch": 0.04854834839638487, "grad_norm": 0.4743480980396271, "learning_rate": 8.057656487800283e-06, "loss": 0.6673, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 607, "tokens_per_second_per_gpu": 17214.78, "total_tokens": 15482472 }, { "epoch": 0.04862832920099176, "grad_norm": 0.48495572805404663, "learning_rate": 8.023426596208739e-06, "loss": 0.6654, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 608, "tokens_per_second_per_gpu": 17224.85, "total_tokens": 15508453 }, { "epoch": 0.04870831000559866, "grad_norm": 0.48911020159721375, "learning_rate": 7.989220788540356e-06, "loss": 0.6215, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 609, "tokens_per_second_per_gpu": 16851.65, "total_tokens": 15533469 }, { "epoch": 0.048788290810205553, "grad_norm": 0.46720772981643677, "learning_rate": 7.955039481582098e-06, "loss": 0.6018, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 610, "tokens_per_second_per_gpu": 16750.14, "total_tokens": 15558559 }, { "epoch": 0.04886827161481244, "grad_norm": 0.5051571130752563, "learning_rate": 7.92088309182241e-06, "loss": 0.6471, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 611, "tokens_per_second_per_gpu": 16870.87, "total_tokens": 15583697 }, { "epoch": 0.04894825241941934, "grad_norm": 0.49818551540374756, "learning_rate": 7.886752035446116e-06, "loss": 0.663, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 612, "tokens_per_second_per_gpu": 17041.92, "total_tokens": 15609568 }, { "epoch": 0.04902823322402623, "grad_norm": 0.47889798879623413, "learning_rate": 7.852646728329368e-06, "loss": 0.6533, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 613, "tokens_per_second_per_gpu": 17201.08, "total_tokens": 15635573 }, { "epoch": 0.04910821402863313, "grad_norm": 0.4940686523914337, "learning_rate": 7.818567586034578e-06, "loss": 0.6428, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 614, "tokens_per_second_per_gpu": 16846.51, "total_tokens": 15660689 }, { "epoch": 0.04918819483324002, "grad_norm": 0.4960979223251343, "learning_rate": 7.784515023805328e-06, "loss": 0.6548, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 615, "tokens_per_second_per_gpu": 16855.99, "total_tokens": 15685730 }, { "epoch": 0.04926817563784692, "grad_norm": 0.5047521591186523, "learning_rate": 7.750489456561351e-06, "loss": 0.609, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 616, "tokens_per_second_per_gpu": 16865.43, "total_tokens": 15711363 }, { "epoch": 0.049348156442453814, "grad_norm": 0.538982629776001, "learning_rate": 7.716491298893443e-06, "loss": 0.6671, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 617, "tokens_per_second_per_gpu": 16915.95, "total_tokens": 15736639 }, { "epoch": 0.0494281372470607, "grad_norm": 0.5692036151885986, "learning_rate": 7.68252096505843e-06, "loss": 0.6733, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 618, "tokens_per_second_per_gpu": 16922.84, "total_tokens": 15761816 }, { "epoch": 0.0495081180516676, "grad_norm": 0.4885812997817993, "learning_rate": 7.6485788689741e-06, "loss": 0.6583, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 619, "tokens_per_second_per_gpu": 17492.93, "total_tokens": 15787918 }, { "epoch": 0.04958809885627449, "grad_norm": 0.5453019738197327, "learning_rate": 7.6146654242141935e-06, "loss": 0.7266, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 620, "tokens_per_second_per_gpu": 16796.92, "total_tokens": 15812967 }, { "epoch": 0.04966807966088139, "grad_norm": 0.5044118165969849, "learning_rate": 7.580781044003324e-06, "loss": 0.6738, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 621, "tokens_per_second_per_gpu": 16983.61, "total_tokens": 15838154 }, { "epoch": 0.049748060465488284, "grad_norm": 0.49475517868995667, "learning_rate": 7.546926141211975e-06, "loss": 0.6235, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 622, "tokens_per_second_per_gpu": 16762.51, "total_tokens": 15863293 }, { "epoch": 0.04982804127009518, "grad_norm": 0.5201805830001831, "learning_rate": 7.513101128351454e-06, "loss": 0.6033, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 623, "tokens_per_second_per_gpu": 16768.92, "total_tokens": 15888215 }, { "epoch": 0.049908022074702074, "grad_norm": 0.5115200877189636, "learning_rate": 7.4793064175688635e-06, "loss": 0.6793, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 624, "tokens_per_second_per_gpu": 17648.67, "total_tokens": 15914592 }, { "epoch": 0.04998800287930896, "grad_norm": 0.5228220820426941, "learning_rate": 7.445542420642097e-06, "loss": 0.6296, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 625, "tokens_per_second_per_gpu": 16790.6, "total_tokens": 15940125 }, { "epoch": 0.05006798368391586, "grad_norm": 0.4957731068134308, "learning_rate": 7.411809548974792e-06, "loss": 0.6431, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 626, "tokens_per_second_per_gpu": 17142.31, "total_tokens": 15965805 }, { "epoch": 0.05014796448852275, "grad_norm": 0.48672324419021606, "learning_rate": 7.378108213591355e-06, "loss": 0.6589, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 627, "tokens_per_second_per_gpu": 16842.27, "total_tokens": 15990871 }, { "epoch": 0.05022794529312965, "grad_norm": 0.5299752950668335, "learning_rate": 7.344438825131912e-06, "loss": 0.6362, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 628, "tokens_per_second_per_gpu": 16874.41, "total_tokens": 16016112 }, { "epoch": 0.050307926097736544, "grad_norm": 0.4939616918563843, "learning_rate": 7.310801793847344e-06, "loss": 0.6658, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 629, "tokens_per_second_per_gpu": 17161.05, "total_tokens": 16041667 }, { "epoch": 0.05038790690234344, "grad_norm": 0.5360363125801086, "learning_rate": 7.277197529594257e-06, "loss": 0.6419, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 630, "tokens_per_second_per_gpu": 16983.54, "total_tokens": 16066562 }, { "epoch": 0.050467887706950335, "grad_norm": 0.4936983287334442, "learning_rate": 7.243626441830009e-06, "loss": 0.6341, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 631, "tokens_per_second_per_gpu": 16644.31, "total_tokens": 16091903 }, { "epoch": 0.05054786851155722, "grad_norm": 0.5046349763870239, "learning_rate": 7.210088939607709e-06, "loss": 0.7089, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 632, "tokens_per_second_per_gpu": 17592.12, "total_tokens": 16118442 }, { "epoch": 0.05062784931616412, "grad_norm": 0.4913012385368347, "learning_rate": 7.176585431571235e-06, "loss": 0.7011, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 633, "tokens_per_second_per_gpu": 17143.68, "total_tokens": 16144725 }, { "epoch": 0.050707830120771014, "grad_norm": 0.5462119579315186, "learning_rate": 7.143116325950266e-06, "loss": 0.6766, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 634, "tokens_per_second_per_gpu": 17247.03, "total_tokens": 16170369 }, { "epoch": 0.05078781092537791, "grad_norm": 0.5056242346763611, "learning_rate": 7.109682030555283e-06, "loss": 0.6201, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 635, "tokens_per_second_per_gpu": 16855.99, "total_tokens": 16195646 }, { "epoch": 0.050867791729984804, "grad_norm": 0.47949331998825073, "learning_rate": 7.076282952772634e-06, "loss": 0.6441, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 636, "tokens_per_second_per_gpu": 16613.54, "total_tokens": 16220540 }, { "epoch": 0.0509477725345917, "grad_norm": 0.48914220929145813, "learning_rate": 7.042919499559538e-06, "loss": 0.6101, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 637, "tokens_per_second_per_gpu": 16800.29, "total_tokens": 16245408 }, { "epoch": 0.051027753339198595, "grad_norm": 0.5196214318275452, "learning_rate": 7.009592077439135e-06, "loss": 0.6946, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 638, "tokens_per_second_per_gpu": 17131.28, "total_tokens": 16271244 }, { "epoch": 0.05110773414380548, "grad_norm": 0.5333957076072693, "learning_rate": 6.976301092495556e-06, "loss": 0.6489, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 639, "tokens_per_second_per_gpu": 17256.94, "total_tokens": 16297194 }, { "epoch": 0.05118771494841238, "grad_norm": 0.4803604781627655, "learning_rate": 6.943046950368944e-06, "loss": 0.6063, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 640, "tokens_per_second_per_gpu": 16761.74, "total_tokens": 16322047 }, { "epoch": 0.051267695753019274, "grad_norm": 0.5199413299560547, "learning_rate": 6.909830056250527e-06, "loss": 0.6978, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 641, "tokens_per_second_per_gpu": 16905.31, "total_tokens": 16347028 }, { "epoch": 0.05134767655762617, "grad_norm": 0.5130301117897034, "learning_rate": 6.876650814877675e-06, "loss": 0.6378, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 642, "tokens_per_second_per_gpu": 16899.38, "total_tokens": 16372392 }, { "epoch": 0.051427657362233065, "grad_norm": 0.5086696743965149, "learning_rate": 6.843509630528977e-06, "loss": 0.6444, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 643, "tokens_per_second_per_gpu": 17210.14, "total_tokens": 16398587 }, { "epoch": 0.05150763816683996, "grad_norm": 0.4915199875831604, "learning_rate": 6.8104069070193e-06, "loss": 0.6514, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 644, "tokens_per_second_per_gpu": 16820.48, "total_tokens": 16423176 }, { "epoch": 0.051587618971446855, "grad_norm": 0.49876171350479126, "learning_rate": 6.777343047694891e-06, "loss": 0.6849, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 645, "tokens_per_second_per_gpu": 17426.66, "total_tokens": 16450009 }, { "epoch": 0.051667599776053744, "grad_norm": 0.5137947201728821, "learning_rate": 6.744318455428436e-06, "loss": 0.6763, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 646, "tokens_per_second_per_gpu": 16806.41, "total_tokens": 16475334 }, { "epoch": 0.05174758058066064, "grad_norm": 0.5228657126426697, "learning_rate": 6.711333532614168e-06, "loss": 0.6699, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 647, "tokens_per_second_per_gpu": 17221.27, "total_tokens": 16501203 }, { "epoch": 0.051827561385267534, "grad_norm": 0.5308648943901062, "learning_rate": 6.67838868116297e-06, "loss": 0.668, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 648, "tokens_per_second_per_gpu": 17008.86, "total_tokens": 16526728 }, { "epoch": 0.05190754218987443, "grad_norm": 0.5293684005737305, "learning_rate": 6.645484302497452e-06, "loss": 0.6544, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 649, "tokens_per_second_per_gpu": 16986.29, "total_tokens": 16551898 }, { "epoch": 0.051987522994481325, "grad_norm": 0.5115300416946411, "learning_rate": 6.612620797547087e-06, "loss": 0.6249, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 650, "tokens_per_second_per_gpu": 17339.71, "total_tokens": 16577920 }, { "epoch": 0.05206750379908822, "grad_norm": 0.5213042497634888, "learning_rate": 6.579798566743314e-06, "loss": 0.6496, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 651, "tokens_per_second_per_gpu": 17219.75, "total_tokens": 16604021 }, { "epoch": 0.052147484603695116, "grad_norm": 0.5389010310173035, "learning_rate": 6.547018010014654e-06, "loss": 0.6647, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 652, "tokens_per_second_per_gpu": 17230.96, "total_tokens": 16629696 }, { "epoch": 0.05222746540830201, "grad_norm": 0.5159024000167847, "learning_rate": 6.5142795267818505e-06, "loss": 0.6325, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 653, "tokens_per_second_per_gpu": 17017.35, "total_tokens": 16655351 }, { "epoch": 0.0523074462129089, "grad_norm": 0.4998682141304016, "learning_rate": 6.481583515952983e-06, "loss": 0.6439, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 654, "tokens_per_second_per_gpu": 17015.79, "total_tokens": 16680965 }, { "epoch": 0.052387427017515795, "grad_norm": 0.5311859250068665, "learning_rate": 6.448930375918632e-06, "loss": 0.6561, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 655, "tokens_per_second_per_gpu": 16236.56, "total_tokens": 16705336 }, { "epoch": 0.05246740782212269, "grad_norm": 0.5103529691696167, "learning_rate": 6.4163205045469975e-06, "loss": 0.6153, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 656, "tokens_per_second_per_gpu": 16413.1, "total_tokens": 16729896 }, { "epoch": 0.052547388626729585, "grad_norm": 0.5097713470458984, "learning_rate": 6.383754299179079e-06, "loss": 0.6412, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 657, "tokens_per_second_per_gpu": 17055.16, "total_tokens": 16755420 }, { "epoch": 0.05262736943133648, "grad_norm": 0.4635200500488281, "learning_rate": 6.351232156623803e-06, "loss": 0.5744, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 658, "tokens_per_second_per_gpu": 17132.79, "total_tokens": 16781693 }, { "epoch": 0.052707350235943376, "grad_norm": 0.5747168660163879, "learning_rate": 6.318754473153221e-06, "loss": 0.6812, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 659, "tokens_per_second_per_gpu": 16736.57, "total_tokens": 16806316 }, { "epoch": 0.05278733104055027, "grad_norm": 0.4747006595134735, "learning_rate": 6.286321644497655e-06, "loss": 0.6251, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 660, "tokens_per_second_per_gpu": 17304.87, "total_tokens": 16832772 }, { "epoch": 0.05286731184515716, "grad_norm": 0.5061115026473999, "learning_rate": 6.25393406584088e-06, "loss": 0.6581, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 661, "tokens_per_second_per_gpu": 17742.51, "total_tokens": 16859124 }, { "epoch": 0.052947292649764055, "grad_norm": 0.4995548725128174, "learning_rate": 6.22159213181533e-06, "loss": 0.6492, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 662, "tokens_per_second_per_gpu": 16850.9, "total_tokens": 16884185 }, { "epoch": 0.05302727345437095, "grad_norm": 0.5568655729293823, "learning_rate": 6.18929623649726e-06, "loss": 0.5819, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 663, "tokens_per_second_per_gpu": 16745.3, "total_tokens": 16909485 }, { "epoch": 0.053107254258977846, "grad_norm": 0.502731204032898, "learning_rate": 6.157046773401964e-06, "loss": 0.6288, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 664, "tokens_per_second_per_gpu": 17083.54, "total_tokens": 16935162 }, { "epoch": 0.05318723506358474, "grad_norm": 0.517120361328125, "learning_rate": 6.124844135478971e-06, "loss": 0.6518, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 665, "tokens_per_second_per_gpu": 17008.29, "total_tokens": 16960109 }, { "epoch": 0.053267215868191636, "grad_norm": 0.5138611793518066, "learning_rate": 6.092688715107265e-06, "loss": 0.643, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 666, "tokens_per_second_per_gpu": 16915.46, "total_tokens": 16985762 }, { "epoch": 0.05334719667279853, "grad_norm": 0.5278694033622742, "learning_rate": 6.06058090409049e-06, "loss": 0.6474, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 667, "tokens_per_second_per_gpu": 17094.87, "total_tokens": 17011822 }, { "epoch": 0.05342717747740542, "grad_norm": 0.4872185289859772, "learning_rate": 6.028521093652195e-06, "loss": 0.6303, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 668, "tokens_per_second_per_gpu": 17232.09, "total_tokens": 17038442 }, { "epoch": 0.053507158282012315, "grad_norm": 0.5109195113182068, "learning_rate": 5.996509674431053e-06, "loss": 0.6477, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 669, "tokens_per_second_per_gpu": 16715.13, "total_tokens": 17063529 }, { "epoch": 0.05358713908661921, "grad_norm": 0.5262460708618164, "learning_rate": 5.9645470364761e-06, "loss": 0.6393, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 670, "tokens_per_second_per_gpu": 16640.95, "total_tokens": 17088499 }, { "epoch": 0.053667119891226106, "grad_norm": 0.4987565875053406, "learning_rate": 5.932633569242e-06, "loss": 0.6176, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 671, "tokens_per_second_per_gpu": 16846.36, "total_tokens": 17113810 }, { "epoch": 0.053747100695833, "grad_norm": 0.5298067927360535, "learning_rate": 5.900769661584273e-06, "loss": 0.7042, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 672, "tokens_per_second_per_gpu": 17058.35, "total_tokens": 17139525 }, { "epoch": 0.0538270815004399, "grad_norm": 0.4801011085510254, "learning_rate": 5.868955701754584e-06, "loss": 0.5934, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 673, "tokens_per_second_per_gpu": 16728.94, "total_tokens": 17165170 }, { "epoch": 0.05390706230504679, "grad_norm": 0.5165581107139587, "learning_rate": 5.83719207739599e-06, "loss": 0.5992, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 674, "tokens_per_second_per_gpu": 16374.51, "total_tokens": 17189646 }, { "epoch": 0.05398704310965368, "grad_norm": 0.5193620920181274, "learning_rate": 5.8054791755382286e-06, "loss": 0.6553, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 675, "tokens_per_second_per_gpu": 16535.5, "total_tokens": 17214531 }, { "epoch": 0.054067023914260576, "grad_norm": 0.48226460814476013, "learning_rate": 5.773817382593008e-06, "loss": 0.6172, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 676, "tokens_per_second_per_gpu": 17723.66, "total_tokens": 17241548 }, { "epoch": 0.05414700471886747, "grad_norm": 0.5453917980194092, "learning_rate": 5.742207084349274e-06, "loss": 0.6006, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 677, "tokens_per_second_per_gpu": 16531.5, "total_tokens": 17265863 }, { "epoch": 0.054226985523474366, "grad_norm": 0.5053668022155762, "learning_rate": 5.710648665968543e-06, "loss": 0.639, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 678, "tokens_per_second_per_gpu": 16542.28, "total_tokens": 17290327 }, { "epoch": 0.05430696632808126, "grad_norm": 0.5109512209892273, "learning_rate": 5.679142511980176e-06, "loss": 0.6027, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 679, "tokens_per_second_per_gpu": 16971.18, "total_tokens": 17315480 }, { "epoch": 0.05438694713268816, "grad_norm": 0.5201040506362915, "learning_rate": 5.647689006276727e-06, "loss": 0.6206, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 680, "tokens_per_second_per_gpu": 16478.13, "total_tokens": 17340212 }, { "epoch": 0.05446692793729505, "grad_norm": 0.502582848072052, "learning_rate": 5.616288532109225e-06, "loss": 0.6805, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 681, "tokens_per_second_per_gpu": 17185.1, "total_tokens": 17366028 }, { "epoch": 0.05454690874190194, "grad_norm": 0.5199177861213684, "learning_rate": 5.584941472082549e-06, "loss": 0.6613, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 682, "tokens_per_second_per_gpu": 16955.39, "total_tokens": 17390977 }, { "epoch": 0.054626889546508836, "grad_norm": 0.5209512114524841, "learning_rate": 5.553648208150728e-06, "loss": 0.6395, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 683, "tokens_per_second_per_gpu": 16364.97, "total_tokens": 17415065 }, { "epoch": 0.05470687035111573, "grad_norm": 0.5158247947692871, "learning_rate": 5.522409121612304e-06, "loss": 0.6239, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 684, "tokens_per_second_per_gpu": 17041.4, "total_tokens": 17440462 }, { "epoch": 0.05478685115572263, "grad_norm": 0.5076451897621155, "learning_rate": 5.491224593105695e-06, "loss": 0.6193, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 685, "tokens_per_second_per_gpu": 16489.47, "total_tokens": 17465299 }, { "epoch": 0.05486683196032952, "grad_norm": 0.4743523895740509, "learning_rate": 5.460095002604533e-06, "loss": 0.6283, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 686, "tokens_per_second_per_gpu": 17522.65, "total_tokens": 17491830 }, { "epoch": 0.05494681276493642, "grad_norm": 0.5121709108352661, "learning_rate": 5.429020729413062e-06, "loss": 0.6348, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 687, "tokens_per_second_per_gpu": 16463.08, "total_tokens": 17516529 }, { "epoch": 0.05502679356954331, "grad_norm": 0.510275661945343, "learning_rate": 5.398002152161484e-06, "loss": 0.6229, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 688, "tokens_per_second_per_gpu": 16654.05, "total_tokens": 17541653 }, { "epoch": 0.0551067743741502, "grad_norm": 0.49475717544555664, "learning_rate": 5.367039648801386e-06, "loss": 0.6389, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 689, "tokens_per_second_per_gpu": 16899.12, "total_tokens": 17567106 }, { "epoch": 0.055186755178757096, "grad_norm": 0.5166232585906982, "learning_rate": 5.336133596601089e-06, "loss": 0.669, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 690, "tokens_per_second_per_gpu": 17002.49, "total_tokens": 17592976 }, { "epoch": 0.05526673598336399, "grad_norm": 0.4955079257488251, "learning_rate": 5.305284372141095e-06, "loss": 0.5659, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 691, "tokens_per_second_per_gpu": 16119.99, "total_tokens": 17616788 }, { "epoch": 0.05534671678797089, "grad_norm": 0.49480971693992615, "learning_rate": 5.274492351309462e-06, "loss": 0.6458, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 692, "tokens_per_second_per_gpu": 17306.47, "total_tokens": 17642818 }, { "epoch": 0.05542669759257778, "grad_norm": 0.5008161067962646, "learning_rate": 5.243757909297247e-06, "loss": 0.6161, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 693, "tokens_per_second_per_gpu": 17152.18, "total_tokens": 17668640 }, { "epoch": 0.05550667839718468, "grad_norm": 0.5221447348594666, "learning_rate": 5.213081420593933e-06, "loss": 0.616, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 694, "tokens_per_second_per_gpu": 16822.73, "total_tokens": 17693424 }, { "epoch": 0.05558665920179157, "grad_norm": 0.5296872854232788, "learning_rate": 5.1824632589828465e-06, "loss": 0.6246, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 695, "tokens_per_second_per_gpu": 16770.43, "total_tokens": 17718572 }, { "epoch": 0.05566664000639846, "grad_norm": 0.5189606547355652, "learning_rate": 5.151903797536631e-06, "loss": 0.6366, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 696, "tokens_per_second_per_gpu": 16921.39, "total_tokens": 17743965 }, { "epoch": 0.05574662081100536, "grad_norm": 0.5203530788421631, "learning_rate": 5.121403408612672e-06, "loss": 0.7065, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 697, "tokens_per_second_per_gpu": 17426.51, "total_tokens": 17770644 }, { "epoch": 0.05582660161561225, "grad_norm": 0.51515132188797, "learning_rate": 5.090962463848592e-06, "loss": 0.6459, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 698, "tokens_per_second_per_gpu": 17112.57, "total_tokens": 17796697 }, { "epoch": 0.05590658242021915, "grad_norm": 0.5101720094680786, "learning_rate": 5.060581334157693e-06, "loss": 0.6448, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 699, "tokens_per_second_per_gpu": 16878.94, "total_tokens": 17821904 }, { "epoch": 0.05598656322482604, "grad_norm": 0.5070253610610962, "learning_rate": 5.030260389724447e-06, "loss": 0.6271, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 700, "tokens_per_second_per_gpu": 17086.03, "total_tokens": 17847477 }, { "epoch": 0.05606654402943294, "grad_norm": 0.5088156461715698, "learning_rate": 5.000000000000003e-06, "loss": 0.6259, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 701, "tokens_per_second_per_gpu": 17014.29, "total_tokens": 17873042 }, { "epoch": 0.05614652483403983, "grad_norm": 0.4844730496406555, "learning_rate": 4.96980053369765e-06, "loss": 0.6019, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 702, "tokens_per_second_per_gpu": 16950.47, "total_tokens": 17898577 }, { "epoch": 0.05622650563864672, "grad_norm": 0.5203348994255066, "learning_rate": 4.939662358788364e-06, "loss": 0.6317, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 703, "tokens_per_second_per_gpu": 16871.59, "total_tokens": 17923826 }, { "epoch": 0.05630648644325362, "grad_norm": 0.5411732196807861, "learning_rate": 4.909585842496287e-06, "loss": 0.6407, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 704, "tokens_per_second_per_gpu": 17433.86, "total_tokens": 17950148 }, { "epoch": 0.05638646724786051, "grad_norm": 0.5115430951118469, "learning_rate": 4.879571351294287e-06, "loss": 0.6517, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 705, "tokens_per_second_per_gpu": 17553.94, "total_tokens": 17976281 }, { "epoch": 0.05646644805246741, "grad_norm": 0.5059305429458618, "learning_rate": 4.849619250899458e-06, "loss": 0.6271, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 706, "tokens_per_second_per_gpu": 16888.3, "total_tokens": 18001831 }, { "epoch": 0.0565464288570743, "grad_norm": 0.4909086525440216, "learning_rate": 4.8197299062687e-06, "loss": 0.656, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 707, "tokens_per_second_per_gpu": 17176.8, "total_tokens": 18028323 }, { "epoch": 0.0566264096616812, "grad_norm": 0.5115363597869873, "learning_rate": 4.78990368159424e-06, "loss": 0.6764, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 708, "tokens_per_second_per_gpu": 17153.77, "total_tokens": 18054022 }, { "epoch": 0.056706390466288094, "grad_norm": 0.5104652643203735, "learning_rate": 4.76014094029921e-06, "loss": 0.648, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 709, "tokens_per_second_per_gpu": 17076.04, "total_tokens": 18080235 }, { "epoch": 0.05678637127089498, "grad_norm": 0.5099148154258728, "learning_rate": 4.7304420450332244e-06, "loss": 0.6074, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 710, "tokens_per_second_per_gpu": 16836.58, "total_tokens": 18105272 }, { "epoch": 0.05686635207550188, "grad_norm": 0.5084642171859741, "learning_rate": 4.700807357667953e-06, "loss": 0.6519, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 711, "tokens_per_second_per_gpu": 17614.67, "total_tokens": 18131849 }, { "epoch": 0.05694633288010877, "grad_norm": 0.5015023946762085, "learning_rate": 4.671237239292699e-06, "loss": 0.5743, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 712, "tokens_per_second_per_gpu": 16473.36, "total_tokens": 18156262 }, { "epoch": 0.05702631368471567, "grad_norm": 0.5393797159194946, "learning_rate": 4.641732050210032e-06, "loss": 0.667, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 713, "tokens_per_second_per_gpu": 17115.49, "total_tokens": 18181782 }, { "epoch": 0.057106294489322564, "grad_norm": 0.5561414361000061, "learning_rate": 4.612292149931369e-06, "loss": 0.6896, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 714, "tokens_per_second_per_gpu": 17365.17, "total_tokens": 18208558 }, { "epoch": 0.05718627529392946, "grad_norm": 0.5471202731132507, "learning_rate": 4.582917897172603e-06, "loss": 0.6506, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 715, "tokens_per_second_per_gpu": 17052.18, "total_tokens": 18234567 }, { "epoch": 0.057266256098536354, "grad_norm": 0.4913035035133362, "learning_rate": 4.5536096498497295e-06, "loss": 0.6357, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 716, "tokens_per_second_per_gpu": 17184.7, "total_tokens": 18261123 }, { "epoch": 0.05734623690314324, "grad_norm": 0.49759647250175476, "learning_rate": 4.524367765074499e-06, "loss": 0.6172, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 717, "tokens_per_second_per_gpu": 17501.25, "total_tokens": 18287293 }, { "epoch": 0.05742621770775014, "grad_norm": 0.5413016080856323, "learning_rate": 4.495192599150045e-06, "loss": 0.6359, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 718, "tokens_per_second_per_gpu": 16512.11, "total_tokens": 18312199 }, { "epoch": 0.05750619851235703, "grad_norm": 0.5255224108695984, "learning_rate": 4.46608450756656e-06, "loss": 0.638, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 719, "tokens_per_second_per_gpu": 17083.09, "total_tokens": 18337670 }, { "epoch": 0.05758617931696393, "grad_norm": 0.5278708338737488, "learning_rate": 4.437043844996952e-06, "loss": 0.6669, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 720, "tokens_per_second_per_gpu": 17489.34, "total_tokens": 18364339 }, { "epoch": 0.057666160121570824, "grad_norm": 0.5288352370262146, "learning_rate": 4.408070965292534e-06, "loss": 0.6484, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 721, "tokens_per_second_per_gpu": 17045.79, "total_tokens": 18389586 }, { "epoch": 0.05774614092617772, "grad_norm": 0.4860366880893707, "learning_rate": 4.379166221478697e-06, "loss": 0.6261, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 722, "tokens_per_second_per_gpu": 17383.61, "total_tokens": 18416280 }, { "epoch": 0.057826121730784615, "grad_norm": 0.5295699834823608, "learning_rate": 4.350329965750622e-06, "loss": 0.6549, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 723, "tokens_per_second_per_gpu": 16710.45, "total_tokens": 18441370 }, { "epoch": 0.0579061025353915, "grad_norm": 0.4987591505050659, "learning_rate": 4.321562549468991e-06, "loss": 0.6431, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 724, "tokens_per_second_per_gpu": 17326.31, "total_tokens": 18468281 }, { "epoch": 0.0579860833399984, "grad_norm": 0.5106927752494812, "learning_rate": 4.292864323155684e-06, "loss": 0.6309, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 725, "tokens_per_second_per_gpu": 17342.4, "total_tokens": 18494509 }, { "epoch": 0.058066064144605294, "grad_norm": 0.4820137023925781, "learning_rate": 4.264235636489542e-06, "loss": 0.6057, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 726, "tokens_per_second_per_gpu": 16684.63, "total_tokens": 18519945 }, { "epoch": 0.05814604494921219, "grad_norm": 0.5269767642021179, "learning_rate": 4.235676838302069e-06, "loss": 0.6297, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 727, "tokens_per_second_per_gpu": 17668.63, "total_tokens": 18546042 }, { "epoch": 0.058226025753819084, "grad_norm": 0.46701568365097046, "learning_rate": 4.207188276573214e-06, "loss": 0.6421, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 728, "tokens_per_second_per_gpu": 17517.42, "total_tokens": 18572973 }, { "epoch": 0.05830600655842598, "grad_norm": 0.5261129140853882, "learning_rate": 4.178770298427107e-06, "loss": 0.659, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 729, "tokens_per_second_per_gpu": 16871.14, "total_tokens": 18598790 }, { "epoch": 0.058385987363032875, "grad_norm": 0.5487871170043945, "learning_rate": 4.150423250127846e-06, "loss": 0.6549, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 730, "tokens_per_second_per_gpu": 16979.14, "total_tokens": 18624176 }, { "epoch": 0.05846596816763976, "grad_norm": 0.4980189800262451, "learning_rate": 4.12214747707527e-06, "loss": 0.6359, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 731, "tokens_per_second_per_gpu": 17217.12, "total_tokens": 18649963 }, { "epoch": 0.05854594897224666, "grad_norm": 0.588450014591217, "learning_rate": 4.093943323800746e-06, "loss": 0.6685, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 732, "tokens_per_second_per_gpu": 17620.46, "total_tokens": 18676309 }, { "epoch": 0.058625929776853554, "grad_norm": 0.5422666668891907, "learning_rate": 4.065811133962987e-06, "loss": 0.6858, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 733, "tokens_per_second_per_gpu": 17164.56, "total_tokens": 18701937 }, { "epoch": 0.05870591058146045, "grad_norm": 0.49481356143951416, "learning_rate": 4.037751250343841e-06, "loss": 0.6455, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 734, "tokens_per_second_per_gpu": 17268.18, "total_tokens": 18728402 }, { "epoch": 0.058785891386067345, "grad_norm": 0.4996780753135681, "learning_rate": 4.009764014844143e-06, "loss": 0.6418, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 735, "tokens_per_second_per_gpu": 16964.57, "total_tokens": 18754341 }, { "epoch": 0.05886587219067424, "grad_norm": 0.5555558204650879, "learning_rate": 3.981849768479516e-06, "loss": 0.6603, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 736, "tokens_per_second_per_gpu": 16663.17, "total_tokens": 18778777 }, { "epoch": 0.058945852995281135, "grad_norm": 0.5153935551643372, "learning_rate": 3.954008851376252e-06, "loss": 0.6305, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 737, "tokens_per_second_per_gpu": 16853.63, "total_tokens": 18804000 }, { "epoch": 0.059025833799888024, "grad_norm": 0.5119479298591614, "learning_rate": 3.9262416027671354e-06, "loss": 0.622, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 738, "tokens_per_second_per_gpu": 16758.19, "total_tokens": 18829052 }, { "epoch": 0.05910581460449492, "grad_norm": 0.5353497862815857, "learning_rate": 3.898548360987325e-06, "loss": 0.6104, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 739, "tokens_per_second_per_gpu": 16663.69, "total_tokens": 18854098 }, { "epoch": 0.059185795409101814, "grad_norm": 0.5033715963363647, "learning_rate": 3.8709294634702374e-06, "loss": 0.6282, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 740, "tokens_per_second_per_gpu": 16812.07, "total_tokens": 18879460 }, { "epoch": 0.05926577621370871, "grad_norm": 0.5525617599487305, "learning_rate": 3.8433852467434175e-06, "loss": 0.6933, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 741, "tokens_per_second_per_gpu": 16728.85, "total_tokens": 18905035 }, { "epoch": 0.059345757018315605, "grad_norm": 0.5698568820953369, "learning_rate": 3.81591604642446e-06, "loss": 0.6629, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 742, "tokens_per_second_per_gpu": 17133.82, "total_tokens": 18930728 }, { "epoch": 0.0594257378229225, "grad_norm": 0.5329509973526001, "learning_rate": 3.7885221972168974e-06, "loss": 0.6, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 743, "tokens_per_second_per_gpu": 16556.46, "total_tokens": 18955130 }, { "epoch": 0.059505718627529396, "grad_norm": 0.5058096647262573, "learning_rate": 3.7612040329061405e-06, "loss": 0.6008, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 744, "tokens_per_second_per_gpu": 17059.72, "total_tokens": 18980442 }, { "epoch": 0.059585699432136284, "grad_norm": 0.5127116441726685, "learning_rate": 3.7339618863553983e-06, "loss": 0.5898, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 745, "tokens_per_second_per_gpu": 16644.72, "total_tokens": 19005603 }, { "epoch": 0.05966568023674318, "grad_norm": 0.5248084664344788, "learning_rate": 3.7067960895016277e-06, "loss": 0.6807, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 746, "tokens_per_second_per_gpu": 17100.45, "total_tokens": 19031447 }, { "epoch": 0.059745661041350075, "grad_norm": 0.5084623694419861, "learning_rate": 3.679706973351491e-06, "loss": 0.6247, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 747, "tokens_per_second_per_gpu": 16938.92, "total_tokens": 19057263 }, { "epoch": 0.05982564184595697, "grad_norm": 0.5016053915023804, "learning_rate": 3.6526948679773256e-06, "loss": 0.6035, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 748, "tokens_per_second_per_gpu": 17456.58, "total_tokens": 19082873 }, { "epoch": 0.059905622650563865, "grad_norm": 0.5239890813827515, "learning_rate": 3.625760102513103e-06, "loss": 0.6351, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 749, "tokens_per_second_per_gpu": 16711.43, "total_tokens": 19108031 }, { "epoch": 0.05998560345517076, "grad_norm": 0.5268155336380005, "learning_rate": 3.598903005150444e-06, "loss": 0.6499, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 750, "tokens_per_second_per_gpu": 17401.95, "total_tokens": 19133855 }, { "epoch": 0.060065584259777656, "grad_norm": 0.5115627646446228, "learning_rate": 3.5721239031346067e-06, "loss": 0.5869, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 751, "tokens_per_second_per_gpu": 17175.88, "total_tokens": 19159287 }, { "epoch": 0.060145565064384544, "grad_norm": 0.5156000852584839, "learning_rate": 3.545423122760493e-06, "loss": 0.6222, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 752, "tokens_per_second_per_gpu": 16409.08, "total_tokens": 19184023 }, { "epoch": 0.06022554586899144, "grad_norm": 0.5103474259376526, "learning_rate": 3.5188009893686916e-06, "loss": 0.6848, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 753, "tokens_per_second_per_gpu": 17179.22, "total_tokens": 19209757 }, { "epoch": 0.060305526673598335, "grad_norm": 0.5245898365974426, "learning_rate": 3.492257827341492e-06, "loss": 0.6132, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 754, "tokens_per_second_per_gpu": 16744.73, "total_tokens": 19235219 }, { "epoch": 0.06038550747820523, "grad_norm": 0.5107713937759399, "learning_rate": 3.4657939600989453e-06, "loss": 0.6396, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 755, "tokens_per_second_per_gpu": 17234.96, "total_tokens": 19261421 }, { "epoch": 0.060465488282812126, "grad_norm": 0.5091108679771423, "learning_rate": 3.4394097100949286e-06, "loss": 0.6414, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 756, "tokens_per_second_per_gpu": 16886.75, "total_tokens": 19287079 }, { "epoch": 0.06054546908741902, "grad_norm": 0.5734265446662903, "learning_rate": 3.4131053988131947e-06, "loss": 0.6478, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 757, "tokens_per_second_per_gpu": 16271.26, "total_tokens": 19311542 }, { "epoch": 0.060625449892025916, "grad_norm": 0.5568541884422302, "learning_rate": 3.3868813467634833e-06, "loss": 0.6899, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 758, "tokens_per_second_per_gpu": 16956.37, "total_tokens": 19336639 }, { "epoch": 0.060705430696632805, "grad_norm": 0.513871967792511, "learning_rate": 3.360737873477584e-06, "loss": 0.6322, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 759, "tokens_per_second_per_gpu": 17458.5, "total_tokens": 19362877 }, { "epoch": 0.0607854115012397, "grad_norm": 0.9253639578819275, "learning_rate": 3.3346752975054763e-06, "loss": 0.6365, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 760, "tokens_per_second_per_gpu": 16756.47, "total_tokens": 19387787 }, { "epoch": 0.060865392305846595, "grad_norm": 0.5255782604217529, "learning_rate": 3.308693936411421e-06, "loss": 0.6198, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 761, "tokens_per_second_per_gpu": 16632.82, "total_tokens": 19412823 }, { "epoch": 0.06094537311045349, "grad_norm": 0.5765253901481628, "learning_rate": 3.2827941067700996e-06, "loss": 0.683, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 762, "tokens_per_second_per_gpu": 17055.63, "total_tokens": 19438474 }, { "epoch": 0.061025353915060386, "grad_norm": 0.5258163809776306, "learning_rate": 3.2569761241627694e-06, "loss": 0.6129, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 763, "tokens_per_second_per_gpu": 16890.04, "total_tokens": 19463735 }, { "epoch": 0.06110533471966728, "grad_norm": 0.5253279209136963, "learning_rate": 3.2312403031733943e-06, "loss": 0.6451, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 764, "tokens_per_second_per_gpu": 16498.84, "total_tokens": 19488908 }, { "epoch": 0.06118531552427418, "grad_norm": 0.5541175603866577, "learning_rate": 3.2055869573848374e-06, "loss": 0.6668, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 765, "tokens_per_second_per_gpu": 16835.26, "total_tokens": 19514039 }, { "epoch": 0.061265296328881065, "grad_norm": 0.5414297580718994, "learning_rate": 3.1800163993750166e-06, "loss": 0.6561, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 766, "tokens_per_second_per_gpu": 16740.75, "total_tokens": 19539620 }, { "epoch": 0.06134527713348796, "grad_norm": 0.5167970657348633, "learning_rate": 3.1545289407131128e-06, "loss": 0.6536, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 767, "tokens_per_second_per_gpu": 17410.67, "total_tokens": 19565987 }, { "epoch": 0.061425257938094856, "grad_norm": 0.5267289280891418, "learning_rate": 3.1291248919557717e-06, "loss": 0.6601, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 768, "tokens_per_second_per_gpu": 16764.95, "total_tokens": 19591059 }, { "epoch": 0.06150523874270175, "grad_norm": 0.5405831336975098, "learning_rate": 3.103804562643302e-06, "loss": 0.6209, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 769, "tokens_per_second_per_gpu": 16595.39, "total_tokens": 19615537 }, { "epoch": 0.061585219547308646, "grad_norm": 0.5549702048301697, "learning_rate": 3.0785682612959334e-06, "loss": 0.7085, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 770, "tokens_per_second_per_gpu": 17500.29, "total_tokens": 19642176 }, { "epoch": 0.06166520035191554, "grad_norm": 0.526394784450531, "learning_rate": 3.0534162954100264e-06, "loss": 0.6627, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 771, "tokens_per_second_per_gpu": 16967.6, "total_tokens": 19668033 }, { "epoch": 0.06174518115652244, "grad_norm": 0.5020858645439148, "learning_rate": 3.028348971454356e-06, "loss": 0.6248, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 772, "tokens_per_second_per_gpu": 16806.61, "total_tokens": 19693414 }, { "epoch": 0.06182516196112933, "grad_norm": 0.5282226204872131, "learning_rate": 3.003366594866345e-06, "loss": 0.5409, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 773, "tokens_per_second_per_gpu": 15666.25, "total_tokens": 19716633 }, { "epoch": 0.06190514276573622, "grad_norm": 0.5440317988395691, "learning_rate": 2.978469470048376e-06, "loss": 0.6455, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 774, "tokens_per_second_per_gpu": 16783.48, "total_tokens": 19741484 }, { "epoch": 0.061985123570343116, "grad_norm": 0.5348433256149292, "learning_rate": 2.953657900364053e-06, "loss": 0.6522, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 775, "tokens_per_second_per_gpu": 17053.01, "total_tokens": 19767834 }, { "epoch": 0.06206510437495001, "grad_norm": 0.543786883354187, "learning_rate": 2.9289321881345257e-06, "loss": 0.6371, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 776, "tokens_per_second_per_gpu": 16892.92, "total_tokens": 19793247 }, { "epoch": 0.06214508517955691, "grad_norm": 0.5311694741249084, "learning_rate": 2.9042926346347932e-06, "loss": 0.649, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 777, "tokens_per_second_per_gpu": 16817.82, "total_tokens": 19818294 }, { "epoch": 0.0622250659841638, "grad_norm": 0.5300283432006836, "learning_rate": 2.8797395400900362e-06, "loss": 0.641, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 778, "tokens_per_second_per_gpu": 16882.54, "total_tokens": 19843333 }, { "epoch": 0.0623050467887707, "grad_norm": 0.5160916447639465, "learning_rate": 2.855273203671969e-06, "loss": 0.6468, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 779, "tokens_per_second_per_gpu": 17472.3, "total_tokens": 19869580 }, { "epoch": 0.06238502759337759, "grad_norm": 0.5387117266654968, "learning_rate": 2.830893923495173e-06, "loss": 0.6213, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 780, "tokens_per_second_per_gpu": 17099.02, "total_tokens": 19895160 }, { "epoch": 0.06246500839798448, "grad_norm": 0.5398359894752502, "learning_rate": 2.8066019966134907e-06, "loss": 0.6452, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 781, "tokens_per_second_per_gpu": 17350.46, "total_tokens": 19920688 }, { "epoch": 0.06254498920259138, "grad_norm": 0.5316033363342285, "learning_rate": 2.7823977190163788e-06, "loss": 0.6397, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 782, "tokens_per_second_per_gpu": 16772.7, "total_tokens": 19945648 }, { "epoch": 0.06262497000719827, "grad_norm": 0.5031722187995911, "learning_rate": 2.7582813856253276e-06, "loss": 0.6356, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 783, "tokens_per_second_per_gpu": 17215.15, "total_tokens": 19971748 }, { "epoch": 0.06270495081180516, "grad_norm": 0.5138970017433167, "learning_rate": 2.7342532902902418e-06, "loss": 0.6307, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 784, "tokens_per_second_per_gpu": 17223.48, "total_tokens": 19998237 }, { "epoch": 0.06278493161641206, "grad_norm": 0.4964427053928375, "learning_rate": 2.7103137257858867e-06, "loss": 0.5311, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 785, "tokens_per_second_per_gpu": 16260.69, "total_tokens": 20022768 }, { "epoch": 0.06286491242101895, "grad_norm": 0.5650128126144409, "learning_rate": 2.6864629838082957e-06, "loss": 0.5814, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 786, "tokens_per_second_per_gpu": 16690.38, "total_tokens": 20047748 }, { "epoch": 0.06294489322562585, "grad_norm": 0.5131796598434448, "learning_rate": 2.6627013549712355e-06, "loss": 0.6104, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 787, "tokens_per_second_per_gpu": 16507.9, "total_tokens": 20072431 }, { "epoch": 0.06302487403023274, "grad_norm": 0.5409046411514282, "learning_rate": 2.639029128802657e-06, "loss": 0.6366, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 788, "tokens_per_second_per_gpu": 17400.52, "total_tokens": 20098484 }, { "epoch": 0.06310485483483964, "grad_norm": 0.5458826422691345, "learning_rate": 2.615446593741161e-06, "loss": 0.6474, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 789, "tokens_per_second_per_gpu": 17471.76, "total_tokens": 20124504 }, { "epoch": 0.06318483563944653, "grad_norm": 0.5500627160072327, "learning_rate": 2.5919540371325005e-06, "loss": 0.6645, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 790, "tokens_per_second_per_gpu": 16887.62, "total_tokens": 20149719 }, { "epoch": 0.06326481644405342, "grad_norm": 0.7003596425056458, "learning_rate": 2.5685517452260566e-06, "loss": 0.6207, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 791, "tokens_per_second_per_gpu": 17039.19, "total_tokens": 20174892 }, { "epoch": 0.06334479724866032, "grad_norm": 0.5149163007736206, "learning_rate": 2.5452400031713786e-06, "loss": 0.5908, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 792, "tokens_per_second_per_gpu": 16747.71, "total_tokens": 20199527 }, { "epoch": 0.06342477805326721, "grad_norm": 0.5385146141052246, "learning_rate": 2.522019095014683e-06, "loss": 0.6026, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 793, "tokens_per_second_per_gpu": 16726.82, "total_tokens": 20224267 }, { "epoch": 0.06350475885787411, "grad_norm": 0.5230799317359924, "learning_rate": 2.4988893036954045e-06, "loss": 0.6515, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 794, "tokens_per_second_per_gpu": 17114.37, "total_tokens": 20250327 }, { "epoch": 0.063584739662481, "grad_norm": 0.5139489769935608, "learning_rate": 2.4758509110427576e-06, "loss": 0.6482, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 795, "tokens_per_second_per_gpu": 16803.05, "total_tokens": 20275583 }, { "epoch": 0.0636647204670879, "grad_norm": 0.523923933506012, "learning_rate": 2.45290419777228e-06, "loss": 0.6579, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 796, "tokens_per_second_per_gpu": 17032.63, "total_tokens": 20301068 }, { "epoch": 0.06374470127169479, "grad_norm": 0.7031800746917725, "learning_rate": 2.4300494434824373e-06, "loss": 0.6209, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 797, "tokens_per_second_per_gpu": 16706.86, "total_tokens": 20326831 }, { "epoch": 0.06382468207630168, "grad_norm": 0.5347440838813782, "learning_rate": 2.407286926651192e-06, "loss": 0.6361, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 798, "tokens_per_second_per_gpu": 16973.33, "total_tokens": 20351944 }, { "epoch": 0.06390466288090858, "grad_norm": 0.5046122074127197, "learning_rate": 2.3846169246326345e-06, "loss": 0.6284, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 799, "tokens_per_second_per_gpu": 17113.48, "total_tokens": 20377650 }, { "epoch": 0.06398464368551547, "grad_norm": 0.5298998355865479, "learning_rate": 2.362039713653581e-06, "loss": 0.6105, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 800, "tokens_per_second_per_gpu": 16898.19, "total_tokens": 20403133 }, { "epoch": 0.06406462449012237, "grad_norm": 0.5419802665710449, "learning_rate": 2.339555568810221e-06, "loss": 0.6345, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 801, "tokens_per_second_per_gpu": 16951.5, "total_tokens": 20428474 }, { "epoch": 0.06414460529472926, "grad_norm": 0.5342543125152588, "learning_rate": 2.317164764064769e-06, "loss": 0.6599, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 802, "tokens_per_second_per_gpu": 17249.38, "total_tokens": 20454441 }, { "epoch": 0.06422458609933616, "grad_norm": 0.5269400477409363, "learning_rate": 2.2948675722421086e-06, "loss": 0.5818, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 803, "tokens_per_second_per_gpu": 16686.12, "total_tokens": 20478852 }, { "epoch": 0.06430456690394305, "grad_norm": 0.5437107086181641, "learning_rate": 2.27266426502649e-06, "loss": 0.6462, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 804, "tokens_per_second_per_gpu": 16684.85, "total_tokens": 20503980 }, { "epoch": 0.06438454770854994, "grad_norm": 0.5457687973976135, "learning_rate": 2.2505551129582047e-06, "loss": 0.6608, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 805, "tokens_per_second_per_gpu": 16383.96, "total_tokens": 20528619 }, { "epoch": 0.06446452851315684, "grad_norm": 0.5263291001319885, "learning_rate": 2.2285403854302912e-06, "loss": 0.6213, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 806, "tokens_per_second_per_gpu": 17247.48, "total_tokens": 20554387 }, { "epoch": 0.06454450931776373, "grad_norm": 0.5361708402633667, "learning_rate": 2.206620350685257e-06, "loss": 0.6427, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 807, "tokens_per_second_per_gpu": 17243.59, "total_tokens": 20579876 }, { "epoch": 0.06462449012237063, "grad_norm": 0.5289268493652344, "learning_rate": 2.1847952758118118e-06, "loss": 0.6201, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 808, "tokens_per_second_per_gpu": 16553.91, "total_tokens": 20604719 }, { "epoch": 0.06470447092697752, "grad_norm": 0.544245183467865, "learning_rate": 2.163065426741603e-06, "loss": 0.6662, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 809, "tokens_per_second_per_gpu": 17452.98, "total_tokens": 20630589 }, { "epoch": 0.06478445173158442, "grad_norm": 0.5488360524177551, "learning_rate": 2.1414310682459805e-06, "loss": 0.6423, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 810, "tokens_per_second_per_gpu": 16514.2, "total_tokens": 20655490 }, { "epoch": 0.06486443253619131, "grad_norm": 0.5205331444740295, "learning_rate": 2.119892463932781e-06, "loss": 0.5988, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 811, "tokens_per_second_per_gpu": 17129.86, "total_tokens": 20681611 }, { "epoch": 0.0649444133407982, "grad_norm": 0.5465454459190369, "learning_rate": 2.098449876243096e-06, "loss": 0.6205, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 812, "tokens_per_second_per_gpu": 14461.03, "total_tokens": 20706354 }, { "epoch": 0.0650243941454051, "grad_norm": 0.5238451361656189, "learning_rate": 2.0771035664480944e-06, "loss": 0.6657, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 813, "tokens_per_second_per_gpu": 17103.66, "total_tokens": 20732674 }, { "epoch": 0.06510437495001199, "grad_norm": 0.5532448291778564, "learning_rate": 2.0558537946458177e-06, "loss": 0.7047, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 814, "tokens_per_second_per_gpu": 17073.99, "total_tokens": 20758451 }, { "epoch": 0.0651843557546189, "grad_norm": 0.5232256054878235, "learning_rate": 2.0347008197580376e-06, "loss": 0.6141, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 815, "tokens_per_second_per_gpu": 17314.13, "total_tokens": 20784400 }, { "epoch": 0.06526433655922578, "grad_norm": 0.537419855594635, "learning_rate": 2.013644899527074e-06, "loss": 0.6804, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 816, "tokens_per_second_per_gpu": 17404.52, "total_tokens": 20811146 }, { "epoch": 0.06534431736383269, "grad_norm": 0.5733768343925476, "learning_rate": 1.9926862905126663e-06, "loss": 0.6745, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 817, "tokens_per_second_per_gpu": 16847.29, "total_tokens": 20836415 }, { "epoch": 0.06542429816843957, "grad_norm": 0.5123438239097595, "learning_rate": 1.9718252480888567e-06, "loss": 0.6181, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 818, "tokens_per_second_per_gpu": 17343.21, "total_tokens": 20862400 }, { "epoch": 0.06550427897304646, "grad_norm": 0.5344765782356262, "learning_rate": 1.95106202644086e-06, "loss": 0.6232, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 819, "tokens_per_second_per_gpu": 16935.96, "total_tokens": 20887638 }, { "epoch": 0.06558425977765336, "grad_norm": 0.5133531093597412, "learning_rate": 1.930396878561983e-06, "loss": 0.6081, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 820, "tokens_per_second_per_gpu": 17029.02, "total_tokens": 20913351 }, { "epoch": 0.06566424058226025, "grad_norm": 0.5093186497688293, "learning_rate": 1.9098300562505266e-06, "loss": 0.5818, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 821, "tokens_per_second_per_gpu": 17013.0, "total_tokens": 20938940 }, { "epoch": 0.06574422138686715, "grad_norm": 0.5392187833786011, "learning_rate": 1.8893618101067357e-06, "loss": 0.6245, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 822, "tokens_per_second_per_gpu": 17008.86, "total_tokens": 20964899 }, { "epoch": 0.06582420219147404, "grad_norm": 0.5658339858055115, "learning_rate": 1.8689923895297247e-06, "loss": 0.6505, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 823, "tokens_per_second_per_gpu": 16746.45, "total_tokens": 20990024 }, { "epoch": 0.06590418299608095, "grad_norm": 0.5425558090209961, "learning_rate": 1.848722042714457e-06, "loss": 0.6539, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 824, "tokens_per_second_per_gpu": 16762.28, "total_tokens": 21015394 }, { "epoch": 0.06598416380068783, "grad_norm": 0.49881938099861145, "learning_rate": 1.8285510166487154e-06, "loss": 0.6495, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 825, "tokens_per_second_per_gpu": 17344.43, "total_tokens": 21041742 }, { "epoch": 0.06606414460529474, "grad_norm": 0.5435726642608643, "learning_rate": 1.808479557110081e-06, "loss": 0.6394, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 826, "tokens_per_second_per_gpu": 16958.22, "total_tokens": 21066554 }, { "epoch": 0.06614412540990162, "grad_norm": 0.5088474750518799, "learning_rate": 1.7885079086629598e-06, "loss": 0.6283, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 827, "tokens_per_second_per_gpu": 16846.33, "total_tokens": 21091431 }, { "epoch": 0.06622410621450851, "grad_norm": 0.5184812545776367, "learning_rate": 1.7686363146555807e-06, "loss": 0.6048, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 828, "tokens_per_second_per_gpu": 16807.75, "total_tokens": 21116183 }, { "epoch": 0.06630408701911542, "grad_norm": 0.5359786748886108, "learning_rate": 1.7488650172170496e-06, "loss": 0.6232, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 829, "tokens_per_second_per_gpu": 16758.36, "total_tokens": 21141076 }, { "epoch": 0.0663840678237223, "grad_norm": 0.5581966638565063, "learning_rate": 1.7291942572543806e-06, "loss": 0.6722, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 830, "tokens_per_second_per_gpu": 16923.34, "total_tokens": 21166455 }, { "epoch": 0.0664640486283292, "grad_norm": 0.5573216080665588, "learning_rate": 1.709624274449584e-06, "loss": 0.6268, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 831, "tokens_per_second_per_gpu": 16440.4, "total_tokens": 21190565 }, { "epoch": 0.0665440294329361, "grad_norm": 0.5541191697120667, "learning_rate": 1.6901553072567189e-06, "loss": 0.6127, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 832, "tokens_per_second_per_gpu": 16602.29, "total_tokens": 21214401 }, { "epoch": 0.066624010237543, "grad_norm": 0.5143559575080872, "learning_rate": 1.6707875928990059e-06, "loss": 0.6113, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 833, "tokens_per_second_per_gpu": 16831.5, "total_tokens": 21239453 }, { "epoch": 0.06670399104214988, "grad_norm": 0.5253430604934692, "learning_rate": 1.651521367365936e-06, "loss": 0.6203, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 834, "tokens_per_second_per_gpu": 17026.44, "total_tokens": 21265048 }, { "epoch": 0.06678397184675677, "grad_norm": 0.5263636708259583, "learning_rate": 1.6323568654103838e-06, "loss": 0.6411, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 835, "tokens_per_second_per_gpu": 17001.65, "total_tokens": 21290537 }, { "epoch": 0.06686395265136368, "grad_norm": 0.5092071890830994, "learning_rate": 1.6132943205457607e-06, "loss": 0.6245, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 836, "tokens_per_second_per_gpu": 17222.45, "total_tokens": 21316714 }, { "epoch": 0.06694393345597056, "grad_norm": 0.48893386125564575, "learning_rate": 1.5943339650431578e-06, "loss": 0.598, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 837, "tokens_per_second_per_gpu": 17556.4, "total_tokens": 21343578 }, { "epoch": 0.06702391426057747, "grad_norm": 0.5376018285751343, "learning_rate": 1.5754760299285255e-06, "loss": 0.6301, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 838, "tokens_per_second_per_gpu": 16566.21, "total_tokens": 21367631 }, { "epoch": 0.06710389506518435, "grad_norm": 0.5278213024139404, "learning_rate": 1.5567207449798517e-06, "loss": 0.613, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 839, "tokens_per_second_per_gpu": 16833.56, "total_tokens": 21393005 }, { "epoch": 0.06718387586979126, "grad_norm": 0.5237742066383362, "learning_rate": 1.538068338724361e-06, "loss": 0.6129, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 840, "tokens_per_second_per_gpu": 16785.39, "total_tokens": 21418322 }, { "epoch": 0.06726385667439815, "grad_norm": 0.5155054926872253, "learning_rate": 1.5195190384357405e-06, "loss": 0.6684, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 841, "tokens_per_second_per_gpu": 16989.1, "total_tokens": 21444279 }, { "epoch": 0.06734383747900503, "grad_norm": 0.509067714214325, "learning_rate": 1.5010730701313626e-06, "loss": 0.5873, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 842, "tokens_per_second_per_gpu": 16848.11, "total_tokens": 21469610 }, { "epoch": 0.06742381828361194, "grad_norm": 0.5532313585281372, "learning_rate": 1.4827306585695234e-06, "loss": 0.7063, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 843, "tokens_per_second_per_gpu": 17019.37, "total_tokens": 21495536 }, { "epoch": 0.06750379908821882, "grad_norm": 0.5344107747077942, "learning_rate": 1.4644920272467245e-06, "loss": 0.649, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 844, "tokens_per_second_per_gpu": 17240.12, "total_tokens": 21521566 }, { "epoch": 0.06758377989282573, "grad_norm": 0.5316765904426575, "learning_rate": 1.446357398394934e-06, "loss": 0.6567, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 845, "tokens_per_second_per_gpu": 16519.7, "total_tokens": 21546818 }, { "epoch": 0.06766376069743262, "grad_norm": 0.5471608638763428, "learning_rate": 1.4283269929788779e-06, "loss": 0.719, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 846, "tokens_per_second_per_gpu": 17558.19, "total_tokens": 21573813 }, { "epoch": 0.06774374150203952, "grad_norm": 0.5042493343353271, "learning_rate": 1.4104010306933558e-06, "loss": 0.5862, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 847, "tokens_per_second_per_gpu": 16810.13, "total_tokens": 21598898 }, { "epoch": 0.0678237223066464, "grad_norm": 0.5342798233032227, "learning_rate": 1.3925797299605649e-06, "loss": 0.629, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 848, "tokens_per_second_per_gpu": 16624.04, "total_tokens": 21624135 }, { "epoch": 0.0679037031112533, "grad_norm": 0.4844224750995636, "learning_rate": 1.3748633079274254e-06, "loss": 0.6284, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 849, "tokens_per_second_per_gpu": 17643.78, "total_tokens": 21651187 }, { "epoch": 0.0679836839158602, "grad_norm": 0.5576471090316772, "learning_rate": 1.3572519804629537e-06, "loss": 0.6663, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 850, "tokens_per_second_per_gpu": 16739.95, "total_tokens": 21676719 }, { "epoch": 0.06806366472046708, "grad_norm": 0.561067521572113, "learning_rate": 1.339745962155613e-06, "loss": 0.6616, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 851, "tokens_per_second_per_gpu": 17580.06, "total_tokens": 21703252 }, { "epoch": 0.06814364552507399, "grad_norm": 0.4975440204143524, "learning_rate": 1.322345466310717e-06, "loss": 0.6046, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 852, "tokens_per_second_per_gpu": 17651.26, "total_tokens": 21729685 }, { "epoch": 0.06822362632968088, "grad_norm": 0.5225350856781006, "learning_rate": 1.30505070494781e-06, "loss": 0.6014, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 853, "tokens_per_second_per_gpu": 16744.83, "total_tokens": 21754924 }, { "epoch": 0.06830360713428778, "grad_norm": 0.5175594687461853, "learning_rate": 1.2878618887981064e-06, "loss": 0.6292, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 854, "tokens_per_second_per_gpu": 17130.6, "total_tokens": 21780725 }, { "epoch": 0.06838358793889467, "grad_norm": 0.5212383270263672, "learning_rate": 1.2707792273019049e-06, "loss": 0.6432, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 855, "tokens_per_second_per_gpu": 16877.97, "total_tokens": 21806402 }, { "epoch": 0.06846356874350155, "grad_norm": 0.5338414907455444, "learning_rate": 1.2538029286060428e-06, "loss": 0.6623, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 856, "tokens_per_second_per_gpu": 17129.27, "total_tokens": 21832107 }, { "epoch": 0.06854354954810846, "grad_norm": 0.5500073432922363, "learning_rate": 1.2369331995613664e-06, "loss": 0.5917, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 857, "tokens_per_second_per_gpu": 17256.77, "total_tokens": 21857407 }, { "epoch": 0.06862353035271535, "grad_norm": 0.5338061451911926, "learning_rate": 1.2201702457201948e-06, "loss": 0.5952, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 858, "tokens_per_second_per_gpu": 16673.84, "total_tokens": 21882039 }, { "epoch": 0.06870351115732225, "grad_norm": 0.5579566955566406, "learning_rate": 1.2035142713338366e-06, "loss": 0.6569, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 859, "tokens_per_second_per_gpu": 16671.62, "total_tokens": 21907057 }, { "epoch": 0.06878349196192914, "grad_norm": 0.5408582091331482, "learning_rate": 1.1869654793500784e-06, "loss": 0.6789, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 860, "tokens_per_second_per_gpu": 17620.35, "total_tokens": 21933442 }, { "epoch": 0.06886347276653604, "grad_norm": 0.5381220579147339, "learning_rate": 1.1705240714107301e-06, "loss": 0.6369, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 861, "tokens_per_second_per_gpu": 16727.68, "total_tokens": 21958329 }, { "epoch": 0.06894345357114293, "grad_norm": 0.6026800274848938, "learning_rate": 1.1541902478491607e-06, "loss": 0.6602, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 862, "tokens_per_second_per_gpu": 17363.28, "total_tokens": 21984511 }, { "epoch": 0.06902343437574981, "grad_norm": 0.5562117099761963, "learning_rate": 1.1379642076878528e-06, "loss": 0.6528, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 863, "tokens_per_second_per_gpu": 16971.05, "total_tokens": 22009918 }, { "epoch": 0.06910341518035672, "grad_norm": 0.5344293713569641, "learning_rate": 1.1218461486359878e-06, "loss": 0.5938, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 864, "tokens_per_second_per_gpu": 17145.65, "total_tokens": 22035209 }, { "epoch": 0.0691833959849636, "grad_norm": 0.5310640931129456, "learning_rate": 1.1058362670870248e-06, "loss": 0.6437, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 865, "tokens_per_second_per_gpu": 17253.19, "total_tokens": 22060530 }, { "epoch": 0.06926337678957051, "grad_norm": 0.5512663125991821, "learning_rate": 1.0899347581163222e-06, "loss": 0.6598, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 866, "tokens_per_second_per_gpu": 16895.55, "total_tokens": 22085916 }, { "epoch": 0.0693433575941774, "grad_norm": 0.5096883773803711, "learning_rate": 1.0741418154787443e-06, "loss": 0.5948, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 867, "tokens_per_second_per_gpu": 17271.65, "total_tokens": 22111625 }, { "epoch": 0.0694233383987843, "grad_norm": 0.564525306224823, "learning_rate": 1.058457631606319e-06, "loss": 0.6606, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 868, "tokens_per_second_per_gpu": 16660.76, "total_tokens": 22136753 }, { "epoch": 0.06950331920339119, "grad_norm": 0.5231457352638245, "learning_rate": 1.042882397605871e-06, "loss": 0.6307, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 869, "tokens_per_second_per_gpu": 17528.22, "total_tokens": 22163028 }, { "epoch": 0.06958330000799808, "grad_norm": 0.5321533679962158, "learning_rate": 1.0274163032567165e-06, "loss": 0.61, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 870, "tokens_per_second_per_gpu": 16565.4, "total_tokens": 22187415 }, { "epoch": 0.06966328081260498, "grad_norm": 0.5243551135063171, "learning_rate": 1.012059537008332e-06, "loss": 0.6004, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 871, "tokens_per_second_per_gpu": 17433.64, "total_tokens": 22213442 }, { "epoch": 0.06974326161721187, "grad_norm": 0.5377528667449951, "learning_rate": 9.968122859780648e-07, "loss": 0.6148, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 872, "tokens_per_second_per_gpu": 16785.52, "total_tokens": 22238355 }, { "epoch": 0.06982324242181877, "grad_norm": 0.5466241240501404, "learning_rate": 9.816747359488632e-07, "loss": 0.6273, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 873, "tokens_per_second_per_gpu": 16524.06, "total_tokens": 22263013 }, { "epoch": 0.06990322322642566, "grad_norm": 0.5036591291427612, "learning_rate": 9.666470713669918e-07, "loss": 0.5697, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 874, "tokens_per_second_per_gpu": 16487.97, "total_tokens": 22287505 }, { "epoch": 0.06998320403103256, "grad_norm": 0.5080577731132507, "learning_rate": 9.517294753398066e-07, "loss": 0.629, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 875, "tokens_per_second_per_gpu": 17594.93, "total_tokens": 22314169 }, { "epoch": 0.07006318483563945, "grad_norm": 0.48881781101226807, "learning_rate": 9.369221296335007e-07, "loss": 0.5497, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 876, "tokens_per_second_per_gpu": 16353.97, "total_tokens": 22338437 }, { "epoch": 0.07014316564024634, "grad_norm": 0.6482576727867126, "learning_rate": 9.222252146709143e-07, "loss": 0.6313, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 877, "tokens_per_second_per_gpu": 18105.32, "total_tokens": 22365579 }, { "epoch": 0.07022314644485324, "grad_norm": 0.5619848966598511, "learning_rate": 9.076389095293148e-07, "loss": 0.6667, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 878, "tokens_per_second_per_gpu": 17285.05, "total_tokens": 22391841 }, { "epoch": 0.07030312724946013, "grad_norm": 0.5285101532936096, "learning_rate": 8.931633919382299e-07, "loss": 0.7238, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 879, "tokens_per_second_per_gpu": 17521.55, "total_tokens": 22418478 }, { "epoch": 0.07038310805406703, "grad_norm": 0.5396612286567688, "learning_rate": 8.787988382772705e-07, "loss": 0.5924, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 880, "tokens_per_second_per_gpu": 16798.7, "total_tokens": 22443336 }, { "epoch": 0.07046308885867392, "grad_norm": 0.5362244248390198, "learning_rate": 8.645454235739903e-07, "loss": 0.6344, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 881, "tokens_per_second_per_gpu": 17292.72, "total_tokens": 22469492 }, { "epoch": 0.07054306966328082, "grad_norm": 0.5551726222038269, "learning_rate": 8.504033215017527e-07, "loss": 0.6013, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 882, "tokens_per_second_per_gpu": 16799.15, "total_tokens": 22494816 }, { "epoch": 0.07062305046788771, "grad_norm": 0.5338584184646606, "learning_rate": 8.363727043776037e-07, "loss": 0.5833, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 883, "tokens_per_second_per_gpu": 16136.84, "total_tokens": 22518177 }, { "epoch": 0.0707030312724946, "grad_norm": 0.551880419254303, "learning_rate": 8.224537431601886e-07, "loss": 0.655, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 884, "tokens_per_second_per_gpu": 16505.32, "total_tokens": 22543491 }, { "epoch": 0.0707830120771015, "grad_norm": 0.5214104056358337, "learning_rate": 8.086466074476562e-07, "loss": 0.659, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 885, "tokens_per_second_per_gpu": 17549.53, "total_tokens": 22569806 }, { "epoch": 0.07086299288170839, "grad_norm": 0.542853057384491, "learning_rate": 7.949514654755963e-07, "loss": 0.6105, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 886, "tokens_per_second_per_gpu": 16930.88, "total_tokens": 22594691 }, { "epoch": 0.07094297368631529, "grad_norm": 0.5428863167762756, "learning_rate": 7.81368484114996e-07, "loss": 0.6104, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 887, "tokens_per_second_per_gpu": 17283.71, "total_tokens": 22620948 }, { "epoch": 0.07102295449092218, "grad_norm": 0.5423809289932251, "learning_rate": 7.678978288701911e-07, "loss": 0.636, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 888, "tokens_per_second_per_gpu": 17444.73, "total_tokens": 22647130 }, { "epoch": 0.07110293529552908, "grad_norm": 0.5242039561271667, "learning_rate": 7.545396638768698e-07, "loss": 0.6312, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 889, "tokens_per_second_per_gpu": 16473.27, "total_tokens": 22671849 }, { "epoch": 0.07118291610013597, "grad_norm": 0.5590543746948242, "learning_rate": 7.412941519000527e-07, "loss": 0.6512, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 890, "tokens_per_second_per_gpu": 16881.91, "total_tokens": 22697023 }, { "epoch": 0.07126289690474286, "grad_norm": 0.5263185501098633, "learning_rate": 7.281614543321269e-07, "loss": 0.6223, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 891, "tokens_per_second_per_gpu": 16850.16, "total_tokens": 22722217 }, { "epoch": 0.07134287770934976, "grad_norm": 0.5166627764701843, "learning_rate": 7.151417311908648e-07, "loss": 0.6212, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 892, "tokens_per_second_per_gpu": 16842.39, "total_tokens": 22747378 }, { "epoch": 0.07142285851395665, "grad_norm": 0.5226914286613464, "learning_rate": 7.022351411174866e-07, "loss": 0.6251, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 893, "tokens_per_second_per_gpu": 17338.77, "total_tokens": 22773619 }, { "epoch": 0.07150283931856355, "grad_norm": 0.5193334221839905, "learning_rate": 6.894418413747183e-07, "loss": 0.6043, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 894, "tokens_per_second_per_gpu": 17232.17, "total_tokens": 22799126 }, { "epoch": 0.07158282012317044, "grad_norm": 0.5356144905090332, "learning_rate": 6.767619878448783e-07, "loss": 0.6715, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 895, "tokens_per_second_per_gpu": 17571.32, "total_tokens": 22825099 }, { "epoch": 0.07166280092777734, "grad_norm": 0.5201844573020935, "learning_rate": 6.641957350279838e-07, "loss": 0.637, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 896, "tokens_per_second_per_gpu": 17330.69, "total_tokens": 22851265 }, { "epoch": 0.07174278173238423, "grad_norm": 0.5070226192474365, "learning_rate": 6.517432360398556e-07, "loss": 0.6164, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 897, "tokens_per_second_per_gpu": 17725.81, "total_tokens": 22877914 }, { "epoch": 0.07182276253699112, "grad_norm": 0.5197336673736572, "learning_rate": 6.394046426102673e-07, "loss": 0.6104, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 898, "tokens_per_second_per_gpu": 17001.09, "total_tokens": 22903502 }, { "epoch": 0.07190274334159802, "grad_norm": 0.5605859756469727, "learning_rate": 6.271801050810856e-07, "loss": 0.7018, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 899, "tokens_per_second_per_gpu": 16955.61, "total_tokens": 22929177 }, { "epoch": 0.07198272414620491, "grad_norm": 0.4978031516075134, "learning_rate": 6.150697724044407e-07, "loss": 0.6308, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 900, "tokens_per_second_per_gpu": 17524.78, "total_tokens": 22956098 }, { "epoch": 0.07206270495081181, "grad_norm": 0.5414256453514099, "learning_rate": 6.030737921409169e-07, "loss": 0.6352, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 901, "tokens_per_second_per_gpu": 16727.63, "total_tokens": 22981145 }, { "epoch": 0.0721426857554187, "grad_norm": 0.5312582850456238, "learning_rate": 5.911923104577455e-07, "loss": 0.6422, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 902, "tokens_per_second_per_gpu": 16767.2, "total_tokens": 23006386 }, { "epoch": 0.0722226665600256, "grad_norm": 0.5257201194763184, "learning_rate": 5.794254721270331e-07, "loss": 0.6095, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 903, "tokens_per_second_per_gpu": 16540.78, "total_tokens": 23031228 }, { "epoch": 0.07230264736463249, "grad_norm": 0.5407463908195496, "learning_rate": 5.677734205239904e-07, "loss": 0.6093, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 904, "tokens_per_second_per_gpu": 17082.27, "total_tokens": 23056715 }, { "epoch": 0.07238262816923938, "grad_norm": 0.5344308018684387, "learning_rate": 5.562362976251901e-07, "loss": 0.655, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 905, "tokens_per_second_per_gpu": 17119.96, "total_tokens": 23082109 }, { "epoch": 0.07246260897384628, "grad_norm": 0.5326920747756958, "learning_rate": 5.448142440068316e-07, "loss": 0.643, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 906, "tokens_per_second_per_gpu": 17190.5, "total_tokens": 23107313 }, { "epoch": 0.07254258977845317, "grad_norm": 0.5334905385971069, "learning_rate": 5.335073988430373e-07, "loss": 0.5988, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 907, "tokens_per_second_per_gpu": 16700.52, "total_tokens": 23131980 }, { "epoch": 0.07262257058306007, "grad_norm": 0.49993833899497986, "learning_rate": 5.223158999041444e-07, "loss": 0.6369, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 908, "tokens_per_second_per_gpu": 16759.27, "total_tokens": 23157639 }, { "epoch": 0.07270255138766696, "grad_norm": 0.5197968482971191, "learning_rate": 5.112398835550348e-07, "loss": 0.6437, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 909, "tokens_per_second_per_gpu": 17090.58, "total_tokens": 23183670 }, { "epoch": 0.07278253219227386, "grad_norm": 0.5189102292060852, "learning_rate": 5.002794847534765e-07, "loss": 0.5972, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 910, "tokens_per_second_per_gpu": 16958.96, "total_tokens": 23208974 }, { "epoch": 0.07286251299688075, "grad_norm": 0.5296539068222046, "learning_rate": 4.894348370484648e-07, "loss": 0.6398, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 911, "tokens_per_second_per_gpu": 17188.2, "total_tokens": 23234688 }, { "epoch": 0.07294249380148764, "grad_norm": 0.5202224850654602, "learning_rate": 4.787060725786141e-07, "loss": 0.6371, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 912, "tokens_per_second_per_gpu": 17395.29, "total_tokens": 23260424 }, { "epoch": 0.07302247460609454, "grad_norm": 0.5297356843948364, "learning_rate": 4.6809332207053083e-07, "loss": 0.5857, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 913, "tokens_per_second_per_gpu": 16700.69, "total_tokens": 23285195 }, { "epoch": 0.07310245541070143, "grad_norm": 0.5384907126426697, "learning_rate": 4.575967148372318e-07, "loss": 0.6212, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 914, "tokens_per_second_per_gpu": 17063.84, "total_tokens": 23310815 }, { "epoch": 0.07318243621530833, "grad_norm": 0.5373451709747314, "learning_rate": 4.4721637877656377e-07, "loss": 0.6259, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 915, "tokens_per_second_per_gpu": 17316.89, "total_tokens": 23336669 }, { "epoch": 0.07326241701991522, "grad_norm": 0.5419356226921082, "learning_rate": 4.3695244036964567e-07, "loss": 0.6562, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 916, "tokens_per_second_per_gpu": 17299.46, "total_tokens": 23362881 }, { "epoch": 0.07334239782452212, "grad_norm": 0.54438316822052, "learning_rate": 4.268050246793276e-07, "loss": 0.612, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 917, "tokens_per_second_per_gpu": 16889.02, "total_tokens": 23388504 }, { "epoch": 0.07342237862912901, "grad_norm": 0.5277045369148254, "learning_rate": 4.167742553486676e-07, "loss": 0.6303, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 918, "tokens_per_second_per_gpu": 16606.08, "total_tokens": 23413492 }, { "epoch": 0.0735023594337359, "grad_norm": 0.5115805864334106, "learning_rate": 4.068602545994249e-07, "loss": 0.6335, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 919, "tokens_per_second_per_gpu": 17113.19, "total_tokens": 23439554 }, { "epoch": 0.0735823402383428, "grad_norm": 0.5200874209403992, "learning_rate": 3.9706314323056936e-07, "loss": 0.6528, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 920, "tokens_per_second_per_gpu": 17124.8, "total_tokens": 23465496 }, { "epoch": 0.07366232104294969, "grad_norm": 0.535873293876648, "learning_rate": 3.8738304061681107e-07, "loss": 0.6929, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 921, "tokens_per_second_per_gpu": 17206.32, "total_tokens": 23492042 }, { "epoch": 0.07374230184755659, "grad_norm": 0.5385628938674927, "learning_rate": 3.7782006470714614e-07, "loss": 0.6276, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 922, "tokens_per_second_per_gpu": 16672.51, "total_tokens": 23516992 }, { "epoch": 0.07382228265216348, "grad_norm": 0.5313562750816345, "learning_rate": 3.68374332023419e-07, "loss": 0.5886, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 923, "tokens_per_second_per_gpu": 16765.04, "total_tokens": 23541526 }, { "epoch": 0.07390226345677038, "grad_norm": 0.5149674415588379, "learning_rate": 3.590459576589e-07, "loss": 0.6305, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 924, "tokens_per_second_per_gpu": 17117.27, "total_tokens": 23567238 }, { "epoch": 0.07398224426137727, "grad_norm": 0.5549934506416321, "learning_rate": 3.498350552768859e-07, "loss": 0.6459, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 925, "tokens_per_second_per_gpu": 16924.56, "total_tokens": 23592535 }, { "epoch": 0.07406222506598416, "grad_norm": 0.5501406788825989, "learning_rate": 3.4074173710931804e-07, "loss": 0.6181, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 926, "tokens_per_second_per_gpu": 17134.37, "total_tokens": 23618080 }, { "epoch": 0.07414220587059106, "grad_norm": 0.5153313875198364, "learning_rate": 3.3176611395540625e-07, "loss": 0.6321, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 927, "tokens_per_second_per_gpu": 17355.28, "total_tokens": 23644485 }, { "epoch": 0.07422218667519795, "grad_norm": 0.5083333849906921, "learning_rate": 3.2290829518028867e-07, "loss": 0.627, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 928, "tokens_per_second_per_gpu": 17407.43, "total_tokens": 23670424 }, { "epoch": 0.07430216747980485, "grad_norm": 0.544588565826416, "learning_rate": 3.1416838871368925e-07, "loss": 0.6258, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 929, "tokens_per_second_per_gpu": 17224.15, "total_tokens": 23696336 }, { "epoch": 0.07438214828441174, "grad_norm": 0.5177492499351501, "learning_rate": 3.0554650104861137e-07, "loss": 0.6479, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 930, "tokens_per_second_per_gpu": 17223.45, "total_tokens": 23722109 }, { "epoch": 0.07446212908901864, "grad_norm": 0.5235950350761414, "learning_rate": 2.970427372400353e-07, "loss": 0.5707, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 931, "tokens_per_second_per_gpu": 16511.77, "total_tokens": 23746933 }, { "epoch": 0.07454210989362553, "grad_norm": 0.4983116090297699, "learning_rate": 2.8865720090364037e-07, "loss": 0.6071, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 932, "tokens_per_second_per_gpu": 17590.81, "total_tokens": 23774001 }, { "epoch": 0.07462209069823242, "grad_norm": 0.5138797760009766, "learning_rate": 2.8038999421453827e-07, "loss": 0.6404, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 933, "tokens_per_second_per_gpu": 17531.8, "total_tokens": 23800499 }, { "epoch": 0.07470207150283932, "grad_norm": 0.5279234051704407, "learning_rate": 2.7224121790603517e-07, "loss": 0.6455, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 934, "tokens_per_second_per_gpu": 17253.47, "total_tokens": 23826516 }, { "epoch": 0.07478205230744621, "grad_norm": 0.5243753790855408, "learning_rate": 2.6421097126839714e-07, "loss": 0.6143, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 935, "tokens_per_second_per_gpu": 17104.36, "total_tokens": 23852173 }, { "epoch": 0.07486203311205311, "grad_norm": 0.5066297054290771, "learning_rate": 2.5629935214764866e-07, "loss": 0.5797, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 936, "tokens_per_second_per_gpu": 16824.53, "total_tokens": 23877312 }, { "epoch": 0.07494201391666, "grad_norm": 0.5267778635025024, "learning_rate": 2.4850645694436736e-07, "loss": 0.6155, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 937, "tokens_per_second_per_gpu": 16797.05, "total_tokens": 23902155 }, { "epoch": 0.0750219947212669, "grad_norm": 0.5988689661026001, "learning_rate": 2.4083238061252565e-07, "loss": 0.6415, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 938, "tokens_per_second_per_gpu": 17265.67, "total_tokens": 23928370 }, { "epoch": 0.07510197552587379, "grad_norm": 0.49074527621269226, "learning_rate": 2.332772166583208e-07, "loss": 0.595, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 939, "tokens_per_second_per_gpu": 17273.07, "total_tokens": 23954278 }, { "epoch": 0.07518195633048068, "grad_norm": 0.5439552068710327, "learning_rate": 2.2584105713904126e-07, "loss": 0.6362, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 940, "tokens_per_second_per_gpu": 16440.73, "total_tokens": 23978844 }, { "epoch": 0.07526193713508758, "grad_norm": 0.5561083555221558, "learning_rate": 2.1852399266194312e-07, "loss": 0.6293, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 941, "tokens_per_second_per_gpu": 16716.95, "total_tokens": 24003689 }, { "epoch": 0.07534191793969447, "grad_norm": 0.5233296155929565, "learning_rate": 2.1132611238315004e-07, "loss": 0.6433, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 942, "tokens_per_second_per_gpu": 17137.28, "total_tokens": 24029244 }, { "epoch": 0.07542189874430137, "grad_norm": 0.5398736596107483, "learning_rate": 2.0424750400655947e-07, "loss": 0.6613, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 943, "tokens_per_second_per_gpu": 17276.11, "total_tokens": 24055284 }, { "epoch": 0.07550187954890826, "grad_norm": 0.5166030526161194, "learning_rate": 1.9728825378278248e-07, "loss": 0.6078, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 944, "tokens_per_second_per_gpu": 17373.57, "total_tokens": 24081384 }, { "epoch": 0.07558186035351516, "grad_norm": 0.5131341218948364, "learning_rate": 1.9044844650808468e-07, "loss": 0.6308, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 945, "tokens_per_second_per_gpu": 17527.39, "total_tokens": 24107343 }, { "epoch": 0.07566184115812205, "grad_norm": 0.5063515901565552, "learning_rate": 1.8372816552336025e-07, "loss": 0.57, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 946, "tokens_per_second_per_gpu": 17410.89, "total_tokens": 24133150 }, { "epoch": 0.07574182196272894, "grad_norm": 0.5385065078735352, "learning_rate": 1.7712749271311392e-07, "loss": 0.6102, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 947, "tokens_per_second_per_gpu": 17121.83, "total_tokens": 24158646 }, { "epoch": 0.07582180276733584, "grad_norm": 0.5242645740509033, "learning_rate": 1.706465085044584e-07, "loss": 0.6648, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 948, "tokens_per_second_per_gpu": 17418.14, "total_tokens": 24185422 }, { "epoch": 0.07590178357194273, "grad_norm": 0.5127405524253845, "learning_rate": 1.6428529186614195e-07, "loss": 0.5975, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 949, "tokens_per_second_per_gpu": 16750.54, "total_tokens": 24210681 }, { "epoch": 0.07598176437654963, "grad_norm": 0.4975850582122803, "learning_rate": 1.580439203075812e-07, "loss": 0.6102, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 950, "tokens_per_second_per_gpu": 17360.01, "total_tokens": 24237288 }, { "epoch": 0.07606174518115652, "grad_norm": 0.5240901112556458, "learning_rate": 1.519224698779198e-07, "loss": 0.6304, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 951, "tokens_per_second_per_gpu": 16648.45, "total_tokens": 24262146 }, { "epoch": 0.07614172598576342, "grad_norm": 0.5453839898109436, "learning_rate": 1.4592101516509916e-07, "loss": 0.6353, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 952, "tokens_per_second_per_gpu": 17196.04, "total_tokens": 24288304 }, { "epoch": 0.07622170679037031, "grad_norm": 0.5186492204666138, "learning_rate": 1.400396292949513e-07, "loss": 0.6199, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 953, "tokens_per_second_per_gpu": 16644.24, "total_tokens": 24313146 }, { "epoch": 0.0763016875949772, "grad_norm": 0.49842411279678345, "learning_rate": 1.3427838393030634e-07, "loss": 0.6134, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 954, "tokens_per_second_per_gpu": 17154.09, "total_tokens": 24339160 }, { "epoch": 0.0763816683995841, "grad_norm": 0.5567916035652161, "learning_rate": 1.2863734927012094e-07, "loss": 0.6414, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 955, "tokens_per_second_per_gpu": 16807.04, "total_tokens": 24364306 }, { "epoch": 0.07646164920419099, "grad_norm": 0.5145589709281921, "learning_rate": 1.231165940486234e-07, "loss": 0.5819, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 956, "tokens_per_second_per_gpu": 16993.98, "total_tokens": 24389531 }, { "epoch": 0.07654163000879789, "grad_norm": 0.543738842010498, "learning_rate": 1.1771618553447217e-07, "loss": 0.651, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 957, "tokens_per_second_per_gpu": 17372.18, "total_tokens": 24415520 }, { "epoch": 0.07662161081340478, "grad_norm": 0.49085065722465515, "learning_rate": 1.1243618952994195e-07, "loss": 0.6086, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 958, "tokens_per_second_per_gpu": 17246.99, "total_tokens": 24441786 }, { "epoch": 0.07670159161801168, "grad_norm": 0.5156171917915344, "learning_rate": 1.0727667037011668e-07, "loss": 0.5842, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 959, "tokens_per_second_per_gpu": 17115.42, "total_tokens": 24467022 }, { "epoch": 0.07678157242261857, "grad_norm": 0.5145148634910583, "learning_rate": 1.0223769092211012e-07, "loss": 0.5551, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 960, "tokens_per_second_per_gpu": 16428.63, "total_tokens": 24490929 }, { "epoch": 0.07686155322722546, "grad_norm": 0.5537049174308777, "learning_rate": 9.731931258429638e-08, "loss": 0.6454, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 961, "tokens_per_second_per_gpu": 17267.19, "total_tokens": 24516248 }, { "epoch": 0.07694153403183236, "grad_norm": 0.543889045715332, "learning_rate": 9.252159528556404e-08, "loss": 0.594, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 962, "tokens_per_second_per_gpu": 16855.69, "total_tokens": 24541430 }, { "epoch": 0.07702151483643925, "grad_norm": 0.5286483764648438, "learning_rate": 8.784459748458318e-08, "loss": 0.6115, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 963, "tokens_per_second_per_gpu": 17319.91, "total_tokens": 24567628 }, { "epoch": 0.07710149564104615, "grad_norm": 0.5454973578453064, "learning_rate": 8.328837616909612e-08, "loss": 0.6497, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 964, "tokens_per_second_per_gpu": 17356.21, "total_tokens": 24594157 }, { "epoch": 0.07718147644565304, "grad_norm": 0.5202831029891968, "learning_rate": 7.885298685522235e-08, "loss": 0.6047, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 965, "tokens_per_second_per_gpu": 17130.28, "total_tokens": 24619949 }, { "epoch": 0.07726145725025994, "grad_norm": 0.5220912098884583, "learning_rate": 7.453848358678018e-08, "loss": 0.6013, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 966, "tokens_per_second_per_gpu": 17179.24, "total_tokens": 24645528 }, { "epoch": 0.07734143805486683, "grad_norm": 0.5430648326873779, "learning_rate": 7.034491893463059e-08, "loss": 0.6401, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 967, "tokens_per_second_per_gpu": 17248.77, "total_tokens": 24671124 }, { "epoch": 0.07742141885947372, "grad_norm": 0.5340640544891357, "learning_rate": 6.627234399603554e-08, "loss": 0.6728, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 968, "tokens_per_second_per_gpu": 17379.29, "total_tokens": 24697233 }, { "epoch": 0.07750139966408062, "grad_norm": 0.541018545627594, "learning_rate": 6.232080839403631e-08, "loss": 0.6661, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 969, "tokens_per_second_per_gpu": 17330.58, "total_tokens": 24723473 }, { "epoch": 0.07758138046868751, "grad_norm": 0.5246049761772156, "learning_rate": 5.849036027684607e-08, "loss": 0.6333, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 970, "tokens_per_second_per_gpu": 16577.2, "total_tokens": 24749018 }, { "epoch": 0.07766136127329441, "grad_norm": 0.5491915941238403, "learning_rate": 5.4781046317267103e-08, "loss": 0.6413, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 971, "tokens_per_second_per_gpu": 17163.06, "total_tokens": 24774116 }, { "epoch": 0.0777413420779013, "grad_norm": 0.5680013298988342, "learning_rate": 5.119291171211793e-08, "loss": 0.6917, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 972, "tokens_per_second_per_gpu": 17074.63, "total_tokens": 24799793 }, { "epoch": 0.0778213228825082, "grad_norm": 0.5304521322250366, "learning_rate": 4.772600018168816e-08, "loss": 0.622, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 973, "tokens_per_second_per_gpu": 16766.62, "total_tokens": 24825100 }, { "epoch": 0.07790130368711509, "grad_norm": 0.5385372042655945, "learning_rate": 4.438035396920004e-08, "loss": 0.6456, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 974, "tokens_per_second_per_gpu": 17139.0, "total_tokens": 24851690 }, { "epoch": 0.07798128449172198, "grad_norm": 0.549923300743103, "learning_rate": 4.115601384029666e-08, "loss": 0.6265, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 975, "tokens_per_second_per_gpu": 16505.25, "total_tokens": 24875789 }, { "epoch": 0.07806126529632888, "grad_norm": 0.5306264758110046, "learning_rate": 3.805301908254455e-08, "loss": 0.5789, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 976, "tokens_per_second_per_gpu": 17103.72, "total_tokens": 24900703 }, { "epoch": 0.07814124610093577, "grad_norm": 0.6005721688270569, "learning_rate": 3.50714075049563e-08, "loss": 0.5928, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 977, "tokens_per_second_per_gpu": 16680.56, "total_tokens": 24926020 }, { "epoch": 0.07822122690554267, "grad_norm": 0.5427029728889465, "learning_rate": 3.22112154375287e-08, "loss": 0.6533, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 978, "tokens_per_second_per_gpu": 16310.65, "total_tokens": 24950374 }, { "epoch": 0.07830120771014956, "grad_norm": 0.5275732278823853, "learning_rate": 2.947247773079753e-08, "loss": 0.6329, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 979, "tokens_per_second_per_gpu": 16975.33, "total_tokens": 24975610 }, { "epoch": 0.07838118851475646, "grad_norm": 0.5323116183280945, "learning_rate": 2.6855227755419046e-08, "loss": 0.6537, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 980, "tokens_per_second_per_gpu": 17462.53, "total_tokens": 25001687 }, { "epoch": 0.07846116931936335, "grad_norm": 0.5180700421333313, "learning_rate": 2.4359497401758026e-08, "loss": 0.6228, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 981, "tokens_per_second_per_gpu": 17019.02, "total_tokens": 25026807 }, { "epoch": 0.07854115012397024, "grad_norm": 0.5496319532394409, "learning_rate": 2.1985317079500358e-08, "loss": 0.6294, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 982, "tokens_per_second_per_gpu": 16897.37, "total_tokens": 25051565 }, { "epoch": 0.07862113092857714, "grad_norm": 0.5391293168067932, "learning_rate": 1.973271571728441e-08, "loss": 0.6272, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 983, "tokens_per_second_per_gpu": 16772.23, "total_tokens": 25076522 }, { "epoch": 0.07870111173318403, "grad_norm": 0.5196576714515686, "learning_rate": 1.7601720762346895e-08, "loss": 0.6276, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 984, "tokens_per_second_per_gpu": 16931.06, "total_tokens": 25102258 }, { "epoch": 0.07878109253779093, "grad_norm": 0.5071877241134644, "learning_rate": 1.5592358180189782e-08, "loss": 0.6424, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 985, "tokens_per_second_per_gpu": 17018.31, "total_tokens": 25128366 }, { "epoch": 0.07886107334239782, "grad_norm": 0.538322925567627, "learning_rate": 1.370465245426167e-08, "loss": 0.6304, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 986, "tokens_per_second_per_gpu": 16589.6, "total_tokens": 25152870 }, { "epoch": 0.07894105414700472, "grad_norm": 0.49802151322364807, "learning_rate": 1.1938626585660252e-08, "loss": 0.5973, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 987, "tokens_per_second_per_gpu": 17044.83, "total_tokens": 25178205 }, { "epoch": 0.07902103495161161, "grad_norm": 0.5292408466339111, "learning_rate": 1.0294302092853647e-08, "loss": 0.6749, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 988, "tokens_per_second_per_gpu": 17062.32, "total_tokens": 25203764 }, { "epoch": 0.0791010157562185, "grad_norm": 0.5256536602973938, "learning_rate": 8.771699011416169e-09, "loss": 0.6082, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 989, "tokens_per_second_per_gpu": 17511.87, "total_tokens": 25229634 }, { "epoch": 0.0791809965608254, "grad_norm": 0.5201627016067505, "learning_rate": 7.370835893788508e-09, "loss": 0.6419, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 990, "tokens_per_second_per_gpu": 17090.54, "total_tokens": 25255664 }, { "epoch": 0.07926097736543229, "grad_norm": 0.56233149766922, "learning_rate": 6.091729809042379e-09, "loss": 0.6216, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 991, "tokens_per_second_per_gpu": 17406.21, "total_tokens": 25281827 }, { "epoch": 0.0793409581700392, "grad_norm": 0.5474256277084351, "learning_rate": 4.9343963426840006e-09, "loss": 0.6036, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 992, "tokens_per_second_per_gpu": 16687.8, "total_tokens": 25306499 }, { "epoch": 0.07942093897464608, "grad_norm": 0.4903515577316284, "learning_rate": 3.898849596456477e-09, "loss": 0.5721, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 993, "tokens_per_second_per_gpu": 17239.93, "total_tokens": 25332595 }, { "epoch": 0.07950091977925298, "grad_norm": 0.5319344997406006, "learning_rate": 2.9851021881688314e-09, "loss": 0.6523, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 994, "tokens_per_second_per_gpu": 17089.9, "total_tokens": 25358509 }, { "epoch": 0.07958090058385987, "grad_norm": 0.5517778396606445, "learning_rate": 2.193165251545004e-09, "loss": 0.6289, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 995, "tokens_per_second_per_gpu": 17147.96, "total_tokens": 25384428 }, { "epoch": 0.07966088138846676, "grad_norm": 0.5497578978538513, "learning_rate": 1.5230484360873043e-09, "loss": 0.6752, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 996, "tokens_per_second_per_gpu": 17469.68, "total_tokens": 25411007 }, { "epoch": 0.07974086219307366, "grad_norm": 0.5634413361549377, "learning_rate": 9.74759906957612e-10, "loss": 0.6161, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 997, "tokens_per_second_per_gpu": 16112.77, "total_tokens": 25435004 }, { "epoch": 0.07982084299768055, "grad_norm": 0.5436570048332214, "learning_rate": 5.483063448785686e-10, "loss": 0.636, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 998, "tokens_per_second_per_gpu": 16917.97, "total_tokens": 25460910 }, { "epoch": 0.07990082380228745, "grad_norm": 0.5557326078414917, "learning_rate": 2.436929460525317e-10, "loss": 0.6001, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 999, "tokens_per_second_per_gpu": 17178.6, "total_tokens": 25486316 }, { "epoch": 0.07998080460689434, "grad_norm": 0.5365746021270752, "learning_rate": 6.092342209607083e-11, "loss": 0.616, "memory/device_reserved (GiB)": 69.96, "memory/max_active (GiB)": 66.03, "memory/max_allocated (GiB)": 66.03, "step": 1000, "tokens_per_second_per_gpu": 16970.21, "total_tokens": 25511597 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.2087150526464e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }